mirror of
https://github.com/GoldenCheetah/GoldenCheetah.git
synced 2026-02-13 08:08:42 +00:00
With grateful thanks to Greg Hamerly. A fast kmeans algorithm, described here: https://epubs.siam.org/doi/10.1137/1.9781611972801.12 The source repository is also here: https://github.com/ghamerly/fast-kmeans NOTE: The original source has been included largely as-is, with a view to writing a wrapper around it using Qt semantics for use in GoldenCheetah (e.g. via datafilter). The original source included multiple kmeans algorithms; we have only kept the 'fast' Hamerly variant.
86 lines
3.2 KiB
C++
86 lines
3.2 KiB
C++
#ifndef DATASET_H
|
|
#define DATASET_H
|
|
|
|
/* Authors: Greg Hamerly and Jonathan Drake
 * Feedback: hamerly@cs.baylor.edu
 * See: http://cs.baylor.edu/~hamerly/software/kmeans.php
 * Copyright 2014
 *
 * A Dataset class represents a collection of multidimensional records, as is
 * typical in metric machine learning. Every record has the same number of
 * dimensions (values), and every value must be numeric. Undefined values are
 * not allowed.
 *
 * This particular implementation keeps all the data in a 1-dimensional array,
 * and also optionally keeps extra storage for the sum of the squared values of
 * each record. However, the Dataset class does NOT automatically populate or
 * update the sumDataSquared values.
 */
|
|
|
|
#include <climits>
#include <cstddef>
#include <iostream>
#include <new>
|
|
|
|
// Dataset holds n records of d numeric values each in a single flat,
// heap-allocated array, plus an optional per-record "sum of squares" array.
// The class owns both arrays and releases them in its destructor.
class Dataset {
    public:
        // Default constructor -- constructs a completely empty dataset with
        // no records and no storage (both arrays are NULL).
        Dataset() : n(0), d(0), nd(0), data(NULL), sumDataSquared(NULL) {}

        // Construct a dataset of aN records with aD dimensions each. When
        // keepSDS is true, storage for one sumDataSquared entry per record is
        // allocated as well. The allocated values are NOT initialised.
        //
        // Throws std::bad_alloc if either dimension is negative, if aN * aD
        // does not fit in an int, or if an allocation fails. No memory is
        // leaked when the constructor throws.
        Dataset(int aN, int aD, bool keepSDS = false)
            : n(aN), d(aD), nd(0), data(NULL), sumDataSquared(NULL) {
            // Validate BEFORE multiplying: signed overflow is undefined
            // behaviour and could silently yield a buffer smaller than the
            // n and d recorded in this object.
            if (aN < 0 || aD < 0 || (aD != 0 && aN > INT_MAX / aD)) {
                throw std::bad_alloc();
            }
            nd = aN * aD;

            data = new double[nd];
            if (keepSDS) {
                // The destructor does not run for a partially constructed
                // object, so release data by hand if this allocation fails.
                try {
                    sumDataSquared = new double[n];
                } catch (...) {
                    delete [] data;
                    data = NULL;
                    throw;
                }
            }
        }

        // Copy constructor -- makes a deep copy of everything in x.
        Dataset(Dataset const &x);

        // Destroys the dataset safely: the members are reset to the empty
        // state before the owned arrays are released.
        ~Dataset() {
            double *ownedData = data;
            double *ownedSums = sumDataSquared;

            n = 0;
            d = 0;
            nd = 0;
            data = NULL;
            sumDataSquared = NULL;

            // delete [] on a NULL pointer is a no-op, so no guards are needed.
            delete [] ownedData;
            delete [] ownedSums;
        }

        // operator= is the standard deep-copy assignment operator, which
        // returns a const reference to *this.
        Dataset const &operator=(Dataset const &x);

        // Allows modification of the record ndx and dimension dim.
        double &operator()(int ndx, int dim);

        // Allows const access to record ndx and dimension dim.
        const double &operator()(int ndx, int dim) const;

        // Fill the entire dataset with value. Does NOT update sumDataSquared.
        void fill(double value);

        // Print the dataset to the given stream (standard output, cout, by
        // default), using formatting to keep the data in matrix format.
        void print(std::ostream &out = std::cout) const;

        // n represents the number of records
        // d represents the dimension
        // nd is a shortcut for the value n * d
        int n, d, nd;

        // data is an array of length n*d that stores all of the records in
        // record-major (row-major) order. Thus data[0]...data[d-1] are the
        // values associated with the first record.
        double *data;

        // sumDataSquared is an (optional) sum of squared values for every
        // record. Thus,
        //   sumDataSquared[0] = data[0]^2 + data[1]^2 + ... + data[d-1]^2
        //   sumDataSquared[1] = data[d]^2 + data[d+1]^2 + ... + data[2*d-1]^2
        // and so on. Note that this is the *intended* use of the
        // sumDataSquared field, but that the Dataset class does NOT
        // automatically populate or update the values in sumDataSquared.
        // It is NULL when the dataset was built without keepSDS.
        double *sumDataSquared;
};
|
|
|
|
#endif
|
|
|