Files
GoldenCheetah/contrib/kmeans/kmeans_general_functions.cpp
Mark Liversedge 4c720884b6 DataFilter - kmeans()
.. kmeans(centers|assignments, k, dim1, dim2 .. dimn)

   perform a k means cluster on data with multiple dimensions
   and return the centers, or the assignments.

   the return values are ordered so they can be displayed
   easily in an overview table e.g.

   values {
       kmeans(centers, 3, metrics(TSS), metrics(IF));
   }

.. will look at how we might plot these in charts with either
   color coding of points or perhaps voronoi diagrams.
2021-09-28 17:31:09 +01:00

257 lines
7.2 KiB
C++

/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*/
#include "kmeans_dataset.h"
#include "kmeans.h"
#include "kmeans_general_functions.h"
#include <cassert>
#include <cmath>
#include <algorithm>
#include <numeric>
#include <cstring>
#include <cstdio>
void addVectors(double *a, double const *b, int d) {
double const *end = a + d;
while (a < end) {
*(a++) += *(b++);
}
}
void subVectors(double *a, double const *b, int d) {
double const *end = a + d;
while (a < end) {
*(a++) -= *(b++);
}
}
double distance2silent(double const *a, double const *b, int d) {
double d2 = 0.0, diff;
double const *end = a + d; // one past the last valid entry in a
while (a < end) {
diff = *(a++) - *(b++);
d2 += diff * diff;
}
return d2;
}
void centerDataset(Dataset *x) {
double *xCentroid = new double[x->d];
for (int d = 0; d < x->d; ++d) {
xCentroid[d] = 0.0;
}
for (int i = 0; i < x->n; ++i) {
addVectors(xCentroid, x->data + i * x->d, x->d);
}
// compute average (divide by n)
for (int d = 0; d < x->d; ++d) {
xCentroid[d] /= x->n;
}
// re-center the dataset
const double *xEnd = x->data + x->n * x->d;
for (double *xp = x->data; xp != xEnd; xp += x->d) {
subVectors(xp, xCentroid, x->d);
}
delete [] xCentroid;
}
Dataset *init_centers(Dataset const &x, unsigned short k) {
int *chosen_pts = new int[k];
Dataset *c = new Dataset(k, x.d);
for (int i = 0; i < k; ++i) {
bool acceptable = true;
do {
acceptable = true;
chosen_pts[i] = rand() % x.n;
for (int j = 0; j < i; ++j) {
if (chosen_pts[i] == chosen_pts[j]) {
acceptable = false;
break;
}
}
} while (! acceptable);
double *cdp = c->data + i * x.d;
memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
if (c->sumDataSquared) {
c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
}
}
delete [] chosen_pts;
return c;
}
Dataset *init_centers_kmeanspp(Dataset const &x, unsigned short k) {
int *chosen_pts = new int[k];
std::pair<double, int> *dist2 = new std::pair<double, int>[x.n];
double *distribution = new double[x.n];
// initialize dist2
for (int i = 0; i < x.n; ++i) {
dist2[i].first = std::numeric_limits<double>::max();
dist2[i].second = i;
}
// choose the first point randomly
int ndx = 1;
chosen_pts[ndx - 1] = rand() % x.n;
while (ndx < k) {
double sum_distribution = 0.0;
// look for the point that is furthest from any center
for (int i = 0; i < x.n; ++i) {
int example = dist2[i].second;
double d2 = 0.0, diff;
for (int j = 0; j < x.d; ++j) {
diff = x(example,j) - x(chosen_pts[ndx - 1],j);
d2 += diff * diff;
}
if (d2 < dist2[i].first) {
dist2[i].first = d2;
}
sum_distribution += dist2[i].first;
}
// sort the examples by their distance from centers
sort(dist2, dist2 + x.n);
// turn distribution into a CDF
distribution[0] = dist2[0].first / sum_distribution;
for (int i = 1; i < x.n; ++i) {
distribution[i] = distribution[i - 1] + dist2[i].first / sum_distribution;
}
// choose a random interval according to the new distribution
double r = (double)rand() / (double)RAND_MAX;
double *new_center_ptr = std::lower_bound(distribution, distribution + x.n, r);
int distribution_ndx = (int)(new_center_ptr - distribution);
chosen_pts[ndx] = dist2[distribution_ndx].second;
/*
cout << "chose " << distribution_ndx << " which is actually "
<< chosen_pts[ndx] << " with distance "
<< dist2[distribution_ndx].first << std::endl;
*/
++ndx;
}
Dataset *c = new Dataset(k, x.d);
for (int i = 0; i < k; ++i) {
double *cdp = c->data + i * x.d;
memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
if (c->sumDataSquared) {
c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
}
}
delete [] chosen_pts;
delete [] dist2;
delete [] distribution;
return c;
}
Dataset *init_centers_kmeanspp_v2(Dataset const &x, unsigned short k) {
int *chosen_pts = new int[k];
std::pair<double, int> *dist2 = new std::pair<double, int>[x.n];
// initialize dist2
for (int i = 0; i < x.n; ++i) {
dist2[i].first = std::numeric_limits<double>::max();
dist2[i].second = i;
}
// choose the first point randomly
int ndx = 1;
chosen_pts[ndx - 1] = rand() % x.n;
while (ndx < k) {
double sum_distribution = 0.0;
// look for the point that is furthest from any center
double max_dist = 0.0;
for (int i = 0; i < x.n; ++i) {
int example = dist2[i].second;
double d2 = 0.0, diff;
for (int j = 0; j < x.d; ++j) {
diff = x(example,j) - x(chosen_pts[ndx - 1],j);
d2 += diff * diff;
}
if (d2 < dist2[i].first) {
dist2[i].first = d2;
}
if (dist2[i].first > max_dist) {
max_dist = dist2[i].first;
}
sum_distribution += dist2[i].first;
}
bool unique = true;
do {
// choose a random interval according to the new distribution
double r = sum_distribution * (double)rand() / (double)RAND_MAX;
double sum_cdf = dist2[0].first;
int cdf_ndx = 0;
while (sum_cdf < r) {
sum_cdf += dist2[++cdf_ndx].first;
}
chosen_pts[ndx] = cdf_ndx;
for (int i = 0; i < ndx; ++i) {
unique = unique && (chosen_pts[ndx] != chosen_pts[i]);
}
} while (! unique);
++ndx;
}
Dataset *c = new Dataset(k, x.d);
for (int i = 0; i < c->n; ++i) {
double *cdp = c->data + i * x.d;
memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
if (c->sumDataSquared) {
c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
}
}
delete [] chosen_pts;
delete [] dist2;
return c;
}
void kmeans_assign(Dataset const &x, Dataset const &c, unsigned short *assignment) {
for (int i = 0; i < x.n; ++i) {
double shortestDist2 = std::numeric_limits<double>::max();
int closest = 0;
for (int j = 0; j < c.n; ++j) {
double d2 = 0.0, *a = x.data + i * x.d, *b = c.data + j * x.d;
for (; a != x.data + (i + 1) * x.d; ++a, ++b) {
d2 += (*a - *b) * (*a - *b);
}
if (d2 < shortestDist2) {
shortestDist2 = d2;
closest = j;
}
}
assignment[i] = closest;
}
}