mirror of
https://github.com/GoldenCheetah/GoldenCheetah.git
synced 2026-02-13 08:08:42 +00:00
.. kmeans(centers|assignments, k, dim1, dim2 .. dimn)
perform a k means cluster on data with multiple dimensions
and return the centers, or the assignments.
the return values are ordered so they can be displayed
easily in an overview table e.g.
values {
kmeans(centers, 3, metrics(TSS), metrics(IF));
}
.. will look at how we might plot these in charts with either
color coding of points or perhaps voronoi diagrams.
257 lines
7.2 KiB
C++
257 lines
7.2 KiB
C++
/* Authors: Greg Hamerly and Jonathan Drake
|
|
* Feedback: hamerly@cs.baylor.edu
|
|
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
|
|
* Copyright 2014
|
|
*/
|
|
|
|
#include "kmeans_dataset.h"
|
|
#include "kmeans.h"
|
|
#include "kmeans_general_functions.h"
|
|
#include <cassert>
|
|
#include <cmath>
|
|
#include <algorithm>
|
|
#include <numeric>
|
|
#include <cstring>
|
|
#include <cstdio>
|
|
|
|
void addVectors(double *a, double const *b, int d) {
|
|
double const *end = a + d;
|
|
while (a < end) {
|
|
*(a++) += *(b++);
|
|
}
|
|
}
|
|
|
|
void subVectors(double *a, double const *b, int d) {
|
|
double const *end = a + d;
|
|
while (a < end) {
|
|
*(a++) -= *(b++);
|
|
}
|
|
}
|
|
|
|
double distance2silent(double const *a, double const *b, int d) {
|
|
double d2 = 0.0, diff;
|
|
double const *end = a + d; // one past the last valid entry in a
|
|
while (a < end) {
|
|
diff = *(a++) - *(b++);
|
|
d2 += diff * diff;
|
|
}
|
|
return d2;
|
|
}
|
|
|
|
|
|
void centerDataset(Dataset *x) {
|
|
double *xCentroid = new double[x->d];
|
|
|
|
for (int d = 0; d < x->d; ++d) {
|
|
xCentroid[d] = 0.0;
|
|
}
|
|
|
|
for (int i = 0; i < x->n; ++i) {
|
|
addVectors(xCentroid, x->data + i * x->d, x->d);
|
|
}
|
|
|
|
// compute average (divide by n)
|
|
for (int d = 0; d < x->d; ++d) {
|
|
xCentroid[d] /= x->n;
|
|
}
|
|
|
|
// re-center the dataset
|
|
const double *xEnd = x->data + x->n * x->d;
|
|
for (double *xp = x->data; xp != xEnd; xp += x->d) {
|
|
subVectors(xp, xCentroid, x->d);
|
|
}
|
|
|
|
delete [] xCentroid;
|
|
}
|
|
|
|
Dataset *init_centers(Dataset const &x, unsigned short k) {
|
|
int *chosen_pts = new int[k];
|
|
Dataset *c = new Dataset(k, x.d);
|
|
for (int i = 0; i < k; ++i) {
|
|
bool acceptable = true;
|
|
do {
|
|
acceptable = true;
|
|
chosen_pts[i] = rand() % x.n;
|
|
for (int j = 0; j < i; ++j) {
|
|
if (chosen_pts[i] == chosen_pts[j]) {
|
|
acceptable = false;
|
|
break;
|
|
}
|
|
}
|
|
} while (! acceptable);
|
|
double *cdp = c->data + i * x.d;
|
|
memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
|
|
if (c->sumDataSquared) {
|
|
c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
|
|
}
|
|
}
|
|
|
|
delete [] chosen_pts;
|
|
|
|
return c;
|
|
}
|
|
|
|
|
|
Dataset *init_centers_kmeanspp(Dataset const &x, unsigned short k) {
|
|
int *chosen_pts = new int[k];
|
|
std::pair<double, int> *dist2 = new std::pair<double, int>[x.n];
|
|
double *distribution = new double[x.n];
|
|
|
|
// initialize dist2
|
|
for (int i = 0; i < x.n; ++i) {
|
|
dist2[i].first = std::numeric_limits<double>::max();
|
|
dist2[i].second = i;
|
|
}
|
|
|
|
// choose the first point randomly
|
|
int ndx = 1;
|
|
chosen_pts[ndx - 1] = rand() % x.n;
|
|
|
|
while (ndx < k) {
|
|
double sum_distribution = 0.0;
|
|
// look for the point that is furthest from any center
|
|
for (int i = 0; i < x.n; ++i) {
|
|
int example = dist2[i].second;
|
|
double d2 = 0.0, diff;
|
|
for (int j = 0; j < x.d; ++j) {
|
|
diff = x(example,j) - x(chosen_pts[ndx - 1],j);
|
|
d2 += diff * diff;
|
|
}
|
|
if (d2 < dist2[i].first) {
|
|
dist2[i].first = d2;
|
|
}
|
|
|
|
sum_distribution += dist2[i].first;
|
|
}
|
|
|
|
// sort the examples by their distance from centers
|
|
sort(dist2, dist2 + x.n);
|
|
|
|
// turn distribution into a CDF
|
|
distribution[0] = dist2[0].first / sum_distribution;
|
|
for (int i = 1; i < x.n; ++i) {
|
|
distribution[i] = distribution[i - 1] + dist2[i].first / sum_distribution;
|
|
}
|
|
|
|
// choose a random interval according to the new distribution
|
|
double r = (double)rand() / (double)RAND_MAX;
|
|
double *new_center_ptr = std::lower_bound(distribution, distribution + x.n, r);
|
|
int distribution_ndx = (int)(new_center_ptr - distribution);
|
|
chosen_pts[ndx] = dist2[distribution_ndx].second;
|
|
/*
|
|
cout << "chose " << distribution_ndx << " which is actually "
|
|
<< chosen_pts[ndx] << " with distance "
|
|
<< dist2[distribution_ndx].first << std::endl;
|
|
*/
|
|
|
|
++ndx;
|
|
}
|
|
|
|
Dataset *c = new Dataset(k, x.d);
|
|
|
|
for (int i = 0; i < k; ++i) {
|
|
double *cdp = c->data + i * x.d;
|
|
memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
|
|
if (c->sumDataSquared) {
|
|
c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
|
|
}
|
|
}
|
|
|
|
delete [] chosen_pts;
|
|
delete [] dist2;
|
|
delete [] distribution;
|
|
|
|
return c;
|
|
}
|
|
|
|
|
|
Dataset *init_centers_kmeanspp_v2(Dataset const &x, unsigned short k) {
|
|
int *chosen_pts = new int[k];
|
|
std::pair<double, int> *dist2 = new std::pair<double, int>[x.n];
|
|
|
|
// initialize dist2
|
|
for (int i = 0; i < x.n; ++i) {
|
|
dist2[i].first = std::numeric_limits<double>::max();
|
|
dist2[i].second = i;
|
|
}
|
|
|
|
// choose the first point randomly
|
|
int ndx = 1;
|
|
chosen_pts[ndx - 1] = rand() % x.n;
|
|
|
|
while (ndx < k) {
|
|
double sum_distribution = 0.0;
|
|
// look for the point that is furthest from any center
|
|
double max_dist = 0.0;
|
|
for (int i = 0; i < x.n; ++i) {
|
|
int example = dist2[i].second;
|
|
double d2 = 0.0, diff;
|
|
for (int j = 0; j < x.d; ++j) {
|
|
diff = x(example,j) - x(chosen_pts[ndx - 1],j);
|
|
d2 += diff * diff;
|
|
}
|
|
if (d2 < dist2[i].first) {
|
|
dist2[i].first = d2;
|
|
}
|
|
|
|
if (dist2[i].first > max_dist) {
|
|
max_dist = dist2[i].first;
|
|
}
|
|
|
|
sum_distribution += dist2[i].first;
|
|
}
|
|
|
|
bool unique = true;
|
|
|
|
do {
|
|
// choose a random interval according to the new distribution
|
|
double r = sum_distribution * (double)rand() / (double)RAND_MAX;
|
|
double sum_cdf = dist2[0].first;
|
|
int cdf_ndx = 0;
|
|
while (sum_cdf < r) {
|
|
sum_cdf += dist2[++cdf_ndx].first;
|
|
}
|
|
chosen_pts[ndx] = cdf_ndx;
|
|
|
|
for (int i = 0; i < ndx; ++i) {
|
|
unique = unique && (chosen_pts[ndx] != chosen_pts[i]);
|
|
}
|
|
} while (! unique);
|
|
|
|
++ndx;
|
|
}
|
|
|
|
Dataset *c = new Dataset(k, x.d);
|
|
for (int i = 0; i < c->n; ++i) {
|
|
double *cdp = c->data + i * x.d;
|
|
memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
|
|
if (c->sumDataSquared) {
|
|
c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
|
|
}
|
|
}
|
|
|
|
delete [] chosen_pts;
|
|
delete [] dist2;
|
|
|
|
return c;
|
|
}
|
|
|
|
|
|
void kmeans_assign(Dataset const &x, Dataset const &c, unsigned short *assignment) {
|
|
for (int i = 0; i < x.n; ++i) {
|
|
double shortestDist2 = std::numeric_limits<double>::max();
|
|
int closest = 0;
|
|
for (int j = 0; j < c.n; ++j) {
|
|
double d2 = 0.0, *a = x.data + i * x.d, *b = c.data + j * x.d;
|
|
for (; a != x.data + (i + 1) * x.d; ++a, ++b) {
|
|
d2 += (*a - *b) * (*a - *b);
|
|
}
|
|
if (d2 < shortestDist2) {
|
|
shortestDist2 = d2;
|
|
closest = j;
|
|
}
|
|
}
|
|
assignment[i] = closest;
|
|
}
|
|
}
|