Files
GoldenCheetah/contrib/kmeans/kmeans_general_functions.cpp
Mark Liversedge 4c6c8e6d71 Additional Windows fixups for Fast Kmeans
.. std::max needs <algorithm> and don't need <unistd> at all.
2021-09-28 11:45:34 +01:00

257 lines
7.1 KiB
C++

/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*/
#include "kmeans_dataset.h"
#include "kmeans.h"
#include "kmeans_general_functions.h"
#include <cassert>
#include <cmath>
#include <algorithm>
#include <numeric>
#include <cstring>
#include <cstdio>
void addVectors(double *a, double const *b, int d) {
double const *end = a + d;
while (a < end) {
*(a++) += *(b++);
}
}
void subVectors(double *a, double const *b, int d) {
double const *end = a + d;
while (a < end) {
*(a++) -= *(b++);
}
}
double distance2silent(double const *a, double const *b, int d) {
double d2 = 0.0, diff;
double const *end = a + d; // one past the last valid entry in a
while (a < end) {
diff = *(a++) - *(b++);
d2 += diff * diff;
}
return d2;
}
void centerDataset(Dataset *x) {
double *xCentroid = new double[x->d];
for (int d = 0; d < x->d; ++d) {
xCentroid[d] = 0.0;
}
for (int i = 0; i < x->n; ++i) {
addVectors(xCentroid, x->data + i * x->d, x->d);
}
// compute average (divide by n)
for (int d = 0; d < x->d; ++d) {
xCentroid[d] /= x->n;
}
// re-center the dataset
const double *xEnd = x->data + x->n * x->d;
for (double *xp = x->data; xp != xEnd; xp += x->d) {
subVectors(xp, xCentroid, x->d);
}
delete [] xCentroid;
}
Dataset *init_centers(Dataset const &x, unsigned short k) {
int *chosen_pts = new int[k];
Dataset *c = new Dataset(k, x.d);
for (int i = 0; i < k; ++i) {
bool acceptable = true;
do {
acceptable = true;
chosen_pts[i] = rand() % x.n;
for (int j = 0; j < i; ++j) {
if (chosen_pts[i] == chosen_pts[j]) {
acceptable = false;
break;
}
}
} while (! acceptable);
double *cdp = c->data + i * x.d;
memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
if (c->sumDataSquared) {
c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
}
}
delete [] chosen_pts;
return c;
}
Dataset *init_centers_kmeanspp(Dataset const &x, unsigned short k) {
int *chosen_pts = new int[k];
std::pair<double, int> *dist2 = new std::pair<double, int>[x.n];
double *distribution = new double[x.n];
// initialize dist2
for (int i = 0; i < x.n; ++i) {
dist2[i].first = std::numeric_limits<double>::max();
dist2[i].second = i;
}
// choose the first point randomly
int ndx = 1;
chosen_pts[ndx - 1] = rand() % x.n;
while (ndx < k) {
double sum_distribution = 0.0;
// look for the point that is furthest from any center
for (int i = 0; i < x.n; ++i) {
int example = dist2[i].second;
double d2 = 0.0, diff;
for (int j = 0; j < x.d; ++j) {
diff = x(example,j) - x(chosen_pts[ndx - 1],j);
d2 += diff * diff;
}
if (d2 < dist2[i].first) {
dist2[i].first = d2;
}
sum_distribution += dist2[i].first;
}
// sort the examples by their distance from centers
sort(dist2, dist2 + x.n);
// turn distribution into a CDF
distribution[0] = dist2[0].first / sum_distribution;
for (int i = 1; i < x.n; ++i) {
distribution[i] = distribution[i - 1] + dist2[i].first / sum_distribution;
}
// choose a random interval according to the new distribution
double r = (double)rand() / (double)RAND_MAX;
double *new_center_ptr = std::lower_bound(distribution, distribution + x.n, r);
int distribution_ndx = (int)(new_center_ptr - distribution);
chosen_pts[ndx] = dist2[distribution_ndx].second;
/*
cout << "chose " << distribution_ndx << " which is actually "
<< chosen_pts[ndx] << " with distance "
<< dist2[distribution_ndx].first << std::endl;
*/
++ndx;
}
Dataset *c = new Dataset(k, x.d);
for (int i = 0; i < k; ++i) {
double *cdp = c->data + i * x.d;
memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
if (c->sumDataSquared) {
c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
}
}
delete [] chosen_pts;
delete [] dist2;
delete [] distribution;
return c;
}
Dataset *init_centers_kmeanspp_v2(Dataset const &x, unsigned short k) {
int *chosen_pts = new int[k];
std::pair<double, int> *dist2 = new std::pair<double, int>[x.n];
// initialize dist2
for (int i = 0; i < x.n; ++i) {
dist2[i].first = std::numeric_limits<double>::max();
dist2[i].second = i;
}
// choose the first point randomly
int ndx = 1;
chosen_pts[ndx - 1] = rand() % x.n;
while (ndx < k) {
double sum_distribution = 0.0;
// look for the point that is furthest from any center
double max_dist = 0.0;
for (int i = 0; i < x.n; ++i) {
int example = dist2[i].second;
double d2 = 0.0, diff;
for (int j = 0; j < x.d; ++j) {
diff = x(example,j) - x(chosen_pts[ndx - 1],j);
d2 += diff * diff;
}
if (d2 < dist2[i].first) {
dist2[i].first = d2;
}
if (dist2[i].first > max_dist) {
max_dist = dist2[i].first;
}
sum_distribution += dist2[i].first;
}
bool unique = true;
do {
// choose a random interval according to the new distribution
double r = sum_distribution * (double)rand() / (double)RAND_MAX;
double sum_cdf = dist2[0].first;
int cdf_ndx = 0;
while (sum_cdf < r) {
sum_cdf += dist2[++cdf_ndx].first;
}
chosen_pts[ndx] = cdf_ndx;
for (int i = 0; i < ndx; ++i) {
unique = unique && (chosen_pts[ndx] != chosen_pts[i]);
}
} while (! unique);
++ndx;
}
Dataset *c = new Dataset(k, x.d);
for (int i = 0; i < c->n; ++i) {
double *cdp = c->data + i * x.d;
memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
if (c->sumDataSquared) {
c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
}
}
delete [] chosen_pts;
delete [] dist2;
return c;
}
void assign(Dataset const &x, Dataset const &c, unsigned short *assignment) {
for (int i = 0; i < x.n; ++i) {
double shortestDist2 = std::numeric_limits<double>::max();
int closest = 0;
for (int j = 0; j < c.n; ++j) {
double d2 = 0.0, *a = x.data + i * x.d, *b = c.data + j * x.d;
for (; a != x.data + (i + 1) * x.d; ++a, ++b) {
d2 += (*a - *b) * (*a - *b);
}
if (d2 < shortestDist2) {
shortestDist2 = d2;
closest = j;
}
}
assignment[i] = closest;
}
}