/* Authors: Greg Hamerly and Jonathan Drake * Feedback: hamerly@cs.baylor.edu * See: http://cs.baylor.edu/~hamerly/software/kmeans.php * Copyright 2014 */ #include "kmeans_dataset.h" #include "kmeans.h" #include "kmeans_general_functions.h" #include #include #include #include #include #include void addVectors(double *a, double const *b, int d) { double const *end = a + d; while (a < end) { *(a++) += *(b++); } } void subVectors(double *a, double const *b, int d) { double const *end = a + d; while (a < end) { *(a++) -= *(b++); } } double distance2silent(double const *a, double const *b, int d) { double d2 = 0.0, diff; double const *end = a + d; // one past the last valid entry in a while (a < end) { diff = *(a++) - *(b++); d2 += diff * diff; } return d2; } void centerDataset(Dataset *x) { double *xCentroid = new double[x->d]; for (int d = 0; d < x->d; ++d) { xCentroid[d] = 0.0; } for (int i = 0; i < x->n; ++i) { addVectors(xCentroid, x->data + i * x->d, x->d); } // compute average (divide by n) for (int d = 0; d < x->d; ++d) { xCentroid[d] /= x->n; } // re-center the dataset const double *xEnd = x->data + x->n * x->d; for (double *xp = x->data; xp != xEnd; xp += x->d) { subVectors(xp, xCentroid, x->d); } delete [] xCentroid; } Dataset *init_centers(Dataset const &x, unsigned short k) { int *chosen_pts = new int[k]; Dataset *c = new Dataset(k, x.d); for (int i = 0; i < k; ++i) { bool acceptable = true; do { acceptable = true; chosen_pts[i] = rand() % x.n; for (int j = 0; j < i; ++j) { if (chosen_pts[i] == chosen_pts[j]) { acceptable = false; break; } } } while (! acceptable); double *cdp = c->data + i * x.d; memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d); if (c->sumDataSquared) { c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0); } } delete [] chosen_pts; return c; } Dataset *init_centers_kmeanspp(Dataset const &x, unsigned short k) { int *chosen_pts = new int[k]; std::pair *dist2 = new std::pair[x.n]; double *distribution = new double[x.n]; // initialize dist2 for (int i = 0; i < x.n; ++i) { dist2[i].first = std::numeric_limits::max(); dist2[i].second = i; } // choose the first point randomly int ndx = 1; chosen_pts[ndx - 1] = rand() % x.n; while (ndx < k) { double sum_distribution = 0.0; // look for the point that is furthest from any center for (int i = 0; i < x.n; ++i) { int example = dist2[i].second; double d2 = 0.0, diff; for (int j = 0; j < x.d; ++j) { diff = x(example,j) - x(chosen_pts[ndx - 1],j); d2 += diff * diff; } if (d2 < dist2[i].first) { dist2[i].first = d2; } sum_distribution += dist2[i].first; } // sort the examples by their distance from centers sort(dist2, dist2 + x.n); // turn distribution into a CDF distribution[0] = dist2[0].first / sum_distribution; for (int i = 1; i < x.n; ++i) { distribution[i] = distribution[i - 1] + dist2[i].first / sum_distribution; } // choose a random interval according to the new distribution double r = (double)rand() / (double)RAND_MAX; double *new_center_ptr = std::lower_bound(distribution, distribution + x.n, r); int distribution_ndx = (int)(new_center_ptr - distribution); chosen_pts[ndx] = dist2[distribution_ndx].second; /* cout << "chose " << distribution_ndx << " which is actually " << chosen_pts[ndx] << " with distance " << dist2[distribution_ndx].first << std::endl; */ ++ndx; } Dataset *c = new Dataset(k, x.d); for (int i = 0; i < k; ++i) { double *cdp = c->data + i * x.d; memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d); if (c->sumDataSquared) { c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0); } } delete [] chosen_pts; delete [] dist2; delete [] distribution; return c; } Dataset *init_centers_kmeanspp_v2(Dataset const &x, unsigned short k) { int *chosen_pts = new int[k]; std::pair *dist2 = new std::pair[x.n]; // initialize dist2 for (int i = 0; i < x.n; ++i) { dist2[i].first = std::numeric_limits::max(); dist2[i].second = i; } // choose the first point randomly int ndx = 1; chosen_pts[ndx - 1] = rand() % x.n; while (ndx < k) { double sum_distribution = 0.0; // look for the point that is furthest from any center double max_dist = 0.0; for (int i = 0; i < x.n; ++i) { int example = dist2[i].second; double d2 = 0.0, diff; for (int j = 0; j < x.d; ++j) { diff = x(example,j) - x(chosen_pts[ndx - 1],j); d2 += diff * diff; } if (d2 < dist2[i].first) { dist2[i].first = d2; } if (dist2[i].first > max_dist) { max_dist = dist2[i].first; } sum_distribution += dist2[i].first; } bool unique = true; do { // choose a random interval according to the new distribution double r = sum_distribution * (double)rand() / (double)RAND_MAX; double sum_cdf = dist2[0].first; int cdf_ndx = 0; while (sum_cdf < r) { sum_cdf += dist2[++cdf_ndx].first; } chosen_pts[ndx] = cdf_ndx; for (int i = 0; i < ndx; ++i) { unique = unique && (chosen_pts[ndx] != chosen_pts[i]); } } while (! unique); ++ndx; } Dataset *c = new Dataset(k, x.d); for (int i = 0; i < c->n; ++i) { double *cdp = c->data + i * x.d; memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d); if (c->sumDataSquared) { c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0); } } delete [] chosen_pts; delete [] dist2; return c; } void kmeans_assign(Dataset const &x, Dataset const &c, unsigned short *assignment) { for (int i = 0; i < x.n; ++i) { double shortestDist2 = std::numeric_limits::max(); int closest = 0; for (int j = 0; j < c.n; ++j) { double d2 = 0.0, *a = x.data + i * x.d, *b = c.data + j * x.d; for (; a != x.data + (i + 1) * x.d; ++a, ++b) { d2 += (*a - *b) * (*a - *b); } if (d2 < shortestDist2) { shortestDist2 = d2; closest = j; } } assignment[i] = closest; } }