[doc fix] fix documentations in k_means (#592)

* fix documentations * clang-tidy fixes for 814f9077b7 Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
2024-12-24 03:56:52 +03:00 · 2020-08-08 12:27:46 -04:00 · 2020-08-08 12:27:46 -04:00 · 0b426c0124
commit 0b426c0124
parent 05d9af45f3
1 changed files with 253 additions and 198 deletions
--- a/machine_learning/k_means_clustering.c
+++ b/machine_learning/k_means_clustering.c
@ -35,10 +35,11 @@
 *  the name observation is used to denote
 *  a random point in plane
 */
-typedef struct observation {
-  double x;  /**< abscissa of 2D data point */
-  double y;  /**< ordinate of 2D data point */
-  int group; /**< the group no in which this observation would go */
+typedef struct observation
+{
+    double x;  /**< abscissa of 2D data point */
+    double y;  /**< ordinate of 2D data point */
+    int group; /**< the group no in which this observation would go */
 } observation;

 /*! @struct cluster
@ -48,13 +49,14 @@ typedef struct observation {
 *  stores the count of observations
 *  belonging to this cluster
 */
-typedef struct cluster {
-  double x;     /**< abscissa centroid of this cluster */
-  double y;     /**< ordinate of centroid of this cluster */
-  size_t count; /**< count of observations present in this cluster */
+typedef struct cluster
+{
+    double x;     /**< abscissa centroid of this cluster */
+    double y;     /**< ordinate of centroid of this cluster */
+    size_t count; /**< count of observations present in this cluster */
 } cluster;

-/*! @fn calculateNearest
+/*!
 * Returns the index of centroid nearest to
 * given observation
 *
@ -64,24 +66,27 @@ typedef struct cluster {
 *
 * @returns the index of nearest centroid for given observation
 */
-int calculateNearst(observation* o, cluster clusters[], int k) {
-  double minD = DBL_MAX;
-  double dist = 0;
-  int index = -1;
-  int i = 0;
-  for (; i < k; i++) {
-    /* Calculate Squared Distance*/
-    dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) +
-           (clusters[i].y - o->y) * (clusters[i].y - o->y);
-    if (dist < minD) {
-      minD = dist;
-      index = i;
+int calculateNearst(observation* o, cluster clusters[], int k)
+{
+    double minD = DBL_MAX;
+    double dist = 0;
+    int index = -1;
+    int i = 0;
+    for (; i < k; i++)
+    {
+        /* Calculate Squared Distance*/
+        dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) +
+               (clusters[i].y - o->y) * (clusters[i].y - o->y);
+        if (dist < minD)
+        {
+            minD = dist;
+            index = i;
+        }
    }
-  }
-  return index;
+    return index;
 }

-/*! @fn calculateCentroid
+/*!
 * Calculate centoid and assign it to the cluster variable
 *
 * @param observations  an array of observations whose centroid is calculated
@ -90,22 +95,24 @@ int calculateNearst(observation* o, cluster clusters[], int k) {
 * centroid
 */
 void calculateCentroid(observation observations[], size_t size,
-                       cluster* centroid) {
-  size_t i = 0;
-  centroid->x = 0;
-  centroid->y = 0;
-  centroid->count = size;
-  for (; i < size; i++) {
-    centroid->x += observations[i].x;
-    centroid->y += observations[i].y;
-    observations[i].group = 0;
-  }
-  centroid->x /= centroid->count;
-  centroid->y /= centroid->count;
+                       cluster* centroid)
+{
+    size_t i = 0;
+    centroid->x = 0;
+    centroid->y = 0;
+    centroid->count = size;
+    for (; i < size; i++)
+    {
+        centroid->x += observations[i].x;
+        centroid->y += observations[i].y;
+        observations[i].group = 0;
+    }
+    centroid->x /= centroid->count;
+    centroid->y /= centroid->count;
 }

-/*!  @fn kMeans
- *   --K Means Algorithm--
+/*!
+ *    --K Means Algorithm--
 * 1. Assign each observation to one of k groups
 *    creating a random initial clustering
 * 2. Find the centroid of observations for each
@ -117,77 +124,93 @@ void calculateCentroid(observation observations[], size_t size,
 * 5. Repeat step 2,3,4 until there is no change
 *    the current clustering and is same as last
 *    clustering.
+ *
 * @param observations  an array of observations to cluster
 * @param size  size of observations array
 * @param k  no of clusters to be made
 *
 * @returns pointer to cluster object
 */
-cluster* kMeans(observation observations[], size_t size, int k) {
-  cluster* clusters = NULL;
-  if (k <= 1) {
-    /*
-    If we have to cluster them only in one group
-    then calculate centroid of observations and
-    that will be a ingle cluster
-    */
-    clusters = (cluster*)malloc(sizeof(cluster));
-    memset(clusters, 0, sizeof(cluster));
-    calculateCentroid(observations, size, clusters);
-  } else if (k < size) {
-    clusters = malloc(sizeof(cluster) * k);
-    memset(clusters, 0, k * sizeof(cluster));
-    /* STEP 1 */
-    for (size_t j = 0; j < size; j++) {
-      observations[j].group = rand() % k;
+cluster* kMeans(observation observations[], size_t size, int k)
+{
+    cluster* clusters = NULL;
+    if (k <= 1)
+    {
+        /*
+        If we have to cluster them only in one group
+        then calculate centroid of observations and
+        that will be a ingle cluster
+        */
+        clusters = (cluster*)malloc(sizeof(cluster));
+        memset(clusters, 0, sizeof(cluster));
+        calculateCentroid(observations, size, clusters);
    }
-    size_t changed = 0;
-    size_t minAcceptedError =
-        size / 10000;  // Do until 99.99 percent points are in correct cluster
-    int t = 0;
-    do {
-      /* Initialize clusters */
-      for (int i = 0; i < k; i++) {
-        clusters[i].x = 0;
-        clusters[i].y = 0;
-        clusters[i].count = 0;
-      }
-      /* STEP 2*/
-      for (size_t j = 0; j < size; j++) {
-        t = observations[j].group;
-        clusters[t].x += observations[j].x;
-        clusters[t].y += observations[j].y;
-        clusters[t].count++;
-      }
-      for (int i = 0; i < k; i++) {
-        clusters[i].x /= clusters[i].count;
-        clusters[i].y /= clusters[i].count;
-      }
-      /* STEP 3 and 4 */
-      changed = 0;  // this variable stores change in clustering
-      for (size_t j = 0; j < size; j++) {
-        t = calculateNearst(observations + j, clusters, k);
-        if (t != observations[j].group) {
-          changed++;
-          observations[j].group = t;
+    else if (k < size)
+    {
+        clusters = malloc(sizeof(cluster) * k);
+        memset(clusters, 0, k * sizeof(cluster));
+        /* STEP 1 */
+        for (size_t j = 0; j < size; j++)
+        {
+            observations[j].group = rand() % k;
        }
-      }
-    } while (changed > minAcceptedError);  // Keep on grouping until we have
-                                           // got almost best clustering
-  } else {
-    /* If no of clusters is more than observations
-       each observation can be its own cluster
-    */
-    clusters = (cluster*)malloc(sizeof(cluster) * k);
-    memset(clusters, 0, k * sizeof(cluster));
-    for (int j = 0; j < size; j++) {
-      clusters[j].x = observations[j].x;
-      clusters[j].y = observations[j].y;
-      clusters[j].count = 1;
-      observations[j].group = j;
+        size_t changed = 0;
+        size_t minAcceptedError =
+            size /
+            10000;  // Do until 99.99 percent points are in correct cluster
+        int t = 0;
+        do
+        {
+            /* Initialize clusters */
+            for (int i = 0; i < k; i++)
+            {
+                clusters[i].x = 0;
+                clusters[i].y = 0;
+                clusters[i].count = 0;
+            }
+            /* STEP 2*/
+            for (size_t j = 0; j < size; j++)
+            {
+                t = observations[j].group;
+                clusters[t].x += observations[j].x;
+                clusters[t].y += observations[j].y;
+                clusters[t].count++;
+            }
+            for (int i = 0; i < k; i++)
+            {
+                clusters[i].x /= clusters[i].count;
+                clusters[i].y /= clusters[i].count;
+            }
+            /* STEP 3 and 4 */
+            changed = 0;  // this variable stores change in clustering
+            for (size_t j = 0; j < size; j++)
+            {
+                t = calculateNearst(observations + j, clusters, k);
+                if (t != observations[j].group)
+                {
+                    changed++;
+                    observations[j].group = t;
+                }
+            }
+        } while (changed > minAcceptedError);  // Keep on grouping until we have
+                                               // got almost best clustering
    }
-  }
-  return clusters;
+    else
+    {
+        /* If no of clusters is more than observations
+           each observation can be its own cluster
+        */
+        clusters = (cluster*)malloc(sizeof(cluster) * k);
+        memset(clusters, 0, k * sizeof(cluster));
+        for (int j = 0; j < size; j++)
+        {
+            clusters[j].x = observations[j].x;
+            clusters[j].y = observations[j].y;
+            clusters[j].count = 1;
+            observations[j].group = j;
+        }
+    }
+    return clusters;
 }

 /**
@ -195,73 +218,96 @@ cluster* kMeans(observation observations[], size_t size, int k) {
 * @}
 */

-/*! @fn printEPS
+/*!
 * A function to print observations and clusters
 * The code is taken from
- * @link http://rosettacode.org/wiki/K-means%2B%2B_clustering
- * its C implementation
+ * http://rosettacode.org/wiki/K-means%2B%2B_clustering.
 * Even the K Means code is also inspired from it
 *
- * Note: To print in a file use pipeline operator ( ./a.out > image.eps )
+ * @note To print in a file use pipeline operator
+ * ```sh
+ * ./k_means_clustering > image.eps
+ * ```
 *
 * @param observations  observations array
 * @param len  size of observation array
 * @param cent  clusters centroid's array
 * @param k  size of cent array
 */
-void printEPS(observation pts[], size_t len, cluster cent[], int k) {
-  int W = 400, H = 400;
-  double min_x = DBL_MAX, max_x = DBL_MIN, min_y = DBL_MAX, max_y = DBL_MIN;
-  double scale = 0, cx = 0, cy = 0;
-  double* colors = (double*)malloc(sizeof(double) * (k * 3));
-  int i;
-  size_t j;
-  double kd = k * 1.0;
-  for (i = 0; i < k; i++) {
-    *(colors + 3 * i) = (3 * (i + 1) % k) / kd;
-    *(colors + 3 * i + 1) = (7 * i % k) / kd;
-    *(colors + 3 * i + 2) = (9 * i % k) / kd;
-  }
-
-  for (j = 0; j < len; j++) {
-    if (max_x < pts[j].x) max_x = pts[j].x;
-    if (min_x > pts[j].x) min_x = pts[j].x;
-    if (max_y < pts[j].y) max_y = pts[j].y;
-    if (min_y > pts[j].y) min_y = pts[j].y;
-  }
-  scale = W / (max_x - min_x);
-  if (scale > (H / (max_y - min_y))) {
-    scale = H / (max_y - min_y);
-  };
-  cx = (max_x + min_x) / 2;
-  cy = (max_y + min_y) / 2;
-
-  printf("%%!PS-Adobe-3.0 EPSF-3.0\n%%%%BoundingBox: -5 -5 %d %d\n", W + 10,
-         H + 10);
-  printf(
-      "/l {rlineto} def /m {rmoveto} def\n"
-      "/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def\n"
-      "/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath "
-      "	gsave 1 setgray fill grestore gsave 3 setlinewidth"
-      " 1 setgray stroke grestore 0 setgray stroke }def\n");
-  for (int i = 0; i < k; i++) {
-    printf("%g %g %g setrgbcolor\n", *(colors + 3 * i), *(colors + 3 * i + 1),
-           *(colors + 3 * i + 2));
-    for (j = 0; j < len; j++) {
-      if (pts[j].group != i) continue;
-      printf("%.3f %.3f c\n", (pts[j].x - cx) * scale + W / 2,
-             (pts[j].y - cy) * scale + H / 2);
+void printEPS(observation pts[], size_t len, cluster cent[], int k)
+{
+    int W = 400, H = 400;
+    double min_x = DBL_MAX, max_x = DBL_MIN, min_y = DBL_MAX, max_y = DBL_MIN;
+    double scale = 0, cx = 0, cy = 0;
+    double* colors = (double*)malloc(sizeof(double) * (k * 3));
+    int i;
+    size_t j;
+    double kd = k * 1.0;
+    for (i = 0; i < k; i++)
+    {
+        *(colors + 3 * i) = (3 * (i + 1) % k) / kd;
+        *(colors + 3 * i + 1) = (7 * i % k) / kd;
+        *(colors + 3 * i + 2) = (9 * i % k) / kd;
    }
-    printf("\n0 setgray %g %g s\n", (cent[i].x - cx) * scale + W / 2,
-           (cent[i].y - cy) * scale + H / 2);
-  }
-  printf("\n%%%%EOF");

-  // free accquired memory
-  free(colors);
+    for (j = 0; j < len; j++)
+    {
+        if (max_x < pts[j].x)
+        {
+            max_x = pts[j].x;
+        }
+        if (min_x > pts[j].x)
+        {
+            min_x = pts[j].x;
+        }
+        if (max_y < pts[j].y)
+        {
+            max_y = pts[j].y;
+        }
+        if (min_y > pts[j].y)
+        {
+            min_y = pts[j].y;
+        }
+    }
+    scale = W / (max_x - min_x);
+    if (scale > (H / (max_y - min_y)))
+    {
+        scale = H / (max_y - min_y);
+    };
+    cx = (max_x + min_x) / 2;
+    cy = (max_y + min_y) / 2;
+
+    printf("%%!PS-Adobe-3.0 EPSF-3.0\n%%%%BoundingBox: -5 -5 %d %d\n", W + 10,
+           H + 10);
+    printf(
+        "/l {rlineto} def /m {rmoveto} def\n"
+        "/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def\n"
+        "/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath "
+        "	gsave 1 setgray fill grestore gsave 3 setlinewidth"
+        " 1 setgray stroke grestore 0 setgray stroke }def\n");
+    for (int i = 0; i < k; i++)
+    {
+        printf("%g %g %g setrgbcolor\n", *(colors + 3 * i),
+               *(colors + 3 * i + 1), *(colors + 3 * i + 2));
+        for (j = 0; j < len; j++)
+        {
+            if (pts[j].group != i)
+            {
+                continue;
+            }
+            printf("%.3f %.3f c\n", (pts[j].x - cx) * scale + W / 2,
+                   (pts[j].y - cy) * scale + H / 2);
+        }
+        printf("\n0 setgray %g %g s\n", (cent[i].x - cx) * scale + W / 2,
+               (cent[i].y - cy) * scale + H / 2);
+    }
+    printf("\n%%%%EOF");
+
+    // free accquired memory
+    free(colors);
 }

-/*! @fn test
+/*!
 * A function to test the kMeans function
 * Generates 100000 points in a circle of
 * radius 20.0 with center at (0,0)
@ -270,29 +316,33 @@ void printEPS(observation pts[], size_t len, cluster cent[], int k) {
 * <img alt="Output for 100000 points divided in 5 clusters" src=
 * "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest1.png"
 * width="400px" heiggt="400px">
+ * @returns None
 */
-static void test() {
-  size_t size = 100000L;
-  observation* observations = (observation*)malloc(sizeof(observation) * size);
-  double maxRadius = 20.00;
-  double radius = 0;
-  double ang = 0;
-  size_t i = 0;
-  for (; i < size; i++) {
-    radius = maxRadius * ((double)rand() / RAND_MAX);
-    ang = 2 * M_PI * ((double)rand() / RAND_MAX);
-    observations[i].x = radius * cos(ang);
-    observations[i].y = radius * sin(ang);
-  }
-  int k = 5;  // No of clusters
-  cluster* clusters = kMeans(observations, size, k);
-  printEPS(observations, size, clusters, k);
-  // Free the accquired memory
-  free(observations);
-  free(clusters);
+static void test()
+{
+    size_t size = 100000L;
+    observation* observations =
+        (observation*)malloc(sizeof(observation) * size);
+    double maxRadius = 20.00;
+    double radius = 0;
+    double ang = 0;
+    size_t i = 0;
+    for (; i < size; i++)
+    {
+        radius = maxRadius * ((double)rand() / RAND_MAX);
+        ang = 2 * M_PI * ((double)rand() / RAND_MAX);
+        observations[i].x = radius * cos(ang);
+        observations[i].y = radius * sin(ang);
+    }
+    int k = 5;  // No of clusters
+    cluster* clusters = kMeans(observations, size, k);
+    printEPS(observations, size, clusters, k);
+    // Free the accquired memory
+    free(observations);
+    free(clusters);
 }

-/*! @fn test2
+/*!
 * A function to test the kMeans function
 * Generates 1000000 points in a circle of
 * radius 20.0 with center at (0,0)
@ -301,35 +351,40 @@ static void test() {
 * <img alt="Output for 1000000 points divided in 11 clusters" src=
 * "https://raw.githubusercontent.com/TheAlgorithms/C/docs/images/machine_learning/k_means_clustering/kMeansTest2.png"
 * width="400px" heiggt="400px">
+ * @returns None
 */
-void test2() {
-  size_t size = 1000000L;
-  observation* observations = (observation*)malloc(sizeof(observation) * size);
-  double maxRadius = 20.00;
-  double radius = 0;
-  double ang = 0;
-  size_t i = 0;
-  for (; i < size; i++) {
-    radius = maxRadius * ((double)rand() / RAND_MAX);
-    ang = 2 * M_PI * ((double)rand() / RAND_MAX);
-    observations[i].x = radius * cos(ang);
-    observations[i].y = radius * sin(ang);
-  }
-  int k = 11;  // No of clusters
-  cluster* clusters = kMeans(observations, size, k);
-  printEPS(observations, size, clusters, k);
-  // Free the accquired memory
-  free(observations);
-  free(clusters);
+void test2()
+{
+    size_t size = 1000000L;
+    observation* observations =
+        (observation*)malloc(sizeof(observation) * size);
+    double maxRadius = 20.00;
+    double radius = 0;
+    double ang = 0;
+    size_t i = 0;
+    for (; i < size; i++)
+    {
+        radius = maxRadius * ((double)rand() / RAND_MAX);
+        ang = 2 * M_PI * ((double)rand() / RAND_MAX);
+        observations[i].x = radius * cos(ang);
+        observations[i].y = radius * sin(ang);
+    }
+    int k = 11;  // No of clusters
+    cluster* clusters = kMeans(observations, size, k);
+    printEPS(observations, size, clusters, k);
+    // Free the accquired memory
+    free(observations);
+    free(clusters);
 }

-/*! @fn main
+/*!
 * This function calls the test
 * function
 */
-int main() {
-  srand(time(NULL));
-  test();
-  /* test2(); */
-  return 0;
+int main()
+{
+    srand(time(NULL));
+    test();
+    /* test2(); */
+    return 0;
 }