Algorithms_in_C 1.0.0
Set of algorithms implemented in C.
K-Means Clustering Algorithm
Collaboration diagram for K-Means Clustering Algorithm:

Data Structures

struct  observation
 
struct  cluster
 

Typedefs

typedef struct observation observation
 
typedef struct cluster cluster
 

Functions

int calculateNearst (observation *o, cluster clusters[], int k)
 
void calculateCentroid (observation observations[], size_t size, cluster *centroid)
 
cluster * kMeans (observation observations[], size_t size, int k)
 

Detailed Description

Function Documentation

◆ calculateCentroid()

void calculateCentroid ( observation  observations[],
size_t  size,
cluster * centroid 
)

Calculate centroid and assign it to the cluster variable

Parameters
observations: an array of observations whose centroid is calculated
size: size of the observations array
centroid: a reference to cluster object to store information of centroid
99{
100 size_t i = 0;
101 centroid->x = 0;
102 centroid->y = 0;
103 centroid->count = size;
104 for (; i < size; i++)
105 {
106 centroid->x += observations[i].x;
107 centroid->y += observations[i].y;
108 observations[i].group = 0;
109 }
110 centroid->x /= centroid->count;
111 centroid->y /= centroid->count;
112}
double y
ordinate of centroid of this cluster
Definition: k_means_clustering.c:55
double x
abscissa of centroid of this cluster
Definition: k_means_clustering.c:54
size_t count
count of observations present in this cluster
Definition: k_means_clustering.c:56
double x
abscissa of 2D data point
Definition: k_means_clustering.c:40
int group
the group no in which this observation would go
Definition: k_means_clustering.c:42
double y
ordinate of 2D data point
Definition: k_means_clustering.c:41

◆ calculateNearst()

int calculateNearst ( observation * o,
cluster  clusters[],
int  k 
)

Returns the index of centroid nearest to given observation

Parameters
o: observation
clusters: array of clusters holding the centroid coordinates
k: size of clusters array
Returns
the index of nearest centroid for given observation
70{
71 double minD = DBL_MAX;
72 double dist = 0;
73 int index = -1;
74 int i = 0;
75 for (; i < k; i++)
76 {
77 /* Calculate Squared Distance*/
78 dist = (clusters[i].x - o->x) * (clusters[i].x - o->x) +
79 (clusters[i].y - o->y) * (clusters[i].y - o->y);
80 if (dist < minD)
81 {
82 minD = dist;
83 index = i;
84 }
85 }
86 return index;
87}

◆ kMeans()

cluster * kMeans ( observation  observations[],
size_t  size,
int  k 
)

–K Means Algorithm–

  1. Assign each observation to one of k groups creating a random initial clustering
  2. Find the centroid of observations for each cluster to form new centroids
  3. Find the centroid which is nearest for each observation among the calculated centroids
  4. Assign the observation to its nearest centroid to create a new clustering.
  5. Repeat steps 2, 3 and 4 until the current clustering no longer changes, i.e. it is the same as the last clustering.
Parameters
observations: an array of observations to cluster
size: size of observations array
k: number of clusters to be made
Returns
pointer to cluster object
135{
136 cluster* clusters = NULL;
137 if (k <= 1)
138 {
139 /*
140 If we have to cluster them only in one group
141 then calculate centroid of observations and
142 that will be a single cluster
143 */
144 clusters = (cluster*)malloc(sizeof(cluster));
145 memset(clusters, 0, sizeof(cluster));
146 calculateCentroid(observations, size, clusters);
147 }
148 else if (k < size)
149 {
150 clusters = malloc(sizeof(cluster) * k);
151 memset(clusters, 0, k * sizeof(cluster));
152 /* STEP 1 */
153 for (size_t j = 0; j < size; j++)
154 {
155 observations[j].group = rand() % k;
156 }
157 size_t changed = 0;
158 size_t minAcceptedError =
159 size /
160 10000; // Do until 99.99 percent points are in correct cluster
161 int t = 0;
162 do
163 {
164 /* Initialize clusters */
165 for (int i = 0; i < k; i++)
166 {
167 clusters[i].x = 0;
168 clusters[i].y = 0;
169 clusters[i].count = 0;
170 }
171 /* STEP 2*/
172 for (size_t j = 0; j < size; j++)
173 {
174 t = observations[j].group;
175 clusters[t].x += observations[j].x;
176 clusters[t].y += observations[j].y;
177 clusters[t].count++;
178 }
179 for (int i = 0; i < k; i++)
180 {
181 clusters[i].x /= clusters[i].count;
182 clusters[i].y /= clusters[i].count;
183 }
184 /* STEP 3 and 4 */
185 changed = 0; // this variable stores change in clustering
186 for (size_t j = 0; j < size; j++)
187 {
188 t = calculateNearst(observations + j, clusters, k);
189 if (t != observations[j].group)
190 {
191 changed++;
192 observations[j].group = t;
193 }
194 }
195 } while (changed > minAcceptedError); // Keep on grouping until we have
196 // got almost best clustering
197 }
198 else
199 {
200 /* If no of clusters is more than observations
201 each observation can be its own cluster
202 */
203 clusters = (cluster*)malloc(sizeof(cluster) * k);
204 memset(clusters, 0, k * sizeof(cluster));
205 for (int j = 0; j < size; j++)
206 {
207 clusters[j].x = observations[j].x;
208 clusters[j].y = observations[j].y;
209 clusters[j].count = 1;
210 observations[j].group = j;
211 }
212 }
213 return clusters;
214}
int calculateNearst(observation *o, cluster clusters[], int k)
Definition: k_means_clustering.c:69
void calculateCentroid(observation observations[], size_t size, cluster *centroid)
Definition: k_means_clustering.c:97
#define malloc(bytes)
This macro replaces the standard malloc function with malloc_dbg.
Definition: malloc_dbg.h:18
Definition: k_means_clustering.c:53
Here is the call graph for this function: