diff --git a/bin/cluster_kmeans_constrained_mahalanobis.py b/bin/cluster_kmeans_constrained_mahalanobis.py
new file mode 100644
index 0000000..8ea8c65
--- /dev/null
+++ b/bin/cluster_kmeans_constrained_mahalanobis.py
@@ -0,0 +1,130 @@
+'''
+A small experiment with k-means clustering
+under a Mahalanobis distance.
+From paper:
+Convergence problems of Mahalanobis distance-based k-means clustering,
+Itshak Lapidot
+
+Note: rows and columns are swapped compared to the paper's notation.
+TODO: Random selection from the set
+'''
+
+import matplotlib.pyplot as plt
+import numpy as np
+# from sklearn.manifold import TSNE
+
+N = 50  # Number of data points
+d = 2   # Number of dimensions
+K = 3   # Number of clusters
+
+
+def initialize_model(X, number_clusters):
+    # Pick distinct points as initial centroids; start every precision
+    # matrix Lambda_k as the identity (i.e. plain Euclidean distance).
+    rand = np.random.choice(X.shape[0], number_clusters, replace=False)
+    C = X[rand]
+    L = np.zeros((number_clusters, X.shape[1], X.shape[1]))
+    for k in range(number_clusters):
+        L[k] = np.identity(X.shape[1])
+    return C, L
+
+
+X = np.random.rand(N, d)  # Features
+C, L = initialize_model(X, K)
+
+
+def gaussian(x, c, l):
+    '''
+    Gaussian density with mean c and precision matrix l (currently unused).
+    '''
+    p1 = np.power(np.linalg.det(l), 0.5) / np.power(2 * np.pi, d / 2)
+    p2 = np.exp(-0.5 * np.transpose(x - c).dot(l).dot(x - c))
+    return (p1 * p2).reshape(1)
+
+
+def dist(a, b, l):
+    '''
+    Squared Mahalanobis distance between a and b
+    under the precision matrix l: (a - b)^T l (a - b).
+    '''
+    a = np.reshape(a, (-1, 1))
+    b = np.reshape(b, (-1, 1))
+    return np.transpose(a - b).dot(l).dot(a - b)[0][0]
+
+
+def plot_iteration(iteration, points, clusters, centers):
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
+    for i, j in centers:
+        ax.scatter(i, j, s=50, c='red', marker='+')
+    ax.set_xlabel('x')
+    ax.set_ylabel('y')
+    plt.colorbar(scatter)
+    plt.ylim(0, 1)
+    plt.xlim(0, 1)
+    plt.savefig("test_" + str(iteration) + ".pdf")
+
+
+end_algo = False
+i = 0
+while not end_algo:
+    if i == 10:
+        break  # safety stop in case the algorithm does not converge
+    print("Iteration: ", i)
+    # -- Compute the distance matrix
+    distances = np.zeros((N, K))
+
+    # -- Assign each point to its closest centroid
+    for n in range(N):
+        for k in range(K):
+            distances[n][k] = dist(X[n], C[k], L[k])
+    closest_cluster = np.argmin(distances, axis=1)
+
+    # -- Debug tool ----------------------
+    if i % 1 == 0:  # plot every iteration; raise the modulus to thin out
+        # TSNE
+        X_embedded = np.concatenate((X, C), axis=0)
+        # X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
+        # Then plot
+        plot_iteration(
+            i,
+            X_embedded[:X.shape[0]],
+            closest_cluster,
+            X_embedded[X.shape[0]:]
+        )
+    # ------------------------------------
+
+    end_algo = True
+    for k in range(K):
+        # Subset of X assigned to centroid c_k.
+        X_sub = np.where(closest_cluster == k)
+        X_sub = np.take(X, X_sub[0], axis=0)
+        if X_sub.shape[0] == 0:
+            continue  # empty cluster: keep the previous centroid and matrix
+        C_new = np.mean(X_sub, axis=0)
+
+        # -- Compute the new precision matrix Lambda_k (was K_new):
+        # cluster covariance, normalized to unit determinant
+        # (the paper's constraint), then inverted.
+        L_new = np.zeros((L.shape[1], L.shape[2]))
+        for x in X_sub:
+            x = np.reshape(x, (-1, 1))
+            c_tmp = np.reshape(C_new, (-1, 1))
+            L_new = L_new + (x - c_tmp).dot((x - c_tmp).transpose())
+        L_new = L_new / X_sub.shape[0]
+        L_new = L_new / np.power(np.linalg.det(L_new), 1 / d)
+        L_new = np.linalg.inv(L_new)
+
+        if not np.array_equal(C[k], C_new):  # a centroid moved: keep going
+            end_algo = False
+        C[k] = C_new
+        L[k] = L_new
+    i = i + 1
+
+plot_iteration(
+    i,
+    X_embedded[:X.shape[0]],
+    closest_cluster,
+    X_embedded[X.shape[0]:]
+)
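
Note on verification (not part of the patch): dist() computes the squared
Mahalanobis distance, so it should agree with the square of
scipy.spatial.distance.mahalanobis. A minimal sanity check, assuming SciPy is
installed and dist is in scope, e.g. pasted into the same interpreter session
(SciPy is not otherwise a dependency of this script):

    import numpy as np
    from scipy.spatial.distance import mahalanobis

    a = np.array([0.2, 0.7])
    b = np.array([0.5, 0.1])
    l = np.array([[2.0, 0.3], [0.3, 1.0]])  # any symmetric positive-definite precision matrix

    # mahalanobis() returns the non-squared distance, hence the ** 2
    assert np.isclose(dist(a, b, l), mahalanobis(a, b, l) ** 2)

Likewise, because each cluster covariance is normalized to unit determinant
before inversion, every updated L[k] should satisfy
np.isclose(np.linalg.det(L[k]), 1.0) after an iteration.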