Quillot Mathias / volia

Commit ed89325d5d02f6e7878e3fd52498c8ad1ca653be

Authored by quillotm 2021-08-16 15:57:59 +0200

Exists in master

Now we can pass more parameters to the k-means command. Mahalanobis was tested and seems to work well. More tests are needed.

Showing 3 changed files with 71 additions and 26 deletions Inline Diff

volia/clustering.py
volia/clustering_modules/kmeans.py
volia/clustering_modules/kmeans_mahalanobis.py

volia/clustering.py

Diff comments View file @ ed89325

 import argparse
 from os import path, mkdir
 from utils import SubCommandRunner
 from core.data import read_features, read_lst, read_labels
 import numpy as np
 from sklearn.cluster import KMeans
 import pickle
 from clustering_modules.kmeans import kmeans
 from clustering_modules.kmeans_mahalanobis import  kmeansMahalanobis
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score
 import core.measures
 import json
 # Registry of clustering back-ends, keyed by the model-type name used on the command line
 # (these keys are the --modeltype choices of the "measure" sub-command).
 # The values are instances created once, when this module is imported, so every lookup
 # returns the same shared object.
 CLUSTERING_METHODS = {
     "k-means": kmeans(),
     "k-means-mahalanobis": kmeansMahalanobis()
 }
 # Registry of evaluation measures, keyed by the name accepted by --measure.
 # measure_run calls each value as f(true_labels, predicted_labels).
 EVALUATION_METHODS = {
     "entropy": core.measures.entropy_score,
     "purity": core.measures.purity_score,
     "v-measure": v_measure_score,
     "homogeneity": homogeneity_score,
     "completeness": completeness_score,
 }
 def disequilibrium_run():
     """Placeholder for the "disequilibrium" sub-command: not implemented yet, does nothing."""
     return None
 def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
     """
     Evaluate a saved clustering model with one or more measures and print the scores as JSON.

     @param measure: iterable of measure names, each one a key of EVALUATION_METHODS
         (the command line passes a list; the annotation is left unchanged)
     @param features: path of the features file, read with read_features
     @param lst: path of the list file, read with read_lst; its keys select and order the samples
     @param truelabels: path of the true-labels file, read with read_labels
     @param model: path of the saved model, handed to the load() method of the selected module
     @param modeltype: key of CLUSTERING_METHODS selecting the clustering module
     @return: None; a JSON object mapping each measure name to its score is printed
     """
     module = CLUSTERING_METHODS[modeltype]
     module.load(model)

     # The inputs and the predictions do not depend on the measure, so they are
     # read and computed once instead of once per requested measure.
     feats_dict = read_features(features)
     labels_dict = read_labels(truelabels)
     lst_keys = list(read_lst(lst))

     feats = np.asarray([feats_dict[key] for key in lst_keys])
     Y_pred = module.predict(feats)

     # Only the first label of each entry is used; it is encoded as an integer class id.
     raw_truth = [labels_dict[key][0] for key in lst_keys]
     le = LabelEncoder()
     le.fit(raw_truth)
     Y_truth = le.transform(raw_truth)

     # "scores" rather than "eval", so the builtin eval() is not shadowed.
     scores = {}
     for ms in measure:
         scores[ms] = EVALUATION_METHODS[ms](Y_truth, Y_pred)
     print(json.dumps(scores))
-def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str, mahalanobis: str = False):
+def kmeans_run(features: str,
+               lst: str,
+               k:int,
+               kmax: int,
+               klist,
+               maxiter: int,
+               ninit: int,
+               output: str,
+               tol: float,
+               debug: bool = False,
+               mahalanobis: str = False):
     """
     @param features: output features
     @param lst: list file
     @param k: k (kmin if kmax specified)
     @param kmax: maximum k to compute
     @param klist: list of k values to compute, ignore k value
     @param output: output file if kmax not specified, else, output directory
     @param mahalanobis: distance option of k-means.
     """
     # -- READ FILES --
     features_dict = read_features(features)
     lst_dict = read_lst(lst)
     X = np.asarray([features_dict[x] for x in lst_dict])
     # Exception cases
     if kmax is None and klist is None and path.isdir(output):
         raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")
     if (kmax is not None or klist is not None) and path.isfile(output):
         raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")
     # Mono value case
     if kmax is None and klist is None:
-        print(f"Computing clustering with k={k}")
+        if debug:
+            print(f"Computing clustering with k={k}")
         model = CLUSTERING_METHODS["k-means"]
         if mahalanobis:
-            print("Computing with mahalanobis distance")
             model = CLUSTERING_METHODS["k-means-mahalanobis"]
-        model.fit(X, k)
+        model.fit(X, k, tol, maxiter, debug)
         model.save(output)
     # Multi values case with kmax
     if kmax is not None:
         if not path.isdir(output):
             mkdir(output)
         Ks = range(k, kmax + 1)
         for i in Ks:
             model = CLUSTERING_METHODS["k-means"]
             if mahalanobis:
                 model = CLUSTERING_METHODS["k-means-mahalanobis"]
-            model.fit(X, i)
+            model.fit(X, i, tol, maxiter, debug)
             model.save(path.join(output, "clustering_" + str(i) + ".pkl"))
     # Second multi values case with klist
     if klist is not None:
         if not path.isdir(output):
             mkdir(output)
         for k in klist:
             k = int(k)
             model = CLUSTERING_METHODS["k-means"]
             if mahalanobis:
-                print("Computing with mahalanobis distance")
                 model = CLUSTERING_METHODS["k-means-mahalanobis"]
-            model.fit(X, k)
+            model.fit(X, k, tol, maxiter, debug)
             model.save(path.join(output, "clustering_" + str(k) + ".pkl"))
+    # TODO: Output JSON describing how the run ended: number of iterations, whether the tolerance
+    # was reached and stopped the process, which distance was used, which parameters, etc.
+    # TODO: Move example data into a directory.
+    # TODO: Add example receipts
+    # TODO: n_init has to be taken into account for the Mahalanobis case of the k-means algorithm.
 if __name__ == "__main__":
     # Main parser
     parser = argparse.ArgumentParser(description="Clustering methods to apply")
     subparsers = parser.add_subparsers(title="action")
     # kmeans
     parser_kmeans = subparsers.add_parser(
         "kmeans", help="Compute clustering using k-means algorithm")
     parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
     parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
     parser_kmeans.add_argument("-k", default=2, type=int,
                                help="number of clusters to compute. It is kmin if kmax is specified.")
     parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
     parser_kmeans.add_argument("--klist", nargs="+",
                                help="List of k values to test. As kmax, activate the multi values mod.")
+    parser_kmeans.add_argument("--maxiter",
+                               type=int,
+                               default=300,
+                               help="Max number of iteration before stoping if not converging")
+    parser_kmeans.add_argument("--ninit",
+                               type=int,
+                               default=10,
+                               help="Number of time the k-means algorithm will be run with different centroid seeds.")
+    parser_kmeans.add_argument("--tol",
+                               type=float,
+                               default=0.0001,
+                               help="Tolerance to finish of distance between centroids and their updates.")
+    parser_kmeans.add_argument("--debug", action="store_true")
     parser_kmeans.add_argument("--output",
                                default=".kmeans",
                                help="output file if only k. Output directory if multiple kmax specified.")
     parser_kmeans.add_argument("--mahalanobis", action="store_true")
     parser_kmeans.set_defaults(which="kmeans")
     # measure
     parser_measure = subparsers.add_parser(
         "measure", help="compute the entropy")
     parser_measure.add_argument("--measure",
                                 required=True,
                                 nargs="+",
                                 choices=[key for key in EVALUATION_METHODS],
                                 help="...")
     parser_measure.add_argument("--features", required=True, type=str, help="...")
     parser_measure.add_argument("--lst", required=True, type=str, help="...")
     parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
     parser_measure.add_argument("--model", required=True, type=str, help="...")
     parser_measure.add_argument("--modeltype",
                                 required=True,
                                 choices=[key for key in CLUSTERING_METHODS],
                                 help="type of model for learning")
     parser_measure.set_defaults(which="measure")
     # disequilibrium
     parser_disequilibrium = subparsers.add_parser(
         "disequilibrium", help="...")
     parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model-type",
                                 required=True,
                                 choices=["kmeans", "2", "3"],
                                 help="...")
     parser_disequilibrium.set_defaults(which="disequilibrium")
     # Parse
     args = parser.parse_args()
     # Run commands
     runner = SubCommandRunner({
         "kmeans": kmeans_run,
         "measure": measure_run,
         "disequilibrium": disequilibrium_run
     })

volia/clustering_modules/kmeans.py

Diff comments View file @ ed89325

1		1
2	from sklearn.cluster import KMeans	2	from sklearn.cluster import KMeans
3	import pickle	3	import pickle
4	from abstract_clustering import AbstractClustering	4	from abstract_clustering import AbstractClustering
5		5
6	class kmeans():	6	class kmeans():
7	def __init__(self):	7	def __init__(self):
8	self.kmeans_model = None	8	self.kmeans_model = None
9		9
10	def predict(self, features):	10	def predict(self, features):
11	"""	11	"""
12		12
13	@param features:	13	@param features:
14	@return:	14	@return:
15	"""	15	"""
16	return self.kmeans_model.predict(features)	16	return self.kmeans_model.predict(features)
17		17
18	def load(self, model_path: str):	18	def load(self, model_path: str):
19	"""	19	"""
20		20
21	@param model_path:	21	@param model_path:
22	@return:	22	@return:
23	"""	23	"""
24	with open(model_path, "rb") as f:	24	with open(model_path, "rb") as f:
25	self.kmeans_model = pickle.load(f)	25	self.kmeans_model = pickle.load(f)
26		26
27	def save(self, model_path: str):	27	def save(self, model_path: str):
28	"""	28	"""
29		29
30	@param model_path:	30	@param model_path:
31	@return:	31	@return:
32	"""	32	"""
33	with open(model_path, "wb") as f:	33	with open(model_path, "wb") as f:
34	pickle.dump(self.kmeans_model, f)	34	pickle.dump(self.kmeans_model, f)
35		35
36	def fit(self, features, k: int):	36	def fit(self, features, k: int, tol: float, maxiter: int=300, debug: bool=False):
37	"""	37	"""
38		38
39	@param features:	39	@param features:
40	@param k:	40	@param k:
41	@return:	41	@return:
42	"""	42	"""
43	self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features)	43	self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0, max_iter=maxiter, tol=tol).fit(features)
44		44

volia/clustering_modules/kmeans_mahalanobis.py

Diff comments View file @ ed89325

1		1
2		2
3	from sklearn.cluster import KMeans	3	from sklearn.cluster import KMeans
4	import pickle	4	import pickle
5	import numpy as np	5	import numpy as np
6	import matplotlib.pyplot as plt	6	import matplotlib.pyplot as plt
7	from sklearn.manifold import TSNE	7	from sklearn.manifold import TSNE
8	from abstract_clustering import AbstractClustering	8	from abstract_clustering import AbstractClustering
9		9
10	class kmeansMahalanobis():	10	class kmeansMahalanobis():
11	def __init__(self):	11	def __init__(self):
12	"""	12	"""
13		13
14	"""	14	"""
15	self.C = None	15	self.C = None
16	self.L = None	16	self.L = None
17	self.K = None	17	self.K = None
18		18
19	def predict(self, features):	19	def predict(self, features):
20	"""	20	"""
21		21
22	@param features:	22	@param features:
23	@return:	23	@return:
24	"""	24	"""
25	N = features.shape[0]	25	N = features.shape[0]
26	distances = np.zeros((N, self.K))	26	distances = np.zeros((N, self.K))
27	for n in range(N):	27	for n in range(N):
28	for k in range(self.K):	28	for k in range(self.K):
29	distances[n][k] = self._dist(features[n], self.C[k], self.L[k])	29	distances[n][k] = self._dist(features[n], self.C[k], self.L[k])
30	closest_cluster = np.argmin(distances, axis=1)	30	closest_cluster = np.argmin(distances, axis=1)
31	return closest_cluster	31	return closest_cluster
32		32
33	def load(self, model_path):	33	def load(self, model_path):
34	"""	34	"""
35		35
36	@param model_path:	36	@param model_path:
37	@return:	37	@return:
38	"""	38	"""
39	data = None	39	data = None
40	with open(model_path):	40	with open(model_path, "rb") as f:
41	data = pickle.load()	41	data = pickle.load(f)
42	if data is None:	42	if data is None:
43	raise Exception("Le modèle n'a pas pu être chargé")	43	raise Exception("Le modèle n'a pas pu être chargé")
44	else:	44	else:
45	self.C = data["C"]	45	self.C = data["C"]
46	self.L = data["L"]	46	self.L = data["L"]
47	self.K = data["K"]	47	self.K = data["K"]
48		48
49	def save(self, modelpath: str):	49	def save(self, modelpath: str):
50	"""	50	"""
51		51
52	@param modelpath:	52	@param modelpath:
53	@return:	53	@return:
54	"""	54	"""
55	data = {	55	data = {
56	"C": self.C,	56	"C": self.C,
57	"L": self.L,	57	"L": self.L,
58	"K": self.K	58	"K": self.K
59	}	59	}
60	with open(modelpath, "wb") as f:	60	with open(modelpath, "wb") as f:
61	pickle.dump(data, f)	61	pickle.dump(data, f)
62		62
63	def fit(self, features, K: int):	63	def fit(self, features, k: int, tol: float = 0.0001, maxiter: int=300, debug: bool=False):
64	self._train(features, K)	64	self._train(features, k, tol, maxiter, debug)
65		65
66	def _initialize_model(self, X, number_clusters):	66	def _initialize_model(self, X, number_clusters):
67	d = X.shape[1]	67	d = X.shape[1]
68	C = X[np.random.choice(X.shape[0], number_clusters)]	68	C = X[np.random.choice(X.shape[0], number_clusters)]
69	L = np.zeros((number_clusters, d, d))	69	L = np.zeros((number_clusters, d, d))
70	for k in range(number_clusters):	70	for k in range(number_clusters):
71	L[k] = np.identity(d)	71	L[k] = np.identity(d)
72	return C, L	72	return C, L
73		73
74	def _dist(self, a, b, l):	74	def _dist(self, a, b, l):
75	'''	75	'''
76	Distance euclidienne	76	Distance euclidienne
77	'''	77	'''
78	a = np.reshape(a, (-1, 1))	78	a = np.reshape(a, (-1, 1))
79	b = np.reshape(b, (-1, 1))	79	b = np.reshape(b, (-1, 1))
80	result = np.transpose(a - b).dot(l).dot(a-b)[0][0]	80	result = np.transpose(a - b).dot(l).dot(a-b)[0][0]
81	return result	81	return result
82		82
83	def _plot_iteration(self, iteration, points, clusters, centers):	83	def _plot_iteration(self, iteration, points, clusters, centers):
84	fig = plt.figure()	84	fig = plt.figure()
85	ax = fig.add_subplot(111)	85	ax = fig.add_subplot(111)
86	scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)	86	scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
87		87
88	#for center in centers:	88	#for center in centers:
89	# ax.scatter(center[0], center[1], s=50, c='red', marker='+')	89	# ax.scatter(center[0], center[1], s=50, c='red', marker='+')
90	ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+')	90	ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+')
91		91
92	ax.set_xlabel('x')	92	ax.set_xlabel('x')
93	ax.set_ylabel('y')	93	ax.set_ylabel('y')
94	plt.colorbar(scatter)	94	plt.colorbar(scatter)
95	#plt.ylim(0, 1)	95	#plt.ylim(0, 1)
96	#plt.xlim(0, 1)	96	#plt.xlim(0, 1)
97	plt.savefig("test_" + str(iteration) + ".pdf")	97	plt.savefig("test_" + str(iteration) + ".pdf")
98		98
99	def _train(self, features, K: int):	99	def _train(self, features, K: int, tol: float, maxiter: int, debug: bool=False):
100	X = features	100	X = features
101	N = X.shape[0]	101	N = X.shape[0]
102	d = X.shape[1]	102	d = X.shape[1]
103		103
		104	X_embedded = None
104	C, L = self._initialize_model(X, K)	105	C, L = self._initialize_model(X, K)
105	self.C = C	106	self.C = C
106	self.L = L	107	self.L = L
107	self.K = K	108	self.K = K
108		109
109	end_algo = False	110	end_algo = False
110	i = 0	111	i = 0
111	while not end_algo:	112	while not end_algo:
112	if i == 10:	113	if debug:
113	exit(1)	114	print("Iteration: ", i)
114	print("Iteration: ", i)	115
115	# Calcul matrix distance	116	# Calcul matrix distance
116	distances = np.zeros((N, K))	117	distances = np.zeros((N, K))
117		118
118	for n in range(N):	119	for n in range(N):
119	for k in range(self.K):	120	for k in range(self.K):
120	distances[n][k] = self._dist(X[n], self.C[k], self.L[k])	121	distances[n][k] = self._dist(X[n], self.C[k], self.L[k])
121	closest_cluster = np.argmin(distances, axis=1)	122	closest_cluster = np.argmin(distances, axis=1)
122	if i % 1 == 0:	123
123	# -- Debug tool ----------------------	124	# -- Debug tool ----------------------
124	# TSNE	125	if debug and i % 10 == 0:
125	#X_embedded = np.concatenate((X, self.C), axis=0)	126	# TSNE if needed
126	X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))	127	X_embedded = np.concatenate((X, self.C), axis=0)
		128	if d > 2:
		129	X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
		130
127	# Then plot	131	# Then plot
128	self._plot_iteration(	132	self._plot_iteration(
129	i,	133	i,
130	X_embedded[:X.shape[0]],	134	X_embedded[:X.shape[0]],
131	closest_cluster,	135	closest_cluster,
132	X_embedded[X.shape[0]:]	136	X_embedded[X.shape[0]:]
133	)	137	)
134	# ------------------------------------	138	# ------------------------------------
135		139
136	end_algo = True	140	old_c = self.C.copy()
137	for k in range(K):	141	for k in range(K):
138	# Find subset of X with values closed to the centroid c_k.	142	# Find subset of X with values closed to the centroid c_k.
139	X_sub = np.where(closest_cluster == k)	143	X_sub = np.where(closest_cluster == k)
140	X_sub = np.take(X, X_sub[0], axis=0)	144	X_sub = np.take(X, X_sub[0], axis=0)
141	if X_sub.shape[0] == 0:	145	if X_sub.shape[0] == 0:
142	continue	146	continue
143	np.mean(X_sub, axis=0)	147	np.mean(X_sub, axis=0)
144	C_new = np.mean(X_sub, axis=0)	148	C_new = np.mean(X_sub, axis=0)
145		149
146	# -- COMPUTE NEW LAMBDA (here named K) --	150	# -- COMPUTE NEW LAMBDA (here named K) --
147	K_new = np.zeros((L.shape[1], L.shape[2]))	151	K_new = np.zeros((L.shape[1], L.shape[2]))
148	for x in X_sub:	152	for x in X_sub:
149	x = np.reshape(x, (-1, 1))	153	x = np.reshape(x, (-1, 1))
150	c_tmp = np.reshape(C_new, (-1, 1))	154	c_tmp = np.reshape(C_new, (-1, 1))
151	K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())	155	K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
152	K_new = K_new / X_sub.shape[0]	156	K_new = K_new / X_sub.shape[0]
153	K_new = np.linalg.pinv(K_new)	157	K_new = np.linalg.pinv(K_new)
154		158
155	if end_algo and (not (self.C[k] == C_new).all()): # If the same stop	159	#if end_algo and (not (self.C[k] == C_new).all()): # If the same stop
156	end_algo = False	160	# end_algo = False
157	self.C[k] = C_new	161	self.C[k] = C_new
158	self.L[k] = K_new	162	self.L[k] = K_new
		163
		164	diff = np.sum(np.absolute((self.C - old_c) / old_c * 100))
		165	if diff > tol:
		166	end_algo = False
		167	if debug:
		168	print(f"{diff}")
		169	elif debug:
		170	print(f"Tolerance threshold {tol} reached with diff {diff}")
		171	end_algo = True
159	i = i + 1	172	i = i + 1
		173	if i > maxiter:
		174	end_algo = True
		175	if debug:
		176	print(f"Iteration {maxiter} reached")
160		177