Commit 4309b4a340144c6dff7757892cd5539e89e20538
Parent: 88d1d67e9d
Exists in: master

Adding constrained Mahalanobis to help convergence

2 changed files with 23 additions and 12 deletions
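The constraint introduced here acts on each per-cluster covariance estimate before it is inverted into the precision matrix used by the distance. A plausible reading of the commit message: normalizing every covariance to unit determinant keeps a cluster from shrinking its metric arbitrarily, so the per-cluster losses stay on a comparable scale during the updates. In the notation of kmeans_mahalanobis.py below (d is the feature dimension):

\[
\hat{\Sigma}_k = \frac{\Sigma_k}{\det(\Sigma_k)^{1/d}},
\qquad \Lambda_k = \hat{\Sigma}_k^{-1},
\qquad \det(\hat{\Sigma}_k) = 1
\]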
volia/clustering.py
 import argparse
 from os import path, mkdir
 from utils import SubCommandRunner
 from core.data import read_features, read_lst, read_labels
 import numpy as np
 from sklearn.cluster import KMeans
 import pickle
 from clustering_modules.kmeans import kmeans
 from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score

 import core.measures
 import json


 CLUSTERING_METHODS = {
     "k-means": kmeans(),
-    "k-means-mahalanobis": kmeansMahalanobis()
+    "k-means-mahalanobis": kmeansMahalanobis(),
+    "k-means-mahalanobis-constrained": kmeansMahalanobis(constrained=True)
 }

 EVALUATION_METHODS = {
     "entropy": core.measures.entropy_score,
     "purity": core.measures.purity_score,
     "v-measure": v_measure_score,
     "homogeneity": homogeneity_score,
     "completeness": completeness_score,
 }
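A minimal sketch of driving the new registry entry directly from Python; the feature matrix is synthetic and the output path is a placeholder (in the CLI below, X comes from read_features/read_lst):

import numpy as np

X = np.random.rand(100, 8)  # stand-in for a (N, d) feature matrix

model = CLUSTERING_METHODS["k-means-mahalanobis-constrained"]
model.fit(X, 3, 0.0001, 5, 100, False)  # k, tol, ninit, maxiter, debug
model.save("constrained_k3.pkl")        # placeholder path
labels = model.predict(X)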


 def disequilibrium_run():
     pass


 def measure_run(measure, features: str, lst: str, truelabels: str, model: str, modeltype: str):
     """
     Evaluate a trained clustering model against reference labels.

     @param measure: list of evaluation measure names (keys of EVALUATION_METHODS)
     @param features: features file
     @param lst: list file (.lst)
     @param truelabels: reference labels file
     @param model: path of the trained model
     @param modeltype: model type (key of CLUSTERING_METHODS)
     @return: None; prints the scores as JSON
     """
     module = CLUSTERING_METHODS[modeltype]
     module.load(model)

     scores = {}
     for ms in measure:
         evaluation = EVALUATION_METHODS[ms]
         feats_dict = read_features(features)
         labels_dict = read_labels(truelabels)
         lst_dict = read_lst(lst)
         lst_keys = [key for key in lst_dict]
         feats = np.asarray([feats_dict[key] for key in lst_keys])
         Y_pred = module.predict(feats)
         Y_truth = [labels_dict[key][0] for key in lst_keys]

         le = LabelEncoder()
         le.fit(Y_truth)
         Y_truth = le.transform(Y_truth)

         scores[ms] = evaluation(Y_truth, Y_pred)

     print(json.dumps(scores))
69 | 70 | ||
70 | def kmeans_run(features: str, | 71 | def kmeans_run(features: str, |
71 | lst: str, | 72 | lst: str, |
72 | k:int, | 73 | k:int, |
73 | kmax: int, | 74 | kmax: int, |
74 | klist, | 75 | klist, |
75 | maxiter: int, | 76 | maxiter: int, |
76 | ninit: int, | 77 | ninit: int, |
77 | output: str, | 78 | output: str, |
78 | tol: float, | 79 | tol: float, |
79 | debug: bool = False, | 80 | debug: bool = False, |
80 | mahalanobis: str = False): | 81 | mahalanobis: str = False): |
81 | """ | 82 | """ |
82 | 83 | ||
83 | @param features: output features | 84 | @param features: output features |
84 | @param lst: list file | 85 | @param lst: list file |
85 | @param k: k (kmin if kmax specified) | 86 | @param k: k (kmin if kmax specified) |
86 | @param kmax: maximum k to compute | 87 | @param kmax: maximum k to compute |
87 | @param klist: list of k values to compute, ignore k value | 88 | @param klist: list of k values to compute, ignore k value |
88 | @param output: output file if kmax not specified, else, output directory | 89 | @param output: output file if kmax not specified, else, output directory |
89 | @param mahalanobis: distance option of k-means. | 90 | @param mahalanobis: distance option of k-means. |
90 | """ | 91 | """ |
91 | json_content = locals().copy() | 92 | json_content = locals().copy() |
92 | 93 | ||
93 | def fit_model(k: int, output_file): | 94 | def fit_model(k: int, output_file): |
94 | if debug: | 95 | if debug: |
95 | print(f"Computing clustering with k={k}") | 96 | print(f"Computing clustering with k={k}") |
96 | model = CLUSTERING_METHODS["k-means"] | 97 | model = CLUSTERING_METHODS["k-means"] |
97 | if mahalanobis: | 98 | if mahalanobis: |
98 | if debug: | 99 | if debug: |
99 | print("Mahalanobis activated") | 100 | print("Mahalanobis activated") |
100 | model = CLUSTERING_METHODS["k-means-mahalanobis"] | 101 | model = CLUSTERING_METHODS["k-means-mahalanobis"] |
101 | model.fit(X, k, tol, ninit, maxiter, debug) | 102 | model.fit(X, k, tol, ninit, maxiter, debug) |
102 | model.save(output_file) | 103 | model.save(output_file) |
103 | json_content["models"].append({ | 104 | json_content["models"].append({ |
104 | "model_file": output_file, | 105 | "model_file": output_file, |
105 | "k": k, | 106 | "k": k, |
106 | }) | 107 | }) |
107 | 108 | ||
108 | json_content["models"] = [] | 109 | json_content["models"] = [] |
109 | 110 | ||
110 | # -- READ FILES -- | 111 | # -- READ FILES -- |
111 | features_dict = read_features(features) | 112 | features_dict = read_features(features) |
112 | lst_dict = read_lst(lst) | 113 | lst_dict = read_lst(lst) |
113 | X = np.asarray([features_dict[x] for x in lst_dict]) | 114 | X = np.asarray([features_dict[x] for x in lst_dict]) |
114 | 115 | ||
115 | # Exception cases | 116 | # Exception cases |
116 | if kmax is None and klist is None and path.isdir(output): | 117 | if kmax is None and klist is None and path.isdir(output): |
117 | raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.") | 118 | raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.") |
118 | 119 | ||
119 | if (kmax is not None or klist is not None) and path.isfile(output): | 120 | if (kmax is not None or klist is not None) and path.isfile(output): |
120 | raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.") | 121 | raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.") |
121 | 122 | ||
122 | # Mono value case | 123 | # Mono value case |
123 | if kmax is None and klist is None: | 124 | if kmax is None and klist is None: |
124 | fit_model(k, output) | 125 | fit_model(k, output) |
125 | 126 | ||
126 | # Multi values case with kmax | 127 | # Multi values case with kmax |
127 | if kmax is not None: | 128 | if kmax is not None: |
128 | if not path.isdir(output): | 129 | if not path.isdir(output): |
129 | mkdir(output) | 130 | mkdir(output) |
130 | Ks = range(k, kmax + 1) | 131 | Ks = range(k, kmax + 1) |
131 | for i in Ks: | 132 | for i in Ks: |
132 | fit_model(i, path.join(output, "clustering_" + str(i) + ".pkl")) | 133 | fit_model(i, path.join(output, "clustering_" + str(i) + ".pkl")) |
133 | 134 | ||
134 | # Second multi values case with klist | 135 | # Second multi values case with klist |
135 | if klist is not None: | 136 | if klist is not None: |
136 | if not path.isdir(output): | 137 | if not path.isdir(output): |
137 | mkdir(output) | 138 | mkdir(output) |
138 | for k in klist: | 139 | for k in klist: |
139 | k = int(k) | 140 | k = int(k) |
140 | fit_model(k, path.join(output, "clustering_" + str(i) + ".pkl")) | 141 | fit_model(k, path.join(output, "clustering_" + str(i) + ".pkl")) |
141 | 142 | ||
142 | print(json_content) | 143 | print(json_content) |
143 | 144 | ||
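A sketch of the dispatch modes with placeholder paths; a single k writes one model file, while kmax (or klist) writes one clustering_<k>.pkl per value into the output directory:

# Single model: output is a file path.
kmeans_run("feats.txt", "train.lst", k=4, kmax=None, klist=None,
           maxiter=300, ninit=10, output="k4.pkl", tol=1e-4)

# Range of k with the Mahalanobis variant: output is a directory.
kmeans_run("feats.txt", "train.lst", k=2, kmax=10, klist=None,
           maxiter=300, ninit=10, output="models/", tol=1e-4,
           mahalanobis=True)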


 if __name__ == "__main__":
     # Main parser
     parser = argparse.ArgumentParser(description="Clustering methods to apply")
     subparsers = parser.add_subparsers(title="action")

     # kmeans
     parser_kmeans = subparsers.add_parser(
         "kmeans", help="Compute clustering using the k-means algorithm")

     parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
     parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
     parser_kmeans.add_argument("-k", default=2, type=int,
                                help="Number of clusters to compute. It is kmin if kmax is specified.")
     parser_kmeans.add_argument("--kmax", default=None, type=int, help="If specified, k is kmin.")
     parser_kmeans.add_argument("--klist", nargs="+",
                                help="List of k values to test. Like kmax, activates the multi-value mode.")
     parser_kmeans.add_argument("--maxiter",
                                type=int,
                                default=300,
                                help="Maximum number of iterations before stopping if not converging")
     parser_kmeans.add_argument("--ninit",
                                type=int,
                                default=10,
                                help="Number of times the k-means algorithm is run with different centroid seeds.")
     parser_kmeans.add_argument("--tol",
                                type=float,
                                default=0.0001,
                                help="Convergence tolerance on the distance between centroids and their updates.")
     parser_kmeans.add_argument("--debug", action="store_true")
     parser_kmeans.add_argument("--output",
                                default=".kmeans",
                                help="Output file if only k is given; output directory if kmax or klist is specified.")
     parser_kmeans.add_argument("--mahalanobis", action="store_true")
     parser_kmeans.set_defaults(which="kmeans")

     # measure
     parser_measure = subparsers.add_parser(
         "measure", help="Compute evaluation measures")

     parser_measure.add_argument("--measure",
                                 required=True,
                                 nargs="+",
                                 choices=[key for key in EVALUATION_METHODS],
                                 help="Evaluation measures to compute")
     parser_measure.add_argument("--features", required=True, type=str, help="Features file")
     parser_measure.add_argument("--lst", required=True, type=str, help="List file (.lst)")
     parser_measure.add_argument("--truelabels", required=True, type=str, help="Reference labels file")
     parser_measure.add_argument("--model", required=True, type=str, help="Model file to load")
     parser_measure.add_argument("--modeltype",
                                 required=True,
                                 choices=[key for key in CLUSTERING_METHODS],
                                 help="type of model for learning")
     parser_measure.set_defaults(which="measure")

     # disequilibrium
     parser_disequilibrium = subparsers.add_parser(
         "disequilibrium", help="...")

     parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
     parser_disequilibrium.add_argument("--model-type",
                                        required=True,
                                        choices=["kmeans", "2", "3"],
                                        help="...")
     parser_disequilibrium.set_defaults(which="disequilibrium")

     # Parse
     args = parser.parse_args()

     # Run commands
     runner = SubCommandRunner({
         "kmeans": kmeans_run,
         "measure": measure_run,
         "disequilibrium": disequilibrium_run
     })

     runner.run(args.which, args.__dict__, remove="which")
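For reference, a kmeans invocation built only from the flags defined in this parser (paths are placeholders):

args = parser.parse_args(["kmeans", "--features", "feats.txt", "--lst", "train.lst",
                          "-k", "2", "--kmax", "10", "--mahalanobis",
                          "--output", "models/"])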
volia/clustering_modules/kmeans_mahalanobis.py


 from sklearn.cluster import KMeans
 import pickle
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.manifold import TSNE
 from abstract_clustering import AbstractClustering

 class kmeansMahalanobis():
-    def __init__(self):
+    def __init__(self, constrained: bool = False):
         """
         @param constrained: if True, normalize each cluster covariance to
         unit determinant before inverting it (constrained variant).
         """
         self.C = None
         self.L = None
         self.K = None
+        self.constrained = constrained
     def predict(self, features):
         """
         @param features: (N, d) array of feature vectors
         @return: index of the closest cluster for each feature vector
         """
         N = features.shape[0]
         distances = np.zeros((N, self.K))
         for n in range(N):
             for k in range(self.K):
                 distances[n][k] = self._dist(features[n], self.C[k], self.L[k])
         closest_cluster = np.argmin(distances, axis=1)
         return closest_cluster
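predict loops in Python over all N x K pairs; an equivalent vectorized form (a sketch, assuming C has shape (K, d) and L has shape (K, d, d)) computes every quadratic form in one einsum:

import numpy as np

def predict_vectorized(X, C, L):
    # diff[n, k] = X[n] - C[k], shape (N, K, d)
    diff = X[:, None, :] - C[None, :, :]
    # d2[n, k] = (X[n] - C[k])^T  L[k]  (X[n] - C[k])
    d2 = np.einsum("nkd,kde,nke->nk", diff, L, diff)
    return np.argmin(d2, axis=1)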

     def load(self, model_path):
         """
         @param model_path: path of the pickled model to load
         @return: None
         """
         data = None
         with open(model_path, "rb") as f:
             data = pickle.load(f)
         if data is None:
             raise Exception("The model could not be loaded")
         else:
             self.C = data["C"]
             self.L = data["L"]
             self.K = data["K"]
+            # Models saved before this change may not have a "constrained" entry.
+            self.constrained = data.get("constrained", False)

     def save(self, modelpath: str):
         """
         @param modelpath: path where the pickled model is written
         @return: None
         """
         data = {
             "C": self.C,
             "L": self.L,
-            "K": self.K
+            "K": self.K,
+            "constrained": self.constrained
         }
         with open(modelpath, "wb") as f:
             pickle.dump(data, f)
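The pickle payload is a plain dict, so a round trip preserves the constrained flag; a small check with placeholder values and path:

m = kmeansMahalanobis(constrained=True)
m.C, m.L, m.K = np.zeros((2, 3)), np.stack([np.eye(3)] * 2), 2
m.save("tmp_model.pkl")

m2 = kmeansMahalanobis()
m2.load("tmp_model.pkl")
assert m2.constrained is True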

     def fit(self, features, k: int, tol: float, ninit: int, maxiter: int = 300, debug: bool = False):
         results = []
         for i in range(ninit):
             results.append(self._train(features, k, tol, maxiter, debug))
         losses = [v["loss"] for v in results]
         best = results[losses.index(min(losses))]
         if debug:
             print(f"best: {best['loss']} loss")
         self.C = best["C"]
         self.L = best["L"]
         self.K = best["K"]

     def _initialize_model(self, X, number_clusters):
         d = X.shape[1]
         # Draw initial centroids without replacement to avoid duplicated centers.
         C = X[np.random.choice(X.shape[0], number_clusters, replace=False)]
         L = np.zeros((number_clusters, d, d))
         for k in range(number_clusters):
             L[k] = np.identity(d)
         return C, L

     def _dist(self, a, b, l):
         '''
         Squared Mahalanobis distance between a and b with precision matrix l.
         '''
         a = np.reshape(a, (-1, 1))
         b = np.reshape(b, (-1, 1))
         result = np.transpose(a - b).dot(l).dot(a - b)[0][0]
         return result
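_dist returns a squared distance; with the identity as precision matrix it reduces to the squared Euclidean distance:

import numpy as np

km = kmeansMahalanobis()
a, b = np.array([1.0, 2.0]), np.array([0.0, 0.0])
print(km._dist(a, b, np.eye(2)))  # 5.0 == 1**2 + 2**2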

     def _plot_iteration(self, iteration, points, clusters, centers):
         fig = plt.figure()
         ax = fig.add_subplot(111)
         scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50)
         ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+')

         ax.set_xlabel('x')
         ax.set_ylabel('y')
         plt.colorbar(scatter)
         plt.savefig("test_" + str(iteration) + ".pdf")

     def _train(self, features, K: int, tol: float, maxiter: int, debug: bool = False):
         X = features
         N = X.shape[0]
         d = X.shape[1]

         C, L = self._initialize_model(X, K)
         self.C = C
         self.L = L
         self.K = K

         end_algo = False
         i = 0
         while not end_algo:
             if debug:
                 print("Iteration: ", i)

             # Compute the distance matrix
             distances = np.zeros((N, self.K))

             for n in range(N):
                 for k in range(self.K):
                     distances[n][k] = self._dist(X[n], self.C[k], self.L[k])

             closest_cluster = np.argmin(distances, axis=1)

             loss = np.sum(distances[np.arange(len(distances)), closest_cluster])
             if debug:
                 print(f"loss {loss}")

             # -- Debug tool ----------------------
-            if debug and i % 10 == 0:
+            if debug and i % 1 == 0:  # plot every iteration instead of every tenth
                 # TSNE if needed
                 X_embedded = np.concatenate((X, self.C), axis=0)
                 if d > 2:
-                    X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0))
+                    X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, self.C), axis=0))

                 # Then plot
                 self._plot_iteration(
                     i,
                     X_embedded[:X.shape[0]],
                     closest_cluster,
                     X_embedded[X.shape[0]:]
                 )
             # ------------------------------------

             old_c = self.C.copy()
-            for k in range(K):
+            for k in range(self.K):
                 # Find the subset of X closest to the centroid c_k.
                 X_sub = np.where(closest_cluster == k)
                 X_sub = np.take(X, X_sub[0], axis=0)
                 if X_sub.shape[0] == 0:
                     continue

-                np.mean(X_sub, axis=0)
                 C_new = np.mean(X_sub, axis=0)

                 # -- COMPUTE NEW LAMBDA (here named K) --
-                K_new = np.zeros((L.shape[1], L.shape[2]))
+                tmp = np.zeros((self.L.shape[1], self.L.shape[2]))
                 for x in X_sub:
                     x = np.reshape(x, (-1, 1))
                     c_tmp = np.reshape(C_new, (-1, 1))
-                    K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose())
-                K_new = K_new / X_sub.shape[0]
+                    tmp = tmp + (x - c_tmp).dot((x - c_tmp).transpose())
+                cov = tmp / X_sub.shape[0]
+                if self.constrained:
+                    # Scale the covariance to unit determinant before inversion.
+                    K_new = cov / np.power(np.linalg.det(cov), 1 / d)
+                else:
+                    K_new = cov
                 K_new = np.linalg.pinv(K_new)

                 self.C[k] = C_new
                 self.L[k] = K_new

             diff = np.sum(np.absolute((self.C - old_c) / old_c * 100))
             if diff > tol:
                 end_algo = False
                 if debug:
                     print(f"{diff}")
             else:
                 if debug:
                     print(f"Tolerance threshold {tol} reached with diff {diff}")
                 end_algo = True

             i = i + 1
             if i >= maxiter:
                 end_algo = True
                 if debug:
                     print(f"Maximum number of iterations ({maxiter}) reached")
         return {
             "loss": loss,
             "C": self.C,
             "K": self.K,
             "L": self.L
         }
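A numeric check of the constrained update: dividing a covariance by det(cov)**(1/d) yields a unit-determinant matrix, so every cluster's metric encloses the same volume (names here are local to the example):

import numpy as np

rng = np.random.default_rng(0)
X_sub = rng.normal(size=(50, 4))        # stand-in for one cluster's points
cov = np.cov(X_sub, rowvar=False)
d = cov.shape[0]

cov_c = cov / np.power(np.linalg.det(cov), 1 / d)
print(np.linalg.det(cov_c))             # ~1.0 up to floating point error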