Commit 4152e83df25ef19c8b048592e9629911bcf77e1a
1 parent 3c07f672ad
Exists in master
Adding kmeans mahalanobis. The algorithm is now fully functional. Need to solve some problems with identity matrix usage and infinite or nan values.
Showing 3 changed files with 216 additions and 17 deletions
volia/clustering.py
1 | import argparse | 1 | import argparse |
2 | from os import path, mkdir | 2 | from os import path, mkdir |
3 | from utils import SubCommandRunner | 3 | from utils import SubCommandRunner |
4 | from core.data import read_features, read_lst, read_labels | 4 | from core.data import read_features, read_lst, read_labels |
5 | import numpy as np | 5 | import numpy as np |
6 | from sklearn.cluster import KMeans | 6 | from sklearn.cluster import KMeans |
7 | import pickle | 7 | import pickle |
8 | from clustering_modules.kmeans import kmeans | 8 | from clustering_modules.kmeans import kmeans |
9 | from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis | ||
9 | 10 | ||
10 | from sklearn.preprocessing import LabelEncoder | 11 | from sklearn.preprocessing import LabelEncoder |
11 | from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score | 12 | from sklearn.metrics import v_measure_score, homogeneity_score, completeness_score |
12 | 13 | ||
13 | import core.measures | 14 | import core.measures |
14 | import json | 15 | import json |
15 | 16 | ||
16 | 17 | ||
17 | CLUSTERING_METHODS = { | 18 | CLUSTERING_METHODS = { |
18 | "k-means": kmeans() | 19 | "k-means": kmeans(), |
20 | "k-means-mahalanobis": kmeansMahalanobis() | ||
19 | } | 21 | } |
20 | 22 | ||
21 | EVALUATION_METHODS = { | 23 | EVALUATION_METHODS = { |
22 | "entropy": core.measures.entropy_score, | 24 | "entropy": core.measures.entropy_score, |
23 | "purity": core.measures.purity_score, | 25 | "purity": core.measures.purity_score, |
24 | "v-measure": v_measure_score, | 26 | "v-measure": v_measure_score, |
25 | "homogeneity": homogeneity_score, | 27 | "homogeneity": homogeneity_score, |
26 | "completeness": completeness_score, | 28 | "completeness": completeness_score, |
27 | } | 29 | } |
28 | 30 | ||
29 | 31 | ||
30 | def disequilibrium_run(): | 32 | def disequilibrium_run(): |
31 | pass | 33 | pass |
32 | 34 | ||
33 | 35 | ||
34 | def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str): | 36 | def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str): |
35 | """ | 37 | """ |
36 | 38 | ||
37 | @param measure: | 39 | @param measure: |
38 | @param features: | 40 | @param features: |
39 | @param lst: | 41 | @param lst: |
40 | @param truelabels: | 42 | @param truelabels: |
41 | @param model: | 43 | @param model: |
42 | @param modeltype: | 44 | @param modeltype: |
43 | @return: | 45 | @return: |
44 | """ | 46 | """ |
45 | module = CLUSTERING_METHODS[modeltype] | 47 | module = CLUSTERING_METHODS[modeltype] |
46 | module.load(model) | 48 | module.load(model) |
47 | 49 | ||
48 | eval = {} | 50 | eval = {} |
49 | for ms in measure: | 51 | for ms in measure: |
50 | evaluation = EVALUATION_METHODS[ms] | 52 | evaluation = EVALUATION_METHODS[ms] |
51 | feats_dict = read_features(features) | 53 | feats_dict = read_features(features) |
52 | labels_dict = read_labels(truelabels) | 54 | labels_dict = read_labels(truelabels) |
53 | lst_dict = read_lst(lst) | 55 | lst_dict = read_lst(lst) |
54 | lst_keys = [key for key in lst_dict] | 56 | lst_keys = [key for key in lst_dict] |
55 | feats = np.asarray([feats_dict[key] for key in lst_keys]) | 57 | feats = np.asarray([feats_dict[key] for key in lst_keys]) |
56 | Y_pred = module.predict(feats) | 58 | Y_pred = module.predict(feats) |
57 | Y_truth = [labels_dict[key][0] for key in lst_keys] | 59 | Y_truth = [labels_dict[key][0] for key in lst_keys] |
58 | 60 | ||
59 | le = LabelEncoder() | 61 | le = LabelEncoder() |
60 | le.fit(Y_truth) | 62 | le.fit(Y_truth) |
61 | Y_truth = le.transform(Y_truth) | 63 | Y_truth = le.transform(Y_truth) |
62 | 64 | ||
63 | eval[ms] = evaluation(Y_truth, Y_pred) | 65 | eval[ms] = evaluation(Y_truth, Y_pred) |
64 | 66 | ||
65 | print(json.dumps(eval)) | 67 | print(json.dumps(eval)) |
66 | 68 | ||
67 | 69 | ||
68 | 70 | def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str, mahalanobis: bool = False): | |
69 | def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str): | ||
70 | """ | 71 | """ |
71 | 72 | ||
72 | @param features: output features | 73 | @param features: output features |
73 | @param lst: list file | 74 | @param lst: list file |
74 | @param k: k (kmin if kmax specified) | 75 | @param k: k (kmin if kmax specified) |
75 | @param kmax: maximum k to compute | 76 | @param kmax: maximum k to compute |
76 | @param klist: list of k values to compute, ignore k value | 77 | @param klist: list of k values to compute, ignore k value |
77 | @param output: output file if kmax not specified, else, output directory | 78 | @param output: output file if kmax not specified, else, output directory |
79 | @param mahalanobis: if True, use the Mahalanobis distance for k-means. | ||
78 | """ | 80 | """ |
79 | # -- READ FILES -- | 81 | # -- READ FILES -- |
80 | features_dict = read_features(features) | 82 | features_dict = read_features(features) |
81 | lst_dict = read_lst(lst) | 83 | lst_dict = read_lst(lst) |
82 | X = np.asarray([features_dict[x] for x in lst_dict]) | 84 | X = np.asarray([features_dict[x] for x in lst_dict]) |
83 | 85 | ||
84 | # Exception cases | 86 | # Exception cases |
85 | if kmax is None and klist is None and path.isdir(output): | 87 | if kmax is None and klist is None and path.isdir(output): |
86 | raise Exception("The \"output\" is an existing directory, but a file path is expected.") | 88 | raise Exception("The \"output\" is an existing directory, but a file path is expected.") |
87 | 89 | ||
88 | if (kmax is not None or klist is not None) and path.isfile(output): | 90 | if (kmax is not None or klist is not None) and path.isfile(output): |
89 | raise Exception("The \"output\" is an existing file, but a directory path is expected.") | 91 | raise Exception("The \"output\" is an existing file, but a directory path is expected.") |
90 | 92 | ||
91 | # Mono value case | 93 | # Mono value case |
92 | if kmax is None and klist is None: | 94 | if kmax is None and klist is None: |
93 | print(f"Computing clustering with k={k}") | 95 | print(f"Computing clustering with k={k}") |
94 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | 96 | model = CLUSTERING_METHODS["k-means"] |
95 | preds = kmeans.predict(X) | 97 | if mahalanobis: |
96 | pickle.dump(kmeans, open(output, "wb")) | 98 | print("Computing with mahalanobis distance") |
99 | model = CLUSTERING_METHODS["k-means-mahalanobis"] | ||
100 | model.fit(X, k) | ||
101 | model.save(output) | ||
97 | 102 | ||
98 | # Multi values case with kmax | 103 | # Multi values case with kmax |
99 | if kmax is not None: | 104 | if kmax is not None: |
100 | if not path.isdir(output): | 105 | if not path.isdir(output): |
101 | mkdir(output) | 106 | mkdir(output) |
102 | Ks = range(k, kmax + 1) | 107 | Ks = range(k, kmax + 1) |
103 | for i in Ks: | 108 | for i in Ks: |
104 | print(f"Computing clustering with k={i}") | 109 | model = CLUSTERING_METHODS["k-means"] |
105 | kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(X) | 110 | if mahalanobis: |
106 | preds = kmeans.predict(X) | 111 | model = CLUSTERING_METHODS["k-means-mahalanobis"] |
107 | pickle.dump(kmeans, open(path.join(output, "clustering_" + str(i) + ".pkl"), "wb")) | 112 | model.fit(X, i) |
113 | model.save(path.join(output, "clustering_" + str(i) + ".pkl")) | ||
108 | 114 | ||
109 | # Second multi values case with klist | 115 | # Second multi values case with klist |
110 | if klist is not None: | 116 | if klist is not None: |
111 | if not path.isdir(output): | 117 | if not path.isdir(output): |
112 | mkdir(output) | 118 | mkdir(output) |
113 | for k in klist: | 119 | for k in klist: |
114 | k = int(k) | 120 | k = int(k) |
115 | print(f"Computing clustering with k={k}") | 121 | model = CLUSTERING_METHODS["k-means"] |
116 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | 122 | if mahalanobis: |
117 | preds = kmeans.predict(X) | 123 | print("Computing with mahalanobis distance") |
118 | pickle.dump(kmeans, open(path.join(output, "clustering_" + str(k) + ".pkl"), "wb")) | 124 | model = CLUSTERING_METHODS["k-means-mahalanobis"] |
125 | model.fit(X, k) | ||
126 | model.save(path.join(output, "clustering_" + str(k) + ".pkl")) | ||
119 | 127 | ||
120 | 128 | ||
121 | if __name__ == "__main__": | 129 | if __name__ == "__main__": |
122 | # Main parser | 130 | # Main parser |
123 | parser = argparse.ArgumentParser(description="Clustering methods to apply") | 131 | parser = argparse.ArgumentParser(description="Clustering methods to apply") |
124 | subparsers = parser.add_subparsers(title="action") | 132 | subparsers = parser.add_subparsers(title="action") |
125 | 133 | ||
126 | # kmeans | 134 | # kmeans |
127 | parser_kmeans = subparsers.add_parser( | 135 | parser_kmeans = subparsers.add_parser( |
128 | "kmeans", help="Compute clustering using k-means algorithm") | 136 | "kmeans", help="Compute clustering using k-means algorithm") |
129 | 137 | ||
130 | parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)") | 138 | parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)") |
131 | parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)") | 139 | parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)") |
132 | parser_kmeans.add_argument("-k", default=2, type=int, | 140 | parser_kmeans.add_argument("-k", default=2, type=int, |
133 | help="number of clusters to compute. It is kmin if kmax is specified.") | 141 | help="number of clusters to compute. It is kmin if kmax is specified.") |
134 | parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") | 142 | parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.") |
135 | parser_kmeans.add_argument("--klist", nargs="+", | 143 | parser_kmeans.add_argument("--klist", nargs="+", |
136 | help="List of k values to test. As kmax, activate the multi values mod.") | 144 | help="List of k values to test. As kmax, activate the multi values mod.") |
137 | parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.") | 145 | parser_kmeans.add_argument("--output", |
146 | default=".kmeans", | ||
147 | help="output file if only k. Output directory if multiple kmax specified.") | ||
148 | parser_kmeans.add_argument("--mahalanobis", action="store_true") | ||
138 | parser_kmeans.set_defaults(which="kmeans") | 149 | parser_kmeans.set_defaults(which="kmeans") |
139 | 150 | ||
140 | # measure | 151 | # measure |
141 | parser_measure = subparsers.add_parser( | 152 | parser_measure = subparsers.add_parser( |
142 | "measure", help="compute the entropy") | 153 | "measure", help="compute the entropy") |
143 | 154 | ||
144 | parser_measure.add_argument("--measure", | 155 | parser_measure.add_argument("--measure", |
145 | required=True, | 156 | required=True, |
146 | nargs="+", | 157 | nargs="+", |
147 | choices=[key for key in EVALUATION_METHODS], | 158 | choices=[key for key in EVALUATION_METHODS], |
148 | help="...") | 159 | help="...") |
149 | parser_measure.add_argument("--features", required=True, type=str, help="...") | 160 | parser_measure.add_argument("--features", required=True, type=str, help="...") |
150 | parser_measure.add_argument("--lst", required=True, type=str, help="...") | 161 | parser_measure.add_argument("--lst", required=True, type=str, help="...") |
151 | parser_measure.add_argument("--truelabels", required=True, type=str, help="...") | 162 | parser_measure.add_argument("--truelabels", required=True, type=str, help="...") |
152 | parser_measure.add_argument("--model", required=True, type=str, help="...") | 163 | parser_measure.add_argument("--model", required=True, type=str, help="...") |
153 | parser_measure.add_argument("--modeltype", | 164 | parser_measure.add_argument("--modeltype", |
154 | required=True, | 165 | required=True, |
155 | choices=[key for key in CLUSTERING_METHODS], | 166 | choices=[key for key in CLUSTERING_METHODS], |
156 | help="type of model for learning") | 167 | help="type of model for learning") |
157 | parser_measure.set_defaults(which="measure") | 168 | parser_measure.set_defaults(which="measure") |
158 | 169 | ||
159 | # disequilibrium | 170 | # disequilibrium |
160 | parser_disequilibrium = subparsers.add_parser( | 171 | parser_disequilibrium = subparsers.add_parser( |
161 | "disequilibrium", help="...") | 172 | "disequilibrium", help="...") |
162 | 173 | ||
163 | parser_disequilibrium.add_argument("--features", required=True, type=str, help="...") | 174 | parser_disequilibrium.add_argument("--features", required=True, type=str, help="...") |
164 | parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...") | 175 | parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...") |
165 | parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...") | 176 | parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...") |
166 | parser_disequilibrium.add_argument("--model", required=True, type=str, help="...") | 177 | parser_disequilibrium.add_argument("--model", required=True, type=str, help="...") |
167 | parser_disequilibrium.add_argument("--model-type", | 178 | parser_disequilibrium.add_argument("--model-type", |
168 | required=True, | 179 | required=True, |
169 | choices=["kmeans", "2", "3"], | 180 | choices=["kmeans", "2", "3"], |
170 | help="...") | 181 | help="...") |
171 | parser_disequilibrium.set_defaults(which="disequilibrium") | 182 | parser_disequilibrium.set_defaults(which="disequilibrium") |
172 | 183 | ||
173 | # Parse | 184 | # Parse |
174 | args = parser.parse_args() | 185 | args = parser.parse_args() |
175 | 186 | ||
176 | # Run commands | 187 | # Run commands |
177 | runner = SubCommandRunner({ | 188 | runner = SubCommandRunner({ |
178 | "kmeans": kmeans_run, | 189 | "kmeans": kmeans_run, |
179 | "measure": measure_run, | 190 | "measure": measure_run, |
180 | "disequilibrium": disequilibrium_run | 191 | "disequilibrium": disequilibrium_run |
181 | }) | 192 | }) |
182 | 193 | ||
183 | runner.run(args.which, args.__dict__, remove="which") | 194 | runner.run(args.which, args.__dict__, remove="which") |
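With this change, kmeans_run no longer calls sklearn's KMeans directly: both distance variants go through the wrapper objects in CLUSTERING_METHODS, which share a fit/save/load/predict surface. A minimal sketch of that dispatch, assuming the clustering_modules package from this commit is on the import path (fit_and_save is a hypothetical helper, not a function from the repo):

```python
# Sketch of the selection kmeans_run performs for a single k.
import numpy as np
from clustering_modules.kmeans import kmeans
from clustering_modules.kmeans_mahalanobis import kmeansMahalanobis

CLUSTERING_METHODS = {
    "k-means": kmeans(),
    "k-means-mahalanobis": kmeansMahalanobis(),
}

def fit_and_save(X: np.ndarray, k: int, output: str,
                 mahalanobis: bool = False) -> None:
    # The --mahalanobis flag picks the wrapper; both expose fit() and save().
    key = "k-means-mahalanobis" if mahalanobis else "k-means"
    model = CLUSTERING_METHODS[key]
    model.fit(X, k)
    model.save(output)
```

Because measure_run looks the wrapper up by --modeltype and only calls load() and predict(), a model trained with either distance is evaluated through the same code path.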
volia/clustering_modules/kmeans.py
1 | 1 | ||
2 | from sklearn.cluster import KMeans | 2 | from sklearn.cluster import KMeans |
3 | import pickle | 3 | import pickle |
4 | from abstract_clustering import AbstractClustering | 4 | from abstract_clustering import AbstractClustering |
5 | 5 | ||
6 | class kmeans(): | 6 | class kmeans(): |
7 | def __init__(self): | 7 | def __init__(self): |
8 | self.kmeans_model = None | 8 | self.kmeans_model = None |
9 | 9 | ||
10 | def predict(self, features): | 10 | def predict(self, features): |
11 | """ | ||
12 | |||
13 | @param features: | ||
14 | @return: | ||
15 | """ | ||
11 | return self.kmeans_model.predict(features) | 16 | return self.kmeans_model.predict(features) |
12 | 17 | ||
13 | def load(self, model_path): | 18 | def load(self, model_path: str): |
14 | self.kmeans_model = pickle.load(open(model_path, "rb")) | 19 | """ |
20 | |||
21 | @param model_path: | ||
22 | @return: | ||
23 | """ | ||
24 | with open(model_path, "rb") as f: | ||
25 | self.kmeans_model = pickle.load(f) | ||
26 | |||
27 | def save(self, model_path: str): | ||
28 | """ | ||
29 | |||
30 | @param model_path: | ||
31 | @return: | ||
32 | """ | ||
33 | with open(model_path, "wb") as f: | ||
34 | pickle.dump(self.kmeans_model, f) | ||
35 | |||
36 | def fit(self, features, k: int): | ||
37 | """ | ||
38 | |||
39 | @param features: | ||
40 | @param k: | ||
41 | @return: | ||
42 | """ | ||
43 | self.kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features) | ||
15 | 44 |
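The kmeans wrapper now mirrors the fit/save/load/predict surface of kmeansMahalanobis, making the two interchangeable in CLUSTERING_METHODS. A round-trip sketch; the file name and the random data are illustrative:

```python
import numpy as np
from clustering_modules.kmeans import kmeans

X = np.random.rand(100, 8)   # 100 samples, 8 features (made-up data)

km = kmeans()
km.fit(X, 4)                 # wraps KMeans(n_clusters=4, n_init=10, random_state=0)
km.save("kmeans_4.pkl")      # pickles the fitted sklearn estimator

km2 = kmeans()
km2.load("kmeans_4.pkl")     # restores it into a fresh wrapper
labels = km2.predict(X)      # one cluster index per sample
assert labels.shape == (100,)
```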
volia/clustering_modules/kmeans_mahalanobis.py
File was created | 1 | ||
2 | |||
3 | from sklearn.cluster import KMeans | ||
4 | import pickle | ||
5 | import numpy as np | ||
6 | import matplotlib.pyplot as plt | ||
7 | from sklearn.manifold import TSNE | ||
8 | from abstract_clustering import AbstractClustering | ||
9 | |||
10 | class kmeansMahalanobis(): | ||
11 | def __init__(self): | ||
12 | """ | ||
13 | |||
14 | """ | ||
15 | self.C = None | ||
16 | self.L = None | ||
17 | self.K = None | ||
18 | |||
19 | def predict(self, features): | ||
20 | """ | ||
21 | |||
22 | @param features: | ||
23 | @return: | ||
24 | """ | ||
25 | N = features.shape[0] | ||
26 | distances = np.zeros((N, self.K)) | ||
27 | for n in range(N): | ||
28 | for k in range(self.K): | ||
29 | distances[n][k] = self._dist(features[n], self.C[k], self.L[k]) | ||
30 | print(distances) | ||
31 | closest_cluster = np.argmin(distances, axis=1) | ||
32 | return closest_cluster | ||
33 | |||
34 | def load(self, model_path): | ||
35 | """ | ||
36 | |||
37 | @param model_path: | ||
38 | @return: | ||
39 | """ | ||
40 | data = None | ||
41 | with open(model_path, "rb") as f: | ||
42 | data = pickle.load(f) | ||
43 | if data is None: | ||
44 | raise Exception("Le modèle n'a pas pu être chargé") | ||
45 | else: | ||
46 | self.C = data["C"] | ||
47 | self.L = data["L"] | ||
48 | self.K = data["K"] | ||
49 | |||
50 | def save(self, modelpath: str): | ||
51 | """ | ||
52 | |||
53 | @param modelpath: | ||
54 | @return: | ||
55 | """ | ||
56 | data = { | ||
57 | "C": self.C, | ||
58 | "L": self.L, | ||
59 | "K": self.K | ||
60 | } | ||
61 | with open(modelpath, "wb") as f: | ||
62 | pickle.dump(data, f) | ||
63 | |||
64 | def fit(self, features, K: int): | ||
65 | self._train(features, K) | ||
66 | |||
67 | def _initialize_model(self, X, number_clusters): | ||
68 | d = X.shape[1] | ||
69 | C = X[np.random.choice(X.shape[0], number_clusters)] | ||
70 | L = np.zeros((number_clusters, d, d)) | ||
71 | for k in range(number_clusters): | ||
72 | L[k] = np.identity(d) | ||
73 | return C, L | ||
74 | |||
75 | def _dist(self, a, b, l): | ||
76 | ''' | ||
77 | Squared Mahalanobis distance between a and b, using precision matrix l. | ||
78 | ''' | ||
79 | a = np.reshape(a, (-1, 1)) | ||
80 | b = np.reshape(b, (-1, 1)) | ||
81 | result = np.transpose(a - b).dot(l).dot(a-b)[0][0] | ||
82 | return result | ||
83 | |||
84 | def _plot_iteration(self, iteration, points, clusters, centers): | ||
85 | fig = plt.figure() | ||
86 | ax = fig.add_subplot(111) | ||
87 | scatter = ax.scatter(points[:, 0], points[:, 1], c=clusters, s=50) | ||
88 | |||
89 | #for center in centers: | ||
90 | # ax.scatter(center[0], center[1], s=50, c='red', marker='+') | ||
91 | ax.scatter(centers[:, 0], centers[:, 1], s=50, c='red', marker='+') | ||
92 | |||
93 | ax.set_xlabel('x') | ||
94 | ax.set_ylabel('y') | ||
95 | plt.colorbar(scatter) | ||
96 | #plt.ylim(0, 1) | ||
97 | #plt.xlim(0, 1) | ||
98 | plt.savefig("test_" + str(iteration) + ".pdf") | ||
99 | |||
100 | def _train(self, features, K: int): | ||
101 | X = features | ||
102 | N = X.shape[0] | ||
103 | d = X.shape[1] | ||
104 | |||
105 | C, L = self._initialize_model(X, K) | ||
106 | self.C = C | ||
107 | self.L = L | ||
108 | self.K = K | ||
109 | |||
110 | end_algo = False | ||
111 | i = 0 | ||
112 | while not end_algo: | ||
113 | if i == 10:  # temporary cap on the number of iterations | ||
114 | break | ||
115 | print("Iteration: ", i) | ||
116 | # Compute the distance matrix | ||
117 | distances = np.zeros((N, K)) | ||
118 | |||
119 | for n in range(N): | ||
120 | for k in range(self.K): | ||
121 | distances[n][k] = self._dist(X[n], self.C[k], self.L[k]) | ||
122 | print(distances) | ||
123 | closest_cluster = np.argmin(distances, axis=1) | ||
124 | if i % 1 == 0: | ||
125 | # -- Debug tool ---------------------- | ||
126 | # TSNE | ||
127 | #X_embedded = np.concatenate((X, self.C), axis=0) | ||
128 | X_embedded = TSNE(n_components=2).fit_transform(np.concatenate((X, C), axis=0)) | ||
129 | # Then plot | ||
130 | self._plot_iteration( | ||
131 | i, | ||
132 | X_embedded[:X.shape[0]], | ||
133 | closest_cluster, | ||
134 | X_embedded[X.shape[0]:] | ||
135 | ) | ||
136 | # ------------------------------------ | ||
137 | |||
138 | end_algo = True | ||
139 | for k in range(K): | ||
140 | # Find the subset of X closest to the centroid c_k. | ||
141 | X_sub = np.where(closest_cluster == k) | ||
142 | X_sub = np.take(X, X_sub[0], axis=0) | ||
144 | C_new = np.mean(X_sub, axis=0) | ||
145 | |||
146 | # -- COMPUTE NEW LAMBDA (here named K) -- | ||
147 | K_new = np.zeros((L.shape[1], L.shape[2])) | ||
148 | for x in X_sub: | ||
149 | x = np.reshape(x, (-1, 1)) | ||
150 | c_tmp = np.reshape(C_new, (-1, 1)) | ||
151 | K_new = K_new + (x - c_tmp).dot((x - c_tmp).transpose()) | ||
152 | K_new = K_new / X_sub.shape[0] | ||
153 | K_new = np.linalg.inv(K_new) | ||
154 | |||
155 | if end_algo and (not (self.C[k] == C_new).all()):  # stop only once no centroid moves | ||
156 | end_algo = False | ||
157 | self.C[k] = C_new | ||
158 | self.L[k] = K_new | ||
159 | i = i + 1 | ||
160 |
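The infinite or nan values flagged in the commit message most plausibly come from the K_new update in _train: an empty cluster makes both np.mean and the division by X_sub.shape[0] produce nan, and a cluster with fewer than d points leaves the scatter matrix singular, so np.linalg.inv either raises or propagates inf/nan. One possible remedy, offered as a sketch rather than as code from this commit (safe_precision and eps are illustrative names/values), is to fall back to the identity for empty clusters, echoing _initialize_model, and to shrink the scatter matrix toward the identity before inverting:

```python
import numpy as np

def safe_precision(X_sub: np.ndarray, center: np.ndarray,
                   eps: float = 1e-6) -> np.ndarray:
    """Regularized inverse of the within-cluster scatter matrix (sketch)."""
    d = center.shape[0]
    if X_sub.shape[0] == 0:
        # Empty cluster: keep the identity precision, as in _initialize_model.
        return np.identity(d)
    diff = X_sub - center                      # (n_k, d), broadcasts over rows
    scatter = diff.T @ diff / X_sub.shape[0]   # same quantity as K_new in _train
    scatter += eps * np.identity(d)            # shrinkage keeps it invertible
    return np.linalg.inv(scatter)
```

np.linalg.pinv is an alternative when shrinkage is not wanted.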