Commit 8b8c216f3d939bb8289ae7442a7282c0f46ffc51

Authored by quillotm
1 parent 0774ae544e
Exists in master

Fix hard-coded cluster count: fit() always built KMeans with k=5 instead of using the k parameter

Showing 1 changed file with 1 addition and 1 deletion Inline Diff

volia/clustering_modules/kmeans_multidistance.py
1 1
2 import pickle 2 import pickle
3 from abstract_clustering import AbstractClustering 3 from abstract_clustering import AbstractClustering
4 from KMeans_Multidistance.KMeans_Class import KMeans 4 from KMeans_Multidistance.KMeans_Class import KMeans
5 from random import seed 5 from random import seed
6 from random import randint 6 from random import randint
7 import numpy as np 7 import numpy as np
8 from sklearn.metrics import pairwise_distances 8 from sklearn.metrics import pairwise_distances
9 9
class kmeansMultidistance():
    """K-means wrapper that keeps the best of several randomly seeded runs.

    ``fit`` trains ``ninit`` ``KMeans`` models, each with its own random seed,
    scores every run by the summed distance between each sample and the
    centroid it was assigned to, and keeps the run with the lowest score.
    """

    def __init__(self, distance="cosine"):
        """
        @param distance: metric name handed to ``KMeans`` and to
            ``sklearn.metrics.pairwise_distances``.
        """
        self.kmeans_model = None  # Best model (set by fit() or load())
        self.centroids = None  # Centroids returned by the best model's fit
        self.distance = distance
        self.seed = None  # Seed of the best run
        self.seeds = None  # Seeds drawn for every run of the last fit()

    def predict(self, features):
        """
        Assign ``features`` to the stored centroids.

        Delegates to ``self.kmeans_model.assign_clusters`` using the distance
        recorded on the stored model (not ``self.distance``). A model must
        have been set by ``fit`` or ``load`` beforehand.

        @param features: samples to assign; presumably laid out like the data
            given to ``fit`` -- TODO confirm against ``KMeans.assign_clusters``.
        @return: whatever ``assign_clusters`` returns.
        """
        return self.kmeans_model.assign_clusters(data=features, centroids=self.centroids, distance=self.kmeans_model.distance)

    def load(self, model_path: str):
        """
        Restore the model and centroids written by ``save``.

        ``self.distance`` is overwritten with the distance stored on the
        loaded model. ``seed`` and ``seeds`` are not part of the saved file
        and are left unchanged.

        @param model_path: path of the pickle file to read.
        @return: None
        """
        # SECURITY: pickle.load can execute arbitrary code while
        # deserializing; only load model files from a trusted source.
        with open(model_path, "rb") as f:
            data = pickle.load(f)
        self.kmeans_model = data["kmeans_model"]
        self.centroids = data["centroids"]
        self.distance = self.kmeans_model.distance

    def save(self, model_path: str):
        """
        Pickle the current model and centroids to ``model_path``.

        @param model_path: path of the pickle file to write.
        @return: None
        """
        with open(model_path, "wb") as f:
            pickle.dump({
                "kmeans_model": self.kmeans_model,
                "centroids": self.centroids
            }, f)

    def fit(self, features, k: int, tol: float, ninit: int, maxiter: int=300, debug: bool=False):
        """
        Train ``ninit`` k-means models and keep the one with the lowest loss.

        The loss of a run is the sum, over all samples, of the distance
        between the sample and the centroid it was assigned to. On ties the
        earliest run wins.

        @param features: training samples; ``features.T`` is used for the
            Mahalanobis covariance, so an array-like with a ``.T`` attribute
            is expected in that case.
        @param k: number of clusters passed to ``KMeans``.
        @param tol: accepted for interface compatibility; not used here.
        @param ninit: number of randomly seeded runs; must be at least 1.
        @param maxiter: ``maxiter`` value passed to ``KMeans``.
        @param debug: passed to ``KMeans`` as ``verbose``.
        @return: None; results are stored in ``kmeans_model``, ``centroids``
            and ``seed``.
        @raise ValueError: if ``ninit`` is smaller than 1.
        """
        # Fail early with a clear message, and before the previous state is
        # wiped, instead of crashing later on min() of an empty list.
        if ninit < 1:
            raise ValueError("ninit must be at least 1, got {}".format(ninit))

        # Initialization
        self.kmeans_model = None
        self.centroids = None
        self.seed = None

        # Draw every seed up front; seed() re-seeds the global random module.
        seed()
        self.seeds = [randint(1, 100000) for _ in range(ninit)]

        # Extra metric arguments depend only on the data, so they are built
        # once instead of once per run.
        kwds = {}
        if self.distance == "mahalanobis":
            kwds["VI"] = np.linalg.pinv(np.cov(features.T)).T

        # Learning k-means models, remembering only the best run so far
        best_loss = None
        best_model = None
        best_centroids = None
        best_seed = None
        for run_seed in self.seeds:
            model = KMeans(k=k,
                           maxiter=maxiter,
                           distance=self.distance,
                           record_heterogeneity=[],
                           verbose=debug,
                           seed=run_seed)
            centroids, closest_cluster = model.fit(features)

            # Distance from every sample to every centroid
            distances = pairwise_distances(features, centroids, metric=self.distance, **kwds)

            # Loss: distance of each sample to its own assigned centroid
            loss = np.sum(distances[np.arange(len(distances)), closest_cluster])

            # Strict comparison keeps the first run in case of a tie
            if best_loss is None or loss < best_loss:
                best_loss = loss
                best_model = model
                best_centroids = centroids
                best_seed = run_seed

        # Publish the result only once every run has completed
        self.kmeans_model = best_model
        self.centroids = best_centroids
        self.seed = best_seed
97 self.seed = best["seed"] 97 self.seed = best["seed"]
98 98