Commit 8b8c216f3d939bb8289ae7442a7282c0f46ffc51
1 parent
0774ae544e
Exists in
master
Fixed an issue in fit(): the number of clusters was hard-coded to 5 instead of using the k parameter
Showing 1 changed file with 1 addition and 1 deletion Inline Diff
volia/clustering_modules/kmeans_multidistance.py
1 | 1 | ||
2 | import pickle | 2 | import pickle |
3 | from abstract_clustering import AbstractClustering | 3 | from abstract_clustering import AbstractClustering |
4 | from KMeans_Multidistance.KMeans_Class import KMeans | 4 | from KMeans_Multidistance.KMeans_Class import KMeans |
5 | from random import seed | 5 | from random import seed |
6 | from random import randint | 6 | from random import randint |
7 | import numpy as np | 7 | import numpy as np |
8 | from sklearn.metrics import pairwise_distances | 8 | from sklearn.metrics import pairwise_distances |
9 | 9 | ||
class kmeansMultidistance():
    """
    Wrapper that runs several restarts of the project's ``KMeans`` and keeps
    the restart with the smallest loss.

    Attributes set by ``fit`` / ``load``:
      - kmeans_model: the retained ``KMeans`` instance (None until fitted/loaded)
      - centroids:    the centroids returned by that instance's ``fit``
      - distance:     metric name handed to ``KMeans`` and ``pairwise_distances``
      - seed:         seed of the retained restart (set by ``fit`` only)
      - seeds:        every seed drawn for the last ``fit`` call
    """

    def __init__(self, distance="cosine"):
        """
        @param distance: metric name forwarded to KMeans and to
                         sklearn's pairwise_distances.
        """
        self.kmeans_model = None  # Best model
        self.centroids = None
        self.distance = distance
        self.seed = None  # Seed of the best
        self.seeds = None

    def predict(self, features):
        """
        Assign each row of ``features`` to a cluster by delegating to the
        stored model's ``assign_clusters`` with the stored centroids.

        ``fit`` or ``load`` must have been called first; otherwise
        ``self.kmeans_model`` is still None and the attribute access fails.

        @param features: data handed unchanged to ``assign_clusters``.
        @return: whatever ``assign_clusters`` returns.
        """
        model = self.kmeans_model
        return model.assign_clusters(data=features,
                                     centroids=self.centroids,
                                     distance=model.distance)

    def load(self, model_path: str):
        """
        Restore ``kmeans_model``, ``centroids`` and ``distance`` from a file
        written by ``save``. ``seed`` and ``seeds`` are not stored in that
        file and are left untouched.

        @param model_path: path of the pickle file to read.
        @return: None
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only call this on model files from a trusted source.
        with open(model_path, "rb") as f:
            data = pickle.load(f)
        self.kmeans_model = data["kmeans_model"]
        self.centroids = data["centroids"]
        # The metric is taken from the restored model, not from the value
        # given to __init__, so that predict() and later fits agree with it.
        self.distance = self.kmeans_model.distance

    def save(self, model_path: str):
        """
        Pickle the retained model and its centroids to ``model_path``.

        @param model_path: path of the file to write (overwritten if present).
        @return: None
        """
        payload = {
            "kmeans_model": self.kmeans_model,
            "centroids": self.centroids,
        }
        with open(model_path, "wb") as f:
            pickle.dump(payload, f)

    def fit(self, features, k: int, tol: float, ninit: int, maxiter: int=300, debug: bool=False):
        """
        Train ``ninit`` KMeans models with different random seeds and keep
        the one with the lowest loss, where the loss is the sum over samples
        of the distance to the assigned centroid.

        @param features: training data; passed to ``KMeans.fit`` and to
                         ``pairwise_distances`` (``features.T`` is also used
                         for the Mahalanobis covariance, so an array-like
                         with a ``.T`` attribute is required in that case).
        @param k: number of clusters requested from KMeans.
        @param tol: accepted for interface compatibility; currently not
                    forwarded to KMeans.
        @param ninit: number of restarts, must be >= 1.
        @param maxiter: forwarded to KMeans as ``maxiter``.
        @param debug: forwarded to KMeans as ``verbose``.
        @return: None; results are stored on the instance.
        @raise ValueError: if ``ninit`` is smaller than 1.
        """
        # Validate before touching any state so that a bad call does not
        # discard a previously fitted model.
        if ninit < 1:
            raise ValueError("ninit must be at least 1, got {!r}".format(ninit))

        # Function-scope import: a private generator avoids reseeding the
        # process-wide ``random`` state as a side effect of fitting.
        from random import Random

        # Initialization
        self.kmeans_model = None
        self.centroids = None
        self.seed = None

        # Draw all seeds up front from an entropy-seeded private generator,
        # so they do not depend on any seeding done elsewhere.
        rng = Random()
        self.seeds = [rng.randint(1, 100000) for _ in range(ninit)]

        # The inverse covariance depends only on ``features``, so compute it
        # once instead of once per restart.
        # NOTE(review): assumes KMeans.fit does not modify ``features`` in
        # place - confirm against the KMeans implementation.
        kwds = {}
        if self.distance == "mahalanobis":
            kwds["VI"] = np.linalg.pinv(np.cov(features.T)).T

        # Learning k-means models, one per seed
        results = []
        for run_seed in self.seeds:
            model = KMeans(k=k,
                           maxiter=maxiter,
                           distance=self.distance,
                           record_heterogeneity=[],
                           verbose=debug,
                           seed=run_seed)
            centroids, closest_cluster = model.fit(features)

            # Distance from every sample to every centroid ...
            distances = pairwise_distances(features, centroids, metric=self.distance, **kwds)

            # ... then keep only the distance to the assigned centroid.
            loss = np.sum(distances[np.arange(len(distances)), closest_cluster])

            results.append({
                "model": model,
                "centroids": centroids,
                "seed": run_seed,
                "loss": loss,
            })

        # min() returns the first minimal element, matching the earlier
        # ``losses.index(min(losses))`` selection.
        best = min(results, key=lambda result: result["loss"])
        self.kmeans_model = best["model"]
        self.centroids = best["centroids"]
        self.seed = best["seed"]
98 | 98 |