kmeans_multidistance.py
2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pickle
from abstract_clustering import AbstractClustering
from KMeans_Multidistance.KMeans_Class import KMeans
from random import seed
from random import randint
import numpy as np
from sklearn.metrics import pairwise_distances
class kmeansMultidistance:
    """
    K-means wrapper that runs several randomly seeded restarts of the
    project's ``KMeans`` implementation and keeps the restart with the
    lowest loss (sum of distances between each sample and its assigned
    centroid, measured with the configured metric).
    """

    def __init__(self, distance="cosine"):
        """
        @param distance: name of the metric; passed to ``KMeans`` and to
            ``sklearn.metrics.pairwise_distances`` as ``metric``.
        """
        self.kmeans_model = None  # Best model found by fit() / restored by load()
        self.centroids = None  # Centroids of the best model
        self.distance = distance
        self.seed = None  # Seed of the best restart
        self.seeds = None  # Seeds drawn for every restart of the last fit()

    def predict(self, features):
        """
        Assign each sample to a cluster using the stored model and centroids.

        fit() or load() must have been called first; otherwise
        ``self.kmeans_model`` is None and the call raises AttributeError.

        @param features: samples to assign (same layout as used in fit()).
        @return: whatever ``KMeans.assign_clusters`` returns
            (presumably one cluster index per sample -- confirm in KMeans_Class).
        """
        model = self.kmeans_model
        return model.assign_clusters(
            data=features,
            centroids=self.centroids,
            distance=model.distance,
        )

    def load(self, model_path: str) -> None:
        """
        Restore the model and centroids from a file written by save().

        The best seed and the seed list are not part of the file, so
        ``self.seed`` and ``self.seeds`` are left unchanged.

        @param model_path: path of the pickle file to read.
        @return: None
        """
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only load model files that come from a trusted source.
        with open(model_path, "rb") as f:
            data = pickle.load(f)
        self.kmeans_model = data["kmeans_model"]
        self.centroids = data["centroids"]
        # Keep the wrapper's metric in sync with the one stored on the model.
        self.distance = self.kmeans_model.distance

    def save(self, model_path: str) -> None:
        """
        Pickle the current model and centroids to ``model_path``.

        @param model_path: path of the pickle file to write.
        @return: None
        """
        payload = {
            "kmeans_model": self.kmeans_model,
            "centroids": self.centroids,
        }
        with open(model_path, "wb") as f:
            pickle.dump(payload, f)

    def fit(self, features, k: int, tol: float, ninit: int, maxiter: int = 300, debug: bool = False) -> None:
        """
        Run ``ninit`` independently seeded k-means fits and keep the best one.

        On success ``self.kmeans_model``, ``self.centroids`` and ``self.seed``
        describe the restart with the lowest loss (the first one on ties) and
        ``self.seeds`` holds every seed that was tried.

        @param features: training samples; must expose ``.T`` when the metric
            is "mahalanobis" (i.e. a 2-D numpy array).
        @param k: number of clusters.
        @param tol: accepted for interface compatibility; currently unused
            (it is not forwarded to ``KMeans``).
        @param ninit: number of restarts; must be at least 1.
        @param maxiter: maximum number of iterations per restart.
        @param debug: forwarded to ``KMeans`` as ``verbose``.
        @return: None
        @raise ValueError: if ``ninit`` is smaller than 1.
        """
        if ninit < 1:
            raise ValueError(f"ninit must be at least 1, got {ninit}")

        # Reset any previously fitted state.
        self.kmeans_model = None
        self.centroids = None
        self.seed = None

        # Re-seed the global generator from system entropy, then draw one
        # seed per restart so that the best run can be reproduced later.
        seed()
        self.seeds = [randint(1, 100000) for _ in range(ninit)]

        # Extra metric arguments depend only on the data, not on the restart,
        # so compute them once instead of once per restart.
        metric_kwargs = {}
        if self.distance == "mahalanobis":
            metric_kwargs["VI"] = np.linalg.pinv(np.cov(features.T)).T

        best_model = None
        best_centroids = None
        best_seed = None
        best_loss = None
        for run_seed in self.seeds:
            model = KMeans(k=k,
                           maxiter=maxiter,
                           distance=self.distance,
                           record_heterogeneity=[],
                           verbose=debug,
                           seed=run_seed)
            centroids, closest_cluster = model.fit(features)

            # Loss = sum over samples of the distance to the assigned centroid.
            distances = pairwise_distances(features, centroids, metric=self.distance, **metric_kwargs)
            loss = np.sum(distances[np.arange(len(distances)), closest_cluster])

            # Strict "<" keeps the earliest restart when losses are tied.
            if best_loss is None or loss < best_loss:
                best_model = model
                best_centroids = centroids
                best_seed = run_seed
                best_loss = loss

        self.kmeans_model = best_model
        self.centroids = best_centroids
        self.seed = best_seed