From 1c1f0f29a768a0519933f876ecc8a9494e8984a4 Mon Sep 17 00:00:00 2001
From: quillotm
Date: Tue, 24 Aug 2021 09:39:35 +0200
Subject: [PATCH] Now we train n_init time with the basic multidistance implementation of k-means.

---
Review notes (this area is ignored by `git am`):
 * Fixed the best-run selection at the end of the second hunk. `best` is
   already the winning result dict, so `results[best][...]` indexed a list
   with a dict and raised TypeError; the fields are now read from `best`.
 * NOTE(review): `k=5` is still hard-coded although the docstring lists
   `@param k` -- confirm against the method signature (outside this hunk).
 * NOTE(review): the per-run seeds are floats returned by `random()` --
   confirm that KMeans accepts non-integer seeds.

 volia/clustering_modules/kmeans_multidistance.py | 53 +++++++++++++++++++++---
 1 file changed, 48 insertions(+), 5 deletions(-)

diff --git a/volia/clustering_modules/kmeans_multidistance.py b/volia/clustering_modules/kmeans_multidistance.py
index f1c7944..47fe159 100644
--- a/volia/clustering_modules/kmeans_multidistance.py
+++ b/volia/clustering_modules/kmeans_multidistance.py
@@ -2,12 +2,18 @@
 import pickle
 from abstract_clustering import AbstractClustering
 from KMeans_Multidistance.KMeans_Class import KMeans
+from random import seed
+from random import random
+import numpy as np
+from sklearn.metrics import pairwise_distances
 
 class kmeansMultidistance():
     def __init__(self, distance="cosine"):
-        self.kmeans_model = None
+        self.kmeans_model = None # Best model
         self.centroids = None
         self.distance = distance
+        self.seed = None # Seed of the best
+        self.seeds = None
 
     def predict(self, features):
         """
@@ -48,7 +54,44 @@ class kmeansMultidistance():
         @param k:
         @return:
         """
-        model = KMeans(k=5, maxiter=maxiter, distance=self.distance, record_heterogeneity=[], verbose=True, seed=123)
-        centroids, _ = model.fit(features)
-        self.centroids = centroids
-        self.kmeans_model = model
\ No newline at end of file
+        # Initialization
+        self.kmeans_model = None
+        self.centroids = None
+        self.seed = None
+
+        # Compute seeds before using seeds
+        seed()
+        self.seeds = [random() for i in range(ninit)]
+
+        # Learning k-means model
+        results = []
+        for i in range(ninit):
+            model = KMeans(k=5,
+                           maxiter=maxiter,
+                           distance=self.distance,
+                           record_heterogeneity=[],
+                           verbose=True,
+                           seed=self.seeds[i])
+            centroids, closest_cluster = model.fit(features)
+
+            # Compute distance
+            kwds = {}
+            if self.distance == "mahalanobis":
+                VI = np.linalg.pinv(np.cov(features.T)).T
+                kwds = {"VI": VI}
+            distances = pairwise_distances(features, centroids, metric=self.distance, **kwds)
+
+            # Then compute the loss
+            loss = np.sum(distances[np.arange(len(distances)), closest_cluster])
+
+            results.append({
+                "model": model,
+                "centroids": centroids,
+                "seed": self.seeds[i],
+                "loss": loss
+            })
+        losses = [result["loss"] for result in results]
+        best = results[losses.index(min(losses))]
+        self.kmeans_model = best["model"]
+        self.centroids = best["centroids"]
+        self.seed = best["seed"]
-- 
1.8.2.3