clustering.py
3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import argparse
from os import path, mkdir
from utils import SubCommandRunner
from core.data import read_features, read_lst
import numpy as np
from sklearn.cluster import KMeans
import pickle
def _fit_and_save(X, n_clusters: int, destination: str) -> None:
    """
    Fit a k-means model on X and pickle the fitted estimator.
    @param X: array of feature vectors, one row per sample
    @param n_clusters: number of clusters to compute
    @param destination: path of the pickle file to write
    """
    print(f"Computing clustering with k={n_clusters}")
    # random_state is fixed so that repeated runs give the same clustering.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(X)
    # The context manager guarantees the file is flushed and closed, even if
    # pickling fails half-way.
    with open(destination, "wb") as stream:
        pickle.dump(kmeans, stream)


def kmeans_run(features: str, lst: str, k: int, kmax: int, klist, output: str):
    """
    Compute one or several k-means clusterings and save each fitted model with pickle.

    Single value mode (neither kmax nor klist given): one model with k clusters
    is written to the file "output".
    Multi values mode (kmax and/or klist given): "output" is a directory (created
    if missing, its parent must exist) receiving one "clustering_<k>.pkl" file per
    value of k. If both kmax and klist are given, the values of both are computed,
    each distinct value only once.

    @param features: output features
    @param lst: list file
    @param k: k (kmin if kmax specified)
    @param kmax: maximum k to compute (inclusive), or None
    @param klist: list of k values to compute (ints or numeric strings), ignore k value; or None
    @param output: output file if neither kmax nor klist is specified, else, output directory
    @raise ValueError: if "output" is an existing directory in single value mode,
                       or an existing file in multi values mode
    """
    multi_values = kmax is not None or klist is not None

    # Exception cases: checked first so that a wrong output path fails before
    # the (potentially large) feature files are loaded.
    if not multi_values and path.isdir(output):
        raise ValueError("The \"output\" is an existing directory while the system is waiting the path of a file.")
    if multi_values and path.isfile(output):
        raise ValueError("The \"output\" is an existing file while the system is waiting the path of a directory.")

    # -- READ FILES --
    features_dict = read_features(features)
    lst_dict = read_lst(lst)
    # Only the entries listed in the list file are clustered, in list order.
    X = np.asarray([features_dict[x] for x in lst_dict])

    # Mono value case
    if not multi_values:
        _fit_and_save(X, k, output)
        return

    # Multi values case: gather the values requested through kmax and/or klist
    if not path.isdir(output):
        mkdir(output)
    requested = []
    if kmax is not None:
        requested.extend(range(k, kmax + 1))
    if klist is not None:
        requested.extend(int(value) for value in klist)

    # dict.fromkeys drops duplicates while keeping the requested order, so a
    # value present in both kmax range and klist is not fitted twice.
    for n_clusters in dict.fromkeys(requested):
        _fit_and_save(X, n_clusters, path.join(output, f"clustering_{n_clusters}.pkl"))
if __name__ == "__main__":
    # Main parser
    parser = argparse.ArgumentParser(description="Clustering methods to apply")
    subparsers = parser.add_subparsers(title="action")

    # kmeans
    parser_kmeans = subparsers.add_parser(
        "kmeans", help="Compute clustering using k-means algorithm")
    parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
    parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
    parser_kmeans.add_argument("-k", default=2, type=int,
                               help="number of clusters to compute. It is kmin if kmax is specified.")
    parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
    # type=int makes argparse reject non-integer values up front instead of
    # failing later, after the feature files have been loaded.
    parser_kmeans.add_argument("--klist", nargs="+", type=int,
                               help="List of k values to test. As kmax, activate the multi values mod.")
    parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.")
    # "which" records the selected action; it is only set when a sub-command is given.
    parser_kmeans.set_defaults(which="kmeans")

    # Parse
    args = parser.parse_args()

    # Sub-commands are optional by default in argparse: without one, "which" is
    # absent and args.which would raise AttributeError. Exit with a usage
    # message instead. (No dest= is added to the sub-parsers on purpose: it
    # would put an extra entry in the namespace passed to the command below.)
    if getattr(args, "which", None) is None:
        parser.error("an action is required (choose from: kmeans)")

    # Run commands
    # NOTE(review): SubCommandRunner comes from utils; presumably it calls the
    # function registered under args.which with the remaining namespace entries
    # as keyword arguments, dropping the "which" key — confirm against utils.
    runner = SubCommandRunner({
        "kmeans": kmeans_run
    })
    runner.run(args.which, vars(args), remove="which")