cluster_kmeans.py
1.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
'''
This script fits k-means models on a given data set, one model
for each k in the requested range, and pickles each fitted model.
'''
import argparse
import pickle
from os import makedirs
from os import mkdir
from os import path

import numpy as np
from sklearn.cluster import KMeans

from data import read_file, index_by_id
# -- ARGPARSE --
def _str2bool(value):
    """Convert a command-line string into a bool.

    ``type=bool`` cannot be used for this: ``bool("False")`` is True because
    any non-empty string is truthy, so the flag could never be switched off
    explicitly. The accepted spellings are therefore parsed by hand.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string is kept falsy, as it was with ``type=bool``.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got %r" % (value,))


parser = argparse.ArgumentParser(description="Cluster with kmeans")
parser.add_argument("features", type=str, help="Features file")
parser.add_argument("list", type=str, help="List on which apply kmeans")
parser.add_argument("outdir", type=str,
                    help="Output directory for k-means models")
parser.add_argument("--kmin", type=int, help="minimum k", default=2)
parser.add_argument("--kmax", type=int, help="maximum k", default=100)
parser.add_argument("--allindir", type=_str2bool, default=False,
                    help="all in same dir or separed ?")
args = vars(parser.parse_args())

# Module-level constants read by the rest of the script.
FEATURES = args["features"]
LST = args["list"]
OUTDIR = args["outdir"]
KMIN = args["kmin"]
KMAX = args["kmax"]
ALLINDIR = args["allindir"]
# -- READ FILES --
# Load the features file and index it by id, then load the list of entries
# on which k-means is run.
features = read_file(FEATURES)
feat_ind = index_by_id(features)
lst = read_file(LST)

# Default sub-directory name; the clustering loop may overwrite it per k.
subdir = ""

# -- TRANSFORM INTO NUMPY --
# One row per list entry: the entry's first field supplies the two keys
# (positions 0 and 3) used to look the row up in the feature index.
X = np.asarray([
    feat_ind[entry[0][0]][entry[0][3]][1]
    for entry in lst
])

# Every k from KMIN to KMAX, both bounds included.
Ks = range(KMIN, KMAX + 1)
# -- FIT AND SAVE ONE MODEL PER k --
for k in Ks:
    # Fixed random_state so the fit uses the same seed on every run.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)

    # One sub-directory per k, unless every model goes straight into OUTDIR.
    subdir = "" if ALLINDIR else str(k)
    dirname = path.join(OUTDIR, subdir)
    # exist_ok avoids the check-then-create race, and makedirs also creates
    # OUTDIR itself when it does not exist yet.
    makedirs(dirname, exist_ok=True)

    # The context manager closes the file even if pickling fails.
    model_path = path.join(dirname, "clustering_" + str(k) + ".pkl")
    with open(model_path, "wb") as model_file:
        pickle.dump(kmeans, model_file)