Commit 9aec207cb0722ed298e1cfd8acf39f89d177f27b

Authored by Mathias Quillot
1 parent 60d1f63cd5
Exists in master

little change

Showing 1 changed file with 1 addition and 0 deletions Inline Diff

bin/cluster_kmeans.py
'''
This script computes k-means clusterings for a given data set.

For every k between --kmin and --kmax (inclusive), a KMeans model is
fitted on the selected feature vectors and pickled into the output
directory, either in one sub-directory per k (default) or all in the
same directory (--allindir True).
'''

import argparse
import numpy as np
from sklearn.cluster import KMeans
from os import path
from os import mkdir
from os import makedirs

import pickle
from data import read_file, index_by_id


def _str2bool(value):
    '''
    Parse a command-line string as a boolean.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False") ends up True. This parser maps the
    usual spellings explicitly and rejects anything else.
    '''
    lowered = value.lower()
    if lowered in ("1", "true", "t", "yes", "y", "on"):
        return True
    # The empty string was the only falsy input accepted by ``type=bool``;
    # it is kept as False for backward compatibility.
    if lowered in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean, got %r" % value)


# -- ARGPARSE --
parser = argparse.ArgumentParser(description="Cluster with kmeans")
parser.add_argument("features", type=str, help="Features file")
parser.add_argument("list", type=str, help="List on which apply kmeans")
parser.add_argument("outdir", type=str, help="Output directory for k-means models")
parser.add_argument("--kmin", type=int, help="minimum k", default=2)
parser.add_argument("--kmax", type=int, help="maximum k", default=100)
parser.add_argument("--allindir", type=_str2bool, default=False,
                    help="all in same dir or separed ?")

args = vars(parser.parse_args())
FEATURES = args["features"]
LST = args["list"]
OUTDIR = args["outdir"]
KMIN = args["kmin"]
KMAX = args["kmax"]
ALLINDIR = args["allindir"]

# -- READ FILES --
features = read_file(FEATURES)
feat_ind = index_by_id(features)

lst = read_file(LST)

# -- TRANSFORM INTO NUMPY --
# NOTE(review): the record layout (x[0][0], x[0][3] used as lookup keys and
# element [1] taken as the feature vector) is defined by data.read_file /
# data.index_by_id -- confirm against that module.
X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])

# -- FIT AND SAVE ONE MODEL PER K --
Ks = range(KMIN, KMAX+1)
for k in Ks:
    # random_state is fixed so that repeated runs give the same models.
    # The cluster assignment of the training data is available afterwards
    # as kmeans.labels_, so no separate predict(X) pass is needed here.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)

    # One sub-directory per k unless everything goes into OUTDIR itself.
    subdir = "" if ALLINDIR else str(k)
    dirname = path.join(OUTDIR, subdir)
    if not path.exists(dirname):
        # makedirs also creates OUTDIR when it does not exist yet.
        makedirs(dirname)

    model_path = path.join(dirname, "clustering_" + str(k) + ".pkl")
    # The context manager guarantees the pickle is flushed and closed.
    with open(model_path, "wb") as model_file:
        pickle.dump(kmeans, model_file)