Commit 0c12dd8941fe2df65582721840c61d17a08a9c77

Authored by Mathias Quillot
1 parent b6841495fc
Exists in master

usable script

Showing 1 changed file with 11 additions and 3 deletions Inline Diff

bin/cluster_kmeans.py
"""
Fit k-means models on a data set for a range of cluster counts.

For every k between --kmin and --kmax (inclusive) a scikit-learn KMeans model
is fitted on the feature vectors selected by the list file, then pickled as
``clustering_<k>.pkl``: either directly in the output directory
(--allindir True) or in a per-k sub-directory ``<outdir>/<k>/`` (default).
"""

import argparse
from os import makedirs, mkdir, path
import pickle

import numpy as np

# Spellings accepted for the value of --allindir (compared case-insensitively).
_TRUE_WORDS = ("1", "true", "t", "yes", "y", "on")
_FALSE_WORDS = ("", "0", "false", "f", "no", "n", "off")


def str2bool(value):
    """Convert a command-line word into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is True
    because any non-empty string is truthy, so the flag could never be
    switched off explicitly.

    Raises:
        argparse.ArgumentTypeError: if the word is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    word = value.strip().lower()
    if word in _TRUE_WORDS:
        return True
    if word in _FALSE_WORDS:
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got %r" % (value,))


def build_parser():
    """Build the command-line parser of the script."""
    parser = argparse.ArgumentParser(description="Cluster with kmeans")
    parser.add_argument("features", type=str, help="Features file")
    parser.add_argument("list", type=str, help="List on which apply kmeans")
    parser.add_argument("outdir", type=str,
                        help="Output directory for k-means models")
    parser.add_argument("--kmin", type=int, help="minimum k", default=2)
    parser.add_argument("--kmax", type=int, help="maximum k", default=100)
    parser.add_argument("--allindir", type=str2bool, default=False,
                        help="all in same dir or separed ?")
    return parser


def model_path(outdir, k, all_in_dir):
    """Return the pickle path of the model fitted with ``k`` clusters.

    With ``all_in_dir`` every model lands directly in ``outdir``; otherwise
    each one gets its own sub-directory named after ``k``.
    """
    directory = outdir if all_in_dir else path.join(outdir, str(k))
    return path.join(directory, "clustering_" + str(k) + ".pkl")


def main(argv=None):
    """Parse the arguments, fit one KMeans model per k and pickle each one.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    if args.kmin < 1 or args.kmax < args.kmin:
        # An empty or invalid range would otherwise silently produce nothing.
        parser.error("expected 1 <= kmin <= kmax, got kmin=%d kmax=%d"
                     % (args.kmin, args.kmax))

    # Imported here so that argument errors and --help do not require loading
    # scikit-learn or the project-local data module.
    from sklearn.cluster import KMeans
    from data import read_file, index_by_id

    # -- READ FILES --
    features = read_file(args.features)
    feat_ind = index_by_id(features)
    lst = read_file(args.list)

    # -- TRANSFORM INTO NUMPY --
    # NOTE(review): assumes each list entry x exposes two lookup keys at
    # x[0][0] and x[0][3], and that the indexed record holds its feature
    # vector at position 1 -- confirm against data.read_file / index_by_id.
    X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])

    for k in range(args.kmin, args.kmax + 1):
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
        out_file = model_path(args.outdir, k, args.allindir)
        out_dir = path.dirname(out_file)
        if out_dir:
            # Creates missing parents too and tolerates an existing directory,
            # which avoids the exists()/mkdir() race.
            makedirs(out_dir, exist_ok=True)
        # The context manager guarantees the file is flushed and closed.
        with open(out_file, "wb") as handle:
            pickle.dump(kmeans, handle)


if __name__ == "__main__":
    main()