Commit 9aec207cb0722ed298e1cfd8acf39f89d177f27b
1 parent
60d1f63cd5
Exists in
master
little change
Showing 1 changed file with 1 additions and 0 deletions Inline Diff
bin/cluster_kmeans.py
1 | ''' | 1 | ''' |
2 | This script aims at computing k-means for a given | 2 | This script aims at computing k-means for a given |
3 | data set. | 3 | data set. |
4 | ''' | 4 | ''' |
5 | 5 | ||
6 | import argparse | 6 | import argparse |
7 | import numpy as np | 7 | import numpy as np |
8 | from sklearn.cluster import KMeans | 8 | from sklearn.cluster import KMeans |
9 | from os import path | 9 | from os import path |
10 | from os import mkdir | 10 | from os import mkdir |
11 | 11 | ||
12 | import pickle | 12 | import pickle |
13 | from data import read_file, index_by_id | 13 | from data import read_file, index_by_id |
14 | 14 | ||
15 | # -- ARGPARSE -- | 15 | # -- ARGPARSE -- |
16 | parser = argparse.ArgumentParser(description="Cluster with kmeans") | 16 | parser = argparse.ArgumentParser(description="Cluster with kmeans") |
17 | parser.add_argument("features", type=str, help="Features file") | 17 | parser.add_argument("features", type=str, help="Features file") |
18 | parser.add_argument("list", type=str, help="List on which apply kmeans") | 18 | parser.add_argument("list", type=str, help="List on which apply kmeans") |
19 | parser.add_argument("outdir", type=str, help="Output directory for k-means models") | 19 | parser.add_argument("outdir", type=str, help="Output directory for k-means models") |
20 | parser.add_argument("--kmin", type=int, help="minimum k", default=2) | 20 | parser.add_argument("--kmin", type=int, help="minimum k", default=2) |
21 | parser.add_argument("--kmax", type=int, help="maximum k", default=100) | 21 | parser.add_argument("--kmax", type=int, help="maximum k", default=100) |
22 | parser.add_argument("--allindir", type=bool, default=False, | 22 | parser.add_argument("--allindir", type=bool, default=False, |
23 | help="all in same dir or separed ?") | 23 | help="all in same dir or separed ?") |
24 | 24 | ||
25 | args = vars(parser.parse_args()) | 25 | args = vars(parser.parse_args()) |
26 | FEATURES = args["features"] | 26 | FEATURES = args["features"] |
27 | LST = args["list"] | 27 | LST = args["list"] |
28 | OUTDIR = args["outdir"] | 28 | OUTDIR = args["outdir"] |
29 | KMIN = args["kmin"] | 29 | KMIN = args["kmin"] |
30 | KMAX = args["kmax"] | 30 | KMAX = args["kmax"] |
31 | ALLINDIR = args["allindir"] | 31 | ALLINDIR = args["allindir"] |
32 | 32 | ||
33 | # -- READ FILES -- | 33 | # -- READ FILES -- |
34 | features = read_file(FEATURES) | 34 | features = read_file(FEATURES) |
35 | feat_ind = index_by_id(features) | 35 | feat_ind = index_by_id(features) |
36 | 36 | ||
37 | lst = read_file(LST) | 37 | lst = read_file(LST) |
38 | 38 | ||
39 | subdir = "" | 39 | subdir = "" |
40 | # -- TRANSFORM INTO NUMPY -- | 40 | # -- TRANSFORM INTO NUMPY -- |
41 | X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst]) | 41 | X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst]) |
42 | Ks = range(KMIN, KMAX+1) | 42 | Ks = range(KMIN, KMAX+1) |
43 | for k in Ks: | 43 | for k in Ks: |
44 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | 44 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) |
45 | preds = kmeans.predict(X) | ||
45 | if ALLINDIR is False: | 46 | if ALLINDIR is False: |
46 | subdir = str(k) | 47 | subdir = str(k) |
47 | dirname=path.join(OUTDIR, subdir) | 48 | dirname=path.join(OUTDIR, subdir) |
48 | if not path.exists(dirname): | 49 | if not path.exists(dirname): |
49 | mkdir(dirname) | 50 | mkdir(dirname) |
50 | pickle.dump(kmeans, open(path.join(OUTDIR, subdir, "clustering_" + str(k) + ".pkl"), "wb")) | 51 | pickle.dump(kmeans, open(path.join(OUTDIR, subdir, "clustering_" + str(k) + ".pkl"), "wb")) |
51 | 52 |