Commit 0c12dd8941fe2df65582721840c61d17a08a9c77
1 parent
b6841495fc
Exists in
master
usable script
Showing 1 changed file with 11 additions and 3 deletions Inline Diff
bin/cluster_kmeans.py
1 | ''' | 1 | ''' |
2 | This script aims at computing k-means for a given | 2 | This script aims at computing k-means for a given |
3 | data set. | 3 | data set. |
4 | ''' | 4 | ''' |
5 | 5 | ||
6 | import argparse | 6 | import argparse |
7 | import numpy as np | 7 | import numpy as np |
8 | from sklearn.cluster import KMeans | 8 | from sklearn.cluster import KMeans |
9 | from os import path | 9 | from os import path |
10 | from os import mkdir | ||
10 | 11 | ||
11 | import pickle | 12 | import pickle |
12 | from data import read_file, index_by_id | 13 | from data import read_file, index_by_id |
13 | 14 | ||
14 | # -- ARGPARSE -- | 15 | # -- ARGPARSE -- |
15 | parser = argparse.ArgumentParser(description="Cluster with kmeans") | 16 | parser = argparse.ArgumentParser(description="Cluster with kmeans") |
16 | parser.add_argument("features", type=str, help="Features file") | 17 | parser.add_argument("features", type=str, help="Features file") |
17 | parser.add_argument("list", type=str, help="List on which apply kmeans") | 18 | parser.add_argument("list", type=str, help="List on which apply kmeans") |
18 | parser.add_argument("outdir", type=str, help="Output directory for k-means models") | 19 | parser.add_argument("outdir", type=str, help="Output directory for k-means models") |
19 | parser.add_argument("--kmin", type=int, help="minimum k", default=2) | 20 | parser.add_argument("--kmin", type=int, help="minimum k", default=2) |
20 | parser.add_argument("--kmax", type=int, help="maximum k", default=100) | 21 | parser.add_argument("--kmax", type=int, help="maximum k", default=100) |
22 | parser.add_argument("--allindir", type=bool, default=False, | ||
23 | help="all in same dir or separed ?") | ||
21 | 24 | ||
22 | args = vars(parser.parse_args()) | 25 | args = vars(parser.parse_args()) |
23 | FEATURES = args["features"] | 26 | FEATURES = args["features"] |
24 | LST = args["list"] | 27 | LST = args["list"] |
25 | OUTDIR = args["outdir"] | 28 | OUTDIR = args["outdir"] |
26 | KMIN = args["kmin"] | 29 | KMIN = args["kmin"] |
27 | KMAX = args["kmax"] | 30 | KMAX = args["kmax"] |
31 | ALLINDIR = args["allindir"] | ||
28 | 32 | ||
29 | # -- READ FILES -- | 33 | # -- READ FILES -- |
30 | features = read_file(FEATURES) | 34 | features = read_file(FEATURES) |
31 | feat_ind = index_by_id(features) | 35 | feat_ind = index_by_id(features) |
32 | 36 | ||
33 | lst = read_file(LST) | 37 | lst = read_file(LST) |
34 | 38 | ||
39 | subdir = "" | ||
35 | # -- TRANSFORM INTO NUMPY -- | 40 | # -- TRANSFORM INTO NUMPY -- |
36 | X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst]) | 41 | X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst]) |
37 | |||
38 | Ks = range(KMIN, KMAX+1) | 42 | Ks = range(KMIN, KMAX+1) |
39 | for k in Ks: | 43 | for k in Ks: |
40 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | 44 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) |
41 | pickle.dump(kmeans, open(path.join(OUTDIR, "clustering_" + str(k) + ".pkl"), "wb")) | 45 | if ALLINDIR is False: |
46 | subdir = str(k) | ||
47 | dirname=path.join(OUTDIR, subdir) | ||
48 | if not path.exists(dirname): | ||
49 | mkdir(dirname) | ||
50 | pickle.dump(kmeans, open(path.join(OUTDIR, subdir, "clustering_" + str(k) + ".pkl"), "wb")) |