Commit 0c12dd8941fe2df65582721840c61d17a08a9c77
1 parent
b6841495fc
Exists in
master
usable script
Showing 1 changed file with 11 additions and 3 deletions Inline Diff
bin/cluster_kmeans.py
| 1 | ''' | 1 | ''' |
| 2 | This script aims at computing k-means for a given | 2 | This script aims at computing k-means for a given |
| 3 | data set. | 3 | data set. |
| 4 | ''' | 4 | ''' |
| 5 | 5 | ||
| 6 | import argparse | 6 | import argparse |
| 7 | import numpy as np | 7 | import numpy as np |
| 8 | from sklearn.cluster import KMeans | 8 | from sklearn.cluster import KMeans |
| 9 | from os import path | 9 | from os import path |
| 10 | from os import mkdir | ||
| 10 | 11 | ||
| 11 | import pickle | 12 | import pickle |
| 12 | from data import read_file, index_by_id | 13 | from data import read_file, index_by_id |
| 13 | 14 | ||
| 14 | # -- ARGPARSE -- | 15 | # -- ARGPARSE -- |
| 15 | parser = argparse.ArgumentParser(description="Cluster with kmeans") | 16 | parser = argparse.ArgumentParser(description="Cluster with kmeans") |
| 16 | parser.add_argument("features", type=str, help="Features file") | 17 | parser.add_argument("features", type=str, help="Features file") |
| 17 | parser.add_argument("list", type=str, help="List on which apply kmeans") | 18 | parser.add_argument("list", type=str, help="List on which apply kmeans") |
| 18 | parser.add_argument("outdir", type=str, help="Output directory for k-means models") | 19 | parser.add_argument("outdir", type=str, help="Output directory for k-means models") |
| 19 | parser.add_argument("--kmin", type=int, help="minimum k", default=2) | 20 | parser.add_argument("--kmin", type=int, help="minimum k", default=2) |
| 20 | parser.add_argument("--kmax", type=int, help="maximum k", default=100) | 21 | parser.add_argument("--kmax", type=int, help="maximum k", default=100) |
| 22 | parser.add_argument("--allindir", type=bool, default=False, | ||
| 23 | help="all in same dir or separed ?") | ||
| 21 | 24 | ||
| 22 | args = vars(parser.parse_args()) | 25 | args = vars(parser.parse_args()) |
| 23 | FEATURES = args["features"] | 26 | FEATURES = args["features"] |
| 24 | LST = args["list"] | 27 | LST = args["list"] |
| 25 | OUTDIR = args["outdir"] | 28 | OUTDIR = args["outdir"] |
| 26 | KMIN = args["kmin"] | 29 | KMIN = args["kmin"] |
| 27 | KMAX = args["kmax"] | 30 | KMAX = args["kmax"] |
| 31 | ALLINDIR = args["allindir"] | ||
| 28 | 32 | ||
| 29 | # -- READ FILES -- | 33 | # -- READ FILES -- |
| 30 | features = read_file(FEATURES) | 34 | features = read_file(FEATURES) |
| 31 | feat_ind = index_by_id(features) | 35 | feat_ind = index_by_id(features) |
| 32 | 36 | ||
| 33 | lst = read_file(LST) | 37 | lst = read_file(LST) |
| 34 | 38 | ||
| 39 | subdir = "" | ||
| 35 | # -- TRANSFORM INTO NUMPY -- | 40 | # -- TRANSFORM INTO NUMPY -- |
| 36 | X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst]) | 41 | X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst]) |
| 37 | |||
| 38 | Ks = range(KMIN, KMAX+1) | 42 | Ks = range(KMIN, KMAX+1) |
| 39 | for k in Ks: | 43 | for k in Ks: |
| 40 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) | 44 | kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) |
| 41 | pickle.dump(kmeans, open(path.join(OUTDIR, "clustering_" + str(k) + ".pkl"), "wb")) | 45 | if ALLINDIR is False: |
| 46 | subdir = str(k) | ||
| 47 | dirname=path.join(OUTDIR, subdir) | ||
| 48 | if not path.exists(dirname): | ||
| 49 | mkdir(dirname) | ||
| 50 | pickle.dump(kmeans, open(path.join(OUTDIR, subdir, "clustering_" + str(k) + ".pkl"), "wb")) |