diff --git a/bin/cluster_kmeans.py b/bin/cluster_kmeans.py
index 63e951a..230757b 100644
--- a/bin/cluster_kmeans.py
+++ b/bin/cluster_kmeans.py
@@ -7,6 +7,7 @@ import argparse
 import numpy as np
 from sklearn.cluster import KMeans
 from os import path
+from os import makedirs
 import pickle
 from data import read_file, index_by_id
 
@@ -18,6 +19,25 @@ parser.add_argument("list", type=str, help="List on which apply kmeans")
 parser.add_argument("outdir", type=str, help="Output directory for k-means models")
 parser.add_argument("--kmin", type=int, help="minimum k", default=2)
 parser.add_argument("--kmax", type=int, help="maximum k", default=100)
+
+
+def _str2bool(value):
+    """Parse a command-line boolean given as text (e.g. "true" or "0")."""
+    # bool("False") is True, so type=bool would turn every non-empty value
+    # into True; map the accepted spellings explicitly instead.
+    text = value.strip().lower()
+    if text in ("1", "true", "t", "yes", "y"):
+        return True
+    if text in ("0", "false", "f", "no", "n", ""):
+        return False
+    raise argparse.ArgumentTypeError("expected a boolean, got %r" % value)
+
+
+# A bare "--allindir" enables the option; "--allindir false" disables it.
+parser.add_argument("--allindir", type=_str2bool, nargs="?", const=True,
+                    default=False,
+                    help="store every model directly in outdir instead of "
+                         "one sub-directory per k")
 
 args = vars(parser.parse_args())
 FEATURES = args["features"]
@@ -25,6 +45,7 @@ LST = args["list"]
 OUTDIR = args["outdir"]
 KMIN = args["kmin"]
 KMAX = args["kmax"]
+ALLINDIR = args["allindir"]
 
 # -- READE FILES --
 features = read_file(FEATURES)
@@ -32,11 +53,15 @@ feat_ind = index_by_id(features)
 
 lst = read_file(LST)
 
 # -- TRANSFORM INTO NUMPY --
 X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])
-
 Ks = range(KMIN, KMAX+1)
 for k in Ks:
     kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
-    pickle.dump(kmeans, open(path.join(OUTDIR, "clustering_" + str(k) + ".pkl"), "wb"))
-
+    # One sub-directory per k unless every model goes straight into OUTDIR.
+    dirname = OUTDIR if ALLINDIR else path.join(OUTDIR, str(k))
+    # exist_ok avoids the exists()/mkdir() race; this also creates OUTDIR itself.
+    makedirs(dirname, exist_ok=True)
+    # Close the pickle file deterministically instead of leaking the handle.
+    with open(path.join(dirname, "clustering_" + str(k) + ".pkl"), "wb") as model_file:
+        pickle.dump(kmeans, model_file)