Blame view
bin/cluster_kmeans.py
1.5 KB
ac78b07ea All base bin file... |
1 2 3 4 5 6 7 8 9 |
''' This script aims in computing k-means for a given data set. ''' import argparse import numpy as np from sklearn.cluster import KMeans from os import path |
0c12dd894 usable script |
10 |
from os import mkdir |
ac78b07ea All base bin file... |
11 12 13 14 15 16 17 18 19 20 21 |
import pickle

from data import read_file, index_by_id

# -- ARGPARSE --
# Command-line interface: three required positional paths, followed by the
# optional lower and upper bounds on the number of clusters.
parser = argparse.ArgumentParser(description="Cluster with kmeans")

# Positional arguments, registered in the order they must be supplied.
parser.add_argument("features", help="Features file", type=str)
parser.add_argument("list", help="List on which apply kmeans", type=str)
parser.add_argument(
    "outdir",
    help="Output directory for k-means models",
    type=str,
)

# Optional bounds on k.
parser.add_argument("--kmin", default=2, type=int, help="minimum k")
parser.add_argument("--kmax", default=100, type=int, help="maximum k")
0c12dd894 usable script |
22 23 |
def _parse_bool(text):
    """Convert a command-line token into a boolean.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value -- including "False" and "0" -- evaluates to True.
    This parser interprets the usual spellings explicitly instead.

    Args:
        text: raw string received from the command line.

    Returns:
        True or False according to the spelling of ``text``. The empty
        string maps to False, as ``bool("")`` did.

    Raises:
        argparse.ArgumentTypeError: if ``text`` is not a recognised spelling.
    """
    token = text.strip().lower()
    if token in ("1", "true", "t", "yes", "y", "on"):
        return True
    if token in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % text)


# When true, every model is written directly into the output directory
# instead of one sub-directory per k.
parser.add_argument("--allindir", type=_parse_bool, default=False,
                    help="all in same dir or separed ?")
ac78b07ea All base bin file... |
24 25 26 27 28 29 30 |
# Parse the command line once, then expose every option as a module-level
# constant (same order as the keys listed below).
args = vars(parser.parse_args())
(FEATURES, LST, OUTDIR, KMIN, KMAX, ALLINDIR) = (
    args[option]
    for option in ("features", "list", "outdir", "kmin", "kmax", "allindir")
)
ac78b07ea All base bin file... |
32 33 34 35 36 37 |
# -- READ FILES --
# Load the features file and index its content by id, then load the list
# that selects the entries to cluster.
features = read_file(FEATURES)
feat_ind = index_by_id(features)
lst = read_file(LST)

# Sub-directory component of the output path; it stays empty when all models
# are written directly into OUTDIR.
subdir = ""


# -- TRANSFORM INTO NUMPY --
def _indexed_value(entry):
    """Return the value stored in ``feat_ind`` for one entry of ``lst``.

    Fields 0 and 3 of the entry's first element are used as the two lookup
    keys, and element 1 of the record found is returned.
    NOTE(review): the meaning of these positions is defined by the ``data``
    module, which is not visible here -- confirm against ``read_file``.
    """
    ident = entry[0]
    record = feat_ind[ident[0]][ident[3]]
    return record[1]


X = np.asarray([_indexed_value(entry) for entry in lst])
ac78b07ea All base bin file... |
41 42 43 |
# Imported here so this section is self-contained: unlike mkdir, makedirs
# also creates missing parent directories.
from os import makedirs

# -- CLUSTERING --
# Fit one k-means model for every k in [KMIN, KMAX] and pickle each of them.
Ks = range(KMIN, KMAX + 1)
for k in Ks:
    # n_init and random_state are fixed so that every run uses the same
    # initialisation settings.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)

    # One sub-directory per k, unless everything goes directly into OUTDIR.
    subdir = "" if ALLINDIR else str(k)
    dirname = path.join(OUTDIR, subdir)

    # exist_ok=True replaces the former "check, then mkdir" sequence: it
    # tolerates an already existing directory without a race, and makedirs
    # creates OUTDIR itself when it is missing.
    makedirs(dirname, exist_ok=True)

    # The context manager guarantees the pickle file is flushed and closed
    # before the next (possibly long) fit starts.
    model_path = path.join(dirname, "clustering_" + str(k) + ".pkl")
    with open(model_path, "wb") as model_file:
        pickle.dump(kmeans, model_file)