Blame view
bin/cluster_kmeans.py
1.17 KB
ac78b07ea All base bin file... |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
"""Fit one k-means model per cluster count and pickle each model to disk.

For every k from ``--kmin`` to ``--kmax`` (both inclusive), a ``KMeans``
model is fitted on the feature vectors selected by the list file and saved
as ``<outdir>/clustering_<k>.pkl``.

Positional arguments:
    features  Features file.
    list      List on which to apply k-means.
    outdir    Output directory for the k-means models.
"""
import argparse
import pickle
from os import path

import numpy as np
from sklearn.cluster import KMeans

from data import read_file, index_by_id

# -- ARGPARSE --
parser = argparse.ArgumentParser(description="Cluster with kmeans")
parser.add_argument("features", type=str, help="Features file")
parser.add_argument("list", type=str, help="List on which apply kmeans")
parser.add_argument("outdir", type=str,
                    help="Output directory for k-means models")
parser.add_argument("--kmin", type=int, help="minimum k", default=2)
parser.add_argument("--kmax", type=int, help="maximum k", default=100)

args = vars(parser.parse_args())
FEATURES = args["features"]
LST = args["list"]
OUTDIR = args["outdir"]
KMIN = args["kmin"]
KMAX = args["kmax"]

# -- READ FILES --
features = read_file(FEATURES)
feat_ind = index_by_id(features)
lst = read_file(LST)

# -- TRANSFORM INTO NUMPY --
# NOTE(review): the record layout is defined by data.read_file /
# data.index_by_id, which are not visible here. Presumably x[0][0] and
# x[0][3] are two identifier fields used as nested keys into feat_ind and
# element [1] is the feature vector -- confirm against the data module.
X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])

# KMAX is inclusive, hence the +1.
Ks = range(KMIN, KMAX + 1)
for k in Ks:
    # Fixed random_state keeps the fitted models reproducible across runs.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    model_path = path.join(OUTDIR, "clustering_" + str(k) + ".pkl")
    # Use a context manager so each file is flushed and closed right away
    # instead of leaking one open handle per value of k.
    with open(model_path, "wb") as model_file:
        pickle.dump(kmeans, model_file)