Quillot Mathias / Clustering

Blame view

bin/cluster_kmeans.py 1.53 KB

'''
Fit one k-means model per value of k on a given data set.

For every k in [--kmin, --kmax] a scikit-learn KMeans model is fitted on the
feature vectors selected by the list file, then pickled into the output
directory as "clustering_<k>.pkl" (inside a per-k sub-directory unless
--allindir is set).
'''

import argparse
import numpy as np
from sklearn.cluster import KMeans
from os import path
from os import mkdir
from os import makedirs
import pickle
from data import read_file, index_by_id


def _str2bool(value):
    '''
    Parse a command-line boolean.

    argparse's ``type=bool`` simply calls ``bool()`` on the raw string, so
    every non-empty value (including "False") would be read as True. This
    helper maps the usual spellings explicitly and rejects anything else.
    '''
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string was the only value the old ``type=bool`` read as False.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("boolean value expected, got %r" % value)


# -- ARGPARSE --
parser = argparse.ArgumentParser(description="Cluster with kmeans")
parser.add_argument("features", type=str, help="Features file")
parser.add_argument("list", type=str, help="List on which apply kmeans")
parser.add_argument("outdir", type=str,
                    help="Output directory for k-means models")
parser.add_argument("--kmin", type=int, help="minimum k", default=2)
parser.add_argument("--kmax", type=int, help="maximum k", default=100)
parser.add_argument("--allindir", type=_str2bool, default=False,
                    help="all in same dir or separed ?")

args = vars(parser.parse_args())
FEATURES = args["features"]
LST = args["list"]
OUTDIR = args["outdir"]
KMIN = args["kmin"]
KMAX = args["kmax"]
ALLINDIR = args["allindir"]

# -- READ FILES --
features = read_file(FEATURES)
feat_ind = index_by_id(features)
lst = read_file(LST)

# -- TRANSFORM INTO NUMPY --
# Each list entry selects one feature vector through a two-level lookup in
# the index. NOTE(review): the exact record layout (x[0][0], x[0][3], [1])
# is defined by the project's `data` module -- confirm there.
X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])

# -- FIT AND SAVE ONE MODEL PER K --
for k in range(KMIN, KMAX + 1):
    # Fixed random_state so that repeated runs produce the same models.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)

    # One sub-directory per k, unless everything goes directly in OUTDIR.
    subdir = "" if ALLINDIR else str(k)
    dirname = path.join(OUTDIR, subdir)
    # makedirs also creates OUTDIR itself (and parents) when missing, and
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    makedirs(dirname, exist_ok=True)

    model_path = path.join(dirname, "clustering_" + str(k) + ".pkl")
    # Context manager guarantees the file is flushed and closed.
    with open(model_path, "wb") as model_file:
        pickle.dump(kmeans, model_file)