Blame view

bin/cluster_kmeans.py 1.17 KB
ac78b07ea   Mathias Quillot   All base bin file...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
  '''
  This script computes k-means clusterings for a given
  data set.
  '''
  
  import argparse
  import numpy as np
  from sklearn.cluster import KMeans
  from os import path
  
  import pickle
  from data import read_file, index_by_id
  
  # -- ARGPARSE --
  # Three positional arguments (features file, list file, output directory)
  # plus an optional inclusive range of cluster counts [--kmin, --kmax].
  parser = argparse.ArgumentParser(description="Cluster with kmeans")
  parser.add_argument("features", type=str, help="Features file")
  parser.add_argument("list", type=str, help="List on which apply kmeans")
  parser.add_argument("outdir", type=str, help="Output directory for k-means models")
  parser.add_argument("--kmin", type=int, help="minimum k", default=2)
  parser.add_argument("--kmax", type=int, help="maximum k", default=100)

  args = vars(parser.parse_args())
  FEATURES = args["features"]
  LST = args["list"]
  OUTDIR = args["outdir"]
  KMIN = args["kmin"]
  KMAX = args["kmax"]

  # -- READ FILES --
  # read_file / index_by_id come from the project-local `data` module.
  features = read_file(FEATURES)
  feat_ind = index_by_id(features)

  lst = read_file(LST)

  # -- TRANSFORM INTO NUMPY --
  # One row per entry of `lst`, looked up in the feature index.
  # NOTE(review): presumably x[0][0] and x[0][3] are the two identifier
  # fields used as nested keys of `feat_ind`, and element [1] of the hit is
  # the feature vector -- confirm against data.read_file / data.index_by_id.
  X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])

  # Fit one model per k in [KMIN, KMAX] (both bounds included) and pickle
  # each fitted model to OUTDIR as "clustering_<k>.pkl".
  Ks = range(KMIN, KMAX+1)
  for k in Ks:
      # random_state is fixed so repeated runs use the same seed.
      kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
      model_path = path.join(OUTDIR, "clustering_" + str(k) + ".pkl")
      # Use a context manager so the file is flushed and closed right after
      # the dump instead of leaking one open handle per value of k.
      with open(model_path, "wb") as model_file:
          pickle.dump(kmeans, model_file)