Blame view

bin/cluster_kmeans.py 1.53 KB
ac78b07ea   Mathias Quillot   All base bin file...
1
2
3
4
5
6
7
8
9
  '''
  This script computes k-means clusterings for a given
  data set.
  '''
  
  import argparse
  import numpy as np
  from sklearn.cluster import KMeans
  from os import path
0c12dd894   Mathias Quillot   usable script
10
  from os import mkdir
ac78b07ea   Mathias Quillot   All base bin file...
11
12
13
14
15
16
17
18
19
20
21
  
  import pickle
  from data import read_file, index_by_id
  
  # -- Command-line interface --
  # Three mandatory string positionals followed by the integer bounds on k.
  parser = argparse.ArgumentParser(description="Cluster with kmeans")
  for _name, _help in (
          ("features", "Features file"),
          ("list", "List on which apply kmeans"),
          ("outdir", "Output directory for k-means models")):
      parser.add_argument(_name, type=str, help=_help)
  for _name, _help, _default in (
          ("--kmin", "minimum k", 2),
          ("--kmax", "maximum k", 100)):
      parser.add_argument(_name, type=int, help=_help, default=_default)
0c12dd894   Mathias Quillot   usable script
22
23
  def _str_to_bool(value):
      """Convert a command-line string into a real boolean.

      ``type=bool`` cannot be used with argparse: ``bool("False")`` is True
      because any non-empty string is truthy, so ``--allindir False`` would
      silently enable the option.

      Accepts (case-insensitively) 1/true/t/yes/y as True and
      0/false/f/no/n or the empty string as False.

      Raises:
          argparse.ArgumentTypeError: if the value is not a recognised boolean.
      """
      lowered = value.strip().lower()
      if lowered in ("1", "true", "t", "yes", "y"):
          return True
      # The empty string stays False, as it was under the former bool() call.
      if lowered in ("", "0", "false", "f", "no", "n"):
          return False
      raise argparse.ArgumentTypeError(
          "expected a boolean value, got %r" % value)

  # When True, every model is written directly into the output directory;
  # when False (default), each k gets its own sub-directory.
  parser.add_argument("--allindir", type=_str_to_bool, default=False,
                      help="all in same dir or separed ?")
ac78b07ea   Mathias Quillot   All base bin file...
24
25
26
27
28
29
30
  
  # Parse the command line once and expose each option as a module-level name.
  args = vars(parser.parse_args())
  FEATURES, LST, OUTDIR = args["features"], args["list"], args["outdir"]
  KMIN, KMAX = args["kmin"], args["kmax"]
0c12dd894   Mathias Quillot   usable script
31
  # Value of --allindir. When it is False, the clustering loop below writes
  # each model into a sub-directory of OUTDIR named after k.
  ALLINDIR = args["allindir"]
ac78b07ea   Mathias Quillot   All base bin file...
32
33
34
35
36
37
  
  # -- READ FILES --
  # Load the features file, then index it for lookup.
  # NOTE(review): index_by_id presumably builds an id-keyed structure; it is
  # indexed below as feat_ind[a][b][1] — confirm against the `data` module.
  features = read_file(FEATURES)
  feat_ind = index_by_id(features)
  
  # Load the list of entries on which k-means is applied.
  lst = read_file(LST)
0c12dd894   Mathias Quillot   usable script
38
  # Sub-directory of OUTDIR that receives the models. It stays empty (models
  # go straight into OUTDIR) unless the loop below sets it to str(k).
  subdir = ""
ac78b07ea   Mathias Quillot   All base bin file...
39
40
  # -- Build the numpy sample array --
  # The first element of every list entry is the lookup key: its items 0 and 3
  # select the record in feat_ind, and item 1 of that record is the value kept.
  X = np.asarray([
      feat_ind[key[0]][key[3]][1]
      for key in (entry[0] for entry in lst)
  ])
ac78b07ea   Mathias Quillot   All base bin file...
41
42
43
  # Fit one model for every cluster count from KMIN to KMAX, both included.
  Ks = range(KMIN, KMAX+1)
  for k in Ks:
      # Seed fixed via random_state=0; fit() returns the estimator itself,
      # so building and fitting in two steps leaves `kmeans` unchanged.
      kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
      kmeans.fit(X)
9aec207cb   Mathias Quillot   little change
44
      # Predict the cluster of each sample in X with the fitted model.
      # NOTE(review): `preds` is not read anywhere in the visible code —
      # confirm whether this call is still needed.
      preds = kmeans.predict(X)
0c12dd894   Mathias Quillot   usable script
45
46
47
48
49
50
      if ALLINDIR is False:
          # One sub-directory per k, created on first use.
          subdir = str(k)
          dirname = path.join(OUTDIR, subdir)
          if not path.exists(dirname):
              mkdir(dirname)
      # Serialise the fitted model. The context manager closes the file on
      # every iteration; the previous bare open() was never closed, leaking
      # one handle per value of k.
      model_path = path.join(OUTDIR, subdir, "clustering_" + str(k) + ".pkl")
      with open(model_path, "wb") as model_file:
          pickle.dump(kmeans, model_file)