Blame view

bin/cluster_kmeans.py 1.5 KB
ac78b07ea   Mathias Quillot   All base bin file...
1
2
3
4
5
6
7
8
9
  '''
  This script computes k-means clusterings of a given data set,
  fitting one model for every k between --kmin and --kmax.
  '''
  
  import argparse
  import numpy as np
  from sklearn.cluster import KMeans
  from os import path
0c12dd894   Mathias Quillot   usable script
10
  from os import mkdir
ac78b07ea   Mathias Quillot   All base bin file...
11
12
13
14
15
16
17
18
19
20
21
  
  import pickle
  from data import read_file, index_by_id
  
  # -- ARGPARSE --
  # Command-line interface: three positional paths followed by the inclusive
  # range of cluster counts to sweep.
  parser = argparse.ArgumentParser(description="Cluster with kmeans")
  for positional, description in (
          ("features", "Features file"),
          ("list", "List on which apply kmeans"),
          ("outdir", "Output directory for k-means models"),
  ):
      parser.add_argument(positional, type=str, help=description)
  for option, description, fallback in (
          ("--kmin", "minimum k", 2),
          ("--kmax", "maximum k", 100),
  ):
      parser.add_argument(option, type=int, help=description, default=fallback)
0c12dd894   Mathias Quillot   usable script
22
23
  # argparse's type=bool calls bool() on the raw text, so any non-empty value
  # (even "False" or "0") used to switch the flag on. Only the usual "false"
  # spellings now disable it; every other value still enables it as before.
  parser.add_argument("--allindir",
                      type=lambda text: text.lower() not in
                      ("", "0", "false", "no", "n", "off"),
                      default=False,
                      help="all in same dir or separed ?")
ac78b07ea   Mathias Quillot   All base bin file...
24
25
26
27
28
29
30
  
  # Parse the command line once (as a plain dict) and expose each value
  # as a module-level constant.
  args = vars(parser.parse_args())
  FEATURES, LST, OUTDIR = (args[name] for name in ("features", "list", "outdir"))
  KMIN, KMAX = args["kmin"], args["kmax"]
0c12dd894   Mathias Quillot   usable script
31
  # When this flag is False, each model is written into its own
  # sub-directory of OUTDIR named after k; otherwise all go into OUTDIR.
  ALLINDIR = args["allindir"]
ac78b07ea   Mathias Quillot   All base bin file...
32
33
34
35
36
37
  
  # -- READ FILES --
  # read_file and index_by_id come from the project-local `data` module.
  # Presumably index_by_id builds a lookup keyed by item id -- TODO confirm.
  features = read_file(FEATURES)
  feat_ind = index_by_id(features)
  
  # Items on which k-means is applied; each row is later used to look up
  # its entry in feat_ind.
  lst = read_file(LST)
0c12dd894   Mathias Quillot   usable script
38
  # Sub-directory (relative to OUTDIR) receiving the pickled models; it stays
  # empty unless the clustering loop replaces it with str(k).
  subdir = ""
ac78b07ea   Mathias Quillot   All base bin file...
39
40
  # -- TRANSFORM INTO NUMPY --
  # The first element of every list row acts as a key: its fields 0 and 3
  # select an entry in feat_ind, and item 1 of that entry is kept
  # (presumably the feature vector -- TODO confirm against the data module).
  X = np.asarray([feat_ind[key[0]][key[3]][1]
                  for key in (row[0] for row in lst)])
ac78b07ea   Mathias Quillot   All base bin file...
41
42
43
  # -- CLUSTERING --
  # Fit one k-means model for every k in [KMIN, KMAX] and pickle each model
  # to <OUTDIR>/<subdir>/clustering_<k>.pkl.
  from os import makedirs  # local import: unlike mkdir, also creates parents

  Ks = range(KMIN, KMAX+1)
  for k in Ks:
      # random_state is pinned so that repeated runs use the same seed.
      kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
      if not ALLINDIR:
          # One sub-directory per k; otherwise subdir keeps its earlier value.
          subdir = str(k)
      dirname = path.join(OUTDIR, subdir)
      # Create the destination (OUTDIR included) when it does not exist yet;
      # an empty dirname means the current directory, which needs no creation.
      if dirname and not path.isdir(dirname):
          makedirs(dirname)
      model_path = path.join(dirname, "clustering_" + str(k) + ".pkl")
      # The context manager closes the file even if pickling fails.
      with open(model_path, "wb") as model_file:
          pickle.dump(kmeans, model_file)