# cluster_kmeans.py (1.5 KB)
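'''
This script computes k-means clusterings of a given data set,
fitting one model for every k between --kmin and --kmax (inclusive)
and pickling each fitted model into the output directory.
'''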

import argparse
import pickle
from os import makedirs
from os import mkdir
from os import path

import numpy as np
from sklearn.cluster import KMeans

from data import read_file, index_by_id

# -- ARGPARSE --
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` calls ``bool()`` on the raw string, so every non-empty
    value -- including "False" or "0" -- used to switch the option on.
    Here the usual spellings of "false" (and the empty string) give False;
    any other value keeps its previous meaning of True.
    """
    token = value.strip().lower()
    return token not in ("", "0", "false", "f", "no", "n", "off")


parser = argparse.ArgumentParser(description="Cluster with kmeans")
parser.add_argument("features", type=str, help="Features file")
parser.add_argument("list", type=str, help="List on which apply kmeans")
parser.add_argument("outdir", type=str, help="Output directory for k-means models")
parser.add_argument("--kmin", type=int, help="minimum k", default=2)
parser.add_argument("--kmax", type=int, help="maximum k", default=100)
# The option still takes an explicit value (e.g. "--allindir True"), so
# existing command lines keep working; only the string parsing changed.
parser.add_argument("--allindir", type=_str2bool, default=False,
                    help="all in same dir or separed ?")

# Expose the parsed arguments as module-level constants used by the rest
# of the script.
args = vars(parser.parse_args())
FEATURES = args["features"]
LST = args["list"]
OUTDIR = args["outdir"]
KMIN = args["kmin"]
KMAX = args["kmax"]
ALLINDIR = args["allindir"]

# -- READ FILES --
features = read_file(FEATURES)
feat_ind = index_by_id(features)

lst = read_file(LST)

# -- TRANSFORM INTO NUMPY --
# One row per entry of the list: the entry supplies two keys (x[0][0] and
# x[0][3]) used to look up its record in the feature index, and element 1
# of that record is taken as the row.
# NOTE(review): the record layout is defined by data.read_file /
# data.index_by_id, which are not visible here -- confirm against them.
X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])

# -- FIT AND SAVE ONE MODEL PER K --
# NOTE(review): KMeans presumably needs at least k rows in X -- check
# KMAX against the size of the list.
Ks = range(KMIN, KMAX+1)
for k in Ks:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)

    # Either every model goes directly into OUTDIR, or each k gets its
    # own sub-directory named after k.
    dirname = OUTDIR if ALLINDIR else path.join(OUTDIR, str(k))
    if dirname:
        # Creates OUTDIR itself when needed and tolerates existing dirs.
        makedirs(dirname, exist_ok=True)

    model_path = path.join(dirname, "clustering_" + str(k) + ".pkl")
    # Context manager so the file is flushed and closed after each dump.
    with open(model_path, "wb") as model_file:
        pickle.dump(kmeans, model_file)