# extract_kmeans.py
'''
This script assigns a cluster label to each feature entry using a
previously trained k-means model, and writes the labelled entries out.
'''

import argparse
import numpy as np
import pickle
from data import read_file, index_by_id, write_line
import sys

# -- ARGPARSE --
# Usage: <model> <features> [--list LIST] [--outfile OUTFILE]
parser = argparse.ArgumentParser(description="extract clusters")
parser.add_argument("model", type=str, help="k-means model pickle")
parser.add_argument("features", type=str, help="features")
parser.add_argument("--list", type=str, default=None, help="list file")
parser.add_argument("--outfile", type=str, default=None, help="output file std")

args = vars(parser.parse_args())
MODEL = args["model"]        # path to the pickled k-means model
FEATURES = args["features"]  # path to the features file
LST = args["list"]           # optional path to a list file, None if absent
OUTFILE = args["outfile"]    # optional output path, None if absent

# From this point on OUTFILE is a writable file object, not a path:
# standard output when no --outfile was given, otherwise the opened file.
# None is a singleton, so test it by identity rather than with `==`.
if OUTFILE is None:
    OUTFILE = sys.stdout
else:
    OUTFILE = open(OUTFILE, "w")

# -- READ FILE --
features = read_file(FEATURES)
# Lookup structure over the features, built by the project helper.
feat_ind = index_by_id(features)

# Entries to label: the list file when one was given, otherwise every
# entry of the features file itself.
if LST is not None:
    lst = read_file(LST)
else:
    lst = features

# Load the trained model. The context manager guarantees the file handle
# is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code while loading;
# only pass model files that come from a trusted source.
with open(MODEL, "rb") as model_file:
    kmeans = pickle.load(model_file)

# The output file was opened earlier in the script; the try/finally makes
# sure it is closed even when prediction or writing raises.
try:
    # -- CONVERT TO NUMPY --
    # The first element of each entry is its metadata list. Elements 0 and
    # 3 of the metadata are the two keys into feat_ind, and element 1 of
    # the record found there is used as the feature vector.
    vectors = [feat_ind[entry[0][0]][entry[0][3]][1] for entry in lst]
    X = np.asarray(vectors)

    predictions = kmeans.predict(X)

    # Store each predicted cluster (as a string) in metadata slot 1, then
    # write the entry out with the same vector that was fed to the model.
    for entry, vector, label in zip(lst, vectors, predictions):
        meta = entry[0]
        meta[1] = str(label)
        write_line(meta, vector, OUTFILE)
finally:
    # -- CLOSE OUT FILE IF NECESSARY --
    # Never close standard output; only close a file this script opened.
    if OUTFILE is not sys.stdout:
        OUTFILE.close()