extract_kmeans.py
1.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
'''
This script assigns k-means cluster labels to a set of features
using a previously trained k-means model.
'''
import argparse
import numpy as np
import pickle
from data import read_file, index_by_id, write_line
import sys
# -- ARGPARSE --
# Command-line interface: two required positionals (the pickled k-means model
# and the features file) plus two optional flags (--list and --outfile).
parser = argparse.ArgumentParser(description="extract clusters")
parser.add_argument("model", type=str, help="k-means model pickle")
parser.add_argument("features", type=str, help="features")
parser.add_argument("--list", type=str, default=None, help="list file")
parser.add_argument("--outfile", type=str, default=None, help="output file std")
args = vars(parser.parse_args())

MODEL = args["model"]
FEATURES = args["features"]
LST = args["list"]
OUTFILE = args["outfile"]

# OUTFILE is rebound from a path (or None) to a writable stream: standard
# output when --outfile was not given, otherwise the named file opened for
# writing. The stream is closed at the end of the script when it is a file.
if OUTFILE is None:
    OUTFILE = sys.stdout
else:
    OUTFILE = open(OUTFILE, "w")
# -- READ FILE --
# Load the features and build the lookup returned by index_by_id, which the
# rest of the script indexes as feat_ind[<meta[0]>][<meta[3]>].
features = read_file(FEATURES)
feat_ind = index_by_id(features)

# Without --list, every entry of the features file is processed.
if LST is not None:
    lst = read_file(LST)
else:
    lst = features

# NOTE(security): unpickling can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
# The context manager guarantees the model file handle is closed after loading.
with open(MODEL, "rb") as model_file:
    kmeans = pickle.load(model_file)
# -- CONVERT TO NUMPY --
# The whole predict-and-write phase runs inside try/finally so that an output
# file opened earlier is closed even when prediction or writing fails.
try:
    # The first element of each entry is its metadata list; metadata fields
    # 0 and 3 key into feat_ind, and element [1] of the result is what gets
    # fed to the model (presumably the feature vector -- confirm against
    # data.index_by_id).
    X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])

    # An empty input produces a 1-D empty array, which a predictor expecting
    # a 2-D (samples, features) matrix would reject; skip prediction and
    # write nothing in that case.
    if X.shape[0] > 0:
        predictions = kmeans.predict(X)
    else:
        predictions = []

    for i, line in enumerate(lst):
        meta = line[0]
        # Overwrite metadata field 1 in place with the predicted cluster
        # label, stored as a string.
        meta[1] = str(predictions[i])
        write_line(
            meta,
            feat_ind[meta[0]][meta[3]][1],
            OUTFILE
        )
finally:
    # -- CLOSE OUT FILE IF NECESSARY --
    # Never close standard output; only a file opened for --outfile.
    if OUTFILE is not sys.stdout:
        OUTFILE.close()