Commit e36dbbc98b8ad182980cb714b6c6024b467ac072
1 parent
9aec207cb0
Exists in
master
Improving code style
Showing 1 changed file with 13 additions and 24 deletions (side-by-side diff)
bin/extract_vectors.py
... | ... | @@ -7,6 +7,7 @@ |
7 | 7 | import os |
8 | 8 | import numpy as np |
9 | 9 | import argparse |
10 | +from data import read_file, index_by_id, write_line | |
10 | 11 | |
11 | 12 | parser = argparse.ArgumentParser(description='Extract a subset of vectors') |
12 | 13 | parser.add_argument('vectorsfile', type=str, |
13 | 14 | |
... | ... | @@ -25,28 +26,16 @@ |
25 | 26 | OUTPUT_FILE = args.output |
26 | 27 | |
27 | 28 | # READ VECTOR DATA |
28 | -data = {} | |
29 | -data["en-us"] = {} | |
30 | -data["fr-fr"] = {} | |
31 | -with open(VECTOR_FILE, "r") as f: | |
32 | - for i, line in enumerate(f): | |
33 | - if TOY_VERSION == True and i > 100: | |
34 | - break | |
35 | - spl_line = line.split(" ") | |
36 | - if(len(pvectors) == 0): | |
37 | - pvectors = np.empty((0, len(spl_line[1:])), np.float32) | |
38 | - spl_meta = spl_line.split(",") | |
39 | - lang = spl_meta[0] | |
40 | - iden = spl_meta[3] | |
41 | - data[lang][iden] = line | |
29 | +features = read_file(VECTOR_FILE) | |
30 | +features_ind = index_by_id(features) | |
31 | +lst = read_file(LIST_FILE) | |
42 | 32 | |
43 | -# READ LIST AND WRITE NEW FILE | |
44 | -with open(LIST_FILE, "r") as f, open(OUTPUT_FILE, "w") as o: | |
45 | - for i, line in enumerate(LIST_FILE): | |
46 | - if TOY_VERSION == True and i > 100: | |
47 | - break | |
48 | - spl_meta = line.split(",") | |
49 | - lang = spl_meta[0] | |
50 | - iden = spl_meta[3] | |
51 | - OUTPUT_FILE.write(data[lang][iden]) | |
33 | + | |
34 | +# COMPUTE KEPT FEATS | |
35 | +kept_feats = [features_ind[x[0][0]][x[0][3]] for x in lst] | |
36 | + | |
37 | +# WRITE IN FILE | |
38 | +with open(OUTPUT_FILE, 'w') as f: | |
39 | + for feat in kept_feats: | |
40 | + write_line(feat[0], feat[1], f=f) |