Commit e36dbbc98b8ad182980cb714b6c6024b467ac072
1 parent
9aec207cb0
Exists in
master
Improving code style
Showing 1 changed file with 13 additions and 24 deletions Inline Diff
bin/extract_vectors.py
1 | ''' | 1 | ''' |
2 | The goal of this script is to extract vectors from a list. | 2 | The goal of this script is to extract vectors from a list. |
3 | One file is the full content, and the list only enumerates the | 3 | One file is the full content, and the list only enumerates the |
4 | vectors you want to keep. | 4 | vectors you want to keep. |
5 | ''' | 5 | ''' |
6 | 6 | ||
7 | import os | 7 | import os |
8 | import numpy as np | 8 | import numpy as np |
9 | import argparse | 9 | import argparse |
10 | from data import read_file, index_by_id, write_line | ||
10 | 11 | ||
11 | parser = argparse.ArgumentParser(description='Extract a subset of vectors') | 12 | parser = argparse.ArgumentParser(description='Extract a subset of vectors') |
12 | parser.add_argument('vectorsfile', type=str, | 13 | parser.add_argument('vectorsfile', type=str, |
13 | help='the path of the file containing the convectors') | 14 | help='the path of the file containing the convectors') |
14 | parser.add_argument('listfile', type=str, | 15 | parser.add_argument('listfile', type=str, |
15 | help='the path of the file containing the list of vectors kept') | 16 | help='the path of the file containing the list of vectors kept') |
16 | parser.add_argument('-o', '--output', type=str, | 17 | parser.add_argument('-o', '--output', type=str, |
17 | default='a.out', | 18 | default='a.out', |
18 | help='the path the output file containing the vectors kept') | 19 | help='the path the output file containing the vectors kept') |
19 | 20 | ||
20 | args = parser.parse_args() | 21 | args = parser.parse_args() |
21 | 22 | ||
22 | # Setting global variables | 23 | # Setting global variables |
23 | VECTOR_FILE = args.vectorsfile | 24 | VECTOR_FILE = args.vectorsfile |
24 | LIST_FILE = args.listfile | 25 | LIST_FILE = args.listfile |
25 | OUTPUT_FILE = args.output | 26 | OUTPUT_FILE = args.output |
26 | 27 | ||
27 | # READ VECTOR DATA | 28 | # READ VECTOR DATA |
28 | data = {} | 29 | features = read_file(VECTOR_FILE) |
29 | data["en-us"] = {} | 30 | features_ind = index_by_id(features) |
30 | data["fr-fr"] = {} | 31 | lst = read_file(LIST_FILE) |
31 | with open(VECTOR_FILE, "r") as f: | ||
32 | for i, line in enumerate(f): | ||
33 | if TOY_VERSION == True and i > 100: | ||
34 | break | ||
35 | spl_line = line.split(" ") | ||
36 | if(len(pvectors) == 0): | ||
37 | pvectors = np.empty((0, len(spl_line[1:])), np.float32) | ||
38 | spl_meta = spl_line.split(",") | ||
39 | lang = spl_meta[0] | ||
40 | iden = spl_meta[3] | ||
41 | data[lang][iden] = line | ||
42 | 32 | ||
43 | # READ LIST AND WRITE NEW FILE | 33 | |
44 | with open(LIST_FILE, "r") as f, open(OUTPUT_FILE, "w") as o: | 34 | # COMPUTE KEPT FEATS |
45 | for i, line in enumerate(LIST_FILE): | 35 | kept_feats = [features_ind[x[0][0]][x[0][3]] for x in lst] |
46 | if TOY_VERSION == True and i > 100: | 36 | |
47 | break | 37 | # WRITE IN FILE |
48 | spl_meta = line.split(",") | 38 | with open(OUTPUT_FILE, 'w') as f: |
49 | lang = spl_meta[0] | 39 | for feat in kept_feats: |
50 | iden = spl_meta[3] | 40 | write_line(feat[0], feat[1], f=f) |
51 | OUTPUT_FILE.write(data[lang][iden]) | ||
52 | 41 |