Commit e36dbbc98b8ad182980cb714b6c6024b467ac072

Authored by Mathias Quillot
1 parent 9aec207cb0
Exists in master

Improving code style

Showing 1 changed file with 13 additions and 24 deletions Inline Diff

bin/extract_vectors.py
1 ''' 1 '''
2 The goal of this script is to extract vectors from a list. 2 The goal of this script is to extract vectors from a list.
3 One file is the full content, and the list only enumerates the 3 One file is the full content, and the list only enumerates the
4 vectors you want to keep. 4 vectors you want to keep.
5 ''' 5 '''
6 6
7 import os 7 import os
8 import numpy as np 8 import numpy as np
9 import argparse 9 import argparse
10 from data import read_file, index_by_id, write_line
10 11
11 parser = argparse.ArgumentParser(description='Extract a subset of vectors') 12 parser = argparse.ArgumentParser(description='Extract a subset of vectors')
12 parser.add_argument('vectorsfile', type=str, 13 parser.add_argument('vectorsfile', type=str,
13 help='the path of the file containing the convectors') 14 help='the path of the file containing the convectors')
14 parser.add_argument('listfile', type=str, 15 parser.add_argument('listfile', type=str,
15 help='the path of the file containing the list of vectors kept') 16 help='the path of the file containing the list of vectors kept')
16 parser.add_argument('-o', '--output', type=str, 17 parser.add_argument('-o', '--output', type=str,
17 default='a.out', 18 default='a.out',
18 help='the path the output file containing the vectors kept') 19 help='the path the output file containing the vectors kept')
19 20
20 args = parser.parse_args() 21 args = parser.parse_args()
21 22
22 # Editing global variable 23 # Editing global variable
23 VECTOR_FILE = args.vectorsfile 24 VECTOR_FILE = args.vectorsfile
24 LIST_FILE = args.listfile 25 LIST_FILE = args.listfile
25 OUTPUT_FILE = args.output 26 OUTPUT_FILE = args.output
26 27
27 # READ VECTOR DATA 28 # READ VECTOR DATA
28 data = {} 29 features = read_file(VECTOR_FILE)
29 data["en-us"] = {} 30 features_ind = index_by_id(features)
30 data["fr-fr"] = {} 31 lst = read_file(LIST_FILE)
31 with open(VECTOR_FILE, "r") as f:
32 for i, line in enumerate(f):
33 if TOY_VERSION == True and i > 100:
34 break
35 spl_line = line.split(" ")
36 if(len(pvectors) == 0):
37 pvectors = np.empty((0, len(spl_line[1:])), np.float32)
38 spl_meta = spl_line.split(",")
39 lang = spl_meta[0]
40 iden = spl_meta[3]
41 data[lang][iden] = line
42 32
43 # READ LIST AND WRITE NEW FILE 33
44 with open(LIST_FILE, "r") as f, open(OUTPUT_FILE, "w") as o: 34 # COMPUTE KEPT FEATS
45 for i, line in enumerate(LIST_FILE): 35 kept_feats = [features_ind[x[0][0]][x[0][3]] for x in lst]
46 if TOY_VERSION == True and i > 100: 36
47 break 37 # WRITE IN FILE
48 spl_meta = line.split(",") 38 with open(OUTPUT_FILE, 'w') as f:
49 lang = spl_meta[0] 39 for feat in kept_feats:
50 iden = spl_meta[3] 40 write_line(feat[0], feat[1], f=f)
51 OUTPUT_FILE.write(data[lang][iden])
52 41