Blame view

bin/extract_vectors.py 1.48 KB
ac78b07ea   Mathias Quillot   All base bin file...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
  '''
  The goal of this script is to extract the vectors named in a list.
  One file holds the full content, and the list only enumerates the
  vectors you want to keep.
  '''
  
  import os
  import numpy as np
  import argparse
  
  # Debug switch: when True, only the lines whose index is <= TOY_LIMIT are
  # read from each input file, which allows a quick run on very large files.
  TOY_VERSION = False
  TOY_LIMIT = 100


  def build_parser():
      """Build the command-line parser of the script.

      Returns:
          An ``argparse.ArgumentParser`` accepting the vectors file, the list
          file and the optional ``-o/--output`` path (default ``a.out``).
      """
      parser = argparse.ArgumentParser(description='Extract a subset of vectors')
      parser.add_argument('vectorsfile', type=str,
                          help='the path of the file containing the convectors')
      parser.add_argument('listfile', type=str,
                          help='the path of the file containing the list of vectors kept')
      parser.add_argument('-o', '--output', type=str,
                          default='a.out',
                          help='the path the output file containing the vectors kept')
      return parser


  def _vector_key(meta):
      """Return the ``(lang, iden)`` key held by comma-separated metadata.

      The language is the field at index 0 and the identifier is the field at
      index 3, as in the original script.

      Args:
          meta: the metadata string, e.g. ``"en-us,a,b,some-id"``.

      Raises:
          ValueError: if the metadata has fewer than four fields.
      """
      # Strip first: when the identifier is the last field of a line, it would
      # otherwise keep the trailing newline and never match the vectors file.
      fields = meta.strip().split(",")
      if len(fields) < 4:
          raise ValueError(
              "expected at least 4 comma-separated fields, got: {!r}".format(meta))
      return fields[0], fields[3]


  def read_vectors(vectors_path):
      """Index every line of the vectors file by its ``(lang, iden)`` key.

      Each line is expected to start with the comma-separated metadata,
      followed by a space and the vector values.

      Args:
          vectors_path: path of the file containing all the vectors.

      Returns:
          A dict mapping ``(lang, iden)`` to the full, unmodified line. When a
          key appears several times, the last line wins.
      """
      vectors = {}
      with open(vectors_path, "r") as vectors_file:
          for i, line in enumerate(vectors_file):
              if TOY_VERSION and i > TOY_LIMIT:
                  break
              if not line.strip():
                  # Blank lines carry no vector; skip them instead of crashing.
                  continue
              # The metadata is the first space-separated token of the line.
              meta = line.split(" ", 1)[0]
              vectors[_vector_key(meta)] = line
      return vectors


  def extract_vectors(vectors_path, list_path, output_path):
      """Write to ``output_path`` the vectors enumerated in ``list_path``.

      Vectors are written in the order of the list file, one per line.

      Args:
          vectors_path: path of the file containing all the vectors.
          list_path: path of the file listing the metadata of the vectors to
              keep (comma-separated, same field layout as the vectors file).
          output_path: path of the file to write.

      Raises:
          KeyError: if a listed vector is absent from the vectors file.
          ValueError: if a non-blank line has fewer than four metadata fields.
      """
      vectors = read_vectors(vectors_path)
      with open(list_path, "r") as list_file, open(output_path, "w") as out:
          for i, line in enumerate(list_file):
              if TOY_VERSION and i > TOY_LIMIT:
                  break
              if not line.strip():
                  continue
              key = _vector_key(line)
              if key not in vectors:
                  raise KeyError(
                      "vector {!r} (line {} of {}) not found in {}".format(
                          key, i + 1, list_path, vectors_path))
              vector_line = vectors[key]
              # The last line of the vectors file may lack a newline; add it so
              # that two vectors never end up glued on the same output line.
              if not vector_line.endswith("\n"):
                  vector_line += "\n"
              out.write(vector_line)


  def main(argv=None):
      """Parse the command line (``sys.argv`` when ``argv`` is None) and run."""
      args = build_parser().parse_args(argv)
      extract_vectors(args.vectorsfile, args.listfile, args.output)


  if __name__ == "__main__":
      main()