# extract_vectors.py 1.48 KB
'''
The goal of this script is to extract a subset of vectors from a file.
One file holds the full content, and the list file only enumerates the
vectors you want to keep.
'''

import os
import numpy as np
import argparse

parser = argparse.ArgumentParser(description='Extract a subset of vectors')
parser.add_argument('vectorsfile', type=str,
                    help='the path of the file containing the convectors')
parser.add_argument('listfile', type=str,
                    help='the path of the file containing the list of vectors kept')
parser.add_argument('-o', '--output', type=str,
                    default='a.out',
                    help='the path the output file containing the vectors kept')

# Debug switch: when True, only the lines with index 0..TOY_MAX_INDEX of each
# input file are processed (the loops stop once the index exceeds the limit).
TOY_VERSION = False
TOY_MAX_INDEX = 100


def _parse_key(line, source, line_number):
    """Return the ``(lang, iden)`` lookup key encoded at the start of *line*.

    The metadata is taken to be the first space-delimited token of the line;
    its comma-separated fields 0 and 3 are the language and the identifier.
    NOTE(review): the original code split the whole token list on "," (which
    cannot work); using the first token is the presumed intent -- confirm
    against the real file format.

    Args:
        line: raw line read from the vectors file or the list file.
        source: path of the file the line came from (for error messages).
        line_number: 1-based line number (for error messages).

    Raises:
        ValueError: if the metadata has fewer than four comma-separated fields.
    """
    # Drop the line terminator first so that an identifier sitting at the end
    # of the line does not carry a trailing "\n" into the key.
    meta = line.rstrip("\r\n").split(" ", 1)[0].split(",")
    if len(meta) < 4:
        raise ValueError(
            "{}:{}: expected at least 4 comma-separated metadata fields, "
            "got {}".format(source, line_number, len(meta)))
    return meta[0], meta[3]


def _read_vectors(vector_path, toy=False):
    """Load the vectors file into a dict mapping ``(lang, iden)`` to its line.

    Blank lines are skipped. If a key appears several times, the last
    occurrence wins. Any language code is accepted.
    """
    vectors = {}
    with open(vector_path, "r") as f:
        for i, line in enumerate(f):
            if toy and i > TOY_MAX_INDEX:
                break
            if not line.strip():
                continue
            vectors[_parse_key(line, vector_path, i + 1)] = line
    return vectors


def extract_vectors(vector_path, list_path, output_path, toy=False):
    """Copy the vectors enumerated in *list_path* into *output_path*.

    Records are written in the order in which they appear in the list file,
    each one exactly as it was read from the vectors file.

    Args:
        vector_path: path of the file containing every vector.
        list_path: path of the file listing the vectors to keep.
        output_path: path of the file to create (overwritten if it exists).
        toy: when True, stop reading each input file once the line index
            exceeds ``TOY_MAX_INDEX``.

    Returns:
        The number of records written.

    Raises:
        KeyError: if a listed vector is absent from the vectors file.
        ValueError: if a line has fewer than four metadata fields.
    """
    # The vectors are fully loaded before the output is opened, so the output
    # path may safely be the same as the vectors path.
    vectors = _read_vectors(vector_path, toy=toy)
    kept = 0
    with open(list_path, "r") as f, open(output_path, "w") as o:
        for i, line in enumerate(f):
            if toy and i > TOY_MAX_INDEX:
                break
            if not line.strip():
                continue
            key = _parse_key(line, list_path, i + 1)
            record = vectors.get(key)
            if record is None:
                raise KeyError(
                    "{}:{}: vector (lang={!r}, id={!r}) not found in {}".format(
                        list_path, i + 1, key[0], key[1], vector_path))
            # The last line of the vectors file may lack a terminator; add one
            # so consecutive records never end up on the same output line.
            if not record.endswith("\n"):
                record += "\n"
            o.write(record)
            kept += 1
    return kept


def main(argv=None):
    """Parse the command line (``sys.argv`` when *argv* is None) and run."""
    args = parser.parse_args(argv)
    extract_vectors(args.vectorsfile, args.listfile, args.output,
                    toy=TOY_VERSION)


if __name__ == "__main__":
    main()