Blame view
bin/extract_vectors.py
1.48 KB
ac78b07ea All base bin file... |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
"""Extract a subset of vectors from a vectors file.

One file holds the full content (one vector per line); the list file only
enumerates the vectors you want to keep. Every vector line whose key appears
in the list file is copied, unchanged, to the output file, in list order.

Expected line layout (inferred from the parsing code, confirm against the
real data): the first space-separated token of a vector line is a
comma-separated metadata field whose item 0 is the language (e.g. ``en-us``)
and whose item 3 is the identifier; the remaining tokens are the vector
values. Each list line starts with the same comma-separated metadata.
"""

import os
import numpy as np
import argparse

# Debug switch: when True, only lines with index 0..TOY_MAX_INDEX of each
# input file are processed. The original script referenced this name without
# ever defining it, which raised NameError on the first line read.
TOY_VERSION = False
TOY_MAX_INDEX = 100

# Positions of the language and identifier inside the comma-separated metadata.
_LANG_FIELD = 0
_ID_FIELD = 3


def parse_key(meta, source="<input>", lineno=0):
    """Return the ``(lang, iden)`` key encoded in a metadata string.

    Args:
        meta: comma-separated metadata; surrounding whitespace is ignored.
        source: name of the file being parsed, used only in error messages.
        lineno: 1-based line number, used only in error messages.

    Raises:
        ValueError: if the metadata has fewer than four comma-separated fields.
    """
    fields = meta.strip().split(",")
    if len(fields) <= _ID_FIELD:
        raise ValueError(
            "%s:%d: expected at least %d comma-separated fields, got %r"
            % (source, lineno, _ID_FIELD + 1, meta.strip())
        )
    return fields[_LANG_FIELD], fields[_ID_FIELD]


def load_vectors(path, toy=False):
    """Read the vectors file into a ``{lang: {iden: line}}`` mapping.

    Args:
        path: path of the file containing every vector.
        toy: when True, stop after the line with index ``TOY_MAX_INDEX``.

    Returns:
        A dict of dicts mapping language, then identifier, to the complete
        original line (always newline-terminated). If a key occurs more than
        once, the last occurrence wins.

    Raises:
        ValueError: if a non-blank line has malformed metadata.
    """
    # The two languages the original script pre-declared stay present even
    # when empty; any other language is added on first sight.
    data = {"en-us": {}, "fr-fr": {}}
    with open(path, "r") as f:
        for i, line in enumerate(f):
            if toy and i > TOY_MAX_INDEX:
                break
            if not line.strip():
                continue  # tolerate blank lines
            # Only the first space-separated token carries the metadata.
            lang, iden = parse_key(line.split(" ", 1)[0], path, i + 1)
            # A final line without a newline would otherwise be glued to the
            # next line written to the output file.
            if not line.endswith("\n"):
                line += "\n"
            data.setdefault(lang, {})[iden] = line
    return data


def extract_vectors(data, list_path, output_path, toy=False):
    """Write to ``output_path`` the vector lines selected by ``list_path``.

    Args:
        data: mapping produced by :func:`load_vectors`.
        list_path: path of the file listing the vectors to keep.
        output_path: path of the file to create (overwritten if it exists).
        toy: when True, stop after the line with index ``TOY_MAX_INDEX``.

    Returns:
        The number of vector lines written.

    Raises:
        KeyError: if a listed vector is absent from ``data``.
        ValueError: if a non-blank list line has malformed metadata.
    """
    written = 0
    with open(list_path, "r") as f, open(output_path, "w") as o:
        # Iterate over the open handle, not over the characters of the path.
        for i, line in enumerate(f):
            if toy and i > TOY_MAX_INDEX:
                break
            if not line.strip():
                continue
            # Use the first token so the key is built exactly as in
            # load_vectors, and so a trailing newline never ends up in iden.
            lang, iden = parse_key(line.split(" ", 1)[0], list_path, i + 1)
            try:
                vector_line = data[lang][iden]
            except KeyError:
                raise KeyError(
                    "%s:%d: vector (%s, %s) not found in the vectors file"
                    % (list_path, i + 1, lang, iden)
                ) from None
            o.write(vector_line)
            written += 1
    return written


def build_parser():
    """Build the command-line parser of the script."""
    parser = argparse.ArgumentParser(description='Extract a subset of vectors')
    parser.add_argument('vectorsfile', type=str,
                        help='the path of the file containing the convectors')
    parser.add_argument('listfile', type=str,
                        help='the path of the file containing the list of vectors kept')
    parser.add_argument('-o', '--output', type=str, default='a.out',
                        help='the path the output file containing the vectors kept')
    return parser


def main(argv=None):
    """Run the extraction; ``argv`` defaults to ``sys.argv[1:]``.

    Returns:
        The number of vector lines written to the output file.
    """
    args = build_parser().parse_args(argv)
    data = load_vectors(args.vectorsfile, toy=TOY_VERSION)
    return extract_vectors(data, args.listfile, args.output, toy=TOY_VERSION)


if __name__ == "__main__":
    main()