Quillot Mathias / Clustering

Browse Code »

Commit e36dbbc98b8ad182980cb714b6c6024b467ac072

Authored by Mathias Quillot 2019-07-22 12:07:36 +0200

1 parent 9aec207cb0

Exists in master

Improving code style

Showing 1 changed file with 13 additions and 24 deletions Inline Diff

bin/extract_vectors.py

bin/extract_vectors.py

Diff comments View file @ e36dbbc

1	'''	1	'''
2	The goal of this script is to extract vectors from a list.	2	The goal of this script is to extract vectors from a list.
3	One file is the full content, and the list only enumerates the	3	One file is the full content, and the list only enumerates the
4	vectors you want to keep.	4	vectors you want to keep.
5	'''	5	'''
6		6
7	import os	7	import os
8	import numpy as np	8	import numpy as np
9	import argparse	9	import argparse
		10	from data import read_file, index_by_id, write_line
10		11
11	parser = argparse.ArgumentParser(description='Extract a subset of vectors')	12	parser = argparse.ArgumentParser(description='Extract a subset of vectors')
12	parser.add_argument('vectorsfile', type=str,	13	parser.add_argument('vectorsfile', type=str,
13	help='the path of the file containing the convectors')	14	help='the path of the file containing the convectors')
14	parser.add_argument('listfile', type=str,	15	parser.add_argument('listfile', type=str,
15	help='the path of the file containing the list of vectors kept')	16	help='the path of the file containing the list of vectors kept')
16	parser.add_argument('-o', '--output', type=str,	17	parser.add_argument('-o', '--output', type=str,
17	default='a.out',	18	default='a.out',
18	help='the path the output file containing the vectors kept')	19	help='the path the output file containing the vectors kept')
19		20
20	args = parser.parse_args()	21	args = parser.parse_args()
21		22
22	# Editing global variable	23	# Editing global variable
23	VECTOR_FILE = args.vectorsfile	24	VECTOR_FILE = args.vectorsfile
24	LIST_FILE = args.listfile	25	LIST_FILE = args.listfile
25	OUTPUT_FILE = args.output	26	OUTPUT_FILE = args.output
26		27
27	# READ VECTOR DATA	28	# READ VECTOR DATA
28	data = {}	29	features = read_file(VECTOR_FILE)
29	data["en-us"] = {}	30	features_ind = index_by_id(features)
30	data["fr-fr"] = {}	31	lst = read_file(LIST_FILE)
31	with open(VECTOR_FILE, "r") as f:
32	for i, line in enumerate(f):
33	if TOY_VERSION == True and i > 100:
34	break
35	spl_line = line.split(" ")
36	if(len(pvectors) == 0):
37	pvectors = np.empty((0, len(spl_line[1:])), np.float32)
38	spl_meta = spl_line.split(",")
39	lang = spl_meta[0]
40	iden = spl_meta[3]
41	data[lang][iden] = line
42		32
43	# READ LIST AND WRITE NEW FILE	33
44	with open(LIST_FILE, "r") as f, open(OUTPUT_FILE, "w") as o:	34	# COMPUTE KEPT FEATS
45	for i, line in enumerate(LIST_FILE):	35	kept_feats = [features_ind[x[0][0]][x[0][3]] for x in lst]
46	if TOY_VERSION == True and i > 100:	36
47	break	37	# WRITE IN FILE
48	spl_meta = line.split(",")	38	with open(OUTPUT_FILE, 'w') as f:
49	lang = spl_meta[0]	39	for feat in kept_feats:
50	iden = spl_meta[3]	40	write_line(feat[0], feat[1], f=f)
51	OUTPUT_FILE.write(data[lang][iden])
52		41