Commit 6c40a57b2cbad2a09bd384f5b8ca94360a470343
1 parent
ee5cc2a7e7
Exists in
master
Allows you to replace the labels in a given meta file with those from another one.
Showing 1 changed file with 73 additions and 0 deletions Inline Diff
bin/replace_label.py
File was created | 1 | ''' | |
2 | This script replaces the labels in a data file (meta or features) | ||
3 | with the given new labels. | ||
4 | These new labels are taken from a character information file. | ||
5 | ''' | ||
6 | import argparse | ||
7 | import numpy as np | ||
8 | import csv | ||
9 | from data import read_file, index_by_id, write_line | ||
10 | |||
# -- ARGPARSE
def parse_args(argv=None):
    """Parse the command line.

    argv: list of argument strings; None means "use sys.argv[1:]".
    Returns the argparse namespace with the attributes ``metas``,
    ``char_info_file``, ``field``, ``outfile`` and ``lst``.
    """
    # The description is passed by keyword: the first positional parameter
    # of ArgumentParser is ``prog``, so passing "" there blanks the program
    # name in the usage message.
    parser = argparse.ArgumentParser(
        description="Replace the character label of a metas/features file "
                    "with a field taken from a character information file.")
    parser.add_argument("metas", type=str, help="metas file (or features) with character label")
    parser.add_argument("char_info_file", type=str, help="csv file with char info")
    parser.add_argument("--field", type=str, default="gender", help="field of info char file that you want to give as replacement")
    parser.add_argument("--outfile", type=str, default="out.lst", help="outfile")
    parser.add_argument("--lst", default=None, type=str, help="Given list to only take a subset")
    return parser.parse_args(argv)


# -- READ FILES
def load_char_info(path):
    """Read the character information CSV and index its rows.

    Returns a dict mapping each row's "character_id" value to the row
    (itself a dict keyed by the CSV header). When an id appears several
    times the last row wins. Raises KeyError if the CSV has no
    "character_id" column.
    """
    with open(path, newline='') as f:
        return {row["character_id"]: row for row in csv.DictReader(f)}


def select_entries(metas_ind, lines):
    """Return the indexed entry for every line, in the order of ``lines``.

    Each line is addressed by the two ids stored at ``line[0][0]`` and
    ``line[0][3]``, which are used as the two key levels of ``metas_ind``.
    Raises KeyError when a line has no matching entry in ``metas_ind``.
    """
    return [metas_ind[line[0][0]][line[0][3]] for line in lines]


# -- CHECK IF A CHARACTERS MATCH
def find_unknown_character(chars, char_info_ind):
    """Return the first character (in sorted order) absent from
    ``char_info_ind``, or None when every character is known."""
    for char in sorted(set(chars)):
        if char not in char_info_ind:
            return char
    return None


# -- REPLACE ORIGINAL VALUES
def relabel(entries, char_info_ind, field):
    """Overwrite the label (``entry[0][1]``) of every entry in place with
    the ``field`` value of the matching character information row.

    All the original labels are read before anything is overwritten: the
    same entry object may appear several times in ``entries``, and reading
    its label again after the first replacement would use the new value as
    a character id.
    Raises KeyError when a label or ``field`` is unknown.
    """
    labels = [entry[0][1] for entry in entries]
    for entry, label in zip(entries, labels):
        entry[0][1] = char_info_ind[label][field]


def main(argv=None):
    """Run the label replacement described by the command line."""
    args = parse_args(argv)

    metas = read_file(args.metas)
    metas_ind = index_by_id(metas)
    char_info_ind = load_char_info(args.char_info_file)

    # Without --lst every line of the metas file is processed; with it,
    # only the lines named by the list file.
    lines = metas if args.lst is None else read_file(args.lst)
    entries = select_entries(metas_ind, lines)

    unknown = find_unknown_character(
        [entry[0][1] for entry in entries], char_info_ind)
    if unknown is not None:
        print("A character is not in the information file: " + str(unknown))
        raise SystemExit(-1)

    # Relabel before opening the output file, so that an unknown --field
    # fails without truncating an existing output file.
    relabel(entries, char_info_ind, args.field)

    # -- WRITE FILE
    with open(args.outfile, "w") as f:
        for entry in entries:
            write_line(entry[0], entry[1], f)


if __name__ == "__main__":
    main()
72 | |||
73 | |||
74 |