"""
Replace the label of each entry of a data file (metas or features) with a
new label taken from a character information file.

The new label is the value of the column selected with ``--field`` in the
CSV row whose ``character_id`` equals the entry's current label.
"""
import argparse
import copy
import csv
import sys

import numpy as np

from data import read_file, index_by_id, write_line

# -- ARGPARSE
parser = argparse.ArgumentParser("")
parser.add_argument("metas", type=str, help="metas file (or features) with character label")
parser.add_argument("char_info_file", type=str, help="csv file with char info")
parser.add_argument("--field", type=str, default="gender", help="field of info char file that you want to give as replacement")
parser.add_argument("--outfile", type=str, default="out.lst", help="outfile")
parser.add_argument("--lst", default=None, type=str, help="Given list to only take a subset")

args = parser.parse_args()

METAS = args.metas
CHAR_INFO_FILE = args.char_info_file
FIELD = args.field
LST = args.lst
OUTFILE = args.outfile

# -- READ FILES
metas = read_file(METAS)
metas_ind = index_by_id(metas)

char_info = []
char_info_ind = {}
with open(CHAR_INFO_FILE, newline='') as f:
    reader = csv.DictReader(f)
    # Validate the header up front: a missing column would otherwise only
    # surface as a KeyError after the output file has been truncated.
    # ``fieldnames`` is None for an empty file.
    fieldnames = reader.fieldnames or []
    for column in ("character_id", FIELD):
        if column not in fieldnames:
            print("Column missing from the character information file: "
                  + str(column))
            sys.exit(-1)
    for row in reader:
        char_info.append(row)
        # Later rows overwrite earlier ones sharing the same character_id.
        char_info_ind[row["character_id"]] = row

# Entries to process: the optional subset list if given, else every entry of
# the metas file. Each entry is identified by the pair (x[0][0], x[0][3]),
# which are the two keys used to look records up in ``metas_ind``.
lst = read_file(LST) if LST is not None else None
selected = lst if lst is not None else metas
ids = [(x[0][0], x[0][3]) for x in selected]

# -- GET CHARACTERS FOR EACH FILE
# The current character label is field 1 of a record's first element.
if lst is not None:
    meta_chars = [metas_ind[x[0][0]][x[0][3]][0][1] for x in lst]
else:
    meta_chars = [x[0][1] for x in metas]
meta_chars_uniq = np.unique(np.asarray(meta_chars))

info_chars = list(char_info_ind.keys())

# -- CHECK THAT EVERY CHARACTER MATCHES
# Abort before writing anything if a label has no row in the info file.
for char in meta_chars_uniq:
    if char not in char_info_ind:
        print("A character is not in the information file: " + str(char))
        sys.exit(-1)

# -- REPLACE ORIGINAL VALUES AND WRITE FILE
with open(OUTFILE, "w") as f:
    for first_key, second_key in ids:
        line_ = metas_ind[first_key][second_key]
        # Work on a shallow copy: mutating the indexed record in place would
        # make a second visit of the same entry look up the already-replaced
        # value as if it were a character id.
        meta = copy.copy(line_[0])
        meta[1] = char_info_ind[meta[1]][FIELD]
        # line_[1] is passed through untouched (presumably the entry's
        # payload/features -- confirm against data.write_line).
        write_line(meta, line_[1], f)