Commit 44433f16b467e18dd57b970857e08a5830cad4e1
1 parent
f774442a85
Exists in
master
Updating converter command.
Showing 1 changed file with 7 additions and 3 deletions Inline Diff
volia/masseffect.py
1 | import argparse | 1 | import argparse |
2 | from os import path | 2 | from os import path |
3 | import core.data | 3 | import core.data |
4 | from utils import SubCommandRunner | 4 | from utils import SubCommandRunner |
5 | import os | 5 | import os |
6 | 6 | ||
7 | 7 | ||
8 | def utt2char(features: str, outfile: str): | 8 | def utt2char(features: str, outfile: str): |
9 | """Allow the user to generate utt2char file from masseffect features file. | 9 | """Allow the user to generate utt2char file from masseffect features file. |
10 | 10 | ||
11 | TODO: Don't forget to manage two cases: one with old ids, and another with | 11 | TODO: Don't forget to manage two cases: one with old ids, and another with |
12 | new ones. | 12 | new ones. |
13 | 13 | ||
14 | Args: | 14 | Args: |
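15 | features (str): path of the masseffect features file to read | 15 | features (str): path of the masseffect features file to read |
16 | outfile (str): path of the utt2char output file to write | 16 | outfile (str): path of the utt2char output file to write |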
17 | """ | 17 | """ |
18 | data = core.data.read_features(features) | 18 | data = core.data.read_features(features) |
19 | keys = list(data.keys()) | 19 | keys = list(data.keys()) |
20 | 20 | ||
21 | with open(outfile, "w") as f: | 21 | with open(outfile, "w") as f: |
22 | for key in keys: | 22 | for key in keys: |
23 | splited = key.replace("\n", "").split(",") | 23 | splited = key.replace("\n", "").split(",") |
24 | character = splited[1] | 24 | character = splited[1] |
25 | f.write(",".join(splited) + " " + character + "\n") | 25 | f.write(",".join(splited) + " " + character + "\n") |
26 | 26 | ||
27 | 27 | ||
28 | def char2utt(features: str, outfile: str): | 28 | def char2utt(features: str, outfile: str): |
29 | raise Exception("Not implemented yet") | 29 | raise Exception("Not implemented yet") |
30 | pass | 30 | pass |
31 | 31 | ||
32 | 32 | ||
33 | def wavscp(datadir: str, outfile: str): | 33 | def wavscp(datadir: str, outfile: str): |
34 | """Generate the masseffect wav scp file from the directories. | 34 | """Generate the masseffect wav scp file from the directories. |
35 | 35 | ||
36 | Args: | 36 | Args: |
37 | datadir (str): path of the data directory where "audio_en-us" and "audio_fr-fr" are available | 37 | datadir (str): path of the data directory where "audio_en-us" and "audio_fr-fr" are available |
38 | outfile (str): path of the wav scp output file | 38 | outfile (str): path of the wav scp output file |
39 | 39 | ||
40 | Raises: | 40 | Raises: |
41 | Exception: if one of the directories is not available | 41 | Exception: if one of the directories is not available |
42 | """ | 42 | """ |
43 | en_us_dir = os.path.join(datadir, "audio_en-us") | 43 | en_us_dir = os.path.join(datadir, "audio_en-us") |
44 | fr_fr_dir = os.path.join(datadir, "audio_fr-fr") | 44 | fr_fr_dir = os.path.join(datadir, "audio_fr-fr") |
45 | 45 | ||
46 | if (not os.path.isdir(en_us_dir)) or (not os.path.isdir(fr_fr_dir)): | 46 | if (not os.path.isdir(en_us_dir)) or (not os.path.isdir(fr_fr_dir)): |
47 | raise Exception("Directory audio_en-us or audio_fr-fr does not exist") | 47 | raise Exception("Directory audio_en-us or audio_fr-fr does not exist") |
48 | 48 | ||
49 | _,_,filenames_en=next(os.walk(en_us_dir)) | 49 | _,_,filenames_en=next(os.walk(en_us_dir)) |
50 | # filenames_en = [ os.path.join(en_us_dir, f) for f in filenames_en ] | 50 | # filenames_en = [ os.path.join(en_us_dir, f) for f in filenames_en ] |
51 | dir_en = [ en_us_dir for f in filenames_en ] | 51 | dir_en = [ en_us_dir for f in filenames_en ] |
52 | _,_,filenames_fr=next(os.walk(fr_fr_dir)) | 52 | _,_,filenames_fr=next(os.walk(fr_fr_dir)) |
53 | dir_fr = [ fr_fr_dir for f in filenames_fr ] | 53 | dir_fr = [ fr_fr_dir for f in filenames_fr ] |
54 | # filenames_fr = [ os.path.join(fr_fr_dir, f) for f in filenames_fr ] | 54 | # filenames_fr = [ os.path.join(fr_fr_dir, f) for f in filenames_fr ] |
55 | 55 | ||
56 | directories = dir_en + dir_fr | 56 | directories = dir_en + dir_fr |
57 | filenames = filenames_en + filenames_fr | 57 | filenames = filenames_en + filenames_fr |
58 | 58 | ||
59 | with open(outfile, "w") as f: | 59 | with open(outfile, "w") as f: |
60 | for i, fn in enumerate(filenames): | 60 | for i, fn in enumerate(filenames): |
61 | splited = fn.split(".")[0].split(",") | 61 | splited = fn.split(".")[0].split(",") |
62 | lang = splited[0] | 62 | lang = splited[0] |
63 | character = splited[1] | 63 | character = splited[1] |
64 | record_id = splited[3] | 64 | record_id = splited[3] |
65 | path = os.path.join(directories[i], fn) | 65 | path = os.path.join(directories[i], fn) |
66 | f.write(f"{lang},{character},{record_id} {path}\n") | 66 | f.write(f"{lang},{character},{record_id} {path}\n") |
67 | 67 | ||
68 | 68 | ||
69 | def changelabels(source: str, labels: str, outfile: str): | 69 | def changelabels(source: str, labels: str, outfile: str): |
70 | data_dict = core.data.read_id_values(source) | 70 | data_dict = core.data.read_id_values(source) |
71 | labels_dict = core.data.read_labels(labels) | 71 | labels_dict = core.data.read_labels(labels) |
72 | keys = list(data_dict.keys()) | 72 | keys = list(data_dict.keys()) |
73 | 73 | ||
74 | with open(outfile, "w") as f: | 74 | with open(outfile, "w") as f: |
75 | for key in keys: | 75 | for key in keys: |
76 | splited = key.split(",") | 76 | splited = key.split(",") |
77 | splited[1] = labels_dict[key][0] | 77 | splited[1] = labels_dict[key][0] |
78 | core.data.write_line(",".join(splited), data_dict[key], out=f) | 78 | core.data.write_line(",".join(splited), data_dict[key], out=f) |
79 | 79 | ||
80 | 80 | ||
81 | def converter(file: str, outtype: str, outfile: str): | 81 | def converter(file: str, outtype: str, outfile: str): |
82 | data = core.data.read_id_values(file) | 82 | data = core.data.read_id_values(file) |
83 | 83 | ||
84 | with open(outfile, "w") as of: | 84 | with open(outfile, "w") as of: |
85 | for key in data: | 85 | for key in data: |
86 | splited = key.replace("\n", "").split(",") | 86 | splited = key.replace("\n", "").split(",") |
87 | of.write(key.replace("\n", "") + " " + ",".join([splited[0], splited[1], splited[3]]) + "\n") | 87 | masseffect_id = key.replace("\n", "") |
88 | kaldi_id = ",".join([splited[0], splited[1], splited[3]]) | ||
89 | if outtype == "masseffect2kaldi": | ||
90 | of.write(f"{masseffect_id} {kaldi_id}\n") | ||
91 | elif outtype == "kaldi2masseffect": | ||
92 | of.write(f"{kaldi_id} {masseffect_id}\n") | ||
88 | 93 | ||
89 | 94 | ||
90 | |||
91 | if __name__ == '__main__': | 95 | if __name__ == '__main__': |
92 | # Main parser | 96 | # Main parser |
93 | parser = argparse.ArgumentParser(description="...") | 97 | parser = argparse.ArgumentParser(description="...") |
94 | subparsers = parser.add_subparsers(title="action") | 98 | subparsers = parser.add_subparsers(title="action") |
95 | 99 | ||
96 | # utt2char | 100 | # utt2char |
97 | parser_utt2char = subparsers.add_parser("utt2char", help="generate utt2char file") | 101 | parser_utt2char = subparsers.add_parser("utt2char", help="generate utt2char file") |
98 | parser_utt2char.add_argument("--features", type=str, help="features file") | 102 | parser_utt2char.add_argument("--features", type=str, help="features file") |
99 | parser_utt2char.add_argument("--outfile", type=str, help="output file") | 103 | parser_utt2char.add_argument("--outfile", type=str, help="output file") |
100 | parser_utt2char.set_defaults(which="utt2char") | 104 | parser_utt2char.set_defaults(which="utt2char") |
101 | 105 | ||
102 | # char2utt | 106 | # char2utt |
103 | parser_char2utt = subparsers.add_parser("char2utt", help="generate char2utt file") | 107 | parser_char2utt = subparsers.add_parser("char2utt", help="generate char2utt file") |
104 | parser_char2utt.add_argument("--features", type=str, help="features file") | 108 | parser_char2utt.add_argument("--features", type=str, help="features file") |
105 | parser_char2utt.add_argument("--outfile", type=str, help="output file") | 109 | parser_char2utt.add_argument("--outfile", type=str, help="output file") |
106 | parser_char2utt.set_defaults(which="char2utt") | 110 | parser_char2utt.set_defaults(which="char2utt") |
107 | 111 | ||
108 | # wavscp | 112 | # wavscp |
109 | parser_wavscp = subparsers.add_parser("wavscp", help="generate wav scp file") | 113 | parser_wavscp = subparsers.add_parser("wavscp", help="generate wav scp file") |
110 | parser_wavscp.add_argument("--datadir", required=True, help="data directory of masseffect") | 114 | parser_wavscp.add_argument("--datadir", required=True, help="data directory of masseffect") |
111 | parser_wavscp.add_argument("--outfile", default="wav.scp", help="wav.scp output file") | 115 | parser_wavscp.add_argument("--outfile", default="wav.scp", help="wav.scp output file") |
112 | parser_wavscp.set_defaults(which="wavscp") | 116 | parser_wavscp.set_defaults(which="wavscp") |
113 | 117 | ||
114 | # Change labels | 118 | # Change labels |
115 | parser_changelabels = subparsers.add_parser("changelabels", help="...") | 119 | parser_changelabels = subparsers.add_parser("changelabels", help="...") |
116 | parser_changelabels.add_argument("--source", required=True, type=str, help="source file where we want to change ids.") | 120 | parser_changelabels.add_argument("--source", required=True, type=str, help="source file where we want to change ids.") |
117 | parser_changelabels.add_argument("--labels", required=True, type=str, help="file with labels") | 121 | parser_changelabels.add_argument("--labels", required=True, type=str, help="file with labels") |
118 | parser_changelabels.add_argument("--outfile", required=True, type=str, help="Output file") | 122 | parser_changelabels.add_argument("--outfile", required=True, type=str, help="Output file") |
119 | parser_changelabels.set_defaults(which="changelabels") | 123 | parser_changelabels.set_defaults(which="changelabels") |
120 | 124 | ||
121 | # Create converter | 125 | # Create converter |
122 | parser_converter = subparsers.add_parser("converter", help="Create converter file") | 126 | parser_converter = subparsers.add_parser("converter", help="Create converter file") |
123 | parser_converter.add_argument("--file", | 127 | parser_converter.add_argument("--file", |
124 | type=str, | 128 | type=str, |
125 | required=True, | 129 | required=True, |
126 | help="File with ids from which create converter.") | 130 | help="File with ids from which create converter.") |
127 | parser_converter.add_argument("--outtype", type=str, choices=["complet", "kaldi"]) | 131 | parser_converter.add_argument("--outtype", type=str, choices=["kaldi2masseffect", "masseffect2kaldi"]) |
128 | parser_converter.add_argument("--outfile", type=str, required=True, help="") | 132 | parser_converter.add_argument("--outfile", type=str, required=True, help="") |
129 | parser_converter.set_defaults(which="converter") | 133 | parser_converter.set_defaults(which="converter") |
130 | 134 | ||
131 | 135 | ||
132 | # Parse | 136 | # Parse |
133 | args = parser.parse_args() | 137 | args = parser.parse_args() |
134 | 138 | ||
135 | # Run commands | 139 | # Run commands |
136 | runner = SubCommandRunner({ | 140 | runner = SubCommandRunner({ |
137 | "utt2char" : utt2char, | 141 | "utt2char" : utt2char, |
138 | "char2utt": char2utt, | 142 | "char2utt": char2utt, |
139 | "wavscp": wavscp, | 143 | "wavscp": wavscp, |
140 | "changelabels": changelabels, | 144 | "changelabels": changelabels, |
141 | "converter": converter | 145 | "converter": converter |
142 | }) | 146 | }) |
143 | 147 | ||
144 | runner.run(args.which, args.__dict__, remove="which") | 148 | runner.run(args.which, args.__dict__, remove="which") |