Commit 362c9d37a9e82b861d794ffc9dd11cd43d066cce
1 parent
1d8b79f614
Exists in
master
Fixed simple issue
Showing 1 changed file with 4 additions and 3 deletions Inline Diff
volia/masseffect.py
1 | import argparse | 1 | import argparse |
2 | from os import path | 2 | from os import path |
3 | import core.data | 3 | import core.data |
4 | from utils import SubCommandRunner | 4 | from utils import SubCommandRunner |
5 | import os | 5 | import os |
6 | 6 | ||
7 | 7 | ||
8 | def utt2char(features: str, outfile: str): | 8 | def utt2char(features: str, outfile: str): |
9 | """Allow the user to generate utt2char file from masseffect features file. | 9 | """Allow the user to generate utt2char file from masseffect features file. |
10 | 10 | ||
11 | TODO: Don't forget to manage two cases: one with old ids, and another with | 11 | TODO: Don't forget to manage two cases: one with old ids, and another with |
12 | new ones. | 12 | new ones. |
13 | 13 | ||
14 | Args: | 14 | Args: |
15 | features (str): [description] | 15 | features (str): [description] |
16 | outfile (str): [description] | 16 | outfile (str): [description] |
17 | """ | 17 | """ |
18 | data = core.data.read_features(features) | 18 | data = core.data.read_features(features) |
19 | keys = list(data.keys()) | 19 | keys = list(data.keys()) |
20 | 20 | ||
21 | with open(outfile, "w") as f: | 21 | with open(outfile, "w") as f: |
22 | for key in keys: | 22 | for key in keys: |
23 | splited = key.replace("\n", "").split(",") | 23 | splited = key.replace("\n", "").split(",") |
24 | character = splited[1] | 24 | character = splited[1] |
25 | f.write(",".join(splited) + " " + character + "\n") | 25 | f.write(",".join(splited) + " " + character + "\n") |
26 | 26 | ||
27 | 27 | ||
28 | def char2utt(features: str, outfile: str): | 28 | def char2utt(features: str, outfile: str): |
29 | raise Exception("Not implemented yet") | 29 | raise Exception("Not implemented yet") |
30 | pass | 30 | pass |
31 | 31 | ||
32 | 32 | ||
33 | def wavscp(datadir: str, outfile: str): | 33 | def wavscp(datadir: str, outfile: str): |
34 | """Generate the masseffect wav scp file from the directories. | 34 | """Generate the masseffect wav scp file from the directories. |
35 | 35 | ||
36 | Args: | 36 | Args: |
37 | datadir (str): path of the data directory where "audio_en-us" and "audio_fr-fr" are available | 37 | datadir (str): path of the data directory where "audio_en-us" and "audio_fr-fr" are available |
38 | outfile (str): path of the wav scp output file | 38 | outfile (str): path of the wav scp output file |
39 | 39 | ||
40 | Raises: | 40 | Raises: |
41 | Exception: if one of the directories is not available | 41 | Exception: if one of the directories is not available |
42 | """ | 42 | """ |
43 | en_us_dir = os.path.join(datadir, "audio_en-us") | 43 | en_us_dir = os.path.join(datadir, "audio_en-us") |
44 | fr_fr_dir = os.path.join(datadir, "audio_fr-fr") | 44 | fr_fr_dir = os.path.join(datadir, "audio_fr-fr") |
45 | 45 | ||
46 | if (not os.path.isdir(en_us_dir)) or (not os.path.isdir(fr_fr_dir)): | 46 | if (not os.path.isdir(en_us_dir)) or (not os.path.isdir(fr_fr_dir)): |
47 | raise Exception("Directory audio_en-us or audio_fr-fr does not exist") | 47 | raise Exception("Directory audio_en-us or audio_fr-fr does not exist") |
48 | 48 | ||
49 | _,_,filenames_en=next(os.walk(en_us_dir)) | 49 | _,_,filenames_en=next(os.walk(en_us_dir)) |
50 | # filenames_en = [ os.path.join(en_us_dir, f) for f in filenames_en ] | 50 | # filenames_en = [ os.path.join(en_us_dir, f) for f in filenames_en ] |
51 | dir_en = [ en_us_dir for f in filenames_en ] | 51 | dir_en = [ en_us_dir for f in filenames_en ] |
52 | _,_,filenames_fr=next(os.walk(fr_fr_dir)) | 52 | _,_,filenames_fr=next(os.walk(fr_fr_dir)) |
53 | dir_fr = [ fr_fr_dir for f in filenames_fr ] | 53 | dir_fr = [ fr_fr_dir for f in filenames_fr ] |
54 | # filenames_fr = [ os.path.join(fr_fr_dir, f) for f in filenames_fr ] | 54 | # filenames_fr = [ os.path.join(fr_fr_dir, f) for f in filenames_fr ] |
55 | 55 | ||
56 | directories = dir_en + dir_fr | 56 | directories = dir_en + dir_fr |
57 | filenames = filenames_en + filenames_fr | 57 | filenames = filenames_en + filenames_fr |
58 | 58 | ||
59 | with open(outfile, "w") as f: | 59 | with open(outfile, "w") as f: |
60 | for i, fn in enumerate(filenames): | 60 | for i, fn in enumerate(filenames): |
61 | splited = fn.split(".")[0].split(",") | 61 | splited = fn.split(".")[0].split(",") |
62 | lang = splited[0] | 62 | lang = splited[0] |
63 | character = splited[1] | 63 | character = splited[1] |
64 | record_id = splited[3] | 64 | record_id = splited[3] |
65 | path = os.path.join(directories[i], fn) | 65 | path = os.path.join(directories[i], fn) |
66 | f.write(f"{lang},{character},{record_id} {path}\n") | 66 | f.write(f"{lang},{character},{record_id} {path}\n") |
67 | 67 | ||
68 | 68 | ||
69 | def changelabels(source: str, labels: str, outfile: str): | 69 | def changelabels(source: str, labels: str, outfile: str): |
70 | data_dict = core.data.read_id_values(source) | 70 | data_dict = core.data.read_id_values(source) |
71 | labels_dict = core.data.read_labels(labels) | 71 | labels_dict = core.data.read_labels(labels) |
72 | keys = list(data_dict.keys()) | 72 | keys = list(data_dict.keys()) |
73 | 73 | ||
74 | with open(outfile, "w") as f: | 74 | with open(outfile, "w") as f: |
75 | for key in keys: | 75 | for key in keys: |
76 | splited = key.split(",") | 76 | splited = key.split(",") |
77 | splited[1] = labels_dict[key][0] | 77 | splited[1] = labels_dict[key][0] |
78 | core.data.write_line(",".join(splited), data_dict[key], out=f) | 78 | core.data.write_line(",".join(splited), data_dict[key], out=f) |
79 | 79 | ||
80 | 80 | ||
81 | def converter(file: str, outtype: str, outfile: str): | 81 | def converter(file: str, outtype: str, outfile: str): |
82 | data = core.data.read_id_values(file) | 82 | data = core.data.read_id_values(file) |
83 | 83 | ||
84 | with open(outfile, "w") as of: | 84 | with open(outfile, "w") as of: |
85 | for key in data: | 85 | for key in data: |
86 | splited = key.replace("\n", "").split(",") | 86 | splited = key.replace("\n", "").split(",") |
87 | masseffect_id = key.replace("\n", "") | 87 | masseffect_id = key.replace("\n", "") |
88 | kaldi_id = ",".join([splited[0], splited[1], splited[3]]) | 88 | kaldi_id = ",".join([splited[0], splited[1], splited[3]]) |
89 | if outtype == "masseffect2kaldi": | 89 | if outtype == "masseffect2kaldi": |
90 | of.write(f"{masseffect_id} {kaldi_id}\n") | 90 | of.write(f"{masseffect_id} {kaldi_id}\n") |
91 | elif outtype == "kaldi2masseffect": | 91 | elif outtype == "kaldi2masseffect": |
92 | of.write(f"{kaldi_id} {masseffect_id}\n") | 92 | of.write(f"{kaldi_id} {masseffect_id}\n") |
93 | 93 | ||
94 | 94 | ||
95 | def utt2sub(file: str, outfile: str): | 95 | def utt2sub(file: str, outfile: str): |
96 | data = core.data.read_id_values(file) | 96 | data = core.data.read_id_values(file) |
97 | keys = [key for key in data] | 97 | keys = [key for key in data] |
98 | 98 | ||
99 | with open(outfile, "w") as of: | 99 | with open(outfile, "w") as of: |
100 | key_2_subkeys = {} | 100 | key_2_subkeys = {} |
101 | for subkey in keys: | 101 | for subkey in keys: |
102 | key = subkey.replace(" ", "").replace("\n", "").split("_")[:-1] | 102 | key = "_".join(subkey.replace(" ", "").replace("\n", "").split("_")[:-1]) |
103 | key_2_subkeys[key] = [] | 103 | if key not in key_2_subkeys: |
104 | key_2_subkeys[key] = [] | ||
104 | key_2_subkeys[key].append(subkey) | 105 | key_2_subkeys[key].append(subkey) |
105 | 106 | ||
106 | for key in key_2_subkeys: | 107 | for key in key_2_subkeys: |
107 | subkeys_str = " ".join(key_2_subkeys[key]) | 108 | subkeys_str = " ".join(key_2_subkeys[key]) |
108 | of.write(f"{key} {subkeys_str}\n") | 109 | of.write(f"{key} {subkeys_str}\n") |
109 | 110 | ||
110 | 111 | ||
111 | def sub2utt(file: str, outfile: str): | 112 | def sub2utt(file: str, outfile: str): |
112 | data = core.data.read_id_values(file) | 113 | data = core.data.read_id_values(file) |
113 | keys = [key for key in data] | 114 | keys = [key for key in data] |
114 | 115 | ||
115 | with open(outfile, "w") as of: | 116 | with open(outfile, "w") as of: |
116 | for subkey in keys: | 117 | for subkey in keys: |
117 | key = subkey.replace(" ", "").replace("\n", "").split("_")[:-1] | 118 | key = "_".join(subkey.replace(" ", "").replace("\n", "").split("_")[:-1]) |
118 | of.write(f"{subkey} {key}\n") | 119 | of.write(f"{subkey} {key}\n") |
119 | 120 | ||
120 | 121 | ||
121 | if __name__ == '__main__': | 122 | if __name__ == '__main__': |
122 | # Main parser | 123 | # Main parser |
123 | parser = argparse.ArgumentParser(description="...") | 124 | parser = argparse.ArgumentParser(description="...") |
124 | subparsers = parser.add_subparsers(title="action") | 125 | subparsers = parser.add_subparsers(title="action") |
125 | 126 | ||
126 | # utt2char | 127 | # utt2char |
127 | parser_utt2char = subparsers.add_parser("utt2char", help="generate utt2char file") | 128 | parser_utt2char = subparsers.add_parser("utt2char", help="generate utt2char file") |
128 | parser_utt2char.add_argument("--features", type=str, help="features file") | 129 | parser_utt2char.add_argument("--features", type=str, help="features file") |
129 | parser_utt2char.add_argument("--outfile", type=str, help="output file") | 130 | parser_utt2char.add_argument("--outfile", type=str, help="output file") |
130 | parser_utt2char.set_defaults(which="utt2char") | 131 | parser_utt2char.set_defaults(which="utt2char") |
131 | 132 | ||
132 | # char2utt | 133 | # char2utt |
133 | parser_char2utt = subparsers.add_parser("char2utt", help="generate char2utt file") | 134 | parser_char2utt = subparsers.add_parser("char2utt", help="generate char2utt file") |
134 | parser_char2utt.add_argument("--features", type=str, help="features file") | 135 | parser_char2utt.add_argument("--features", type=str, help="features file") |
135 | parser_char2utt.add_argument("--outfile", type=str, help="output file") | 136 | parser_char2utt.add_argument("--outfile", type=str, help="output file") |
136 | parser_char2utt.set_defaults(which="char2utt") | 137 | parser_char2utt.set_defaults(which="char2utt") |
137 | 138 | ||
138 | # wavscp | 139 | # wavscp |
139 | parser_wavscp = subparsers.add_parser("wavscp", help="generate wav scp file") | 140 | parser_wavscp = subparsers.add_parser("wavscp", help="generate wav scp file") |
140 | parser_wavscp.add_argument("--datadir", required=True, help="data directory of masseffect") | 141 | parser_wavscp.add_argument("--datadir", required=True, help="data directory of masseffect") |
141 | parser_wavscp.add_argument("--outfile", default="wav.scp", help="wav.scp output file") | 142 | parser_wavscp.add_argument("--outfile", default="wav.scp", help="wav.scp output file") |
142 | parser_wavscp.set_defaults(which="wavscp") | 143 | parser_wavscp.set_defaults(which="wavscp") |
143 | 144 | ||
144 | # Change labels | 145 | # Change labels |
145 | parser_changelabels = subparsers.add_parser("changelabels", help="...") | 146 | parser_changelabels = subparsers.add_parser("changelabels", help="...") |
146 | parser_changelabels.add_argument("--source", required=True, type=str, help="source file where we want to change ids.") | 147 | parser_changelabels.add_argument("--source", required=True, type=str, help="source file where we want to change ids.") |
147 | parser_changelabels.add_argument("--labels", required=True, type=str, help="file with labels") | 148 | parser_changelabels.add_argument("--labels", required=True, type=str, help="file with labels") |
148 | parser_changelabels.add_argument("--outfile", required=True, type=str, help="Output file") | 149 | parser_changelabels.add_argument("--outfile", required=True, type=str, help="Output file") |
149 | parser_changelabels.set_defaults(which="changelabels") | 150 | parser_changelabels.set_defaults(which="changelabels") |
150 | 151 | ||
151 | # Create converter | 152 | # Create converter |
152 | parser_converter = subparsers.add_parser("converter", help="Create converter file") | 153 | parser_converter = subparsers.add_parser("converter", help="Create converter file") |
153 | parser_converter.add_argument("--file", | 154 | parser_converter.add_argument("--file", |
154 | type=str, | 155 | type=str, |
155 | required=True, | 156 | required=True, |
156 | help="File with ids from which create converter.") | 157 | help="File with ids from which create converter.") |
157 | parser_converter.add_argument("--outtype", type=str, choices=["kaldi2masseffect", "masseffect2kaldi"]) | 158 | parser_converter.add_argument("--outtype", type=str, choices=["kaldi2masseffect", "masseffect2kaldi"]) |
158 | parser_converter.add_argument("--outfile", type=str, required=True, help="") | 159 | parser_converter.add_argument("--outfile", type=str, required=True, help="") |
159 | parser_converter.set_defaults(which="converter") | 160 | parser_converter.set_defaults(which="converter") |
160 | 161 | ||
161 | # Create utt2sub | 162 | # Create utt2sub |
162 | parser_utt2sub = subparsers.add_parser("utt2sub", help="generate utt2sub file") | 163 | parser_utt2sub = subparsers.add_parser("utt2sub", help="generate utt2sub file") |
163 | parser_utt2sub.add_argument("--file", required=True, type=str, help="features, list or labels file with sub ids") | 164 | parser_utt2sub.add_argument("--file", required=True, type=str, help="features, list or labels file with sub ids") |
164 | parser_utt2sub.add_argument("--outfile", required=True, type=str, help="output file") | 165 | parser_utt2sub.add_argument("--outfile", required=True, type=str, help="output file") |
165 | parser_utt2sub.set_defaults(which="utt2sub") | 166 | parser_utt2sub.set_defaults(which="utt2sub") |
166 | 167 | ||
167 | # Create sub2utt | 168 | # Create sub2utt |
168 | parser_sub2utt = subparsers.add_parser("sub2utt", help="generate sub2utt file") | 169 | parser_sub2utt = subparsers.add_parser("sub2utt", help="generate sub2utt file") |
169 | parser_sub2utt.add_argument("--file", required=True, type=str, help="features, list or labels file with sub ids") | 170 | parser_sub2utt.add_argument("--file", required=True, type=str, help="features, list or labels file with sub ids") |
170 | parser_sub2utt.add_argument("--outfile", required=True, type=str, help="output file") | 171 | parser_sub2utt.add_argument("--outfile", required=True, type=str, help="output file") |
171 | parser_sub2utt.set_defaults(which="sub2utt") | 172 | parser_sub2utt.set_defaults(which="sub2utt") |
172 | 173 | ||
173 | 174 | ||
174 | # Parse | 175 | # Parse |
175 | args = parser.parse_args() | 176 | args = parser.parse_args() |
176 | 177 | ||
177 | # Run commands | 178 | # Run commands |
178 | runner = SubCommandRunner({ | 179 | runner = SubCommandRunner({ |
179 | "utt2char" : utt2char, | 180 | "utt2char" : utt2char, |
180 | "char2utt": char2utt, | 181 | "char2utt": char2utt, |
181 | "wavscp": wavscp, | 182 | "wavscp": wavscp, |
182 | "changelabels": changelabels, | 183 | "changelabels": changelabels, |
183 | "converter": converter, | 184 | "converter": converter, |
184 | "utt2sub": utt2sub, | 185 | "utt2sub": utt2sub, |
185 | "sub2utt": sub2utt | 186 | "sub2utt": sub2utt |
186 | }) | 187 | }) |
187 | 188 | ||
188 | runner.run(args.which, args.__dict__, remove="which") | 189 | runner.run(args.which, args.__dict__, remove="which") |
189 | 190 | ||
190 | 191 |