masseffect.py
7.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import argparse
from os import path
import core.data
from utils import SubCommandRunner
import os
def utt2char(features: str, outfile: str):
    """Write a utt2char file built from the ids of a masseffect features file.

    Each id returned by ``core.data.read_features`` produces one output line
    ``<id> <character>``, where ``<character>`` is the second comma-separated
    field of the id.

    TODO: Don't forget to manage two cases: one with old ids, and another
    with new ones.

    Args:
        features (str): path of the features file, handed to
            ``core.data.read_features``
        outfile (str): path of the utt2char file to write
    """
    utterances = core.data.read_features(features)
    utt_ids = list(utterances.keys())
    with open(outfile, "w") as out_stream:
        for raw_id in utt_ids:
            # Ids are stripped of any newline character before being written.
            utt_id = raw_id.replace("\n", "")
            character = utt_id.split(",")[1]
            out_stream.write(f"{utt_id} {character}\n")
def char2utt(features: str, outfile: str):
    """Placeholder for the char2utt generation; it is not implemented yet.

    Args:
        features (str): features file (currently unused)
        outfile (str): output file (currently unused)

    Raises:
        NotImplementedError: always. It is a subclass of ``Exception``, so
            callers catching ``Exception`` keep working.
    """
    raise NotImplementedError("Not implemented yet")
def wavscp(datadir: str, outfile: str):
    """Generate the masseffect wav scp file from the audio directories.

    Every file located directly inside ``<datadir>/audio_en-us`` and
    ``<datadir>/audio_fr-fr`` produces one output line
    ``<lang>,<character>,<record_id> <path>``. The three id fields are the
    1st, 2nd and 4th comma-separated fields of the file name, taken up to its
    first dot. English entries are written first, then French ones; inside
    each directory the files are sorted by name so that the output does not
    depend on the order in which the file system lists them.

    Args:
        datadir (str): path of the data directory where "audio_en-us" and
            "audio_fr-fr" are available
        outfile (str): path of the wav scp output file

    Raises:
        Exception: if one of the two audio directories is not available
        IndexError: if a file name has fewer than four comma-separated fields
    """
    en_us_dir = os.path.join(datadir, "audio_en-us")
    fr_fr_dir = os.path.join(datadir, "audio_fr-fr")
    if not (os.path.isdir(en_us_dir) and os.path.isdir(fr_fr_dir)):
        raise Exception("Directory audio_en-us or audio_fr-fr does not exist")

    # List both directories before creating the output file, so that an
    # output file located inside one of them is never picked up as audio.
    entries = []
    for directory in (en_us_dir, fr_fr_dir):
        # First item of os.walk: only the files directly in the directory.
        _, _, filenames = next(os.walk(directory))
        entries.extend((directory, fn) for fn in sorted(filenames))

    with open(outfile, "w") as f:
        for directory, fn in entries:
            fields = fn.split(".")[0].split(",")
            lang = fields[0]
            character = fields[1]
            record_id = fields[3]
            wav_path = os.path.join(directory, fn)
            f.write(f"{lang},{character},{record_id} {wav_path}\n")
def changelabels(source: str, labels: str, outfile: str):
    """Copy ``source`` to ``outfile`` with the second field of each id relabelled.

    For every id read from ``source``, the second comma-separated field is
    replaced by the first element of the entry stored under the same id in
    the labels file. The rewritten id and the untouched values are then
    written with ``core.data.write_line``.

    Args:
        source (str): source file where we want to change ids, read with
            ``core.data.read_id_values``
        labels (str): file with labels, read with ``core.data.read_labels``
        outfile (str): path of the output file

    Raises:
        KeyError: if an id of ``source`` has no entry in the labels file
    """
    values_by_id = core.data.read_id_values(source)
    labels_by_id = core.data.read_labels(labels)
    with open(outfile, "w") as out_stream:
        for utt_id in list(values_by_id.keys()):
            fields = utt_id.split(",")
            fields[1] = labels_by_id[utt_id][0]
            new_id = ",".join(fields)
            core.data.write_line(new_id, values_by_id[utt_id], out=out_stream)
def converter(file: str, outtype: str, outfile: str):
    """Write a conversion table between masseffect ids and kaldi ids.

    The masseffect id is the id read from ``file`` with newline characters
    removed; the kaldi id is made of its 1st, 2nd and 4th comma-separated
    fields. One line is written per id:

    * ``masseffect2kaldi``: ``<masseffect_id> <kaldi_id>``
    * ``kaldi2masseffect``: ``<kaldi_id> <masseffect_id>``

    Args:
        file (str): file with ids from which to create the converter, read
            with ``core.data.read_id_values``
        outtype (str): "masseffect2kaldi" or "kaldi2masseffect"
        outfile (str): path of the converter file to write

    Raises:
        ValueError: if ``outtype`` is not one of the two supported values
        IndexError: if an id has fewer than four comma-separated fields
    """
    # Reject an unknown direction up front: it used to be silently ignored
    # and produced an empty output file.
    supported = ("masseffect2kaldi", "kaldi2masseffect")
    if outtype not in supported:
        raise ValueError(
            f"Unsupported outtype {outtype!r}; expected one of: "
            + ", ".join(supported)
        )

    data = core.data.read_id_values(file)
    with open(outfile, "w") as of:
        for key in data:
            masseffect_id = key.replace("\n", "")
            fields = masseffect_id.split(",")
            kaldi_id = ",".join([fields[0], fields[1], fields[3]])
            if outtype == "masseffect2kaldi":
                of.write(f"{masseffect_id} {kaldi_id}\n")
            else:
                of.write(f"{kaldi_id} {masseffect_id}\n")
def utt2sub(file: str, subfile: str, outfile: str):
    """Generate a utt2sub file listing, for each id, the sub ids it owns.

    A sub id is attached to an id when it starts with that id. One line is
    written per id of ``file``: ``<id> <sub_id_1> <sub_id_2> ...`` (the list
    is empty when no sub id matches).

    Args:
        file (str): features, list or labels file with normal ids
        subfile (str): features, list or labels file with sub ids
        outfile (str): output file
    """
    keys = list(core.data.read_id_values(file))
    keys_sub = list(core.data.read_id_values(subfile))
    # NOTE(review): prefix matching also captures the sub ids of any other id
    # that merely begins with the same characters - confirm the id format
    # rules this out.
    with open(outfile, "w") as of:
        for key in keys:
            subkeys_str = " ".join(
                subkey for subkey in keys_sub if subkey.startswith(key)
            )
            # Terminate every record: without the newline all the entries
            # were concatenated on a single line.
            of.write(f"{key} {subkeys_str}\n")
def sub2utt(file: str, subfile: str, outfile: str):
    """Generate a sub2utt file mapping each sub id back to its id.

    A sub id is attached to an id when it starts with that id. One line
    ``<sub_id> <id>`` is written per match, following the order of the ids
    of ``file``.

    Args:
        file (str): features, list or labels file with normal ids
        subfile (str): features, list or labels file with sub ids
        outfile (str): output file
    """
    keys = list(core.data.read_id_values(file))
    keys_sub = list(core.data.read_id_values(subfile))
    # NOTE(review): prefix matching also captures the sub ids of any other id
    # that merely begins with the same characters - confirm the id format
    # rules this out.
    with open(outfile, "w") as of:
        for key in keys:
            for subkey in keys_sub:
                if subkey.startswith(key):
                    # Terminate every record: without the newline all the
                    # entries were concatenated on a single line.
                    of.write(f"{subkey} {key}\n")
if __name__ == '__main__':
    # Command line entry point: one sub-command per generator function.
    # Each sub-parser stores its own name in ``which`` so that the matching
    # function can be looked up once the arguments are parsed.

    # Main parser
    parser = argparse.ArgumentParser(description="...")
    subparsers = parser.add_subparsers(title="action")

    # utt2char
    parser_utt2char = subparsers.add_parser("utt2char", help="generate utt2char file")
    # Both paths are mandatory: utt2char opens them unconditionally.
    parser_utt2char.add_argument("--features", type=str, required=True, help="features file")
    parser_utt2char.add_argument("--outfile", type=str, required=True, help="output file")
    parser_utt2char.set_defaults(which="utt2char")

    # char2utt
    parser_char2utt = subparsers.add_parser("char2utt", help="generate char2utt file")
    parser_char2utt.add_argument("--features", type=str, help="features file")
    parser_char2utt.add_argument("--outfile", type=str, help="output file")
    parser_char2utt.set_defaults(which="char2utt")

    # wavscp
    parser_wavscp = subparsers.add_parser("wavscp", help="generate wav scp file")
    parser_wavscp.add_argument("--datadir", required=True, help="data directory of masseffect")
    parser_wavscp.add_argument("--outfile", default="wav.scp", help="wav.scp output file")
    parser_wavscp.set_defaults(which="wavscp")

    # Change labels
    parser_changelabels = subparsers.add_parser("changelabels", help="...")
    parser_changelabels.add_argument("--source", required=True, type=str, help="source file where we want to change ids.")
    parser_changelabels.add_argument("--labels", required=True, type=str, help="file with labels")
    parser_changelabels.add_argument("--outfile", required=True, type=str, help="Output file")
    parser_changelabels.set_defaults(which="changelabels")

    # Create converter
    parser_converter = subparsers.add_parser("converter", help="Create converter file")
    parser_converter.add_argument("--file",
                                  type=str,
                                  required=True,
                                  help="File with ids from which create converter.")
    # Mandatory: without a direction the converter has nothing to write.
    parser_converter.add_argument("--outtype",
                                  type=str,
                                  required=True,
                                  choices=["kaldi2masseffect", "masseffect2kaldi"])
    parser_converter.add_argument("--outfile", type=str, required=True, help="")
    parser_converter.set_defaults(which="converter")

    # Create utt2sub
    parser_utt2sub = subparsers.add_parser("utt2sub", help="generate utt2sub file")
    parser_utt2sub.add_argument("--file", required=True, type=str, help="features, list or labels file with normal ids")
    parser_utt2sub.add_argument("--subfile", required=True, type=str, help="features, list or labels file with sub ids")
    parser_utt2sub.add_argument("--outfile", required=True, type=str, help="output file")
    parser_utt2sub.set_defaults(which="utt2sub")

    # Create sub2utt
    parser_sub2utt = subparsers.add_parser("sub2utt", help="generate sub2utt file")
    parser_sub2utt.add_argument("--file", required=True, type=str, help="features, list or labels file with normal ids")
    parser_sub2utt.add_argument("--subfile", required=True, type=str, help="features, list or labels file sub ids")
    parser_sub2utt.add_argument("--outfile", required=True, type=str, help="output file")
    parser_sub2utt.set_defaults(which="sub2utt")

    # Parse
    args = parser.parse_args()

    commands = {
        "utt2char": utt2char,
        "char2utt": char2utt,
        "wavscp": wavscp,
        "changelabels": changelabels,
        "converter": converter,
        "utt2sub": utt2sub,
        "sub2utt": sub2utt,
    }

    # Sub-commands are optional for argparse: when none is given, ``which``
    # is never set. Exit with a usage message instead of an AttributeError.
    if not hasattr(args, "which"):
        parser.error("an action is required, choose one of: " + ", ".join(commands))

    # Run commands
    # NOTE(review): SubCommandRunner comes from utils and is not visible
    # here; presumably it calls the function registered under ``args.which``
    # with the parsed arguments minus the ``which`` entry - confirm.
    runner = SubCommandRunner(commands)
    runner.run(args.which, args.__dict__, remove="which")