Compare View
Commits (3)
Changes
Showing 3 changed files Side-by-side Diff
volia/data.py
... | ... | @@ -8,7 +8,6 @@ import core.data |
8 | 8 | |
9 | 9 | |
10 | 10 | |
11 | - | |
12 | 11 | def filter_file(file, filter, outfile): |
13 | 12 | file_path = file |
14 | 13 | filter_path = filter |
... | ... | @@ -24,8 +23,28 @@ def filter_file(file, filter, outfile): |
24 | 23 | |
25 | 24 | |
26 | 25 | def convert(file, type_from, type): |
26 | + | |
27 | 27 | pass |
28 | 28 | |
29 | +def utt2dur(wavscp: str, outfile: str): | |
30 | + | |
31 | + import wave | |
32 | + import contextlib | |
33 | + | |
34 | + with open(wavscp, "r") as f, open(outfile, "w") as of: | |
35 | + for line in f: | |
36 | + splited = line.replace("\n", "").split(" ") | |
37 | + id_ = splited[0] | |
38 | + wav_ = splited[1] | |
39 | + duration = 0 | |
40 | + | |
41 | + with contextlib.closing(wave.open(wav_,'r')) as wav_f: | |
42 | + frames = wav_f.getnframes() | |
43 | + rate = wav_f.getframerate() | |
44 | + duration = frames / float(rate) | |
45 | + | |
46 | + of.write(f"{id_} {duration}\n") | |
47 | + | |
29 | 48 | |
30 | 49 | if __name__ == "__main__": |
31 | 50 | # Main parser |
... | ... | @@ -46,6 +65,16 @@ if __name__ == "__main__": |
46 | 65 | parser_convert.add_argument("--type", choices=["old-masseffect", "new-masseffect"], required=True) |
47 | 66 | parser_convert.set_defaults(which="convert") |
48 | 67 | |
68 | + # utt2dur | |
69 | + parser_utt2dur = subparsers.add_parser("utt2dur", help="generate utt2dur file") | |
70 | + parser_utt2dur.add_argument("--wavscp", type=str, help="wav file", required=True) | |
71 | + parser_utt2dur.add_argument("--outfile", type=str, default="utt2dur", help="output file") | |
72 | + parser_utt2dur.set_defaults(which="utt2dur") | |
73 | + | |
74 | + # TODO: utt2label_to_label2utt | |
75 | + | |
76 | + # TODO: label2utt_to_utt2label | |
77 | + | |
49 | 78 | # Parse |
50 | 79 | args = parser.parse_args() |
51 | 80 | |
... | ... | @@ -53,6 +82,7 @@ if __name__ == "__main__": |
53 | 82 | runner = SubCommandRunner({ |
54 | 83 | "convert" : convert, |
55 | 84 | "filter": filter_file, |
85 | + "utt2dur": utt2dur | |
56 | 86 | }) |
57 | 87 | |
58 | 88 | runner.run(args.which, args.__dict__, remove="which") |
volia/masseffect.py
1 | 1 | import argparse |
2 | +from os import path | |
2 | 3 | import core.data |
3 | 4 | from utils import SubCommandRunner |
4 | - | |
5 | +import os | |
5 | 6 | |
6 | 7 | def utt2char(features: str, outfile: str): |
7 | 8 | """Allow the user to generate utt2char file from masseffect features file. |
... | ... | @@ -28,23 +29,66 @@ def char2utt(features: str, outfile: str): |
28 | 29 | pass |
29 | 30 | |
30 | 31 | |
32 | +def wavscp(datadir: str, outfile: str): | |
33 | + """Generate the masseffect wav scp file from the directories. | |
34 | + | |
35 | + Args: | |
36 | + datadir (str): path of the data directory where "audio_en-us" and "audio_fr-fr" are available | |
37 | + outfile (str): path of the wav scp output file | |
38 | + | |
39 | + Raises: | |
40 | + Exception: if either directory is not available | |
41 | + """ | |
42 | + en_us_dir = os.path.join(datadir, "audio_en-us") | |
43 | + fr_fr_dir = os.path.join(datadir, "audio_fr-fr") | |
44 | + | |
45 | + if (not os.path.isdir(en_us_dir)) or (not os.path.isdir(fr_fr_dir)): | |
46 | + raise Exception("Directory audio_en-us or audio_fr-fr does not exist") | |
47 | + | |
48 | + _,_,filenames_en=next(os.walk(en_us_dir)) | |
49 | + # filenames_en = [ os.path.join(en_us_dir, f) for f in filenames_en ] | |
50 | + dir_en = [ en_us_dir for f in filenames_en ] | |
51 | + _,_,filenames_fr=next(os.walk(fr_fr_dir)) | |
52 | + dir_fr = [ fr_fr_dir for f in filenames_fr ] | |
53 | + # filenames_fr = [ os.path.join(fr_fr_dir, f) for f in filenames_fr ] | |
54 | + | |
55 | + directories = dir_en + dir_fr | |
56 | + filenames = filenames_en + filenames_fr | |
57 | + | |
58 | + | |
59 | + with open(outfile, "w") as f: | |
60 | + for i, fn in enumerate(filenames): | |
61 | + splited = fn.split(".")[0].split(",") | |
62 | + lang = splited[0] | |
63 | + character = splited[1] | |
64 | + record_id = splited[3] | |
65 | + path = os.path.join(directories[i], fn) | |
66 | + f.write(f"{lang},{character},{record_id} {path}\n") | |
67 | + | |
68 | + | |
69 | + | |
31 | 70 | if __name__ == '__main__': |
32 | 71 | # Main parser |
33 | 72 | parser = argparse.ArgumentParser(description="...") |
34 | 73 | subparsers = parser.add_subparsers(title="action") |
35 | 74 | |
36 | 75 | # utt2char |
37 | - parser_utt2char = subparsers.add_parser("utt2char") | |
76 | + parser_utt2char = subparsers.add_parser("utt2char", help="generate utt2char file") | |
38 | 77 | parser_utt2char.add_argument("--features", type=str, help="features file") |
39 | 78 | parser_utt2char.add_argument("--outfile", type=str, help="output file") |
40 | 79 | parser_utt2char.set_defaults(which="utt2char") |
41 | 80 | |
42 | 81 | # char2utt |
43 | - parser_char2utt = subparsers.add_parser("char2utt") | |
82 | + parser_char2utt = subparsers.add_parser("char2utt", help="generate char2utt file") | |
44 | 83 | parser_char2utt.add_argument("--features", type=str, help="features file") |
45 | 84 | parser_char2utt.add_argument("--outfile", type=str, help="output file") |
46 | 85 | parser_char2utt.set_defaults(which="char2utt") |
47 | 86 | |
87 | + # wavscp | |
88 | + parser_wavscp = subparsers.add_parser("wavscp", help="generate wav scp file") | |
89 | + parser_wavscp.add_argument("--datadir", required=True, help="data directory of masseffect") | |
90 | + parser_wavscp.add_argument("--outfile", default="wav.scp", help="wav.scp output file") | |
91 | + parser_wavscp.set_defaults(which="wavscp") | |
48 | 92 | |
49 | 93 | # Parse |
50 | 94 | args = parser.parse_args() |
... | ... | @@ -53,6 +97,7 @@ if __name__ == '__main__': |
53 | 97 | runner = SubCommandRunner({ |
54 | 98 | "utt2char" : utt2char, |
55 | 99 | "char2utt": char2utt, |
100 | + "wavscp": wavscp | |
56 | 101 | }) |
57 | 102 | |
58 | 103 | runner.run(args.which, args.__dict__, remove="which") |
59 | 104 | \ No newline at end of file |
volia/voxceleb.py
... | ... | @@ -41,6 +41,10 @@ def spk2utt(features: str, outfile: str): |
41 | 41 | out.write(spk + " " + " ".join(ids) + "\n") |
42 | 42 | |
43 | 43 | |
44 | +def wavscp(datadir: str, outfile: str): | |
45 | + raise Exception("Under construction") | |
46 | + pass | |
47 | + | |
44 | 48 | if __name__ == "__main__": |
45 | 49 | # Main parser |
46 | 50 | parser = argparse.ArgumentParser(description="Voxceleb data management") |
... | ... | @@ -58,6 +62,12 @@ if __name__ == "__main__": |
58 | 62 | parser_spk2utt.add_argument("--outfile", default="spk2utt", help="output file") |
59 | 63 | parser_spk2utt.set_defaults(which="spk2utt") |
60 | 64 | |
65 | + # wavscp | |
66 | + parser_wavscp = subparser.add_parser("wavscp", help="generate wav scp file") | |
67 | + parser_wavscp.add_argument("--datadir", required=True, help="data directory of masseffect") | |
68 | + parser_wavscp.add_argument("--outfile", default="wav.scp", help="wav.scp output file") | |
69 | + parser_wavscp.set_defaults(which="wavscp") | |
70 | + | |
61 | 71 | # Parse |
62 | 72 | args = parser.parse_args() |
63 | 73 | |
... | ... | @@ -65,6 +75,7 @@ if __name__ == "__main__": |
65 | 75 | runner = SubCommandRunner({ |
66 | 76 | "utt2spk" : utt2spk, |
67 | 77 | "spk2utt": spk2utt, |
78 | + "wavscp": wavscp | |
68 | 79 | }) |
69 | 80 | |
70 | 81 | runner.run(args.which, args.__dict__, remove="which") |