make_lre05.pl
3.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env perl
#
# Copyright 2014 David Snyder
# Prepares Kaldi-style data directories from the LDC2008S05 corpus.
#
# Two directories are written below <output-dir>:
#   lid05d1/  wav.scp, utt2lang, utt2spk             (from data/lid05d1/key.txt)
#   lid05e1/  wav.scp, utt2lang, utt2spk, spk2gender (from
#             data/lid05e1/lid05e_key_v2.txt)
# After each directory is written, utils/fix_data_dir.sh and
# utils/validate_data_dir.sh are run on it; a validation failure is fatal.

use strict;
use warnings;

if (@ARGV != 2) {
  print STDERR "Usage: $0 <path-to-LDC2008S05> <output-dir>\n";
  print STDERR "e.g. $0 /export/corpora5/LDC/LDC2008S05 data\n";
  exit(1);
}
my ($db_base, $out_base_dir) = @ARGV;

# This is the Indian English part of the corpora, we will prep it first.
prepare_lid05d1($db_base, $out_base_dir);
prepare_lid05e1($db_base, $out_base_dir);
exit(0);

# Creates $dir (and any missing parents) or dies.
# The list form of system() is used so that the path is never re-parsed by a
# shell.
sub make_dir {
  my ($dir) = @_;
  if (system("mkdir", "-p", $dir) != 0) {
    die "Error making directory $dir";
  }
  return;
}

# Opens each of @names inside $out_dir for writing.
# $out_dir is expected to end with "/". Returns a hash reference mapping each
# file name to its open handle. Dies, reporting the OS error, if any file
# cannot be opened.
sub open_outputs {
  my ($out_dir, @names) = @_;
  my %fh_for;
  foreach my $name (@names) {
    my $path = $out_dir . $name;
    open(my $fh, '>', $path)
      or die "Failed opening output file $path: $!";
    $fh_for{$name} = $fh;
  }
  return \%fh_for;
}

# Closes every handle returned by open_outputs(); dies if a close fails,
# because buffered write errors only surface at close time.
sub close_outputs {
  my ($out_dir, $fh_for) = @_;
  foreach my $name (sort keys %$fh_for) {
    close($fh_for->{$name})
      or die "Failed closing output file $out_dir$name: $!";
  }
  return;
}

# Opens the key file $key for reading and returns the handle, or dies.
sub open_key {
  my ($key) = @_;
  open(my $fh, '<', $key)
    or die "Failed opening input file $key: $!";
  return $fh;
}

# Returns true if $line should be parsed as a key entry.
# Any line containing "#" is treated as a comment (as before); lines holding
# only whitespace are now skipped as well instead of being parsed as entries
# with undefined fields.
sub is_data_line {
  my ($line) = @_;
  return 0 if index($line, "#") != -1;
  return 0 if $line !~ /\S/;
  return 1;
}

# Runs the Kaldi fix and validate utilities on $out_dir.
# A failure of fix_data_dir.sh was previously ignored silently; it is still
# non-fatal, but is now reported. A validation failure is fatal.
sub fix_and_validate {
  my ($out_dir) = @_;
  if (system("utils/fix_data_dir.sh", $out_dir) != 0) {
    print STDERR "Warning: utils/fix_data_dir.sh $out_dir returned non-zero status\n";
  }
  (system("utils/validate_data_dir.sh", "--no-text", "--no-feats", $out_dir) == 0)
    or die "Error validating data dir.";
  return;
}

# Writes <out_base_dir>/lid05d1/ from <db_base>/data/lid05d1/key.txt.
# Every entry must carry the language code "IE"; anything else is fatal.
# Entries whose audio file is missing are reported on STDERR and skipped.
sub prepare_lid05d1 {
  my ($db_base, $out_base_dir) = @_;
  my $db_ie = $db_base . "/data/lid05d1/";
  my $key = $db_ie . "key.txt";
  my $out_dir = $out_base_dir . "/lid05d1/";

  # Open the input first so that a missing key file fails before any output
  # is created.
  my $key_fh = open_key($key);
  make_dir($out_dir);
  my $out = open_outputs($out_dir, 'wav.scp', 'utt2lang', 'utt2spk');

  while (my $line = <$key_fh>) {
    chomp($line);
    next if !is_data_line($line);

    # Fields: file, language, conversation id, channel, test cut.
    # Only the file, language and channel are needed here.
    my ($fi, $lang, undef, $channel) = split(" ", $line);

    # Verify that we have only Indian English.
    if (!defined($lang) || $lang ne "IE") {
      die "$db_ie contains non-Indian English utterances.";
    }

    # $fi has the form <set>/<part>/<file>; the utterance name is the file
    # name up to its first ".".
    my (undef, undef, $utt_fi) = split("/", $fi);
    my ($utt) = split(/[.]/, $utt_fi);
    my $uttId = "lid05d1_$utt";

    my $wav = $db_ie . $fi;
    if (! -f $wav) {
      print STDERR "No such file $wav (skipping)\n";
      next;
    }

    # sph2pipe takes a numeric channel: A -> 1, B -> 2.
    $channel =~ tr/AB/12/;
    print { $out->{'wav.scp'} } "$uttId sph2pipe -f wav -p -c $channel $wav |\n";
    # Each utterance is treated as its own speaker.
    print { $out->{'utt2spk'} } "$uttId $uttId\n";
    # This part of the corpus is only english.indian.
    print { $out->{'utt2lang'} } "$uttId english.indian\n";
  }
  close($key_fh) or die "Failed closing input file $key: $!";
  close_outputs($out_dir, $out);

  fix_and_validate($out_dir);
  return;
}

# Writes <out_base_dir>/lid05e1/ from
# <db_base>/data/lid05e1/lid05e_key_v2.txt.
# Entries whose audio file is missing are reported on STDERR and skipped.
sub prepare_lid05e1 {
  my ($db_base, $out_base_dir) = @_;
  my $out_dir = $out_base_dir . "/lid05e1/";
  my $db_dir = $db_base . "/data/lid05e1/";
  my $key = $db_dir . "lid05e_key_v2.txt";

  my $key_fh = open_key($key);
  make_dir($out_dir);
  my $out = open_outputs($out_dir,
                         'wav.scp', 'utt2lang', 'utt2spk', 'spk2gender');

  while (my $line = <$key_fh>) {
    chomp($line);
    next if !is_data_line($line);

    # Fields: segment id, language, dialect, conversation id, channel, cut,
    # duration, corpus, gender, location, alternate language.
    # Only the ones used below are kept.
    my ($seg_id, $lang, $dialect, undef, $channel,
        undef, $dur, undef, $gender) = split(" ", $line);

    # $db_dir already ends in "/"; the resulting "//" is kept on purpose so
    # that the paths written to wav.scp stay exactly as they were.
    my $wav = "$db_dir/test/${dur}/${seg_id}.sph";
    if (! -f $wav) {
      print STDERR "No such file $wav (skipping this utterance)\n";
      next;
    }

    $lang = lc $lang;
    $dialect = lc $dialect;
    # A dialect of "na" means there is none: use the bare language name.
    my $full_lang = ($dialect ne "na") ? "$lang.$dialect" : $lang;

    # Defaulting to male if the gender info is missing.
    $gender = defined($gender) ? lc $gender : '';
    if (not ($gender eq 'm' || $gender eq 'f')) {
      $gender = 'm';
    }

    my $uttId = "lid05e1_" . $seg_id;
    # sph2pipe takes a numeric channel: A -> 1, B -> 2.
    $channel =~ tr/AB/12/;
    print { $out->{'wav.scp'} } "$uttId sph2pipe -f wav -p -c ${channel} $wav |\n";
    # Each utterance is treated as its own speaker.
    print { $out->{'utt2spk'} } "$uttId $uttId\n";
    print { $out->{'spk2gender'} } "$uttId $gender\n";
    print { $out->{'utt2lang'} } "$uttId $full_lang\n";
  }
  close($key_fh) or die "Failed closing input file $key: $!";
  close_outputs($out_dir, $out);

  fix_and_validate($out_dir);
  return;
}