fisher_data_prep.sh
7.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#!/bin/bash
# Copyright 2013 Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
stage=0
calldata=
while test $# -gt 0
do
case "$1" in
--calldata) calldata=1
;;
*) break;
;;
esac
shift
done
. utils/parse_options.sh
if [ $# -eq 0 ]; then
echo "$0 [--calldata] <fisher-dir-1> [<fisher-dir-2> ...]"
echo " e.g.: $0 /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19\\"
echo " /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13"
echo " (We also support a single directory that has the contents of all of them)"
echo " If specified, --calldata will be used to map Kaldi speaker ID to real"
echo " speaker PIN released with the Fisher corpus."
exit 1;
fi
# Check that the arguments are all absolute pathnames.
for dir in $*; do
case $dir in /*) ;; *)
echo "$0: all arguments must be absolute pathnames."; exit 1;
esac
done
# First check we have the right things in there...
#
rm -r data/local/data/links 2>/dev/null
mkdir -p data/local/data/links || exit 1;
for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \
fe_03_p2_sph1 fe_03_p2_sph3 fe_03_p2_sph5 fe_03_p2_sph7 fe_03_p1_sph2 \
fe_03_p1_sph4 fe_03_p1_sph6 fe_03_p1_tran fe_03_p2_sph2 fe_03_p2_sph4 \
fe_03_p2_sph6 fe_03_p2_tran; do
found_subdir=false
for dir in $*; do
if [ -d $dir/$subdir ]; then
found_subdir=true
ln -s $dir/$subdir data/local/data/links
else
new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/)
if [ -d $dir/$new_style_subdir ]; then
found_subdir=true
ln -s $dir/$new_style_subdir data/local/data/links/$subdir
fi
fi
done
if ! $found_subdir; then
echo "$0: could not find the subdirectory $subdir in any of $*"
exit 1;
fi
done
tmpdir=`pwd`/data/local/data
links=data/local/data/links
. ./path.sh # Needed for KALDI_ROOT
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
# (1) Get transcripts in one file, and clean them up ..
if [ $stage -le 0 ]; then
find $links/fe_03_p1_tran/data $links/fe_03_p2_tran/data -name '*.txt' > $tmpdir/transcripts.flist
for dir in fe_03_p{1,2}_sph{1,2,3,4,5,6,7}; do
find $links/$dir/ -name '*.sph'
done > $tmpdir/sph.flist
n=`cat $tmpdir/transcripts.flist | wc -l`
if [ $n -ne 11699 ]; then
echo "Expected to find 11699 transcript files in the Fisher data, found $n"
exit 1;
fi
n=`cat $tmpdir/sph.flist | wc -l`
if [ $n -ne 11699 ]; then
echo "Expected to find 11699 .sph files in the Fisher data, found $n"
exit 1;
fi
fi
if [ $stage -le 1 ]; then
mkdir -p data/train_all_asr
## fe_03_00004.sph
## Transcpribed at the LDC
#
#7.38 8.78 A: an- so the topic is
echo -n > $tmpdir/text.1 || exit 1;
perl -e '
use File::Basename;
($tmpdir)=@ARGV;
open(F, "<$tmpdir/transcripts.flist") || die "Opening list of transcripts";
open(R, "|sort >data/train_all_asr/reco2file_and_channel") || die "Opening reco2file_and_channel";
open(T, ">$tmpdir/text.1") || die "Opening text output";
while (<F>) {
$file = $_;
m:([^/]+)\.txt: || die "Bad filename $_";
$call_id = $1;
print R "$call_id-A $call_id A\n";
print R "$call_id-B $call_id B\n";
open(I, "<$file") || die "Opening file $_";
$line1 = <I>;
$line1 =~ m/# (.+)\.sph/ || die "Bad first line $line1 in file $file";
$call_id eq $1 || die "Mismatch call-id $call_id vs $1\n";
while (<I>) {
if (m/([0-9.]+)\s+([0-9.]+) ([AB]):\s*(\S.+\S|\S)\s*$/) {
$start = sprintf("%06d", $1 * 100.0);
$end = sprintf("%06d", $2 * 100.0);
length($end) > 6 && die "Time too long $end in file $file";
$side = $3;
$words = $4;
$utt_id = "${call_id}-$side-$start-$end";
print T "$utt_id $words\n" || die "Error writing to text file";
}
}
}
close(R); close(T) ' $tmpdir || exit 1;
fi
if [ $stage -le 2 ]; then
sort $tmpdir/text.1 | grep -v '((' | \
awk '{if (NF > 1){ print; }}' | \
sed 's:\[laugh\]:[laughter]:g' | \
sed 's:\[sigh\]:[noise]:g' | \
sed 's:\[cough\]:[noise]:g' | \
sed 's:\[sigh\]:[noise]:g' | \
sed 's:\[mn\]:[noise]:g' | \
sed 's:\[breath\]:[noise]:g' | \
sed 's:\[lipsmack\]:[noise]:g' > $tmpdir/text.2
cp $tmpdir/text.2 data/train_all_asr/text
# create segments file and utt2spk file...
! cat data/train_all_asr/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > data/train_all_asr/utt2spk \
&& echo "Error producing utt2spk file" && exit 1;
cat data/train_all_asr/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; $s = sprintf("%.2f", 0.01*$3);
$e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' > data/train_all_asr/segments
utils/utt2spk_to_spk2utt.pl <data/train_all_asr/utt2spk > data/train_all_asr/spk2utt
fi
if [ $stage -le 3 ]; then
for f in `cat $tmpdir/sph.flist`; do
# convert to absolute path
utils/make_absolute.sh $f
done > $tmpdir/sph_abs.flist
cat $tmpdir/sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp
cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \
sort -k1,1 -u > data/train_all_asr/wav.scp || exit 1;
fi
if [ $stage -le 4 ]; then
# get the spk2gender information. This is not a standard part of our
# file formats
# The files "filetable2fe_03_p2_sph1 fe_03_05852.sph ff
cat $links/fe_03_p1_sph{1,2,3,4,5,6,7}/filetable.txt \
$links/fe_03_p2_sph{1,2,3,4,5,6,7}/docs/filetable2.txt | \
perl -ane 'm:^\S+ (\S+)\.sph ([fm])([fm]): || die "bad line $_;"; print "$1-A $2\n", "$1-B $3\n"; ' | \
sort | uniq | utils/filter_scp.pl data/train_all_asr/spk2utt > data/train_all_asr/spk2gender
if [ ! -s data/train_all_asr/spk2gender ]; then
echo "It looks like our first try at getting the spk2gender info did not work."
echo "(possibly older distribution?) Trying something else."
cat $links/fe_03_p1_tran/doc/fe_03_p1_filelist.tbl $links/fe_03_p2_tran/doc/fe_03_p2_filelist.tbl | \
perl -ane 'm:fe_03_p[12]_sph\d\t(\d+)\t([mf])([mf]): || die "Bad line $_";
print "fe_03_$1-A $2\n", "fe_03_$1-B $3\n"; ' | \
sort | uniq | utils/filter_scp.pl data/train_all_asr/spk2utt > data/train_all_asr/spk2gender
fi
fi
if [ ! -z "$calldata" ]; then # fix speaker IDs
cat $links/fe_03_p{1,2}_tran/doc/*calldata.tbl > $tmpdir/combined-calldata.tbl
local/fisher_fix_speakerid.pl $tmpdir/combined-calldata.tbl data/train_all_asr
utils/utt2spk_to_spk2utt.pl data/train_all_asr/utt2spk.new > data/train_all_asr/spk2utt.new
# patch files
for f in spk2utt utt2spk text segments spk2gender; do
cp data/train_all_asr/$f data/train_all_asr/$f.old || exit 1;
cp data/train_all_asr/$f.new data/train_all_asr/$f || exit 1;
done
rm $tmpdir/combined-calldata.tbl
fi
echo "Data preparation succeeded"