reverb_wsj0_data_prep.sh
3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/bin/bash
set -e
# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 29/05/12
# Argument check: exactly one argument (the corpus directory) is required.
# Otherwise the usage text is written to stderr and the script exits with
# status 1.
# NOTE(review): the text mentions 'wsj0' and 'wsj1' subdirectories, whereas
# this script reads si_tr_s, si_dt_20 and si_dt_05 directly below the
# argument -- confirm the intended layout and update the wording.
if [ $# -ne 1 ]; then
  {
    # The script name is quoted so that a path containing blanks stays a
    # single printf argument.
    printf "\nUSAGE: %s <corpus-directory>\n\n" "$(basename "$0")"
    echo "The argument should be a the top-level WSJ corpus directory."
    echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
    echo "within the top-level corpus directory."
  } >&2
  exit 1;
fi
# Corpus directory.  The file lists below are generated after changing into
# $dir, so a relative argument is anchored to the invocation directory here;
# an absolute argument is used unchanged.
case "$1" in
  /*) CORPUS=$1 ;;
  *) CORPUS=$(pwd)/$1 ;;
esac

# Output directories and helper-script locations, all below the directory
# the script is started from.
dir=$(pwd)/data/local/data
lmdir=$(pwd)/data/local/nist_lm
mkdir -p "$dir" "$lmdir"
local=$(pwd)/local
utils=$(pwd)/utils

. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x "$sph2pipe" ]; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe" >&2
  exit 1;
fi

# Every data set is read from its own subdirectory of the corpus.  A missing
# subdirectory must stop the script: find's failure would otherwise be hidden
# by the exit status of sort and leave an empty list behind.
for subset in si_tr_s si_dt_20 si_dt_05; do
  if [ ! -d "$CORPUS/$subset" ]; then
    echo "Could not find the directory $CORPUS/$subset" >&2
    exit 1
  fi
done

# All list files are written into $dir.
cd "$dir"

# reverb list for SI-84
find "$CORPUS/si_tr_s" -name '*.wav' | sort -u > train_si84_reverb.flist

# Dev-set Hub 1,2 (503, 913 utterances)
# Note: -name '*.wav' is case-sensitive, so copies of the data with
# upper-cased file names (*.WAV) are not picked up.
find "$CORPUS/si_dt_20" -name '*.wav' | sort -u > dev_dt_20_reverb.flist
find "$CORPUS/si_dt_05" -name '*.wav' | sort -u > dev_dt_05_reverb.flist

# Finding the transcript files:
# The list of .dot files is not generated here; it has to exist already
# (the message below names clean_data_prep.sh as the script to run first).
#find -L $CORPUS -iname '*.dot' > dot_files.flist
if [ ! -e "$dir/dot_files.flist" ]; then
  echo "Could not find $dir/dot_files.flist files, first run clean_data_prep.sh" >&2
  exit 1;
fi
# Convert the transcripts into our format (no normalization yet),
# adding the suffix "1" (reverb condition) to every utterance id.
#
# Files written per data set <x>, in the current directory:
#   <x>_wav_tmp.scp  "<id> <wav-path>"; <id> is the lower-cased base name of
#                    the .wav file, lines sorted
#   <x>_tmp.trans1   output of find_transcripts.pl for those ids
#   <x>_wav.scp      <x>_wav_tmp.scp with "1" appended to each id
#   <x>.trans1       <x>_tmp.trans1 with "1" appended to each id

# Without pipefail the exit status of a pipeline is that of its last stage,
# so a "Bad line" die in the perl stage (or an unreadable list) would be
# hidden by the trailing sort.
set -o pipefail

for x in train_si84_reverb dev_dt_05_reverb dev_dt_20_reverb; do
  # The list is attached via redirection so that a missing file makes the
  # stage fail instead of silently producing no output.
  perl -e '
    while(<>) {
      m:^\S+/(\w+)\.wav$: || die "Bad line $_";
      $id = $1;
      $id =~ tr/A-Z/a-z/;
      print "$id $_";
    }
  ' < "$x.flist" | sort > "${x}_wav_tmp.scp"

  # Look up the transcripts for the un-suffixed ids.
  awk '{print $1}' "${x}_wav_tmp.scp" \
    | "$local/find_transcripts.pl" dot_files.flist > "${x}_tmp.trans1"

  # Append the condition suffix to the id in both the scp and the transcripts.
  awk '{printf("%s1 %s\n", $1, $2);}' "${x}_wav_tmp.scp" > "${x}_wav.scp"
  awk '{printf("%s1 ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' \
    "${x}_tmp.trans1" > "${x}.trans1"
done
# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
#
# For every data set <x>, <x>.trans1 is passed through
# normalize_transcript.pl (with the noise word as its argument) and the
# sorted result is written to <x>.txt.
noiseword="<NOISE>";

# With pipefail the "|| exit 1" below also fires when the normalization
# script or the input redirection fails, not only when sort fails.
set -o pipefail

for x in train_si84_reverb dev_dt_05_reverb dev_dt_20_reverb; do
  "$local/normalize_transcript.pl" "$noiseword" < "$x.trans1" \
    | sort > "$x.txt" || exit 1;
done
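# Disabled: create scp's with wav's via sph2pipe. (the wv1 in the distribution is not really wav, it is sph.)
# Kept for reference only; the <x>_wav.scp files are written directly by the loop further up.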
#for x in train_si84_clean test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean; do
# awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \
# > ${x}_wav.scp
#done
# Make the utt2spk and spk2utt files.
# <x>.utt2spk holds "<utt-id> <spk-id>" per line, where the speaker id is the
# first three characters of the utterance id taken from <x>_wav.scp.
# <x>.spk2utt is produced from it by utt2spk_to_spk2utt.pl.
for x in train_si84_reverb dev_dt_05_reverb dev_dt_20_reverb; do
  # awk reads the scp directly, so a missing file is an error (and stops the
  # script under "set -e") instead of yielding an empty utt2spk.
  awk '{ print $1, substr($1, 1, 3) }' "${x}_wav.scp" > "$x.utt2spk"
  "$utils/utt2spk_to_spk2utt.pl" < "$x.utt2spk" > "$x.spk2utt" || exit 1;
done

echo "Data preparation succeeded"