Blame view
egs/reverb/s5/local/wsjcam0_data_prep.sh
4.42 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
#!/bin/bash

# Copyright 2013 MERL (author: Felix Weninger)
# Contains some code by Microsoft Corporation, Johns Hopkins University (author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Builds file lists, transcripts, sph/wav scp files and utt2spk/spk2utt maps
# for the WSJCAM0 si_tr, si_dt and si_et sets under data/local/data, and
# copies the bcb05cnp / tcb05cnp language models to data/local/nist_lm.
#
# Usage: local/wsjcam0_data_prep.sh <wsjcam0-dir> <wsj0-dir>
#   <wsjcam0-dir>  directory containing data/{primary,secondary}_microphone
#   <wsj0-dir>     directory containing wsj0/doc/lng_modl/base_lm
#
# Must be started from the directory holding ./path.sh, local/ and utils/.
# Note: the existence check on <wsjcam0-dir> runs after changing into
# data/local/data, so a relative path is resolved from there.

if [ $# -lt 2 ]; then
  echo "Usage: $0 <wsjcam0-dir> <wsj0-dir>" >&2
  exit 1
fi

dir=$(pwd)/data/local/data
lmdir=$(pwd)/data/local/nist_lm
mkdir -p "$dir" "$lmdir" || exit 1
local=$(pwd)/local
utils=$(pwd)/utils
root=$(pwd)

. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x "$sph2pipe" ]; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe" >&2
  exit 1
fi

# All relative output paths below are written inside $dir.
cd "$dir" || exit 1

WSJ=$1
wsj0_dir=$2
if [ ! -d "$WSJ" ]; then
  echo "Could not find directory $WSJ! Check pathnames in corpus.sh!" >&2
  exit 1
fi

# The REVERB Challenge uses only the primary microphone data for the development
# set, but primary and secondary for the evaluation set.
# The REVERB simulated evaluation set (SimData_et) is based on the union of
# the WSJCAM0 si_et_1 and si_et_2 sets. We thus have to merge the transcription,
# script etc. files from si_et_1 and si_et_2 to create the "virtual" si_et set
# so that it can be processed in analogy to si_dt for SimData_dt.

# concatenate dt / et transcription
for set in si_dt si_et_1 si_et_2; do
  # this can be done as in the htk baseline
  if [[ "$set" =~ et ]]; then
    find "$WSJ"/data/{primary,secondary}_microphone/"$set" -name '*.wv1' \
      | sort > "$set.flist"
  else
    find "$WSJ/data/primary_microphone/$set" -name '*.wv1' \
      | sort > "$set.flist"
  fi
  # Reading from stdin makes wc print only the count, without the file name.
  nl=$(wc -l < "$set.flist")
  echo "$set: $nl files"
  # Only .dot files whose path matches the pattern below are concatenated.
  find "$WSJ"/data/*/"$set" -type f -name '*.dot' \
    | grep '/[a-z0-9]\{3\}/[a-z0-9]\{3\}c02[a-z0-9]\{2\}\.dot$' \
    | xargs cat > "$dir/$set.dot"
done
cat "$dir/si_et_1.dot" "$dir/si_et_2.dot" > "$dir/si_et.dot" || exit 1

# for si_tr we need the transcribed utterances (not all)
si_tr_dot=$WSJ/data/primary_microphone/etc/si_tr.dot
# copy this, for consistency ...
cp "$si_tr_dot" "$dir" || exit 1
chmod 644 "$dir/si_tr.dot"

# Every 8-character utterance id found in parentheses in si_tr.dot becomes one
# .wv1 path; the first three characters of the id are used as the speaker
# directory name.
perl -ne 'print "$1\n" if m/\((\w{8})\)/' "$si_tr_dot" \
  | while read -r utt; do
      spk=${utt:0:3}
      printf '%s\n' "$WSJ/data/primary_microphone/si_tr/$spk/$utt.wv1"
    done > si_tr.flist
nl=$(wc -l < si_tr.flist)
echo "si_tr: $nl files"
[ "$nl" -eq 7861 ] || echo "Warning: expected 7861 lines in si_tr.flist, got $nl"

for x in si_tr si_dt si_et_1 si_et_2; do
  "$local/flist2scp.pl" "$x.flist" | sort > "${x}_sph.scp"
done

cat si_et_{1,2}_sph.scp > si_et_sph.scp

# for WSJCAM0 training set, there's only one transcript file which contains all training speakers
# just use that
"$local/convert_transcripts.pl" "$si_tr_dot" > si_tr.trans1 || exit 1
"$local/convert_transcripts.pl" "$dir/si_dt.dot" > si_dt.trans1 || exit 1
"$local/convert_transcripts.pl" "$dir/si_et.dot" > si_et.trans1 || exit 1

# Do some basic normalization steps.  At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>"
for x in si_tr si_dt si_et; do
  "$local/normalize_transcript.pl" "$noiseword" < "$x.trans1" \
    | sort | uniq > "$x.txt" || exit 1
done

echo "done"

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
# Each output line is "<id> <sph2pipe> -f wav <sph-path> |"; the trailing
# newline keeps one record per line.
for x in si_tr si_dt si_et; do
  awk -v sph2pipe="$sph2pipe" \
    '{printf("%s %s -f wav %s |\n", $1, sph2pipe, $2);}' \
    < "${x}_sph.scp" > "${x}_wav.scp"
done

# Make the utt2spk and spk2utt files.
# Each utterance id is mapped to itself as its speaker id.
for x in si_tr si_dt si_et; do
  awk '{print $1, $1}' "${x}_sph.scp" > "$x.utt2spk"
  "$utils/utt2spk_to_spk2utt.pl" < "$x.utt2spk" > "$x.spk2utt" || exit 1
done

# REVERB language model is bcb05cnp
# We also use tri-gram
echo "Copy language model"
cp "$wsj0_dir/wsj0/doc/lng_modl/base_lm/bcb05cnp.z" "$lmdir/lm_bg_5k.arpa.gz" || exit 1
chmod 644 "$lmdir/lm_bg_5k.arpa.gz"
cp "$wsj0_dir/wsj0/doc/lng_modl/base_lm/tcb05cnp.z" "$lmdir/lm_tg_5k.arpa.gz" || exit 1
chmod 644 "$lmdir/lm_tg_5k.arpa.gz"

echo "Data preparation succeeded"