Blame view
egs/aspire/s5/local/multi_condition/prep_test_aspire.sh
14.8 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 |
#!/bin/bash # Copyright Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) 2015. Apache 2.0. # This script generates the ctm files for dev_aspire, test_aspire and eval_aspire # for scoring with ASpIRE scoring server. # It also provides the WER for dev_aspire data. iter=final mfccdir=mfcc_reverb_submission stage=0 decode_num_jobs=200 num_jobs=30 LMWT=12 word_ins_penalty=0 min_lmwt=9 max_lmwt=20 word_ins_penalties=0.0,0.25,0.5,0.75,1.0 decode_mbr=true acwt=0.1 lattice_beam=8 ctm_beam=6 do_segmentation=true max_count=100 # parameter for extract_ivectors.sh sub_speaker_frames=1500 overlap=5 window=30 affix= ivector_scale=1.0 pad_frames=0 # this did not seem to be helpful but leaving it as an option. tune_hyper=true pass2_decode_opts= filter_ctm=true weights_file= silence_weight=0.00001 . ./cmd.sh [ -f ./path.sh ] && . ./path.sh . parse_options.sh || exit 1; if [ $# -ne 3 ]; then echo "Usage: $0 [options] <data-dir> <lang-dir> <model-dir>" echo " Options:" echo " --stage (0|1|2) # start scoring script from part-way through." 
echo "e.g.:" echo "$0 data/train data/lang exp/nnet2_multicondition/nnet_ms_a" exit 1; fi data_dir=$1 #select from {dev_aspire, test_aspire, eval_aspire} lang=$2 # data/lang dir=$3 # exp/nnet2_multicondition/nnet_ms_a model_affix=`basename $dir` ivector_dir=`dirname $dir` ivector_affix=${affix:+_$affix}_${model_affix}_iter$iter affix=_${affix}_iter${iter} act_data_dir=${data_dir} if [ "$data_dir" == "test_aspire" ]; then out_file=single_dev_test${affix}_$model_affix.ctm elif [ "$data_dir" == "eval_aspire" ]; then out_file=single_eval${affix}_$model_affix.ctm else if [ $stage -le 1 ]; then echo "Creating the data dir with whole recordings without segmentation" # create a whole directory without the segments unseg_dir=data/${data_dir}_whole src_dir=data/$data_dir mkdir -p $unseg_dir echo "Creating the $unseg_dir/wav.scp file" cp $src_dir/wav.scp $unseg_dir echo "Creating the $unseg_dir/reco2file_and_channel file" cat $unseg_dir/wav.scp | awk '{print $1, $1, "A";}' > $unseg_dir/reco2file_and_channel cat $unseg_dir/wav.scp | awk '{print $1, $1;}' > $unseg_dir/utt2spk utils/utt2spk_to_spk2utt.pl $unseg_dir/utt2spk > $unseg_dir/spk2utt steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" --mfcc-config conf/mfcc_hires.conf $unseg_dir exp/make_mfcc_reverb/${data_dir}_whole $mfccdir || exit 1; steps/compute_cmvn_stats.sh $unseg_dir exp/make_mfcc_reverb/${data_dir}_whole $mfccdir || exit 1; fi data_dir=${data_dir}_whole out_file=single_dev${affix}_${model_affix}.ctm fi num_jobs=`cat data/${act_data_dir}/wav.scp|wc -l` segmented_data_dir=${data_dir} # extract the ivectors if $do_segmentation; then segmented_data_dir=${data_dir}_uniformsegmented_win${window}_over${overlap} fi if [ $stage -le 2 ]; then echo "Generating uniform segments with length $window and overlap $overlap." 
rm -rf data/$segmented_data_dir copy_data_dir.sh --validate-opts "--no-text" data/$data_dir data/$segmented_data_dir || exit 1; cp data/$data_dir/reco2file_and_channel data/$segmented_data_dir/ || exit 1; python local/multi_condition/create_uniform_segments.py --overlap $overlap --window $window data/$segmented_data_dir || exit 1; for file in cmvn.scp feats.scp; do rm -f data/$segmented_data_dir/$file done utils/validate_data_dir.sh --no-text --no-feats data/$segmented_data_dir || exit 1; fi if [ $stage -le 3 ]; then echo "Extracting features for the segments" # extract the features/i-vectors once again so that they are indexed by utterance and not by recording rm -rf data/${segmented_data_dir}_hires copy_data_dir.sh --validate-opts "--no-text " data/${segmented_data_dir} data/${segmented_data_dir}_hires || exit 1; steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${segmented_data_dir}_hires \ exp/make_reverb_hires/${segmented_data_dir} $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/${segmented_data_dir}_hires exp/make_reverb_hires/${segmented_data_dir} $mfccdir || exit 1; utils/fix_data_dir.sh data/${segmented_data_dir}_hires utils/validate_data_dir.sh --no-text data/${segmented_data_dir}_hires fi if [ $stage -le 4 ]; then echo "Extracting i-vectors, stage 1" steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \ --max-count $max_count \ data/${segmented_data_dir}_hires $ivector_dir/extractor \ $ivector_dir/ivectors_${segmented_data_dir}${ivector_affix}_stage1 || exit 1; fi if [ $ivector_scale != 1.0 ] && [ $ivector_scale != 1 ]; then ivector_scale_affix=_scale$ivector_scale else ivector_scale_affix= fi if [ $stage -le 5 ]; then if [ "$ivector_scale_affix" != "" ]; then echo "$0: Scaling iVectors, stage 1" srcdir=$ivector_dir/ivectors_${segmented_data_dir}${ivector_affix}_stage1 outdir=$ivector_dir/ivectors_${segmented_data_dir}${ivector_affix}${ivector_scale_affix}_stage1 mkdir -p $outdir copy-matrix 
--scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- | \ copy-feats --compress=true ark:- ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp || exit 1; cp $srcdir/ivector_period $outdir/ivector_period fi fi decode_dir=$dir/decode_${segmented_data_dir}${affix}_pp # generate the lattices if [ $stage -le 6 ]; then echo "Generating lattices, stage 1" local/multi_condition/decode.sh --nj $decode_num_jobs --cmd "$decode_cmd" --config conf/decode.config \ --online-ivector-dir $ivector_dir/ivectors_${segmented_data_dir}${ivector_affix}${ivector_scale_affix}_stage1 \ --skip-scoring true --iter $iter \ exp/tri5a/graph_pp data/${segmented_data_dir}_hires ${decode_dir}_stage1 || exit 1; fi if [ $stage -le 7 ]; then echo "$0: generating CTM from stage-1 lattices" local/multi_condition/get_ctm_conf.sh --cmd "$decode_cmd" \ --use-segments false --iter $iter \ data/${segmented_data_dir}_hires \ ${lang} \ ${decode_dir}_stage1 || exit 1; fi if [ $stage -le 8 ]; then if $filter_ctm; then if [ ! -z $weights_file ]; then echo "$0: Using provided weights file $weights_file" ivector_extractor_input=$weights_file else ctm=${decode_dir}_stage1/score_10/${segmented_data_dir}_hires.ctm echo "$0: generating weights file from stage-1 ctm $ctm" feat-to-len scp:data/${segmented_data_dir}_hires/feats.scp ark,t:- >${decode_dir}_stage1/utt.lengths.$affix if [ ! 
-f $ctm ]; then echo "$0: stage 8: expected ctm to exist: $ctm"; exit 1; fi cat $ctm | awk '$6 == 1.0 && $4 < 1.0' | \ grep -v -w mm | grep -v -w mhm | grep -v -F '[noise]' | \ grep -v -F '[laughter]' | grep -v -F '<unk>' | \ perl -e ' $lengths=shift @ARGV; $pad_frames=shift @ARGV; $silence_weight=shift @ARGV; $pad_frames >= 0 || die "bad pad-frames value $pad_frames"; open(L, "<$lengths") || die "opening lengths file"; @all_utts = (); $utt2ref = { }; while (<L>) { ($utt, $len) = split(" ", $_); push @all_utts, $utt; $array_ref = [ ]; for ($n = 0; $n < $len; $n++) { ${$array_ref}[$n] = $silence_weight; } $utt2ref{$utt} = $array_ref; } while (<STDIN>) { @A = split(" ", $_); @A == 6 || die "bad ctm line $_"; $utt = $A[0]; $beg = $A[2]; $len = $A[3]; $beg_int = int($beg * 100) - $pad_frames; $len_int = int($len * 100) + 2*$pad_frames; $array_ref = $utt2ref{$utt}; !defined $array_ref && die "No length info for utterance $utt"; for ($t = $beg_int; $t < $beg_int + $len_int; $t++) { if ($t >= 0 && $t < @$array_ref) { ${$array_ref}[$t] = 1; } } } foreach $utt (@all_utts) { $array_ref = $utt2ref{$utt}; print $utt, " [ ", join(" ", @$array_ref), " ] "; } ' ${decode_dir}_stage1/utt.lengths.$affix $pad_frames $silence_weight | gzip -c >${decode_dir}_stage1/weights${affix}.gz ivector_extractor_input=${decode_dir}_stage1/weights${affix}.gz fi else ivector_extractor_input=${decode_dir}_stage1 fi fi if [ $stage -le 8 ]; then echo "Extracting i-vectors, stage 2 with input $ivector_extractor_input" # this does offline decoding, except we estimate the iVectors per # speaker, excluding silence (based on alignments from a GMM decoding), with a # different script. This is just to demonstrate that script. # the --sub-speaker-frames is optional; if provided, it will divide each speaker # up into "sub-speakers" of at least that many frames... can be useful if # acoustic conditions drift over time within the speaker's data. 
steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj 20 \ --silence-weight $silence_weight \ --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ data/${segmented_data_dir}_hires $lang $ivector_dir/extractor \ $ivector_extractor_input $ivector_dir/ivectors_${segmented_data_dir}${ivector_affix} || exit 1; fi if [ $stage -le 9 ]; then echo "Generating lattices, stage 2 with --acwt $acwt" rm -f ${decode_dir}_tg/.error local/multi_condition/decode.sh --nj $decode_num_jobs --cmd "$decode_cmd" --config conf/decode.config $pass2_decode_opts \ --skip-scoring true --iter $iter --acwt $acwt --lattice-beam $lattice_beam \ --online-ivector-dir $ivector_dir/ivectors_${segmented_data_dir}${ivector_affix} \ exp/tri5a/graph_pp data/${segmented_data_dir}_hires ${decode_dir}_tg || touch ${decode_dir}_tg/.error [ -f ${decode_dir}_tg/.error ] && echo "$0: Error decoding" && exit 1; fi if [ $stage -le 10 ]; then echo "Rescoring lattices" steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ --skip-scoring true \ ${lang}_pp_test{,_fg} data/${segmented_data_dir}_hires \ ${decode_dir}_{tg,fg} || exit 1; fi # tune the LMWT and WIP # make command for filtering the ctms decode_dir=${decode_dir}_fg if [ -z $iter ]; then model=$decode_dir/../final.mdl # assume model one level up from decoding dir. 
else model=$decode_dir/../$iter.mdl fi mkdir -p $decode_dir/scoring # create a python script to filter the ctm, for labels which are mapped # to null strings in the glm or which are not accepted by the scoring server python -c " import sys, re lines = map(lambda x: x.strip(), open('data/${act_data_dir}/glm').readlines()) patterns = [] for line in lines: if re.search('=>', line) is not None: parts = re.split('=>', line.split('/')[0]) if parts[1].strip() == '': patterns.append(parts[0].strip()) print '|'.join(patterns) " > $decode_dir/scoring/glm_ignore_patterns || exit 1; ignore_patterns=$(cat $decode_dir/scoring/glm_ignore_patterns) echo "$0: Ignoring these patterns from the ctm ", $ignore_patterns cat << EOF > $decode_dir/scoring/filter_ctm.py import sys file = open(sys.argv[1]) out_file = open(sys.argv[2], 'w') ignore_set = "$ignore_patterns".split("|") ignore_set.append("[noise]") ignore_set.append("[laughter]") ignore_set.append("[vocalized-noise]") ignore_set.append("!SIL") ignore_set.append("<unk>") ignore_set.append("%hesitation") ignore_set = set(ignore_set) print ignore_set for line in file: if line.split()[4] not in ignore_set: out_file.write(line) out_file.close() EOF filter_ctm_command="python $decode_dir/scoring/filter_ctm.py " if $tune_hyper ; then if [ $stage -le 11 ]; then if [ "$act_data_dir" == "dev_aspire" ]; then wip_string=$(echo $word_ins_penalties | sed 's/,/ /g') temp_wips=($wip_string) $decode_cmd WIP=1:${#temp_wips[@]} $decode_dir/scoring/log/score.wip.WIP.log \ wips=\(0 $wip_string\) \&\& \ wip=\${wips[WIP]} \&\& \ echo \$wip \&\& \ $decode_cmd LMWT=$min_lmwt:$max_lmwt $decode_dir/scoring/log/score.LMWT.\$wip.log \ local/multi_condition/get_ctm.sh --filter-ctm-command "$filter_ctm_command" \ --window $window --overlap $overlap \ --beam $ctm_beam --decode-mbr $decode_mbr \ --glm data/${act_data_dir}/glm --stm data/${act_data_dir}/stm \ LMWT \$wip $lang data/${segmented_data_dir}_hires $model $decode_dir || exit 1; eval "grep Sum 
$decode_dir/score_{${min_lmwt}..${max_lmwt}}/penalty_{$word_ins_penalties}/*.sys"|utils/best_wer.sh 2>/dev/null eval "grep Sum $decode_dir/score_{${min_lmwt}..${max_lmwt}}/penalty_{$word_ins_penalties}/*.sys" | \ utils/best_wer.sh 2>/dev/null | python -c "import sys, re line = sys.stdin.readline() file_name=line.split()[-1] parts=file_name.split('/') penalty = re.sub('penalty_','',parts[-2]) lmwt = re.sub('score_','', parts[-3]) lmfile=open('$decode_dir/scoring/bestLMWT','w') lmfile.write(str(lmwt)) lmfile.close() wipfile=open('$decode_dir/scoring/bestWIP','w') wipfile.write(str(penalty)) wipfile.close() " || exit 1; LMWT=$(cat $decode_dir/scoring/bestLMWT) word_ins_penalty=$(cat $decode_dir/scoring/bestWIP) fi fi if [ "$act_data_dir" == "test_aspire" ] || [ "$act_data_dir" == "eval_aspire" ]; then dev_decode_dir=$(echo $decode_dir|sed "s/test_aspire/dev_aspire_whole/g; s/eval_aspire/dev_aspire_whole/g") if [ -f $dev_decode_dir/scoring/bestLMWT ]; then LMWT=$(cat $dev_decode_dir/scoring/bestLMWT) echo "Using the bestLMWT $LMWT value found in $dev_decode_dir" else echo "Unable to find the bestLMWT in the dev decode dir $dev_decode_dir" echo "Keeping the default/user-specified value" fi if [ -f $dev_decode_dir/scoring/bestWIP ]; then word_ins_penalty=$(cat $dev_decode_dir/scoring/bestWIP) echo "Using the bestWIP $word_ins_penalty value found in $dev_decode_dir" else echo "Unable to find the bestWIP in the dev decode dir $dev_decode_dir" echo "Keeping the default/user-specified value" fi else echo "Using the default/user-specified values for LMWT and word_ins_penalty" fi fi # lattice to ctm conversion and scoring. 
if [ $stage -le 12 ]; then
  echo "Generating CTMs with LMWT $LMWT and word insertion penalty of $word_ins_penalty"
  local/multi_condition/get_ctm.sh --filter-ctm-command "$filter_ctm_command" \
    --beam $ctm_beam --decode-mbr $decode_mbr \
    $LMWT $word_ins_penalty $lang data/${segmented_data_dir}_hires $model $decode_dir \
    2>$decode_dir/scoring/finalctm.LMWT$LMWT.WIP$word_ins_penalty.log || exit 1;
fi

if [ $stage -le 13 ]; then
  # strip the segment suffix from the utterance id and emit one CTM entry
  # per line ("\n" in the printf format: a CTM file is line-oriented).
  cat $decode_dir/score_$LMWT/penalty_$word_ins_penalty/ctm.filt | \
    awk '{split($1, parts, "-"); printf("%s 1 %s %s %s\n", parts[1], $3, $4, $5)}' > $out_file
  # list of recording names, one per line, used to fill in recordings
  # that ended up with no decoded words.
  cat data/${segmented_data_dir}_hires/wav.scp | \
    awk '{split($1, parts, "-"); printf("%s\n", parts[1])}' > $decode_dir/score_$LMWT/penalty_$word_ins_penalty/recording_names
  local/multi_condition/fill_missing_recordings.py $out_file $out_file.submission \
    $decode_dir/score_$LMWT/penalty_$word_ins_penalty/recording_names
  echo "Generated the ctm @ $out_file.submission from the ctm file $decode_dir/score_${LMWT}/penalty_$word_ins_penalty/ctm.filt"
fi