egs/wsj/s5/steps/cleanup/debug_lexicon.sh
#!/bin/bash

# Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# This script gets some stats that will help you debug the lexicon.

# Begin configuration section.
stage=1
remove_stress=false
nj=10   # number of jobs for various decoding-type things that we run.
cmd=run.pl
alidir=
# End configuration section

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh  # source the path.
. parse_options.sh || exit 1;

if [ $# != 5 ]; then
  echo "usage: $0 <data-dir> <lang-dir> <src-dir> <src-dict> <dir>"
  echo "e.g.: $0 data/train data/lang exp/tri4b data/local/dict/lexicon.txt exp/debug_lexicon"
  echo "main options (for others, see top of script file)"
  echo "  --nj <nj>                     # number of parallel jobs"
  echo "  --cmd <cmd>                   # command to run jobs, e.g. run.pl,queue.pl"
  echo "  --stage <stage>               # use to control partial reruns."
  echo "  --remove-stress <true|false>  # if true, remove stress before printing analysis"
  echo "                                # note: if you change this, you only have to rerun"
  echo "                                # from stage 10."
  echo "  --alidir <alignment-dir>      # if supplied, training-data alignments and transforms"
  echo "                                # are obtained from here instead of being generated."
  exit 1;
fi

data=$1
lang=$2
src=$3
srcdict=$4
dir=$5

set -e

for f in $data/feats.scp $lang/phones.txt $src/final.mdl $srcdict; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
done

mkdir -p $dir

utils/lang/check_phones_compatible.sh $lang/phones.txt $src/phones.txt
cp $lang/phones.txt $dir

if [ -z "$alidir" ]; then
  alidir=${src}_ali_$(basename $data)
  if [ $stage -le 1 ]; then
    steps/align_fmllr.sh --cmd "$cmd" --nj $nj $data $lang $src $alidir
  fi
fi

phone_lang=data/$(basename $lang)_phone_bg

if [ $stage -le 2 ]; then
  utils/lang/make_phone_bigram_lang.sh $lang $alidir $phone_lang
fi

if [ $stage -le 3 ]; then
  utils/mkgraph.sh $phone_lang $src $src/graph_phone_bg
fi

if [ $stage -le 4 ]; then
  steps/decode_si.sh --skip-scoring true \
    --cmd "$cmd" --nj $nj --transform-dir $alidir \
    --acwt 0.25 --beam 10.0 --lattice-beam 5.0 --max-active 2500 \
    $src/graph_phone_bg $data $src/decode_$(basename $data)_phone_bg
fi

if [ $stage -le 5 ]; then
  steps/get_train_ctm.sh --print-silence true --use-segments false \
    --cmd "$cmd" $data $lang $alidir
fi

if [ $stage -le 6 ]; then
  steps/get_ctm.sh --use-segments false --cmd "$cmd" --min-lmwt 3 --max-lmwt 8 \
    $data $phone_lang $src/decode_$(basename $data)_phone_bg
fi

if [ $stage -le 7 ]; then
  mkdir -p $dir
  # lmwt=4 corresponds to the scale we decoded at.
  cp $src/decode_$(basename $data)_phone_bg/score_4/$(basename $data).ctm $dir/phone.ctm
  cp $alidir/ctm $dir/word.ctm
fi

if [ $stage -le 8 ]; then
  # We'll use 'sort' to do most of the heavy lifting when processing the data.
  # Suppose word.ctm has an entry like
  #   sw02054 A 213.32 0.24 and
  # we'll convert it into two entries, with the start and end separately:
  #   sw02054-A 0021332 START and
  #   sw02054-A 0021356 END and
  #
  # And suppose phone.ctm has lines like
  #   sw02054 A 213.09 0.24 sil
  #   sw02054 A 213.33 0.13 ae_B
  # we'll convert them into lines where the time is derived from the midpoint
  # of the phone, like
  #   sw02054-A 0021321 PHONE sil
  #   sw02054-A 0021340 PHONE ae_B
  # and then we'll remove the optional-silence phones and, if needed, the
  # word-boundary markers from the phones, to get just
  #   sw02054-A 0021340 PHONE ae
  # Then, after sorting and merge-sorting the two ctm files, we can easily
  # work out, for each word, what the phones were during that time.
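  #
  # To illustrate (reusing the example above; the integer widths here follow
  # the comment's scale, not necessarily the exact %010.0f output of the
  # commands below), the merge-sorted combined.ctm would interleave the
  # records in time order:
  #   sw02054-A 0021332 START and
  #   sw02054-A 0021340 PHONE ae
  #   sw02054-A 0021356 END and
  # so the phones realized during a word are exactly the PHONE records between
  # its START and END records. This is also why the timestamps are printed as
  # fixed-width zero-padded integers and LC_ALL=C is exported: plain
  # byte-order string sorting then agrees with numeric time order.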
  grep -v '<eps>' $phone_lang/phones.txt | awk '{print $1, $1}' | \
    sed 's/_B$//' | sed 's/_I$//' | sed 's/_E$//' | sed 's/_S$//' >$dir/phone_map.txt

  export LC_ALL=C

  cat $dir/phone.ctm | utils/apply_map.pl -f 5 $dir/phone_map.txt > $dir/phone_mapped.ctm

  cat $dir/word.ctm | \
    awk '{printf("%s-%s %010.0f START %s\n", $1, $2, 1000*$3, $5);
          printf("%s-%s %010.0f END %s\n", $1, $2, 1000*($3+$4), $5);}' | \
    sort > $dir/word_processed.ctm

  # filter out those utterances which only appear in phone_processed.ctm but
  # not in word_processed.ctm
  cat $dir/phone_mapped.ctm | \
    awk '{printf("%s-%s %010.0f PHONE %s\n", $1, $2, 1000*($3+(0.5*$4)), $5);}' | \
    awk 'NR==FNR{a[$1] = 1; next} {if ($1 in a) print $0}' $dir/word_processed.ctm - | \
    sort > $dir/phone_processed.ctm

  # merge-sort both ctm's
  sort -m $dir/word_processed.ctm $dir/phone_processed.ctm > $dir/combined.ctm
fi

# After merge-sorting the two ctm's, we add <eps> entries to cover "deserted"
# phones (those left outside any word due to timing precision limits), and
# then merge all consecutive <eps> entries.
if [ $stage -le 9 ]; then
  awk '{print $1, $3, $4}' $dir/combined.ctm | \
    perl -e 'while (<>) { chop; @A = split(" ", $_);
      ($utt, $a, $b) = @A;
      if ($a eq "START") { $cur_word = $b; @phones = (); }
      if ($a eq "END") { print $utt, " ", $cur_word, " ", join(" ", @phones), "\n"; }
      if ($a eq "PHONE") {
        if ($prev eq "END") { print $utt, " ", "<eps>", " ", $b, "\n"; }
        else { push @phones, $b; } }
      $prev = $a; }' | \
    awk 'BEGIN{merge_prev=0;}
      {utt=$1; word=$2; pron=$3; for (i=4;i<=NF;i++) pron=pron" "$i;
       if (word_prev == "<eps>" && word == "<eps>" && utt_prev == utt) {merge=0; pron_prev=pron_prev" "pron;}
       else {merge=1;}
       if (merge_prev==1) {print utt_prev, word_prev, pron_prev;};
       merge_prev=merge; utt_prev=utt; word_prev=word; pron_prev=pron;}
      END{if (merge_prev==1) {print utt_prev, word_prev, pron_prev;}}' > $dir/ctm_prons.txt

  steps/cleanup/internal/get_non_scored_words.py $lang > $dir/non_scored_words

  steps/cleanup/internal/get_pron_stats.py $dir/ctm_prons.txt \
    $phone_lang/phones/silence.txt $phone_lang/phones/optional_silence.txt \
    $dir/non_scored_words - | \
    sort -nr > $dir/prons.txt
fi

if [ $stage -le 10 ]; then
  if $remove_stress; then
    perl -e 'while(<>) { @A=split(" ", $_); for ($n=1;$n<@A;$n++) { $A[$n] =~ s/[0-9]$//; } print join(" ", @A) . "\n"; }' \
      <$srcdict >$dir/lexicon.txt
  else
    cp $srcdict $dir/lexicon.txt
  fi

  silphone=$(cat $phone_lang/phones/optional_silence.txt)
  echo "<eps> $silphone" >> $dir/lexicon.txt

  awk '{count[$2] += $1;} END {for (w in count){print w, count[w];}}' \
    <$dir/prons.txt >$dir/counts.txt

  cat $dir/prons.txt | \
    if $remove_stress; then
      perl -e 'while(<>) { @A=split(" ", $_); for ($n=1;$n<@A;$n++) { $A[$n] =~ s/[0-9]$//; } print join(" ", @A) . "\n"; }'
    else
      cat
    fi | perl -e '
    print ";; <count-of-this-pron> <rank-of-this-pron> <frequency-of-this-pron> CORRECT|INCORRECT <word> <pron>\n";
    open(D, "<$ARGV[0]") || die "opening dict file $ARGV[0]";
    # create a hash of all reference pronunciations, and for each word, record
    # a list of the prons, separated by " | ".
    while (<D>) {
      @A = split(" ", $_);
      $is_pron{join(" ", @A)} = 1;
      $w = shift @A;
      if (!defined $prons{$w}) { $prons{$w} = join(" ", @A); }
      else { $prons{$w} = $prons{$w} . " | " . join(" ", @A); }
    }
    open(C, "<$ARGV[1]") || die "opening counts file $ARGV[1];";
    while (<C>) {
      @A = split(" ", $_);
      $word_count{$A[0]} = $A[1];
    }
    while (<STDIN>) {
      @A = split(" ", $_);
      $count = shift @A;
      $word = $A[0];
      $freq = sprintf("%0.2f", $count / $word_count{$word});
      $rank = ++$wcount{$word};  # 1 if top observed pron of word, 2 if second...
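      # @A is now "<word> <phone1> <phone2> ..."; the pron counts as CORRECT
      # only if exactly this line occurred in the reference lexicon, i.e. if
      # it is a key of %is_pron, built above.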
      $str = (defined $is_pron{join(" ", @A)} ? "CORRECT" : "INCORRECT");
      shift @A;
      print "$count $rank $freq $str $word \"" . join(" ", @A) . "\", ref = \"$prons{$word}\"\n";
    } ' $dir/lexicon.txt $dir/counts.txt >$dir/pron_info.txt

  grep -v '^;;' $dir/pron_info.txt | \
    awk '{ word=$5; count=$1;
           if (tot[word] == 0) { first_line[word] = $0; }
           corr[word] += ($4 == "CORRECT" ? count : 0);
           tot[word] += count; }
         END {for (w in tot) { printf("%s\t%s\t%s\t\t%s\n", tot[w], w, (corr[w]/tot[w]), first_line[w]); }}' | \
    sort -k1 -nr | \
    cat <( echo ';; <total-count-of-word> <word> <correct-proportion> <first-corresponding-line-in-pron_info.txt>') - \
    > $dir/word_info.txt
fi

if [ $stage -le 11 ]; then
  echo "$0: some of the more interesting stuff in $dir/pron_info.txt follows."
  echo "# grep -w INCORRECT $dir/pron_info.txt | grep -w 1 | head -n 20"
  grep -w INCORRECT $dir/pron_info.txt | grep -w 1 | head -n 20

  echo "$0: here are some other interesting things..."
  echo "# grep -w INCORRECT $dir/pron_info.txt | grep -w 1 | awk '\$3 > 0.4 && \$1 > 10' | head -n 20"
  grep -w INCORRECT $dir/pron_info.txt | grep -w 1 | awk '$3 > 0.4 && $1 > 10' | head -n 20

  echo "$0: here are some high-frequency words whose reference pronunciations rarely show up."
  echo "# awk '\$3 < 0.1' $dir/word_info.txt | head -n 20"
  awk '$3 < 0.1 || $1 == ";;"' $dir/word_info.txt | head -n 20
fi
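
# For reference, a pron_info.txt line might look like the following
# (hypothetical data, for illustration only; the fields follow the ";;"
# header written in stage 10):
#   1523 1 0.85 CORRECT the "dh ah", ref = "dh ah | dh iy"
# i.e. the top-ranked (rank-1) observed pron of "the" was seen 1523 times,
# accounting for 85% of that word's occurrences, and matches one of its
# reference prons.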