  #!/bin/bash
  # Copyright 2014  Johns Hopkins University (Author: Daniel Povey)
  # Apache 2.0
  
  # this script gets some stats that will help you debug the lexicon.
  
  # Begin configuration section.
  stage=1
  remove_stress=false
  nj=10  # number of jobs for various decoding-type things that we run.
  cmd=run.pl
  alidir=
  # End configuration section
  
  echo "$0 $@"  # Print the command line for logging
  
  [ -f path.sh ] && . ./path.sh # source the path.
  . parse_options.sh || exit 1;
  
  if [ $# != 5 ]; then
     echo "usage: $0 <data-dir> <lang-dir> <src-dir> <src-dict> <dir>"
     echo "e.g.: $0 data/train data/lang exp/tri4b data/local/dict/lexicon.txt exp/debug_lexicon"
     echo "main options (for others, see top of script file)"
     echo "  --nj <nj>                                        # number of parallel jobs"
     echo "  --cmd <cmd>                                      # command to run jobs, e.g. run.pl,queue.pl"
     echo "  --stage <stage>                                  # use to control partial reruns."
     echo "  --remove-stress <true|false>                     # if true, remove stress before printing analysis"
     echo "                                                   # note: if you change this, you only have to rerun"
     echo "                                                   # from stage 10."
     echo "  --alidir <alignment-dir>                         # if supplied, training-data alignments and transforms"
     echo "                                                   # are obtained from here instead of being generated."
     exit 1;
  fi
  
  data=$1
  lang=$2
  src=$3
  srcdict=$4
  dir=$5
  
  set -e
  
  for f in $data/feats.scp $lang/phones.txt $src/final.mdl $srcdict; do
    [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
  done
  
  mkdir -p $dir
  utils/lang/check_phones_compatible.sh $lang/phones.txt $src/phones.txt
  cp $lang/phones.txt $dir
  
  if [ -z "$alidir" ]; then
    alidir=${src}_ali_$(basename $data)
    if [ $stage -le 1 ]; then
      steps/align_fmllr.sh --cmd "$cmd" --nj $nj $data $lang $src $alidir
    fi
  fi
  
  phone_lang=data/$(basename $lang)_phone_bg
  
  if [ $stage -le 2 ]; then
    utils/lang/make_phone_bigram_lang.sh $lang $alidir $phone_lang
  fi
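
  # The phone-bigram "lang" directory built above is used in stages 3-4 to decode
  # the training data at the phone level, giving phone sequences that are not
  # constrained by the lexicon; later stages line these up against the word-level
  # alignments to see how each word was actually pronounced.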
  
  if [ $stage -le 3 ]; then
    utils/mkgraph.sh $phone_lang $src $src/graph_phone_bg
  fi
  
  if [ $stage -le 4 ]; then
    steps/decode_si.sh --skip-scoring true \
      --cmd "$cmd" --nj $nj --transform-dir $alidir \
      --acwt 0.25 --beam 10.0 --lattice-beam 5.0 --max-active 2500 \
      $src/graph_phone_bg $data $src/decode_$(basename $data)_phone_bg
  fi
  
  if [ $stage -le 5 ]; then
    steps/get_train_ctm.sh --print-silence true --use-segments false \
       --cmd "$cmd" $data $lang $alidir
  fi
  
  if [ $stage -le 6 ]; then
    steps/get_ctm.sh --use-segments false --cmd "$cmd" --min-lmwt 3 --max-lmwt 8 \
       $data $phone_lang $src/decode_$(basename $data)_phone_bg
  fi
  
  if [ $stage -le 7 ]; then
    mkdir -p $dir
    # lmwt=4 corresponds to the scale we decoded at (--acwt 0.25, i.e. an LM scale of 4).
    cp $src/decode_$(basename $data)_phone_bg/score_4/$(basename $data).ctm $dir/phone.ctm
  
    cp $alidir/ctm $dir/word.ctm
  fi
  
  if [ $stage -le 8 ]; then
  # we'll use 'sort' to do most of the heavy lifting when processing the data.
  # suppose word.ctm has an entry like
  # sw02054 A 213.32 0.24 and
  # we'll convert it into two entries like this, with the start and end separately:
  # sw02054-A 0021332 START and
  # sw02054-A 0021356 END and
  #
  # and suppose phone.ctm has lines like
  # sw02054 A 213.09 0.24 sil
  # sw02054 A 213.33 0.13 ae_B
  # we'll convert them into lines where the time is derived from the midpoint of the phone, like
  # sw02054 A 0021321 PHONE sil
  # sw02054 A 0021340 PHONE ae_B
  # and then we'll remove the optional-silence phones and, if needed, the word-boundary markers from
  # the phones, to get just
  # sw02054 A 0021340 PHONE ae
  # then after sorting and merge-sorting the two ctm files we can easily
  # work out, for each word, what the phones were during that time.
  
    grep -v '<eps>' $phone_lang/phones.txt | awk '{print $1, $1}' | \
      sed 's/_B$//' | sed 's/_I$//' | sed 's/_E$//' | sed 's/_S$//' >$dir/phone_map.txt
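    # phone_map.txt maps word-position-dependent phones to position-independent ones:
    # e.g. a line "ae_B ae_B" from the awk output becomes "ae_B ae" after the seds,
    # while phones without a _B/_I/_E/_S suffix map to themselves.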
  
  
    export LC_ALL=C
  
    cat $dir/phone.ctm | utils/apply_map.pl -f 5 $dir/phone_map.txt > $dir/phone_mapped.ctm
  
    cat $dir/word.ctm  | awk '{printf("%s-%s %010.0f START %s\n", $1, $2, 1000*$3, $5); printf("%s-%s %010.0f END %s\n", $1, $2, 1000*($3+$4), $5);}' | \
      sort > $dir/word_processed.ctm
  
    # filter out those utterances which appear only in phone_processed.ctm but not in word_processed.ctm
    cat $dir/phone_mapped.ctm | awk '{printf("%s-%s %010.0f PHONE %s\n", $1, $2, 1000*($3+(0.5*$4)), $5);}' | \
      awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $0}' $dir/word_processed.ctm - | \
      sort > $dir/phone_processed.ctm
  
    # merge-sort both ctm's
    sort -m $dir/word_processed.ctm $dir/phone_processed.ctm > $dir/combined.ctm
  fi
  
  # After merge-sorting the two ctm's, we add <eps> entries to cover "deserted" phones
  # (phones not assigned to any word, due to precision limits), and then merge all
  # consecutive <eps> entries.
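  # For example, two consecutive <eps> lines for the same utterance in the
  # intermediate output, such as (hypothetical phones)
  #   sw02054-A <eps> sil
  #   sw02054-A <eps> v
  # would be merged by the second awk command below into the single line
  #   sw02054-A <eps> sil v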
  if [ $stage -le 9 ]; then
    awk '{print $1, $3, $4}' $dir/combined.ctm | \
       perl -e ' while (<>) { chop; @A = split(" ", $_); ($utt, $a, $b) = @A;
       if ($a eq "START") { $cur_word = $b; @phones = (); }
       if ($a eq "END") { print $utt, " ", $cur_word, " ", join(" ", @phones), "\n"; }
       if ($a eq "PHONE") { if ($prev eq "END") { print $utt, " ", "<eps>", " ", $b, "\n"; } else { push @phones, $b; } } $prev = $a; } ' | \
       awk 'BEGIN{merge_prev=0;} {utt=$1;word=$2;pron=$3;for (i=4;i<=NF;i++) pron=pron" "$i;
       if (word_prev == "<eps>" && word == "<eps>" && utt_prev == utt) {merge=0;pron_prev=pron_prev" "pron;} else {merge=1;}
       if(merge_prev==1) {print utt_prev, word_prev, pron_prev;};
       merge_prev=merge; utt_prev=utt; word_prev=word; pron_prev=pron;}
       END{if(merge_prev==1) {print utt_prev, word_prev, pron_prev;}}' > $dir/ctm_prons.txt
  
    steps/cleanup/internal/get_non_scored_words.py $lang > $dir/non_scored_words
    steps/cleanup/internal/get_pron_stats.py $dir/ctm_prons.txt $phone_lang/phones/silence.txt $phone_lang/phones/optional_silence.txt $dir/non_scored_words - | \
      sort -nr > $dir/prons.txt
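
    # prons.txt (judging from how it is consumed below) has one line per observed
    # (word, pronunciation) pair:
    #   <count> <word> <phone1> <phone2> ...
    # sorted with the most frequent prons first.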
  fi
  
  if [ $stage -le 10 ]; then
    if $remove_stress; then
      perl -e 'while(<>) { @A=split(" ", $_); for ($n=1;$n<@A;$n++) { $A[$n] =~ s/[0-9]$//; } print join(" ", @A) . "\n"; } ' \
        <$srcdict >$dir/lexicon.txt
    else
      cp $srcdict $dir/lexicon.txt
    fi
    silphone=$(cat $phone_lang/phones/optional_silence.txt)
    echo "<eps> $silphone" >> $dir/lexicon.txt
  
    awk '{count[$2] += $1;} END {for (w in count){print w, count[w];}}' \
        <$dir/prons.txt >$dir/counts.txt
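    # i.e. counts.txt has one line per word, "<word> <total-count>", summing the
    # per-pron counts from prons.txt.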
  
  
  
    cat $dir/prons.txt | \
      if $remove_stress; then
        perl -e 'while(<>) { @A=split(" ", $_); for ($n=1;$n<@A;$n++) { $A[$n] =~ s/[0-9]$//; } print join(" ", @A) . "\n"; } '
      else
        cat
      fi | perl -e '
       print ";; <count-of-this-pron> <rank-of-this-pron> <frequency-of-this-pron> CORRECT|INCORRECT <word> <pron>\n";
       open(D, "<$ARGV[0]") || die "opening dict file $ARGV[0]";
       # create a hash of all reference pronunciations, and for each word, record
       # a list of the prons, separated by " | ".
       while (<D>) {
          @A = split(" ", $_); $is_pron{join(" ",@A)} = 1;
          $w = shift @A;
          if (!defined $prons{$w}) { $prons{$w} = join(" ", @A); }
          else { $prons{$w} = $prons{$w} . " | " . join(" ", @A); }
       }
       open(C, "<$ARGV[1]") || die "opening counts file $ARGV[1]";
       while (<C>) { @A = split(" ", $_); $word_count{$A[0]} = $A[1]; }
       while (<STDIN>) { @A = split(" ", $_);
         $count = shift @A; $word = $A[0]; $freq = sprintf("%0.2f", $count / $word_count{$word});
         $rank = ++$wcount{$word}; # 1 if top observed pron of word, 2 if second...
         $str = (defined $is_pron{join(" ", @A)} ? "CORRECT" : "INCORRECT");
         shift @A;
         print "$count $rank $freq $str $word \"" . join(" ", @A) . "\", ref = \"$prons{$word}\"\n";
       } ' $dir/lexicon.txt $dir/counts.txt >$dir/pron_info.txt
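
    # A line of pron_info.txt might look like (hypothetical word and counts):
    #   162 1 0.79 CORRECT water "w ao t er", ref = "w ao t er | w aa t er"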
  
    grep -v '^;;' $dir/pron_info.txt | \
       awk '{ word=$5; count=$1; if (tot[word] == 0) { first_line[word] = $0; }
              corr[word] += ($4 == "CORRECT" ? count : 0); tot[word] += count; }
            END {for (w in tot) { printf("%s\t%s\t%s\t\t%s\n", tot[w], w, (corr[w]/tot[w]), first_line[w]); }} ' \
       | sort -k1 -nr | cat <( echo ';; <total-count-of-word> <word> <correct-proportion>      <first-corresponding-line-in-pron_info.txt>') - \
        > $dir/word_info.txt
  fi
  
  if [ $stage -le 11 ]; then
    echo "$0: some of the more interesting stuff in $dir/pron_info.txt follows."
    echo "# grep -w INCORRECT $dir/pron_info.txt  | grep -w 1 | head -n 20"
  
    grep -w INCORRECT $dir/pron_info.txt  | grep -w 1 | head -n 20
  
    echo "$0: here are some other interesting things.."
    echo "# grep -w INCORRECT $dir/pron_info.txt  | grep -w 1 | awk '\$3 > 0.4 && \$1 > 10' | head -n 20"
    grep -w INCORRECT $dir/pron_info.txt  | grep -w 1 | awk '$3 > 0.4 && $1 > 10' | head -n 20
  
    echo "$0: here are some high-frequency words whose reference pronunciations rarely show up."
    echo "# awk '\$3 < 0.1' $dir/word_info.txt  | head -n 20"
    awk '$3 < 0.1 || $1 == ";;"' $dir/word_info.txt  | head -n 20
  
  
  fi