Blame view

egs/wsj/s5/steps/get_prons.sh 7.42 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
  #!/bin/bash
  # Copyright  2014  Johns Hopkins University (Author: Daniel Povey)
  #            2014  Guoguo Chen
  # Apache 2.0
  
  # Begin configuration section.
  cmd=run.pl
  stage=1
  lmwt=10
  # End configuration options.
  
  echo "$0 $@"  # Print the command line for logging
  
  [ -f path.sh ] && . ./path.sh # source the path.
  . parse_options.sh || exit 1;
  
  if [ $# != 3 ]; then
     echo "usage: $0 <data-dir> <lang-dir> <dir>"
     echo "e.g.:  $0 data/train data/lang exp/tri3"
     echo "or:  $0 data/train data/lang exp/tri3/decode_dev"
     echo "This script writes files prons.*.gz in the directory provided, which must"
     echo "contain alignments (ali.*.gz) or lattices (lat.*.gz).  These files are as"
     echo "output by nbest-to-prons (see its usage message)."
     echo "Main options (for others, see top of script file)"
     echo "  --config <config-file>                           # config containing options"
     echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
     echo "  --lmwt <lm-weight>                               # scale for LM, only applicable"
     echo "                                                   # for lattice input (default: 10)"
     exit 1;
  fi
  
  # As the usage message of nbest-to-prons says, its output has lines that can be interpreted as
  #  <utterance-id> <begin-frame> <num-frames> <word> <phone1> <phone2> ... <phoneN>
  # and you could convert these into text form using a command like:
  # gunzip -c prons.*.gz | utils/sym2int.pl -f 4 words.txt | utils/sym2int.pl -f 5- phones.txt
  
  
  
  data=$1
  lang=$2
  dir=$3
  
  for f in $data/utt2spk $lang/words.txt $dir/num_jobs; do
    [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
  done
  
  nj=$(cat $dir/num_jobs) || exit 1;
  sdata=$data/split$nj
  oov=`cat $lang/oov.int` || exit 1;
  [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
  
  
  if [ -f $dir/final.mdl ]; then
    mdl=$dir/final.mdl
  else
    if [ -f $dir/../final.mdl ]; then
      mdl=$dir/../final.mdl  # e.g. decoding directories.
    else
      echo "$0: expected $dir/final.mdl or $dir/../final.mdl to exist."
      exit 1;
    fi
  fi
  
  if [ -f $lang/phones/word_boundary.int ]; then
    align_words_cmd="lattice-align-words $lang/phones/word_boundary.int $mdl ark:- ark:-"
  else
    if [ ! -f $lang/phones/align_lexicon.int ]; then
      echo "$0: expected either $lang/phones/word_boundary.int or $lang/phones/align_lexicon.int to exist."
      exit 1;
    fi
    align_words_cmd="lattice-align-words-lexicon $lang/phones/align_lexicon.int $mdl ark:- ark:-"
  fi
  
  if [ -f $dir/ali.1.gz ]; then
    echo "$0: $dir/ali.1.gz exists, so starting from alignments."
  
    if [ $stage -le 1 ]; then
      rm $dir/prons.*.gz 2>/dev/null
      $cmd JOB=1:$nj $dir/log/nbest_to_prons.JOB.log \
        linear-to-nbest "ark:gunzip -c $dir/ali.JOB.gz|" \
        "ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$sdata/JOB/text |" \
        "" "" ark:- \| $align_words_cmd \| \
        nbest-to-prons $mdl ark:- "|gzip -c >$dir/prons.JOB.gz" || exit 1;
    fi
  else
    if [ ! -f $dir/lat.1.gz ]; then
      echo "$0: expected either $dir/ali.1.gz or $dir/lat.1.gz to exist."
      exit 1;
    fi
    echo "$0: $dir/lat.1.gz exists, so starting from lattices."
  
    if [ $stage -le 1 ]; then
      rm $dir/prons.*.gz 2>/dev/null
      $cmd JOB=1:$nj $dir/log/nbest_to_prons.JOB.log \
        lattice-1best --lm-scale=$lmwt "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
        $align_words_cmd \| \
        nbest-to-prons $mdl ark:- "|gzip -c >$dir/prons.JOB.gz" || exit 1;
    fi
  fi
  
  
  if [ $stage -le 2 ]; then
    gunzip -c $dir/prons.*.gz | \
      awk '{ $1=""; $2=""; $3=""; count[$0]++; } END{for (k in count) { print count[k], k; }}' > $dir/pron_counts.int || exit 1;
  fi
  
  if [ $stage -le 3 ]; then
    cat $dir/pron_counts.int | utils/int2sym.pl -f 2 $lang/words.txt | \
      utils/int2sym.pl -f 3- $lang/phones.txt | sort -nr > $dir/pron_counts.txt
  fi
  
  if [ $stage -le 4 ]; then
    if [ -f $lang/phones/word_boundary.int ]; then
      # remove the _B, _I, _S, _E markers from phones; this is often convenient
      # if we want to go back to a word-position-independent source lexicon.
      cat $dir/pron_counts.txt | perl -ane '@A = split(" ", $_);
       for ($n=2;$n<@A;$n++) { $A[$n] =~ s/_[BISE]$//; } print join(" ", @A) . "
  "; ' >$dir/pron_counts_nowb.txt
    fi
  fi
  
  if [ $stage -le 5 ]; then
    # Here we figure the count of silence before and after words (actually prons)
    # 1. Create a text like file, but instead of putting words, we write
    #    "word pron" pairs. We change the format of prons.*.gz from pron-per-line
    #    to utterance-per-line (with "word pron" pairs tab-separated), and add
    #    <s> and </s> at the begin and end of each sentence. The _B, _I, _S, _E
    #    markers are removed from phones.
    gunzip -c $dir/prons.*.gz | utils/int2sym.pl -f 4 $lang/words.txt | \
      utils/int2sym.pl -f 5- $lang/phones.txt | cut -d ' ' -f 1,4- | awk '
      BEGIN { utter_id = ""; }
      {
        if (utter_id == "") { utter_id = $1; printf("%s\t<s>", utter_id); }
        else if (utter_id != $1) {
          printf "\t</s>
  "; utter_id = $1; printf("%s\t<s>", utter_id);
        }
        printf("\t%s", $2);
        for (n = 3; n <= NF; n++) { sub("_[BISE]$", "", $n); printf(" %s", $n); }
      }
      END { printf "\t</s>
  "; }' > $dir/pron_perutt_nowb.txt
  
    # 2. Collect bigram counts for words. To be more specific, we are actually
    #    collecting counts for "v ? w", where "?" represents silence or
    #    non-silence.
    cat $dir/pron_perutt_nowb.txt | perl -ape 's/<eps>[^\t]*\t//g;' | perl -e '
      while (<>) {
        chomp; @col = split("\t");
        for($i = 1; $i < scalar(@col) - 1; $i += 1) {
          $bigram{$col[$i] . "\t" . $col[$i + 1]} += 1;
        }
      }
      foreach $key (keys %bigram) {
        print "$bigram{$key}\t$key
  ";
      }' > $dir/pron_bigram_counts_nowb.txt
  
    # 3. Collect bigram counts for silence and words. the count file has 4 fields
    #    for counts, followed by the "word pron" pair. All fields are separated by
    #    spaces:
    #    <sil-before-count> <nonsil-before-count> <sil-after-count> <nonsil-after-count> <word> <phone1> <phone2 >...
    cat $dir/pron_perutt_nowb.txt | cut -f 2- | perl -e '
      %sil_wpron = (); %nonsil_wpron = (); %wpron_sil = (); %wpron_nonsil = ();
      %words = ();
      while (<STDIN>) {
        chomp;
        @col = split(/[\t]+/, $_); @col >= 2 || die "'$0': bad line \"$_\"
  ";
        for ($n = 0; $n < @col - 1; $n++) {
          # First word is not silence, collect the wpron_sil and wpron_nonsil
          # stats.
          if ($col[$n] !~ m/^<eps> /) {
            if ($col[$n + 1] =~ m/^<eps> /) { $wpron_sil{$col[$n]} += 1; }
            else { $wpron_nonsil{$col[$n]} += 1; }
            $words{$col[$n]} = 1;
          }
          # Second word is not silence, collect the sil_wpron and nonsil_wpron
          # stats.
          if ($col[$n + 1] !~ m/^<eps> /) {
            if ($col[$n] =~ m/^<eps> /) { $sil_wpron{$col[$n + 1]} += 1; }
            else { $nonsil_wpron{$col[$n + 1]} += 1; }
            $words{$col[$n + 1]} = 1;
          }
        }
      }
      foreach $wpron (sort keys %words) {
        $sil_wpron{$wpron} += 0; $nonsil_wpron{$wpron} += 0;
        $wpron_sil{$wpron} += 0; $wpron_nonsil{$wpron} += 0;;
        print "$sil_wpron{$wpron} $nonsil_wpron{$wpron} ";
        print "$wpron_sil{$wpron} $wpron_nonsil{$wpron} $wpron
  ";
      }
    '> $dir/sil_counts_nowb.txt
  fi
  
  echo "$0: done writing prons to $dir/prons.*.gz, silence counts in "
  echo "$0: $dir/sil_counts_nowb.txt and pronunciation counts in "
  echo "$0: $dir/pron_counts.{int,txt}"
  if [ -f $lang/phones/word_boundary.int ]; then
    echo "$0: ... and also in $dir/pron_counts_nowb.txt"
  fi
  
  exit 0;