egs/tedlium/s5_r2_wsj/local/prepare_dict.sh

  #!/bin/bash
  
  # Copyright 2010-2012 Microsoft Corporation  
  #           2012-2014 Johns Hopkins University (Author: Daniel Povey)
  #                2015 Guoguo Chen
  #                2016 Vimal Manohar
  # Apache 2.0
  
# Call this script from one level above, e.g. from the s5_r2_wsj/ directory.
# It puts its output in data/local/.
  
  # The parts of the output of this that will be needed are
  # [in data/local/dict/ ]
  # lexicon.txt
  # extra_questions.txt
  # nonsilence_phones.txt
  # optional_silence.txt
  # silence_phones.txt
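#
# For reference, entries look like this (hypothetical examples, in the usual
# Kaldi/CMUdict style):
#   lexicon.txt:            ABANDON  AH0 B AE1 N D AH0 N
#   nonsilence_phones.txt:  AA AA0 AA1 AA2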
  
  . ./path.sh
  . ./cmd.sh
  
  set -e 
  set -o pipefail
  set -u
  
  # run this from ../
  dict_suffix=
  stage=-1
  
  echo "$0 $@"  # Print the command line for logging
  . utils/parse_options.sh || exit 1;
  
  if [ $# -ne 1 ]; then
    echo "Usage: $0 <wordlist>"
    echo "e.g. : $0 data/local/local_lm/data/work/wordlist"
    exit 1
  fi
  
  wordlist=$1
  
  dir=data/local/dict${dict_suffix}
  mkdir -p $dir
  
  if [ ! -d $dir/cmudict ]; then
    # (1) Get the CMU dictionary
    svn co  https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \
      $dir/cmudict || exit 1;
  fi
  
# (You can add "-r 10966" to the svn command above to pin the cmudict revision,
# for strict compatibility.)
  
  
  #(2) Dictionary preparation:
  
  
  if [ $stage -le 0 ]; then
  # Make the phone set, adding silence and verbal/non-verbal noises at this
  # point.  The word-position suffixes (_B, _E, _I, _S for word-beginning,
  # -ending, -internal, and singleton phones) are added later, by
  # utils/prepare_lang.sh; the lexicon produced here is position-independent.
  
    # silence phones, one per line.
    (echo SIL; echo SPN; echo NSN; echo UNK;) > $dir/silence_phones.txt
    echo SIL > $dir/optional_silence.txt
  
  # nonsilence phones; each line lists the phones that correspond to the
  # same base phone.
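  # For example, the stress variants of a base phone end up on one line:
  #   AA AA0 AA1 AA2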
    cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
      perl -e 'while(<>){
    chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; 
    $phones_of{$1} .= "$_ "; }
  foreach $list (values %phones_of) {print $list . "\n"; } ' \
      > $dir/nonsilence_phones.txt || exit 1;
  
    # A few extra questions that will be added to those obtained by automatically clustering
    # the "real" phones.  These ask about stress; there's also one for silence.
  cat $dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
    cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
      >> $dir/extra_questions.txt || exit 1;
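  # The result is one question grouping all silence phones, plus one question
  # per stress class (the unstressed phones, all *0 phones, all *1 phones, etc.).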
  
    grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
      perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
      > $dir/dict.cmu || exit 1;
  
    # Add to cmudict the silences, noises etc.
  
    (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<unk> UNK'; echo '<NOISE> NSN'; ) | \
      cat - $dir/dict.cmu > $dir/lexicon2_raw.txt 
    awk '{print $1}' $dir/lexicon2_raw.txt > $dir/orig_wordlist
  
    cat <<EOF >$dir/silence_phones.txt 
  SIL
  SPN
  NSN
  UNK
  EOF
  
  fi
  
  
  if [ $stage -le 2 ]; then
    if [ ! -f exp/g2p/.done ]; then
      steps/dict/train_g2p.sh --cmd "$train_cmd" \
        --silence-phones $dir/silence_phones.txt \
        $dir/dict.cmu exp/g2p
      touch exp/g2p/.done
    fi
  fi
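
# The G2P model trained above is used later (stage 7) to synthesize
# pronunciations for any OOVs that the rule-based extension below
# cannot handle.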
  
# Put the helper scripts in local/dict/ (filter_dict.pl, get_rules.pl,
# get_acronym_prons.pl, etc.) on the PATH, since they are invoked by name below.
export PATH=$PATH:`pwd`/local/dict
  
  if [ $stage -le 3 ]; then
  # Find the words in $wordlist that are missing from the CMU-derived lexicon;
  # these are the OOVs we need to generate pronunciations for.
  cat $wordlist | python -c '
import sys

# words already covered by the lexicon (file passed as an argument).
words = {}
for line in open(sys.argv[1]).readlines():
  words[line.strip()] = 1

# words on stdin that are not in the lexicon are OOVs.
oovs = {}
for line in sys.stdin.readlines():
  word = line.strip()
  if word not in words:
    oovs[word] = 1

for oov in oovs:
  print (oov)' $dir/orig_wordlist | sort -u > $dir/oovlist
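  # (An equivalent shell sketch, assuming both files are plain word lists:
  #    comm -23 <(sort -u $wordlist) <(sort -u $dir/orig_wordlist) > $dir/oovlist )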
    
  # Handle acronym-like OOVs separately, composing their pronunciations from
  # the letter pronunciations already in the lexicon.
  cat $dir/oovlist | \
    get_acronym_prons.pl $dir/lexicon2_raw.txt > $dir/dict.acronyms
  fi
  
  mkdir -p $dir/f $dir/b # forward, backward directions of rules...
  
  if [ $stage -le 4 ]; then
  # "f" (forward) holds the normal suffix rules; "b" (backward) works on
  # reversed words, i.e. prefix rules.  These dirs contain stuff we create
  # while making the rule-based extensions to the dictionary.
  
    # Remove ; and , from words, if they are present; these
    # might crash our scripts, as they are used as separators there.
    filter_dict.pl $dir/dict.cmu > $dir/f/dict
    cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
    reverse_dict.pl $dir/f/dict > $dir/b/dict
    reverse_dict.pl $dir/f/oovs > $dir/b/oovs
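  # (reverse_dict.pl reverses the entries so that prefix rules can be learned
  # with the same suffix-rule machinery.)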
  fi
  
  if [ $stage -le 5 ]; then
    # The next stage takes a few minutes.
    # Note: the forward stage takes longer, as English is
    # mostly a suffix-based language, and there are more rules
    # that it finds.
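  # (Hypothetical illustration: a learned suffix rule might add pron suffix
  # "Z" for spelling suffix "S", so an OOV like DOGS gets DOG's pron plus Z.)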
    for d in $dir/f $dir/b; do
     (
       cd $d
       cat dict | get_rules.pl 2>get_rules.log >rules
       get_rule_hierarchy.pl rules >hierarchy
       awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
         limit_candidate_prons.pl hierarchy | \
         score_prons.pl dict | \
         count_rules.pl >rule.counts
       # the sort command below is just for convenience of reading.
       score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
       get_candidate_prons.pl rules.with_scores dict oovs | \
         limit_candidate_prons.pl hierarchy > oovs.candidates
     ) &
    done
    wait
  fi
  
  if [ $stage -le 6 ]; then
    # Merge the candidates.
    reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
  select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s  %s\n", $1, $2);}' \
      > $dir/dict.oovs
  
    cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
    awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
  # Keep the lines present only in the sorted oovlist (the "< word" lines of
  # the diff output); matching '^<' avoids accidentally dropping words that
  # merely contain the letter "d".
  sort $dir/oovlist | { diff - $dir/oovlist.handled || true; } | grep '^<' | sed 's:^< ::' > $dir/oovlist.not_handled
  fi
  
  if [ $stage -le 7 ]; then
    steps/dict/apply_g2p.sh --cmd "$train_cmd" \
      $dir/oovlist.not_handled exp/g2p exp/g2p/oov_lex
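  # lexicon.lex has tab-separated lines "<word> <score> <pronunciation>";
  # keep the word and pronunciation fields, dropping entries with no pron.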
    cat exp/g2p/oov_lex/lexicon.lex | cut -f 1,3 | awk '{if (NF > 1) print $0}' > \
      $dir/dict.oovs_g2p
  fi
  
  if [ $stage -le 8 ]; then
    # the sort | uniq is to remove a duplicated pron from cmudict.
    cat $dir/lexicon2_raw.txt $dir/dict.oovs_merged $dir/dict.oovs_g2p | sort | uniq > \
      $dir/lexicon.txt || exit 1;
    # lexicon.txt is without the _B, _E, _S, _I markers.
  
  # Remove any stale lexiconp.txt so that downstream scripts (e.g.
  # utils/prepare_lang.sh) regenerate it from the new lexicon.txt.
  rm $dir/lexiconp.txt 2>/dev/null || true
  fi
  
  echo "Dictionary preparation succeeded"