egs/wsj/s5/local/cstr_wsj_extend_dict.sh
  #!/bin/bash
  
  # This script builds a larger word-list and dictionary than those
  # used for the LMs supplied with the WSJ corpus.  It uses a couple of
  # strategies to fill in words that appear in the LM training data but
  # are not in CMUdict.  One is to generate special prons for possible
  # acronyms, consisting just of the prons of the constituent letters.
  # The other is designed to handle derivatives of known words (e.g.
  # deriving the pron of a plural from the pron of the base word), but
  # in a more general, learned-from-data way.
  # It makes use of scripts in local/dict/
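  #
  # Illustrative examples (not taken from an actual run): an OOV that looks
  # like an acronym, say "ABCD", can get a pron made by concatenating the
  # CMUdict prons of the letters A, B, C and D, while an OOV plural such as
  # "WIDGETS" can get its pron derived from the CMUdict pron of "WIDGET" by
  # a learned suffix rule that appends the phone S.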
  
  dict_suffix=
  
  echo "$0 $@"  # Print the command line for logging
  . utils/parse_options.sh || exit 1;
  
  if [ $# -ne 1 ]; then
    echo "Usage: local/cstr_wsj_train_lms.sh WSJ1_doc_dir"
    exit 1
  fi
  
  export PATH=$PATH:`pwd`/local/dict/
  srcdir=$1
  
  if [ ! -d $srcdir/lng_modl ]; then
    echo "Expecting 'lng_modl' under WSJ doc directory '$srcdir'"
    exit 1
  fi
  
  mkdir -p data/local/dict${dict_suffix}_larger
  dir=data/local/dict${dict_suffix}_larger
  cp data/local/dict${dict_suffix}/* data/local/dict${dict_suffix}_larger # Various files describing phones etc.
    # are there; we just want to copy them as the phoneset is the same.
  rm data/local/dict${dict_suffix}_larger/lexicon.txt # we don't want this.
  rm data/local/dict${dict_suffix}_larger/lexiconp.txt # we don't want this either.
  mincount=2 # Minimum count of an OOV we will try to generate a pron for.
  
  [ ! -f data/local/dict${dict_suffix}/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1;
  
  # Remove comments from cmudict, and strip the alternate-pron markers from
  # words like FOO(1): our dict format doesn't use these markers, so the
  # alternate prons just become extra lines for the same word.
  grep -v ';;;' data/local/dict${dict_suffix}/cmudict/cmudict.0.7a | 
   perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu
  
  cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu
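  # At this point each line of $dir/dict.cmu should look like
  # "WORD  PHONE1 PHONE2 ...", e.g. something like
  # "ABANDON  AH0 B AE1 N D AH0 N"; alternate prons survive as extra lines
  # for the same word.  $dir/wordlist.cmu is just the unique word column.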
  
  echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"
  
  # Convert to uppercase, remove XML-like markings.
  # For words ending in "." that are not in CMUdict, we assume that these
  # are periods that somehow remained in the data during data preparation,
  # and we replace the "." with "\n".  Note: we found this by looking at
  # oov.counts below (before adding this rule).
  
  touch $dir/cleaned.gz
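  # If a complete cleaned.gz from a previous run is already present (the
  # finished file is expected to come out at about 73 MB), skip the slow
  # regeneration step below.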
  if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
    echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
  else
   gunzip -c $srcdir/lng_modl/lm_train/np_data/{87,88,89}/*.z \
    | awk '/^</{next}{print toupper($0)}' | perl -e '
     open(F, "<$ARGV[0]")||die;
     while(<F>){ chop; $isword{$_} = 1; }
     while(<STDIN>) { 
      @A = split(" ", $_); 
      for ($n = 0; $n < @A; $n++) {
        $a = $A[$n];
        if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
           # and have no other "." in them: treat as period.
           print "$a";
           if ($n+1 < @A) { print "\n"; }
        } else { print "$a "; }
      }
      print "
  ";
    }
   ' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz
  fi
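  # Optional sanity check (illustrative; not run by this script):
  #   gunzip -c $dir/cleaned.gz | head
  # should show upper-cased LM training text with the XML-like lines removed.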
    
  # get unigram counts
  echo "Getting unigram counts"
  gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
    awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams
  
  cat $dir/unigrams | awk -v dict=$dir/dict.cmu \
    'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
     > $dir/oov.counts
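  # Both $dir/unigrams and $dir/oov.counts have the format "count word",
  # sorted by descending count; oov.counts is restricted to words that are
  # not in CMUdict.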
  
  echo "Most frequent unseen unigrams are: "
  head $dir/oov.counts
  
  # Prune away words whose count is below $mincount (by default, singletons),
  # and remove entries that contain digits (which should have been normalized)
  # or no letters at all.
  
  
  cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \
    | awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist
  
  # Automatic rule-finding...
  
  # First make some prons for possible acronyms.
  # Note: we don't do this for things like U.K or U.N,
  # or A.B. (which doesn't exist anyway), 
  # as we consider these to be normalization/spelling errors.
  
  cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu  > $dir/dict.acronyms
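  # $dir/dict.acronyms is in the same "WORD  PHONE1 PHONE2 ..." format as
  # dict.cmu, so it can simply be merged with the other dictionaries below.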
  
  mkdir $dir/f $dir/b # forward, backward directions of rules...
    # forward is normal suffix
    # rules, backward is reversed (prefix rules).  These
    # dirs contain stuff we create while making the rule-based
    # extensions to the dictionary.
  
  # Remove ; and , from words, if they are present; these
  # might crash our scripts, as they are used as separators there.
  filter_dict.pl $dir/dict.cmu > $dir/f/dict 
  cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
  reverse_dict.pl $dir/f/dict > $dir/b/dict
  reverse_dict.pl $dir/f/oovs > $dir/b/oovs
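  # reverse_dict.pl reverses the spelling of each word and the order of the
  # phones in its pron, so running the suffix-rule machinery on b/ below
  # effectively learns prefix rules for the original words.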
  
  # The next stage takes a few minutes.
  # Note: the forward stage takes longer, as English is
  # mostly a suffix-based language, and there are more rules
  # that it finds.
  for d in $dir/f $dir/b; do
   (
     cd $d
     cat dict | get_rules.pl 2>get_rules.log >rules
     get_rule_hierarchy.pl rules >hierarchy
     awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
       limit_candidate_prons.pl hierarchy | \
       score_prons.pl dict | \
       count_rules.pl >rule.counts
     # the sort command below is just for convenience of reading.
     score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores
     get_candidate_prons.pl rules.with_scores dict oovs | \
       limit_candidate_prons.pl hierarchy > oovs.candidates
   )  &   
  done 
  wait
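  # Optional: the learned rules and candidate prons can be inspected with,
  # e.g. (illustrative; not run by this script):
  #   head $dir/f/rules.with_scores $dir/f/oovs.candidates
  # In rules.with_scores the fields are ';'-separated and the third field is
  # the numeric score used for the sort above.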
  
  # Merge the candidates.
  reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
  select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s  %s\n", $1, $2);}' \
    > $dir/dict.oovs
  
  cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
  
  awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
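  # The next line computes the set difference oovlist minus oovlist.handled:
  # diff prefixes lines present only in its first input with "< ", which we
  # strip; "grep -v 'd'" drops diff's control lines (e.g. "5d4"), which is
  # safe here since the words themselves are all upper-case.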
  sort $dir/oovlist | diff - $dir/oovlist.handled  | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled
  
  
  # add_counts.pl attaches the original counts to the lists of handled and not-handled OOVs.
  add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
  add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts
  
  echo "**Top OOVs we handled are:**"; 
  head $dir/oovlist.handled.counts
  echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**"; 
  head $dir/oovlist.not_handled.counts
  
  
  echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
  echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
  echo "Count of OOVs we didn't handle due to low count is" \
      `awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
  # The two files created above are for humans to look at, as diagnostics.
  
  cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
  !SIL SIL
  <SPOKEN_NOISE> SPN
  <UNK> SPN
  <NOISE> NSN
  EOF
  
  echo "Created $dir/lexicon.txt"