#!/bin/bash

# This script builds a larger word-list and dictionary
# than the ones used for the LMs supplied with the WSJ corpus.
# It uses a couple of strategies to fill in prons for words that
# appear in the LM training data but not in CMUdict.  One is
# to generate special prons for possible acronyms, consisting
# just of the prons of the constituent letters.  The other
# is designed to handle derivatives of known words
# (e.g. deriving the pron of a plural from the pron of
# the base-word), but in a more general, learned-from-data
# way.
# It makes use of scripts in local/dict/.

dict_suffix=

echo "$0 $@"  # Print the command line for logging
. utils/parse_options.sh || exit 1;

if [ $# -ne 1 ]; then
  echo "Usage: local/wsj_extend_dict.sh /foo/bar/WSJ/13-32.1/"
  exit 1
fi

if [ "`basename $1`" != 13-32.1 ]; then
  echo "Expecting the argument to this script to end in 13-32.1"
  echo "Note: if you have an old-style WSJ distribution,"
  echo "local/cstr_wsj_extend_dict.sh may work instead; see run.sh for an example."
  exit 1
fi

# e.g.
#srcdir=/mnt/matylda2/data/WSJ1/13-32.1

export PATH=$PATH:`pwd`/local/dict/
srcdir=$1
mkdir -p data/local/dict${dict_suffix}_larger
dir=data/local/dict${dict_suffix}_larger
cp -r data/local/dict${dict_suffix}/* \
   data/local/dict${dict_suffix}_larger # Various files describing phones etc.
   # are there; we just want to copy them
   # as the phoneset is the same.
rm data/local/dict${dict_suffix}_larger/lexicon.txt # we don't want this.
rm data/local/dict${dict_suffix}_larger/lexiconp.txt # we don't want this either.

mincount=2 # Minimum count of an OOV we will try to generate a pron for.

[ ! -f data/local/dict${dict_suffix}/cmudict/cmudict.0.7a ] && \
  echo "CMU dict not in expected place" && exit 1;

# Remove comments from cmudict; print first field; remove
# words like FOO(1) which are alternate prons: our dict format won't
# include these markers.
grep -v ';;;' data/local/dict${dict_suffix}/cmudict/cmudict.0.7a | \
  perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu

cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu

echo "Getting training data [this should take at least a few seconds; if not, there's a problem]"

# Convert to uppercase, remove XML-like markings.
# For words ending in "." that are not in CMUdict, we assume that these
# are periods that somehow remained in the data during data preparation,
# and we replace the "." with "\n".  Note: we found this by looking at
# oov.counts below (before adding this rule).

touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
  echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
  gunzip -c $srcdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
   | awk '/^</{next}{print toupper($0)}' | perl -e '
   open(F, "<$ARGV[0]")||die;
   while(<F>){ chop; $isword{$_} = 1; }
   while(<STDIN>) {
     @A = split(" ", $_);
     for ($n = 0; $n < @A; $n++) {
       $a = $A[$n];
       if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "."
          # and have no other "." in them: treat as period.
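          # E.g. a token like "CONTRACTS." (not itself a dictionary entry)
          # becomes "CONTRACTS" followed by a line break below, while "U.S."
          # is left alone because it has an internal "."  (example tokens
          # are illustrative).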
print "$a"; if ($n+1 < @A) { print " "; } } else { print "$a "; } } print " "; } ' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz fi # get unigram counts echo "Getting unigram counts" gunzip -c $dir/cleaned.gz | tr -s ' ' ' ' | \ awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | sort -nr > $dir/unigrams cat $dir/unigrams | awk -v dict=$dir/dict.cmu \ 'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \ > $dir/oov.counts echo "Most frequent unseen unigrams are: " head $dir/oov.counts # Prune away singleton counts, and remove things with numbers in # (which should have been normalized) and with no letters at all. cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }}' \ | awk '/[0-9]/{next;} /[A-Z]/{print;}' > $dir/oovlist # Automatic rule-finding... # First make some prons for possible acronyms. # Note: we don't do this for things like U.K or U.N, # or A.B. (which doesn't exist anyway), # as we consider this normalization/spelling errors. cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms mkdir $dir/f $dir/b # forward, backward directions of rules... # forward is normal suffix # rules, backward is reversed (prefix rules). These # dirs contain stuff we create while making the rule-based # extensions to the dictionary. # Remove ; and , from words, if they are present; these # might crash our scripts, as they are used as separators there. filter_dict.pl $dir/dict.cmu > $dir/f/dict cat $dir/oovlist | filter_dict.pl > $dir/f/oovs reverse_dict.pl $dir/f/dict > $dir/b/dict reverse_dict.pl $dir/f/oovs > $dir/b/oovs # The next stage takes a few minutes. # Note: the forward stage takes longer, as English is # mostly a suffix-based language, and there are more rules # that it finds. for d in $dir/f $dir/b; do ( cd $d cat dict | get_rules.pl 2>get_rules.log >rules get_rule_hierarchy.pl rules >hierarchy awk '{print $1}' dict | get_candidate_prons.pl rules dict | \ limit_candidate_prons.pl hierarchy | \ score_prons.pl dict | \ count_rules.pl >rule.counts # the sort command below is just for convenience of reading. score_rules.pl <rule.counts | sort -t';' -k3,3 -n -r >rules.with_scores get_candidate_prons.pl rules.with_scores dict oovs | \ limit_candidate_prons.pl hierarchy > oovs.candidates ) & done wait # Merge the candidates. 
reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates

select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
  > $dir/dict.oovs

cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged

awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled

# add_counts.pl attaches the original counts to the list of handled/not-handled OOVs.
add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts
add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts

echo "**Top OOVs we handled are:**"; head $dir/oovlist.handled.counts
echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**"; head $dir/oovlist.not_handled.counts

echo "Count of OOVs we handled is `awk '{x+=$1} END{print x}' $dir/oovlist.handled.counts`"
echo "Count of OOVs we couldn't handle is `awk '{x+=$1} END{print x}' $dir/oovlist.not_handled.counts`"
echo "Count of OOVs we didn't handle due to low count is" \
  `awk -v thresh=$mincount '{if ($1 < thresh) x+=$1; } END{print x;}' $dir/oov.counts`
# The two files created above are for humans to look at, as diagnostics.

cat <<EOF | cat - $dir/dict.cmu $dir/dict.oovs_merged | sort | uniq > $dir/lexicon.txt
!SIL SIL
<SPOKEN_NOISE> SPN
<UNK> SPN
<NOISE> NSN
EOF

echo "Created $dir/lexicon.txt"
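# Example invocation and output (the path and the --dict-suffix value below
# are illustrative; utils/parse_options.sh maps --dict-suffix to $dict_suffix):
#
#   local/wsj_extend_dict.sh --dict-suffix "_nosp" /export/corpora/WSJ1/13-32.1
#
# The resulting lexicon.txt has one "WORD PHONE1 PHONE2 ..." entry per line:
# the CMU prons, the learned OOV/acronym prons, and the special entries from
# the here-document above, e.g. (pron shown is illustrative):
#
#   <NOISE> NSN
#   AARDVARKS AA1 R D V AA2 R K S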