Blame view
egs/iam/v1/local/prepare_dict.sh
1.34 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
#!/usr/bin/env bash

# Copyright      2017  Hossein Hadian
#                2017  Chun Chieh Chang
#                2017  Ashish Arora

# This script prepares the dictionary directory.
#
# Inputs (relative to the current working directory):
#   data/train/text
#       One transcript per line; the first whitespace-separated field of each
#       line is skipped and the remaining fields are treated as words.
#   data/local/local_lm/data/word_count
#       Word list; the second field of each line is taken as the word and only
#       the first $vocab_size lines are read.
#
# Outputs (written to $dir):
#   nonsilence_phones.txt  - one letter per line ("#" written as <HASH>)
#   lexicon.txt            - "<word> <letter> <letter> ..." plus <sil>/<unk>
#   silence_phones.txt, optional_silence.txt  - the single phone SIL
#   extra_questions.txt    - empty
#
# NOTE(review): dir and vocab_size are presumably overridable from the command
# line through utils/parse_options.sh (e.g. --vocab-size 20000) - confirm
# against that script.

# pipefail makes a failure in any pipeline stage abort the script instead of
# silently producing empty output files.
set -e -o pipefail

dir=data/local/dict
vocab_size=50000
. ./utils/parse_options.sh

# Fail early with a clear message when a required input is missing.
for f in data/train/text data/local/local_lm/data/word_count; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist" >&2
    exit 1
  fi
done

mkdir -p "$dir"

# First get the set of all letters that occur in data/train/text.
# Each letter is printed on its own line so that sort -u can deduplicate them.
perl -ne '@A = split; shift @A; for (@A) { print join("\n", split(//)), "\n"; }' \
  < data/train/text \
  | sort -u > "$dir/nonsilence_phones.txt"

# An empty letter set would turn the character class used below into an
# invalid regular expression, so stop here instead.
if [ ! -s "$dir/nonsilence_phones.txt" ]; then
  echo "$0: no letters were found in data/train/text" >&2
  exit 1
fi

# Now use pocolm's wordlist, which is the N most frequent words in
# data/train/text and the LOB+Brown corpora (dev and test excluded), with their
# comprising letters as their transcription. Only include words that use the
# above letters. (Letter # is replaced with <HASH>)

# Assign and export separately so a failure of the substitution is not masked.
letters=$(tr -d '\n' < "$dir/nonsilence_phones.txt")
export letters

# \Q...\E quotes regex metacharacters, so letters such as "]", "^", "-" or "\"
# are matched literally instead of altering the character class.
# chomp (rather than chop) only strips a trailing newline, so a final line
# without one does not lose its last character.
head -n "$vocab_size" data/local/local_lm/data/word_count \
  | awk '{print $2}' \
  | perl -e '
      my $letters = $ENV{letters};
      while (my $w = <>) {
        chomp $w;
        if ($w =~ m/^[\Q$letters\E]+$/) {
          my $trans = join(" ", split(//, $w));
          $trans =~ s/#/<HASH>/g;
          print "$w $trans\n";
        }
      }' \
  | sort -u > "$dir/lexicon.txt"

# Rename the "#" letter in the phone list to match the lexicon transcriptions.
# This must happen after $letters was read, which needs the literal "#".
perl -i -pe 's/#/<HASH>/' "$dir/nonsilence_phones.txt"

# Both special words are transcribed as the silence phone.
{
  echo '<sil> SIL'
  echo '<unk> SIL'
} >> "$dir/lexicon.txt"

echo SIL > "$dir/silence_phones.txt"

echo SIL > "$dir/optional_silence.txt"

# Create (or truncate to) an empty extra_questions.txt.
: > "$dir/extra_questions.txt"