Blame view
egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh
4.8 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
#!/usr/bin/env bash
# Copyright 2014 Gaurav Kumar. Apache 2.0
#
# Prepares a Kaldi-format dictionary directory (data/local/dict) for the
# Fisher/CALLHOME Spanish recipe: collects the unique words of the training
# text, merges them with a large downloaded Spanish word list, generates
# pronunciations via the LDC lexicon's spron.pl, and derives the phone sets.
#
# Usage: fsp_prepare_dict.sh <lexicon-dir>
#   <lexicon-dir> must contain callhome_spanish_lexicon_970908/.

. ./path.sh

# First get the list of unique words from our text file
if [ $# -lt 1 ]; then
  echo 'Usage fsp_prepare_dict.sh lexicon'
  exit 1;
fi

stage=0

dir=$(pwd)/data/local/dict
datadir=$(pwd)/data/local/data/train_all
mkdir -p $dir
local=$(pwd)/local
utils=$(pwd)/utils
tmpdir=$(pwd)/data/local/tmp
lexicon=$1

# Get all unique words, remove punctuation.
if [ $stage -le 0 ]; then
  # Drop tokens beginning with two digits, strip . , ? then split on spaces:
  # the tr target must be a newline so each word lands on its own line
  # before sort | uniq (was a no-op space-to-space translation).
  cat $datadir/text | sed 's:[0-9][0-9]\S*::g' | sed 's:[\.,\?]::g' | \
    tr " " "\n" | sort | uniq | awk '{if (NF > 0){ print; }}' > $tmpdir/uniquewords

  if [ ! -f "${tmpdir}/es_wordlist.json" ]; then
    echo "Could not find the large collection of Spanish words es_wordlist.json"
    echo "Trying to download it via wget"

    # Portable "is wget installed?" check (POSIX command -v, not `which`).
    if ! command -v wget >/dev/null 2>&1; then
      echo "This script requires you to first install wget"
      exit 1;
    fi

    cwd=$(pwd)
    cd $tmpdir
    wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz

    if [ ! -e ${tmpdir}/es_wordlist.json.tgz ]; then
      echo "Download of the large Spanish word list failed"
      exit 1;
    fi

    tar -xovzf es_wordlist.json.tgz || exit 1;
    cd $cwd
  fi

  # Merge with gigaword corpus
  $local/merge_lexicons.py ${tmpdir} ${lexicon}
  mv $tmpdir/uniquewords $tmpdir/uniquewords.small
  mv $tmpdir/uniquewords64k $tmpdir/uniquewords
fi

# Then get the list of phones from basic_rules in the lexicon folder
if [ $stage -le 1 ]; then
  if [ ! -d "$lexicon/callhome_spanish_lexicon_970908" ]; then
    echo "Could not find folder callhome_spanish_lexicon_970908 in the lexicon folder"
    exit 1;
  fi

  # This is a preliminary attempt to get the unique phones from the LDC lexicon
  # This will be extended based on our lexicon later
  perl $local/find_unique_phones.pl $lexicon/callhome_spanish_lexicon_970908 $tmpdir
fi

# Get pronunciation for each word using the spron.pl file in the lexicon folder
if [ $stage -le 2 ]; then
  #cd $lexicon/callhome_spanish_lexicon_970908
  # Replace all words for which no pronunciation was generated with an orthographic
  # representation
  cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \
    | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \
    | awk -F '[/][/]' '{print $1}' \
    > $tmpdir/lexicon_raw
fi

# Break the pronunciation down according to the format required by Kaldi
if [ $stage -le 3 ]; then
  # Creates a KALDI compatible lexicon, and extends the phone list
  perl $local/isolate_phones.pl $tmpdir
  cat $tmpdir/phones_extended | sort | awk '{if ($1 != "") {print;}}' > $tmpdir/phones_extended.1
  mv $tmpdir/phones $tmpdir/phones.small
  mv $tmpdir/phones_extended.1 $tmpdir/phones
  sort $tmpdir/phones -o $tmpdir/phones
  # Words with no pronunciation come out of spron.pl as '#...' lines;
  # map them to the generic 'oov' pron.
  paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | sed -r 's:(\S+)\s#.*:\1 oov:g' > $tmpdir/lexicon.1
  #paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | grep -v '#' > $tmpdir/lexicon.1
fi

if [ $stage -le 4 ]; then
  # silence phones, one per line.
  for w in sil laughter noise oov; do echo $w; done > $dir/silence_phones.txt
  echo sil > $dir/optional_silence.txt

  # An extra question will be added by including the silence phones in one class.
  # The END block must emit a newline (was a bare space) so the file is
  # newline-terminated.
  cat $dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > \
    $dir/extra_questions.txt || exit 1;

  # Remove [] chars from phones
  cat $tmpdir/phones | awk '{if ($1 != "_" && $1 != "[" && $1 != "]") {print;}}' > $tmpdir/phones.1
  rm $tmpdir/phones
  mv $tmpdir/phones.1 $tmpdir/phones
  cp $tmpdir/phones $dir/nonsilence_phones.txt

  if [ -f $tmpdir/lexicon.2 ]; then rm $tmpdir/lexicon.2; fi
  cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2"

  # Add prons for laughter, noise, oov.
  # silence_phones.txt is one phone per line, so translate *newlines* to '|'
  # (was a no-op space translation); ${w%?} strips the trailing '|' to build
  # the alternation regex laughter|noise|oov.
  w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|')
  perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.2

  for w in $(grep -v sil $dir/silence_phones.txt); do
    echo "[$w] $w"
  done | cat - $tmpdir/lexicon.2 > $tmpdir/lexicon.3 || exit 1;

  cat $tmpdir/lexicon.3 \
    <( echo "mm m"
       echo "<unk> oov" ) > $tmpdir/lexicon.4

  # From the lexicon remove _ from the phonetic representation
  cat $tmpdir/lexicon.4 | sed 's:\s_::g' > $tmpdir/lexicon.5

  cp "$tmpdir/lexicon.5" $dir/lexicon.txt

  # Count word occurrences in the training text ...
  cat $datadir/text | \
    awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \
    sort -nr > $tmpdir/word_counts

  # ... and report the counts of words that are NOT covered by the lexicon.
  awk '{print $1}' $dir/lexicon.txt | \
    perl -e '($word_counts)=@ARGV;
      open(W, "<$word_counts")||die "opening word-counts $word_counts";
      while(<STDIN>) { chop; $seen{$_}=1; }
      while(<W>) {
        ($c,$w) = split;
        if (!defined $seen{$w}) { print; }
      } ' $tmpdir/word_counts > $tmpdir/oov_counts.txt

  echo "*Highest-count OOVs are:"
  head -n 20 $tmpdir/oov_counts.txt
fi

$utils/validate_dict_dir.pl $dir
exit 0;