egs/wsj/s5/local/wsj_extend_char_dict.sh
3.16 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
#!/bin/bash

# Copyright 2017 Hossein Hadian

# This script extends the word list by including OOVs from the training
# transcripts. Since no phonemes are involved, we need no G2P models/rules.
# In other words, this script is like wsj_extend_dict.sh except that it deals
# with characters (i.e. graphemes) instead of phonemes, so it is much simpler.
# Parts of this script are taken from EESEN (https://github.com/srvk/eesen).

if [ $# -ne 3 ]; then
  echo "usage: $0 <wsj-corpus-dir> <dict-src-dir> <dict-larger-dir>"
  echo "e.g.: $0 WSJ/13-32.1/ data/local/lang_char data/local/lang_char_larger"
  exit 1;
fi

if [ "`basename $1`" != 13-32.1 ]; then
  echo "Expecting the first argument to this script to end in 13-32.1"
  exit 1
fi

corpusdir=$1
srcdir=$2
dir=$3
mincount=2  # Minimum count an OOV must have to be included in the lexicon.

mkdir -p $dir
cp $srcdir/lexicon.txt $dir/lexicon.ori.txt
cp $srcdir/nonsilence_phones.txt $dir
cp $srcdir/silence_phones.txt $dir
cp $srcdir/optional_silence.txt $dir

# The original wordlist.
cat $dir/lexicon.ori.txt | awk '{print $1}' | sort | uniq > $dir/wordlist.ori

# Get the training transcripts.
echo "Getting the training transcripts, may take some time ..."
touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
  # The fully cleaned file is about 73 MB; if it is already there, skip the
  # slow cleaning step below.
  echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
  # Drop SGML markup lines, uppercase the LM training text, and split
  # sentence-final periods off nonwords, treating them as sentence breaks.
  gunzip -c $corpusdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
    | awk '/^</{next}{print toupper($0)}' | perl -e '
    open(F, "<$ARGV[0]") || die;
    while(<F>) { chop; $isword{$_} = 1; }
    while(<STDIN>) {
      @A = split(" ", $_);
      for ($n = 0; $n < @A; $n++) {
        $a = $A[$n];
        if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) {
          # Nonwords that end in "." and have no other "." in them:
          # treat the "." as a period (sentence boundary).
          print "$a";
          if ($n+1 < @A) { print "\n"; }
        } else {
          print "$a ";
        }
      }
      print "\n";
    } ' $dir/wordlist.ori | gzip -c > $dir/cleaned.gz
fi

# Get unigram counts and the counts of the OOV words.
echo "Getting unigram counts"
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
  awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | \
  sort -nr > $dir/unigrams

cat $dir/unigrams | awk -v dict=$dir/wordlist.ori \
  'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
  > $dir/oov.counts

echo "Most frequent unseen unigrams are: "
head $dir/oov.counts

# Select the OOVs whose counts are >= $mincount and include them in the
# lexicon, spelling each one out as a sequence of its characters.
cat $dir/oov.counts | awk -v thresh=$mincount \
  '{if ($1 >= thresh) { print $2; }}' > $dir/oovlist
cat $dir/oovlist | perl -e 'while(<>){ chop; $str="$_";
  foreach $p (split("", $_)) { $str="$str $p"; } print "$str\n"; }' \
  > $dir/lexicon.oov.txt

# Filter out OOV words that contain characters not among the non-silence
# characters.
cat $dir/lexicon.oov.txt | awk -v dict=$dir/nonsilence_phones.txt \
  'BEGIN{while(getline<dict) seen[$1]=1;}
   {for(i=2;i<=NF;i++) {if(!seen[$i]){break;}}; if (i==(NF+1)){print;}}' \
  > $dir/lexicon.oov.filt.txt

# The final expanded lexicon.
cat $dir/lexicon.ori.txt $dir/lexicon.oov.filt.txt > $dir/lexicon.txt

echo "Number of OOVs we handled is `cat $dir/lexicon.oov.filt.txt | wc -l`"
echo "Created the larger lexicon $dir/lexicon.txt"
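
# Illustration (not part of the original recipe): each line of lexicon.oov.txt
# pairs an OOV word with its spelled-out character sequence, so a hypothetical
# OOV such as "NETSCAPE" would yield the entry:
#   NETSCAPE N E T S C A P E
# After running the script, the surviving entries can be inspected with e.g.:
#   head data/local/lang_char_larger/lexicon.oov.filt.txt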
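
# Cross-check sketch (assumes bash and single-byte characters, as in the
# uppercase-ASCII WSJ text): the character-spelling perl step above is
# equivalent to spacing out each word with sed and pasting it next to itself:
#   paste -d' ' $dir/oovlist <(sed 's/\(.\)/\1 /g; s/ $//' $dir/oovlist)
# which produces the same "WORD W O R D" lines as lexicon.oov.txt.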