Blame view
egs/aspire/s5/local/fisher_train_lms.sh
4.06 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
#!/bin/bash

# Trains 3-gram and 4-gram "mincount" language models with the kaldi_lm
# toolkit from the training transcripts, after mapping every word that is
# not in the lexicon to <unk>.
#
# To be run from one directory above this script.
#
# This script takes no arguments.  It assumes you have already run
# fisher_data_prep.sh and fisher_prepare_dict.sh
# It takes as input the files
#   data/train_all/text
#   data/local/dict/lexicon.txt
# and writes everything under data/local/lm.

text=data/train_all/text
lexicon=data/local/dict/lexicon.txt

# Fail early if an input is missing.  The test must use the loop variable
# "$f"; testing an unset variable makes "[ ! -f ]" always false, so the
# check would silently never fire.
for f in "$text" "$lexicon"; do
  if [ ! -f "$f" ]; then
    echo "$0: No such file $f"
    exit 1
  fi
done

dir=data/local/lm
mkdir -p "$dir"
export LC_ALL=C # You'll get errors about things being not sorted, if you
                # have a different locale.
export PATH=$PATH:$(pwd)/../../../tools/kaldi_lm

( # First make sure the kaldi_lm toolkit is installed.
  # Runs in a subshell so the directory changes do not leak into the
  # rest of the script.
  cd ../../../tools || exit 1
  if [ -d kaldi_lm ]; then
    echo Not installing the kaldi_lm toolkit since it is already there.
  else
    echo Downloading and installing the kaldi_lm tools
    if [ ! -f kaldi_lm.tar.gz ]; then
      wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1
    fi
    tar -xvzf kaldi_lm.tar.gz || exit 1
    cd kaldi_lm || exit 1
    make || exit 1
    echo Done making the kaldi_lm tools
  fi
) || exit 1

cleantext=$dir/text.no_oov

# Replace every token that is not the first field of some lexicon line with
# <unk>.  Each input line must stay one output line (hence the "\n"): the
# steps below skip field 1 of every line and count sentences by line.
awk -v lex="$lexicon" 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<unk> ");} } printf("\n");}' \
  < "$text" > "$cleantext" || exit 1

# Word counts over the cleaned transcripts (field 1 of each line is skipped).
awk '{for(n=2;n<=NF;n++) print $n; }' < "$cleantext" | sort | uniq -c | \
  sort -nr > "$dir/word.counts" || exit 1

# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
awk '{for(n=2;n<=NF;n++) print $n; }' < "$cleantext" | \
  cat - <(grep -w -v '!SIL' "$lexicon" | awk '{print $1}') | \
  sort | uniq -c | sort -nr > "$dir/unigram.counts" || exit 1

# note: we probably won't really make use of <unk> as there aren't any OOVs
awk '{print $2}' < "$dir/unigram.counts" | \
  get_word_map.pl "<s>" "</s>" "<unk>" > "$dir/word_map" \
  || exit 1

# note: ignore 1st field of train.txt, it's the utterance-id.
awk -v wmap="$dir/word_map" 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' \
  < "$cleantext" | gzip -c > "$dir/train.gz" \
  || exit 1

train_lm.sh --arpa --lmtype 3gram-mincount "$dir" || exit 1
train_lm.sh --arpa --lmtype 4gram-mincount "$dir" || exit 1

# Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332
# note: output is
# data/local/lm/3gram-mincount/lm_unpruned.gz

exit 0


# From here is some commands to do a baseline with SRILM (assuming
# you have it installed).  Nothing below is executed because of the
# "exit 0" above; it is kept as a reference recipe.
heldout_sent=10000 # Don't change this if you want result to be comparable with
                   # kaldi_lm results
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p "$sdir"
awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' < "$cleantext" | \
  head -$heldout_sent > "$sdir/heldout"
awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' < "$cleantext" | \
  tail -n +$heldout_sent > "$sdir/train"

cat "$dir/word_map" | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > "$sdir/wordlist"

ngram-count -text "$sdir/train" -order 3 -limit-vocab -vocab "$sdir/wordlist" -unk \
  -map-unk "<unk>" -kndiscount -interpolate -lm "$sdir/srilm.o3g.kn.gz"
ngram -lm "$sdir/srilm.o3g.kn.gz" -ppl "$sdir/heldout"
# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for <unk> in closed-vocabulary LM
# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs
# 0 zeroprobs, logprob= -165170 ppl= 71.7609 ppl1= 123.258

# Note: perplexity SRILM gives to Kaldi-LM model is similar to what kaldi-lm reports above.
# Difference in WSJ must have been due to different treatment of <unk>.
ngram -lm "$dir/3gram-mincount/lm_unpruned.gz" -ppl "$sdir/heldout"
# data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for <unk> in closed-vocabulary LM
# file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs
# 0 zeroprobs, logprob= -164990 ppl= 71.4278 ppl1= 122.614