#!/bin/sh
# 03_LM.sh
#
# Language-model preparation step: the visible part of this script builds
# grammar FSTs (G.fst) from ARPA language models using SRILM, Kaldi and
# OpenFst command-line tools.
#
# Must be run from the directory that contains 00_init_paths.sh, utils/ and
# lang/, because every relative path below is resolved against the current
# working directory.

# Pull in the shared environment setup.
# NOTE(review): presumably this puts the Kaldi/OpenFst binaries (arpa2fst,
# fstprint, fstcompile, fstrmepsilon) on PATH -- confirm in 00_init_paths.sh.
. ./00_init_paths.sh


# Build lang_train/G.fst from the IWSLT13 TED training language model (ARPA).
#
# Reads:  $iwslt13_arpa, lang/words.txt
# Writes: lang/oovsML.txt, lang_train/G.fst
iwslt13_arpa=/home/besacier/ASR-KALDI/TED-LIUM/IWSLT13/IWSLT13.ASR.train.en.tok.low.lm

# -p keeps a re-run from failing on the directory left by a previous run.
mkdir -p lang_train

# Collect the LM words that have no entry in lang/words.txt, so they can be
# stripped from the FST below (they could not be mapped to symbol ids).
./utils/find_arpa_oovs.pl lang/words.txt < "$iwslt13_arpa" > lang/oovsML.txt

# ARPA -> G.fst:
#   1. drop n-gram lines containing the sentence-boundary pairs
#      "<s> <s>", "</s> <s>" and "</s> </s>";
#   2. convert to a textual FST and remove arcs using the OOV words;
#   3. rewrite epsilon / sentence-boundary symbols with the utils/ helpers
#      (eps2disambig.pl, s2eps.pl);
#   4. compile against lang/words.txt (symbol tables are not stored in the
#      output) and remove epsilons.
grep -v '<s> <s>' "$iwslt13_arpa" \
  | grep -v '</s> <s>' \
  | grep -v '</s> </s>' \
  | arpa2fst - \
  | fstprint \
  | utils/remove_oovs.pl lang/oovsML.txt \
  | utils/eps2disambig.pl \
  | utils/s2eps.pl \
  | fstcompile --isymbols=lang/words.txt --osymbols=lang/words.txt \
      --keep_isymbols=false --keep_osymbols=false \
  | fstrmepsilon > lang_train/G.fst



# Train a 4-gram LM on the TED training text, interpolate it with a
# vocabulary-filtered WMT11 LM, and restrict the result to the TED vocabulary.
#
# Reads:  $ted_corpus, $wmt11_arpa, lang/words.txt
# Writes: $ted_lm4, LM2/vocab.txt, LM/WMT11_filtered4.arpa,
#         LM/TEDLM4_0.5.arpa, train_ted.vocab, train_ted.vocab.final,
#         LM/TEDLM4_0.5_filtered50k.arpa
srilm_bin=/home/Toolkits/Srilm/bin/i686-m64
ted_corpus=/home/besacier/ASR-KALDI/TED-LIUM/IWSLT13/IWSLT13.ASR.train.en.tok.low
ted_lm4=${ted_corpus}.4lm
wmt11_arpa=/home/lecouteu/DDA_TA/Scripts/MANY/many-read-only/test/lm/WMT11.arpa

# Both output directories are written to below; create them up front so the
# redirections and -write-lm calls do not fail on a fresh checkout.
mkdir -p LM LM2

# Train the in-domain LM: order 4, open vocabulary (-unk), modified
# Kneser-Ney discounting at every order, minimum count 1 for all n-gram orders.
"$srilm_bin/ngram-count" -order 4 -text "$ted_corpus" -lm "$ted_lm4" \
  -unk -kndiscount1 -kndiscount2 -kndiscount3 -kndiscount4 \
  -gt1min 1 -gt2min 1 -gt3min 1 -gt4min 1

# Filter non-vocabulary words out of the big (WMT11) LM.
# The vocabulary is the first column of lang/words.txt minus every entry
# containing '<', '(' or '{'.
cut -d ' ' -f 1 lang/words.txt \
  | grep -v '[<({]' > ./LM2/vocab.txt
"$srilm_bin/ngram" -order 4 -lm "$wmt11_arpa" \
  -vocab ./LM2/vocab.txt -limit-vocab -write-lm ./LM/WMT11_filtered4.arpa

# Interpolate the TED LM with the filtered WMT11 LM (weight 0.5 on the TED LM).
# Original author's note: %PPL=255
"$srilm_bin/ngram" -order 4 -lm "$ted_lm4" -lambda 0.5 \
  -mix-lm ./LM/WMT11_filtered4.arpa -write-lm ./LM/TEDLM4_0.5.arpa

# Filter the final LM on the TED training vocabulary, sorted by frequency.
# NOTE(review): the original comment says "80k most frequent words" and the
# output file says "50k", but no truncation is applied here -- every token
# without a digit or one of , . ? ; : is kept. Confirm the intended cut-off.
# Patterns are quoted so the shell cannot glob-expand them against files in
# the working directory.
tr -s ' ' '\n' < "$ted_corpus" \
  | grep -v '[0-9]' \
  | sort \
  | uniq -c \
  | sort -nr > train_ted.vocab
grep -v '[,.?;:]' train_ted.vocab \
  | awk '{print $2}' > train_ted.vocab.final
"$srilm_bin/ngram" -order 4 -lm ./LM/TEDLM4_0.5.arpa \
  -vocab train_ted.vocab.final -limit-vocab \
  -write-lm ./LM/TEDLM4_0.5_filtered50k.arpa
# Perplexity check on dev2010 (disabled); original author's note: %PPL=220
#/home/Toolkits/Srilm/bin/i686-m64/ngram -order 4 -lm ./LM/TEDLM4_0.5_filtered50k.arpa -ppl /home/besacier/ASR-KALDI/TED-LIUM/KALDI/data/LM/dev2010.txt


# Convert the filtered, interpolated LM to FST format for Kaldi.
#
# Reads:  $final_arpa, lang/words.txt
# Writes: LM2/oovs.txt, LM2/G.fst
final_arpa=./LM/TEDLM4_0.5_filtered50k.arpa
kaldi_bin=/home/Toolkits/Kaldi/kaldi-trunk/src/bin
openfst_bin=/home/Toolkits/Kaldi/kaldi-trunk/tools/openfst-1.3.2/bin

# -p keeps this step from failing when LM2 already exists.
mkdir -p LM2

# Collect the LM words that have no entry in lang/words.txt, so they can be
# stripped from the FST below.
./utils/find_arpa_oovs.pl lang/words.txt < "$final_arpa" > LM2/oovs.txt

# Same ARPA -> G.fst recipe as for lang_train/G.fst, but with explicit paths
# to the Kaldi and OpenFst 1.3.2 binaries: drop the degenerate
# sentence-boundary n-grams, convert to a textual FST, remove OOV arcs,
# rewrite epsilon / sentence-boundary symbols, compile against
# lang/words.txt and remove epsilons.
grep -v '<s> <s>' "$final_arpa" \
  | grep -v '</s> <s>' \
  | grep -v '</s> </s>' \
  | "$kaldi_bin/arpa2fst" - \
  | "$openfst_bin/fstprint" \
  | utils/remove_oovs.pl LM2/oovs.txt \
  | utils/eps2disambig.pl \
  | utils/s2eps.pl \
  | "$openfst_bin/fstcompile" --isymbols=lang/words.txt \
      --osymbols=lang/words.txt --keep_isymbols=false --keep_osymbols=false \
  | "$openfst_bin/fstrmepsilon" > ./LM2/G.fst