Blame view
Scripts/.03_LM.sh
2.81 KB
ec85f8892 first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
#!/bin/sh
#
# Build the language-model FSTs (G.fst) used for Kaldi decoding.
#
# Stages:
#   1. Convert the existing TED ARPA LM (*.tok.low.lm) into lang_train/G.fst.
#   2. Train a 4-gram LM on the TED training text with SRILM ngram-count.
#   3. Restrict the WMT11 LM to the words of lang/words.txt.
#   4. Interpolate the TED 4-gram LM with the restricted WMT11 LM (lambda 0.5).
#   5. Restrict the interpolated LM to the words seen in the TED training text.
#   6. Convert the resulting ARPA LM into LM2/G.fst.
#
# Must be run from the directory that contains 00_init_paths.sh, utils/ and
# lang/words.txt, since all of those are referenced by relative path.

. ./00_init_paths.sh

# Stop at the first failing step so that later stages never consume stale or
# missing intermediate files. Enabled only after the path setup was sourced.
set -e

# Site-specific locations (unchanged from the original hard-coded paths).
ted_dir=/home/besacier/ASR-KALDI/TED-LIUM/IWSLT13
train_text=$ted_dir/IWSLT13.ASR.train.en.tok.low
srilm_bin=/home/Toolkits/Srilm/bin/i686-m64
kaldi_trunk=/home/Toolkits/Kaldi/kaldi-trunk
openfst_bin=$kaldi_trunk/tools/openfst-1.3.2/bin
wmt_lm=/home/lecouteu/DDA_TA/Scripts/MANY/many-read-only/test/lm/WMT11.arpa

# Create every output directory up front. The original created LM2 only
# after ./LM2/vocab.txt had already been written and never created ./LM.
mkdir -p lang_train LM LM2

#######################################
# Convert an ARPA language model into a G.fst over lang/words.txt.
# Arguments:
#   $1 - input ARPA file
#   $2 - output file receiving the list written by utils/find_arpa_oovs.pl
#   $3 - output G.fst path
#   $4 - arpa2fst executable (default: arpa2fst looked up in PATH)
#   $5 - directory prefix for the OpenFst tools, including the trailing
#        slash (default: empty, i.e. tools are looked up in PATH)
#######################################
arpa_to_g_fst() {
  _arpa=$1
  _oovs=$2
  _gfst=$3
  _arpa2fst=${4:-arpa2fst}
  _fstbin=${5:-}

  ./utils/find_arpa_oovs.pl lang/words.txt < "$_arpa" > "$_oovs"

  # Lines containing the sentence-boundary sequences "<s> <s>", "</s> <s>"
  # and "</s> </s>" are dropped before the LM is handed to arpa2fst.
  grep -v '<s> <s>' "$_arpa" \
    | grep -v '</s> <s>' \
    | grep -v '</s> </s>' \
    | "$_arpa2fst" - \
    | "${_fstbin}fstprint" \
    | utils/remove_oovs.pl "$_oovs" \
    | utils/eps2disambig.pl \
    | utils/s2eps.pl \
    | "${_fstbin}fstcompile" \
        --isymbols=lang/words.txt --osymbols=lang/words.txt \
        --keep_isymbols=false --keep_osymbols=false \
    | "${_fstbin}fstrmepsilon" > "$_gfst"
}

# 1. G.fst from the already existing TED LM.
arpa_to_g_fst "$train_text.lm" lang/oovsML.txt lang_train/G.fst

# 2. Train LM: 4-gram, -unk, Kneser-Ney discounting requested for orders 1-4,
#    and a minimum count of 1 for every order (-gtNmin 1).
"$srilm_bin/ngram-count" -order 4 \
  -text "$train_text" \
  -lm "$train_text.4lm" \
  -unk \
  -kndiscount1 -kndiscount2 -kndiscount3 -kndiscount4 \
  -gt1min 1 -gt2min 1 -gt3min 1 -gt4min 1

# 3. Filter non-vocabulary words out of the big (WMT11) LM. The vocabulary is
#    the first column of lang/words.txt minus entries containing '<', '('
#    or '{'.
cut -d ' ' -f 1 lang/words.txt | grep -v '[<({]' > ./LM2/vocab.txt
"$srilm_bin/ngram" -order 4 \
  -lm "$wmt_lm" \
  -vocab ./LM2/vocab.txt -limit-vocab \
  -write-lm ./LM/WMT11_filtered4.arpa
# %PPL=255 (perplexity note carried over from the original script; the
# command that produced it is not recorded)

# 4. Interpolate the TED 4-gram LM with the filtered WMT11 LM.
"$srilm_bin/ngram" -order 4 \
  -lm "$train_text.4lm" \
  -lambda 0.5 -mix-lm ./LM/WMT11_filtered4.arpa \
  -write-lm ./LM/TEDLM4_0.5.arpa

# 5. Filter the final LM based on the words of the TED training text.
#    The text is split into one token per line before counting; the original
#    `tr -s " " " "` only squeezed spaces, so whole sentences were counted and
#    only the first word of each sentence reached the vocabulary.
#    Tokens containing digits are dropped, and so are tokens containing any
#    of , . ? ; :
#    NOTE(review): the original comment mentions the "80k most frequent
#    words" and the output name says "50k", but no truncation is applied:
#    the complete frequency-sorted list is used as the vocabulary.
tr -s ' ' '\n' < "$train_text" \
  | grep -v '[0-9]' \
  | sort \
  | uniq -c \
  | sort -nr > train_ted.vocab
grep -v '[,.?;:]' train_ted.vocab \
  | awk 'NF >= 2 {print $2}' > train_ted.vocab.final
"$srilm_bin/ngram" -order 4 \
  -lm ./LM/TEDLM4_0.5.arpa \
  -vocab train_ted.vocab.final -limit-vocab \
  -write-lm ./LM/TEDLM4_0.5_filtered50k.arpa
#/home/Toolkits/Srilm/bin/i686-m64/ngram -order 4 -lm ./LM/TEDLM4_0.5_filtered50k.arpa -ppl /home/besacier/ASR-KALDI/TED-LIUM/KALDI/data/LM/dev2010.txt
# %PPL=220

# 6. Convert to FST format for Kaldi, using the Kaldi/OpenFst binaries by
#    absolute path instead of whatever PATH provides.
arpa_to_g_fst ./LM/TEDLM4_0.5_filtered50k.arpa LM2/oovs.txt ./LM2/G.fst \
  "$kaldi_trunk/src/bin/arpa2fst" "$openfst_bin/"