.03_LM.sh
2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/bin/sh
# Stage 1: compile the existing TED ARPA language model into lang_train/G.fst.
#
# Must be run from the directory that holds 00_init_paths.sh, utils/ and lang/,
# because every helper below is addressed with a relative path.
# NOTE(review): arpa2fst, fstprint, fstcompile and fstrmepsilon are called by
# bare name here, so 00_init_paths.sh presumably puts them on PATH - confirm.
. ./00_init_paths.sh

# -p keeps a re-run from failing with "File exists".
mkdir -p lang_train

# ARPA model used as input for this stage.
ted_arpa_lm=/home/besacier/ASR-KALDI/TED-LIUM/IWSLT13/IWSLT13.ASR.train.en.tok.low.lm

# Output of find_arpa_oovs.pl for this model against lang/words.txt
# (presumably the LM words that are missing from the symbol table).
./utils/find_arpa_oovs.pl lang/words.txt < "$ted_arpa_lm" > lang/oovsML.txt

# Drop every line containing one of the sentence-boundary pairs
# "<s> <s>", "</s> <s>" or "</s> </s>", then convert the ARPA text to an FST,
# post-process its text form with the utils/ scripts, recompile it against
# lang/words.txt and remove epsilons.
grep -v -e '<s> <s>' -e '</s> <s>' -e '</s> </s>' "$ted_arpa_lm" \
  | arpa2fst - \
  | fstprint \
  | utils/remove_oovs.pl lang/oovsML.txt \
  | utils/eps2disambig.pl \
  | utils/s2eps.pl \
  | fstcompile --isymbols=lang/words.txt --osymbols=lang/words.txt \
      --keep_isymbols=false --keep_osymbols=false \
  | fstrmepsilon > lang_train/G.fst
# Stage 2: train a 4-gram LM on the TED training text, restrict the large
# WMT11 LM to the recognizer vocabulary, and interpolate the two models.

# Both output directories must exist before anything is written into them.
# Previously LM2 was only created later in the script and LM was never
# created, so on a clean working directory the vocab.txt redirection and the
# -write-lm outputs had nowhere to go.
mkdir -p LM LM2

# Train the 4-gram TED LM (written next to the training text as *.4lm).
/home/Toolkits/Srilm/bin/i686-m64/ngram-count -order 4 \
  -text /home/besacier/ASR-KALDI/TED-LIUM/IWSLT13/IWSLT13.ASR.train.en.tok.low \
  -lm /home/besacier/ASR-KALDI/TED-LIUM/IWSLT13/IWSLT13.ASR.train.en.tok.low.4lm \
  -unk -kndiscount1 -kndiscount2 -kndiscount3 -kndiscount4 \
  -gt1min 1 -gt2min 1 -gt3min 1 -gt4min 1

# Filter out-of-vocabulary words from the big LM.
# The vocabulary is the first column of lang/words.txt, minus every entry
# containing '<', '(' or '{'.
cut -d " " -f 1 lang/words.txt | grep -v '[<({]' > ./LM2/vocab.txt
/home/Toolkits/Srilm/bin/i686-m64/ngram -order 4 \
  -lm /home/lecouteu/DDA_TA/Scripts/MANY/many-read-only/test/lm/WMT11.arpa \
  -vocab ./LM2/vocab.txt -limit-vocab \
  -write-lm ./LM/WMT11_filtered4.arpa
#%PPL=255

# Mix the TED LM with the filtered WMT11 LM using -lambda 0.5.
/home/Toolkits/Srilm/bin/i686-m64/ngram -order 4 \
  -lm /home/besacier/ASR-KALDI/TED-LIUM/IWSLT13/IWSLT13.ASR.train.en.tok.low.4lm \
  -lambda 0.5 -mix-lm ./LM/WMT11_filtered4.arpa \
  -write-lm ./LM/TEDLM4_0.5.arpa
# Stage 3: restrict the interpolated LM to the words seen in the TED training
# text.
# NOTE(review): the original comment said "80k most frequent words" and the
# output file is named "...filtered50k", but no truncation (e.g. head) is
# applied below - every remaining word is kept. Confirm the intended size.

# One token per line, drop tokens containing a digit, then count occurrences
# and sort by descending frequency. The patterns are quoted so the shell cannot
# glob-expand them against files in the current directory.
tr -s " " "\n" \
  < /home/besacier/ASR-KALDI/TED-LIUM/IWSLT13/IWSLT13.ASR.train.en.tok.low \
  | grep -v '[0-9]' \
  | sort \
  | uniq -c \
  | sort -nr > train_ted.vocab

# Drop entries containing , . ? ; or : and keep only the word column
# (column 1 is the count added by uniq -c).
grep -v '[,.?;:]' train_ted.vocab | awk '{print $2}' > train_ted.vocab.final

/home/Toolkits/Srilm/bin/i686-m64/ngram -order 4 \
  -lm ./LM/TEDLM4_0.5.arpa \
  -vocab train_ted.vocab.final -limit-vocab \
  -write-lm ./LM/TEDLM4_0.5_filtered50k.arpa
# Perplexity check, kept disabled:
#/home/Toolkits/Srilm/bin/i686-m64/ngram -order 4 -lm ./LM/TEDLM4_0.5_filtered50k.arpa -ppl /home/besacier/ASR-KALDI/TED-LIUM/KALDI/data/LM/dev2010.txt
#%PPL=220
# Stage 4: convert the filtered, interpolated LM to FST format for Kaldi
# (LM2/G.fst).

# -p: the directory normally exists already (vocab.txt is written into it
# earlier in this script), so a plain mkdir would report an error here.
mkdir -p LM2

# Tool locations used by this stage only.
final_arpa=./LM/TEDLM4_0.5_filtered50k.arpa
openfst_bin=/home/Toolkits/Kaldi/kaldi-trunk/tools/openfst-1.3.2/bin

# Output of find_arpa_oovs.pl for the final LM against lang/words.txt
# (presumably the LM words that are missing from the symbol table).
./utils/find_arpa_oovs.pl lang/words.txt < "$final_arpa" > LM2/oovs.txt

# Drop every line containing "<s> <s>", "</s> <s>" or "</s> </s>", convert the
# ARPA text to an FST, post-process its text form with the utils/ scripts,
# recompile it against lang/words.txt and remove epsilons.
grep -v -e '<s> <s>' -e '</s> <s>' -e '</s> </s>' "$final_arpa" \
  | /home/Toolkits/Kaldi/kaldi-trunk/src/bin/arpa2fst - \
  | "$openfst_bin"/fstprint \
  | utils/remove_oovs.pl LM2/oovs.txt \
  | utils/eps2disambig.pl \
  | utils/s2eps.pl \
  | "$openfst_bin"/fstcompile --isymbols=lang/words.txt \
      --osymbols=lang/words.txt --keep_isymbols=false --keep_osymbols=false \
  | "$openfst_bin"/fstrmepsilon > ./LM2/G.fst