Blame view

Scripts/.03_LM.sh 2.81 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
  #!/bin/sh
  
  . ./00_init_paths.sh
  
  
  # Build lang_train/G.fst from the pre-existing IWSLT13 ARPA language model.
  # `mkdir -p` keeps the step re-runnable when lang_train already exists.
  mkdir -p lang_train
  iwslt13_arpa=/home/besacier/ASR-KALDI/TED-LIUM/IWSLT13/IWSLT13.ASR.train.en.tok.low.lm

  # Compare the ARPA file against lang/words.txt; the resulting list is
  # consumed by remove_oovs.pl below.
  ./utils/find_arpa_oovs.pl lang/words.txt < "$iwslt13_arpa" > lang/oovsML.txt

  # Drop every ARPA line containing '<s> <s>', '</s> <s>' or '</s> </s>',
  # convert to a text FST, post-process it with the utils/ helpers, then
  # compile it against lang/words.txt and remove epsilons.
  # NOTE(review): arpa2fst/fstprint/fstcompile/fstrmepsilon are resolved via
  # PATH here (presumably set by 00_init_paths.sh -- confirm).
  grep -v -e '<s> <s>' -e '</s> <s>' -e '</s> </s>' "$iwslt13_arpa" |
    arpa2fst - |
    fstprint |
    utils/remove_oovs.pl lang/oovsML.txt |
    utils/eps2disambig.pl |
    utils/s2eps.pl |
    fstcompile --isymbols=lang/words.txt --osymbols=lang/words.txt \
      --keep_isymbols=false --keep_osymbols=false |
    fstrmepsilon > lang_train/G.fst
  
  
  
  # Train a 4-gram LM on the TED/IWSLT13 training text, restrict the large
  # WMT11 LM to the recognizer vocabulary, and interpolate the two.
  srilm_bin_dir=/home/Toolkits/Srilm/bin/i686-m64
  ted_corpus=/home/besacier/ASR-KALDI/TED-LIUM/IWSLT13/IWSLT13.ASR.train.en.tok.low

  # Both output directories are written to below; create them first.
  # (Previously LM2 was only created at the very end of the script and LM was
  # never created, so these steps failed on a fresh working directory.)
  mkdir -p LM LM2

  # Train LM: order 4, with -unk and -kndiscount1..4, minimum count 1 for
  # every n-gram order.
  "$srilm_bin_dir"/ngram-count -order 4 -text "$ted_corpus" \
    -lm "${ted_corpus}.4lm" -unk \
    -kndiscount1 -kndiscount2 -kndiscount3 -kndiscount4 \
    -gt1min 1 -gt2min 1 -gt3min 1 -gt4min 1

  # Filter non-vocabulary words out of the big LM: take the first column of
  # lang/words.txt, skipping any entry that contains '<', '(' or '{'.
  cut -d ' ' -f 1 lang/words.txt | grep -v '[<({]' > ./LM2/vocab.txt
  "$srilm_bin_dir"/ngram -order 4 \
    -lm /home/lecouteu/DDA_TA/Scripts/MANY/many-read-only/test/lm/WMT11.arpa \
    -vocab ./LM2/vocab.txt -limit-vocab \
    -write-lm ./LM/WMT11_filtered4.arpa

  # Interpolate the TED LM with the filtered WMT11 LM (lambda = 0.5).
  # Original author's note: %PPL=255 (the evaluation set is not recorded here).
  "$srilm_bin_dir"/ngram -order 4 -lm "${ted_corpus}.4lm" -lambda 0.5 \
    -mix-lm ./LM/WMT11_filtered4.arpa -write-lm ./LM/TEDLM4_0.5.arpa
  
  # Filter the interpolated LM down to the vocabulary of the TED training text.
  # NOTE(review): the original comment said "80k most frequent words" and the
  # output file is named "...filtered50k", but no cut-off (e.g. head -n) is
  # applied: every surviving word of the training text is kept.
  ted_train_text=/home/besacier/ASR-KALDI/TED-LIUM/IWSLT13/IWSLT13.ASR.train.en.tok.low

  # One token per line, drop tokens containing a digit, then count occurrences
  # and sort by decreasing frequency. The grep patterns are quoted so the shell
  # cannot glob-expand them against files in the working directory.
  tr -s ' ' '\n' < "$ted_train_text" |
    grep -v '[0-9]' |
    sort | uniq -c | sort -nr > train_ted.vocab

  # Drop entries containing , . ? ; or : and keep only the word column
  # (column 1 is the count written by `uniq -c`).
  grep -v '[,.?;:]' train_ted.vocab | awk '{print $2}' > train_ted.vocab.final

  /home/Toolkits/Srilm/bin/i686-m64/ngram -order 4 -lm ./LM/TEDLM4_0.5.arpa \
    -vocab train_ted.vocab.final -limit-vocab \
    -write-lm ./LM/TEDLM4_0.5_filtered50k.arpa

  # Perplexity check on dev2010 (kept disabled); original author's result: %PPL=220
  #/home/Toolkits/Srilm/bin/i686-m64/ngram -order 4 -lm ./LM/TEDLM4_0.5_filtered50k.arpa -ppl /home/besacier/ASR-KALDI/TED-LIUM/KALDI/data/LM/dev2010.txt
  
  
  # Convert the filtered ARPA LM to FST format for Kaldi (./LM2/G.fst).
  # `mkdir -p` so an already existing LM2 directory is not reported as an error.
  mkdir -p LM2
  kaldi_trunk=/home/Toolkits/Kaldi/kaldi-trunk
  openfst_bin_dir=$kaldi_trunk/tools/openfst-1.3.2/bin
  filtered_arpa=./LM/TEDLM4_0.5_filtered50k.arpa

  # Compare the ARPA file against lang/words.txt; the resulting list is
  # consumed by remove_oovs.pl below.
  ./utils/find_arpa_oovs.pl lang/words.txt < "$filtered_arpa" > LM2/oovs.txt

  # Drop every ARPA line containing '<s> <s>', '</s> <s>' or '</s> </s>',
  # convert to a text FST, post-process it with the utils/ helpers, then
  # compile it against lang/words.txt and remove epsilons.
  grep -v -e '<s> <s>' -e '</s> <s>' -e '</s> </s>' "$filtered_arpa" |
    "$kaldi_trunk"/src/bin/arpa2fst - |
    "$openfst_bin_dir"/fstprint |
    utils/remove_oovs.pl LM2/oovs.txt |
    utils/eps2disambig.pl |
    utils/s2eps.pl |
    "$openfst_bin_dir"/fstcompile --isymbols=lang/words.txt \
      --osymbols=lang/words.txt --keep_isymbols=false --keep_osymbols=false |
    "$openfst_bin_dir"/fstrmepsilon > ./LM2/G.fst