Blame view

egs/gale_arabic/s5c/local/prepare_lm_subword.sh 1.41 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
  #!/bin/bash
  
  # Copyright 2012  Vassil Panayotov
  #           2017  Ewald Enzinger
  #           2019  Dongji Gao
  # Apache 2.0
  
  # Load the recipe environment; abort if it cannot be sourced.
  . ./path.sh || exit 1

  echo "=== Building a language model ..."

  # Defaults. They are set before utils/parse_options.sh is sourced,
  # presumably so that it can override them from command-line options
  # (parse_options.sh is not shown here -- confirm).
  dir=data/local/lm/
  text=data/train/text
  lexicon=data/local/dict/lexicon.txt
  # Language model order
  order=6

  . utils/parse_options.sh || exit 1

  # Validate the inputs before creating any output, and report problems on
  # stderr so they are not lost when stdout is redirected.
  for f in "$text" "$lexicon"; do
    if [ ! -f "$f" ]; then
      echo "$0: No such file $f" >&2
      exit 1
    fi
  done

  # Output directory for the LM training corpus and the model.
  mkdir -p "$dir" || exit 1
  
  # Make sure SRILM's ngram-count is reachable. If it is not on PATH, fall back
  # to the copy under $KALDI_ROOT/tools/srilm and append that directory to PATH;
  # exit with status 1 if it cannot be found there either.
  if ! command -v ngram-count >/dev/null 2>&1; then
    # Decide between the 64-bit and 32-bit SRILM directories from the machine
    # hardware name only. Matching "64" against the whole of `uname -a` can hit
    # the hostname or the kernel release and misdetect a 32-bit machine.
    if uname -m | grep -q 64; then # some kind of 64 bit...
      sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
    else
      sdir=$KALDI_ROOT/tools/srilm/bin/i686
    fi
    if [ -f "$sdir/ngram-count" ]; then
      echo "Using SRILM tools from $sdir"
      export PATH="$PATH:$sdir"
    else
      {
        echo "You appear to not have SRILM tools installed, either on your path,"
        echo "or installed in $sdir. See tools/install_srilm.sh for installation"
        echo "instructions."
      } >&2
      exit 1
    fi
  fi
  
  # LM training corpus: drop the first space-separated field of every
  # transcript line (presumably the utterance ID -- confirm) and keep the rest.
  # Read "$text" rather than a hard-coded path, so the file that was checked
  # for existence earlier in this script is the one actually used.
  cut -d ' ' -f 2- "$text" > "$dir/text.txt" || exit 1

  # LM vocabulary: the first column of the lexicon.
  cut -d ' ' -f 1 "$lexicon" > "$dir/wordlist" || exit 1

  # Train the n-gram model with SRILM. The per-order discount flags are spelled
  # out for orders 1-6 only (Witten-Bell for unigrams, Kneser-Ney for 2-6, per
  # the SRILM option names), which matches the default order of 6.
  # Abort on failure so a broken or missing lm.gz is never reported as success.
  ngram-count -text "$dir/text.txt" -order "$order" -vocab "$dir/wordlist" \
    -unk -map-unk "<UNK>" -wbdiscount1 -kndiscount2 -kndiscount3 -kndiscount4 \
    -kndiscount5 -kndiscount6 -interpolate -lm "$dir/lm.gz" \
    || { echo "$0: ngram-count failed" >&2; exit 1; }

  # Report perplexity on data/test/text. The LM itself does not depend on this
  # file, so when it is missing the evaluation is skipped with a warning
  # instead of silently scoring an empty file.
  dev_text=data/test/text
  if [ -f "$dev_text" ]; then
    cut -d ' ' -f 2- "$dev_text" > "$dir/dev.txt" || exit 1
    ngram -order "$order" -lm "$dir/lm.gz" -ppl "$dir/dev.txt" \
      || { echo "$0: ngram -ppl failed" >&2; exit 1; }
  else
    echo "$0: warning: $dev_text not found, skipping perplexity evaluation" >&2
  fi
  echo "*** Finished building the LM model!"