Blame view

egs/spanish_dimex100/s5/local/lm_prep.sh 1.42 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
  #!/bin/bash
  
  ## Install SRILM in the `tools` directory (install_srilm.sh)
  
  ## Only run this file from the example root directory
  ##      $ ./local/lm_prep.sh
  
  # Create the scratch directories used by the steps below.
  mkdir -p "data/local/tmp" "data/lang/tmp"

  # Load ./path.sh from the current directory (its contents are not shown
  # here; presumably it sets up PATH for the tools used below).
  source ./path.sh

  # Locate SRILM's ngram-count under the shared tools directory, preferring
  # the i686-m64 build over the i686 one. The binary itself is checked (not
  # just its directory) so that a partial install is reported here instead
  # of failing later when the command is actually run.
  srilm_bin_root="../../../tools/srilm/bin"
  ngram_count_exe=""
  for srilm_arch in i686-m64 i686; do
      srilm_candidate="$srilm_bin_root/$srilm_arch/ngram-count"
      if [ -f "$srilm_candidate" ] && [ -x "$srilm_candidate" ]; then
          ngram_count_exe="$srilm_candidate"
          break
      fi
  done

  if [ -z "$ngram_count_exe" ]; then
      # Diagnostics go to stderr so they are not mixed into captured stdout.
      echo >&2
      echo "[!] Install SRILM in the 'tools' directory (install_srilm.sh)" >&2
      echo >&2
      exit 1
  fi
  
  
  ########################
  # data/local/tmp/lm_text
  ########################
  
  # Text sentences input for language model generation:
  # data/[train|test]/text with the leading utterance-ID field removed.
  # `cut -f2-` keeps every space-separated field after the first; it is the
  # POSIX spelling of GNU cut's `-f1 --complement`, so this also works with
  # cut implementations that lack the GNU extension.
  
  cat data/train/text data/test/text | cut -d' ' -f2- > data/local/tmp/lm_text
  
  
  ####################################
  # data/local/tmp/3gram_lm.arpa.kn.gz
  ####################################
  
  # Train a 3-gram LM on data/local/tmp/lm_text with SRILM's ngram-count,
  # writing the model to 3gram_lm.arpa.kn.gz and the vocabulary to
  # vocab-full.txt. Out-of-vocabulary words are mapped to "<UNK>".
  #
  # NOTE(review): the output name contains "kn", but the discounting that
  # is actually enabled is -wbdiscount; the -kndiscount options below were
  # left commented out in the original script:
  #     -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 \
  #     -kndiscount3 -gt3min 3 -order 3 \
  #
  # The script stops here on failure so that the G.fst step never runs
  # against a missing or stale LM file.
  "$ngram_count_exe" -lm data/local/tmp/3gram_lm.arpa.kn.gz \
      -order 3 \
      -write-vocab data/local/tmp/vocab-full.txt \
      -sort \
      -wbdiscount \
      -unk \
      -map-unk "<UNK>" \
      -text data/local/tmp/lm_text \
      || { echo "[!] ngram-count failed; language model was not built" >&2; exit 1; }
  
  
  #################
  # data/lang/G.fst
  #################
  
  # Hand the ARPA LM and the lexicon to utils/format_lm.sh to produce G.fst.
  # data/lang is passed as both the first and the last argument.
  # NOTE(review): presumably these are the input and output lang dirs, so
  # the helper updates data/lang in place - confirm against format_lm.sh.
  lm_lang_dir=data/lang
  lm_arpa_gz=data/local/tmp/3gram_lm.arpa.kn.gz
  lm_lexicon=data/local/dict/lexicon.txt

  utils/format_lm.sh "$lm_lang_dir" "$lm_arpa_gz" "$lm_lexicon" "$lm_lang_dir"