Blame view

egs/gp/s5/local/gp_format_lm.sh 3.18 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
  #!/bin/bash -u
  
  # Copyright 2012  Arnab Ghoshal
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  # Stop at the first failing command, and count a pipeline as failed when any
  # one of its stages fails.
  set -e -o pipefail

  # Sets the PATH to contain necessary executables
  source ./path.sh
  
  # Begin configuration section.
  # Defaults for the options listed in the help message below.
  filter_vocab_sri=false    # if true, use SRILM to change the LM vocab
  srilm_opts="-subset -prune-lowprobs -unk -tolower"
  # end configuration sections

  # Usage text. The script name is taken from $0; it is quoted and preceded by
  # '--' so that a path containing whitespace or a leading dash cannot break
  # the basename call. The defaults shown are the values assigned above.
  help_message="Usage: $(basename -- "$0") [options] LM-dir LC [LC ... ]
  where LC is a 2-letter code for GlobalPhone languages, and LM-dir is assumed to 
  contain LMs for all the languages (e.g. RU.3gram.lm.gz for Russian).
  options: 
    --help                           # print this message and exit
    --filter-vocab-sri (true|false)  # use SRILM to change the LM vocab (default: $filter_vocab_sri)
    --srilm-opts STRING              # options to pass to SRILM tools (default: '$srilm_opts')
  ";
  
  # Source the shared option parser. NOTE(review): presumably it maps
  # "--filter-vocab-sri" / "--srilm-opts" onto the configuration variables
  # defined above and handles "--help" via $help_message; the helper is not
  # visible here, so confirm against utils/parse_options.sh.
  . utils/parse_options.sh
  
  # At least two positional arguments are required: the LM directory and one
  # language code. The help text is passed as an argument, never as the printf
  # format string, so a '%' or backslash in it cannot be misinterpreted.
  if [ $# -lt 2 ]; then
    printf '%s\n' "$help_message" >&2
    exit 1
  fi

  # First positional argument: directory holding the per-language LMs.
  LMDIR=$1
  shift

  # Remaining arguments must each be exactly two characters long (the '??'
  # pattern); they are collected into a space-separated list.
  LANGUAGES=
  while [ $# -gt 0 ]; do
    case "$1" in
      ??)
        LANGUAGES="$LANGUAGES $1"
        shift
        ;;
      *)
        # Report the offending argument, show the usage text and fail.
        echo "Unknown argument: $1, exiting" >&2
        printf '%s\n' "$help_message" >&2
        exit 1
        ;;
    esac
  done
  
  # Fall back to the IRSTLM copy under the Kaldi tools directory when IRSTLM is
  # unset or empty. "${IRSTLM:-}" keeps this test from aborting with an
  # "unbound variable" error when the script runs with nounset (the shebang
  # passes -u). KALDI_ROOT is expected to be set already (presumably by
  # path.sh -- verify).
  if [ -z "${IRSTLM:-}" ]; then
    export IRSTLM="$KALDI_ROOT/tools/irstlm/"
  fi
  export PATH="${PATH}:${IRSTLM}/bin"

  # prune-lm is needed later to build the pruned LM; fail early with
  # installation instructions when it cannot be found on PATH.
  if ! command -v prune-lm >/dev/null 2>&1; then
    echo "$0: Error: the IRSTLM is not available or compiled" >&2
    echo "$0: Error: We used to install it by default, but." >&2
    echo "$0: Error: this is no longer the case." >&2
    echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
    echo "$0: Error: and run extras/install_irstlm.sh" >&2
    exit 1
  fi
  
  #######################################
  # Runs the LM formatting utility on one ARPA LM for one language.
  # Globals:   filter_vocab_sri, srilm_opts (read)
  # Arguments: $1 - language code (selects data/<LC>/lang and the lexicon)
  #            $2 - path to the ARPA LM to format
  #            $3 - output directory; "_sri" is appended when SRILM is used
  #######################################
  format_lm_to_lang_dir() {
    local lang_code=$1 arpa_lm=$2 out_dir=$3
    local lang_dir="data/$lang_code/lang"
    local lexicon="data/$lang_code/local/dict/lexicon.txt"
    if [ "$filter_vocab_sri" = true ]; then  # use SRILM to change LM vocab
      # Always forward the user's SRILM options so the full and the pruned LM
      # are processed with the same settings.
      utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
        "$lang_dir" "$arpa_lm" "$lexicon" "${out_dir}_sri"
    else  # just remove out-of-lexicon words without renormalizing the LM
      utils/format_lm.sh "$lang_dir" "$arpa_lm" "$lexicon" "$out_dir"
    fi
  }

  #######################################
  # Formats the trigram LM of one language, then a pruned version of it.
  # Globals:   LMDIR (read)
  # Arguments: $1 - language code
  # Outputs:   data/<LC>/lang_test_tg[_sri], data/<LC>/lang_test_tgpr[_sri]
  #            and data/<LC>/local/lm/<LC>.tgpr.arpa.gz
  # Exits with status 1 when the language's LM file is missing.
  #######################################
  format_language_lms() {
    local lang_code=$1
    local full_lm="$LMDIR/${lang_code}.3gram.lm.gz"
    local pruned_lm="data/$lang_code/local/lm/${lang_code}.tgpr.arpa.gz"

    [ -f "$full_lm" ] || { echo "LM '$full_lm' not found" >&2; exit 1; }
    format_lm_to_lang_dir "$lang_code" "$full_lm" "data/$lang_code/lang_test_tg"

    # Create a pruned version of the LM for building the decoding graphs, using
    # 'prune-lm' from IRSTLM:
    mkdir -p "data/$lang_code/local/lm"
    prune-lm --threshold=1e-7 "$full_lm" /dev/stdout | gzip -c > "$pruned_lm"
    format_lm_to_lang_dir "$lang_code" "$pruned_lm" \
      "data/$lang_code/lang_test_tgpr"
  }

  # LANGUAGES is a space-separated list, so it is deliberately left unquoted
  # here to split it into one word per language code.
  for L in $LANGUAGES; do
    format_language_lms "$L"
  done