Blame view
egs/gp/s5/local/gp_format_lm.sh
3.18 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
#!/bin/bash

# Copyright 2012  Arnab Ghoshal

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# For every requested GlobalPhone language code LC this script:
#   1. formats LM-dir/LC.3gram.lm.gz into data/LC/lang_test_tg[_sri];
#   2. prunes that LM with IRSTLM's 'prune-lm' into
#      data/LC/local/lm/LC.tgpr.arpa.gz;
#   3. formats the pruned LM into data/LC/lang_test_tgpr[_sri].
# The "_sri" suffix is used when --filter-vocab-sri is true.
# Must be run from the recipe directory (it sources ./path.sh and uses utils/).
# Exits with status 1 on bad usage, a missing LM, or a missing 'prune-lm'.

# Options are set here rather than in the shebang so that they also apply when
# the script is invoked as "bash gp_format_lm.sh ...".
set -o errexit
set -o nounset
set -o pipefail

. ./path.sh    # Sets the PATH to contain necessary executables

# Begin configuration section.
filter_vocab_sri=false    # if true, use SRILM to change the LM vocab
srilm_opts="-subset -prune-lowprobs -unk -tolower"
# end configuration sections

help_message="Usage: $(basename "$0") [options] LM-dir LC [LC ... ]
where LC is a 2-letter code for GlobalPhone languages, and LM-dir is assumed to
contain LMs for all the languages (e.g. RU.3gram.lm.gz for Russian).
options:
  --help                           # print this message and exit
  --filter-vocab-sri (true|false)  # use SRILM to change the LM vocab (default: $filter_vocab_sri)
  --srilm-opts STRING              # options to pass to SRILM tools (default: '$srilm_opts')
"

. utils/parse_options.sh

if [ $# -lt 2 ]; then
  # Print the message as data, never as a printf format string.
  printf '%s\n' "$help_message" >&2
  exit 1
fi

LMDIR=$1
shift

# Collect the language codes; anything that is not exactly two characters is
# rejected.  At least one code is guaranteed by the argument-count check above.
LANGUAGES=()
while [ $# -gt 0 ]; do
  case "$1" in
    ??)
      LANGUAGES+=("$1")
      shift
      ;;
    *)
      echo "Unknown argument: $1, exiting" >&2
      printf '%s\n' "$help_message" >&2
      exit 1
      ;;
  esac
done

# Fall back to the IRSTLM copy under the Kaldi tools directory.  The ':-'
# expansion keeps 'nounset' from aborting when IRSTLM is not set at all.
if [ -z "${IRSTLM:-}" ]; then
  export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin

if ! command -v prune-lm >/dev/null 2>&1; then
  echo "$0: Error: the IRSTLM is not available or compiled" >&2
  echo "$0: Error: We used to install it by default, but." >&2
  echo "$0: Error: this is no longer the case." >&2
  echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
  echo "$0: Error: and run extras/install_irstlm.sh" >&2
  exit 1
fi

for L in "${LANGUAGES[@]}"; do
  lang_dir=data/$L/lang
  lexicon=data/$L/local/dict/lexicon.txt

  lm=$LMDIR/${L}.3gram.lm.gz
  [ -f "$lm" ] || { echo "LM '$lm' not found" >&2; exit 1; }

  test=data/$L/lang_test_tg
  if [ "$filter_vocab_sri" = true ]; then  # use SRILM to change LM vocab
    utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
      "$lang_dir" "$lm" "$lexicon" "${test}_sri"
  else  # just remove out-of-lexicon words without renormalizing the LM
    utils/format_lm.sh "$lang_dir" "$lm" "$lexicon" "$test"
  fi

  # Create a pruned version of the LM for building the decoding graphs, using
  # 'prune-lm' from IRSTLM:
  mkdir -p "data/$L/local/lm"
  pruned_lm=data/$L/local/lm/${L}.tgpr.arpa.gz
  prune-lm --threshold=1e-7 "$lm" /dev/stdout \
    | gzip -c > "$pruned_lm"

  lm=$pruned_lm
  test=data/$L/lang_test_tgpr
  if [ "$filter_vocab_sri" = true ]; then  # use SRILM to change LM vocab
    # Pass the same SRILM options as for the unpruned LM, so that a
    # user-supplied --srilm-opts applies to both LMs.
    utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
      "$lang_dir" "$lm" "$lexicon" "${test}_sri"
  else  # just remove out-of-lexicon words without renormalizing the LM
    utils/format_lm.sh "$lang_dir" "$lm" "$lexicon" "$test"
  fi
done