#!/bin/bash -u

# Copyright 2012  Arnab Ghoshal

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Abort on the first failing command, including a failure in any stage of a
# pipeline.
set -o errexit
set -o pipefail

. ./path.sh    # Sets the PATH to contain necessary executables

# Begin configuration section.
filter_vocab_sri=false    # if true, use SRILM to change the LM vocab
srilm_opts="-subset -prune-lowprobs -unk -tolower"
# end configuration sections

# Usage text. It is built before the options are parsed, so the defaults it
# shows are the configuration values assigned above.
help_message="Usage: $(basename "$0") [options] LM-dir LC [LC ... ]
where LC is a 2-letter code for GlobalPhone languages, and LM-dir is assumed to 
contain LMs for all the languages (e.g. RU.3gram.lm.gz for Russian).
options: 
  --help                           # print this message and exit
  --filter-vocab-sri (true|false)  # use SRILM to change the LM vocab (default: $filter_vocab_sri)
  --srilm-opts STRING              # options to pass to SRILM tools (default: '$srilm_opts')
";

# NOTE(review): parse_options.sh is defined outside this file; presumably it
# consumes the --option flags and overrides the configuration variables above.
. utils/parse_options.sh

# At least the LM directory and one language code are required.
if [ $# -lt 2 ]; then
  # Print the message as data, never as a printf format string, and send the
  # usage error to stderr.
  printf '%s\n' "$help_message" >&2; exit 1;
fi

#######################################
# Collects the language codes given on the command line.
# Globals:   LANGUAGES (written) - space-separated list, one entry per
#                                  accepted argument, each preceded by a space
#            help_message (read) - usage text printed on error
# Arguments: the candidate language codes
# Outputs:   on a bad argument, an error and the usage text on stderr
# Returns:   exits the script with status 1 on the first argument that is not
#            exactly two characters long
#######################################
collect_language_codes() {
  local code
  LANGUAGES=
  for code in "$@"; do
    case "$code" in
      ??) LANGUAGES="$LANGUAGES $code" ;;
      *)
        echo "Unknown argument: $code, exiting" >&2
        printf '%s\n' "$help_message" >&2
        exit 1
        ;;
    esac
  done
}

# The first positional argument is the LM directory; all remaining ones are
# language codes.
LMDIR=$1; shift
collect_language_codes "$@"

# Fall back to the IRSTLM copy under the Kaldi tools directory when IRSTLM is
# unset or empty. The ${IRSTLM:-} form matters: the script runs with -u, so a
# bare $IRSTLM would abort with "unbound variable" before the default could be
# applied or the install instructions below could be shown.
if [ -z "${IRSTLM:-}" ]; then
  export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH="${PATH}:$IRSTLM/bin"

# prune-lm is needed further down to build the pruned LM; stop early with
# install instructions if it cannot be found on the PATH.
if ! command -v prune-lm >/dev/null 2>&1; then
  echo "$0: Error: the IRSTLM is not available or compiled" >&2
  echo "$0: Error: We used to install it by default, but." >&2
  echo "$0: Error: this is no longer the case." >&2
  echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
  echo "$0: Error: and run extras/install_irstlm.sh" >&2
  exit 1
fi

#######################################
# Runs the LM formatting utility for one language and one LM file.
# Globals:   filter_vocab_sri, srilm_opts (read)
# Arguments: $1 - 2-letter language code
#            $2 - path to the LM file
#            $3 - output directory ("_sri" is appended when SRILM is used)
#######################################
format_one_lm() {
  local code=$1 arpa=$2 out_dir=$3
  local lang_dir=data/$code/lang
  local lexicon=data/$code/local/dict/lexicon.txt

  if $filter_vocab_sri; then  # use SRILM to change LM vocab
    # Pass the configured SRILM options on every call, so that a user-supplied
    # --srilm-opts applies to the pruned LM as well as to the full one.
    utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
      "$lang_dir" "$arpa" "$lexicon" "${out_dir}_sri"
  else  # just remove out-of-lexicon words without renormalizing the LM
    utils/format_lm.sh "$lang_dir" "$arpa" "$lexicon" "$out_dir"
  fi
}

#######################################
# Formats the full trigram LM of one language, then creates a pruned copy of
# it with 'prune-lm' and formats that as well.
# Globals:   LMDIR (read)
# Arguments: $1 - 2-letter language code
# Outputs:   data/<LC>/lang_test_tg[_sri], data/<LC>/lang_test_tgpr[_sri] and
#            data/<LC>/local/lm/<LC>.tgpr.arpa.gz
# Returns:   exits the script with status 1 if the language's LM is missing
#######################################
format_lms_for_language() {
  local code=$1
  local full_lm=$LMDIR/${code}.3gram.lm.gz
  local pruned_lm=data/$code/local/lm/${code}.tgpr.arpa.gz

  [ -f "$full_lm" ] || { echo "LM '$full_lm' not found" >&2; exit 1; }
  format_one_lm "$code" "$full_lm" "data/$code/lang_test_tg"

  # Create a pruned version of the LM for building the decoding graphs, using
  # 'prune-lm' from IRSTLM:
  mkdir -p "data/$code/local/lm"
  prune-lm --threshold=1e-7 "$full_lm" /dev/stdout | gzip -c \
    > "$pruned_lm"
  format_one_lm "$code" "$pruned_lm" "data/$code/lang_test_tgpr"
}

# LANGUAGES is a space-separated list, so it is deliberately left unquoted.
for L in $LANGUAGES; do
  format_lms_for_language "$L"
done