#!/bin/bash

# This script trains a small, pruned n-gram backoff LM to be used for sampling
# purposes during RNNLM training.  We use pocolm for this because it is good at
# pruning, is maintained by us so we can ensure it works, and has no licensing
# problems.

dir=exp/rnnlm_data_prep

# To be run from the directory egs/ptb/s5, after prepare_rnnlm_data.sh.
# This will all later be refactored.

. ./path.sh
set -e
[ -z "$KALDI_ROOT" ] && echo "$0: KALDI_ROOT is not set in path.sh" && exit 1
pocolm=$KALDI_ROOT/tools/pocolm

if [ ! -f $pocolm/scripts/train_lm.py ]; then
  echo "$0: you should install pocolm.  cd to $KALDI_ROOT/tools and run extras/install_pocolm.sh."
  exit 1
fi

# Make a version of data/text containing just the things pocolm needs.
mkdir -p data/pocolm/text
cp data/text/*.txt data/pocolm/text  # wasteful, yes...

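# Check that the text directory is in the form pocolm expects: a set of .txt
# files, including a dev.txt used for metaparameter optimization.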
$pocolm/scripts/validate_text_dir.py data/pocolm/text
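# Make the word list for pocolm from data/vocab/words.txt: skip the first four
# lines (special symbols, e.g. <eps>, <s>, </s>, <brk>) and keep only the word
# column, dropping the integer ids.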
tail -n +5 data/vocab/words.txt | awk '{print $1}' > data/pocolm/wordlist

# Later we could consider using min-counts.

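# Train a trigram LM.  The positional arguments are the text directory, the
# n-gram order (3), a work directory for intermediate data, and the output LM
# directory; --num-splits controls parallelization, and --warm-start-ratio
# makes metaparameter optimization cheaper by warm-starting it on a subset of
# the data.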
$pocolm/scripts/train_lm.py  --wordlist=data/pocolm/wordlist --num-splits=10 --warm-start-ratio=20  \
                             data/pocolm/text 3 data/pocolm/work data/pocolm/lm

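# Prune the trigram LM down to roughly 100k n-grams, small enough for cheap
# sampling during RNNLM training.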
$pocolm/scripts/prune_lm_dir.py  --target-num-ngrams=100000 data/pocolm/lm data/pocolm/lm_pruned100k

$pocolm/scripts/format_arpa_lm.py data/pocolm/lm_pruned100k | gzip -c >data/pocolm/trigram_100k.arpa.gz
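
# Optional sanity check (not part of the original recipe): the ARPA header
# lists the n-gram counts per order, whose total should be close to the
# --target-num-ngrams value we pruned to.
gunzip -c data/pocolm/trigram_100k.arpa.gz | head -n 6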