egs/ptb/s5/local/rnnlm/train_backoff_lm.sh

#!/bin/bash

# This script trains a small, pruned n-gram backoff LM to be used for sampling
# purposes during RNNLM training.  We use pocolm for this because it is good at
# pruning, it is maintained by us so we can ensure it works, and it has no
# licensing problems.
  
dir=exp/rnnlm_data_prep
  
# To be run from the directory egs/ptb/s5, after prepare_rnnlm_data.sh.
# This will all later be refactored.
  
. ./path.sh
set -e
[ -z "$KALDI_ROOT" ] && echo "$0: KALDI_ROOT is not set in path.sh" && exit 1
pocolm=$KALDI_ROOT/tools/pocolm
  
if [ ! -f $pocolm/scripts/train_lm.py ]; then
  echo "$0: you should install pocolm.  cd to $KALDI_ROOT/tools and run extras/install_pocolm.sh."
  exit 1
fi
  
# Make a version of data/text containing just the things needed for pocolm.
mkdir -p data/pocolm/text
cp data/text/*.txt data/pocolm/text  # wasteful, yes...
  
$pocolm/scripts/validate_text_dir.py data/pocolm/text
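# Make the word-list for pocolm, skipping the first four entries of words.txt
# (assumed here to be special symbols such as <eps>, <s>, </s> and <brk>,
# which pocolm should not model).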
tail -n +5 data/vocab/words.txt | awk '{print $1}' > data/pocolm/wordlist
  
# Later we could consider using min-counts.
  
$pocolm/scripts/train_lm.py  --wordlist=data/pocolm/wordlist --num-splits=10 --warm-start-ratio=20  \
                             data/pocolm/text 3 data/pocolm/work data/pocolm/lm
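
# Optionally, sanity-check the un-pruned LM by scoring the dev set (a sketch;
# this assumes the installed pocolm version provides get_data_prob.py):
# $pocolm/scripts/get_data_prob.py data/pocolm/text/dev.txt data/pocolm/lm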
  
$pocolm/scripts/prune_lm_dir.py  --target-num-ngrams=100000 data/pocolm/lm data/pocolm/lm_pruned100k
  
$pocolm/scripts/format_arpa_lm.py data/pocolm/lm_pruned100k | gzip -c >data/pocolm/trigram_100k.arpa.gz
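
# A quick check of the final output (optional): print the \data\ header of the
# ARPA file to confirm the n-gram counts come out near the 100k target.
# gunzip -c data/pocolm/trigram_100k.arpa.gz | sed -n '/\\data\\/,/^$/p'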