  #!/bin/bash
  
  # Copyright 2014 Vassil Panayotov
  # Apache 2.0
  
  # This is the top-level LM training script
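  #
  # Pipeline overview: split the raw text lists for parallel processing,
  # normalize the texts, build the de-duplicated corpus and select the
  # vocabulary, train the full 3-gram LM, produce the 'small' and 'medium'
  # pruned versions of it, and finally train the full 4-gram LM.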
  
  . ./path.sh || exit 1
  . ./cmd.sh || exit 1
  
  # set this to a higher value to skip some of the initial steps
  stage=1
  
  # how many words we want in the LM's vocabulary
  vocab_size=200000
  
  # LM pruning threshold for the 'small' trigram model
  prune_thresh_small=0.0000003
  
  # LM pruning threshold for the 'medium' trigram model
  prune_thresh_medium=0.0000001
  
  # how many text normalization jobs to run in parallel
  normjobs=2
  
  . utils/parse_options.sh || exit 1
  
  if [[ $# -ne 4 ]]; then
    echo "Usage: $1 <lm-texts-root> <tmp-dir> <txt-norm-root> <out-lm-dir>"
    echo "where,"
    echo "  <lm-text-root>: the root directory containing the raw(unnormalized) LM training texts"
    echo "  <tmp-dir>: store the temp files into this dir"
    echo "  <txt-norm-root>: store the normalized texts in subdirectories under this root dir"
    echo "  <out-lm-dir>: the directory to store the trained ARPA model"
    exit 1
  fi
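
  # Example invocation (the paths below are only placeholders; adjust them to
  # your setup):
  #   local/lm/train_lm.sh --normjobs 4 --vocab-size 200000 \
  #     /export/librispeech-lm-texts data/local/lm_tmp \
  #     data/local/lm_norm data/local/lm
  # Any of the variables defined above (stage, vocab_size, prune_thresh_*,
  # normjobs) can be overridden on the command line, thanks to
  # utils/parse_options.sh.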
  
  corpus_dir=$1/corpus
  tmp_dir=$2
  norm_dir=$3
  lm_dir=$4
  
  [[ -d "$corpus_dir" ]] || { echo "No such directory '$corpus_dir'"; exit 1; }
  
  split_prefix=$tmp_dir/split
  
  if [ "$stage" -le 1 ]; then
    mkdir -p $tmp_dir
    echo "Splitting into $normjobs parts, to allow for parallel processing ..."
    split_files=$(eval "echo $split_prefix-{$(seq -s',' $normjobs | sed 's/,$//')}")
    find $corpus_dir -mindepth 1 -maxdepth 1 -type d |\
      tee $tmp_dir/all_texts.txt |\
      utils/split_scp.pl /dev/stdin $split_files
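    # split_scp.pl distributes the list of text directories into $normjobs
    # roughly equal parts, one per split file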
    echo "Checking the splits ..."
    total_count=$(wc -l <$tmp_dir/all_texts.txt)
    split_count=$(cat $split_files | wc -l | awk 'BEGIN{c=0} {c+=$1;} END{print c}')
    [[ "$total_count" -eq "$split_count" ]] || { echo "Inconsistent counts"; exit 1; }
  fi
  
  if [ "$stage" -le 2 ]; then
    echo "Performing text normalization ($normjobs jobs) - check $tmp_dir/txt_norm.JOB.log ..."
    mkdir -p $norm_dir
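    # $mkgraph_cmd is one of Kaldi's job runners (run.pl or queue.pl, defined in
    # cmd.sh); it substitutes JOB with 1..$normjobs, so each split file is
    # normalized by a separate job, logged to its own txt_norm.<n>.log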
    $mkgraph_cmd JOB=1:$normjobs $tmp_dir/txt_norm.JOB.log \
      local/lm/normalize_text.sh $split_prefix-JOB $norm_dir || exit 1
    echo "Finished OK"
  fi
  
  word_counts=$lm_dir/word_counts.txt
  vocab=$lm_dir/librispeech-vocab.txt
  full_corpus=$lm_dir/librispeech-lm-norm.txt.gz
  
  if [ "$stage" -le 3 ]; then
    echo "Selecting the vocabulary ($vocab_size words) ..."
    mkdir -p $lm_dir
    echo "Making the corpus and the vocabulary ..."
    # The following sequence of commands does the following:
    # 1) Eliminates duplicate sentences and saves the resulting corpus
    # 2) Splits the corpus into words
    # 3) Sorts the words with respect to their frequency
    # 4) Saves the list of the first $vocab_size words sorted by their frequencies
    # 5) Saves an alphabetically sorted vocabulary that includes the most frequent $vocab_size words
    for f in $(find $norm_dir -iname '*.txt'); do cat $f; done |\
      sort -u | tee >(gzip >$full_corpus) | tr -s '[[:space:]]' '\n' |\
      sort | uniq -c | sort -k1 -n -r |\
      head -n $vocab_size | tee $word_counts | awk '{print $2}' | sort >$vocab || exit 1
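    # As produced by 'uniq -c' above, each line of $word_counts has the form
    # "<count> <word>", while $vocab contains just the words, one per line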
    echo "Word counts saved to '$word_counts'"
    echo "Vocabulary saved as '$vocab'"
    echo "All unique sentences (in sorted order) stored in '$full_corpus'"
    echo "Counting the total number word tokens in the corpus ..."
    echo "There are $(wc -w < <(zcat $full_corpus)) tokens in the corpus"
  fi
  
  trigram_lm=$lm_dir/lm_tglarge.arpa.gz
  
  if [ "$stage" -le 4 ]; then
    echo "Training a 3-gram LM ..."
    command -v ngram-count 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
    echo "This implementation assumes that you have a lot of free RAM(> 12GB) on your machine"
    echo "If that's not the case, consider something like: http://joshua-decoder.org/4.0/large-lms.html"
    ngram-count -order 3  -kndiscount -interpolate \
      -unk -map-unk "<UNK>" -limit-vocab -vocab $vocab -text $full_corpus -lm $trigram_lm || exit 1
    du -h $trigram_lm
  fi
  
  trigram_pruned_small=$lm_dir/lm_tgsmall.arpa.gz
  
  if [ "$stage" -le 5 ]; then
    echo "Creating a 'small' pruned 3-gram LM (threshold: $prune_thresh_small) ..."
    command -v ngram 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
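    # 'ngram -prune' performs entropy-based (Stolcke) pruning: n-grams whose
    # removal changes the model's perplexity by less than the given relative
    # threshold are dropped, shrinking the model at a small cost in accuracy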
    ngram -prune $prune_thresh_small -lm $trigram_lm -write-lm $trigram_pruned_small || exit 1
    du -h $trigram_pruned_small
  fi
  
  trigram_pruned_medium=$lm_dir/lm_tgmed.arpa.gz
  
  if [ "$stage" -le 5 ]; then
    echo "Creating a 'medium' pruned 3-gram LM (threshold: $prune_thresh_medium) ..."
    command -v ngram 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
    ngram -prune $prune_thresh_medium -lm $trigram_lm -write-lm $trigram_pruned_medium || exit 1
    du -h $trigram_pruned_medium
  fi
  
  fourgram_lm=$lm_dir/lm_fglarge.arpa.gz
  
  if [ "$stage" -le 4 ]; then
    # This requires even more RAM than the 3-gram
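    # The invocation is identical to the 3-gram training above, except for '-order 4'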
    echo "Training a 4-gram LM ..."
    command -v ngram-count 1>/dev/null 2>&1 || { echo "Please install SRILM and set path.sh accordingly"; exit 1; }
    ngram-count -order 4  -kndiscount -interpolate \
      -unk -map-unk "<UNK>" -limit-vocab -vocab $vocab -text $full_corpus -lm $fourgram_lm || exit 1
    du -h $fourgram_lm
  fi
  
  exit 0