Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/utils/lang/make_unk_lm.sh 14.3 KB
  #!/bin/bash
  
  # Copyright      2016 Johns Hopkins University (Author: Daniel Povey);
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  
  # ---------------------------------------------------------------------------
  # Default settings.  These are plain shell variables assigned before
  # utils/parse_options.sh is sourced (presumably so that the --<option>
  # flags listed in the usage message can override them -- confirm against
  # parse_options.sh).
  # ---------------------------------------------------------------------------

  # Job launcher and first stage to run.
  cmd="run.pl"
  stage="0"

  # Language-model settings.
  ngram_order="4"
  num_extra_ngrams="10000"
  use_pocolm="true"

  # Settings of the word-shape constraint FST.
  position_dependent_phones="true"
  min_word_length="2"
  phone_disambig_symbol="#1"

  # ---------------------------------------------------------------------------
  # End of default settings.
  # ---------------------------------------------------------------------------

  # Pull in the environment set-up if the current directory has one, then
  # process the command-line options.
  if [ -f path.sh ]; then
    . ./path.sh
  fi
  . utils/parse_options.sh
  
  # Writes the help text to stdout, one printf argument per output line.
  print_usage() {
    printf '%s\n' \
      "Usage: $0 [options] <input-dict-dir> <work-dir>" \
      "e.g.: $0 data/local/dict exp/make_unk" \
      "" \
      "This script creates, as an FST, a phone language model suitable for modeling" \
      "the unknown word.  It first trains a language model on the phone sequences of the" \
      "provided dictionary entries (which should be without any word-position-dependency" \
      "tags); it then creates an FST from it, while, for compactness after context-dependency" \
      "limiting the transitions to seen bigram pairs of phones.  Then, by composing with" \
      "a separate FST it converts it into word-position-dependent phones if applicable," \
      "while imposing a minimum-number-of-phones constraint." \
      "" \
      "  <input-dict-dir>:  A dictionary directory (as validated by validate_dict_dir.pl);" \
      "             the dictionary from this location (lexicon.txt, lexiconp.txt, or" \
      "             lexiconp_silprob.txt) will be used to train the language model on" \
      "             phones.  The files silence_phones.txt and nonsilence_phones.txt will" \
      "             be used to construct a symbol table used internally, and to" \
      "             exclude lexicon entries containing silences." \
      " <work-dir>:    A place to put logs and the output of this script.  The output of" \
      "                this script will be written to <work-dir>/unk_fst.txt (we write in" \
      "                text form so that it's independent of the phones.txt)." \
      "Options:" \
      "    --ngram-order <n>                 # (default: 4)  N-gram order of the phone-level language" \
      "                                      # model.  Must be in range [2, 7]" \
      "    --num-extra-ngrams <n>            # (default: 10000).  The maximum the number of n-grams" \
      "                                      # that may be present in the language model in addition" \
      "                                      # to the unigrams.  The LM will be pruned to achieve this." \
      "    --use-pocolm <true|false>         # (default: true).  If true, use pocolm to estimate the" \
      "                                      # language model; you will be prompted to install it if" \
      "                                      # needed.  (If false, we use the script make_phone_lm.py," \
      "                                      # which is simpler but the perplexity is not as good)." \
      "    --position-dependent-phones <true|false>  # (default: true).  If true, assume position-dependent" \
      "                                      # phones (although in any case the lexicon should use position-" \
      "                                      # independent phones).  If position-dependent phones are used," \
      "                                      # after creating the LM we compose with an FST that converts" \
      "                                      # into position-dependent phones while enforcing the natural" \
      "                                      # constraints that they form a single word." \
      "    --min-word-length <1|2>           # (default: 2).  May only be 1 or 2.  The minimum word length" \
      "                                      # (in number of phones) that is allowed" \
      "    --phone-disambig-symbol <symbol>  # default: '#1'.  This is the symbol that will be put on the" \
      "                                      # input side of backoff arcs.  You won't normally have to change" \
      "                                      # this because prepare_lang.sh expects '#1' there."
  }

  # Exactly two positional arguments are required; anything else shows the
  # help text and stops with status 1.
  if [ $# -ne 2 ]; then
    print_usage
    exit 1
  fi
  
  
  # Positional arguments: the dictionary directory to read and the work
  # directory that receives logs and outputs.
  dict_dir=$1
  dir=$2

  # From here on an unhandled command failure aborts the script.
  set -e

  mkdir -p $dir/log

  # Stage 0: validate the dictionary directory; on failure show the
  # validator's log before giving up.
  validate_log=$dir/log/validate_dict_dir.log
  if [ $stage -le 0 ] && ! utils/validate_dict_dir.pl $dict_dir > $validate_log 2>&1; then
    cat $validate_log
    echo "$0: failed to validate input dict-dir $dict_dir"
    exit 1
  fi

  # --ngram-order must be an integer between 2 and 7 inclusive.
  { [ $ngram_order -ge 2 ] && [ $ngram_order -le 7 ]; } || {
    echo "$0: invalid --ngram-order $ngram_order (must be in [2,7])"
    exit 1
  }

  # --min-word-length must be 1 or 2.
  { [ $min_word_length -ge 1 ] && [ $min_word_length -le 2 ]; } || {
    echo "$0: invalid --min-word-length $min_word_length (must be in [1,2])"
    exit 1
  }
  
  # Create $dir/phones.txt, a symbol table covering every symbol this script
  # may need: "<eps>" (id 0), then each silence and non-silence phone both bare
  # and with the word-position suffixes _B, _I, _S and _E, and finally the
  # phone disambiguation symbol.  The suffixed forms are not needed when
  # --position-dependent-phones is false, but including them does no harm.
  # Ids are simply the line numbers (NR) of the stream fed to the last awk.
  cat "$dict_dir/silence_phones.txt" "$dict_dir/nonsilence_phones.txt" | \
    awk '{for(n=1;n<=NF;n++) print $n; }' | \
    awk '{print $1; print $1 "_B"; print $1 "_I"; print $1 "_S"; print $1 "_E";}' | \
    cat - <(printf '%s\n' "$phone_disambig_symbol") | \
    awk 'BEGIN{print "<eps> 0";} {print $1, NR;}' > "$dir/phones.txt"

  # The disambiguation symbol was appended last, so its integer id is the
  # second field of the last line of phones.txt.
  phone_disambig_int=$(tail -n 1 <"$dir/phones.txt" | awk '{print $2}')

  # Sanity check: the id must be a positive integer.  (Comparing the value
  # with itself using '==' can never detect a malformed value.)  '[' fails
  # with an error status on a non-integer or empty operand, which the '!'
  # turns into the error branch; its diagnostic is suppressed.
  if ! [ "$phone_disambig_int" -gt 0 ] 2>/dev/null; then
    echo "$0: problem working out integer form of phone-disambig symbol."
    exit 1;
  fi
  
  # Pick the lexicon file used to train the phone LM, in order of preference,
  # and record in first_phone_field the 1-based index of the first field that
  # holds a phone (the fields before it are the word and, for the lexiconp*
  # formats, presumably the probability columns -- confirm against the
  # dict-dir format documentation).
  if [ -e "$dict_dir/lexicon.txt" ]; then
    src_dict=$dict_dir/lexicon.txt
    first_phone_field=2
  elif [ -e "$dict_dir/lexiconp.txt" ]; then
    src_dict=$dict_dir/lexiconp.txt
    first_phone_field=3
  elif [ -e "$dict_dir/lexiconp_silprob.txt" ]; then
    # The file name must end in ".txt"; a ".tt" suffix would point at a file
    # that does not exist.
    src_dict=$dict_dir/lexiconp_silprob.txt
    first_phone_field=6
  else
    echo "$0: expected file $dict_dir/lexiconp_silprob.txt to exist"
    exit 1
  fi
  
  # Flatten the silence-phone list to one phone per line inside the work dir.
  cat $dict_dir/silence_phones.txt | \
    awk '{ for (i = 1; i <= NF; i++) print $i; }' > $dir/silence_phones.txt

  # Build the LM training text from the lexicon: drop the leading non-phone
  # fields, and skip (with a note on stderr) every entry that contains a
  # silence phone or has no phones at all.  Empty prons should not occur,
  # this is just a safeguard.
  awk -v dir=$dir -v ff=$first_phone_field '
    BEGIN {
      sil_list = dir "/silence_phones.txt";
      while ((getline < sil_list) > 0) { sil[$1] = 1; }
    }
    {
      has_sil = 0;
      for (i = ff; i <= NF; i++) { if ($i in sil) has_sil = 1; }
      if (NF < ff || has_sil) {
        print("make_unk_lm.sh: info: not including dict line: ", $0) >"/dev/stderr";
      } else {
        for (i = ff; i <= NF; i++) printf("%s ", $i);
        print "";
      }
    }' <$src_dict >$dir/training.txt

  # The set of distinct phones that actually occur in the training text.
  cat $dir/training.txt | \
    awk '{ for (i = 1; i <= NF; i++) used[$i] = 1; } END { for (phone in used) print phone; }' \
    > $dir/all_nonsil_phones

  # Report how much of the dictionary survived, and stop if nothing did.
  num_dict_lines=$(wc -l <$src_dict)
  num_train_lines=$(wc -l < $dir/training.txt)
  [ $num_train_lines -gt 0 ] || {
    echo "$0: something went wrong getting text to train phone-level LM."
    exit 1
  }
  echo "$0: training on $num_train_lines words out of $num_dict_lines in the "
  echo "     ... original dictionary (excluding words with silence phones)."
  
  
  # With fewer than 2000 training lines, switch from pocolm to the simpler
  # make_phone_lm.py path and tell the user why.
  if $use_pocolm && [ $num_train_lines -lt 2000 ]; then
    printf '%s\n' \
      "$0: the number of lines of training data is very small [$num_train_lines]." \
      "    Setting --use-pocolm to false since it probably won't work well" \
      "    on so little data (e.g. hard to estimate the discounting parameters)" \
      "    Using make_phone_lm.py instead."
    use_pocolm=false
  fi
  
  # Estimate the phone-level language model and write it, as a text-form FST,
  # to $dir/unk_fst_orig.txt.  Two routes:
  #   * pocolm (stages 1-3): train, prune, export to ARPA, then convert to an
  #     FST restricted to phone bigrams seen in the training text;
  #   * make_phone_lm.py (stage 1): build the FST directly.
  # In the $cmd invocations, '|' and '>' are quoted so that this shell passes
  # them on as literal arguments to $cmd instead of interpreting them itself.
  if $use_pocolm; then
    if [ ! -e "$KALDI_ROOT/tools/pocolm" ]; then
      echo "$0: $KALDI_ROOT/tools/pocolm does not exist:"
      echo " ... please do:  cd $KALDI_ROOT/tools; extras/install_pocolm.sh"
      echo " ... and then rerun this script."
      exit 1
    fi

    # Put the pocolm scripts directory first on PATH (presumably where the
    # train_lm.py, prune_lm_dir.py and format_arpa_lm.py called below live).
    PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH

    if [ $stage -le 1 ]; then
      echo "$0: training $ngram_order-gram LM with pocolm"

      mkdir -p $dir/pocolm/text
      heldout_ratio=5  # hold out one fifth of the data as validation to estimate
      # metaparameters; we'll fold it back in before estimating the
      # final LM.
      # Every heldout_ratio-th line goes to dev.txt, all other lines to train.txt.
      cat $dir/training.txt | awk -v h=$heldout_ratio '{if(NR%h == 0) print; }' > $dir/pocolm/text/dev.txt
      cat $dir/training.txt | awk -v h=$heldout_ratio '{if(NR%h != 0) print; }' > $dir/pocolm/text/train.txt


      # the following options are because we expect the amount of data to be small,
      # all the data subsampling isn't really needed and will increase the chance of
      # something going wrong.

      # NB: $small_data_opts is deliberately left unquoted so that it splits
      # into separate arguments.
      small_data_opts="--num-splits 4 --warm-start-ratio 1"
      $cmd $dir/log/train_lm.log \
           train_lm.py --wordlist $dir/all_nonsil_phones $small_data_opts \
           --fold-dev-into=train $dir/pocolm/text $ngram_order $dir/pocolm
    fi

    if [ $stage -le 2 ]; then
      echo "$0: pruning LM with pocolm"
      # Target size: one n-gram per phone plus the allowed number of extra
      # n-grams.
      num_words=$(wc -l <$dir/all_nonsil_phones)
      num_ngrams=$((num_extra_ngrams + num_words))

      # The pruned LM is kept under $dir/pocolm, next to the unpruned one.
      $cmd $dir/log/prune_lm_dir.log \
           prune_lm_dir.py --target-num-ngrams=$num_ngrams \
           $dir/pocolm/all_nonsil_phones_${ngram_order}.pocolm $dir/pocolm/lm_pruned

      # format as arpa.
      format_arpa_lm.py $dir/pocolm/lm_pruned > $dir/pocolm.arpa
    fi

    if [ $stage -le 3 ]; then
      echo "$0: applying bigram constraints and converting from ARPA to FST"
      # now get bigram constraints: we want to get an FST that only allows phone
      # bigrams that we've seen (this may enforce certain linguistic constraints,
      # and also stops the graph from blowing up too much once we introduce
      # phonetic context).
      # The NF > 0 is just a double-check that there are no empty prons, which
      # would be bad as it would allow an empty pronunciation of the unknown word.
      # Each pron is wrapped in <s> ... </s> so that the allowed word-initial
      # and word-final phones are recorded as bigrams too.
      cat $dir/training.txt | awk '{ if (NF > 0) printf("<s> %s </s>\n", $0); }' | \
        awk '{for(n=1;n<NF;n++) { m=n+1; seen[ $n " " $m ] = 1; }} END{for(k in seen) print k;}' \
            > $dir/allowed_bigrams

      $cmd $dir/log/arpa2fst.log \
           utils/lang/internal/arpa2fst_constrained.py --verbose=3 \
             --disambig-symbol="$phone_disambig_symbol" \
           $dir/pocolm.arpa $dir/allowed_bigrams '>' $dir/unk_fst_orig.txt
    fi
  else

    if [ $stage -le 1 ]; then
      echo "$0: using make_phone_lm.py to create $ngram_order-gram language-model FST"
      # The training text is mapped to integers, fed to make_phone_lm.py, and
      # fields 3-4 of the result are mapped back to symbols.
      $cmd $dir/log/make_phone_lm.log \
           utils/sym2int.pl $dir/phones.txt $dir/training.txt '|' \
           utils/lang/make_phone_lm.py --verbose=2 \
           --phone-disambig-symbol=$phone_disambig_int \
           --num-extra-ngrams=$num_extra_ngrams \
           --ngram-order=$ngram_order '|' \
           utils/int2sym.pl -f 3-4 $dir/phones.txt '>'$dir/unk_fst_orig.txt
    fi
  fi
  
  
  # Symbol-table options shared by the fstcompile/fstprint calls; the same
  # table is used for input and output symbols.  ($sym_opts is expanded
  # unquoted on purpose so that it splits into two options.)
  sym_opts="--isymbols=$dir/phones.txt --osymbols=$dir/phones.txt"

  # Write $dir/constraint_fst.txt, the text-form FST that enforces the minimum
  # word length and, if requested, rewrites phones into their word-position-
  # dependent forms.  When neither is needed the script finishes right here.
  if ! $position_dependent_phones; then
    if [ "$min_word_length" -eq 1 ]; then
      echo "$0: no word-length constraint or word-position-dependency, so exiting."
      # There is no need to compose unk_fst_orig.txt with a separate FST: because of
      # the bigram constraints and because we ensure that there were no empty prons
      # in the dictionary (no empty lines in training.txt), the FST wouldn't allow
      # length-zero words anyway.
      cp $dir/unk_fst_orig.txt $dir/unk_fst.txt
      fstcompile $sym_opts <$dir/unk_fst.txt >$dir/unk.fst
      exit 0;
    else
      echo "$0: creating constraint_fst.txt for min-word-length=2 constraint."
      # min-word-length is 2; we need to apply that constraint.  A note on the FST
      # states: 0 is start state, 1 is "seen one phone", 2 is "seen two or more
      # phones".
      # We don't need to take into account the disambig symbol because we compose on
      # the right with this FST, and it doesn't appear on the output side.
      # NOTE(review): the composition step later in this script first runs
      # fstproject on unk_orig.fst, and its comment says that puts the
      # disambiguation symbol on the output side as well -- confirm that this
      # FST really needs no disambig self-loops.
      cat $dir/all_nonsil_phones | \
        awk '{ph[$1]=1} END{ for (p in ph) { print 0,1,p,p; print 1,2,p,p; print 2,2,p,p; }
                   print 2,0.0; }' > $dir/constraint_fst.txt
    fi
  else
    echo "$0: creating constraint_fst.txt for min-word-length=$min_word_length constraint, plus word-position-dependency conversion."

    # Add constraints and convert phones without tags into phones with the _B, _E, _I and _S
    # tags (begin, end, internal, singleton).

    # States:
    # 0 is start state,
    # 1 is "seen initial phone (and maybe internal phones) of multi-phone word",
    # 2 is "seen final phone of multi-phone word".
    # 3 is "seen phone of single-phone word"; note, if --min-word-length is 2,
    #      then state 3 will not exist.

    # States 0-2 get a self-loop on the disambiguation symbol.  Each printf
    # format ends in an explicit "\n": awk string constants may not contain a
    # raw line break.
    cat $dir/all_nonsil_phones | \
      awk -v mwl=$min_word_length -v "disambig=$phone_disambig_symbol" \
        '{ph[$1]=1}
         END{ for (n=0; n<3; n++) print n,n,disambig,disambig;
              for (p in ph) {
                printf("0 1 %s %s_B\n", p, p);
                printf("1 1 %s %s_I\n", p, p);
                printf("1 2 %s %s_E\n", p, p);
                if (mwl==1) printf("0 3 %s %s_S\n", p, p);
              }
              print 2,0.0; if (mwl==1) print 3,0.0; }' >$dir/constraint_fst.txt
  fi
  
  
  echo "$0: creating final FST via composition, etc."

  # Compile the two text-form FSTs; the constraint FST is additionally
  # arc-sorted.
  fstcompile $sym_opts <$dir/unk_fst_orig.txt >$dir/unk_orig.fst
  fstcompile $sym_opts <$dir/constraint_fst.txt | fstarcsort > $dir/constraint.fst

  # What each stage of the pipeline below is for:
  #   1. fstproject (input projection): makes sure the disambiguation symbol
  #      is present on the output side as well.
  #   2. fstcompose: applies the constraints and does the phone conversion;
  #      afterwards the "correct" phones exist only on the output side.
  #   3. fstproject --project_output=true: copies the word-position-dependent
  #      phones over to the input side.
  #   4. fstpushspecial: pushes the weights, because composing with the
  #      constraint FST leaves the result quite non-stochastic (the weights
  #      leaving a state no longer sum to one).
  #   5. fstminimizeencoded: merges states that are identical as far as their
  #      outgoing arcs are concerned; with --min-word-length 1 this merges
  #      many final states that have no outgoing transitions.
  #   6. fstrmsymbols --remove-from-output=true: leaves the disambiguation
  #      symbol on the input side only.
  fstproject $dir/unk_orig.fst \
    | fstcompose - $dir/constraint.fst \
    | fstproject --project_output=true \
    | fstpushspecial \
    | fstminimizeencoded \
    | fstrmsymbols --remove-from-output=true <(echo $phone_disambig_int) >$dir/unk.fst

  # Also write the result in text form.
  fstprint $sym_opts <$dir/unk.fst >$dir/unk_fst.txt


  exit 0