Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/utils/subword/prepare_lang_subword.sh 19.4 KB
  #!/bin/bash
  # Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey);
  #                      Arnab Ghoshal
  #                2014  Guoguo Chen
  #                2015  Hainan Xu
  #                2016  FAU Erlangen (Author: Axel Horndasch)
  #                2019  Dongji Gao
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  # This script prepares a directory (for subword) such as data/lang_subword/, in the standard format,
  # given a source directory containing a subword dictionary lexicon.txt in a form like:
  # subword phone1 phone2 ... phoneN
  # per line (alternate prons would be separate lines), or a dictionary with probabilities
  # called lexiconp.txt in a form:
  # subword pron-prob phone1 phone2 ... phoneN
  # (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if
  # lexicon.txt exists.
  # and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt
  # and extra_questions.txt
  # Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
  # non-silence phones respectively (where silence includes various kinds of
  # noise, laugh, cough, filled pauses etc., and nonsilence phones includes the
  # "real" phones.)
  # In each line of those files is a list of phones, and the phones on each line
  # are assumed to correspond to the same "base phone", i.e. they will be
  # different stress or tone variations of the same basic phone.
  # The file "optional_silence.txt" contains just a single phone (typically SIL)
  # which is used for optional silence in the lexicon.
  # extra_questions.txt might be empty; typically will consist of lists of phones,
  # all members of each list with the same stress or tone; and also possibly a
  # list for the silence phones.  This will augment the automatically generated
  # questions (note: the automatically generated ones will treat all the
  # stress/tone versions of a phone the same, so will not "get to ask" about
  # stress or tone).
  #
  
  # This script adds word-position-dependent phones and constructs a host of other
  # derived files, that go in data/lang_subword/.
  
  # Currently it only support the most basic functions.
  # Begin configuration section.
  num_sil_states=5
  num_nonsil_states=3
  position_dependent_phones=true
  # position_dependent_phones is false also when position dependent phones and word_boundary.txt
  # have been generated by another source
  share_silence_phones=false  # if true, then share pdfs of different silence
                              # phones together.
  sil_prob=0.5
  num_extra_phone_disambig_syms=1 # Standard one phone disambiguation symbol is used for optional silence.
                                  # Increasing this number does not harm, but is only useful if you later
                                  # want to introduce this labels to L_disambig.fst
  separator="@@"   # Separator is a suffix or prefix of subword indicating the position of this subword in word.
                   # By default, subword which is not at the end of word would have separator as suffix.
                   # For example: international -> inter@@ nation@@ al
  
  # end configuration sections
  
  echo "$0 $@"  # Print the command line for logging
  
  . utils/parse_options.sh
  
  if [ $# -ne 4 ]; then
    echo "Usage: utils/prepare_lang.sh <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>"
    echo "e.g.: utils/prepare_lang.sh data/local/dict <SPOKEN_NOISE> data/local/lang data/lang"
    echo "<dict-src-dir> should contain the following files:"
    echo " extra_questions.txt  lexicon.txt nonsilence_phones.txt  optional_silence.txt  silence_phones.txt"
    echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info."
    echo "options: "
    echo "<dict-src-dir> may also, for the grammar-decoding case (see http://kaldi-asr.org/doc/grammar.html)"
    echo "contain a file nonterminals.txt containing symbols like #nonterm:contact_list, one per line."
    echo "     --num-sil-states <number of states>             # default: 5, #states in silence models."
    echo "     --num-nonsil-states <number of states>          # default: 3, #states in non-silence models."
    echo "     --position-dependent-phones (true|false)        # default: true; if true, use _B, _E, _S & _I"
    echo "                                                     # markers on phones to indicate word-internal positions. "
    echo "     --share-silence-phones (true|false)             # default: false; if true, share pdfs of "
    echo "                                                     # all silence phones. "
    echo "     --sil-prob <probability of silence>             # default: 0.5 [must have 0 <= silprob < 1]"
    echo "     --separator <separator>                         # default: @@"
    exit 1;
  fi
  
  srcdir=$1
  oov_word=$2
  tmpdir=$3
  dir=$4
  mkdir -p $dir $tmpdir $dir/phones
  
  silprob=false
  [ -f $srcdir/lexiconp_silprob.txt ] && echo "$0: Currently we do not support word-dependent silence probability." && exit 1;
  
  if [ -f $srcdir/nonterminals.txt ]; then
    echo "$0: Currently we do not support nonterminals" && exit 1;
  else
    grammar_opts=
  fi
  
  [ -f path.sh ] && . ./path.sh
  
  # Validate dict directory
  ! utils/validate_dict_dir.pl $srcdir && \
    echo "*Error validating directory $srcdir*" && exit 1;
  
  if [[ ! -f $srcdir/lexicon.txt ]]; then
    echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt"
    perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1;
  fi
  if [[ ! -f $srcdir/lexiconp.txt ]]; then
    echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt"
    perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1;
  fi
  
  # Currently The lexicon in dict directory have to be a subword lexicon.
  # If the lexicon is for word and is not phonemic, we can not get a subword lexicon without knowing the alignment.
  ! grep -q $separator $srcdir/lexiconp.txt && \
  echo "$0: Warning, this lexicon contains no separator \"$separator\" and may not be a subword lexicon." && exit 1;
  
  # Write the separator into file for future use.
  echo $separator > $dir/subword_separator.txt
  
  if ! utils/validate_dict_dir.pl $srcdir >&/dev/null; then
    utils/validate_dict_dir.pl $srcdir  # show the output.
    echo "Validation failed (second time)"
    exit 1;
  fi
  
  if $position_dependent_phones; then
    # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or
    # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by
    # adding the markers _B, _E, _S, _I depending on word position.
    # In this recipe, these markers apply to silence also.
    # Do this starting from lexiconp.txt only.
    if "$silprob"; then
      echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
    else
      utils/lang/make_position_dependent_subword_lexicon.py $srcdir/lexiconp.txt > $tmpdir/lexiconp.txt || exit 1;
    fi
  
    # create $tmpdir/phone_map.txt
    # this has the format (on each line)
    # <original phone> <version 1 of original phone> <version 2> ...
    # where the versions depend on the position of the phone within a word.
    # For instance, we'd have:
    # AA AA_B AA_E AA_I AA_S
    # for (B)egin, (E)nd, (I)nternal and (S)ingleton
    # and in the case of silence
    # SIL SIL SIL_B SIL_E SIL_I SIL_S
    # [because SIL on its own is one of the variants; this is for when it doesn't
    #  occur inside a word but as an option in the lexicon.]
  
    # This phone map expands the phone lists into all the word-position-dependent
    # versions of the phone lists.
    cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
      <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
      > $tmpdir/phone_map.txt
  else
    if "$silprob"; then
      echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
    else
      cp $srcdir/lexiconp.txt $tmpdir/lexiconp.txt
    fi
  
    cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \
      awk '{for(n=1;n<=NF;n++) print $n; }' > $tmpdir/phones
    paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt
  fi
  
  mkdir -p $dir/phones  # various sets of phones...
  
  # Sets of phones for use in clustering, and making monophone systems.
  
  if $share_silence_phones; then
    # build a roots file that will force all the silence phones to share the
    # same pdf's. [three distinct states, only the transitions will differ.]
    # 'shared'/'not-shared' means, do we share the 3 states of the HMM
    # in the same tree-root?
    # Sharing across models(phones) is achieved by writing several phones
    # into one line of roots.txt (shared/not-shared doesn't affect this).
    # 'not-shared not-split' means we have separate tree roots for the 3 states,
    # but we never split the tree so they remain stumps,
    # so all phones in the line correspond to the same model.
  
    cat $srcdir/silence_phones.txt | awk '{printf("%s ", $0); } END{printf("
  ");}' | cat - $srcdir/nonsilence_phones.txt | \
      utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
    cat $dir/phones/sets.txt | \
      awk '{if(NR==1) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' > $dir/phones/roots.txt
  else
    # different silence phones will have different GMMs.  [note: here, all "shared split" means
    # is that we may have one GMM for all the states, or we can split on states.  because they're
    # context-independent phones, they don't see the context.]
    cat $srcdir/{,non}silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
    cat $dir/phones/sets.txt | awk '{print "shared", "split", $0;}' > $dir/phones/roots.txt
  fi
  
  cat $srcdir/silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
    awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt
  cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
    awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt
  cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt
  cp $dir/phones/silence.txt $dir/phones/context_indep.txt
  
  # if extra_questions.txt is empty, it's OK.
  cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_map.txt \
    >$dir/phones/extra_questions.txt
  
  # Want extra questions about the word-start/word-end stuff. Make it separate for
  # silence and non-silence. Probably doesn't matter, as silence will rarely
  # be inside a word.
  if $position_dependent_phones; then
    for suffix in _B _E _I _S; do
      (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
    done
    for suffix in "" _B _E _I _S; do
      (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
    done
  fi
  
  # add_lex_disambig.pl is responsible for adding disambiguation symbols to
  # the lexicon, for telling us how many disambiguation symbols it used,
  # and and also for modifying the unknown-word's pronunciation (if the
  # --unk-fst was provided) to the sequence "#1 #2 #3", and reserving those
  # disambig symbols for that purpose.
  # The #2 will later be replaced with the actual unk model.  The reason
  # for the #1 and the #3 is for disambiguation and also to keep the
  # FST compact.  If we didn't have the #1, we might have a different copy of
  # the unk-model FST, or at least some of its arcs, for each start-state from
  # which an <unk> transition comes (instead of per end-state, which is more compact);
  # and adding the #3 prevents us from potentially having 2 copies of the unk-model
  # FST due to the optional-silence [the last phone of any word gets 2 arcs].
  
  if "$silprob"; then
    echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
  else
    ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt)
  fi
  ndisambig=$[$ndisambig+$num_extra_phone_disambig_syms]; # add (at least) one disambig symbol for silence in lexicon FST.
  echo $ndisambig > $tmpdir/lex_ndisambig
  
  # Format of lexiconp_disambig.txt:
  # !SIL	1.0   SIL_S
  # <SPOKEN_NOISE>	1.0   SPN_S #1
  # <UNK>	1.0  SPN_S #2
  # <NOISE>	1.0  NSN_S
  # !EXCLAMATION-POINT	1.0  EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E
  
  ( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt
  
  # Create phone symbol table.
  echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
    awk '{n=NR-1; print $1, n;}' > $dir/phones.txt
  
  # Create a file that describes the word-boundary information for
  # each phone.  5 categories.
  if $position_dependent_phones; then
    cat $dir/phones/{silence,nonsilence}.txt | \
      awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
           /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
           {print $1, "nonword";} ' > $dir/phones/word_boundary_moved.txt
  else
    # word_boundary.txt might have been generated by another source
    [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary_moved.txt
  fi
  
  # Create word symbol table.
  # <s> and </s> are only needed due to the need to rescore lattices with
  # ConstArpaLm format language model. They do not normally appear in G.fst or
  # L.fst.
  
  if "$silprob"; then
    echo "$0: Currently we do not support word-dependent silence probability" && exit 1;
  fi
  
  cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq  | awk '
    BEGIN {
      print "<eps> 0";
    }
    {
      if ($1 == "<s>") {
        print "<s> is in the vocabulary!" | "cat 1>&2"
        exit 1;
      }
      if ($1 == "</s>") {
        print "</s> is in the vocabulary!" | "cat 1>&2"
        exit 1;
      }
      printf("%s %d
  ", $1, NR);
    }
    END {
      printf("#0 %d
  ", NR+1);
      printf("<s> %d
  ", NR+2);
      printf("</s> %d
  ", NR+3);
    }' > $dir/words.txt || exit 1;
  
  # In case there are extra word-level disambiguation symbols they also
  # need to be added to words.txt
  
  # format of $dir/words.txt:
  # <eps> 0
  # a 1
  # aa 2
  # aarvark 3
  # ...
  
  silphone=`cat $srcdir/optional_silence.txt` || exit 1;
  [ -z "$silphone" ] && \
    ( echo "You have no optional-silence phone; it is required in the current scripts"
      echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \
     exit 1;
  
  # create $dir/phones/align_lexicon.{txt,int}.
  # This is the method we use for lattice word alignment if we are not
  # using word-position-dependent phones.
  
  # First remove pron-probs from the lexicon.
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt
  
  # Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence,
  # and is not part of a word.
  [ ! -z "$silphone" ] && echo "<eps> $silphone" >> $tmpdir/align_lexicon.txt
  
  cat $tmpdir/align_lexicon.txt | \
    perl -ane '@A = split; print $A[0], " ", join(" ", @A), "
  ";' | sort | uniq > $dir/phones/align_lexicon.txt
  
  # create phones/align_lexicon.int from phones/align_lexicon.txt
  cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \
    utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int
  
  # Create the basic L.fst without disambiguation symbols, for use
  # in training.
  
  if $silprob; then
  #  # Add silence probabilities (models the prob. of silence before and after each
  #  # word).  On some setups this helps a bit.  See utils/dict_dir_add_pronprobs.sh
  #  # and where it's called in the example scripts (run.sh).
    echo "$0: Currently we do not support word-dependnet silence probability" && exit 1;
  else
    utils/lang/make_subword_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone --position-dependent\
              --separator=$separator $tmpdir/lexiconp.txt | \
      fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
        --keep_isymbols=false --keep_osymbols=false | \
      fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
  fi
  
  # The file oov.txt contains a word that we will map any OOVs to during
  # training.
  echo "$oov_word" > $dir/oov.txt || exit 1;
  cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1;
  # integer version of oov symbol, used in some scripts.
  
  # the file wdisambig.txt contains a (line-by-line) list of the text-form of the
  # disambiguation symbols that are used in the grammar and passed through by the
  # lexicon.  At this stage it's hardcoded as '#0', but we're laying the groundwork
  # for more generality (which probably would be added by another script).
  # wdisambig_words.int contains the corresponding list interpreted by the
  # symbol table words.txt, and wdisambig_phones.int contains the corresponding
  # list interpreted by the symbol table phones.txt.
  echo '#0' >$dir/phones/wdisambig.txt
  
  utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int
  utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int
  
  # Create these lists of phones in colon-separated integer list form too,
  # for purposes of being given to programs as command-line options.
  for f in silence nonsilence optional_silence disambig context_indep; do
    utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int
    utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
     awk '{printf(":%d", $1);} END{printf "
  "}' | sed s/:// > $dir/phones/$f.csl || exit 1;
  done
  
  for x in sets extra_questions; do
    utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1;
  done
  
  utils/sym2int.pl -f 3- $dir/phones.txt <$dir/phones/roots.txt \
     > $dir/phones/roots.int || exit 1;
  
  if [ -f $dir/phones/word_boundary_moved.txt ]; then
    utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary_moved.txt \
      > $dir/phones/word_boundary_moved.int || exit 1;
  fi
  
  silphonelist=`cat $dir/phones/silence.csl`
  nonsilphonelist=`cat $dir/phones/nonsilence.csl`
  
  # Note: it's OK, after generating the 'lang' directory, to overwrite the topo file
  # with another one of your choice if the 'topo' file you want can't be generated by
  # utils/gen_topo.pl.  We do this in the 'chain' recipes.  Of course, the 'topo' file
  # should cover all the phones.  Try running utils/validate_lang.pl to check that
  # everything is OK after modifying the topo file.
  utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$dir/topo
  
  # Create the lexicon FST with disambiguation symbols, and put it in lang_test.
  # There is an extra step where we create a loop to "pass through" the
  # disambiguation symbols from G.fst.
  
  if $silprob; then
    echo "$0: Currently we do not support word-dependnet silence probability" && exit 1;
  else
    utils/lang/make_subword_lexicon_fst.py $grammar_opts \
         --sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig --position-dependent \
         --separator=$separator $tmpdir/lexiconp_disambig.txt | \
       fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
         --keep_isymbols=false --keep_osymbols=false |   \
       fstaddselfloops  $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
       fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
  fi
  
  echo "$(basename $0): validating output directory"
  ! utils/validate_lang.pl $dir && echo "$(basename $0): error validating output" &&  exit 1;
  
  exit 0;