grammar-context-fst.h 12.2 KB
// fstext/grammar-context-fst.h

// Copyright   2018  Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//


#ifndef KALDI_FSTEXT_GRAMMAR_CONTEXT_FST_H_
#define KALDI_FSTEXT_GRAMMAR_CONTEXT_FST_H_

/* This header defines a special form of the context FST "C" (the "C" in "HCLG")
   that integrates with our framework for building dynamic graphs for grammars
   that are too big to statically create, or graphs with on-the-fly pieces that
   you want to create at recognition time without building the whole graph.

   This framework is limited to only work with models with left-biphone context.
   (Fortunately this doesn't impact results, as our best models are all 'chain'
   models with left biphone context).

   The main code exported from here is the class InverseLeftBiphoneContextFst,
   which is similar to the InverseContextFst defined in context-fst.h, but
   is limited to left-biphone context and also supports certain special
   extensions we need to compile grammars.

   See \ref grammar (../doc/grammar.dox) for high-level
   documentation on how this framework works.
*/


#include <algorithm>
#include <string>
#include <vector>
#include <fst/fstlib.h>
#include <fst/fst-decl.h>

#include "util/const-integer-set.h"
#include "fstext/deterministic-fst.h"
#include "fstext/context-fst.h"

namespace fst {


/**
   An anonymous enum to define some values for symbols used in our grammar-fst
   framework.  Please understand this with reference to the documentation in
   \ref grammar (../doc/grammar.dox).  This enum defines
   the values of nonterminal-related symbols in phones.txt.  They are not
   the actual values-- they will be shifted by adding the value
   nonterm_phones_offset which is passed in by the command-line flag
   --nonterm-phones-offset.

 */

enum NonterminalValues {
  kNontermBos = 0,  // #nonterm_bos
  kNontermBegin = 1,  // #nonterm_begin
  kNontermEnd = 2,  // #nonterm_end
  kNontermReenter = 3,  // #nonterm_reenter
  kNontermUserDefined = 4,   // the lowest-numbered user-defined nonterminal, e.g. #nonterm:foo
  // kNontermMediumNumber and kNontermBigNumber come into the encoding of
  // nonterminal-related symbols in HCLG.fst.  The only hard constraint on them
  // is that kNontermBigNumber must be bigger than the biggest transition-id in
  // your system, and kNontermMediumNumber must be >0.  These values were chosen
  // for ease of human inspection of numbers encoded with them.
  kNontermMediumNumber = 1000,
  kNontermBigNumber = 10000000
};



// Returns the smallest multiple of 1000 that is strictly greater than
// nonterm_phones_offset.  Used in the encoding of special symbol in HCLG;
// they are encoded as
//  special_symbol =
//     kNontermBigNumber + (nonterminal * encoding_multiple) + phone_index
inline int32 GetEncodingMultiple(int32 nonterm_phones_offset) {
  int32 medium_number = static_cast<int32>(kNontermMediumNumber);
  return medium_number *
      ((nonterm_phones_offset + medium_number) / medium_number);
}

/**
   This is a variant of the function ComposeContext() which is to be used
   with our "grammar FST" framework (see \ref graph_context, i.e.
   ../doc/grammar.dox, for more details).  This does not take
   the 'context_width' and 'central_position' arguments because they are
   assumed to be 2 and 1 respectively (meaning, left-biphone phonetic context).

   This function creates a context FST and composes it on the left with "ifst"
   to make "ofst".

    @param [in] nonterm_phones_offset  The integer id of the symbol
                  #nonterm_bos in the phones.txt file.  You can just set this
                  to a large value (like 1 million) if you are not actually using
                  nonterminals (e.g. for testing purposes).
    @param [in] disambig_syms  List of disambiguation symbols, e.g. the integer
                 ids of #0, #1, #2 ... in the phones.txt.
    @param [in,out] ifst   The FST we are composing with C (e.g. LG.fst).
    @param [out] ofst   Composed output FST (would be CLG.fst).
    @param [out] ilabels  Vector, indexed by ilabel of CLG.fst, providing information
                  about the meaning of that ilabel; see \ref tree_ilabel
                  (http://kaldi-asr.org/doc/tree_externals.html#tree_ilabel)
                  and also \ref grammar_special_clg
                  (http://kaldi-asr.org/doc/grammar#grammar_special_clg).
  */
void ComposeContextLeftBiphone(
    int32 nonterm_phones_offset,
    const vector<int32> &disambig_syms,
    const VectorFst<StdArc> &ifst,
    VectorFst<StdArc> *ofst,
    vector<vector<int32> > *ilabels);



/*
   InverseLeftBiphoneContextFst represents the inverse of the context FST "C" (the "C" in
   "HCLG") which transduces from symbols representing phone context windows
   (e.g. "a, b, c") to individual phones, e.g. "a".  So InverseContextFst
   transduces from phones to symbols representing phone context windows.  The
   point is that the inverse is deterministic, so the DeterministicOnDemandFst
   interface is applicable, which turns out to be a convenient way to implement
   this.

   This doesn't implement the full Fst interface, it implements the
   DeterministicOnDemandFst interface which is much simpler and which is
   sufficient for what we need to do with this.

   Search for "hbka.pdf" ("Speech Recognition with Weighted Finite State
   Transducers") by M. Mohri, for more context.
*/

class InverseLeftBiphoneContextFst: public DeterministicOnDemandFst<StdArc> {
public:
  typedef StdArc Arc;
  typedef typename StdArc::StateId StateId;
  typedef typename StdArc::Weight Weight;
  typedef typename StdArc::Label Label;

  /**
     Constructor.  This does not take the arguments 'context_width' or
     'central_position' because they are assumed to be (2, 1) meaning a
     system with left-biphone context; and there is no subsequential
     symbol because it is not needed in systems without right context.

        @param [in] nonterm_phones_offset  The integer id of the symbol
                  #nonterm_bos in the phones.txt file. You can just set this to
                  a large value (like 1 million) if you are not actually using
                  nonterminals (e.g. for testing purposes).
        @param [in] phones      List of integer ids of phones, as you would see in phones.txt
        @param [in] disambig_syms   List of integer ids of disambiguation symbols,
                                   e.g. the ids of #0, #1, #2 in phones.txt

     See \ref graph_context for more details.
  */
  InverseLeftBiphoneContextFst(Label nonterm_phones_offset,
                               const vector<int32>& phones,
                               const vector<int32>& disambig_syms);

  /**
     Here is a note on the state space of InverseLeftBiphoneContextFst;
     see \ref grammar_special_c which has some documentation on this.

     The state space uses the same numbering as phones.txt.

       State 0 means the beginning-of-sequence state, where there is no left
       context.

       For each phone p in the list 'phones' passed to the constructor (i.e. in
       the set passed to the constructor), the state 'p' corresponds to a
       left-context of that phone.

       If p is equal to nonterm_phones_offset_ + kNontermBegin (i.e. the
       integer form of `\#nonterm_begin`), then this is the state we transition
       to when we see that symbol starting from left-context==0 (no context).  The
       transition to this special state will have epsilon on the output.  (talking
       here about inv(C), not C, so input/output are reversed).
       The state is nonfinal and when we see a regular phone p1 or #nonterm_bos, instead of
       outputting that phone in context, we output the pair (#nonterm_begin,p1) or
       (#nonterm_begin,#nonterm_bos).  This state is not final.

       If p is equal to nonterm_phones_offset_ + kNontermUserDefined, then this
       is the state we transition to when we see any user-defined nonterminal.
       Transitions to this special state have olabels of the form (#nonterm:foo,p1)
       where p1 is the preceding context (with #nonterm_begin if that context was
       0); transitions out of it have olabels of the form (#nonterm_reenter,p2), where
       p2 is the phone on the ilabel of that transition.  Again: talking about inv(C).
       This state is not final.

       If p is equal to nonterm_phones_offset_ + kNontermEnd, then this is
       the state we transition to when we see the ilabel #nonterm_end.  The olabels
       on the transitions to it (talking here about inv(C), so ilabels and olabels
       are reversed) are of the form (#nonterm_end, p1) where p1 corresponds to the
       context we were in.  This state is final.
   */


  virtual StateId Start() { return 0; }

  virtual Weight Final(StateId s);

  /// Note: ilabel must not be epsilon.
  virtual bool GetArc(StateId s, Label ilabel, Arc *arc);

  ~InverseLeftBiphoneContextFst() { }

  // Returns a reference to a vector<vector<int32> > with information about all
  // the input symbols of C (i.e. all the output symbols of this
  // InverseContextFst).  See
  // "http://kaldi-asr.org/doc/tree_externals.html#tree_ilabel".
  const vector<vector<int32> > &IlabelInfo() const {
    return ilabel_info_;
  }

  // A way to destructively obtain the ilabel-info.  Only do this if you
  // are just about to destroy this object.
  void SwapIlabelInfo(vector<vector<int32> > *vec) { ilabel_info_.swap(*vec); }

private:

  inline int32 GetPhoneSymbolFor(enum NonterminalValues n) {
    return nonterm_phones_offset_ + static_cast<int32>(n);
  }

  /// Finds the label index corresponding to this context-window of phones
  /// (likely of width context_width_).  Inserts it into the
  /// ilabel_info_/ilabel_map_ tables if necessary.
  Label FindLabel(const vector<int32> &label_info);


  // Map type to map from vectors of int32 (representing ilabel-info,
  // see http://kaldi-asr.org/doc/tree_externals.html#tree_ilabel) to
  // Label (the output label in this FST).
  typedef unordered_map<vector<int32>, Label,
                        kaldi::VectorHasher<int32> > VectorToLabelMap;


  // The following three variables were also passed in by the caller:
  int32 nonterm_phones_offset_;

  // 'phone_syms_' are a set of phone-ids, typically 1, 2, .. num_phones.
  kaldi::ConstIntegerSet<Label> phone_syms_;

  // disambig_syms_ is the set of integer ids of the disambiguation symbols,
  // usually represented in text form as #0, #1, #2, etc.  These are inserted
  // into the grammar (for #0) and the lexicon (for #1, #2, ...) in order to
  // make the composed FSTs determinizable.  They are treated "specially" by the
  // context FST in that they are not part of the context, they are just "passed
  // through" via self-loops.  See the Mohri chapter mrentioned above for more
  // information.
  kaldi::ConstIntegerSet<Label> disambig_syms_;


  // maps from vector<int32>, representing phonetic contexts of length
  // context_width_ - 1, to Label.  These are actually the output labels of this
  // InverseContextFst (because of the "Inverse" part), but for historical
  // reasons and because we've used the term ilabels" in the documentation, we
  // still call these "ilabels").
  VectorToLabelMap ilabel_map_;

  // ilabel_info_ is the reverse map of ilabel_map_.
  // Indexed by olabel (although we call this ilabel_info_ for historical
  // reasons and because is for the ilabels of C), ilabel_info_[i] gives
  // information about the meaning of each symbol on the input of C
  // aka the output of inv(C).
  // See "http://kaldi-asr.org/doc/tree_externals.html#tree_ilabel".
  vector<vector<int32> > ilabel_info_;

};

}  // namespace fst


#endif  // KALDI_FSTEXT_GRAMMAR_CONTEXT_FST_H_