Blame view

src/fstext/determinize-lattice.h 7.35 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
  // fstext/determinize-lattice.h
  
  // Copyright 2009-2011  Microsoft Corporation
  
  // See ../../COPYING for clarification regarding multiple authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //  http://www.apache.org/licenses/LICENSE-2.0
  //
  // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  // MERCHANTABLITY OR NON-INFRINGEMENT.
  // See the Apache 2 License for the specific language governing permissions and
  // limitations under the License.
  
  #ifndef KALDI_FSTEXT_DETERMINIZE_LATTICE_H_
  #define KALDI_FSTEXT_DETERMINIZE_LATTICE_H_
  #include <fst/fstlib.h>
  #include <fst/fst-decl.h>
  #include <algorithm>
  #include <map>
  #include <set>
  #include <vector>
  #include "fstext/lattice-weight.h"
  
  namespace fst {
  
  /// \addtogroup fst_extensions
  ///  @{
  
  
  // For example of usage, see test-determinize-lattice.cc
  
  /*
     DeterminizeLattice implements a special form of determinization
     with epsilon removal, optimized for a phase of lattice generation.
     Its input is an FST with weight-type BaseWeightType (usually a pair of floats,
     with a lexicographical type of order, such as LatticeWeightTpl<float>).
     Typically this would be a state-level lattice, with input symbols equal to
     words, and output-symbols equal to p.d.f's (so like the inverse of HCLG).  Imagine representing this as an
     acceptor of type CompactLatticeWeightTpl<float>, in which the input/output
     symbols are words, and the weights contain the original weights together with
     strings (with zero or one symbol in them) containing the original output labels
     (the p.d.f.'s).  We determinize this using acceptor determinization with
     epsilon removal.  Remember (from lattice-weight.h) that
     CompactLatticeWeightTpl has a special kind of semiring where we always take
     the string corresponding to the best cost (of type BaseWeightType), and
     discard the other.  This corresponds to taking the best output-label sequence
     (of p.d.f.'s) for each input-label sequence (of words).  We couldn't use the
     Gallic weight for this, or it would die as soon as it detected that the input
     FST was non-functional.  In our case, any acyclic FST (and many cyclic ones)
     can be determinized.
     We assume that there is a function
        Compare(const BaseWeightType &a, const BaseWeightType &b)
     that returns (-1, 0, 1) according to whether (a < b, a == b, a > b) in the
     total order on the BaseWeightType... this information should be the
     same as NaturalLess would give, but it's more efficient to do it this way.
     You can define this for things like TropicalWeight if you need to instantiate
     this class for that weight type.
  
     We implement this determinization in a special way to make it efficient for
     the types of FSTs that we will apply it to.  One issue is that if we
     explicitly represent the strings (in CompactLatticeWeightTpl) as vectors of
     type vector<IntType>, the algorithm takes time quadratic in the length of
     words (in states), because propagating each arc involves copying a whole
     vector (of integers representing p.d.f.'s).  Instead we use a hash structure
     where each string is a pointer (Entry*), and uses a hash from (Entry*,
     IntType), to the successor string (and a way to get the latest IntType and the
     ancestor Entry*).  [this is the class LatticeStringRepository].
  
     Another issue is that rather than representing a determinized-state as a
     collection of (state, weight), we represent it in a couple of reduced forms.
     Suppose a determinized-state is a collection of (state, weight) pairs; call
     this the "canonical representation".  Note: these collections are always
     normalized to remove any common weight and string part.  Define end-states as
     the subset of states that have an arc out of them with a label on, or are
     final.  If we represent a determinized-state a the set of just its (end-state,
     weight) pairs, this will be a valid and more compact representation, and will
     lead to a smaller set of determinized states (like early minimization).  Call
     this collection of (end-state, weight) pairs the "minimal representation".  As
     a mechanism to reduce compute, we can also consider another representation.
     In the determinization algorithm, we start off with a set of (begin-state,
     weight) pairs (where the "begin-states" are initial or have a label on the
     transition into them), and the "canonical representation" consists of the
     epsilon-closure of this set (i.e. follow epsilons).  Call this set of
     (begin-state, weight) pairs, appropriately normalized, the "initial
     representation".  If two initial representations are the same, the "canonical
     representation" and hence the "minimal representation" will be the same.  We
     can use this to reduce compute.  Note that if two initial representations are
     different, this does not preclude the other representations from being the same.
     
  */   
  
  struct DeterminizeLatticeOptions {
    float delta; // A small offset used to measure equality of weights.
    int max_mem; // If >0, determinization will fail and return false
    // when the algorithm's (approximate) memory consumption crosses this threshold.
    int max_loop; // If >0, can be used to detect non-determinizable input
    // (a case that wouldn't be caught by max_mem).
    DeterminizeLatticeOptions(): delta(kDelta),
                                 max_mem(-1),
                                 max_loop(-1) { }
  };
  
  /**
      This function implements the normal version of DeterminizeLattice, in which
      the output strings are represented using sequences of arcs, where all but
      the first one has an epsilon on the input side.  The debug_ptr argument is
      an optional pointer to a bool that, if it becomes true while the algorithm
      is executing, the algorithm will print a traceback and terminate (used in
      fstdeterminizestar.cc debug non-terminating determinization).  More
      efficient if ifst is arc-sorted on input label.  If the number of arcs gets
      more than max_states, it will throw std::runtime_error (otherwise this code
      does not use exceptions).  This is mainly useful for debug.  */
  template<class Weight, class IntType>
  bool DeterminizeLattice(
      const Fst<ArcTpl<Weight> > &ifst,
      MutableFst<ArcTpl<Weight> > *ofst,
      DeterminizeLatticeOptions opts = DeterminizeLatticeOptions(),
      bool *debug_ptr = NULL);
  
  
  /*  This is a version of DeterminizeLattice with a slightly more "natural" output format,
      where the output sequences are encoded using the CompactLatticeArcTpl template
      (i.e. the sequences of output symbols are represented directly as strings)
      More efficient if ifst is arc-sorted on input label.
      If the #arcs gets more than max_arcs, it will throw std::runtime_error (otherwise
      this code does not use exceptions).  This is mainly useful for debug.
  */
  template<class Weight, class IntType>
  bool DeterminizeLattice(
      const Fst<ArcTpl<Weight> >&ifst,
      MutableFst<ArcTpl<CompactLatticeWeightTpl<Weight, IntType> > > *ofst,
      DeterminizeLatticeOptions opts = DeterminizeLatticeOptions(),
      bool *debug_ptr = NULL);
  
  
  
  
  /// @} end "addtogroup fst_extensions"
  
  } // end namespace fst
  
  #include "fstext/determinize-lattice-inl.h"
  
  #endif