lattice-functions.h 21.4 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426
// lat/lattice-functions.h

// Copyright 2009-2012   Saarland University (author: Arnab Ghoshal)
//           2012-2013   Johns Hopkins University (Author: Daniel Povey);
//                       Bagher BabaAli
//                2014   Guoguo Chen

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#ifndef KALDI_LAT_LATTICE_FUNCTIONS_H_
#define KALDI_LAT_LATTICE_FUNCTIONS_H_

#include <vector>
#include <map>

#include "base/kaldi-common.h"
#include "hmm/posterior.h"
#include "fstext/fstext-lib.h"
#include "hmm/transition-model.h"
#include "lat/kaldi-lattice.h"
#include "itf/decodable-itf.h"

namespace kaldi {

/**
   This function extracts the per-frame log likelihoods from a linear
   lattice (which we refer to as an 'nbest' lattice elsewhere in Kaldi code).
   The dimension of *per_frame_loglikes will be set to the
   number of input symbols in 'nbest'.  The elements of
   '*per_frame_loglikes' will be set to the .Value2() elements of the lattice
   weights, which represent the acoustic costs; you may want to scale this
   vector afterward by -1/acoustic_scale to get the original loglikes.
   If there are acoustic costs on input-epsilon arcs or the final-prob in 'nbest'
   (and this should not normally be the case in situations where it makes
   sense to call this function), they will be included to the cost of the
   preceding input symbol, or the following input symbol for input-epsilons
   encountered prior to any input symbol.  If 'nbest' has no input symbols,
   'per_frame_loglikes' will be set to the empty vector.
**/
void GetPerFrameAcousticCosts(const Lattice &nbest,
                              Vector<BaseFloat> *per_frame_loglikes);

/// This function iterates over the states of a topologically sorted lattice and
/// counts the time instance corresponding to each state. The times are returned
/// in a vector of integers 'times' which is resized to have a size equal to the
/// number of states in the lattice. The function also returns the maximum time
/// in the lattice (this will equal the number of frames in the file).
int32 LatticeStateTimes(const Lattice &lat, std::vector<int32> *times);

/// As LatticeStateTimes, but in the CompactLattice format.  Note: must
/// be topologically sorted.  Returns length of the utterance in frames, which
/// might not be the same as the maximum time in the lattice, due to frames
/// in the final-prob.
int32 CompactLatticeStateTimes(const CompactLattice &clat,
                               std::vector<int32> *times);

/// This function does the forward-backward over lattices and computes the
/// posterior probabilities of the arcs. It returns the total log-probability
/// of the lattice.  The Posterior quantities contain pairs of (transition-id, weight)
/// on each frame.
/// If the pointer "acoustic_like_sum" is provided, this value is set to
/// the sum over the arcs, of the posterior of the arc times the
/// acoustic likelihood [i.e. negated acoustic score] on that link.
/// This is used in combination with other quantities to work out
/// the objective function in MMI discriminative training.
BaseFloat LatticeForwardBackward(const Lattice &lat,
                                 Posterior *arc_post,
                                 double *acoustic_like_sum = NULL);

// This function is something similar to LatticeForwardBackward(), but it is on
// the CompactLattice lattice format. Also we only need the alpha in the forward
// path, not the posteriors.
bool ComputeCompactLatticeAlphas(const CompactLattice &lat,
                                 std::vector<double> *alpha);

// A sibling of the function CompactLatticeAlphas()... We compute the beta from
// the backward path here.
bool ComputeCompactLatticeBetas(const CompactLattice &lat,
                                std::vector<double> *beta);


// Computes (normal or Viterbi) alphas and betas; returns (total-prob, or
// best-path negated cost) Note: in either case, the alphas and betas are
// negated costs.  Requires that lat be topologically sorted.  This code
// will work for either CompactLattice or Latice.
template<typename LatticeType>
double ComputeLatticeAlphasAndBetas(const LatticeType &lat,
                                    bool viterbi,
                                    std::vector<double> *alpha,
                                    std::vector<double> *beta);


/// Topologically sort the compact lattice if not already topologically sorted.
/// Will crash if the lattice cannot be topologically sorted.
void TopSortCompactLatticeIfNeeded(CompactLattice *clat);


/// Topologically sort the lattice if not already topologically sorted.
/// Will crash if lattice cannot be topologically sorted.
void TopSortLatticeIfNeeded(Lattice *clat);

/// Returns the depth of the lattice, defined as the average number of arcs (or
/// final-prob strings) crossing any given frame.  Returns 1 for empty lattices.
/// Requires that clat is topologically sorted!
BaseFloat CompactLatticeDepth(const CompactLattice &clat,
                              int32 *num_frames = NULL);

/// This function returns, for each frame, the number of arcs crossing that
/// frame.
void CompactLatticeDepthPerFrame(const CompactLattice &clat,
                                 std::vector<int32> *depth_per_frame);


/// This function limits the depth of the lattice, per frame: that means, it
/// does not allow more than a specified number of arcs active on any given
/// frame.  This can be used to reduce the size of the "very deep" portions of
/// the lattice.
void CompactLatticeLimitDepth(int32 max_arcs_per_frame,
                              CompactLattice *clat);


/// Given a lattice, and a transition model to map pdf-ids to phones,
/// outputs for each frame the set of phones active on that frame.  If
/// sil_phones (which must be sorted and uniq) is nonempty, it excludes
/// phones in this list.
void LatticeActivePhones(const Lattice &lat, const TransitionModel &trans,
                         const std::vector<int32> &sil_phones,
                         std::vector<std::set<int32> > *active_phones);

/// Given a lattice, and a transition model to map pdf-ids to phones,
/// replace the output symbols (presumably words), with phones; we
/// use the TransitionModel to work out the phone sequence.  Note
/// that the phone labels are not exactly aligned with the phone
/// boundaries.  We put a phone label to coincide with any transition
/// to the final, nonemitting state of a phone (this state always exists,
/// we ensure this in HmmTopology::Check()).  This would be the last
/// transition-id in the phone if reordering is not done (but typically
/// we do reorder).
/// Also see PhoneAlignLattice, in phone-align-lattice.h.
void ConvertLatticeToPhones(const TransitionModel &trans_model,
                            Lattice *lat);

/// Prunes a lattice or compact lattice.  Returns true on success, false if
/// there was some kind of failure.
template<class LatticeType>
bool PruneLattice(BaseFloat beam, LatticeType *lat);


/// Given a lattice, and a transition model to map pdf-ids to phones,
/// replace the sequences of transition-ids with sequences of phones.
/// Note that this is different from ConvertLatticeToPhones, in that
/// we replace the transition-ids not the words.
void ConvertCompactLatticeToPhones(const TransitionModel &trans_model,
                                   CompactLattice *clat);

/// Boosts LM probabilities by b * [number of frame errors]; equivalently, adds
/// -b*[number of frame errors] to the graph-component of the cost of each arc/path.
/// There is a frame error if a particular transition-id on a particular frame
/// corresponds to a phone not matching transcription's alignment for that frame.
/// This is used in "margin-inspired" discriminative training, esp. Boosted MMI.
/// The TransitionModel is used to map transition-ids in the lattice
/// input-side to phones; the phones appearing in
/// "silence_phones" are treated specially in that we replace the frame error f
/// (either zero or 1) for a frame, with the minimum of f or max_silence_error.
/// For the normal recipe, max_silence_error would be zero.
/// Returns true on success, false if there was some kind of mismatch.
/// At input, silence_phones must be sorted and unique.
bool LatticeBoost(const TransitionModel &trans,
                  const std::vector<int32> &alignment,
                  const std::vector<int32> &silence_phones,
                  BaseFloat b,
                  BaseFloat max_silence_error,
                  Lattice *lat);


/**
   This function implements either the MPFE (minimum phone frame error) or SMBR
   (state-level minimum bayes risk) forward-backward, depending on whether
   "criterion" is "mpfe" or "smbr".  It returns the MPFE
   criterion of SMBR criterion for this utterance, and outputs the posteriors (which
   may be positive or negative) into "post".

   @param [in] trans    The transition model. Used to map the
                        transition-ids to phones or pdfs.
   @param [in] silence_phones   A list of integer ids of silence phones. The
                        silence frames i.e. the frames where num_ali
                        corresponds to a silence phones are treated specially.
                        The behavior is determined by 'one_silence_class'
                        being false (traditional behavior) or true.
                        Usually in our setup, several phones including
                        the silence, vocalized noise, non-spoken noise
                        and unk are treated as "silence phones"
   @param [in] lat      The denominator lattice
   @param [in] num_ali  The numerator alignment
   @param [in] criterion    The objective function. Must be "mpfe" or "smbr"
                        for MPFE (minimum phone frame error) or sMBR
                        (state minimum bayes risk) training.
   @param [in] one_silence_class   Determines how the silence frames are treated.
                        Setting this to false gives the old traditional behavior,
                        where the silence frames (according to num_ali) are
                        treated as incorrect. However, this means that the
                        insertions are not penalized by the objective.
                        Setting this to true gives the new behaviour, where we
                        treat silence as any other phone, except that all pdfs
                        of silence phones are collapsed into a single class for
                        the frame-error computation. This can possible reduce
                        the insertions in the trained model. This is closer to
                        the WER metric that we actually care about, since WER is
                        generally computed after filtering out noises, but
                        does penalize insertions.
    @param [out] post   The "MBR posteriors" i.e. derivatives w.r.t to the
                        pseudo log-likelihoods of states at each frame.
*/
BaseFloat LatticeForwardBackwardMpeVariants(
    const TransitionModel &trans,
    const std::vector<int32> &silence_phones,
    const Lattice &lat,
    const std::vector<int32> &num_ali,
    std::string criterion,
    bool one_silence_class,
    Posterior *post);

/**
   This function can be used to compute posteriors for MMI, with a positive contribution
   for the numerator and a negative one for the denominator.  This function is not actually
   used in our normal MMI training recipes, where it's instead done using various command
   line programs that each do a part of the job.  This function was written for use in
   neural-net MMI training.

   @param [in] trans    The transition model. Used to map the
                        transition-ids to phones or pdfs.
   @param [in] lat      The denominator lattice
   @param [in] num_ali  The numerator alignment
   @param [in] drop_frames   If "drop_frames" is true, it will not compute any
                        posteriors on frames where the num and den have disjoint
                        pdf-ids.
   @param [in] convert_to_pdf_ids   If "convert_to_pdfs_ids" is true, it will
                        convert the output to be at the level of pdf-ids, not
                        transition-ids.
   @param [in] cancel   If "cancel" is true, it will cancel out any positive and
                        negative parts from the same transition-id (or pdf-id,
                        if convert_to_pdf_ids == true).
   @param [out] arc_post   The output MMI posteriors of transition-ids (or
                        pdf-ids if convert_to_pdf_ids == true) at each frame
                        i.e. the difference between the numerator
                        and denominator posteriors.

   It returns the forward-backward likelihood of the lattice. */
BaseFloat LatticeForwardBackwardMmi(
    const TransitionModel &trans,
    const Lattice &lat,
    const std::vector<int32> &num_ali,
    bool drop_frames,
    bool convert_to_pdf_ids,
    bool cancel,
    Posterior *arc_post);


/// This function takes a CompactLattice that should only contain a single
/// linear sequence (e.g. derived from lattice-1best), and that should have been
/// processed so that the arcs in the CompactLattice align correctly with the
/// word boundaries (e.g. by lattice-align-words).  It outputs 3 vectors of the
/// same size, which give, for each word in the lattice (in sequence), the word
/// label and the begin time and length in frames.  This is done even for zero
/// (epsilon) words, generally corresponding to optional silence-- if you don't
/// want them, just ignore them in the output.
/// This function will print a warning and return false, if the lattice
/// did not have the correct format (e.g. if it is empty or it is not
/// linear).
bool CompactLatticeToWordAlignment(const CompactLattice &clat,
                                   std::vector<int32> *words,
                                   std::vector<int32> *begin_times,
                                   std::vector<int32> *lengths);

/// This function takes a CompactLattice that should only contain a single
/// linear sequence (e.g. derived from lattice-1best), and that should have been
/// processed so that the arcs in the CompactLattice align correctly with the
/// word boundaries (e.g. by lattice-align-words).  It outputs 4 vectors of the
/// same size, which give, for each word in the lattice (in sequence), the word
/// label, the begin time and length in frames, and the pronunciation (sequence
/// of phones).  This is done even for zero words, corresponding to optional
/// silences -- if you don't want them, just ignore them in the output.
/// This function will print a warning and return false, if the lattice
/// did not have the correct format (e.g. if it is empty or it is not
/// linear).
bool CompactLatticeToWordProns(
    const TransitionModel &tmodel,
    const CompactLattice &clat,
    std::vector<int32> *words,
    std::vector<int32> *begin_times,
    std::vector<int32> *lengths,
    std::vector<std::vector<int32> > *prons,
    std::vector<std::vector<int32> > *phone_lengths);


/// A form of the shortest-path/best-path algorithm that's specially coded for
/// CompactLattice.  Requires that clat be acyclic.
void CompactLatticeShortestPath(const CompactLattice &clat,
                                CompactLattice *shortest_path);

/// This function add the word insertion penalty to graph score of each word
/// in the compact lattice
void AddWordInsPenToCompactLattice(BaseFloat word_ins_penalty,
                                   CompactLattice *clat);

/// This function *adds* the negated scores obtained from the Decodable object,
/// to the acoustic scores on the arcs.  If you want to replace them, you should
/// use ScaleCompactLattice to first set the acoustic scores to zero.  Returns
/// true on success, false on error (typically some kind of mismatched inputs).
bool RescoreCompactLattice(DecodableInterface *decodable,
                           CompactLattice *clat);


/// This function returns the number of words in the longest sentence in a
/// CompactLattice (i.e. the the maximum of any path, of the count of
/// olabels on that path).
int32 LongestSentenceLength(const Lattice &lat);

/// This function returns the number of words in the longest sentence in a
/// CompactLattice, i.e. the the maximum of any path, of the count of
/// labels on that path... note, in CompactLattice, the ilabels and olabels
/// are identical because it is an acceptor.
int32 LongestSentenceLength(const CompactLattice &lat);


/// This function is like RescoreCompactLattice, but it is modified to avoid
/// computing probabilities on most frames where all the pdf-ids are the same.
/// (it needs the transition-model to work out whether two transition-ids map to
/// the same pdf-id, and it assumes that the lattice has transition-ids on it).
/// The naive thing would be to just set all probabilities to zero on frames
/// where all the pdf-ids are the same (because this value won't affect the
/// lattice posterior).  But this would become confusing when we compute
/// corpus-level diagnostics such as the MMI objective function.  Instead,
/// imagine speedup_factor = 100 (it must be >= 1.0)... with probability (1.0 /
/// speedup_factor) we compute those likelihoods and multiply them by
/// speedup_factor; otherwise we set them to zero.  This gives the right
/// expected probability so our corpus-level diagnostics will be about right.
bool RescoreCompactLatticeSpeedup(
    const TransitionModel &tmodel,
    BaseFloat speedup_factor,
    DecodableInterface *decodable,
    CompactLattice *clat);


/// This function *adds* the negated scores obtained from the Decodable object,
/// to the acoustic scores on the arcs.  If you want to replace them, you should
/// use ScaleCompactLattice to first set the acoustic scores to zero.  Returns
/// true on success, false on error (e.g. some kind of mismatched inputs).
/// The input labels, if nonzero, are interpreted as transition-ids or whatever
/// other index the Decodable object expects.
bool RescoreLattice(DecodableInterface *decodable,
                    Lattice *lat);

/// This function Composes a CompactLattice format lattice with a
/// DeterministicOnDemandFst<fst::StdFst> format fst, and outputs another
/// CompactLattice format lattice. The first element (the one that corresponds
/// to LM weight) in CompactLatticeWeight is used for composition.
///
/// Note that the DeterministicOnDemandFst interface is not "const", therefore
/// we cannot use "const" for <det_fst>.
void ComposeCompactLatticeDeterministic(
    const CompactLattice& clat,
    fst::DeterministicOnDemandFst<fst::StdArc>* det_fst,
    CompactLattice* composed_clat);

/// This function computes the mapping from the pair 
/// (frame-index, transition-id) to the pair 
/// (sum-of-acoustic-scores, num-of-occurences) over all occurences of the 
/// transition-id in that frame.
/// frame-index in the lattice. 
/// This function is useful for retaining the acoustic scores in a 
/// non-compact lattice after a process like determinization where the 
/// frame-level acoustic scores are typically lost.
/// The function ReplaceAcousticScoresFromMap is used to restore the 
/// acoustic scores computed by this function.
///
///   @param [in] lat   Input lattice. Expected to be top-sorted. Otherwise the 
///                     function will crash. 
///   @param [out] acoustic_scores  
///                     Pointer to a map from the pair (frame-index,
///                     transition-id) to a pair (sum-of-acoustic-scores,
///                     num-of-occurences).
///                     Usually the acoustic scores for a pdf-id (and hence
///                     transition-id) on a frame will be the same for all the
///                     occurences of the pdf-id in that frame. 
///                     But if not, we will take the average of the acoustic
///                     scores. Hence, we store both the sum-of-acoustic-scores
///                     and the num-of-occurences of the transition-id in that
///                     frame.
void ComputeAcousticScoresMap(
    const Lattice &lat,
    unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>,
                                        PairHasher<int32> > *acoustic_scores);

/// This function restores acoustic scores computed using the function
/// ComputeAcousticScoresMap into the lattice.
///
///   @param [in] acoustic_scores  
///                      A map from the pair (frame-index, transition-id) to a
///                      pair (sum-of-acoustic-scores, num-of-occurences) of 
///                      the occurences of the transition-id in that frame.
///                      See the comments for ComputeAcousticScoresMap for 
///                      details.
///   @param [out] lat   Pointer to the output lattice.
void ReplaceAcousticScoresFromMap(
    const unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>,
                                        PairHasher<int32> > &acoustic_scores,
    Lattice *lat);

}  // namespace kaldi

#endif  // KALDI_LAT_LATTICE_FUNCTIONS_H_