nnet-example-functions.h 12.2 KB
// nnet2/nnet-example-functions.h

// Copyright 2013  Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_NNET2_NNET_EXAMPLE_FUNCTIONS_H_
#define KALDI_NNET2_NNET_EXAMPLE_FUNCTIONS_H_

/** @file
    Note on how to parse this filename: it contains functions relatied to
    neural-net training examples, mostly discriminative neural-net training examples,
   i.e. type DiscriminativeNnetExample    
*/

#include "nnet2/nnet-nnet.h"
#include "util/table-types.h"
#include "lat/kaldi-lattice.h"
#include "nnet2/nnet-example.h"
#include "hmm/transition-model.h"
#include "hmm/posterior.h"

namespace kaldi {
namespace nnet2 {

// Glossary: mmi = Maximum Mutual Information,
//          mpfe = Minimum Phone Frame Error
//          smbr = State-level Minimum Bayes Risk


// This file relates to the creation of examples for discriminative training
// (see struct DiscriminativeNnetExample, in ./nnet-example.h).


/** Config structure for SplitExample, for splitting discriminative
    training examples.
*/
struct SplitDiscriminativeExampleConfig {
  // This is the maximum length in frames that any example is allowed to have.
  // We will split training examples to ensure that they are no longer than
  // this.  Note: if you make this too short it may have bad effects because
  // the posteriors start to become inaccurate at the edges of the training
  // example (since they will be based on the acoustic model that was used to
  // generate the lattices, not the current one).
  int32 max_length;

  // criterion can be "smbr" or "mpfe" or "mmi".  This info is only needed to
  // determine which parts of the lattices will not contribute to training and
  // can be discarded (for mpe/smbr, any part where the den-lat has only one
  // path or all den-lat paths map to the same pdf can be discareded; for mmi,
  // any part where the den-lat's pdfs all have the same value as the num-lat
  // pdf for that frame, can be discarded.
  std::string criterion;

  bool collapse_transition_ids;

  bool determinize;

  bool minimize; // we'll push and minimize if this is true.
  
  bool test;

  bool drop_frames; // For MMI, true if we will eventually drop frames in which
                    // the numerator does not appear in the denominator lattice.
                    // (i.e. we won't backpropagate any derivatives on those
                    // frames).  We may still need to include those frames in
                    // the computation in order to get correct posteriors for
                    // other parts of the lattice.

  bool split; // if false, we won't split at all.

  bool excise; // if false, we will skip the "excise" step.
  
  SplitDiscriminativeExampleConfig():
      max_length(1024), criterion("smbr"), collapse_transition_ids(true),
      determinize(true), minimize(true), test(false), drop_frames(false),
      split(true), excise(true) { }

  void Register(OptionsItf *opts) {

    opts->Register("max-length", &max_length, "Maximum length allowed for any "
                   "segment (i.e. max #frames for any example");
    //opts->Register("target-length", &target_length, "Target length for a "
    // "segment");
    opts->Register("criterion", &criterion, "Criterion, 'mmi'|'mpfe'|'smbr'. "
                   "Determines which frames may be dropped from lattices.");
    opts->Register("collapse-transition-ids", &collapse_transition_ids,
                   "This option included for debugging purposes");
    opts->Register("determinize", &determinize, "If true, we determinize "
                   "lattices (as Lattice) before splitting and possibly minimize");
    opts->Register("minimize", &minimize, "If true, we push and "
                   "minimize lattices (as Lattice) before splitting");
    opts->Register("test", &test, "If true, activate self-testing code.");
    // See "Sequence-discriminative training of deep neural networks", Vesely et al,
    // ICASSP 2013 for explanation of frame dropping.
    opts->Register("drop-frames", &drop_frames, "For MMI, if true we drop frames "
                   "with no overlap of num and den pdf-ids");
    opts->Register("split", &split, "Set to false to disable lattice-splitting.");
    opts->Register("excise", &excise, "Set to false to disable excising un-needed "
                   "frames (option included for debug purposes)");
  }
};

/// This struct exists only for diagnostic purposes.  Note: the stats assume
/// that you call SplitDiscriminative and ExciseDiscriminativeExample in the
/// same program, and the info printed out will be wrong if this is not the
/// case... this isn't ideal but it was more convenient.
struct SplitExampleStats {
  int32 num_lattices;
  int32 longest_lattice;
  int32 num_segments;
  int32 num_kept_segments;
  int64 num_frames_orig;
  int64 num_frames_must_keep;
  int64 num_frames_kept_after_split;
  int32 longest_segment_after_split;
  int64 num_frames_kept_after_excise;
  int32 longest_segment_after_excise;
  
  SplitExampleStats() { memset(this, 0, sizeof(*this)); }
  void Print();
};

/** Converts lattice to discriminative training example.  returns true on
    success, false on failure such as mismatched input (will also warn in this
    case). */
bool LatticeToDiscriminativeExample(
    const std::vector<int32> &alignment,
    const Matrix<BaseFloat> &feats,
    const CompactLattice &clat,
    BaseFloat weight,
    int32 left_context,
    int32 right_context,
    DiscriminativeNnetExample *eg);


/** Split a "discriminative example" into multiple pieces,
    splitting where the lattice has "pinch points".
 */
void SplitDiscriminativeExample(
    const SplitDiscriminativeExampleConfig &config,
    const TransitionModel &tmodel,
    const DiscriminativeNnetExample &eg,
    std::vector<DiscriminativeNnetExample> *egs_out,
    SplitExampleStats *stats_out);

/** Remove unnecessary frames from discriminative training
    example.  The output egs_out will be of size zero or one
    (usually one) after being called. */
void ExciseDiscriminativeExample(
    const SplitDiscriminativeExampleConfig &config,
    const TransitionModel &tmodel,
    const DiscriminativeNnetExample &eg,
    std::vector<DiscriminativeNnetExample> *egs_out,
    SplitExampleStats *stats_out);


/** Appends the given vector of examples (which must be non-empty) into 
    a single output example (called by CombineExamples, which might be
    a more convenient interface).

   When combining examples it directly appends the features, and then adds a
   "fake" segment to the lattice and alignment in between, padding with
   transition-ids that are all ones.  This is necessary in case the network
   needs acoustic context, and only because of a kind of limitation in the nnet
   training code that doesn't support varying 'chunk' sizes within a minibatch.

   Will fail if all the input examples don't have the same weight (this will
   normally be 1.0 anyway), or if the feature dimension (i.e. basic feature
   dimension plus spk_info dimension) differs between the examples.
*/
void AppendDiscriminativeExamples(
    const std::vector<const DiscriminativeNnetExample*> &input,
    DiscriminativeNnetExample *output);

/**
   This function is used to combine multiple discriminative-training
   examples (each corresponding to a segment of a lattice), into one.
   
   It combines examples into groups such that each group will have a
   total length (number of rows of the feature matrix) less than or
   equal to max_length.  However, if individual examples are longer
   than max_length they will still be processed; they will be given
   their own group.
   
   See also the documentation for AppendDiscriminativeExamples() which
   gives more details on how we append the examples.

   Will fail if all the input examples don't have the same weight (this will
   normally be 1.0 anyway).

   If the spk_info variables are non-empty, it will move them into the features
   of the output, so the spk_info of the output will be empty but the
   appropriate speaker vectors will be appended to each row of the features.  */

void CombineDiscriminativeExamples(
    int32 max_length,
    const std::vector<DiscriminativeNnetExample> &input,
    std::vector<DiscriminativeNnetExample> *output);
                     
/**
   This function solves the "packing problem" using the "first fit" algorithm.
   It groups together the indices 0 through sizes.size() - 1, such that the sum
   of cost within each group does not exceed max_lcost.  [However, if there
   are single examples that exceed max_cost, it puts them in their own bin].
   The algorithm is not particularly efficient-- it's more n^2 than n log(n)
   which it should be.  */
void SolvePackingProblem(BaseFloat max_cost,
                         const std::vector<BaseFloat> &costs,
                         std::vector<std::vector<size_t> > *groups);



/**
   Given a discriminative training example, this function works out posteriors
   at the pdf level (note: these are "discriminative-training posteriors" that
   may be positive or negative.  The denominator lattice "den_lat" in the
   example "eg" should already have had acoustic-rescoring done so that its
   acoustic probs are up to date, and any acoustic scaling should already have
   been applied.

   "criterion" may be "mmi" or "mpfe" or "smbr".  If criterion
   is "mmi", "drop_frames" means we don't include derivatives for frames
   where the numerator pdf is not in the denominator lattice.

   if "one_silence_class" is true you can get a newer behavior for MPE/SMBR
   which will tend to reduce insertions.

   "silence_phones" is a list of silence phones (this is only relevant for mpfe
   or smbr, if we want to treat silence specially).
 */
void ExampleToPdfPost(
    const TransitionModel &tmodel,
    const std::vector<int32> &silence_phones,
    std::string criterion,
    bool drop_frames,
    bool one_silence_class,
    const DiscriminativeNnetExample &eg,
    Posterior *post);

/**
   This function is used in code that tests the functionality that we provide
   here, about splitting and excising nnet examples.  It adds to a "hash
   function" that is a function of a set of examples; the hash function is of
   dimension (number of pdf-ids x features dimension).  The hash function
   consists of the (denominator - numerator) posteriors over pdf-ids, times the
   average over the context-window (left-context on the left, right-context on
   the right), of the features.  This is useful because the various
   manipulations we do are supposed to preserve this, and if there is a bug
   it will most likely cause the hash function to change.

   This function will resize the matrix if it is empty.

   Any acoustic scaling of the lattice should be done before you call this
   function.

   'criterion' should be 'mmi', 'mpfe', or 'smbr'.
   
   You should set drop_frames to true if you are doing MMI with drop-frames
   == true.  Then it will not compute the hash for frames where the numerator
   pdf-id is not in the denominator lattice.

   You can set one_silence_class to true for a newer optional behavior that will
   reduce insertions in the trained model (or false for the traditional
   behavior).

   The function will also accumulate the total numerator and denominator weights
   used as num_weight and den_weight, for an additional diagnostic, and the total
   number of frames, as tot_t.
*/
void UpdateHash(
    const TransitionModel &tmodel,
    const DiscriminativeNnetExample &eg,
    std::string criterion,
    bool drop_frames,
    bool one_silence_class,
    Matrix<double> *hash,
    double *num_weight,
    double *den_weight,
    double *tot_t);



} // namespace nnet2
} // namespace kaldi

#endif // KALDI_NNET2_NNET_EXAMPLE_FUNCTIONS_H_