Yannick Estève / ONTRAC-Kaldi

Blame view

src/chainbin/nnet3-chain-compute-post.cc 10.6 KB
  // chainbin/nnet3-chain-compute-post.cc
  
  // Copyright 2012-2015   Johns Hopkins University (author: Daniel Povey)
  //                2015   Vimal Manohar
  
  // See ../../COPYING for clarification regarding multiple authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //  http://www.apache.org/licenses/LICENSE-2.0
  //
  // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  // MERCHANTABLITY OR NON-INFRINGEMENT.
  // See the Apache 2 License for the specific language governing permissions and
  // limitations under the License.
  
  
  #include "base/kaldi-common.h"
  #include "util/common-utils.h"
  #include "nnet3/nnet-am-decodable-simple.h"
  #include "base/timer.h"
  #include "nnet3/nnet-utils.h"
  #include "chain/chain-denominator.h"
  
  
  int main(int argc, char *argv[]) {
    try {
      using namespace kaldi;
      using namespace kaldi::nnet3;
      typedef kaldi::int32 int32;
      typedef kaldi::int64 int64;
  
      const char *usage =
          "Compute posteriors from 'denominator FST' of chain model and optionally "
          "map them to phones.
  "
          "
  "
          "Usage: nnet3-chain-compute-post [options] <nnet-in> <den-fst> <features-rspecifier> <matrix-wspecifier>
  "
          " e.g.: nnet3-chain-compute-post --transform-mat=transform.mat final.raw den.fst scp:feats.scp ark:nnet_prediction.ark
  "
          "See also: nnet3-compute
  "
          "See steps/nnet3/chain/get_phone_post.sh for example of usage.
  "
          "Note: this program makes *extremely inefficient* use of the GPU.
  "
          "You are advised to run this on CPU until it's improved.
  ";
  
      ParseOptions po(usage);
      Timer timer;
  
      BaseFloat leaky_hmm_coefficient = 0.1;
      NnetSimpleComputationOptions opts;
      opts.acoustic_scale = 1.0; // by default do no acoustic scaling.
  
      std::string use_gpu = "yes";
  
      std::string transform_mat_rxfilename;
      std::string ivector_rspecifier,
                  online_ivector_rspecifier,
                  utt2spk_rspecifier;
      int32 online_ivector_period = 0;
      opts.Register(&po);
  
      po.Register("ivectors", &ivector_rspecifier, "Rspecifier for "
                  "iVectors as vectors (i.e. not estimated online); per utterance "
                  "by default, or per speaker if you provide the --utt2spk option.");
      po.Register("utt2spk", &utt2spk_rspecifier, "Rspecifier for "
                  "utt2spk option used to get ivectors per speaker");
      po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier for "
                  "iVectors estimated online, as matrices.  If you supply this,"
                  " you must set the --online-ivector-period option.");
      po.Register("online-ivector-period", &online_ivector_period, "Number of frames "
                  "between iVectors in matrices supplied to the --online-ivectors "
                  "option");
      po.Register("use-gpu", &use_gpu,
                  "yes|no|optional|wait, only has effect if compiled with CUDA");
      po.Register("leaky-hmm-coefficient", &leaky_hmm_coefficient, "'Leaky HMM' "
                  "coefficient: smaller values will tend to lead to more "
                  "confident posteriors.  0.1 is what we normally use in "
                  "training.");
      po.Register("transform-mat", &transform_mat_rxfilename, "Location to read "
                  "the matrix to transform posteriors to phones.  Matrix is "
                  "of dimension num-phones by num-pdfs.");
  
      po.Read(argc, argv);
  
      if (po.NumArgs() != 4) {
        po.PrintUsage();
        exit(1);
      }
  
  #if HAVE_CUDA==1
      CuDevice::Instantiate().SelectGpuId(use_gpu);
  #endif
  
      std::string nnet_rxfilename = po.GetArg(1),
          den_fst_rxfilename = po.GetArg(2),
          feature_rspecifier = po.GetArg(3),
          matrix_wspecifier = po.GetArg(4);
  
      Nnet nnet;
      ReadKaldiObject(nnet_rxfilename, &nnet);
  
      SetBatchnormTestMode(true, &nnet);
      SetDropoutTestMode(true, &nnet);
      CollapseModel(CollapseModelConfig(), &nnet);
  
      RandomAccessBaseFloatMatrixReader online_ivector_reader(
          online_ivector_rspecifier);
      RandomAccessBaseFloatVectorReaderMapped ivector_reader(
          ivector_rspecifier, utt2spk_rspecifier);
  
      CachingOptimizingCompiler compiler(nnet, opts.optimize_config);
  
      chain::ChainTrainingOptions chain_opts;
      // the only option that actually gets used here is
      // opts_.leaky_hmm_coefficient, and that's the only one we expose on the
      // command line.
      chain_opts.leaky_hmm_coefficient = leaky_hmm_coefficient;
  
      fst::StdVectorFst den_fst;
      ReadFstKaldi(den_fst_rxfilename, &den_fst);
      int32 num_pdfs = nnet.OutputDim("output");
      if (num_pdfs < 0) {
        KALDI_ERR << "Neural net '" << nnet_rxfilename
                  << "' has no output named 'output'";
      }
      chain::DenominatorGraph den_graph(den_fst, num_pdfs);
  
  
      CuSparseMatrix<BaseFloat> transform_sparse_mat;
      if (!transform_mat_rxfilename.empty()) {
        Matrix<BaseFloat> transform_mat;
        ReadKaldiObject(transform_mat_rxfilename, &transform_mat);
        if (transform_mat.NumCols() != num_pdfs)
          KALDI_ERR << "transform-mat from " << transform_mat_rxfilename
                    << " has " << transform_mat.NumCols() << " cols, expected "
                    << num_pdfs;
        SparseMatrix<BaseFloat> temp_sparse_mat(transform_mat);
        // the following is just a shallow swap if we're on CPU.  This program
        // actually won't actually work very fast on GPU, but doing it this way
        // will make it easier to modify it later if we really want efficient
        // operation on GPU.
        transform_sparse_mat.Swap(&temp_sparse_mat);
      }
  
      BaseFloatMatrixWriter matrix_writer(matrix_wspecifier);
  
      int32 num_success = 0, num_fail = 0;
      int64 tot_input_frames = 0, tot_output_frames = 0;
      double tot_forward_prob = 0.0;
  
      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
  
      for (; !feature_reader.Done(); feature_reader.Next()) {
        std::string utt = feature_reader.Key();
        const Matrix<BaseFloat> &features (feature_reader.Value());
        if (features.NumRows() == 0) {
          KALDI_WARN << "Zero-length utterance: " << utt;
          num_fail++;
          continue;
        }
        const Matrix<BaseFloat> *online_ivectors = NULL;
        const Vector<BaseFloat> *ivector = NULL;
        if (!ivector_rspecifier.empty()) {
          if (!ivector_reader.HasKey(utt)) {
            KALDI_WARN << "No iVector available for utterance " << utt;
            num_fail++;
            continue;
          } else {
            ivector = &ivector_reader.Value(utt);
          }
        }
        if (!online_ivector_rspecifier.empty()) {
          if (!online_ivector_reader.HasKey(utt)) {
            KALDI_WARN << "No online iVector available for utterance " << utt;
            num_fail++;
            continue;
          } else {
            online_ivectors = &online_ivector_reader.Value(utt);
          }
        }
  
        Vector<BaseFloat> priors;  // empty vector, we don't need priors here.
        DecodableNnetSimple nnet_computer(
            opts, nnet, priors,
            features, &compiler,
            ivector, online_ivectors,
            online_ivector_period);
  
        Matrix<BaseFloat> matrix(nnet_computer.NumFrames(),
                                 nnet_computer.OutputDim());
        for (int32 t = 0; t < nnet_computer.NumFrames(); t++) {
          SubVector<BaseFloat> row(matrix, t);
          nnet_computer.GetOutputForFrame(t, &row);
        }
  
        // Of course it makes no sense to copy to GPU and then back again.
        // But anyway this program woudn't work very well if we actually ran
        // with --use-gpu=yes.  In the CPU case the following is just a shallow
        // swap.
        CuMatrix<BaseFloat> gpu_nnet_output;
        gpu_nnet_output.Swap(&matrix);
  
  
        chain::DenominatorComputation den_computation(
            chain_opts, den_graph,
            1, // num_sequences,
            gpu_nnet_output);
  
  
        int32 num_frames = gpu_nnet_output.NumRows();
        BaseFloat forward_prob = den_computation.Forward();
  
        CuMatrix<BaseFloat> posteriors(num_frames, num_pdfs);
        BaseFloat scale = 1.0;
        bool ok = den_computation.Backward(scale, &posteriors);
  
        KALDI_VLOG(1) << "For utterance " << utt << ", log-prob per frame was "
                      << (forward_prob / num_frames) << " over "
                      << num_frames << " frames.";
  
        if (!ok || !(forward_prob - forward_prob == 0)) {  // if or NaN
          KALDI_WARN << "Something went wrong for utterance " << utt
                     << "; forward-prob = " << forward_prob
                     << ", num-frames = " << num_frames;
          num_fail++;
          continue;
        }
  
        num_success++;
        tot_input_frames += features.NumRows();
        tot_output_frames += num_frames;
        tot_forward_prob += forward_prob;
  
        // Write out the posteriors.
        if (transform_mat_rxfilename.empty()) {
          // write out posteriors over pdfs.
          Matrix<BaseFloat> posteriors_cpu;
          posteriors.Swap(&posteriors_cpu);
          matrix_writer.Write(utt, posteriors_cpu);
        } else {
          // write out posteriors over (most likely) phones.
          int32 num_phones = transform_sparse_mat.NumRows();
          CuMatrix<BaseFloat> phone_post(num_frames, num_phones);
          phone_post.AddMatSmat(1.0, posteriors,
                                transform_sparse_mat, kTrans, 0.0);
          Matrix<BaseFloat> phone_post_cpu;
          phone_post.Swap(&phone_post_cpu);
          // write out posteriors over phones.
          matrix_writer.Write(utt, phone_post_cpu);
  
          if (GetVerboseLevel() >= 1 || RandInt(0,99)==0) {
            BaseFloat sum = posteriors.Sum();
            if (((sum / num_frames) - 1.0) > 0.01) {
              KALDI_WARN << "Expected sum of posteriors " << sum
                         << " to be close to num-frames " << num_frames;
            }
          }
        }
      }
  
  #if HAVE_CUDA==1
      CuDevice::Instantiate().PrintProfile();
  #endif
      double elapsed = timer.Elapsed();
      KALDI_LOG << "Time taken "<< elapsed
                << "s: real-time factor assuming 100 input frames/sec is "
                << (elapsed*100.0/tot_input_frames);
      KALDI_LOG << "Done " << num_success << " utterances, failed for "
                << num_fail;
  
      KALDI_LOG << "Overall log-prob per (output) frame was "
                << (tot_forward_prob / tot_output_frames)
                << " over " << tot_output_frames << " frames.";
  
      if (num_success != 0) return 0;
      else return 1;
    } catch(const std::exception &e) {
      std::cerr << e.what();
      return -1;
    }
  }