Yannick Estève / ONTRAC-Kaldi

Blame view

src/ivectorbin/ivector-extract.cc 13.2 KB
  // ivectorbin/ivector-extract.cc
  
  // Copyright 2013  Daniel Povey
  
  // See ../../COPYING for clarification regarding multiple authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //  http://www.apache.org/licenses/LICENSE-2.0
  //
  // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  // MERCHANTABLITY OR NON-INFRINGEMENT.
  // See the Apache 2 License for the specific language governing permissions and
  // limitations under the License.
  
  
  #include "base/kaldi-common.h"
  #include "util/common-utils.h"
  #include "gmm/am-diag-gmm.h"
  #include "ivector/ivector-extractor.h"
  #include "util/kaldi-thread.h"
  
  namespace kaldi {
  
  // Job object that extracts the iVector of a single utterance.  Instances are
  // handed to a TaskSequencer (see main()) so that several utterances can be
  // processed on multiple threads.  The computation happens in operator ();
  // the output (writing the iVector and updating the shared auxf total)
  // happens in the destructor.
  class IvectorExtractTask {
   public:
    // extractor:        trained iVector extractor.  Held by reference, so it
    //                   must outlive this task.
    // utt:              utterance id; used as the key of the written iVector
    //                   and in log messages.
    // feats, posterior: copied into the task, so the caller's objects need not
    //                   stay valid after construction.
    // writer:           table writer the iVector is written to from the
    //                   destructor.  Dereferenced unconditionally, so it must
    //                   not be NULL.
    // tot_auxf_change:  if non-NULL, the auxiliary-function change for this
    //                   utterance is computed and added to *tot_auxf_change in
    //                   the destructor; if NULL this diagnostic is skipped.
    IvectorExtractTask(const IvectorExtractor &extractor,
                       std::string utt,
                       const Matrix<BaseFloat> &feats,
                       const Posterior &posterior,
                       BaseFloatVectorWriter *writer,
                       double *tot_auxf_change):
        extractor_(extractor), utt_(utt), feats_(feats), posterior_(posterior),
        writer_(writer), tot_auxf_change_(tot_auxf_change),
        auxf_change_(0.0) { }

    // Accumulates the utterance statistics and estimates the iVector, leaving
    // the result in ivector_ (and the auxf change in auxf_change_ if it was
    // requested).
    void operator () () {
      bool need_2nd_order_stats = false;

      IvectorExtractorUtteranceStats utt_stats(extractor_.NumGauss(),
                                               extractor_.FeatDim(),
                                               need_2nd_order_stats);

      utt_stats.AccStats(feats_, posterior_);

      // Start from the prior mean: all zeros except the first dimension,
      // which is set to the prior offset.
      ivector_.Resize(extractor_.IvectorDim());
      ivector_(0) = extractor_.PriorOffset();

      if (tot_auxf_change_ != NULL) {
        // Evaluate the auxiliary function before and after estimation so that
        // the improvement can be reported.
        double old_auxf = extractor_.GetAuxf(utt_stats, ivector_);
        extractor_.GetIvectorDistribution(utt_stats, &ivector_, NULL);
        double new_auxf = extractor_.GetAuxf(utt_stats, ivector_);
        auxf_change_ = new_auxf - old_auxf;
      } else {
        extractor_.GetIvectorDistribution(utt_stats, &ivector_, NULL);
      }
    }

    // Adds this utterance's auxf change to the shared total (if requested) and
    // writes the iVector.
    // NOTE(review): the update of *tot_auxf_change_ and the write are not
    // synchronized here; presumably TaskSequencer runs task destructors one at
    // a time, in submission order -- confirm against util/kaldi-thread.h.
    ~IvectorExtractTask() {
      if (tot_auxf_change_ != NULL) {
        double T = TotalPosterior(posterior_);
        *tot_auxf_change_ += auxf_change_;
        KALDI_VLOG(2) << "Auxf change for utterance " << utt_ << " was "
                      << (auxf_change_ / T) << " per frame over " << T
                      << " frames (weighted)";
      }
      // We actually write out the offset of the iVectors from the mean of the
      // prior distribution; this is the form we'll need it in for scoring.  (most
      // formulations of iVectors have zero-mean priors so this is not normally an
      // issue).
      ivector_(0) -= extractor_.PriorOffset();
      KALDI_VLOG(2) << "Ivector norm for utterance " << utt_
                    << " was " << ivector_.Norm(2.0);
      writer_->Write(utt_, Vector<BaseFloat>(ivector_));
    }
   private:
    const IvectorExtractor &extractor_;  // not owned.
    std::string utt_;
    Matrix<BaseFloat> feats_;            // private copy of the features.
    Posterior posterior_;                // private copy of the posteriors.
    BaseFloatVectorWriter *writer_;      // not owned.
    double *tot_auxf_change_; // if non-NULL we need the auxf change.
    Vector<double> ivector_;  // filled in by operator (), written by destructor.
    double auxf_change_;      // only meaningful if tot_auxf_change_ != NULL.
  };
  
  // Estimates one iVector per speaker, from statistics accumulated over all of
  // that speaker's utterances, and writes it keyed by speaker id.
  //
  // ivector_extractor_rxfilename: where the trained extractor is read from.
  // opts:                  supplies acoustic_weight (scale applied to the
  //                        posteriors) and max_count (if > 0, upper limit on
  //                        the per-speaker frame count; stats are scaled down
  //                        to meet it).
  // compute_objf_change:   if true, the auxiliary-function improvement is
  //                        logged per speaker and in total.
  // spk2utt_rspecifier:    table mapping speaker id -> list of utterance ids.
  // feature_rspecifier, posterior_rspecifier: per-utterance inputs, accessed
  //                        by key.
  // ivector_wspecifier:    where the per-speaker iVectors are written.
  //
  // Utterances with missing or mismatched inputs are skipped with a warning;
  // speakers for which no stats were accumulated are skipped and counted as
  // errors.  Returns 0 if at least one speaker was processed, otherwise 1.
  int32 RunPerSpeaker(const std::string &ivector_extractor_rxfilename,
                     const IvectorEstimationOptions &opts,
                     bool compute_objf_change,
                     const std::string &spk2utt_rspecifier,
                     const std::string &feature_rspecifier,
                     const std::string &posterior_rspecifier,
                     const std::string &ivector_wspecifier) {
    IvectorExtractor extractor;
    ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
    SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
    RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
    RandomAccessPosteriorReader posterior_reader(posterior_rspecifier);
    BaseFloatVectorWriter ivector_writer(ivector_wspecifier);

    double tot_auxf_change = 0.0, tot_post = 0.0, tot_norm = 0.0;
    int32 num_utt_done = 0, num_utt_err = 0,
        num_spk_done = 0, num_spk_err = 0;

    for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
      std::string spk = spk2utt_reader.Key();
      const std::vector<std::string> &utts = spk2utt_reader.Value();

      bool need_2nd_order_stats = false;

      // Despite the name, these stats are pooled over all of the speaker's
      // utterances.
      IvectorExtractorUtteranceStats utt_stats(extractor.NumGauss(),
                                               extractor.FeatDim(),
                                               need_2nd_order_stats);

      for (size_t i = 0; i < utts.size(); i++) {
        const std::string &utt = utts[i];
        if (!feature_reader.HasKey(utt)) {
          KALDI_WARN << "No features present for utterance " << utt;
          num_utt_err++;
          continue;
        }
        const Matrix<BaseFloat> &feats = feature_reader.Value(utt);
        if (!posterior_reader.HasKey(utt)) {
          KALDI_WARN << "No posteriors present for utterance " << utt;
          num_utt_err++;
          continue;
        }
        // Copied (not bound by reference) because it is scaled in place below.
        Posterior posterior = posterior_reader.Value(utt);
        // Explicit cast keeps the comparison signed, matching the equivalent
        // check in main().
        if (feats.NumRows() != static_cast<int32>(posterior.size())) {
          KALDI_WARN << "Posterior has wrong size " << posterior.size()
                     << " vs. feats " << feats.NumRows() << " for "
                     << utt;
          num_utt_err++;
          continue;
        }
        ScalePosterior(opts.acoustic_weight, &posterior);
        num_utt_done++;
        utt_stats.AccStats(feats, posterior);
      }

      if (utt_stats.NumFrames() == 0.0) {
        KALDI_WARN << "No stats accumulated for speaker " << spk;
        num_spk_err++;
        continue;
      }

      // Limit the effective amount of data per speaker, if requested.
      if (opts.max_count > 0 && utt_stats.NumFrames() > opts.max_count) {
        double scale = opts.max_count / utt_stats.NumFrames();
        utt_stats.Scale(scale);
        KALDI_LOG << "Scaling stats for speaker " << spk << " by scale "
                  << scale << " due to --max-count=" << opts.max_count;
      }

      // Start from the prior mean: zero except for the first dimension.
      Vector<double> ivector(extractor.IvectorDim());
      ivector(0) = extractor.PriorOffset();

      if (compute_objf_change) {
        double old_auxf = extractor.GetAuxf(utt_stats, ivector);
        extractor.GetIvectorDistribution(utt_stats, &ivector, NULL);
        double new_auxf = extractor.GetAuxf(utt_stats, ivector);
        double auxf_change = new_auxf - old_auxf;

        KALDI_LOG << "Auxf change for speaker " << spk << " was "
                  << (auxf_change / utt_stats.NumFrames()) << " per frame, over "
                  << utt_stats.NumFrames() << " frames (weighted).";
        tot_auxf_change += auxf_change;
      } else {
        extractor.GetIvectorDistribution(utt_stats, &ivector, NULL);
      }
      // We actually write out the offset of the iVectors from the mean of the
      // prior distribution; this is the form we'll need it in for scoring and
      // as a feature for neural nets.  (most formulations of iVectors have
      // zero-mean priors so this is not normally an issue).
      ivector(0) -= extractor.PriorOffset();
      const double ivector_norm = ivector.Norm(2.0);
      KALDI_LOG << "Ivector norm for speaker " << spk
                << " was " << ivector_norm;

      // The average norm reported at the end is weighted by frame count.
      tot_norm += ivector_norm * utt_stats.NumFrames();
      tot_post += utt_stats.NumFrames();
      num_spk_done++;
      Vector<BaseFloat> ivector_flt(ivector);
      ivector_writer.Write(spk, ivector_flt);
    }

    KALDI_LOG << "Done " << num_spk_done << " speakers; " << num_spk_err
              << " with errors.  " << num_utt_done << " utterances "
              << "were processed, " << num_utt_err << " with errors.";
    // Averages are only meaningful if some frames were actually processed.
    if (tot_post != 0.0) {
      if (compute_objf_change) {
        KALDI_LOG << "Overall weighted-average objective function improvement was "
                  << (tot_auxf_change / tot_post) << " over " << tot_post
                  << " frames (weighted)";
      }
      KALDI_LOG << "Average iVector norm (weighted by frames) was "
                << (tot_norm / tot_post) << " over " << tot_post
                << " frames (weighted)";
    }
    return (num_spk_done != 0 ? 0 : 1);
  }
  
  }
  
  
  
  int main(int argc, char *argv[]) {
    using namespace kaldi;
    typedef kaldi::int32 int32;
    typedef kaldi::int64 int64;
    try {
      const char *usage =
          "Extract iVectors for utterances, using a trained iVector extractor,
  "
          "and features and Gaussian-level posteriors
  "
          "Usage:  ivector-extract [options] <model-in> <feature-rspecifier> "
          "<posteriors-rspecifier> <ivector-wspecifier>
  "
          "e.g.: 
  "
          " fgmm-global-gselect-to-post 1.ubm '$feats' 'ark:gunzip -c gselect.1.gz|' ark:- | \\
  "
          "  ivector-extract final.ie '$feats' ark,s,cs:- ark,t:ivectors.1.ark
  ";
  
      ParseOptions po(usage);
      bool compute_objf_change = true;
      IvectorEstimationOptions opts;
      std::string spk2utt_rspecifier;
      TaskSequencerConfig sequencer_config;
      po.Register("compute-objf-change", &compute_objf_change,
                  "If true, compute the change in objective function from using "
                  "nonzero iVector (a potentially useful diagnostic).  Combine "
                  "with --verbose=2 for per-utterance information");
      po.Register("spk2utt", &spk2utt_rspecifier, "Supply this option if you "
                  "want iVectors to be output at the per-speaker level, estimated "
                  "using stats accumulated from multiple utterances.  Note: this "
                  "is not the normal way iVectors are obtained for speaker-id. "
                  "This option will cause the program to ignore the --num-threads "
                  "option.");
  
      opts.Register(&po);
      sequencer_config.Register(&po);
  
      po.Read(argc, argv);
  
      if (po.NumArgs() != 4) {
        po.PrintUsage();
        exit(1);
      }
  
      std::string ivector_extractor_rxfilename = po.GetArg(1),
          feature_rspecifier = po.GetArg(2),
          posterior_rspecifier = po.GetArg(3),
          ivectors_wspecifier = po.GetArg(4);
  
  
      if (spk2utt_rspecifier.empty()) {
        // g_num_threads affects how ComputeDerivedVars is called when we read the
        // extractor.
        g_num_threads = sequencer_config.num_threads;
        IvectorExtractor extractor;
        ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
  
        double tot_auxf_change = 0.0, tot_t = 0.0;
        int32 num_done = 0, num_err = 0;
  
        SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
        RandomAccessPosteriorReader posterior_reader(posterior_rspecifier);
        BaseFloatVectorWriter ivector_writer(ivectors_wspecifier);
  
        {
          TaskSequencer<IvectorExtractTask> sequencer(sequencer_config);
          for (; !feature_reader.Done(); feature_reader.Next()) {
            std::string utt = feature_reader.Key();
            if (!posterior_reader.HasKey(utt)) {
              KALDI_WARN << "No posteriors for utterance " << utt;
              num_err++;
              continue;
            }
            const Matrix<BaseFloat> &mat = feature_reader.Value();
            Posterior posterior = posterior_reader.Value(utt);
  
            if (static_cast<int32>(posterior.size()) != mat.NumRows()) {
              KALDI_WARN << "Size mismatch between posterior " << posterior.size()
                         << " and features " << mat.NumRows() << " for utterance "
                         << utt;
              num_err++;
              continue;
            }
  
            double *auxf_ptr = (compute_objf_change ? &tot_auxf_change : NULL );
  
            double this_t = opts.acoustic_weight * TotalPosterior(posterior),
                max_count_scale = 1.0;
            if (opts.max_count > 0 && this_t > opts.max_count) {
              max_count_scale = opts.max_count / this_t;
              KALDI_LOG << "Scaling stats for utterance " << utt << " by scale "
                        << max_count_scale << " due to --max-count="
                        << opts.max_count;
              this_t = opts.max_count;
            }
            ScalePosterior(opts.acoustic_weight * max_count_scale,
                           &posterior);
            // note: now, this_t == sum of posteriors.
  
            sequencer.Run(new IvectorExtractTask(extractor, utt, mat, posterior,
                                                 &ivector_writer, auxf_ptr));
  
            tot_t += this_t;
            num_done++;
          }
          // Destructor of "sequencer" will wait for any remaining tasks.
        }
  
        KALDI_LOG << "Done " << num_done << " files, " << num_err
                  << " with errors.  Total (weighted) frames " << tot_t;
        if (compute_objf_change)
          KALDI_LOG << "Overall average objective-function change from estimating "
                    << "ivector was " << (tot_auxf_change / tot_t) << " per frame "
                    << " over " << tot_t << " (weighted) frames.";
  
        return (num_done != 0 ? 0 : 1);
      } else {
        KALDI_ASSERT(sequencer_config.num_threads == 1 &&
                     "--spk2utt option is incompatible with --num-threads option");
        return RunPerSpeaker(ivector_extractor_rxfilename,
                             opts,
                             compute_objf_change,
                             spk2utt_rspecifier,
                             feature_rspecifier,
                             posterior_rspecifier,
                             ivectors_wspecifier);
      }
    } catch(const std::exception &e) {
      std::cerr << e.what();
      return -1;
    }
  }