Yannick Estève / ONTRAC-Kaldi

Blame view

src/ivectorbin/ivector-plda-scoring-dense.cc 7.93 KB
  // ivectorbin/ivector-plda-scoring-dense.cc
  
  // Copyright 2016-2018  David Snyder
  //           2017-2018  Matthew Maciejewski
  
  // See ../../COPYING for clarification regarding multiple authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //  http://www.apache.org/licenses/LICENSE-2.0
  //
  // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  // MERCHANTABLITY OR NON-INFRINGEMENT.
  // See the Apache 2 License for the specific language governing permissions and
  // limitations under the License.
  
  
  #include "base/kaldi-common.h"
  #include "util/common-utils.h"
  #include "util/stl-utils.h"
  #include "ivector/plda.h"
  
  namespace kaldi {
  
  bool EstPca(const Matrix<BaseFloat> &ivector_mat, BaseFloat target_energy,
    const std::string &reco, Matrix<BaseFloat> *mat) {
  
    // If the target_energy is 1.0, it's equivalent to not applying the
    // conversation-dependent PCA at all, so it's better to exit this
    // function before doing any computation.
    if (ApproxEqual(target_energy, 1.0, 0.001))
      return false;
  
    int32 num_rows = ivector_mat.NumRows(),
      num_cols = ivector_mat.NumCols();
    Vector<BaseFloat> sum;
    SpMatrix<BaseFloat> sumsq;
    sum.Resize(num_cols);
    sumsq.Resize(num_cols);
    sum.AddRowSumMat(1.0, ivector_mat);
    sumsq.AddMat2(1.0, ivector_mat, kTrans, 1.0);
    sum.Scale(1.0 / num_rows);
    sumsq.Scale(1.0 / num_rows);
    sumsq.AddVec2(-1.0, sum); // now sumsq is centered covariance.
    int32 full_dim = sum.Dim();
  
    Matrix<BaseFloat> P(full_dim, full_dim);
    Vector<BaseFloat> s(full_dim);
  
    try {
      if (num_rows > num_cols)
        sumsq.Eig(&s, &P);
      else
        Matrix<BaseFloat>(sumsq).Svd(&s, &P, NULL);
    } catch (...) {
      KALDI_WARN << "Unable to compute conversation dependent PCA for"
        << " recording " << reco << ".";
      return false;
    }
  
    SortSvd(&s, &P);
  
    Matrix<BaseFloat> transform(P, kTrans); // Transpose of P.  This is what
                                         // appears in the transform.
  
    // We want the PCA transform to retain target_energy amount of the total
    // energy.
    BaseFloat total_energy = s.Sum();
    BaseFloat energy = 0.0;
    int32 dim = 1;
    while (energy / total_energy <= target_energy) {
      energy += s(dim-1);
      dim++;
    }
    Matrix<BaseFloat> transform_float(transform);
    mat->Resize(transform.NumCols(), transform.NumRows());
    mat->CopyFromMat(transform);
    mat->Resize(dim, transform_float.NumCols(), kCopyData);
    return true;
  }
  
  // Transforms i-vectors using the PLDA model.
  void TransformIvectors(const Matrix<BaseFloat> &ivectors_in,
    const PldaConfig &plda_config, const Plda &plda,
    Matrix<BaseFloat> *ivectors_out) {
    int32 dim = plda.Dim();
    ivectors_out->Resize(ivectors_in.NumRows(), dim);
    for (int32 i = 0; i < ivectors_in.NumRows(); i++) {
      Vector<BaseFloat> transformed_ivector(dim);
      plda.TransformIvector(plda_config, ivectors_in.Row(i), 1.0,
        &transformed_ivector);
      ivectors_out->Row(i).CopyFromVec(transformed_ivector);
    }
  }
  
  // Projects the rows of ivectors_in with the recording-dependent PCA matrix:
  // *ivectors_out = ivectors_in * pca_mat^T.  The output has one row per input
  // row and one column per row of pca_mat; pca_mat is asserted to have as many
  // columns as ivectors_in.
  void ApplyPca(const Matrix<BaseFloat> &ivectors_in,
    const Matrix<BaseFloat> &pca_mat, Matrix<BaseFloat> *ivectors_out) {
    ivectors_out->Resize(ivectors_in.NumRows(), pca_mat.NumRows());
    KALDI_ASSERT(pca_mat.NumCols() == ivectors_in.NumCols());
    // beta == 0.0: the product overwrites the (freshly resized) output.
    ivectors_out->AddMatMat(1.0, ivectors_in, kNoTrans, pca_mat, kTrans, 0.0);
  }
  
  } // namespace kaldi
  
  int main(int argc, char *argv[]) {
    using namespace kaldi;
    typedef kaldi::int32 int32;
    try {
      const char *usage =
        "Perform PLDA scoring for speaker diarization.  The input reco2utt
  "
        "should be of the form <recording-id> <seg1> <seg2> ... <segN> and
  "
        "there should be one iVector for each segment.  PLDA scoring is
  "
        "performed between all pairs of iVectors in a recording and outputs
  "
        "an archive of score matrices, one for each recording-id.  The rows
  "
        "and columns of the the matrix correspond the sorted order of the
  "
        "segments.
  "
        "Usage: ivector-plda-scoring-dense [options] <plda> <reco2utt>"
        " <ivectors-rspecifier> <scores-wspecifier>
  "
        "e.g.: 
  "
        "  ivector-plda-scoring-dense plda reco2utt scp:ivectors.scp"
        " ark:scores.ark ark,t:ivectors.1.ark
  ";
  
      ParseOptions po(usage);
      BaseFloat target_energy = 0.5;
      PldaConfig plda_config;
      plda_config.Register(&po);
  
      po.Register("target-energy", &target_energy,
        "Reduce dimensionality of i-vectors using a recording-dependent"
        " PCA such that this fraction of the total energy remains.");
      KALDI_ASSERT(target_energy <= 1.0);
  
      po.Read(argc, argv);
  
      if (po.NumArgs() != 4) {
        po.PrintUsage();
        exit(1);
      }
  
      std::string plda_rxfilename = po.GetArg(1),
        reco2utt_rspecifier = po.GetArg(2),
        ivector_rspecifier = po.GetArg(3),
        scores_wspecifier = po.GetArg(4);
  
      Plda plda;
      ReadKaldiObject(plda_rxfilename, &plda);
  
      SequentialTokenVectorReader reco2utt_reader(reco2utt_rspecifier);
      RandomAccessBaseFloatVectorReader ivector_reader(ivector_rspecifier);
      BaseFloatMatrixWriter scores_writer(scores_wspecifier);
      int32 num_reco_err = 0,
            num_reco_done = 0;
      for (; !reco2utt_reader.Done(); reco2utt_reader.Next()) {
        Plda this_plda(plda);
        std::string reco = reco2utt_reader.Key();
  
        std::vector<std::string> uttlist = reco2utt_reader.Value();
        std::vector<Vector<BaseFloat> > ivectors;
  
        for (size_t i = 0; i < uttlist.size(); i++) {
          std::string utt = uttlist[i];
  
          if (!ivector_reader.HasKey(utt)) {
            KALDI_ERR << "No iVector present in input for utterance " << utt;
          }
  
          Vector<BaseFloat> ivector = ivector_reader.Value(utt);
          ivectors.push_back(ivector);
        }
        if (ivectors.size() == 0) {
          KALDI_WARN << "Not producing output for recording " << reco
                     << " since no segments had iVectors";
          num_reco_err++;
        } else {
          Matrix<BaseFloat> ivector_mat(ivectors.size(), ivectors[0].Dim()),
                            ivector_mat_pca,
                            ivector_mat_plda,
                            pca_transform,
                            scores(ivectors.size(), ivectors.size());
  
          for (size_t i = 0; i < ivectors.size(); i++) {
            ivector_mat.Row(i).CopyFromVec(ivectors[i]);
          }
          if (EstPca(ivector_mat, target_energy, reco, &pca_transform)) {
            // Apply the PCA transform to the raw i-vectors.
            ApplyPca(ivector_mat, pca_transform, &ivector_mat_pca);
  
            // Apply the PCA transform to the parameters of the PLDA model.
            this_plda.ApplyTransform(Matrix<double>(pca_transform));
  
            // Now transform the i-vectors using the reduced PLDA model.
            TransformIvectors(ivector_mat_pca, plda_config, this_plda,
              &ivector_mat_plda);
          } else {
            // If EstPca returns false, we won't apply any PCA.
            TransformIvectors(ivector_mat, plda_config, this_plda,
            &ivector_mat_plda);
          }
          for (int32 i = 0; i < ivector_mat_plda.NumRows(); i++) {
            for (int32 j = 0; j < ivector_mat_plda.NumRows(); j++) {
              scores(i, j) = this_plda.LogLikelihoodRatio(Vector<double>(
                ivector_mat_plda.Row(i)), 1.0,
                Vector<double>(ivector_mat_plda.Row(j)));
            }
          }
          scores_writer.Write(reco, scores);
          num_reco_done++;
        }
      }
      KALDI_LOG << "Processed " << num_reco_done << " recordings, "
                << num_reco_err << " had errors.";
      return (num_reco_done != 0 ? 0 : 1 );
    } catch(const std::exception &e) {
      std::cerr << e.what();
      return -1;
    }
  }