// ivectorbin/ivector-extract.cc // Copyright 2013 Daniel Povey // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "base/kaldi-common.h" #include "util/common-utils.h" #include "gmm/am-diag-gmm.h" #include "ivector/ivector-extractor.h" #include "util/kaldi-thread.h" namespace kaldi { // This class will be used to parallelize over multiple threads the job // that this program does. The work happens in the operator (), the // output happens in the destructor. class IvectorExtractTask { public: IvectorExtractTask(const IvectorExtractor &extractor, std::string utt, const Matrix &feats, const Posterior &posterior, BaseFloatVectorWriter *writer, double *tot_auxf_change): extractor_(extractor), utt_(utt), feats_(feats), posterior_(posterior), writer_(writer), tot_auxf_change_(tot_auxf_change) { } void operator () () { bool need_2nd_order_stats = false; IvectorExtractorUtteranceStats utt_stats(extractor_.NumGauss(), extractor_.FeatDim(), need_2nd_order_stats); utt_stats.AccStats(feats_, posterior_); ivector_.Resize(extractor_.IvectorDim()); ivector_(0) = extractor_.PriorOffset(); if (tot_auxf_change_ != NULL) { double old_auxf = extractor_.GetAuxf(utt_stats, ivector_); extractor_.GetIvectorDistribution(utt_stats, &ivector_, NULL); double new_auxf = extractor_.GetAuxf(utt_stats, ivector_); auxf_change_ = new_auxf - old_auxf; } else { extractor_.GetIvectorDistribution(utt_stats, &ivector_, NULL); } } ~IvectorExtractTask() { if (tot_auxf_change_ != NULL) { double T = TotalPosterior(posterior_); *tot_auxf_change_ += auxf_change_; KALDI_VLOG(2) << "Auxf change for utterance " << utt_ << " was " << (auxf_change_ / T) << " per frame over " << T << " frames (weighted)"; } // We actually write out the offset of the iVectors from the mean of the // prior distribution; this is the form we'll need it in for scoring. (most // formulations of iVectors have zero-mean priors so this is not normally an // issue). ivector_(0) -= extractor_.PriorOffset(); KALDI_VLOG(2) << "Ivector norm for utterance " << utt_ << " was " << ivector_.Norm(2.0); writer_->Write(utt_, Vector(ivector_)); } private: const IvectorExtractor &extractor_; std::string utt_; Matrix feats_; Posterior posterior_; BaseFloatVectorWriter *writer_; double *tot_auxf_change_; // if non-NULL we need the auxf change. Vector ivector_; double auxf_change_; }; int32 RunPerSpeaker(const std::string &ivector_extractor_rxfilename, const IvectorEstimationOptions &opts, bool compute_objf_change, const std::string &spk2utt_rspecifier, const std::string &feature_rspecifier, const std::string &posterior_rspecifier, const std::string &ivector_wspecifier) { IvectorExtractor extractor; ReadKaldiObject(ivector_extractor_rxfilename, &extractor); SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); RandomAccessPosteriorReader posterior_reader(posterior_rspecifier); BaseFloatVectorWriter ivector_writer(ivector_wspecifier); double tot_auxf_change = 0.0, tot_post = 0.0, tot_norm = 0.0; int32 num_utt_done = 0, num_utt_err = 0, num_spk_done = 0, num_spk_err = 0; for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { std::string spk = spk2utt_reader.Key(); const std::vector &utts = spk2utt_reader.Value(); bool need_2nd_order_stats = false; IvectorExtractorUtteranceStats utt_stats(extractor.NumGauss(), extractor.FeatDim(), need_2nd_order_stats); for (size_t i = 0; i < utts.size(); i++) { const std::string &utt = utts[i]; if (!feature_reader.HasKey(utt)) { KALDI_WARN << "No features present for utterance " << utt; num_utt_err++; continue; } const Matrix &feats = feature_reader.Value(utt); if (!posterior_reader.HasKey(utt)) { KALDI_WARN << "No posteriors present for utterance " << utt; num_utt_err++; continue; } Posterior posterior = posterior_reader.Value(utt); if (feats.NumRows() != posterior.size()) { KALDI_WARN << "Posterior has wrong size " << posterior.size() << " vs. feats " << feats.NumRows() << " for " << utt; num_utt_err++; continue; } ScalePosterior(opts.acoustic_weight, &posterior); num_utt_done++; utt_stats.AccStats(feats, posterior); } if (utt_stats.NumFrames() == 0.0) { KALDI_WARN << "No stats accumulated for speaker " << spk; num_spk_err++; continue; } else { if (opts.max_count > 0 && utt_stats.NumFrames() > opts.max_count) { double scale = opts.max_count / utt_stats.NumFrames(); utt_stats.Scale(scale); KALDI_LOG << "Scaling stats for speaker " << spk << " by scale " << scale << " due to --max-count=" << opts.max_count; } Vector ivector(extractor.IvectorDim()); ivector(0) = extractor.PriorOffset(); if (compute_objf_change) { double old_auxf = extractor.GetAuxf(utt_stats, ivector); extractor.GetIvectorDistribution(utt_stats, &ivector, NULL); double new_auxf = extractor.GetAuxf(utt_stats, ivector); double auxf_change = new_auxf - old_auxf; KALDI_LOG << "Auxf change for speaker " << spk << " was " << (auxf_change / utt_stats.NumFrames()) << " per frame, over " << utt_stats.NumFrames() << " frames (weighted)."; tot_auxf_change += auxf_change; } else { extractor.GetIvectorDistribution(utt_stats, &ivector, NULL); } // We actually write out the offset of the iVectors from the mean of the // prior distribution; this is the form we'll need it in for scoring and // as a feature for neural nets. (most formulations of iVectors have // zero-mean priors so this is not normally an issue). ivector(0) -= extractor.PriorOffset(); KALDI_LOG << "Ivector norm for speaker " << spk << " was " << ivector.Norm(2.0); tot_norm += ivector.Norm(2.0) * utt_stats.NumFrames(); tot_post += utt_stats.NumFrames(); num_spk_done++; Vector ivector_flt(ivector); ivector_writer.Write(spk, ivector_flt); } } KALDI_LOG << "Done " << num_spk_done << " speakers; " << num_spk_err << " with errors. " << num_utt_done << " utterances " << "were processed, " << num_utt_err << " with errors."; if (tot_post != 0.0) { if (compute_objf_change) { KALDI_LOG << "Overall weighted-average objective function improvement was " << (tot_auxf_change / tot_post) << " over " << tot_post << " frames (weighted)"; } KALDI_LOG << "Average iVector norm (weighted by frames) was " << (tot_norm / tot_post) << " over " << tot_post << " frames (weighted)"; } return (num_spk_done != 0 ? 0 : 1); } } int main(int argc, char *argv[]) { using namespace kaldi; typedef kaldi::int32 int32; typedef kaldi::int64 int64; try { const char *usage = "Extract iVectors for utterances, using a trained iVector extractor,\n" "and features and Gaussian-level posteriors\n" "Usage: ivector-extract [options]

" "

\n" "e.g.: \n" " fgmm-global-gselect-to-post 1.ubm '$feats' 'ark:gunzip -c gselect.1.gz|' ark:- | \\\n" " ivector-extract final.ie '$feats' ark,s,cs:- ark,t:ivectors.1.ark\n"; ParseOptions po(usage); bool compute_objf_change = true; IvectorEstimationOptions opts; std::string spk2utt_rspecifier; TaskSequencerConfig sequencer_config; po.Register("compute-objf-change", &compute_objf_change, "If true, compute the change in objective function from using " "nonzero iVector (a potentially useful diagnostic). Combine " "with --verbose=2 for per-utterance information"); po.Register("spk2utt", &spk2utt_rspecifier, "Supply this option if you " "want iVectors to be output at the per-speaker level, estimated " "using stats accumulated from multiple utterances. Note: this " "is not the normal way iVectors are obtained for speaker-id. " "This option will cause the program to ignore the --num-threads " "option."); opts.Register(&po); sequencer_config.Register(&po); po.Read(argc, argv); if (po.NumArgs() != 4) { po.PrintUsage(); exit(1); } std::string ivector_extractor_rxfilename = po.GetArg(1), feature_rspecifier = po.GetArg(2), posterior_rspecifier = po.GetArg(3), ivectors_wspecifier = po.GetArg(4); if (spk2utt_rspecifier.empty()) { // g_num_threads affects how ComputeDerivedVars is called when we read the // extractor. g_num_threads = sequencer_config.num_threads; IvectorExtractor extractor; ReadKaldiObject(ivector_extractor_rxfilename, &extractor); double tot_auxf_change = 0.0, tot_t = 0.0; int32 num_done = 0, num_err = 0; SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); RandomAccessPosteriorReader posterior_reader(posterior_rspecifier); BaseFloatVectorWriter ivector_writer(ivectors_wspecifier); { TaskSequencer sequencer(sequencer_config); for (; !feature_reader.Done(); feature_reader.Next()) { std::string utt = feature_reader.Key(); if (!posterior_reader.HasKey(utt)) { KALDI_WARN << "No posteriors for utterance " << utt; num_err++; continue; } const Matrix &mat = feature_reader.Value(); Posterior posterior = posterior_reader.Value(utt); if (static_cast(posterior.size()) != mat.NumRows()) { KALDI_WARN << "Size mismatch between posterior " << posterior.size() << " and features " << mat.NumRows() << " for utterance " << utt; num_err++; continue; } double *auxf_ptr = (compute_objf_change ? &tot_auxf_change : NULL ); double this_t = opts.acoustic_weight * TotalPosterior(posterior), max_count_scale = 1.0; if (opts.max_count > 0 && this_t > opts.max_count) { max_count_scale = opts.max_count / this_t; KALDI_LOG << "Scaling stats for utterance " << utt << " by scale " << max_count_scale << " due to --max-count=" << opts.max_count; this_t = opts.max_count; } ScalePosterior(opts.acoustic_weight * max_count_scale, &posterior); // note: now, this_t == sum of posteriors. sequencer.Run(new IvectorExtractTask(extractor, utt, mat, posterior, &ivector_writer, auxf_ptr)); tot_t += this_t; num_done++; } // Destructor of "sequencer" will wait for any remaining tasks. } KALDI_LOG << "Done " << num_done << " files, " << num_err << " with errors. Total (weighted) frames " << tot_t; if (compute_objf_change) KALDI_LOG << "Overall average objective-function change from estimating " << "ivector was " << (tot_auxf_change / tot_t) << " per frame " << " over " << tot_t << " (weighted) frames."; return (num_done != 0 ? 0 : 1); } else { KALDI_ASSERT(sequencer_config.num_threads == 1 && "--spk2utt option is incompatible with --num-threads option"); return RunPerSpeaker(ivector_extractor_rxfilename, opts, compute_objf_change, spk2utt_rspecifier, feature_rspecifier, posterior_rspecifier, ivectors_wspecifier); } } catch(const std::exception &e) { std::cerr << e.what(); return -1; } }