// gmm/am-diag-gmm.h // Copyright 2009-2012 Saarland University (Author: Arnab Ghoshal) // Johns Hopkins University (Author: Daniel Povey) // Karel Vesely // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #ifndef KALDI_GMM_AM_DIAG_GMM_H_ #define KALDI_GMM_AM_DIAG_GMM_H_ 1 #include #include "base/kaldi-common.h" #include "gmm/diag-gmm.h" #include "itf/options-itf.h" namespace kaldi { /// @defgroup DiagGmm DiagGmm /// @{ /// kaldi Diagonal Gaussian Mixture Models class AmDiagGmm { public: AmDiagGmm() {} ~AmDiagGmm(); /// Initializes with a single "prototype" GMM. void Init(const DiagGmm &proto, int32 num_pdfs); /// Adds a GMM to the model, and increments the total number of PDFs. void AddPdf(const DiagGmm &gmm); /// Copies the parameters from another model. Allocates necessary memory. void CopyFromAmDiagGmm(const AmDiagGmm &other); void SplitPdf(int32 idx, int32 target_components, float perturb_factor); // In SplitByCount we use the "target_components" and "power" // to work out targets for each state (according to power-of-occupancy rule), // and any state less than its target gets mixed up. If some states // were over their target, this may take the #Gauss over the target. // we enforce a min-count on Gaussians while splitting (don't split // if it would take it below min-count). void SplitByCount(const Vector &state_occs, int32 target_components, float perturb_factor, BaseFloat power, BaseFloat min_count); // In SplitByCount we use the "target_components" and "power" // to work out targets for each state (according to power-of-occupancy rule), // and any state over its target gets mixed down. If some states // were under their target, this may take the #Gauss below the target. void MergeByCount(const Vector &state_occs, int32 target_components, BaseFloat power, BaseFloat min_count); /// Sets the gconsts for all the PDFs. Returns the total number of Gaussians /// over all PDFs that are "invalid" e.g. due to zero weights or variances. int32 ComputeGconsts(); BaseFloat LogLikelihood(const int32 pdf_index, const VectorBase &data) const; void Read(std::istream &in_stream, bool binary); void Write(std::ostream &out_stream, bool binary) const; int32 Dim() const { return (densities_.size() > 0)? densities_[0]->Dim() : 0; } int32 NumPdfs() const { return densities_.size(); } int32 NumGauss() const; int32 NumGaussInPdf(int32 pdf_index) const; /// Accessors DiagGmm& GetPdf(int32 pdf_index); const DiagGmm& GetPdf(int32 pdf_index) const; void GetGaussianMean(int32 pdf_index, int32 gauss, VectorBase *out) const; void GetGaussianVariance(int32 pdf_index, int32 gauss, VectorBase *out) const; /// Mutators void SetGaussianMean(int32 pdf_index, int32 gauss_index, const VectorBase &in); private: std::vector densities_; // int32 dim_; void RemovePdf(int32 pdf_index); KALDI_DISALLOW_COPY_AND_ASSIGN(AmDiagGmm); }; inline BaseFloat AmDiagGmm::LogLikelihood( const int32 pdf_index, const VectorBase &data) const { return densities_[pdf_index]->LogLikelihood(data); } inline int32 AmDiagGmm::NumGaussInPdf(int32 pdf_index) const { KALDI_ASSERT((static_cast(pdf_index) < densities_.size()) && (densities_[pdf_index] != NULL)); return densities_[pdf_index]->NumGauss(); } inline DiagGmm& AmDiagGmm::GetPdf(int32 pdf_index) { KALDI_ASSERT((static_cast(pdf_index) < densities_.size()) && (densities_[pdf_index] != NULL)); return *(densities_[pdf_index]); } inline const DiagGmm& AmDiagGmm::GetPdf(int32 pdf_index) const { KALDI_ASSERT((static_cast(pdf_index) < densities_.size()) && (densities_[pdf_index] != NULL)); return *(densities_[pdf_index]); } inline void AmDiagGmm::GetGaussianMean(int32 pdf_index, int32 gauss, VectorBase *out) const { KALDI_ASSERT((static_cast(pdf_index) < densities_.size()) && (densities_[pdf_index] != NULL)); densities_[pdf_index]->GetComponentMean(gauss, out); } inline void AmDiagGmm::GetGaussianVariance(int32 pdf_index, int32 gauss, VectorBase *out) const { KALDI_ASSERT((static_cast(pdf_index) < densities_.size()) && (densities_[pdf_index] != NULL)); densities_[pdf_index]->GetComponentVariance(gauss, out); } inline void AmDiagGmm::SetGaussianMean(int32 pdf_index, int32 gauss_index, const VectorBase &in) { KALDI_ASSERT((static_cast(pdf_index) < densities_.size()) && (densities_[pdf_index] != NULL)); densities_[pdf_index]->SetComponentMean(gauss_index, in); } inline void AmDiagGmm::SplitPdf(int32 pdf_index, int32 target_components, float perturb_factor) { KALDI_ASSERT((static_cast(pdf_index) < densities_.size()) && (densities_[pdf_index] != NULL)); densities_[pdf_index]->Split(target_components, perturb_factor); } struct UbmClusteringOptions { int32 ubm_num_gauss; BaseFloat reduce_state_factor; int32 intermediate_num_gauss; BaseFloat cluster_varfloor; int32 max_am_gauss; UbmClusteringOptions() : ubm_num_gauss(400), reduce_state_factor(0.2), intermediate_num_gauss(4000), cluster_varfloor(0.01), max_am_gauss(20000) {} UbmClusteringOptions(int32 ncomp, BaseFloat red, int32 interm_gauss, BaseFloat vfloor, int32 max_am_gauss) : ubm_num_gauss(ncomp), reduce_state_factor(red), intermediate_num_gauss(interm_gauss), cluster_varfloor(vfloor), max_am_gauss(max_am_gauss) {} void Register(OptionsItf *opts) { std::string module = "UbmClusteringOptions: "; opts->Register("max-am-gauss", &max_am_gauss, module+ "We first reduce acoustic model to this max #Gauss before clustering."); opts->Register("ubm-num-gauss", &ubm_num_gauss, module+ "Number of Gaussians components in the final UBM."); opts->Register("ubm-numcomps", &ubm_num_gauss, module+ "Backward compatibility option (see ubm-num-gauss)"); opts->Register("reduce-state-factor", &reduce_state_factor, module+ "Intermediate number of clustered states (as fraction of total states)."); opts->Register("intermediate-num-gauss", &intermediate_num_gauss, module+ "Intermediate number of merged Gaussian components."); opts->Register("intermediate-numcomps", &intermediate_num_gauss, module+ "Backward compatibility option (see intermediate-num-gauss)"); opts->Register("cluster-varfloor", &cluster_varfloor, module+ "Variance floor used in bottom-up state clustering."); } void Check(); }; /** Clusters the Gaussians in an acoustic model to a single GMM with specified * number of components. First the each state is mixed-down to a single * Gaussian, then the states are clustered by clustering these Gaussians in a * bottom-up fashion. Number of clusters is determined by reduce_state_factor. * The Gaussians for each cluster of states are then merged based on the least * likelihood reduction till there are intermediate_numcomp Gaussians, which * are then merged into ubm_num_gauss Gaussians. * This is the UBM initialization algorithm described in section 2.1 of Povey, * et al., "The subspace Gaussian mixture model - A structured model for speech * recognition", In Computer Speech and Language, April 2011. */ void ClusterGaussiansToUbm(const AmDiagGmm &am, const Vector &state_occs, UbmClusteringOptions opts, DiagGmm *ubm_out); } // namespace kaldi /// @} DiagGmm #endif // KALDI_GMM_AM_DIAG_GMM_H_