// chain/chain-denominator.cc // Copyright 2015 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "chain/chain-denominator.h" #include "chain/chain-kernels-ansi.h" namespace kaldi { namespace chain { DenominatorComputation::DenominatorComputation( const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, int32 num_sequences, const CuMatrixBase &nnet_output): opts_(opts), den_graph_(den_graph), num_sequences_(num_sequences), frames_per_sequence_(nnet_output.NumRows() / num_sequences_), nnet_output_deriv_transposed_( nnet_output.NumCols(), std::min(nnet_output.NumRows(), static_cast(kMaxDerivTimeSteps) * num_sequences_)), alpha_(frames_per_sequence_ + 1, den_graph_.NumStates() * num_sequences_ + num_sequences_, kUndefined), beta_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, kUndefined), tot_prob_(num_sequences_, kUndefined), tot_log_prob_(num_sequences_, kUndefined), log_correction_term_(num_sequences_, kUndefined), ok_(true) { // We don't let leaky_hmm_coefficient be exactly zero (although that would // make sense mathematically, corresponding to "turning off" the leaky HMM), // because that would lead to underflow and eventually NaN's or inf's // appearing in the computation, since we do this computation not in // log-space. KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && opts_.leaky_hmm_coefficient < 1.0); if (RandInt(0, 99) == 0) { // A check, that all values in nnet_output are in the range [-30, 30].. // otherwise derivatives will be wrong (search below for 30). BaseFloat max_val = nnet_output.Max(), min_val = nnet_output.Min(); if (max_val > 30.0 || min_val < -30.0) { KALDI_WARN << "Nnet outputs " << min_val << ", " << max_val << " outside the range [-30,30], derivs may be inaccurate."; } } // make sure the alpha sums and beta sums are zeroed. alpha_.ColRange(den_graph_.NumStates() * num_sequences_, num_sequences_).SetZero(); beta_.ColRange(den_graph_.NumStates() * num_sequences_, num_sequences_).SetZero(); KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0); // the kStrideEqualNumCols argument is so that we can share the same // memory block with xent_output_deriv (see chain-training.cc, search for // kStrideEqualNumCols). This depends on how the allocator works, and // actually might not happen, but anyway, the impact on speed would // likely be un-measurably small. exp_nnet_output_transposed_.Resize(nnet_output.NumCols(), nnet_output.NumRows(), kUndefined, kStrideEqualNumCols); exp_nnet_output_transposed_.CopyFromMat(nnet_output, kTrans); // We limit the nnet output to the range [-30,30] before doing the exp; // this avoids NaNs appearing in the forward-backward computation, which // is not done in log space. exp_nnet_output_transposed_.ApplyExpLimited(-30.0, 30.0); } void DenominatorComputation::AlphaFirstFrame() { // dim == num_hmm_states_ * num_sequences_. BaseFloat *first_frame_alpha = alpha_.RowData(0); // create a 'fake matrix' - view this row as a matrix. // initializer takes [pointer, num-rows, num-cols, stride]. CuSubMatrix alpha_mat(first_frame_alpha, den_graph_.NumStates(), num_sequences_, num_sequences_); // TODO (possible): It would be more efficient here if we implemented a // CopyColsFromVec function in class CuMatrix. alpha_mat.SetZero(); alpha_mat.AddVecToCols(1.0, den_graph_.InitialProbs(), 0.0); } // the alpha computation for some 0 < t <= num_time_steps_. void DenominatorComputation::AlphaGeneralFrame(int32 t) { KALDI_ASSERT(t > 0 && t <= frames_per_sequence_); BaseFloat *this_alpha = alpha_.RowData(t); const BaseFloat *prev_alpha_dash = alpha_.RowData(t - 1); const Int32Pair *backward_transitions = den_graph_.BackwardTransitions(); const DenominatorGraphTransition *transitions = den_graph_.Transitions(); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(), num_hmm_states = den_graph_.NumStates(), num_sequences = num_sequences_; // 'probs' is the matrix of pseudo-likelihoods for frame t - 1. CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, (t-1) * num_sequences_, num_sequences_); const BaseFloat *prob_data = probs.Data(); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); while (1) { if (dimGrid.y > 65535) // the hardware doesn't allow more than this. dimGrid.y = 65535; cuda_chain_hmm_forward(dimGrid, dimBlock, backward_transitions, transitions, num_sequences, den_graph_.NumStates(), prob_data, probs.Stride(), prev_alpha_dash, this_alpha); CU_SAFE_CALL(cudaGetLastError()); if (dimGrid.y == num_hmm_states) { break; // this is the normal case. } else { // We reach this code only in the unusual case where num_hmm_states > // 65535. We can compute the alphas for the remaining HMM states by // moving some of the array pointers and making the call again. backward_transitions += dimGrid.y; this_alpha += dimGrid.y * num_sequences; num_hmm_states -= dimGrid.y; dimGrid.y = num_hmm_states; } } CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { int32 prob_stride = probs.Stride(); for (int32 h = 0; h < num_hmm_states; h++) { for (int32 s = 0; s < num_sequences; s++) { double this_tot_alpha = 0.0; const DenominatorGraphTransition *trans_iter = transitions + backward_transitions[h].first, *trans_end = transitions + backward_transitions[h].second; for (; trans_iter != trans_end; ++trans_iter) { BaseFloat transition_prob = trans_iter->transition_prob; int32 pdf_id = trans_iter->pdf_id, prev_hmm_state = trans_iter->hmm_state; BaseFloat prob = prob_data[pdf_id * prob_stride + s], this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s]; this_tot_alpha += this_prev_alpha * transition_prob * prob; } // Let arbitrary_scale be the inverse of the alpha-sum value that we // store in the same place we'd store the alpha for the state numbered // 'num_hmm_states'. We multiply this into all the // transition-probabilities from the previous frame to this frame, in // both the forward and backward passes, in order to keep the alphas in // a good numeric range. This won't affect the posteriors, but when // computing the total likelihood we'll need to compensate for it later // on. BaseFloat arbitrary_scale = 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } } } } void DenominatorComputation::AlphaDash(int32 t) { BaseFloat *this_alpha = alpha_.RowData(t); // create a 'fake matrix' for the regular alphas- view this row as a matrix. // initializer takes [pointer, num-rows, num-cols, stride]. CuSubMatrix alpha_mat(this_alpha, den_graph_.NumStates(), num_sequences_, num_sequences_); // the alpha-dash is the sum of alpha over all states. CuSubVector alpha_sum_vec(this_alpha + den_graph_.NumStates() * num_sequences_, num_sequences_); alpha_sum_vec.AddRowSumMat(1.0, alpha_mat, 0.0); alpha_mat.AddVecVec(opts_.leaky_hmm_coefficient, den_graph_.InitialProbs(), alpha_sum_vec); // it's now alpha-dash. } // compute beta from beta-dash. void DenominatorComputation::Beta(int32 t) { BaseFloat *this_beta_dash = beta_.RowData(t % 2); // create a 'fake matrix' for the regular beta-dash (which is // the counterpart of alpha-dash)- view this row as a matrix. // initializer takes [pointer, num-rows, num-cols, stride]. CuSubMatrix beta_dash_mat(this_beta_dash, den_graph_.NumStates(), num_sequences_, num_sequences_); // making the t index implicit, the beta-dash-sum for each sequence is the sum // over all states i of beta_i * opts_.leaky_hmm_coefficient * initial_prob_i. CuSubVector beta_dash_sum_vec( this_beta_dash + den_graph_.NumStates() * num_sequences_, num_sequences_); beta_dash_sum_vec.AddMatVec(opts_.leaky_hmm_coefficient, beta_dash_mat, kTrans, den_graph_.InitialProbs(), 0.0); // we are computing beta in place. After the following, beta-dash-mat // will contain the actual beta (i.e. the counterpart of alpha), // not the beta-dash. beta_dash_mat.AddVecToRows(1.0, beta_dash_sum_vec); } BaseFloat DenominatorComputation::Forward() { AlphaFirstFrame(); AlphaDash(0); for (int32 t = 1; t <= frames_per_sequence_; t++) { AlphaGeneralFrame(t); AlphaDash(t); } return ComputeTotLogLike(); } BaseFloat DenominatorComputation::ComputeTotLogLike() { tot_prob_.Resize(num_sequences_); // View the last alpha-dash as a matrix of size num-hmm-states by num-sequences. CuSubMatrix last_alpha_dash( alpha_.RowData(frames_per_sequence_), den_graph_.NumStates(), num_sequences_, num_sequences_); tot_prob_.AddRowSumMat(1.0, last_alpha_dash, 0.0); // we should probably add an ApplyLog() function that takes a vector argument. tot_log_prob_ = tot_prob_; tot_log_prob_.ApplyLog(); BaseFloat tot_log_prob = tot_log_prob_.Sum(); // We now have to add something for the arbitrary scaling factor. [note: the // purpose of the arbitrary scaling factors was to keep things in a good // floating-point range] // The inverses of all the tot-alpha quantities, for t = 0 // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in // the transition-probs, so we need to multiply them all together (not // inversed) and add them as a correction term to the total log-likes. // These tot-alpha quantities were stored in the same place that we would // have stored the HMM-state numbered 'num_hmm_states'. int32 num_hmm_states = den_graph_.NumStates(); CuSubMatrix inv_arbitrary_scales( alpha_, 0, frames_per_sequence_, num_sequences_ * num_hmm_states, num_sequences_); CuMatrix log_inv_arbitrary_scales( inv_arbitrary_scales); log_inv_arbitrary_scales.ApplyLog(); BaseFloat log_inv_arbitrary_scales_product = log_inv_arbitrary_scales.Sum(); return tot_log_prob + log_inv_arbitrary_scales_product; } bool DenominatorComputation::Backward( BaseFloat deriv_weight, CuMatrixBase *nnet_output_deriv) { BetaDashLastFrame(); Beta(frames_per_sequence_); for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) { BetaDashGeneralFrame(t); if (GetVerboseLevel() >= 1 || t == 0) BetaGeneralFrameDebug(t); Beta(t); if (t % kMaxDerivTimeSteps == 0) { // commit the derivative stored in nnet_output_deriv_transposed_ by adding // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. int32 chunk_frames = std::min(static_cast(kMaxDerivTimeSteps), frames_per_sequence_ - t), num_pdfs = exp_nnet_output_transposed_.NumRows(); CuSubMatrix transposed_deriv_part( nnet_output_deriv_transposed_, 0, num_pdfs, 0, chunk_frames * num_sequences_); CuSubMatrix output_deriv_part( *nnet_output_deriv, t * num_sequences_, chunk_frames * num_sequences_, 0, num_pdfs); output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, kTrans); if (t != 0) transposed_deriv_part.SetZero(); } } return ok_; } void DenominatorComputation::BetaDashLastFrame() { // sets up the beta-dash quantity on the last frame (frame == // frames_per_sequence_). Note that the betas we use here contain a // 1/(tot-prob) factor in order to simplify the backprop. int32 t = frames_per_sequence_; BaseFloat *last_frame_beta_dash = beta_.RowData(t % 2); // create a 'fake matrix' - view this row as a matrix. CuSubMatrix beta_dash_mat(last_frame_beta_dash, den_graph_.NumStates(), num_sequences_, num_sequences_); CuVector inv_tot_prob(tot_prob_); inv_tot_prob.InvertElements(); // the beta values at the end of the file only vary with the sequence-index, // not with the HMM-index. We treat all states as having a final-prob of one. beta_dash_mat.CopyRowsFromVec(inv_tot_prob); } void DenominatorComputation::BetaDashGeneralFrame(int32 t) { KALDI_ASSERT(t >= 0 && t < frames_per_sequence_); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(); // t_wrapped gives us the time-index we use when indexing // nnet_output_deriv_transposed_; to save memory we limit the size of the // matrix, storing only chunks of frames at a time, and we add it to the // non-transposed output whenever we finish a chunk. int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps); const BaseFloat *this_alpha_dash = alpha_.RowData(t), *next_beta = beta_.RowData((t + 1) % 2); BaseFloat *this_beta_dash = beta_.RowData(t % 2); const Int32Pair *forward_transitions = den_graph_.ForwardTransitions(); const DenominatorGraphTransition *transitions = den_graph_.Transitions(); // 'probs' is the matrix of pseudo-likelihoods for frame t. CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, t * num_sequences_, num_sequences_), log_prob_deriv(nnet_output_deriv_transposed_, 0, num_pdfs, t_wrapped * num_sequences_, num_sequences_); int32 num_hmm_states = den_graph_.NumStates(), num_sequences = num_sequences_; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); while (1) { if (dimGrid.y > 65535) // the hardware doesn't allow more than this. dimGrid.y = 65535; cuda_chain_hmm_backward(dimGrid, dimBlock, forward_transitions, transitions, num_sequences, num_hmm_states, probs.Data(), probs.Stride(), this_alpha_dash, next_beta, this_beta_dash, log_prob_deriv.Data(), log_prob_deriv.Stride()); CU_SAFE_CALL(cudaGetLastError()); if (dimGrid.y == num_hmm_states) { break; // this is the normal case. } else { // We reach this code only in the unusual case where num_hmm_states > // 65535. We can compute the betas (and log-prob derivatives) for the // remaining HMM states by moving some of the array pointers and making // the call again. forward_transitions += dimGrid.y; this_alpha_dash += dimGrid.y * num_sequences; this_beta_dash += dimGrid.y * num_sequences; num_hmm_states -= dimGrid.y; dimGrid.y = num_hmm_states; } } CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { int32 prob_stride = probs.Stride(), deriv_stride = log_prob_deriv.Stride(); const BaseFloat *prob_data = probs.Data(); BaseFloat *log_prob_deriv_data = log_prob_deriv.Data(); for (int32 h = 0; h < num_hmm_states; h++) { for (int32 s = 0; s < num_sequences; s++) { BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], inv_arbitrary_scale = this_alpha_dash[num_hmm_states * num_sequences + s]; double tot_variable_factor = 0.0; BaseFloat occupation_factor = this_alpha_dash_prob / inv_arbitrary_scale; const DenominatorGraphTransition *trans_iter = transitions + forward_transitions[h].first, *trans_end = transitions + forward_transitions[h].second; for (; trans_iter != trans_end; ++trans_iter) { BaseFloat transition_prob = trans_iter->transition_prob; int32 pdf_id = trans_iter->pdf_id, next_hmm_state = trans_iter->hmm_state; BaseFloat variable_factor = transition_prob * next_beta[next_hmm_state * num_sequences + s] * prob_data[pdf_id * prob_stride + s]; tot_variable_factor += variable_factor; BaseFloat occupation_prob = variable_factor * occupation_factor; log_prob_deriv_data[pdf_id * deriv_stride + s] += occupation_prob; } this_beta_dash[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; } } } } void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { BaseFloat num_hmm_states = den_graph_.NumStates(), alpha_beta_size = num_hmm_states * num_sequences_; CuSubVector this_alpha_dash(alpha_.RowData(t), alpha_beta_size), this_beta_dash(beta_.RowData(t % 2), alpha_beta_size); int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps), num_pdfs = exp_nnet_output_transposed_.NumRows(); CuSubMatrix this_log_prob_deriv( nnet_output_deriv_transposed_, 0, num_pdfs, t_wrapped * num_sequences_, num_sequences_); BaseFloat alpha_beta_product = VecVec(this_alpha_dash, this_beta_dash), this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); if (!ApproxEqual(alpha_beta_product, num_sequences_)) { KALDI_WARN << "On time " << t << ", alpha-beta product " << alpha_beta_product << " != " << num_sequences_ << " alpha-dash-sum = " << this_alpha_dash.Sum() << ", beta-dash-sum = " << this_beta_dash.Sum(); if (fabs(alpha_beta_product - num_sequences_) > 2.0) { KALDI_WARN << "Excessive error detected, will abandon this minibatch"; ok_ = false; } } // use higher tolerance, since we are using randomized pruning for the // log-prob derivatives. if (!ApproxEqual(this_log_prob_deriv_sum, num_sequences_, 0.01)) { KALDI_WARN << "On time " << t << ", log-prob-deriv sum " << this_log_prob_deriv_sum << " != " << num_sequences_; if (fabs(this_log_prob_deriv_sum - num_sequences_) > 2.0) { KALDI_WARN << "Excessive error detected, will abandon this minibatch"; ok_ = false; } } } } // namespace chain } // namespace kaldi