// cudamatrix/cu-math.cc

// Copyright 2009-2012  Karel Vesely
//                      Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "base/timer.h"
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-device.h"
#include "cudamatrix/cu-kernels.h"

namespace kaldi {

namespace cu {

/*
 * Templated functions wrapping the ANSI-C CUDA kernel functions.
 */

template<typename Real>
void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad,
                  Real l1, Real lr) {
  KALDI_ASSERT(SameDim(*weight, *grad));
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;

    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    dim3 dimGrid(n_blocks(weight->NumCols(), CU2DBLOCK),
                 n_blocks(weight->NumRows(), CU2DBLOCK));

    cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr,
                       weight->Dim(), grad->Stride());
    CU_SAFE_CALL(cudaGetLastError());

    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    MatrixBase<Real> &weight2 = weight->Mat();
    MatrixBase<Real> &grad2 = grad->Mat();
    for (MatrixIndexT r = 0; r < weight2.NumRows(); r++) {
      for (MatrixIndexT c = 0; c < weight2.NumCols(); c++) {
        if (weight2(r, c) == 0.0) continue;  // skip L1 if weight is zero.

        Real l1_signed = l1;
        if (weight2(r, c) < 0.0)
          l1_signed = -l1;

        Real before = weight2(r, c);
        // Simulate the SGD-plus-penalty update to test for a sign flip.
        Real after = weight2(r, c) - lr * grad2(r, c) - l1_signed;
        // If the penalty would flip the sign of the weight, clip the weight
        // (and its gradient) to zero instead; otherwise apply the penalty.
        if ((after > 0.0) ^ (before > 0.0)) {
          weight2(r, c) = 0.0;
          grad2(r, c) = 0.0;
        } else {
          weight2(r, c) -= l1_signed;
        }
      }
    }
  }
}
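// Worked example of the clipping rule above (illustrative numbers, not part
// of the build): with l1 = 0.5 and lr = 0.1, a weight of 0.3 with zero
// gradient gives after = 0.3 - 0.0 - 0.5 = -0.2, which crosses zero, so the
// weight and its gradient are both clipped to 0.0; a weight of 2.0 stays
// positive (after = 1.5), so only the signed penalty is applied, giving 1.5.
// Note that only the penalty is subtracted here; the gradient step itself is
// presumably applied separately by the caller ('after' is just a simulated
// value used for the sign test).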
template<typename Real>
void Randomize(const CuMatrixBase<Real> &src,
               const CuArray<int32> &copy_from_idx,
               CuMatrixBase<Real> *tgt) {

  KALDI_ASSERT(src.NumCols() == tgt->NumCols());
  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
  KALDI_ASSERT(copy_from_idx.Dim() <= tgt->NumRows());

#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;

    /*
    Note: default 16x16 block-size limits the --cachesize to matrix size
    16*65535 x 16*65535:
    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK),
                 n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
    */

    /*
     * Let's use blocksize 4 x 128 (512 threads/block),
     * and extend the randomizable matrices to: col 4*65535, row 128*65535,
     * (i.e. max-cols: 262140 (dim), max-rows: 8388480 (datapoints)).
     */
    dim3 dimBlock(4, 128);
    dim3 dimGrid(n_blocks(tgt->NumCols(), 4),
                 n_blocks(copy_from_idx.Dim(), 128));

    MatrixDim dimsrc = src.Dim(); dimsrc.rows = copy_from_idx.Dim();
    MatrixDim dimtgt = tgt->Dim(); dimtgt.rows = copy_from_idx.Dim();

    cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(),
                   copy_from_idx.Data(), dimtgt, dimsrc);
    CU_SAFE_CALL(cudaGetLastError());

    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    // randomize in CPU
    const MatrixBase<Real> &srcmat = src.Mat();
    const int32 *copy_from_idxvec = copy_from_idx.Data();
    MatrixBase<Real> &tgtmat = tgt->Mat();
    for (int32 i = 0; i < copy_from_idx.Dim(); i++) {
      tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
    }
  }
}

template<typename Real>
void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,
            CuMatrixBase<Real> *tgt) {

  KALDI_ASSERT(src.NumCols() * frame_offsets.Dim() == tgt->NumCols());
  KALDI_ASSERT(src.NumRows() == tgt->NumRows());

#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;

    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK),
                 n_blocks(tgt->NumRows(), CU2DBLOCK));

    cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(),
                frame_offsets.Data(), tgt->Dim(), src.Dim());
    CU_SAFE_CALL(cudaGetLastError());

    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    // expand in CPU
    const MatrixBase<Real> &srcmat = src.Mat();
    const int32 *frame_offsetvec = frame_offsets.Data();
    int32 dim = frame_offsets.Dim();
    MatrixBase<Real> &tgtmat = tgt->Mat();
    for (int32 r = 0; r < tgtmat.NumRows(); r++) {
      for (int32 off = 0; off < dim; off++) {
        int32 r_off = r + frame_offsetvec[off];
        // Clamp out-of-range source rows to the first/last frame.
        if (r_off < 0) r_off = 0;
        if (r_off >= srcmat.NumRows()) r_off = srcmat.NumRows() - 1;
        memcpy(tgtmat.RowData(r) + off * srcmat.NumCols(),
               srcmat.RowData(r_off),
               sizeof(Real) * srcmat.NumCols());
      }
    }
  }
}

template<typename Real>
void Copy(const CuMatrixBase<Real> &src,
          const CuArray<int32> &copy_from_indices,
          CuMatrixBase<Real> *tgt) {

  KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
  KALDI_ASSERT(src.NumRows() == tgt->NumRows());

#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;

    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
    dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK),
                 n_blocks(tgt->NumRows(), CU2DBLOCK));

    cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(),
              copy_from_indices.Data(), tgt->Dim(), src.Dim());
    CU_SAFE_CALL(cudaGetLastError());

    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    // expand in CPU
    const MatrixBase<Real> &srcmat = src.Mat();
    const int32 *copy_from_indicesvec = copy_from_indices.Data();
    int32 dim = copy_from_indices.Dim();
    MatrixBase<Real> &tgtmat = tgt->Mat();
    for (int32 r = 0; r < tgtmat.NumRows(); r++) {
      for (int32 c = 0; c < dim; c++) {
        tgtmat(r, c) = srcmat(r, copy_from_indicesvec[c]);
      }
    }
  }
}

template<typename Real>
void EnsureNonzero(const CuMatrixBase<Real> &src,
                   Real epsilon,
                   CuMatrixBase<Real> *dest) {
  KALDI_ASSERT(SameDim(*dest, src) && epsilon > 0.0);
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;
    dim3 dimGrid, dimBlock;
    GetBlockSizesForSimpleMatrixOperation(src.NumRows(), src.NumCols(),
                                          &dimGrid, &dimBlock);
    cuda_ensure_nonzero(dimGrid, dimBlock, src.Data(), src.Dim(),
                        epsilon, dest->Stride(), dest->Data());
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    int32 num_rows = src.NumRows(), num_cols = src.NumCols();
    for (int32 r = 0; r < num_rows; r++) {
      const Real *src_data = src.RowData(r);
      Real *dest_data = dest->RowData(r);
      for (int32 c = 0; c < num_cols; c++) {
        Real x = src_data[c], y;
        if (x <= -epsilon || x >= epsilon) y = x;
        else if (x >= 0.0) y = epsilon;
        else y = -epsilon;
        dest_data[c] = y;
      }
    }
  }
}
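// Illustrative use of Splice() above (hypothetical sizes, not part of the
// build): with offsets {-1, 0, 1}, output row r becomes the concatenation
// [src(r-1), src(r), src(r+1)], with rows clamped at the matrix edges:
//
//   CuMatrix<BaseFloat> feats(100, 40), spliced(100, 3 * 40);
//   std::vector<int32> offsets_vec;
//   offsets_vec.push_back(-1); offsets_vec.push_back(0); offsets_vec.push_back(1);
//   CuArray<int32> offsets(offsets_vec);
//   Splice(feats, offsets, &spliced);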
// instantiate the templates.
template
void RegularizeL1(CuMatrixBase<float> *weight, CuMatrixBase<float> *grad,
                  float l1, float lr);
template
void RegularizeL1(CuMatrixBase<double> *weight, CuMatrixBase<double> *grad,
                  double l1, double lr);

template
void Splice(const CuMatrixBase<float> &src,
            const CuArray<int32> &frame_offsets, CuMatrixBase<float> *tgt);
template
void Splice(const CuMatrixBase<double> &src,
            const CuArray<int32> &frame_offsets, CuMatrixBase<double> *tgt);
template
void Copy(const CuMatrixBase<float> &src,
          const CuArray<int32> &copy_from_indices, CuMatrixBase<float> *tgt);
template
void Copy(const CuMatrixBase<double> &src,
          const CuArray<int32> &copy_from_indices, CuMatrixBase<double> *tgt);

template
void Randomize(const CuMatrixBase<float> &src,
               const CuArray<int32> &copy_from_idx, CuMatrixBase<float> *tgt);
template
void Randomize(const CuMatrixBase<double> &src,
               const CuArray<int32> &copy_from_idx, CuMatrixBase<double> *tgt);

// The output y_i = scale * x_i,
// and we want the RMS value of the y_i to equal target_rms,
// so y^t y = D * target_rms^2 (if y is one row of the input).
// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)).
// there is also flooring involved, to avoid division-by-zero
// problems.  It's important for the backprop that the floor's
// square root is exactly representable as float.
// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D)))
// is an extra dimension of the output.
template<typename Real>
void NormalizePerRow(const CuMatrixBase<Real>& in, const Real target_rms,
                     const bool add_log_stddev, CuMatrixBase<Real>* out) {
  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
  if (add_log_stddev) {
    KALDI_ASSERT(in.NumRows() == out->NumRows());
    KALDI_ASSERT(in.NumCols() + 1 == out->NumCols());
  } else {
    KALDI_ASSERT(SameDim(in, *out));
  }

#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;
    size_t dimBlock = CU1DBLOCK;
    size_t dimGrid = out->NumRows();
    cuda_normalize_per_row(dimGrid, dimBlock, out->Data(), out->Stride(),
                           in.Data(), in.Dim(), target_rms, add_log_stddev);
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    CuSubMatrix<Real> out_no_log(*out, 0, out->NumRows(), 0, in.NumCols());
    if (in.Data() != out_no_log.Data())
      out_no_log.CopyFromMat(in);
    CuVector<Real> in_norm(in.NumRows());
    Real d_scaled = in.NumCols() * target_rms * target_rms;
    in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);
    in_norm.ApplyFloor(kSquaredNormFloor);
    in_norm.ApplyPow(-0.5);
    out_no_log.MulRowsVec(in_norm);
    if (add_log_stddev) {
      in_norm.ApplyLog();
      in_norm.Scale(-1.0);
      in_norm.Add(log(target_rms));
      out->CopyColFromVec(in_norm, in.NumCols());
    }
  }
}

template
void NormalizePerRow(const CuMatrixBase<float>& in, const float target_rms,
                     const bool add_log_stddev, CuMatrixBase<float>* out);
template
void NormalizePerRow(const CuMatrixBase<double>& in, const double target_rms,
                     const bool add_log_stddev, CuMatrixBase<double>* out);
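// Worked example for NormalizePerRow() above (illustrative): with D = 4 and
// target_rms = 1.0, the row x = [2, 2, 2, 2] has x^t x = 16, so
// scale = 1.0 / sqrt(16 / (4 * 1.0^2)) = 0.5 and the output row is
// [1, 1, 1, 1], which indeed has RMS 1.0.  With add_log_stddev == true, the
// appended column would contain log(sqrt(x^t x / D)) = log(2).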
// A note on the derivative of NormalizeComponent...
// let both row_in and row_out be vectors of dimension D.
// Let p = row_in^T row_in / (D * target_rms^2), and let
// f = 1.0 / sqrt(max(kSquaredNormFloor, p)), and we compute row_out as:
//   row_out = f row_in.
// Suppose we have a quantity deriv_out which is the derivative
// of the objective function w.r.t. row_out.  We want to compute
// deriv_in which is the derivative of the objective function w.r.t.
// row_in.  Let the objective function be F.  One term is obvious: we have
//   deriv_in = f deriv_out + ....
// next we have to take into account the derivative that gets back-propagated
// through f.  Obviously, dF/df = deriv_out^T row_in.
// And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5})
//           = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3),
// and dp/d(row_in) = 2/(D * target_rms^2) row_in  [it's vector-valued].
// So this term in dF/d(row_in) equals:
//   dF/df df/dp dp/d(row_in)
//     = 2/(D * target_rms^2)
//       (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3)
//       (deriv_out^T row_in) row_in.
// So
//   deriv_in = f deriv_out
//     + (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 :
//        -f^3 / (D * target_rms^2)) (deriv_out^T row_in) row_in.
// If add_log_stddev_ is true, deriv_in has another term:
//   dF/dx_i = dF/df . df/dx_i, where f = log(sqrt(x^T x / D)),
//   so df/dx_i = x_i / (x^T x).
template<typename Real>
void DiffNormalizePerRow(const CuMatrixBase<Real> &in_value,
                         const CuMatrixBase<Real> &out_deriv,
                         const Real target_rms, const bool add_log_stddev,
                         CuMatrixBase<Real>* in_deriv) {
  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;
    size_t dimBlock = CU1DBLOCK;
    size_t dimGrid = in_deriv->NumRows();
    cuda_diff_normalize_per_row(dimGrid, dimBlock, in_deriv->Data(),
                                in_deriv->Stride(), in_value.Data(),
                                in_value.Dim(), out_deriv.Data(),
                                out_deriv.Stride(), target_rms,
                                add_log_stddev);
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    const CuSubMatrix<Real> out_deriv_no_log(out_deriv, 0, out_deriv.NumRows(),
                                             0, in_value.NumCols());
    CuVector<Real> dot_products(out_deriv.NumRows());
    dot_products.AddDiagMatMat(1.0, out_deriv_no_log, kNoTrans, in_value,
                               kTrans, 0.0);
    CuVector<Real> in_norm(in_value.NumRows());
    Real d_scaled = (in_value.NumCols() * target_rms * target_rms);
    in_norm.AddDiagMat2(1.0, in_value, kNoTrans, 0.0);

    if (add_log_stddev) {
      CuVector<Real> log_stddev_deriv(in_norm),  // log_stddev deriv as dF/dy .* (x^T x)^-1
          out_deriv_for_stddev(out_deriv.NumRows(), kUndefined);
      // f = log(sqrt(max(epsi, x^T x / D)))
      // df/dx = epsi^2 * D < x^T x ? (1/(x^T x)) * x : 0.
      // we don't compute this exactly below for the case when x^T x is very
      // small, but we do make sure that the deriv isn't infinity when the
      // input is zero.
      log_stddev_deriv.ApplyFloor(in_value.NumCols() * kSquaredNormFloor);
      log_stddev_deriv.ApplyPow(-1.0);
      out_deriv_for_stddev.CopyColFromMat(out_deriv,
                                          (out_deriv.NumCols() - 1));
      log_stddev_deriv.MulElements(out_deriv_for_stddev);
      if (in_deriv)
        in_deriv->AddDiagVecMat(1.0, log_stddev_deriv, in_value, kNoTrans,
                                1.0);
    }

    in_norm.Scale(1.0 / d_scaled);
    in_norm.ApplyFloor(kSquaredNormFloor);
    in_norm.ApplyPow(-0.5);
    if (in_deriv) {
      if (in_deriv->Data() != out_deriv_no_log.Data())
        in_deriv->AddDiagVecMat(1.0, in_norm, out_deriv_no_log, kNoTrans,
                                1.0);
      else
        in_deriv->MulRowsVec(in_norm);
      in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0);
      in_norm.ApplyPow(3.0);
      dot_products.MulElements(in_norm);
      in_deriv->AddDiagVecMat(-1.0 / d_scaled, dot_products, in_value,
                              kNoTrans, 1.0);
    }
  }
}

template
void DiffNormalizePerRow(const CuMatrixBase<float> &in_value,
                         const CuMatrixBase<float> &out_deriv,
                         const float target_rms, const bool add_log_stddev,
                         CuMatrixBase<float>* in_deriv);
template
void DiffNormalizePerRow(const CuMatrixBase<double> &in_value,
                         const CuMatrixBase<double> &out_deriv,
                         const double target_rms, const bool add_log_stddev,
                         CuMatrixBase<double>* in_deriv);
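// A cheap way to test DiffNormalizePerRow() is a finite-difference check
// (illustrative sketch, not part of the build; sizes are arbitrary).  With
// out_deriv set to all ones, in_deriv approximates the gradient of
// F(in) = sum of the elements of NormalizePerRow(in):
//
//   CuMatrix<BaseFloat> in(5, 10), out(5, 10), out_deriv(5, 10),
//       in_deriv(5, 10);  // constructors zero-initialize; in_deriv is
//                         // accumulated into, so it must start at zero.
//   in.SetRandn();
//   out_deriv.Set(1.0);
//   NormalizePerRow(in, BaseFloat(1.0), false, &out);
//   DiffNormalizePerRow(in, out_deriv, BaseFloat(1.0), false, &in_deriv);
//   // then verify (F(in + eps * e_ij) - F(in)) / eps ~= in_deriv(i, j).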
// not calling this Sigmoid to reduce the chance of future collisions.
template<typename Real>
static inline Real ScalarSigmoid(Real a) {
  if (a > Real(0)) {
    return Real(1) / (Real(1) + Exp(-a));
  } else {
    Real x = Exp(a);
    return x / (x + Real(1));
  }
}

template<typename Real>
static inline Real ScalarTanh(Real a) {
  if (a > Real(0)) {
    Real inv_expa = Exp(-a);
    return -Real(1) + Real(2) / (Real(1) + inv_expa * inv_expa);
  } else {
    Real expa = Exp(a);
    return Real(1) - Real(2) / (Real(1) + expa * expa);
  }
}

template<typename Real>
void CpuComputeLstmNonlinearity(const MatrixBase<Real> &input_mat,
                                const MatrixBase<Real> &params_mat,
                                MatrixBase<Real> *output) {
  int32 num_rows = input_mat.NumRows(),
      input_cols = input_mat.NumCols(),
        cell_dim = input_cols / 5;
  KALDI_ASSERT(input_cols == (cell_dim * 5) ||
               input_cols == (cell_dim * 5) + 3);
  KALDI_ASSERT(output->NumRows() == num_rows);
  KALDI_ASSERT(params_mat.NumRows() == 3);
  KALDI_ASSERT(params_mat.NumCols() == cell_dim);
  KALDI_ASSERT(output->NumCols() == 2 * cell_dim);

  MatrixBase<Real> &output_mat = *output;
  const Real *params_data = params_mat.Data();
  int32 params_stride = params_mat.Stride();
  for (int32 r = 0; r < num_rows; r++) {
    const Real *input_row = input_mat.RowData(r);
    // i_scale and f_scale relate to dropout, they will normally be 1.0.
    Real i_scale = (input_cols == cell_dim * 5 ? 1.0 :
                    input_row[cell_dim * 5]),
         f_scale = (input_cols == cell_dim * 5 ? 1.0 :
                    input_row[cell_dim * 5 + 1]),
         o_scale = (input_cols == cell_dim * 5 ? 1.0 :
                    input_row[cell_dim * 5 + 2]);

    Real *output_row = output_mat.RowData(r);
    for (int32 c = 0; c < cell_dim; c++) {
      Real i_part = input_row[c];
      Real f_part = input_row[c + cell_dim];
      Real c_part = input_row[c + 2 * cell_dim];
      Real o_part = input_row[c + 3 * cell_dim];
      Real c_prev = input_row[c + 4 * cell_dim];
      Real w_ic = params_data[c];
      Real w_fc = params_data[c + params_stride];
      Real w_oc = params_data[c + params_stride * 2];
      Real i_t = ScalarSigmoid(i_part + w_ic * c_prev);
      Real f_t = ScalarSigmoid(f_part + w_fc * c_prev);
      Real c_t = f_t * f_scale * c_prev + i_t * i_scale * ScalarTanh(c_part);
      Real o_t = ScalarSigmoid(o_part + w_oc * c_t);
      Real m_t = o_t * o_scale * ScalarTanh(c_t);
      output_row[c] = c_t;
      output_row[c + cell_dim] = m_t;
    }
  }
}
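// For reference, the data layout assumed by CpuComputeLstmNonlinearity()
// above and by the CUDA version below: each input row is
//   [ i_part | f_part | c_part | o_part | c_prev ]  (5 blocks of cell_dim),
// optionally followed by 3 dropout-mask columns [i_scale, f_scale, o_scale];
// each output row is [ c_t | m_t ] (2 blocks of cell_dim).  The per-cell
// recurrence, matching the loop above, is:
//   i_t = Sigmoid(i_part + w_ic * c_prev)
//   f_t = Sigmoid(f_part + w_fc * c_prev)
//   c_t = f_t * f_scale * c_prev + i_t * i_scale * Tanh(c_part)
//   o_t = Sigmoid(o_part + w_oc * c_t)
//   m_t = o_t * o_scale * Tanh(c_t)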
template<typename Real>
void ComputeLstmNonlinearity(const CuMatrixBase<Real> &input,
                             const CuMatrixBase<Real> &params,
                             CuMatrixBase<Real> *output) {
  int32 num_rows = input.NumRows(),
      input_cols = input.NumCols(),
        cell_dim = input_cols / 5;
  KALDI_ASSERT(input_cols == (cell_dim * 5) ||
               input_cols == (cell_dim * 5) + 3);
  KALDI_ASSERT(output->NumRows() == num_rows);
  KALDI_ASSERT(params.NumRows() == 3);
  KALDI_ASSERT(params.NumCols() == cell_dim);
  KALDI_ASSERT(output->NumCols() == 2 * cell_dim);

#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;
    int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);

    // Each thread block works on 1 row of the data.
    // It's best if cell_dim is a multiple of CU1DBLOCK.
    dim3 dimBlock(CU1DBLOCK);
    dim3 dimGrid(num_rows);

    cuda_lstm_nonlinearity(dimGrid, dimBlock, input.Data(), input.Stride(),
                           params.Data(), params.Stride(), output->Stride(),
                           cell_dim, have_dropout_mask, num_rows,
                           output->Data());
    CU_SAFE_CALL(cudaGetLastError());

    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    CpuComputeLstmNonlinearity(input.Mat(), params.Mat(), &output->Mat());
  }
}

template
void CpuComputeLstmNonlinearity(const MatrixBase<float> &input_mat,
                                const MatrixBase<float> &params_mat,
                                MatrixBase<float> *output);
template
void CpuComputeLstmNonlinearity(const MatrixBase<double> &input_mat,
                                const MatrixBase<double> &params_mat,
                                MatrixBase<double> *output);
template
void ComputeLstmNonlinearity(const CuMatrixBase<float> &input,
                             const CuMatrixBase<float> &params,
                             CuMatrixBase<float> *output);
template
void ComputeLstmNonlinearity(const CuMatrixBase<double> &input,
                             const CuMatrixBase<double> &params,
                             CuMatrixBase<double> *output);
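// Illustrative usage (hypothetical sizes, not part of the build):
//
//   int32 num_rows = 64, cell_dim = 256;
//   CuMatrix<BaseFloat> input(num_rows, 5 * cell_dim);   // no dropout columns
//   CuMatrix<BaseFloat> params(3, cell_dim);             // rows: w_ic, w_fc, w_oc
//   CuMatrix<BaseFloat> output(num_rows, 2 * cell_dim);  // [c_t | m_t]
//   input.SetRandn();
//   params.SetRandn();
//   ComputeLstmNonlinearity(input, params, &output);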
template<typename Real>
void CpuBackpropLstmNonlinearity(const MatrixBase<Real> &input,
                                 const MatrixBase<Real> &params,
                                 const MatrixBase<Real> &output_deriv,
                                 const MatrixBase<double> &deriv_sum_in,
                                 const VectorBase<Real> &self_repair_config,
                                 double count_in,
                                 MatrixBase<Real> *input_deriv,
                                 MatrixBase<Real> *params_deriv,
                                 MatrixBase<double> *value_sum_out,
                                 MatrixBase<double> *deriv_sum_out,
                                 MatrixBase<Real> *self_repair_sum_out) {
  int32 num_rows = input.NumRows(),
      input_cols = input.NumCols(),
        cell_dim = input.NumCols() / 5;
  // Check dimensions.
  KALDI_ASSERT(input_cols == (cell_dim * 5) ||
               input_cols == (cell_dim * 5) + 3);
  KALDI_ASSERT(params.NumRows() == 3);
  KALDI_ASSERT(params.NumCols() == cell_dim);
  KALDI_ASSERT(output_deriv.NumRows() == num_rows);
  KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
  KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
  KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
  KALDI_ASSERT(self_repair_config.Dim() == 10);
  if (input_deriv != NULL) {
    KALDI_ASSERT(SameDim(input, *input_deriv));
  }
  if (params_deriv == NULL) {
    KALDI_ASSERT(value_sum_out == NULL);
    KALDI_ASSERT(deriv_sum_out == NULL);
    KALDI_ASSERT(self_repair_sum_out == NULL);
  } else {
    KALDI_ASSERT(value_sum_out != NULL);
    KALDI_ASSERT(deriv_sum_out != NULL);
    KALDI_ASSERT(self_repair_sum_out != NULL);
    KALDI_ASSERT(SameDim(params, *params_deriv));
    KALDI_ASSERT(value_sum_out->NumRows() == 5);
    KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
    KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
    KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
    KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
  }

  const MatrixBase<Real> &input_mat = input;
  const MatrixBase<Real> &params_mat = params;
  const MatrixBase<Real> &output_deriv_mat = output_deriv;
  const MatrixBase<double> &deriv_sum_in_mat = deriv_sum_in;
  const VectorBase<Real> &sr_config = self_repair_config;
  MatrixBase<Real> *input_deriv_mat = (
      input_deriv == NULL ? NULL : input_deriv);
  MatrixBase<Real> *params_deriv_mat = NULL;
  MatrixBase<Real> *self_repair_sum_out_mat = NULL;
  MatrixBase<double> *value_sum_out_mat = NULL;
  MatrixBase<double> *deriv_sum_out_mat = NULL;
  if (params_deriv != NULL) {
    params_deriv_mat = params_deriv;
    value_sum_out_mat = value_sum_out;
    deriv_sum_out_mat = deriv_sum_out;
    self_repair_sum_out_mat = self_repair_sum_out;
  }

  // We add 1.0 (i.e. a small value) to the count to avoid division by zero.
  Real count = 1.0 + count_in;
  for (int32 c = 0; c < cell_dim; c++) {
    // parameters
    Real w_ic = params_mat(0, c);
    Real w_fc = params_mat(1, c);
    Real w_oc = params_mat(2, c);
    // derivative sums w.r.t. parameters.
    Real w_ic_deriv_sum = 0.0;
    Real w_fc_deriv_sum = 0.0;
    Real w_oc_deriv_sum = 0.0;

    // average derivatives, for self-repair.
    // The 5 nonlinearities that are subject to self-repair are written as:
    //  Sigmoid(i_t_input), Sigmoid(f_t_input),
    //  Tanh(c_part), Sigmoid(o_t_input), Tanh(c_t)
    Real i_t_self_repair = (
        deriv_sum_in_mat(0, c) / count < sr_config(0) ? sr_config(5) : 0.0);
    Real f_t_self_repair = (
        deriv_sum_in_mat(1, c) / count < sr_config(1) ? sr_config(6) : 0.0);
    Real c_part_self_repair = (
        deriv_sum_in_mat(2, c) / count < sr_config(2) ? sr_config(7) : 0.0);
    Real o_t_self_repair = (
        deriv_sum_in_mat(3, c) / count < sr_config(3) ? sr_config(8) : 0.0);
    Real c_t_self_repair = (
        deriv_sum_in_mat(4, c) / count < sr_config(4) ? sr_config(9) : 0.0);
    // Note on how we add self-repair for sigmoids/tanh's.  If self-repair
    // is activated for this unit, then...
    // For sigmoids we'd add -self_repair_scale * (2 * sigmoid(x) - 1.0)
    // ... to the input-deriv;
    // For tanh's we'd add -self_repair_scale * tanh(x).
    // If self-repair is not activated, the 'self_repair' scales are zero.

    // The following variables are for the accumulation of stats on the
    // sigmoid and tanh units.
    Real i_t_value_sum = 0.0, i_t_deriv_sum = 0.0;
    Real f_t_value_sum = 0.0, f_t_deriv_sum = 0.0;
    Real c_part_value_sum = 0.0, c_part_deriv_sum = 0.0;
    Real o_t_value_sum = 0.0, o_t_deriv_sum = 0.0;
    Real c_t_value_sum = 0.0, c_t_deriv_sum = 0.0;

    for (int32 r = 0; r < num_rows; r++) {
      Real i_part = input_mat(r, c),
           f_part = input_mat(r, c + cell_dim),
           c_part = input_mat(r, c + 2 * cell_dim),
           o_part = input_mat(r, c + 3 * cell_dim),
           c_prev = input_mat(r, c + 4 * cell_dim);

      Real i_scale = (input_cols == cell_dim * 5 ? 1.0 :
                      input_mat(r, cell_dim * 5)),
           f_scale = (input_cols == cell_dim * 5 ? 1.0 :
                      input_mat(r, cell_dim * 5 + 1)),
           o_scale = (input_cols == cell_dim * 5 ? 1.0 :
                      input_mat(r, cell_dim * 5 + 2));

      // For greater clarity, we give some of the quantities in the
      // forward equations their own names.
      Real i_t_input = i_part + w_ic * c_prev,
          i_t = ScalarSigmoid(i_t_input),
          f_t_input = f_part + w_fc * c_prev,
          f_t = ScalarSigmoid(f_t_input),
          tanh_c_part = ScalarTanh(c_part),
          c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part,
          o_t_input = o_part + w_oc * c_t,
          o_t = ScalarSigmoid(o_t_input),
          tanh_c_t = ScalarTanh(c_t);
      // we'd also compute, in the forward pass,
      //   m_t = o_t * tanh_c_t;
      // but this variable is not needed.

      // Accumulate nonlinearity value and derivative stats.
      // Note:
      //    tanh'(x) = sech^2(x) = -(tanh(x)+1) (tanh(x)-1) = 1 - tanh^2(x)
      // sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)).
      i_t_value_sum += i_t;
      i_t_deriv_sum += i_t * (1.0F - i_t);
      f_t_value_sum += f_t;
      f_t_deriv_sum += f_t * (1.0F - f_t);
      c_part_value_sum += tanh_c_part;
      c_part_deriv_sum += 1.0F - tanh_c_part * tanh_c_part;
      o_t_value_sum += o_t;
      o_t_deriv_sum += o_t * (1.0F - o_t);
      c_t_value_sum += tanh_c_t;
      c_t_deriv_sum += 1.0F - tanh_c_t * tanh_c_t;

      // the derivative of the objective function w.r.t. a particular quantity
      // will be written by prepending "d" to the name.
      // We compute these derivatives in the reverse of the order in which
      // we computed the original quantities.
      // dc_t_out is the part of the derivative w.r.t. c_t that
      // comes directly from the output of this function.
      Real dc_t_out = output_deriv_mat(r, c);
      Real dm_t = output_deriv_mat(r, c + cell_dim);
      Real dtanh_c_t = o_t * o_scale * dm_t;
      Real do_t = o_scale * tanh_c_t * dm_t;
      Real do_t_input = (o_t * (1.0F - o_t) * do_t
                         - (2.0F * o_t - 1.0F) * o_t_self_repair);
      Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out
                   + do_t_input * w_oc) - tanh_c_t * c_t_self_repair;
      Real dtanh_c_part = i_t * i_scale * dc_t;
      Real df_t = dc_t * f_scale * c_prev;
      Real df_t_input = ((df_t * f_t * (1.0F - f_t)
                          - (2.0F * f_t - 1.0F) * f_t_self_repair));
      Real di_t = dc_t * i_scale * tanh_c_part;
      Real di_t_input = ((di_t * i_t * (1.0F - i_t)
                          - (2.0F * i_t - 1.0F) * i_t_self_repair));

      w_ic_deriv_sum += c_prev * di_t_input;
      w_fc_deriv_sum += c_prev * df_t_input;
      w_oc_deriv_sum += c_t * do_t_input;

      Real dc_prev = w_ic * di_t_input + w_fc * df_t_input
                     + f_t * f_scale * dc_t;
      Real do_part = do_t_input;
      Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part
                      - tanh_c_part * c_part_self_repair);
      Real df_part = df_t_input;
      Real di_part = di_t_input;

      if (input_deriv_mat != NULL) {
        (*input_deriv_mat)(r, c) = di_part;
        (*input_deriv_mat)(r, c + cell_dim) = df_part;
        (*input_deriv_mat)(r, c + 2 * cell_dim) = dc_part;
        (*input_deriv_mat)(r, c + 3 * cell_dim) = do_part;
        (*input_deriv_mat)(r, c + 4 * cell_dim) = dc_prev;
      }
    }

    if (params_deriv != NULL) {
      // note: for optimizing things you can assume that params_deriv and
      // input_deriv_mat are non-NULL (i.e. all the output matrices are
      // non-NULL).  The situation where some of the output matrices are NULL
      // does not happen often (mainly only in testing code).

      (*params_deriv_mat)(0, c) = w_ic_deriv_sum;
      (*params_deriv_mat)(1, c) = w_fc_deriv_sum;
      (*params_deriv_mat)(2, c) = w_oc_deriv_sum;

      (*value_sum_out_mat)(0, c) += i_t_value_sum;
      (*value_sum_out_mat)(1, c) += f_t_value_sum;
      (*value_sum_out_mat)(2, c) += c_part_value_sum;
      (*value_sum_out_mat)(3, c) += o_t_value_sum;
      (*value_sum_out_mat)(4, c) += c_t_value_sum;

      // need to update self_repair_sum_out before deriv_sum_out, because
      // deriv_sum_out and deriv_sum_in might point to the same memory.
      for (int32 i = 0; i < 5; i++)
        (*self_repair_sum_out_mat)(i, c) =
            (deriv_sum_in_mat(i, c) / count < sr_config(i) ? num_rows : 0);

      (*deriv_sum_out_mat)(0, c) += i_t_deriv_sum;
      (*deriv_sum_out_mat)(1, c) += f_t_deriv_sum;
      (*deriv_sum_out_mat)(2, c) += c_part_deriv_sum;
      (*deriv_sum_out_mat)(3, c) += o_t_deriv_sum;
      (*deriv_sum_out_mat)(4, c) += c_t_deriv_sum;
    }
  }
}
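// For reference, the layout of self_repair_config consumed above: elements
// 0-4 are the average-derivative thresholds for the five nonlinearities
// (i_t, f_t, c_part, o_t, c_t, in that order), and elements 5-9 are the
// corresponding self-repair scales applied when a unit's average derivative
// (deriv_sum_in / count) falls below its threshold.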
template<typename Real>
void BackpropLstmNonlinearity(const CuMatrixBase<Real> &input,
                              const CuMatrixBase<Real> &params,
                              const CuMatrixBase<Real> &output_deriv,
                              const CuMatrixBase<double> &deriv_sum_in,
                              const CuVectorBase<Real> &self_repair_config,
                              double count_in,
                              CuMatrixBase<Real> *input_deriv,
                              CuMatrixBase<Real> *params_deriv,
                              CuMatrixBase<double> *value_sum_out,
                              CuMatrixBase<double> *deriv_sum_out,
                              CuMatrixBase<Real> *self_repair_sum_out) {
  int32 num_rows = input.NumRows(),
        cell_dim = input.NumCols() / 5,
      input_cols = input.NumCols();
  // Check dimensions.
  KALDI_ASSERT(input_cols == (cell_dim * 5) ||
               input_cols == (cell_dim * 5) + 3);
  KALDI_ASSERT(params.NumRows() == 3);
  KALDI_ASSERT(params.NumCols() == cell_dim);
  KALDI_ASSERT(output_deriv.NumRows() == num_rows);
  KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
  KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
  KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
  KALDI_ASSERT(self_repair_config.Dim() == 10);
  if (input_deriv != NULL) {
    KALDI_ASSERT(SameDim(input, *input_deriv));
  }
  if (params_deriv == NULL) {
    KALDI_ASSERT(value_sum_out == NULL);
    KALDI_ASSERT(deriv_sum_out == NULL);
    KALDI_ASSERT(self_repair_sum_out == NULL);
  } else {
    KALDI_ASSERT(value_sum_out != NULL);
    KALDI_ASSERT(deriv_sum_out != NULL);
    KALDI_ASSERT(self_repair_sum_out != NULL);
    KALDI_ASSERT(SameDim(params, *params_deriv));
    KALDI_ASSERT(value_sum_out->NumRows() == 5);
    KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
    KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
    KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
    KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
  }

#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;
    // Each thread block works on 1 row of the data.
    // It's best if cell_dim is a multiple of CU1DBLOCK.
    int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);

    // Use 2D block (8x32 threads) as we need to compute column sum.
    // Use 1D grid to cover the data matrix width `cell_dim`.
    const int kWarpSize = 32;
    dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
//  dim3 dimGrid(n_blocks(cell_dim, dimBlock.x),
//               n_blocks(num_rows, dimBlock.y));
//  if (dimGrid.x * dimGrid.y > 1024) {
//    dimGrid.y = std::max(1024 / dimGrid.x, 1);
//  }
    dim3 dimGrid(n_blocks(cell_dim, dimBlock.x));
    if (input_deriv == NULL) {
      if (params_deriv == NULL) {
        cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
                                    have_dropout_mask, num_rows,
                                    input.Data(), input.Stride(),
                                    params.Data(), params.Stride(),
                                    output_deriv.Data(), output_deriv.Stride(),
                                    deriv_sum_in.Data(), deriv_sum_in.Stride(),
                                    self_repair_config.Data(), count_in + 1,
                                    NULL, 0, NULL, 0, NULL, 0, NULL, 0,
                                    NULL, 0);
      } else {
        cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
                                    have_dropout_mask, num_rows,
                                    input.Data(), input.Stride(),
                                    params.Data(), params.Stride(),
                                    output_deriv.Data(), output_deriv.Stride(),
                                    deriv_sum_in.Data(), deriv_sum_in.Stride(),
                                    self_repair_config.Data(), count_in + 1,
                                    NULL, 0,
                                    params_deriv->Data(),
                                    params_deriv->Stride(),
                                    value_sum_out->Data(),
                                    value_sum_out->Stride(),
                                    deriv_sum_out->Data(),
                                    deriv_sum_out->Stride(),
                                    self_repair_sum_out->Data(),
                                    self_repair_sum_out->Stride());
      }
    } else {
      if (params_deriv == NULL) {
        cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
                                    have_dropout_mask, num_rows,
                                    input.Data(), input.Stride(),
                                    params.Data(), params.Stride(),
                                    output_deriv.Data(), output_deriv.Stride(),
                                    deriv_sum_in.Data(), deriv_sum_in.Stride(),
                                    self_repair_config.Data(), count_in + 1,
                                    input_deriv->Data(), input_deriv->Stride(),
                                    NULL, 0, NULL, 0, NULL, 0, NULL, 0);
      } else {
        cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
                                    have_dropout_mask, num_rows,
                                    input.Data(), input.Stride(),
                                    params.Data(), params.Stride(),
                                    output_deriv.Data(), output_deriv.Stride(),
                                    deriv_sum_in.Data(), deriv_sum_in.Stride(),
                                    self_repair_config.Data(), count_in + 1,
                                    input_deriv->Data(), input_deriv->Stride(),
                                    params_deriv->Data(),
                                    params_deriv->Stride(),
                                    value_sum_out->Data(),
                                    value_sum_out->Stride(),
                                    deriv_sum_out->Data(),
                                    deriv_sum_out->Stride(),
                                    self_repair_sum_out->Data(),
                                    self_repair_sum_out->Stride());
      }
    }
    CU_SAFE_CALL(cudaGetLastError());

    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    // Guard the optional outputs: the asserts above allow input_deriv and
    // the stats matrices to be NULL, so don't dereference NULL pointers here.
    CpuBackpropLstmNonlinearity(input.Mat(), params.Mat(), output_deriv.Mat(),
                                deriv_sum_in.Mat(), self_repair_config.Vec(),
                                count_in,
                                input_deriv == NULL ? NULL :
                                    &(input_deriv->Mat()),
                                params_deriv == NULL ? NULL :
                                    &(params_deriv->Mat()),
                                value_sum_out == NULL ? NULL :
                                    &(value_sum_out->Mat()),
                                deriv_sum_out == NULL ? NULL :
                                    &(deriv_sum_out->Mat()),
                                self_repair_sum_out == NULL ? NULL :
                                    &(self_repair_sum_out->Mat()));
  }
}
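// Illustrative call pattern (hypothetical, not part of the build).  The
// optional outputs form two groups: input_deriv may be NULL by itself, while
// params_deriv, value_sum_out, deriv_sum_out and self_repair_sum_out must be
// all NULL or all non-NULL, per the asserts above.  E.g. to compute only the
// input derivative:
//
//   BackpropLstmNonlinearity(input, params, output_deriv, deriv_sum_in,
//                            self_repair_config, count,
//                            &input_deriv,              // wanted
//                            NULL, NULL, NULL, NULL);   // no param/stat outputs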
template<typename Real>
void EnsureNonzero(const CuVectorBase<Real> &src,
                   Real epsilon,
                   CuVectorBase<Real> *dest) {
  KALDI_ASSERT(src.Dim() == dest->Dim());
  int32 dim = src.Dim();
  // fake it with a 1-row matrix.
  CuSubMatrix<Real> src_mat(src.Data(), 1, dim, dim),
      dest_mat(dest->Data(), 1, dim, dim);
  EnsureNonzero(src_mat, epsilon, &dest_mat);
}
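// Illustrative behavior of EnsureNonzero() (example values): with
// epsilon = 0.1, the input [0.5, 0.02, -0.03, -2.0] maps to
// [0.5, 0.1, -0.1, -2.0]: elements in (-epsilon, epsilon) are pushed to the
// nearest of +/-epsilon (exact zeros go to +epsilon); all other elements
// pass through unchanged.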
// Instantiate the templates we defined above.
template
void EnsureNonzero(const CuMatrixBase<float> &src, float epsilon,
                   CuMatrixBase<float> *dest);
template
void EnsureNonzero(const CuMatrixBase<double> &src, double epsilon,
                   CuMatrixBase<double> *dest);
template
void EnsureNonzero(const CuVectorBase<float> &src, float epsilon,
                   CuVectorBase<float> *dest);
template
void EnsureNonzero(const CuVectorBase<double> &src, double epsilon,
                   CuVectorBase<double> *dest);

template
void CpuBackpropLstmNonlinearity(const MatrixBase<float> &input,
                                 const MatrixBase<float> &params,
                                 const MatrixBase<float> &output_deriv,
                                 const MatrixBase<double> &deriv_sum_in,
                                 const VectorBase<float> &self_repair_config,
                                 double count_in,
                                 MatrixBase<float> *input_deriv,
                                 MatrixBase<float> *params_deriv,
                                 MatrixBase<double> *value_sum_out,
                                 MatrixBase<double> *deriv_sum_out,
                                 MatrixBase<float> *self_repair_sum_out);
template
void CpuBackpropLstmNonlinearity(const MatrixBase<double> &input,
                                 const MatrixBase<double> &params,
                                 const MatrixBase<double> &output_deriv,
                                 const MatrixBase<double> &deriv_sum_in,
                                 const VectorBase<double> &self_repair_config,
                                 double count_in,
                                 MatrixBase<double> *input_deriv,
                                 MatrixBase<double> *params_deriv,
                                 MatrixBase<double> *value_sum_out,
                                 MatrixBase<double> *deriv_sum_out,
                                 MatrixBase<double> *self_repair_sum_out);
template
void BackpropLstmNonlinearity(const CuMatrixBase<float> &input,
                              const CuMatrixBase<float> &params,
                              const CuMatrixBase<float> &output_deriv,
                              const CuMatrixBase<double> &deriv_sum_in,
                              const CuVectorBase<float> &self_repair_config,
                              double count_in,
                              CuMatrixBase<float> *input_deriv,
                              CuMatrixBase<float> *params_deriv,
                              CuMatrixBase<double> *value_sum_out,
                              CuMatrixBase<double> *deriv_sum_out,
                              CuMatrixBase<float> *self_repair_sum_out);
template
void BackpropLstmNonlinearity(const CuMatrixBase<double> &input,
                              const CuMatrixBase<double> &params,
                              const CuMatrixBase<double> &output_deriv,
                              const CuMatrixBase<double> &deriv_sum_in,
                              const CuVectorBase<double> &self_repair_config,
                              double count_in,
                              CuMatrixBase<double> *input_deriv,
                              CuMatrixBase<double> *params_deriv,
                              CuMatrixBase<double> *value_sum_out,
                              CuMatrixBase<double> *deriv_sum_out,
                              CuMatrixBase<double> *self_repair_sum_out);

}  // namespace cu

}  // namespace kaldi