// nnet3/nnet-simple-component.cc // Copyright 2015-2017 Johns Hopkins University (author: Daniel Povey) // 2015 Xiaohui Zhang // 2015 Guoguo Chen // 2015 Daniel Galvez // 2016 Yiming Wang // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include #include #include #include #include "nnet3/nnet-simple-component.h" #include "nnet3/nnet-parse.h" #include "cudamatrix/cu-math.h" namespace kaldi { namespace nnet3 { void PnormComponent::Init(int32 input_dim, int32 output_dim) { input_dim_ = input_dim; output_dim_ = output_dim; KALDI_ASSERT(input_dim_ > 0 && output_dim_ > 0 && input_dim_ % output_dim_ == 0); } void PnormComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = 0; int32 output_dim = 0; bool ok = cfl->GetValue("output-dim", &output_dim) && cfl->GetValue("input-dim", &input_dim); if (!ok || cfl->HasUnusedValues() || output_dim <= 0) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; Init(input_dim, output_dim); } void* PnormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { BaseFloat p = 2.0; out->GroupPnorm(in, p); return NULL; } void PnormComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update, CuMatrixBase *in_deriv) const { if (!in_deriv) return; BaseFloat p = 2.0; in_deriv->DiffGroupPnorm(in_value, out_value, out_deriv, p); } void PnormComponent::Read(std::istream &is, bool binary) { ExpectOneOrTwoTokens(is, binary, "", ""); ReadBasicType(is, binary, &input_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &output_dim_); ExpectToken(is, binary, ""); } void PnormComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); WriteBasicType(os, binary, input_dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, output_dim_); WriteToken(os, binary, ""); } DropoutComponent::DropoutComponent(const DropoutComponent &other): RandomComponent(other), dim_(other.dim_), dropout_proportion_(other.dropout_proportion_), dropout_per_frame_(other.dropout_per_frame_) { } Component* DropoutComponent::Copy() const { DropoutComponent *ans = new DropoutComponent(*this); return ans; } void DropoutComponent::Init(int32 dim, BaseFloat dropout_proportion, bool dropout_per_frame) { dropout_proportion_ = dropout_proportion; dropout_per_frame_ = dropout_per_frame; dim_ = dim; } void DropoutComponent::InitFromConfig(ConfigLine *cfl) { int32 dim = 0; BaseFloat dropout_proportion = 0.0; bool dropout_per_frame = false; test_mode_ = false; bool ok = cfl->GetValue("dim", &dim) && cfl->GetValue("dropout-proportion", &dropout_proportion); cfl->GetValue("dropout-per-frame", &dropout_per_frame); // It only makes sense to set test-mode in the config for testing purposes. cfl->GetValue("test-mode", &test_mode_); // for this stage, dropout is hard coded in // normal mode if not declared in config if (!ok || cfl->HasUnusedValues() || dim <= 0 || dropout_proportion < 0.0 || dropout_proportion > 1.0) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; Init(dim, dropout_proportion, dropout_per_frame); } std::string DropoutComponent::Info() const { std::ostringstream stream; stream << Type() << ", dim=" << dim_ << ", dropout-proportion=" << dropout_proportion_ << ", dropout-per-frame=" << (dropout_per_frame_ ? "true" : "false"); return stream.str(); } void* DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { KALDI_ASSERT(out->NumRows() == in.NumRows() && out->NumCols() == in.NumCols() && in.NumCols() == dim_); BaseFloat dropout = dropout_proportion_; KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0); if (test_mode_) { out->CopyFromMat(in); out->Scale(1.0 - dropout); return NULL; } if (!dropout_per_frame_) { // This const_cast is only safe assuming you don't attempt // to use multi-threaded code with the GPU. const_cast&>(random_generator_).RandUniform(out); out->Add(-dropout); // now, a proportion "dropout" will be <0.0 // apply the function (x>0?1:0). Now, a proportion // "dropout" will be zero and (1 - dropout) will be 1.0. out->ApplyHeaviside(); out->MulElements(in); } else { // randomize the dropout matrix by row, // i.e. [[1,1,1,1],[0,0,0,0],[0,0,0,0],[1,1,1,1],[0,0,0,0]] CuMatrix tmp(1, out->NumRows(), kUndefined); // This const_cast is only safe assuming you don't attempt // to use multi-threaded code with the GPU. const_cast&>(random_generator_).RandUniform(&tmp); tmp.Add(-dropout); tmp.ApplyHeaviside(); out->CopyColsFromVec(tmp.Row(0)); out->MulElements(in); } return NULL; } void DropoutComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update, CuMatrixBase *in_deriv) const { KALDI_ASSERT(in_value.NumRows() == out_value.NumRows() && in_value.NumCols() == out_value.NumCols()); KALDI_ASSERT(in_value.NumRows() == out_deriv.NumRows() && in_value.NumCols() == out_deriv.NumCols()); in_deriv->SetMatMatDivMat(out_deriv, out_value, in_value); } void DropoutComponent::Read(std::istream &is, bool binary) { std::string token; ReadToken(is, binary, &token); if (token == "") { ReadToken(is, binary, &token); } KALDI_ASSERT(token == ""); ReadBasicType(is, binary, &dim_); // read dimension. ReadToken(is, binary, &token); KALDI_ASSERT(token == ""); ReadBasicType(is, binary, &dropout_proportion_); // read dropout rate ReadToken(is, binary, &token); if (token == "") { ReadBasicType(is, binary, &dropout_per_frame_); // read dropout mode ReadToken(is, binary, &token); } else { dropout_per_frame_ = false; } if (token == "") { ReadBasicType(is, binary, &test_mode_); // read test mode ExpectToken(is, binary, ""); } else { test_mode_ = false; KALDI_ASSERT(token == ""); } } void DropoutComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); WriteBasicType(os, binary, dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, dropout_proportion_); WriteToken(os, binary, ""); WriteBasicType(os, binary, dropout_per_frame_); WriteToken(os, binary, ""); WriteBasicType(os, binary, test_mode_); WriteToken(os, binary, ""); } void ElementwiseProductComponent::Init(int32 input_dim, int32 output_dim) { input_dim_ = input_dim; output_dim_ = output_dim; KALDI_ASSERT(input_dim_ > 0 && output_dim_ >= 0); KALDI_ASSERT(input_dim_ > output_dim_); KALDI_ASSERT(input_dim_ % output_dim_ == 0); } void ElementwiseProductComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = 0; int32 output_dim = 0; bool ok = cfl->GetValue("output-dim", &output_dim) && cfl->GetValue("input-dim", &input_dim); if (!ok || cfl->HasUnusedValues() || output_dim <= 0) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; Init(input_dim, output_dim); } void* ElementwiseProductComponent::Propagate( const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { KALDI_ASSERT(in.NumCols() == input_dim_); int32 num_inputs = input_dim_ / output_dim_; for (int32 i = 0; i < num_inputs; i++) { CuSubMatrix current_in(in, 0, in.NumRows(), i * output_dim_, output_dim_); if (i == 0) { out->CopyFromMat(current_in); } else { out->MulElements(current_in); } } return NULL; } void ElementwiseProductComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update, CuMatrixBase *in_deriv) const { if (!in_deriv) return; int32 num_inputs = input_dim_ / output_dim_; for (int32 i = 0; i < num_inputs; i++) { CuSubMatrix current_in_deriv(*in_deriv, 0, in_deriv->NumRows(), i * output_dim_, output_dim_); current_in_deriv.CopyFromMat(out_deriv); for (int32 j = 0; j < num_inputs; j++) { if (i == j) continue; CuSubMatrix in_value_partition(in_value, 0, in_value.NumRows(), j * output_dim_, output_dim_); current_in_deriv.MulElements(in_value_partition); } } } void ElementwiseProductComponent::Read(std::istream &is, bool binary) { ExpectOneOrTwoTokens(is, binary, "", ""); ReadBasicType(is, binary, &input_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &output_dim_); ExpectToken(is, binary, ""); } void ElementwiseProductComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); WriteBasicType(os, binary, input_dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, output_dim_); WriteToken(os, binary, ""); } void* SigmoidComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { out->Sigmoid(in); return NULL; } void SigmoidComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { if (in_deriv != NULL) { in_deriv->DiffSigmoid(out_value, out_deriv); SigmoidComponent *to_update = dynamic_cast(to_update_in); if (to_update != NULL) { RepairGradients(out_value, in_deriv, to_update); to_update->StoreBackpropStats(out_deriv); } } } void SigmoidComponent::RepairGradients( const CuMatrixBase &out_value, CuMatrixBase *in_deriv, SigmoidComponent *to_update) const { KALDI_ASSERT(to_update != NULL); // maximum possible derivative of SigmoidComponent is 0.25. // the default lower-threshold on the derivative, below which we // add a term to the derivative to encourage the inputs to the sigmoid // to be closer to zero, is 0.05, which means the derivative is on average // 5 times smaller than its maximum possible value. BaseFloat default_lower_threshold = 0.05; // we use this 'repair_probability' (hardcoded for now) to limit // this code to running on about half of the minibatches. BaseFloat repair_probability = 0.5; to_update->num_dims_processed_ += dim_; if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ || RandUniform() > repair_probability) return; // check that the self-repair scale is in a reasonable range. KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1); BaseFloat unset = kUnsetThreshold; // -1000.0 BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? default_lower_threshold : self_repair_lower_threshold_) * count_; if (self_repair_upper_threshold_ != unset) { KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid " << "components, it does nothing."; } // thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside // function isn't defined for vectors). CuMatrix thresholds(1, dim_); CuSubVector thresholds_vec(thresholds, 0); thresholds_vec.AddVec(-1.0, deriv_sum_); thresholds_vec.Add(lower_threshold); thresholds.ApplyHeaviside(); to_update->num_dims_self_repaired_ += thresholds_vec.Sum(); // At this point, 'thresholds_vec' contains a 1 for each dimension of // the output that is 'problematic', i.e. for which the avg-deriv // is less than the self-repair lower threshold, and a 0 for // each dimension that is not problematic. // what we want to do is to add // -self_repair_scale_ / repair_probability times (2 * output-valiue - 1.0) // to the input derivative for each problematic dimension. // Here, 2 * output - 1.0 is a version of the sigmoid that goes from -1.0 to // 1.0, like a tanh. the negative sign is so that for inputs <0, we push them // up towards 0, and for inputs >0, we push them down towards 0. // Our use of this sigmoid-type function here is just a convenience since // we have it available. We could use just about any function that is positive // for inputs < 0 and negative for inputs > 0. // We can rearrange the above as: for only the problematic columns, // input-deriv -= 2 * self-repair-scale / repair-probabilty * output // input-deriv += self-repair-scale / repair-probabilty // which we can write as: // input-deriv -= 2 * self-repair-scale / repair-probabilty * output * thresholds-vec // input-deriv += self-repair-scale / repair-probabilty * thresholds-vec in_deriv->AddMatDiagVec(-2.0 * self_repair_scale_ / repair_probability, out_value, kNoTrans, thresholds_vec); in_deriv->AddVecToRows(self_repair_scale_ / repair_probability, thresholds_vec); } void SigmoidComponent::StoreStats(const CuMatrixBase &in_value, const CuMatrixBase &out_value, void *memo) { // Only store stats about every other minibatch (but on the first minibatch, // always store it, which is necessary for the ConsolidateMemory() operation // to work correctly. if (RandInt(0, 1) == 0 && count_ != 0) return; // derivative of the nonlinearity is out_value * (1.0 - out_value); CuMatrix temp_deriv(out_value.NumRows(), out_value.NumCols(), kUndefined); temp_deriv.Set(1.0); temp_deriv.AddMat(-1.0, out_value); temp_deriv.MulElements(out_value); StoreStatsInternal(out_value, &temp_deriv); } void* NoOpComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { out->CopyFromMat(in); return NULL; } void NoOpComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, const CuMatrixBase &, const CuMatrixBase &out_deriv, void *memo, Component *to_update, // may be NULL; may be identical // to "this" or different. CuMatrixBase *in_deriv) const { in_deriv->CopyFromMat(out_deriv); if (backprop_scale_ != 1.0) in_deriv->Scale(backprop_scale_); } void NoOpComponent::InitFromConfig(ConfigLine *cfl) { backprop_scale_ = 1.0; cfl->GetValue("backprop-scale", &backprop_scale_); if (!cfl->GetValue("dim", &dim_) || dim_ <= 0 || cfl->HasUnusedValues()) { KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; } } std::string NoOpComponent::Info() const { std::ostringstream stream; stream << Type() << ", dim=" << dim_; if (backprop_scale_ != 1.0) stream << ", backprop-scale=" << backprop_scale_; return stream.str(); } void NoOpComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); WriteBasicType(os, binary, dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, backprop_scale_); WriteToken(os, binary, ""); } void NoOpComponent::Read(std::istream &is, bool binary) { ExpectOneOrTwoTokens(is, binary, "", ""); ReadBasicType(is, binary, &dim_); if (PeekToken(is, binary) == 'V') { // This is the old format, from when NoOpComponent inherited from // NonlinearComponent. backprop_scale_ = 1.0; ExpectToken(is, binary, ""); CuVector temp_vec; temp_vec.Read(is, binary); ExpectToken(is, binary, ""); temp_vec.Read(is, binary); ExpectToken(is, binary, ""); BaseFloat temp_float; ReadBasicType(is, binary, &temp_float); if (PeekToken(is, binary) == 'O') { ExpectToken(is, binary, ""); temp_vec.Read(is, binary); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &temp_float); } std::string token; ReadToken(is, binary, &token); if (token[0] != '<') { // this should happen only rarely, in case we couldn't push back the // '<' to the stream in PeekToken(). token = '<' + token; } if (token == "") { ReadBasicType(is, binary, &temp_float); ReadToken(is, binary, &token); } if (token == "") { ReadBasicType(is, binary, &temp_float); ReadToken(is, binary, &token); } KALDI_ASSERT(token == ""); return; } else { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &backprop_scale_); ExpectToken(is, binary, ""); } } void ClipGradientComponent::Read(std::istream &is, bool binary) { // might not see the "" part because // of how ReadNew() works. ExpectOneOrTwoTokens(is, binary, "", ""); ReadBasicType(is, binary, &dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &clipping_threshold_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &norm_based_clipping_); std::string token; ReadToken(is, binary, &token); if (token == "") { ReadBasicType(is, binary, &self_repair_clipped_proportion_threshold_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &self_repair_target_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &self_repair_scale_); ExpectToken(is, binary, ""); } else { self_repair_clipped_proportion_threshold_ = 1.0; self_repair_target_ = 0.0; self_repair_scale_ = 0.0; KALDI_ASSERT(token == ""); } ReadBasicType(is, binary, &num_clipped_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &count_); ReadToken(is, binary, &token); if (token == "") { ReadBasicType(is, binary, &num_self_repaired_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &num_backpropped_); ExpectToken(is, binary, ""); } else { num_self_repaired_ = 0; num_backpropped_ = 0; KALDI_ASSERT(token == ""); } } void ClipGradientComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); WriteBasicType(os, binary, dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, clipping_threshold_); WriteToken(os, binary, ""); WriteBasicType(os, binary, norm_based_clipping_); WriteToken(os, binary, ""); WriteBasicType(os, binary, self_repair_clipped_proportion_threshold_); WriteToken(os, binary, ""); WriteBasicType(os, binary, self_repair_target_); WriteToken(os, binary, ""); WriteBasicType(os, binary, self_repair_scale_); WriteToken(os, binary, ""); WriteBasicType(os, binary, num_clipped_); WriteToken(os, binary, ""); WriteBasicType(os, binary, count_); WriteToken(os, binary, ""); WriteBasicType(os, binary, num_self_repaired_); WriteToken(os, binary, ""); WriteBasicType(os, binary, num_backpropped_); WriteToken(os, binary, ""); } std::string ClipGradientComponent::Info() const { std::ostringstream stream; stream << Type() << ", dim=" << dim_ << ", norm-based-clipping=" << (norm_based_clipping_ ? "true" : "false") << ", clipping-threshold=" << clipping_threshold_ << ", clipped-proportion=" << (count_ > 0 ? static_cast(num_clipped_)/count_ : 0); if (self_repair_scale_ != 0.0) stream << ", self-repair-clipped-proportion-threshold=" << self_repair_clipped_proportion_threshold_ << ", self-repair-target=" << self_repair_target_ << ", self-repair-scale=" << self_repair_scale_; return stream.str(); } void ClipGradientComponent::Init(int32 dim, BaseFloat clipping_threshold, bool norm_based_clipping, BaseFloat self_repair_clipped_proportion_threshold, BaseFloat self_repair_target, BaseFloat self_repair_scale, int32 num_clipped, int32 count, int32 num_self_repaired, int32 num_backpropped) { KALDI_ASSERT(clipping_threshold >= 0 && dim > 0 && self_repair_clipped_proportion_threshold >= 0.0 && self_repair_target >= 0.0 && self_repair_scale >= 0.0); dim_ = dim; norm_based_clipping_ = norm_based_clipping; clipping_threshold_ = clipping_threshold; self_repair_clipped_proportion_threshold_ = self_repair_clipped_proportion_threshold; self_repair_target_ = self_repair_target; self_repair_scale_ = self_repair_scale; num_clipped_ = num_clipped; count_ = count; num_self_repaired_ = num_self_repaired; num_backpropped_ = num_backpropped; } void ClipGradientComponent::InitFromConfig(ConfigLine *cfl) { int32 dim = 0; bool ok = cfl->GetValue("dim", &dim); bool norm_based_clipping = false; BaseFloat clipping_threshold = 15.0; BaseFloat self_repair_clipped_proportion_threshold = 0.01; BaseFloat self_repair_target = 0.0; BaseFloat self_repair_scale = 1.0; cfl->GetValue("clipping-threshold", &clipping_threshold); cfl->GetValue("norm-based-clipping", &norm_based_clipping); cfl->GetValue("self-repair-clipped-proportion-threshold", &self_repair_clipped_proportion_threshold); cfl->GetValue("self-repair-target", &self_repair_target); cfl->GetValue("self-repair-scale", &self_repair_scale); if (!ok || cfl->HasUnusedValues() || clipping_threshold < 0 || dim <= 0 || self_repair_clipped_proportion_threshold < 0.0 || self_repair_target < 0.0 || self_repair_scale < 0.0) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; Init(dim, clipping_threshold, norm_based_clipping, self_repair_clipped_proportion_threshold, self_repair_target, self_repair_scale, 0, 0, 0, 0); } void* ClipGradientComponent::Propagate( const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { out->CopyFromMat(in); return NULL; } void ClipGradientComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, const CuMatrixBase &, const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, // may be NULL; may be identical // to "this" or different. CuMatrixBase *in_deriv) const { // the following statement will do nothing if in_deriv and out_deriv have same // memory. in_deriv->CopyFromMat(out_deriv); ClipGradientComponent *to_update = dynamic_cast(to_update_in); if (clipping_threshold_ > 0) { if (norm_based_clipping_) { // each row in the derivative matrix, which corresponds to one sample in // the mini-batch, is scaled to have a max-norm of clipping_threshold_ CuVector clipping_scales(in_deriv->NumRows()); clipping_scales.AddDiagMat2(pow(clipping_threshold_, -2), *in_deriv, kNoTrans, 0.0); // now clipping_scales contains the squared (norm of each row divided by // clipping_threshold) int32 num_not_scaled; clipping_scales.ApplyFloor(1.0, &num_not_scaled); // now clipping_scales contains min(1, // squared-(norm/clipping_threshold)) if (num_not_scaled != clipping_scales.Dim()) { clipping_scales.ApplyPow(-0.5); // now clipping_scales contains max(1, // clipping_threshold/vector_norm) in_deriv->MulRowsVec(clipping_scales); if (to_update != NULL) to_update->num_clipped_ += (clipping_scales.Dim() - num_not_scaled); } if (to_update != NULL) to_update->count_ += clipping_scales.Dim(); } else { // each element of the derivative matrix, is clipped to be below the // clipping_threshold_ in_deriv->ApplyCeiling(clipping_threshold_); in_deriv->ApplyFloor(-1 * clipping_threshold_); } if (to_update != NULL) { to_update->num_backpropped_ += 1; RepairGradients(debug_info, in_value, in_deriv, to_update); } } else if (clipping_threshold_ == 0.0) { in_deriv->SetZero(); } } // This function will add a self-repair term to in-deriv, attempting to shrink // the magnitude of the input towards self_repair_target_. // This term is proportional to [-(input vector - self_repair_target_)]. // The avarage magnitude of this term is equal to // [self_repair_scale_ * clipped_proportion * average norm of input derivative]. // We use norm of input derivative when computing the magnitude so that it is // comparable to the magnitude of input derivative, especially when the gradient // explosion is actually happening. void ClipGradientComponent::RepairGradients( const std::string &debug_info, const CuMatrixBase &in_value, CuMatrixBase *in_deriv, ClipGradientComponent *to_update) const { KALDI_ASSERT(to_update != NULL); // we use this 'repair_probability' (hardcoded for now) to limit // this code to running on about half of the minibatches. BaseFloat repair_probability = 0.5; if (self_repair_clipped_proportion_threshold_ >= 1.0 || self_repair_scale_ == 0.0 || count_ == 0 || RandUniform() > repair_probability) return; KALDI_ASSERT(self_repair_target_ >= 0.0 && self_repair_scale_ > 0.0); BaseFloat clipped_proportion = (count_ > 0 ? static_cast(num_clipped_) / count_ : 0); // in-deriv would be modified only when clipped_proportion exceeds the // threshold if (clipped_proportion <= self_repair_clipped_proportion_threshold_) return; to_update->num_self_repaired_ += 1; if (to_update->debug_info_ == "") // get the component-node name to_update->debug_info_ = debug_info; if (to_update->num_self_repaired_ == 1) KALDI_LOG << "ClipGradientComponent(node_name=" << debug_info << ")'s self-repair was activated as the first time at the " << to_update->num_backpropped_ << "-th call of Backprop() in this training job."; // sign_mat = sign(in_value), i.e., // An element in sign_mat is 1 if its corresponding element in in_value > 0, // or -1 otherwise CuMatrix sign_mat(in_value); sign_mat.ApplyHeaviside(); sign_mat.Scale(2.0); sign_mat.Add(-1.0); // repair_mat = // floor(abs(in_value) - self_repair_target_, 0) .* sign(in_value) CuMatrix repair_mat(in_value); repair_mat.ApplyPowAbs(1.0); repair_mat.Add(-self_repair_target_); repair_mat.ApplyFloor(0.0); repair_mat.MulElements(sign_mat); // magnitude = // self_repair_scale_ * clipped_proportion * average norm of in-deriv CuVector in_deriv_norm_vec(in_deriv->NumRows()); in_deriv_norm_vec.AddDiagMat2(1.0, *in_deriv, kNoTrans, 0.0); in_deriv_norm_vec.ApplyPow(0.5); double in_deriv_norm_sum = in_deriv_norm_vec.Sum(); BaseFloat magnitude = self_repair_scale_ * clipped_proportion * (in_deriv_norm_sum / in_deriv_norm_vec.Dim()); CuVector repair_mat_norm_vec(repair_mat.NumRows()); repair_mat_norm_vec.AddDiagMat2(1.0, repair_mat, kNoTrans, 0.0); repair_mat_norm_vec.ApplyPow(0.5); double repair_mat_norm_sum = repair_mat_norm_vec.Sum(); double scale = 0.0; if (repair_mat_norm_sum != 0.0) scale = magnitude / (repair_mat_norm_sum / repair_mat_norm_vec.Dim()); // repair_mat is scaled so that on average the rows have the norm // (magnitude / repair_probability). This will give higher magnitude of // self-repair to input vectors that have larger absolute value, which tend to // be those that are diverging. in_deriv->AddMat(-scale / repair_probability, repair_mat); CuVector in_deriv_repaired_norm_vec(in_deriv->NumRows()); in_deriv_repaired_norm_vec.AddDiagMat2(1.0, *in_deriv, kNoTrans, 0.0); in_deriv_repaired_norm_vec.ApplyPow(0.5); // scale in_deriv to have the same norm as that before adding the self-repair // term, in order to avoid increase of the norm caused by self-repair, // which may incur more clip of gradient and thus more self-repair double in_deriv_repaired_norm_sum = in_deriv_repaired_norm_vec.Sum(); if (in_deriv_repaired_norm_sum != 0.0) in_deriv->Scale(in_deriv_norm_sum / in_deriv_repaired_norm_sum); } void ClipGradientComponent::ZeroStats() { count_ = 0.0; num_clipped_ = 0.0; num_self_repaired_ = 0; num_backpropped_ = 0; } void ClipGradientComponent::Scale(BaseFloat scale) { count_ *= scale; num_clipped_ *= scale; } void ClipGradientComponent::Add(BaseFloat alpha, const Component &other_in) { const ClipGradientComponent *other = dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); count_ += alpha * other->count_; num_clipped_ += alpha * other->num_clipped_; } void* TanhComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { // Apply tanh function to each element of the output... // the tanh function may be written as -1 + ( 2 / (1 + e^{-2 x})), // which is a scaled and shifted sigmoid. out->Tanh(in); return NULL; } void TanhComponent::RepairGradients( const CuMatrixBase &out_value, CuMatrixBase *in_deriv, TanhComponent *to_update) const { KALDI_ASSERT(to_update != NULL); // maximum possible derivative of SigmoidComponent is 1.0 // the default lower-threshold on the derivative, below which we // add a term to the derivative to encourage the inputs to the sigmoid // to be closer to zero, is 0.2, which means the derivative is on average // 5 times smaller than its maximum possible value. BaseFloat default_lower_threshold = 0.2; // we use this 'repair_probability' (hardcoded for now) to limit // this code to running on about half of the minibatches. BaseFloat repair_probability = 0.5; to_update->num_dims_processed_ += dim_; if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ || RandUniform() > repair_probability) return; // check that the self-repair scale is in a reasonable range. KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1); BaseFloat unset = kUnsetThreshold; // -1000.0 BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? default_lower_threshold : self_repair_lower_threshold_) * count_; if (self_repair_upper_threshold_ != unset) { KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid " << "components, it does nothing."; } // thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside // function isn't defined for vectors). CuMatrix thresholds(1, dim_); CuSubVector thresholds_vec(thresholds, 0); thresholds_vec.AddVec(-1.0, deriv_sum_); thresholds_vec.Add(lower_threshold); thresholds.ApplyHeaviside(); to_update->num_dims_self_repaired_ += thresholds_vec.Sum(); // At this point, 'thresholds_vec' contains a 1 for each dimension of // the output that is 'problematic', i.e. for which the avg-deriv // is less than the self-repair lower threshold, and a 0 for // each dimension that is not problematic. // what we want to do is to add -self_repair_scale_ / repair_probability times // output-valiue) to the input derivative for each problematic dimension. // note that for the tanh, the output-value goes from -1.0 when the input is // -inf to +1.0 when the input is +inf. The negative sign is so that for // inputs <0, we push them up towards 0, and for inputs >0, we push them down // towards 0. Our use of the tanh here is just a convenience since we have it // available. We could use just about any function that is positive for // inputs < 0 and negative for inputs > 0. // We can rearrange the above as: for only the problematic columns, // input-deriv -= self-repair-scale / repair-probabilty * output // which we can write as: // input-deriv -= self-repair-scale / repair-probabilty * output * thresholds-vec in_deriv->AddMatDiagVec(-self_repair_scale_ / repair_probability, out_value, kNoTrans, thresholds_vec); } void TanhComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, // may be NULL; may be identical // to "this" or different. CuMatrixBase *in_deriv) const { if (in_deriv != NULL) { in_deriv->DiffTanh(out_value, out_deriv); TanhComponent *to_update = dynamic_cast(to_update_in); if (to_update != NULL) { RepairGradients(out_value, in_deriv, to_update); to_update->StoreBackpropStats(out_deriv); } } } /* Note on the derivative of the tanh function: tanh'(x) = sech^2(x) = -(tanh(x)+1) (tanh(x)-1) = 1 - tanh^2(x) The element by element equation of what we're doing would be: in_deriv = out_deriv * (1.0 - out_value^2). We can accomplish this via calls to the matrix library. */ void TanhComponent::StoreStats(const CuMatrixBase &in_value, const CuMatrixBase &out_value, void *memo) { // Only store stats about every other minibatch (but on the first minibatch, // always store it, which is necessary for the ConsolidateMemory() operation // to work correctly. if (RandInt(0, 1) == 0 && count_ != 0) return; // derivative of the onlinearity is out_value * (1.0 - out_value); CuMatrix temp_deriv(out_value); temp_deriv.ApplyPow(2.0); temp_deriv.Scale(-1.0); temp_deriv.Add(1.0); StoreStatsInternal(out_value, &temp_deriv); } void* RectifiedLinearComponent::Propagate( const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { // Apply rectified linear function (x >= 0 ? 1.0 : 0.0) out->CopyFromMat(in); out->ApplyFloor(0.0); return NULL; } void RectifiedLinearComponent::Backprop( const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, //in_value const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { if (in_deriv != NULL) { in_deriv->Heaviside(out_value); in_deriv->MulElements(out_deriv); RectifiedLinearComponent *to_update = dynamic_cast(to_update_in); if (to_update != NULL) { RepairGradients(in_deriv, to_update); to_update->StoreBackpropStats(out_deriv); } } } void RectifiedLinearComponent::RepairGradients( CuMatrixBase *in_deriv, RectifiedLinearComponent *to_update) const { KALDI_ASSERT(to_update != NULL); int32 dim = dim_, block_dim = block_dim_; BaseFloat default_lower_threshold = 0.05, default_upper_threshold = 0.95; // we use this 'repair_probability' (hardcoded for now) to limit // this code to running on about half of the minibatches. BaseFloat repair_probability = 0.5; KALDI_ASSERT(in_deriv->NumCols() == dim || in_deriv->NumCols() == block_dim); if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim) return; if (in_deriv->NumCols() != block_dim) { KALDI_ASSERT(in_deriv->NumCols() == in_deriv->Stride()); int32 dim_multiple = dim / block_dim; CuSubMatrix in_deriv_reshaped(in_deriv->Data(), in_deriv->NumRows() * dim_multiple, block_dim, block_dim); RepairGradients(&in_deriv_reshaped, to_update); return; } // By now we know that in_deriv->NumCols() == block_dim. if (RandUniform() > repair_probability) return; to_update->num_dims_processed_ += block_dim; // check that the self-repair scale is in a reasonable range. KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1); BaseFloat unset = kUnsetThreshold; // -1000.0 BaseFloat count = count_, lower_threshold = (self_repair_lower_threshold_ == unset ? default_lower_threshold : self_repair_lower_threshold_) * count, upper_threshold = (self_repair_upper_threshold_ == unset ? default_upper_threshold : self_repair_upper_threshold_) * count; CuMatrix storage(2, block_dim + 2, kUndefined); CuSubVector thresholds_vec(storage.RowData(0) + block_dim, 2); CuSubMatrix stats_mat(storage, 0, 2, 0, block_dim); thresholds_vec(0) = -lower_threshold; thresholds_vec(1) = -upper_threshold; CuSubVector row0(stats_mat, 0); CuSubVector row1(stats_mat, 1); if (block_dim == dim) { row0.CopyFromVec(deriv_sum_); } else { CuSubMatrix deriv_sum_mat(deriv_sum_.Data(), dim / block_dim, block_dim, block_dim); CuVector deriv_sum_dbl(block_dim); // get the average of the deriv-sums over the blocks. deriv_sum_dbl.AddRowSumMat(block_dim * 1.0 / dim, deriv_sum_mat); row0.CopyFromVec(deriv_sum_dbl); } row1.CopyFromVec(row0); stats_mat.AddVecToCols(1.0, thresholds_vec, 1.0); // now row0 equals stats - lower_threshold, and // row1 equals stats - upper_threshold. stats_mat.ApplyHeaviside(); // now row0 equals (stats > lower_threshold ? 1 : 0), and // row1 equals (stats > upper_threshold ? 1 : 0). // what we want is: // self_repair_scale * ((stats <= lower_threshold ? 1 : 0) + // (stats > upper_threshold ? -1 : 0)). // // we can get these in stats_mat.Row(0) by computing: // -self_repair_scale * (stats_mat.Row(1) + stats_mat.Row(0) - 1). row0.AddVec(1.0, row1, 1.0); row0.Add(-1.0); CuVector temp(row0); temp.ApplyPow(2.0); to_update->num_dims_self_repaired_ += temp.Sum(); // [actually we need to divide by repair_probability also, to // correct for the fact that we only do this on some frames.] row0.Scale(-self_repair_scale_ / repair_probability); in_deriv->AddVecToRows(1.0, row0, 1.0); } void RectifiedLinearComponent::StoreStats( const CuMatrixBase &in_value, const CuMatrixBase &out_value, void *memo) { // Only store stats about every other minibatch (but on the first minibatch, // always store it, which is necessary for the ConsolidateMemory() operation // to work correctly. if (RandInt(0, 1) == 0 && count_ != 0) return; CuMatrix temp_deriv(out_value.NumRows(), out_value.NumCols(), kUndefined); temp_deriv.Heaviside(out_value); StoreStatsInternal(out_value, &temp_deriv); } void AffineComponent::Scale(BaseFloat scale) { if (scale == 0.0) { // If scale == 0.0 we call SetZero() which will get rid of NaN's and inf's. linear_params_.SetZero(); bias_params_.SetZero(); } else { linear_params_.Scale(scale); bias_params_.Scale(scale); } } void AffineComponent::Resize(int32 input_dim, int32 output_dim) { KALDI_ASSERT(input_dim > 0 && output_dim > 0); bias_params_.Resize(output_dim); linear_params_.Resize(output_dim, input_dim); } void AffineComponent::Add(BaseFloat alpha, const Component &other_in) { const AffineComponent *other = dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); linear_params_.AddMat(alpha, other->linear_params_); bias_params_.AddVec(alpha, other->bias_params_); } AffineComponent::AffineComponent(const AffineComponent &component): UpdatableComponent(component), linear_params_(component.linear_params_), bias_params_(component.bias_params_), orthonormal_constraint_(component.orthonormal_constraint_) { } AffineComponent::AffineComponent(const CuMatrixBase &linear_params, const CuVectorBase &bias_params, BaseFloat learning_rate): linear_params_(linear_params), bias_params_(bias_params), orthonormal_constraint_(0.0) { SetUnderlyingLearningRate(learning_rate); KALDI_ASSERT(linear_params.NumRows() == bias_params.Dim()&& bias_params.Dim() != 0); } void AffineComponent::SetParams(const CuVectorBase &bias, const CuMatrixBase &linear) { bias_params_ = bias; linear_params_ = linear; KALDI_ASSERT(bias_params_.Dim() == linear_params_.NumRows()); } void AffineComponent::PerturbParams(BaseFloat stddev) { CuMatrix temp_linear_params(linear_params_); temp_linear_params.SetRandn(); linear_params_.AddMat(stddev, temp_linear_params); CuVector temp_bias_params(bias_params_); temp_bias_params.SetRandn(); bias_params_.AddVec(stddev, temp_bias_params); } std::string AffineComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info(); if (orthonormal_constraint_ != 0.0) stream << ", orthonormal-constraint=" << orthonormal_constraint_; PrintParameterStats(stream, "linear-params", linear_params_, false, // include_mean true, // include_row_norms true, // include_column_norms GetVerboseLevel() >= 2); // include_singular_values PrintParameterStats(stream, "bias", bias_params_, true); return stream.str(); } Component* AffineComponent::Copy() const { AffineComponent *ans = new AffineComponent(*this); return ans; } BaseFloat AffineComponent::DotProduct(const UpdatableComponent &other_in) const { const AffineComponent *other = dynamic_cast(&other_in); return TraceMatMat(linear_params_, other->linear_params_, kTrans) + VecVec(bias_params_, other->bias_params_); } void AffineComponent::Init(int32 input_dim, int32 output_dim, BaseFloat param_stddev, BaseFloat bias_stddev) { linear_params_.Resize(output_dim, input_dim); bias_params_.Resize(output_dim); KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0); linear_params_.SetRandn(); // sets to random normally distributed noise. linear_params_.Scale(param_stddev); bias_params_.SetRandn(); bias_params_.Scale(bias_stddev); } void AffineComponent::Init(std::string matrix_filename) { CuMatrix mat; ReadKaldiObject(matrix_filename, &mat); // will abort on failure. KALDI_ASSERT(mat.NumCols() >= 2); int32 input_dim = mat.NumCols() - 1, output_dim = mat.NumRows(); linear_params_.Resize(output_dim, input_dim); bias_params_.Resize(output_dim); linear_params_.CopyFromMat(mat.Range(0, output_dim, 0, input_dim)); bias_params_.CopyColFromMat(mat, input_dim); } void AffineComponent::InitFromConfig(ConfigLine *cfl) { bool ok = true; std::string matrix_filename; int32 input_dim = -1, output_dim = -1; InitLearningRatesFromConfig(cfl); if (cfl->GetValue("matrix", &matrix_filename)) { Init(matrix_filename); if (cfl->GetValue("input-dim", &input_dim)) KALDI_ASSERT(input_dim == InputDim() && "input-dim mismatch vs. matrix."); if (cfl->GetValue("output-dim", &output_dim)) KALDI_ASSERT(output_dim == OutputDim() && "output-dim mismatch vs. matrix."); } else { ok = ok && cfl->GetValue("input-dim", &input_dim); ok = ok && cfl->GetValue("output-dim", &output_dim); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim), bias_stddev = 1.0; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-stddev", &bias_stddev); Init(input_dim, output_dim, param_stddev, bias_stddev); } cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); } void* AffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { // No need for asserts as they'll happen within the matrix operations. out->CopyRowsFromVec(bias_params_); // copies bias_params_ to each row // of *out. out->AddMatMat(1.0, in, kNoTrans, linear_params_, kTrans, 1.0); return NULL; } void AffineComponent::UpdateSimple(const CuMatrixBase &in_value, const CuMatrixBase &out_deriv) { bias_params_.AddRowSumMat(learning_rate_, out_deriv, 1.0); linear_params_.AddMatMat(learning_rate_, out_deriv, kTrans, in_value, kNoTrans, 1.0); } void AffineComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, const CuMatrixBase &, // out_value const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { AffineComponent *to_update = dynamic_cast(to_update_in); // Propagate the derivative back to the input. // add with coefficient 1.0 since property kBackpropAdds is true. // If we wanted to add with coefficient 0.0 we'd need to zero the // in_deriv, in case of infinities. if (in_deriv) in_deriv->AddMatMat(1.0, out_deriv, kNoTrans, linear_params_, kNoTrans, 1.0); if (to_update != NULL) { // Next update the model (must do this 2nd so the derivatives we propagate // are accurate, in case this == to_update_in.) if (to_update->is_gradient_) to_update->UpdateSimple(in_value, out_deriv); else // the call below is to a virtual function that may be re-implemented to_update->Update(debug_info, in_value, out_deriv); // by child classes. } } void AffineComponent::Read(std::istream &is, bool binary) { ReadUpdatableCommon(is, binary); // read opening tag and learning rate. ExpectToken(is, binary, ""); linear_params_.Read(is, binary); ExpectToken(is, binary, ""); bias_params_.Read(is, binary); if (PeekToken(is, binary) == 'I') { // for back compatibility; we don't write this here any // more as it's written and read in Write/ReadUpdatableCommon ExpectToken(is, binary, ""); ReadBasicType(is, binary, &is_gradient_); } if (PeekToken(is, binary) == 'O') { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &orthonormal_constraint_); } else { orthonormal_constraint_ = 0.0; } ExpectToken(is, binary, ""); } void AffineComponent::Write(std::ostream &os, bool binary) const { WriteUpdatableCommon(os, binary); // Write opening tag and learning rate WriteToken(os, binary, ""); linear_params_.Write(os, binary); WriteToken(os, binary, ""); bias_params_.Write(os, binary); if (orthonormal_constraint_ != 0.0) { WriteToken(os, binary, ""); WriteBasicType(os, binary, orthonormal_constraint_); } WriteToken(os, binary, ""); } int32 AffineComponent::NumParameters() const { return (InputDim() + 1) * OutputDim(); } void AffineComponent::Vectorize(VectorBase *params) const { KALDI_ASSERT(params->Dim() == this->NumParameters()); params->Range(0, InputDim() * OutputDim()).CopyRowsFromMat(linear_params_); params->Range(InputDim() * OutputDim(), OutputDim()).CopyFromVec(bias_params_); } void AffineComponent::UnVectorize(const VectorBase ¶ms) { KALDI_ASSERT(params.Dim() == this->NumParameters()); linear_params_.CopyRowsFromVec(params.Range(0, InputDim() * OutputDim())); bias_params_.CopyFromVec(params.Range(InputDim() * OutputDim(), OutputDim())); } RepeatedAffineComponent::RepeatedAffineComponent(const RepeatedAffineComponent & component) : UpdatableComponent(component), linear_params_(component.linear_params_), bias_params_(component.bias_params_), num_repeats_(component.num_repeats_) {} void RepeatedAffineComponent::Scale(BaseFloat scale) { if (scale == 0.0) { linear_params_.SetZero(); bias_params_.SetZero(); } else { linear_params_.Scale(scale); bias_params_.Scale(scale); } } void RepeatedAffineComponent::Add(BaseFloat alpha, const Component &other_in) { const RepeatedAffineComponent *other = dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); linear_params_.AddMat(alpha, other->linear_params_); bias_params_.AddVec(alpha, other->bias_params_); } void RepeatedAffineComponent::PerturbParams(BaseFloat stddev){ CuMatrix temp_linear_params(linear_params_); temp_linear_params.SetRandn(); linear_params_.AddMat(stddev, temp_linear_params); CuVector temp_bias_params(bias_params_); temp_bias_params.SetRandn(); bias_params_.AddVec(stddev, temp_bias_params); } std::string RepeatedAffineComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info() << ", num-repeats=" << num_repeats_; PrintParameterStats(stream, "linear-params", linear_params_); PrintParameterStats(stream, "bias", bias_params_, true); return stream.str(); } Component* RepeatedAffineComponent::Copy() const { RepeatedAffineComponent *ans = new RepeatedAffineComponent(*this); return ans; } BaseFloat RepeatedAffineComponent::DotProduct(const UpdatableComponent &other_in) const { const RepeatedAffineComponent *other = dynamic_cast(&other_in); return TraceMatMat(linear_params_, other->linear_params_, kTrans) + VecVec(bias_params_, other->bias_params_); } void RepeatedAffineComponent::Init(int32 input_dim, int32 output_dim, int32 num_repeats, BaseFloat param_stddev, BaseFloat bias_mean, BaseFloat bias_stddev) { KALDI_ASSERT(input_dim % num_repeats == 0 && output_dim % num_repeats == 0); linear_params_.Resize(output_dim / num_repeats, input_dim / num_repeats); bias_params_.Resize(output_dim / num_repeats); num_repeats_ = num_repeats; KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0); linear_params_.SetRandn(); // sets to random normally distributed noise. linear_params_.Scale(param_stddev); bias_params_.SetRandn(); bias_params_.Scale(bias_stddev); bias_params_.Add(bias_mean); SetNaturalGradientConfigs(); } void RepeatedAffineComponent::InitFromConfig(ConfigLine *cfl) { bool ok = true; int32 num_repeats = num_repeats_; int32 input_dim = -1, output_dim = -1; InitLearningRatesFromConfig(cfl); ok = cfl->GetValue("num-repeats", &num_repeats) && ok; ok = cfl->GetValue("input-dim", &input_dim) && ok; ok = cfl->GetValue("output-dim", &output_dim) && ok; KALDI_ASSERT(input_dim % num_repeats == 0 && "num-repeats must divide input-dim"); KALDI_ASSERT(output_dim % num_repeats == 0 && "num-repeats must divide output-dim"); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim / num_repeats), bias_mean = 0.0, bias_stddev = 0.0; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-mean", &bias_mean); cfl->GetValue("bias-stddev", &bias_stddev); Init(input_dim, output_dim, num_repeats, param_stddev, bias_mean, bias_stddev); if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); } void* RepeatedAffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { // we gave the kInputContiguous and kOutputContiguous flags-- check that they // are honored. KALDI_ASSERT(in.NumCols() == in.Stride() && out->NumCols() == out->Stride() && out->NumRows() == in.NumRows()); int32 num_repeats = num_repeats_, num_rows = in.NumRows(), block_dim_out = linear_params_.NumRows(), block_dim_in = linear_params_.NumCols(); CuSubMatrix in_reshaped(in.Data(), num_rows * num_repeats, block_dim_in, block_dim_in), out_reshaped(out->Data(), num_rows * num_repeats, block_dim_out, block_dim_out); out_reshaped.CopyRowsFromVec(bias_params_); out_reshaped.AddMatMat(1.0, in_reshaped, kNoTrans, linear_params_, kTrans, 1.0); return NULL; } void RepeatedAffineComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, const CuMatrixBase &, // out_value const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() && (in_value.NumCols() == 0 || in_value.NumCols() == in_value.Stride()) && (!in_deriv || in_deriv->NumCols() == in_deriv->Stride())); RepeatedAffineComponent *to_update = dynamic_cast( to_update_in); // Propagate the derivative back to the input. // add with coefficient 1.0 since property kBackpropAdds is true. // If we wanted to add with coefficient 0.0 we'd need to zero the // in_deriv, in case of infinities. if (in_deriv) { int32 num_repeats = num_repeats_, num_rows = out_deriv.NumRows(), block_dim_out = linear_params_.NumRows(), block_dim_in = linear_params_.NumCols(); CuSubMatrix in_deriv_reshaped(in_deriv->Data(), num_rows * num_repeats, block_dim_in, block_dim_in), out_deriv_reshaped(out_deriv.Data(), num_rows * num_repeats, block_dim_out, block_dim_out); in_deriv_reshaped.AddMatMat(1.0, out_deriv_reshaped, kNoTrans, linear_params_, kNoTrans, 1.0); } // Next update the model (must do this 2nd so the derivatives we propagate are // accurate, in case this == to_update_in.) if (to_update != NULL) to_update->Update(in_value, out_deriv); } void RepeatedAffineComponent::Update(const CuMatrixBase &in_value, const CuMatrixBase &out_deriv) { KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() && in_value.NumCols() == in_value.Stride() && in_value.NumRows() == out_deriv.NumRows()); int32 num_repeats = num_repeats_, num_rows = in_value.NumRows(), block_dim_out = linear_params_.NumRows(), block_dim_in = linear_params_.NumCols(); CuSubMatrix in_value_reshaped(in_value.Data(), num_rows * num_repeats, block_dim_in, block_dim_in), out_deriv_reshaped(out_deriv.Data(), num_rows * num_repeats, block_dim_out, block_dim_out); linear_params_.AddMatMat(learning_rate_, out_deriv_reshaped, kTrans, in_value_reshaped, kNoTrans, 1.0); bias_params_.AddRowSumMat(learning_rate_, out_deriv_reshaped); } void RepeatedAffineComponent::Read(std::istream &is, bool binary) { // This Read function also works for NaturalGradientRepeatedAffineComponent. ReadUpdatableCommon(is, binary); // read opening tag and learning rate. ExpectToken(is, binary, ""); ReadBasicType(is, binary, &num_repeats_); ExpectToken(is, binary, ""); linear_params_.Read(is, binary); ExpectToken(is, binary, ""); bias_params_.Read(is, binary); if (PeekToken(is, binary) == 'I') { // for back compatibility; we don't write this here any // more as it's written and read in Write/ReadUpdatableCommon ExpectToken(is, binary, ""); ReadBasicType(is, binary, &is_gradient_); } ExpectToken(is, binary, std::string("")); SetNaturalGradientConfigs(); } void RepeatedAffineComponent::Write(std::ostream &os, bool binary) const { // This Write function also works for NaturalGradientRepeatedAffineComponent. WriteUpdatableCommon(os, binary); // Write opening tag and learning rate WriteToken(os, binary, ""); WriteBasicType(os, binary, num_repeats_); WriteToken(os, binary, ""); linear_params_.Write(os, binary); WriteToken(os, binary, ""); bias_params_.Write(os, binary); // write closing token. WriteToken(os, binary, std::string("")); } int32 RepeatedAffineComponent::NumParameters() const { // Note: unlike AffineComponent, InputDim() & OutputDim() are not used here and below, // for they are multipled by num_repeats_. return linear_params_.NumCols() * linear_params_.NumRows() + bias_params_.Dim(); } void RepeatedAffineComponent::Vectorize(VectorBase *params) const { KALDI_ASSERT(params->Dim() == this->NumParameters()); params->Range(0, linear_params_.NumCols() * linear_params_.NumRows()).CopyRowsFromMat(linear_params_); params->Range(linear_params_.NumCols() * linear_params_.NumRows(), bias_params_.Dim()).CopyFromVec(bias_params_); } void RepeatedAffineComponent::UnVectorize(const VectorBase ¶ms) { KALDI_ASSERT(params.Dim() == this->NumParameters()); linear_params_.CopyRowsFromVec(params.Range(0, linear_params_.NumCols() * linear_params_.NumRows())); bias_params_.CopyFromVec(params.Range(linear_params_.NumCols() * linear_params_.NumRows(), bias_params_.Dim())); } void NaturalGradientRepeatedAffineComponent::SetNaturalGradientConfigs() { int32 rank_in = 40; int32 input_dim = linear_params_.NumCols(); if (rank_in > input_dim / 2) rank_in = input_dim / 2; if (rank_in < 1) rank_in = 1; preconditioner_in_.SetRank(rank_in); preconditioner_in_.SetUpdatePeriod(4); } NaturalGradientRepeatedAffineComponent::NaturalGradientRepeatedAffineComponent( const NaturalGradientRepeatedAffineComponent &other): RepeatedAffineComponent(other), preconditioner_in_(other.preconditioner_in_) { } // virtual Component* NaturalGradientRepeatedAffineComponent::Copy() const { return new NaturalGradientRepeatedAffineComponent(*this); } void NaturalGradientRepeatedAffineComponent::Update( const CuMatrixBase &in_value, const CuMatrixBase &out_deriv) { KALDI_ASSERT(out_deriv.NumCols() == out_deriv.Stride() && in_value.NumCols() == in_value.Stride() && in_value.NumRows() == out_deriv.NumRows()); int32 num_repeats = num_repeats_, num_rows = in_value.NumRows(), block_dim_out = linear_params_.NumRows(), block_dim_in = linear_params_.NumCols(); CuSubMatrix in_value_reshaped(in_value.Data(), num_rows * num_repeats, block_dim_in, block_dim_in), out_deriv_reshaped(out_deriv.Data(), num_rows * num_repeats, block_dim_out, block_dim_out); CuVector bias_deriv(block_dim_out); bias_deriv.AddRowSumMat(1.0, out_deriv_reshaped); CuMatrix deriv(block_dim_out, block_dim_in + 1); deriv.ColRange(0, block_dim_in).AddMatMat( 1.0, out_deriv_reshaped, kTrans, in_value_reshaped, kNoTrans, 1.0); deriv.CopyColFromVec(bias_deriv, block_dim_in); BaseFloat scale = 1.0; if (!is_gradient_) { try { // Only apply the preconditioning/natural-gradient if we're not computing // the exact gradient. preconditioner_in_.PreconditionDirections(&deriv, &scale); } catch (...) { int32 num_bad_rows = 0; for (int32 i = 0; i < out_deriv.NumRows(); i++) { BaseFloat f = out_deriv.Row(i).Sum(); if (!(f - f == 0)) num_bad_rows++; } KALDI_ERR << "Preonditioning failed, in_value sum is " << in_value.Sum() << ", out_deriv sum is " << out_deriv.Sum() << ", out_deriv has " << num_bad_rows << " bad rows."; } } linear_params_.AddMat(learning_rate_ * scale, deriv.ColRange(0, block_dim_in)); bias_deriv.CopyColFromMat(deriv, block_dim_in); bias_params_.AddVec(learning_rate_ * scale, bias_deriv); } void NaturalGradientRepeatedAffineComponent::ConsolidateMemory() { OnlineNaturalGradient temp(preconditioner_in_); preconditioner_in_.Swap(&temp); } BlockAffineComponent::BlockAffineComponent(const BlockAffineComponent &other) : UpdatableComponent(other), linear_params_(other.linear_params_), bias_params_(other.bias_params_), num_blocks_(other.num_blocks_) {} BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac) : UpdatableComponent(rac), linear_params_(rac.num_repeats_ * rac.linear_params_.NumRows(), rac.linear_params_.NumCols(), kUndefined), bias_params_(rac.num_repeats_ * rac.linear_params_.NumRows(), kUndefined), num_blocks_(rac.num_repeats_) { // copy rac's linear_params_ and bias_params_ to this. int32 num_rows_in_block = rac.linear_params_.NumRows(); for(int32 block_counter = 0; block_counter < num_blocks_; block_counter++) { int32 row_offset = block_counter * num_rows_in_block; CuSubMatrix block = this->linear_params_.RowRange(row_offset, num_rows_in_block); block.CopyFromMat(rac.linear_params_); CuSubVector block_bias = this->bias_params_.Range(row_offset, num_rows_in_block); block_bias.CopyFromVec(rac.bias_params_); } } Component* BlockAffineComponent::Copy() const { BlockAffineComponent *ans = new BlockAffineComponent(*this); return ans; } std::string BlockAffineComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info() << ", num-blocks=" << num_blocks_; PrintParameterStats(stream, "linear-params", linear_params_); PrintParameterStats(stream, "bias", bias_params_, true); return stream.str(); } void BlockAffineComponent::Init(int32 input_dim, int32 output_dim, int32 num_blocks, BaseFloat param_stddev, BaseFloat bias_mean, BaseFloat bias_stddev) { KALDI_ASSERT(input_dim > 0 && output_dim > 0 && num_blocks >= 1); KALDI_ASSERT(output_dim % num_blocks == 0 && input_dim % num_blocks == 0); const int32 num_columns_per_block = input_dim / num_blocks; linear_params_.Resize(output_dim, num_columns_per_block); bias_params_.Resize(output_dim); KALDI_ASSERT(param_stddev >= 0.0 && bias_stddev >= 0.0); linear_params_.SetRandn(); linear_params_.Scale(param_stddev); bias_params_.SetRandn(); bias_params_.Scale(bias_stddev); bias_params_.Add(bias_mean); num_blocks_ = num_blocks; } void BlockAffineComponent::InitFromConfig(ConfigLine *cfl) { int32 input_dim = -1, output_dim = -1, num_blocks = -1; if(!cfl->GetValue("input-dim", &input_dim) || !cfl->GetValue("output-dim", &output_dim) || !cfl->GetValue("num-blocks", &num_blocks)) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; InitLearningRatesFromConfig(cfl); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim / num_blocks), bias_mean = 0.0, bias_stddev = 1.0; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-stddev", &bias_stddev); cfl->GetValue("bias-mean", &bias_mean); if (cfl->HasUnusedValues()) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; Init(input_dim, output_dim, num_blocks, param_stddev, bias_mean, bias_stddev); } void* BlockAffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { out->CopyRowsFromVec(bias_params_); // block_dimension is both the number of columns, and the number of rows, // of a block. int32 num_rows_in_block = linear_params_.NumRows() / num_blocks_; int32 num_cols_in_block = linear_params_.NumCols(); std::vector *> in_batch, out_batch, linear_params_batch; for(int block_counter = 0; block_counter < num_blocks_; block_counter++) { CuSubMatrix *in_block = new CuSubMatrix(in.ColRange(block_counter * num_cols_in_block, num_cols_in_block)); in_batch.push_back(in_block); CuSubMatrix *out_block = new CuSubMatrix(out->ColRange(block_counter * num_rows_in_block, num_rows_in_block)); out_batch.push_back(out_block); CuSubMatrix *linear_params_block = new CuSubMatrix(linear_params_.RowRange(block_counter * num_rows_in_block, num_rows_in_block)); linear_params_batch.push_back(linear_params_block); } AddMatMatBatched(1.0, out_batch, in_batch, kNoTrans, linear_params_batch, kTrans, 1.0); DeletePointers(&in_batch); DeletePointers(&out_batch); DeletePointers(&linear_params_batch); return NULL; } void BlockAffineComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, const CuMatrixBase &, // out_value const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { BlockAffineComponent *to_update = dynamic_cast(to_update_in); const int32 num_rows_in_block = linear_params_.NumRows() / num_blocks_; const int32 num_cols_in_block = linear_params_.NumCols(); // Propagate the derivative back to the input. // add with coefficient 1.0 since property kBackpropAdds is true. // If we wanted to add with coefficient 0.0 we'd need to zero the // in_deriv, in case of infinities. if (in_deriv) { std::vector *> in_deriv_batch, out_deriv_batch, linear_params_batch; for(int block_counter = 0; block_counter < num_blocks_; block_counter++) { CuSubMatrix *in_deriv_block = new CuSubMatrix(in_deriv->ColRange(block_counter * num_cols_in_block, num_cols_in_block)); in_deriv_batch.push_back(in_deriv_block); CuSubMatrix *out_deriv_block = new CuSubMatrix(out_deriv.ColRange(block_counter * num_rows_in_block, num_rows_in_block)); out_deriv_batch.push_back(out_deriv_block); CuSubMatrix *linear_params_block = new CuSubMatrix(linear_params_.RowRange(block_counter * num_rows_in_block, num_rows_in_block)); linear_params_batch.push_back(linear_params_block); } AddMatMatBatched(1.0, in_deriv_batch, out_deriv_batch, kNoTrans, linear_params_batch, kNoTrans, 1.0); DeletePointers(&in_deriv_batch); DeletePointers(&out_deriv_batch); DeletePointers(&linear_params_batch); } if (to_update != NULL) { { // linear params update std::vector *> in_value_batch, out_deriv_batch, linear_params_batch; for (int block_counter = 0; block_counter < num_blocks_; block_counter++) { CuSubMatrix *in_value_block = new CuSubMatrix(in_value.ColRange(block_counter * num_cols_in_block, num_cols_in_block)); in_value_batch.push_back(in_value_block); CuSubMatrix *out_deriv_block = new CuSubMatrix(out_deriv.ColRange(block_counter * num_rows_in_block, num_rows_in_block)); out_deriv_batch.push_back(out_deriv_block); CuSubMatrix *linear_params_block = new CuSubMatrix(to_update->linear_params_.RowRange(block_counter * num_rows_in_block, num_rows_in_block)); linear_params_batch.push_back(linear_params_block); } AddMatMatBatched(to_update->learning_rate_, linear_params_batch, out_deriv_batch, kTrans, in_value_batch, kNoTrans, 1.0); DeletePointers(&in_value_batch); DeletePointers(&out_deriv_batch); DeletePointers(&linear_params_batch); } // end linear params update { // bias update to_update->bias_params_.AddRowSumMat(to_update->learning_rate_, out_deriv, 1.0); } // end bias update } } void BlockAffineComponent::Scale(BaseFloat scale) { if (scale == 0.0) { linear_params_.SetZero(); bias_params_.SetZero(); } else { linear_params_.Scale(scale); bias_params_.Scale(scale); } } void BlockAffineComponent::Add(BaseFloat alpha, const Component &other_in) { const BlockAffineComponent *other = dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); linear_params_.AddMat(alpha, other->linear_params_); bias_params_.AddVec(alpha, other->bias_params_); } void BlockAffineComponent::PerturbParams(BaseFloat stddev) { CuMatrix temp_linear_params(linear_params_); temp_linear_params.SetRandn(); linear_params_.AddMat(stddev, temp_linear_params); CuVector temp_bias_params(bias_params_); temp_bias_params.SetRandn(); bias_params_.AddVec(stddev, temp_bias_params); } BaseFloat BlockAffineComponent::DotProduct(const UpdatableComponent &other_in) const { const BlockAffineComponent *other = dynamic_cast(&other_in); return TraceMatMat(linear_params_, other->linear_params_, kTrans) + VecVec(bias_params_, other->bias_params_); } void BlockAffineComponent::Read(std::istream &is, bool binary) { ReadUpdatableCommon(is, binary); // read opening tag and learning rate. ExpectToken(is, binary, ""); ReadBasicType(is, binary, &num_blocks_); ExpectToken(is, binary, ""); linear_params_.Read(is, binary); ExpectToken(is, binary, ""); bias_params_.Read(is, binary); if (PeekToken(is, binary) == 'I') { // for back compatibility; we don't write this here any // more as it's written and read in Write/ReadUpdatableCommon ExpectToken(is, binary, ""); ReadBasicType(is, binary, &is_gradient_); } ExpectToken(is, binary, ""); } void BlockAffineComponent::Write(std::ostream &os, bool binary) const { WriteUpdatableCommon(os, binary); // Write opening tag and learning rate WriteToken(os, binary, ""); WriteBasicType(os, binary, num_blocks_); WriteToken(os, binary, ""); linear_params_.Write(os, binary); WriteToken(os, binary, ""); bias_params_.Write(os, binary); WriteToken(os, binary, ""); } int32 BlockAffineComponent::NumParameters() const { return linear_params_.NumCols() * linear_params_.NumRows() + bias_params_.Dim(); } void BlockAffineComponent::Vectorize(VectorBase *params) const { KALDI_ASSERT(params->Dim() == this->NumParameters()); int32 num_linear_params = linear_params_.NumCols() * linear_params_.NumRows(); int32 num_bias_params = bias_params_.Dim(); params->Range(0, num_linear_params).CopyRowsFromMat(linear_params_); params->Range(num_linear_params, num_bias_params).CopyFromVec(bias_params_); } void BlockAffineComponent::UnVectorize(const VectorBase ¶ms) { KALDI_ASSERT(params.Dim() == this->NumParameters()); int32 num_linear_params = linear_params_.NumCols() * linear_params_.NumRows(); int32 num_bias_params = bias_params_.Dim(); linear_params_.CopyRowsFromVec(params.Range(0, num_linear_params)); bias_params_.CopyFromVec(params.Range(num_linear_params, num_bias_params)); } void PerElementScaleComponent::Scale(BaseFloat scale) { if (scale == 0.0) { scales_.SetZero(); } else { scales_.Scale(scale); } } void PerElementScaleComponent::Add(BaseFloat alpha, const Component &other_in) { const PerElementScaleComponent *other = dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); scales_.AddVec(alpha, other->scales_); } PerElementScaleComponent::PerElementScaleComponent( const PerElementScaleComponent &component): UpdatableComponent(component), scales_(component.scales_) { } void PerElementScaleComponent::PerturbParams(BaseFloat stddev) { CuVector temp_scales(scales_.Dim(), kUndefined); temp_scales.SetRandn(); scales_.AddVec(stddev, temp_scales); } std::string PerElementScaleComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info() << ", scales-min=" << scales_.Min() << ", scales-max=" << scales_.Max(); PrintParameterStats(stream, "scales", scales_, true); return stream.str(); } Component* PerElementScaleComponent::Copy() const { return new PerElementScaleComponent(*this); } BaseFloat PerElementScaleComponent::DotProduct( const UpdatableComponent &other_in) const { const PerElementScaleComponent *other = dynamic_cast(&other_in); return VecVec(scales_, other->scales_); } void PerElementScaleComponent::Init(int32 dim, BaseFloat param_mean, BaseFloat param_stddev) { KALDI_ASSERT(dim > 0 && param_stddev >= 0.0); scales_.Resize(dim); scales_.SetRandn(); scales_.Scale(param_stddev); scales_.Add(param_mean); } void PerElementScaleComponent::Init(std::string vector_filename) { CuVector vec; ReadKaldiObject(vector_filename, &vec); // will abort on failure. scales_.Resize(vec.Dim()); scales_.CopyFromVec(vec); } void PerElementScaleComponent::InitFromConfig(ConfigLine *cfl) { std::string vector_filename; int32 dim = -1; InitLearningRatesFromConfig(cfl); if (cfl->GetValue("vector", &vector_filename)) { Init(vector_filename); if (cfl->GetValue("dim", &dim)) KALDI_ASSERT(dim == InputDim() && "input-dim mismatch vs. vector."); } else { if(!cfl->GetValue("dim", &dim)) KALDI_ERR << "'dim' not provided in the config line."; BaseFloat param_mean = 1.0, param_stddev = 0.0; cfl->GetValue("param-mean", ¶m_mean); cfl->GetValue("param-stddev", ¶m_stddev); Init(dim, param_mean, param_stddev); } if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); } void* PerElementScaleComponent::Propagate( const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { out->CopyFromMat(in); out->MulColsVec(scales_); return NULL; } void PerElementScaleComponent::UpdateSimple( const CuMatrixBase &in_value, const CuMatrixBase &out_deriv) { scales_.AddDiagMatMat(learning_rate_, out_deriv, kTrans, in_value, kNoTrans, 1.0); } void PerElementScaleComponent::Backprop( const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, const CuMatrixBase &, // out_value const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { PerElementScaleComponent *to_update = dynamic_cast(to_update_in); if (to_update != NULL) { // Next update the model (must do this 2nd so the derivatives we propagate // are accurate, in case this == to_update_in.) if (to_update->is_gradient_) to_update->UpdateSimple(in_value, out_deriv); else // the call below is to a virtual function that may be re-implemented to_update->Update(debug_info, in_value, out_deriv); // by child classes. } if (in_deriv) { // Propagate the derivative back to the input. if (in_deriv->Data() != out_deriv.Data()) in_deriv->CopyFromMat(out_deriv); in_deriv->MulColsVec(scales_); } } void PerElementScaleComponent::Read(std::istream &is, bool binary) { ReadUpdatableCommon(is, binary); // Read opening tag and learning rate. ExpectToken(is, binary, ""); scales_.Read(is, binary); if (PeekToken(is, binary) == 'I') { // for back compatibility; we don't write this here any // more as it's written and read in Write/ReadUpdatableCommon ExpectToken(is, binary, ""); ReadBasicType(is, binary, &is_gradient_); } ExpectToken(is, binary, ""); } void PerElementScaleComponent::Write(std::ostream &os, bool binary) const { WriteUpdatableCommon(os, binary); // Write opening tag and learning rate. WriteToken(os, binary, ""); scales_.Write(os, binary); WriteToken(os, binary, ""); } int32 PerElementScaleComponent::NumParameters() const { return InputDim(); } void PerElementScaleComponent::Vectorize(VectorBase *params) const { params->CopyFromVec(scales_); } void PerElementScaleComponent::UnVectorize( const VectorBase ¶ms) { scales_.CopyFromVec(params); } void PerElementOffsetComponent::Scale(BaseFloat scale) { if (scale == 0.0) { offsets_.SetZero(); } else { offsets_.Scale(scale); } } void PerElementOffsetComponent::Add(BaseFloat alpha, const Component &other_in) { const PerElementOffsetComponent *other = dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); offsets_.AddVec(alpha, other->offsets_); } PerElementOffsetComponent::PerElementOffsetComponent( const PerElementOffsetComponent &component): UpdatableComponent(component), offsets_(component.offsets_), dim_(component.dim_), use_natural_gradient_(component.use_natural_gradient_), preconditioner_(component.preconditioner_) { } void PerElementOffsetComponent::PerturbParams(BaseFloat stddev) { CuVector temp_offsets(offsets_.Dim(), kUndefined); temp_offsets.SetRandn(); offsets_.AddVec(stddev, temp_offsets); } std::string PerElementOffsetComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info() << ", offsets-min=" << offsets_.Min() << ", offsets-max=" << offsets_.Max() << ", block-dim=" << offsets_.Dim() << ", use-natural-gradient=" << (use_natural_gradient_ ? "true" : "false"); PrintParameterStats(stream, "offsets", offsets_, true); return stream.str(); } Component* PerElementOffsetComponent::Copy() const { return new PerElementOffsetComponent(*this); } BaseFloat PerElementOffsetComponent::DotProduct( const UpdatableComponent &other_in) const { const PerElementOffsetComponent *other = dynamic_cast(&other_in); return VecVec(offsets_, other->offsets_); } void PerElementOffsetComponent::InitFromConfig(ConfigLine *cfl) { std::string vector_filename; InitLearningRatesFromConfig(cfl); if (cfl->GetValue("vector", &vector_filename)) { ReadKaldiObject(vector_filename, &offsets_); dim_ = offsets_.Dim(); // if dim is not supplied, it defaults to this. cfl->GetValue("dim", &dim_); if (dim_ <= 0 || offsets_.Dim() % dim_ != 0) KALDI_ERR << "Invalid dimension dim=" << dim_; } else { if(!cfl->GetValue("dim", &dim_)) KALDI_ERR << "'dim' not provided in the config line."; if (dim_ <= 0) KALDI_ERR << "Invalid dimension dim=" << dim_; BaseFloat param_mean = 0.0, param_stddev = 0.0; cfl->GetValue("param-mean", ¶m_mean); cfl->GetValue("param-stddev", ¶m_stddev); int32 block_dim = dim_; cfl->GetValue("block-dim", &block_dim); if (block_dim <= 0 || dim_ % block_dim != 0) KALDI_ERR << "Invalid value block-dim=" << block_dim; offsets_.Resize(block_dim); offsets_.SetRandn(); offsets_.Scale(param_stddev); offsets_.Add(param_mean); } use_natural_gradient_ = true; cfl->GetValue("use-natural-gradient", &use_natural_gradient_); if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); // For now you can't modify these defaults of the natural gradient. // This code must be kept in sync with the code in Read(). preconditioner_.SetRank(20); preconditioner_.SetUpdatePeriod(4); } void* PerElementOffsetComponent::Propagate( const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { if (in.Data() != out->Data()) out->CopyFromMat(in); if (dim_ == offsets_.Dim()) { out->AddVecToRows(1.0, offsets_); } else { KALDI_ASSERT(out->Stride() == out->NumCols()); int32 block_dim = offsets_.Dim(), multiple = dim_ / block_dim, num_rows = out->NumRows() * multiple; CuSubMatrix out_rearranged(out->Data(), num_rows, block_dim, block_dim); out_rearranged.AddVecToRows(1.0, offsets_); } return NULL; } void PerElementOffsetComponent::Backprop( const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, // in_value const CuMatrixBase &, // out_value const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { PerElementOffsetComponent *to_update = dynamic_cast(to_update_in); if (in_deriv && in_deriv->Data() != out_deriv.Data()) { // Propagate the derivative back to the input. in_deriv->CopyFromMat(out_deriv); } if (to_update != NULL) { // we may have to reshape out_deriv, if "block-dim" was set // in the config file when initializing the object, leading // to dim_ being a multiple >1 of offset_.Dim(). // To avoid having separate code paths we create a sub-matrix // in any case, but this may just be a copy of out_deriv. int32 block_dim = offsets_.Dim(), multiple = dim_ / block_dim, block_stride = (multiple == 1 ? out_deriv.Stride() : block_dim), num_rows = out_deriv.NumRows() * multiple; KALDI_ASSERT(multiple == 1 || out_deriv.Stride() == out_deriv.NumCols()); CuSubMatrix out_deriv_reshaped(out_deriv.Data(), num_rows, block_dim, block_stride); if (!to_update->use_natural_gradient_ || to_update->is_gradient_) { KALDI_LOG << "Using non-NG update, lr = " << to_update->learning_rate_; to_update->offsets_.AddRowSumMat(to_update->learning_rate_, out_deriv_reshaped); } else { KALDI_LOG << "Using NG update, lr = " << to_update->learning_rate_; // make a copy as we don't want to modify the data of 'out_deriv', which // was const (even though CuSubMatrix does not respect const-ness in // this scenario) CuMatrix out_deriv_copy(out_deriv_reshaped); BaseFloat scale = 1.0; to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, &scale); to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } } } void PerElementOffsetComponent::Read(std::istream &is, bool binary) { ReadUpdatableCommon(is, binary); // Read opening tag and learning rate ExpectToken(is, binary, ""); offsets_.Read(is, binary); if (PeekToken(is, binary) == 'I') { // for back compatibility; we don't write this here any // more as it's written and read in Write/ReadUpdatableCommon ExpectToken(is, binary, ""); ReadBasicType(is, binary, &is_gradient_); } if (PeekToken(is, binary) != '/') { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &use_natural_gradient_); } else { dim_ = offsets_.Dim(); use_natural_gradient_ = true; } // For now you can't modify these defaults of the natural gradient. // This code must be kept in sync with the code in InitFromConfig(). preconditioner_.SetRank(20); preconditioner_.SetUpdatePeriod(4); ExpectToken(is, binary, ""); } void PerElementOffsetComponent::Write(std::ostream &os, bool binary) const { WriteUpdatableCommon(os, binary); // Write opening tag and learning rate WriteToken(os, binary, ""); offsets_.Write(os, binary); WriteToken(os, binary, ""); WriteBasicType(os, binary, dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, use_natural_gradient_); WriteToken(os, binary, ""); } int32 PerElementOffsetComponent::NumParameters() const { return offsets_.Dim(); } void PerElementOffsetComponent::Vectorize(VectorBase *params) const { params->CopyFromVec(offsets_); } void PerElementOffsetComponent::UnVectorize( const VectorBase ¶ms) { offsets_.CopyFromVec(params); } std::string ScaleAndOffsetComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info() << ", rank=" << scale_preconditioner_.GetRank(); if (dim_ != scales_.Dim()) stream << ", block-size=" << scales_.Dim(); PrintParameterStats(stream, "scales", scales_, true); PrintParameterStats(stream, "offsets", offsets_, true); return stream.str(); } void ScaleAndOffsetComponent::InitFromConfig(ConfigLine *cfl) { InitLearningRatesFromConfig(cfl); if (!cfl->GetValue("dim", &dim_) || dim_ <= 0) { KALDI_ERR << "Dimension 'dim' must be specified and >0: " << cfl->WholeLine(); } use_natural_gradient_ = true; cfl->GetValue("use-natural-gradient", &use_natural_gradient_); int32 block_dim = dim_, rank = 20; cfl->GetValue("block-dim", &block_dim); if (block_dim <= 0 || dim_ % block_dim != 0) { KALDI_ERR << "Invalid block-dim: " << cfl->WholeLine(); } cfl->GetValue("rank", &rank); scales_.Resize(block_dim); scales_.Set(1.0); offsets_.Resize(block_dim); // offsets are all zero when initialized. if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); offset_preconditioner_.SetRank(rank); scale_preconditioner_.SetRank(rank); // the update period can't be configured for now; we'll add an option if we // want to. offset_preconditioner_.SetUpdatePeriod(4); scale_preconditioner_.SetUpdatePeriod(4); } void ScaleAndOffsetComponent::Read(std::istream &is, bool binary) { ReadUpdatableCommon(is, binary); // Read opening tag and learning rate ExpectToken(is, binary, ""); ReadBasicType(is, binary, &dim_); ExpectToken(is, binary, ""); scales_.Read(is, binary); ExpectToken(is, binary, ""); offsets_.Read(is, binary); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &use_natural_gradient_); int32 rank; ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank); scale_preconditioner_.SetRank(rank); offset_preconditioner_.SetRank(rank); ExpectToken(is, binary, ""); } void ScaleAndOffsetComponent::Write(std::ostream &os, bool binary) const { WriteUpdatableCommon(os, binary); // Write opening tag and learning rate WriteToken(os, binary, ""); WriteBasicType(os, binary, dim_); WriteToken(os, binary, ""); scales_.Write(os, binary); WriteToken(os, binary, ""); offsets_.Write(os, binary); WriteToken(os, binary, ""); WriteBasicType(os, binary, use_natural_gradient_); WriteToken(os, binary, ""); WriteBasicType(os, binary, scale_preconditioner_.GetRank()); WriteToken(os, binary, ""); } void ScaleAndOffsetComponent::Scale(BaseFloat scale) { if (scale == 0.0) { scales_.SetZero(); offsets_.SetZero(); } else { scales_.Scale(scale); offsets_.Scale(scale); } } void ScaleAndOffsetComponent::Add(BaseFloat alpha, const Component &other_in) { const ScaleAndOffsetComponent *other = dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); scales_.AddVec(alpha, other->scales_); offsets_.AddVec(alpha, other->offsets_); } ScaleAndOffsetComponent::ScaleAndOffsetComponent( const ScaleAndOffsetComponent &component): UpdatableComponent(component), dim_(component.dim_), scales_(component.scales_), offsets_(component.offsets_), use_natural_gradient_(component.use_natural_gradient_), scale_preconditioner_(component.scale_preconditioner_), offset_preconditioner_(component.offset_preconditioner_) { } void ScaleAndOffsetComponent::PerturbParams(BaseFloat stddev) { CuVector temp(scales_.Dim(), kUndefined); temp.SetRandn(); scales_.AddVec(stddev, temp); temp.SetRandn(); offsets_.AddVec(stddev, temp); } BaseFloat ScaleAndOffsetComponent::DotProduct( const UpdatableComponent &other_in) const { const ScaleAndOffsetComponent *other = dynamic_cast(&other_in); return VecVec(other->scales_, scales_) + VecVec(other->offsets_, offsets_); } void ScaleAndOffsetComponent::Vectorize(VectorBase *params) const { int32 dim = scales_.Dim(); params->Range(0, dim).CopyFromVec(scales_); params->Range(dim, dim).CopyFromVec(offsets_); } void ScaleAndOffsetComponent::UnVectorize( const VectorBase ¶ms) { int32 dim = scales_.Dim(); scales_.CopyFromVec(params.Range(0, dim)); offsets_.CopyFromVec(params.Range(dim, dim)); } void* ScaleAndOffsetComponent::Propagate( const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { if (dim_ == scales_.Dim()) { PropagateInternal(in, out); } else { int32 multiple = dim_ / scales_.Dim(), num_rows = in.NumRows(), block_dim = scales_.Dim(); KALDI_ASSERT(in.NumCols() == in.Stride() && SameDimAndStride(in, *out)); // Reinterpret the data as matrices with more rows but fewer columns. CuSubMatrix in_rearranged(in.Data(), num_rows * multiple, block_dim, block_dim), out_rearranged(out->Data(), num_rows * multiple, block_dim, block_dim); PropagateInternal(in_rearranged, &out_rearranged); } return NULL; } void ScaleAndOffsetComponent::PropagateInternal( const CuMatrixBase &in, CuMatrixBase *out) const { if (out->Data() != in.Data()) out->CopyFromMat(in); BaseFloat epsilon = Epsilon(); int32 dim = scales_.Dim(); CuVector scales_nonzero(dim, kUndefined); cu::EnsureNonzero(scales_, epsilon, &scales_nonzero); out->MulColsVec(scales_nonzero); out->AddVecToRows(1.0, offsets_); } void ScaleAndOffsetComponent::Backprop( const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, // in_value const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { ScaleAndOffsetComponent *to_update = dynamic_cast(to_update_in); KALDI_ASSERT(SameDim(out_value, out_deriv)); if (dim_ == scales_.Dim()) { BackpropInternal(debug_info, out_value, out_deriv, to_update, in_deriv); } else { KALDI_ASSERT(out_value.NumCols() == out_value.Stride() && SameDimAndStride(out_value, out_deriv) && (!in_deriv || SameDimAndStride(out_value, *in_deriv))); int32 multiple = dim_ / scales_.Dim(), num_rows = out_value.NumRows(), block_dim = scales_.Dim(); CuSubMatrix out_value_rearranged(out_value.Data(), num_rows * multiple, block_dim, block_dim), out_deriv_rearranged(out_deriv.Data(), num_rows * multiple, block_dim, block_dim); if (in_deriv) { CuSubMatrix in_deriv_rearranged(in_deriv->Data(), num_rows * multiple, block_dim, block_dim); BackpropInternal(debug_info, out_value_rearranged, out_deriv_rearranged, to_update, &in_deriv_rearranged); } else { BackpropInternal(debug_info, out_value_rearranged, out_deriv_rearranged, to_update, NULL); } } } // Internal version of backprop, where the num-cols of the // argument matrices are equal to scales_.Dim(). void ScaleAndOffsetComponent::BackpropInternal( const std::string &debug_info, const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, ScaleAndOffsetComponent *to_update, CuMatrixBase *in_deriv) const { if (to_update) { if (!to_update->use_natural_gradient_ || to_update->is_gradient_) { to_update->offsets_.AddRowSumMat(to_update->learning_rate_, out_deriv); } else { BaseFloat scale = 1.0; CuMatrix out_deriv_copy(out_deriv); to_update->offset_preconditioner_.PreconditionDirections( &out_deriv_copy, &scale); to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } // The backprop actually needs the input to the component, not the output; // but we make the output available because in the common topologies that // will already be required for backprop-- it's for memory efficiency. CuMatrix in_value_reconstructed(out_value); int32 dim = scales_.Dim(); CuVector scales_nonzero(dim, kUndefined); BaseFloat epsilon = Epsilon(); cu::EnsureNonzero(scales_, epsilon, &scales_nonzero); scales_nonzero.InvertElements(); in_value_reconstructed.AddVecToRows(-1.0, offsets_); // Actually scales_nonzero are now the inverses of the scales. in_value_reconstructed.MulColsVec(scales_nonzero); // OK, at this point in_value_reconstructed is the input to the component. // Multiply its elements by 'out_deriv' to get the derivatives // (for each frame) w.r.t. the scales. in_value_reconstructed.MulElements(out_deriv); BaseFloat scale = 1.0; if (to_update->use_natural_gradient_ && !to_update->is_gradient_) { to_update->scale_preconditioner_.PreconditionDirections( &in_value_reconstructed, &scale); } to_update->scales_.AddRowSumMat(scale * to_update->learning_rate_, in_value_reconstructed); } if (in_deriv) { if (in_deriv->Data() != out_deriv.Data()) in_deriv->CopyFromMat(out_deriv); in_deriv->MulColsVec(scales_); } } void ScaleAndOffsetComponent::ConsolidateMemory() { OnlineNaturalGradient temp_scale(scale_preconditioner_); scale_preconditioner_.Swap(&temp_scale); OnlineNaturalGradient temp_offset(offset_preconditioner_); offset_preconditioner_.Swap(&temp_offset); } std::string ConstantFunctionComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info() << ", " << Type() << ", input-dim=" << InputDim() << ", output-dim=" << OutputDim() << ", is-updatable=" << std::boolalpha << is_updatable_ << ", use-natural-gradient=" << std::boolalpha << use_natural_gradient_; PrintParameterStats(stream, "output", output_, true); return stream.str(); } ConstantFunctionComponent::ConstantFunctionComponent(): UpdatableComponent(), input_dim_(-1), is_updatable_(true), use_natural_gradient_(true) { } ConstantFunctionComponent::ConstantFunctionComponent( const ConstantFunctionComponent &other): UpdatableComponent(other), input_dim_(other.input_dim_), output_(other.output_), is_updatable_(other.is_updatable_), use_natural_gradient_(other.use_natural_gradient_), preconditioner_(other.preconditioner_) { } void* ConstantFunctionComponent::Propagate( const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { out->CopyRowsFromVec(output_); return NULL; } void ConstantFunctionComponent::Backprop( const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, // in_value const CuMatrixBase &, // out_value const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { // we don't update in_deriv, since we set the flag // kBackpropAdds, and the output doesn't depend on the // input, so the input-derivative is zero. if (to_update_in) { ConstantFunctionComponent *to_update = dynamic_cast(to_update_in); if (to_update->is_updatable_) { // only do the update if the is_updatable_ flag is set. KALDI_ASSERT(to_update && to_update->is_updatable_); if (to_update->use_natural_gradient_ && !to_update->is_gradient_) { CuMatrix out_deriv_copy(out_deriv); BaseFloat scale = 1.0; to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, &scale); to_update->output_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } else { to_update->output_.AddRowSumMat(to_update->learning_rate_, out_deriv); } } } } void ConstantFunctionComponent::Read(std::istream &is, bool binary) { std::string token; ReadToken(is, binary, &token); if (token == "") { ReadToken(is, binary, &token); } if (token == "") { ReadBasicType(is, binary, &learning_rate_factor_); ReadToken(is, binary, &token); } else { learning_rate_factor_ = 1.0; } if (token == "") { ReadBasicType(is, binary, &is_gradient_); ReadToken(is, binary, &token); } else { is_gradient_ = false; } if (token == "") { ReadBasicType(is, binary, &learning_rate_); ReadToken(is, binary, &token); } else { learning_rate_ = 0.001; } if (token == "") { ReadBasicType(is, binary, &input_dim_); } else { KALDI_ERR << "Expected token , got " << token; } ExpectToken(is, binary, ""); output_.Read(is, binary); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &is_updatable_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &use_natural_gradient_); ExpectToken(is, binary, ""); } void ConstantFunctionComponent::Write(std::ostream &os, bool binary) const { WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate WriteToken(os, binary, ""); WriteBasicType(os, binary, input_dim_); WriteToken(os, binary, ""); output_.Write(os, binary); WriteToken(os, binary, ""); WriteBasicType(os, binary, is_updatable_); WriteToken(os, binary, ""); WriteBasicType(os, binary, use_natural_gradient_); WriteToken(os, binary, ""); } Component* ConstantFunctionComponent::Copy() const { return new ConstantFunctionComponent(*this); } void ConstantFunctionComponent::Scale(BaseFloat scale) { if (is_updatable_) { if (scale == 0.0) { output_.SetZero(); } else { output_.Scale(scale); } } } void ConstantFunctionComponent::Add(BaseFloat alpha, const Component &other_in) { if (is_updatable_) { const ConstantFunctionComponent *other = dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); output_.AddVec(alpha, other->output_); } } void ConstantFunctionComponent::PerturbParams(BaseFloat stddev) { CuVector temp_output(output_.Dim(), kUndefined); temp_output.SetRandn(); output_.AddVec(stddev, temp_output); } BaseFloat ConstantFunctionComponent::DotProduct( const UpdatableComponent &other_in) const { KALDI_ASSERT(is_updatable_); const ConstantFunctionComponent *other = dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); return VecVec(output_, other->output_); } void ConstantFunctionComponent::InitFromConfig(ConfigLine *cfl) { int32 output_dim = 0; InitLearningRatesFromConfig(cfl); bool ok = cfl->GetValue("output-dim", &output_dim) && cfl->GetValue("input-dim", &input_dim_); cfl->GetValue("is-updatable", &is_updatable_); cfl->GetValue("use-natural-gradient", &use_natural_gradient_); BaseFloat output_mean = 0.0, output_stddev = 0.0; cfl->GetValue("output-mean", &output_mean); cfl->GetValue("output-stddev", &output_stddev); if (!ok || cfl->HasUnusedValues() || input_dim_ <= 0 || output_dim <= 0) { KALDI_ERR << "Bad initializer " << cfl->WholeLine(); } Vector output(output_dim); output.SetRandn(); output.Scale(output_stddev); output.Add(output_mean); output_ = output; } int32 ConstantFunctionComponent::NumParameters() const { KALDI_ASSERT(is_updatable_); return output_.Dim(); } void ConstantFunctionComponent::Vectorize(VectorBase *params) const { params->CopyFromVec(output_); } void ConstantFunctionComponent::UnVectorize(const VectorBase ¶ms) { output_.CopyFromVec(params); } void ConstantFunctionComponent::ConsolidateMemory() { OnlineNaturalGradient temp(preconditioner_); preconditioner_.Swap(&temp); } void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { ReadUpdatableCommon(is, binary); // Read the opening tag and learning rate ExpectToken(is, binary, ""); linear_params_.Read(is, binary); ExpectToken(is, binary, ""); bias_params_.Read(is, binary); BaseFloat num_samples_history, alpha; int32 rank_in, rank_out, update_period; ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank_in); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank_out); if (PeekToken(is, binary) == 'O') { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &orthonormal_constraint_); } else { orthonormal_constraint_ = 0.0; } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &update_period); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &num_samples_history); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &alpha); preconditioner_in_.SetNumSamplesHistory(num_samples_history); preconditioner_out_.SetNumSamplesHistory(num_samples_history); preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); preconditioner_in_.SetRank(rank_in); preconditioner_out_.SetRank(rank_out); preconditioner_in_.SetUpdatePeriod(update_period); preconditioner_out_.SetUpdatePeriod(update_period); if (PeekToken(is, binary) == 'M') { // MaxChangePerSample, long ago removed; back compatibility. ExpectToken(is, binary, ""); BaseFloat temp; ReadBasicType(is, binary, &temp); } if (PeekToken(is, binary) == 'I') { // for back compatibility; we don't write this here any // more as it's written and read in Write/ReadUpdatableCommon ExpectToken(is, binary, ""); ReadBasicType(is, binary, &is_gradient_); } if (PeekToken(is, binary) == 'U') { ExpectToken(is, binary, ""); // back-compatibility branch (these configs were added and then removed). double temp; ReadBasicType(is, binary, &temp); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &temp); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &temp); } std::string token; ReadToken(is, binary, &token); // the following has to handle a couple variants of if (token.find("NaturalGradientAffineComponent>") == std::string::npos) KALDI_ERR << "Expected or " << ", got " << token; } NaturalGradientAffineComponent::NaturalGradientAffineComponent( const CuMatrixBase &linear_params, const CuVectorBase &bias_params): AffineComponent(linear_params, bias_params, 0.001) { KALDI_ASSERT(bias_params.Dim() == linear_params.NumRows() && bias_params.Dim() != 0); // set some default natural gradient configs. preconditioner_in_.SetRank(20); preconditioner_out_.SetRank(80); preconditioner_in_.SetUpdatePeriod(4); preconditioner_out_.SetUpdatePeriod(4); } void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { bool ok = true; std::string matrix_filename; is_gradient_ = false; // not configurable; there's no reason you'd want this InitLearningRatesFromConfig(cfl); if (cfl->GetValue("matrix", &matrix_filename)) { CuMatrix mat; ReadKaldiObject(matrix_filename, &mat); // will abort on failure. KALDI_ASSERT(mat.NumCols() >= 2); int32 input_dim = mat.NumCols() - 1, output_dim = mat.NumRows(); linear_params_.Resize(output_dim, input_dim); bias_params_.Resize(output_dim); linear_params_.CopyFromMat(mat.Range(0, output_dim, 0, input_dim)); bias_params_.CopyColFromMat(mat, input_dim); if (cfl->GetValue("input-dim", &input_dim)) KALDI_ASSERT(input_dim == InputDim() && "input-dim mismatch vs. matrix."); if (cfl->GetValue("output-dim", &output_dim)) KALDI_ASSERT(output_dim == OutputDim() && "output-dim mismatch vs. matrix."); } else { int32 input_dim = -1, output_dim = -1; ok = ok && cfl->GetValue("input-dim", &input_dim); ok = ok && cfl->GetValue("output-dim", &output_dim); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim), bias_stddev = 1.0, bias_mean = 0.0; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-stddev", &bias_stddev); cfl->GetValue("bias-mean", &bias_mean); linear_params_.Resize(output_dim, input_dim); bias_params_.Resize(output_dim); KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0 && bias_stddev >= 0.0); linear_params_.SetRandn(); // sets to random normally distributed noise. linear_params_.Scale(param_stddev); bias_params_.SetRandn(); bias_params_.Scale(bias_stddev); bias_params_.Add(bias_mean); } orthonormal_constraint_ = 0.0; cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); // Set natural-gradient configs. BaseFloat num_samples_history = 2000.0, alpha = 4.0; int32 rank_in = -1, rank_out = -1, update_period = 4; cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); cfl->GetValue("rank-in", &rank_in); cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); if (rank_in < 0) rank_in = std::min(20, (InputDim() + 1) / 2); if (rank_out < 0) rank_out = std::min(80, (OutputDim() + 1) / 2); preconditioner_in_.SetNumSamplesHistory(num_samples_history); preconditioner_out_.SetNumSamplesHistory(num_samples_history); preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); preconditioner_in_.SetRank(rank_in); preconditioner_out_.SetRank(rank_out); preconditioner_in_.SetUpdatePeriod(update_period); preconditioner_out_.SetUpdatePeriod(update_period); if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); } void NaturalGradientAffineComponent::Write(std::ostream &os, bool binary) const { WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate WriteToken(os, binary, ""); linear_params_.Write(os, binary); WriteToken(os, binary, ""); bias_params_.Write(os, binary); WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_in_.GetRank()); WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_out_.GetRank()); if (orthonormal_constraint_ != 0.0) { WriteToken(os, binary, ""); WriteBasicType(os, binary, orthonormal_constraint_); } WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_in_.GetUpdatePeriod()); WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_in_.GetNumSamplesHistory()); WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_in_.GetAlpha()); WriteToken(os, binary, ""); } std::string NaturalGradientAffineComponent::Info() const { std::ostringstream stream; stream << AffineComponent::Info(); stream << ", rank-in=" << preconditioner_in_.GetRank() << ", rank-out=" << preconditioner_out_.GetRank() << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() << ", update-period=" << preconditioner_in_.GetUpdatePeriod() << ", alpha=" << preconditioner_in_.GetAlpha(); return stream.str(); } Component* NaturalGradientAffineComponent::Copy() const { return new NaturalGradientAffineComponent(*this); } NaturalGradientAffineComponent::NaturalGradientAffineComponent( const NaturalGradientAffineComponent &other): AffineComponent(other), preconditioner_in_(other.preconditioner_in_), preconditioner_out_(other.preconditioner_out_) { } void NaturalGradientAffineComponent::Update( const std::string &debug_info, const CuMatrixBase &in_value, const CuMatrixBase &out_deriv) { CuMatrix in_value_temp; in_value_temp.Resize(in_value.NumRows(), in_value.NumCols() + 1, kUndefined); in_value_temp.Range(0, in_value.NumRows(), 0, in_value.NumCols()).CopyFromMat(in_value); // Add the 1.0 at the end of each row "in_value_temp" in_value_temp.Range(0, in_value.NumRows(), in_value.NumCols(), 1).Set(1.0); CuMatrix out_deriv_temp(out_deriv); // These "scale" values get will get multiplied into the learning rate (faster // than having the matrices scaled inside the preconditioning code). BaseFloat in_scale, out_scale; preconditioner_in_.PreconditionDirections(&in_value_temp, &in_scale); preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_scale); // "scale" is a scaling factor coming from the PreconditionDirections calls // (it's faster to have them output a scaling factor than to have them scale // their outputs). BaseFloat scale = in_scale * out_scale; CuSubMatrix in_value_precon_part(in_value_temp, 0, in_value_temp.NumRows(), 0, in_value_temp.NumCols() - 1); // this "precon_ones" is what happens to the vector of 1's representing // offsets, after multiplication by the preconditioner. CuVector precon_ones(in_value_temp.NumRows()); precon_ones.CopyColFromMat(in_value_temp, in_value_temp.NumCols() - 1); BaseFloat local_lrate = scale * learning_rate_; bias_params_.AddMatVec(local_lrate, out_deriv_temp, kTrans, precon_ones, 1.0); linear_params_.AddMatMat(local_lrate, out_deriv_temp, kTrans, in_value_precon_part, kNoTrans, 1.0); } void NaturalGradientAffineComponent::Scale(BaseFloat scale) { if (scale == 0.0) { linear_params_.SetZero(); bias_params_.SetZero(); } else { linear_params_.Scale(scale); bias_params_.Scale(scale); } } void NaturalGradientAffineComponent::Add(BaseFloat alpha, const Component &other_in) { const NaturalGradientAffineComponent *other = dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); linear_params_.AddMat(alpha, other->linear_params_); bias_params_.AddVec(alpha, other->bias_params_); } void NaturalGradientAffineComponent::FreezeNaturalGradient(bool freeze) { preconditioner_in_.Freeze(freeze); preconditioner_out_.Freeze(freeze); } void NaturalGradientAffineComponent::ConsolidateMemory() { OnlineNaturalGradient temp_in(preconditioner_in_); preconditioner_in_.Swap(&temp_in); OnlineNaturalGradient temp_out(preconditioner_out_); preconditioner_out_.Swap(&temp_out); } void LinearComponent::Read(std::istream &is, bool binary) { std::string token = ReadUpdatableCommon(is, binary); KALDI_ASSERT(token == ""); ExpectToken(is, binary, ""); params_.Read(is, binary); if (PeekToken(is, binary) == 'O') { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &orthonormal_constraint_); } else { orthonormal_constraint_ = 0.0; } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &use_natural_gradient_); // Read various natural-gradient-related configs. int32 rank_in, rank_out, update_period; BaseFloat alpha, num_samples_history; ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank_in); ReadBasicType(is, binary, &rank_out); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &alpha); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &num_samples_history); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &update_period); preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); preconditioner_in_.SetRank(rank_in); preconditioner_out_.SetRank(rank_out); preconditioner_in_.SetNumSamplesHistory(num_samples_history); preconditioner_out_.SetNumSamplesHistory(num_samples_history); preconditioner_in_.SetUpdatePeriod(update_period); preconditioner_out_.SetUpdatePeriod(update_period); ExpectToken(is, binary, ""); } void LinearComponent::InitFromConfig(ConfigLine *cfl) { bool ok = true; std::string matrix_filename; is_gradient_ = false; // not configurable; there's no reason you'd want this InitLearningRatesFromConfig(cfl); int32 input_dim = -1, output_dim = -1; if (cfl->GetValue("matrix", &matrix_filename)) { ReadKaldiObject(matrix_filename, ¶ms_); // will abort on failure. KALDI_ASSERT(params_.NumRows() != 0); if (cfl->GetValue("input-dim", &input_dim)) KALDI_ASSERT(input_dim == InputDim() && "input-dim mismatch vs. matrix."); if (cfl->GetValue("output-dim", &output_dim)) KALDI_ASSERT(output_dim == OutputDim() && "output-dim mismatch vs. matrix."); } else { ok = ok && cfl->GetValue("input-dim", &input_dim); ok = ok && cfl->GetValue("output-dim", &output_dim); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); BaseFloat param_stddev = 1.0 / std::sqrt(input_dim); cfl->GetValue("param-stddev", ¶m_stddev); params_.Resize(output_dim, input_dim); KALDI_ASSERT(output_dim > 0 && input_dim > 0 && param_stddev >= 0.0); params_.SetRandn(); // sets to random normally distributed noise. params_.Scale(param_stddev); } // Read various natural-gradient-related configs. int32 rank_in = -1, rank_out = -1, update_period = 4; BaseFloat alpha = 4.0, num_samples_history = 2000.0; use_natural_gradient_ = true; cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); cfl->GetValue("rank-in", &rank_in); cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); cfl->GetValue("use-natural-gradient", &use_natural_gradient_); if (rank_in < 0) rank_in = std::min(20, (InputDim() + 1) / 2); if (rank_out < 0) rank_out = std::min(80, (OutputDim() + 1) / 2); preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); preconditioner_in_.SetRank(rank_in); preconditioner_out_.SetRank(rank_out); preconditioner_in_.SetNumSamplesHistory(num_samples_history); preconditioner_out_.SetNumSamplesHistory(num_samples_history); preconditioner_in_.SetUpdatePeriod(update_period); preconditioner_out_.SetUpdatePeriod(update_period); orthonormal_constraint_ = 0.0; cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); } void LinearComponent::Write(std::ostream &os, bool binary) const { WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate WriteToken(os, binary, ""); params_.Write(os, binary); if (orthonormal_constraint_ != 0.0) { WriteToken(os, binary, ""); WriteBasicType(os, binary, orthonormal_constraint_); } WriteToken(os, binary, ""); WriteBasicType(os, binary, use_natural_gradient_); int32 rank_in = preconditioner_in_.GetRank(), rank_out = preconditioner_out_.GetRank(), update_period = preconditioner_in_.GetUpdatePeriod(); BaseFloat alpha = preconditioner_in_.GetAlpha(), num_samples_history = preconditioner_in_.GetNumSamplesHistory(); WriteToken(os, binary, ""); WriteBasicType(os, binary, rank_in); WriteBasicType(os, binary, rank_out); WriteToken(os, binary, ""); WriteBasicType(os, binary, alpha); WriteToken(os, binary, ""); WriteBasicType(os, binary, num_samples_history); WriteToken(os, binary, ""); WriteBasicType(os, binary, update_period); WriteToken(os, binary, ""); } std::string LinearComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info(); PrintParameterStats(stream, "params", params_, false, // include_mean true, // include_row_norms true, // include_column_norms GetVerboseLevel() >= 2); // include_singular_values if (orthonormal_constraint_ != 0.0) stream << ", orthonormal-constraint=" << orthonormal_constraint_; stream << ", use-natural-gradient=" << (use_natural_gradient_ ? "true" : "false") << ", rank-in=" << preconditioner_in_.GetRank() << ", rank-out=" << preconditioner_out_.GetRank() << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() << ", update-period=" << preconditioner_in_.GetUpdatePeriod() << ", alpha=" << preconditioner_in_.GetAlpha(); return stream.str(); } void* LinearComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { out->AddMatMat(1.0, in, kNoTrans, params_, kTrans, 1.0); return NULL; } void LinearComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, const CuMatrixBase &, // out_value const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { LinearComponent *to_update = dynamic_cast(to_update_in); // Propagate the derivative back to the input. add with coefficient 1.0 since // property kBackpropAdds is true. If we wanted to add with coefficient 0.0 // we'd need to zero the in_deriv, in case of infinities. if (in_deriv) in_deriv->AddMatMat(1.0, out_deriv, kNoTrans, params_, kNoTrans, 1.0); if (to_update != NULL) { if (!to_update->is_gradient_) { CuMatrix in_value_temp(in_value), out_deriv_temp(out_deriv); // These "scale" values get will get multiplied into the learning rate (faster // than having the matrices scaled inside the preconditioning code). BaseFloat in_scale, out_scale; to_update->preconditioner_in_.PreconditionDirections(&in_value_temp, &in_scale); to_update->preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_scale); BaseFloat local_lrate = in_scale * out_scale * to_update->learning_rate_; to_update->params_.AddMatMat(local_lrate, out_deriv_temp, kTrans, in_value_temp, kNoTrans, 1.0); } else { to_update->params_.AddMatMat(to_update->learning_rate_, out_deriv, kTrans, in_value, kNoTrans, 1.0); } } } Component* LinearComponent::Copy() const { return new LinearComponent(*this); } LinearComponent::LinearComponent( const LinearComponent &other): UpdatableComponent(other), params_(other.params_), orthonormal_constraint_(other.orthonormal_constraint_), use_natural_gradient_(other.use_natural_gradient_), preconditioner_in_(other.preconditioner_in_), preconditioner_out_(other.preconditioner_out_) { } LinearComponent::LinearComponent(const CuMatrix ¶ms): params_(params), orthonormal_constraint_(0.0), use_natural_gradient_(true) { // Set defaults for natural gradient. preconditioner_in_.SetRank(40); preconditioner_out_.SetRank(80); preconditioner_in_.SetUpdatePeriod(4); preconditioner_out_.SetUpdatePeriod(4); // the component-level defaults of alpha and num_samples_history, at 4.0 and // 2000.0, are the same as in the NaturalGradientOnline code, so there is no // need to set those here. } void LinearComponent::Scale(BaseFloat scale) { if (scale == 0.0) params_.SetZero(); else params_.Scale(scale); } void LinearComponent::Add(BaseFloat alpha, const Component &other_in) { const LinearComponent *other = dynamic_cast(&other_in); KALDI_ASSERT(other != NULL); params_.AddMat(alpha, other->params_); } void LinearComponent::PerturbParams(BaseFloat stddev) { CuMatrix temp_params(params_); temp_params.SetRandn(); params_.AddMat(stddev, temp_params); } int32 LinearComponent::NumParameters() const { return params_.NumRows() * params_.NumCols(); } void LinearComponent::Vectorize(VectorBase *params) const { KALDI_ASSERT(params->Dim() == this->NumParameters()); params->CopyRowsFromMat(params_); } void LinearComponent::UnVectorize(const VectorBase ¶ms) { KALDI_ASSERT(params.Dim() == this->NumParameters()); params_.CopyRowsFromVec(params); } BaseFloat LinearComponent::DotProduct(const UpdatableComponent &other_in) const { const LinearComponent *other = dynamic_cast(&other_in); return TraceMatMat(params_, other->params_, kTrans); } void LinearComponent::FreezeNaturalGradient(bool freeze) { preconditioner_in_.Freeze(freeze); preconditioner_out_.Freeze(freeze); } void LinearComponent::ConsolidateMemory() { OnlineNaturalGradient temp_in(preconditioner_in_); preconditioner_in_.Swap(&temp_in); OnlineNaturalGradient temp_out(preconditioner_out_); preconditioner_out_.Swap(&temp_out); } std::string FixedAffineComponent::Info() const { std::ostringstream stream; stream << Component::Info(); PrintParameterStats(stream, "linear-params", linear_params_); PrintParameterStats(stream, "bias", bias_params_, true); return stream.str(); } void FixedAffineComponent::Init(const CuMatrixBase &mat) { KALDI_ASSERT(mat.NumCols() > 1); linear_params_ = mat.Range(0, mat.NumRows(), 0, mat.NumCols() - 1); bias_params_.Resize(mat.NumRows()); bias_params_.CopyColFromMat(mat, mat.NumCols() - 1); } void FixedAffineComponent::InitFromConfig(ConfigLine *cfl) { std::string filename; // Two forms allowed: "matrix=", or "input-dim=x output-dim=y" // (for testing purposes only). if (cfl->GetValue("matrix", &filename)) { if (cfl->HasUnusedValues()) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; bool binary; Input ki(filename, &binary); CuMatrix mat; mat.Read(ki.Stream(), binary); KALDI_ASSERT(mat.NumRows() != 0); Init(mat); } else { int32 input_dim = -1, output_dim = -1; if (!cfl->GetValue("input-dim", &input_dim) || !cfl->GetValue("output-dim", &output_dim) || cfl->HasUnusedValues()) { KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; } CuMatrix mat(output_dim, input_dim + 1); mat.SetRandn(); Init(mat); } } FixedAffineComponent::FixedAffineComponent(const AffineComponent &c): linear_params_(c.LinearParams()), bias_params_(c.BiasParams()) { } void* FixedAffineComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { out->CopyRowsFromVec(bias_params_); // Adds the bias term first. out->AddMatMat(1.0, in, kNoTrans, linear_params_, kTrans, 1.0); return NULL; } void FixedAffineComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, //in_value const CuMatrixBase &, //out_value const CuMatrixBase &out_deriv, void *memo, Component *, //to_update CuMatrixBase *in_deriv) const { // kBackpropAdds is true. It's the user's responsibility to zero out // if they need it to be so. if (in_deriv) in_deriv->AddMatMat(1.0, out_deriv, kNoTrans, linear_params_, kNoTrans, 1.0); } Component* FixedAffineComponent::Copy() const { FixedAffineComponent *ans = new FixedAffineComponent(); ans->linear_params_ = linear_params_; ans->bias_params_ = bias_params_; return ans; } void FixedAffineComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); linear_params_.Write(os, binary); WriteToken(os, binary, ""); bias_params_.Write(os, binary); WriteToken(os, binary, ""); } void FixedAffineComponent::Read(std::istream &is, bool binary) { ExpectOneOrTwoTokens(is, binary, "", ""); linear_params_.Read(is, binary); ExpectToken(is, binary, ""); bias_params_.Read(is, binary); ExpectToken(is, binary, ""); } void SumGroupComponent::Init(const std::vector &sizes) { KALDI_ASSERT(!sizes.empty()); std::vector cpu_vec(sizes.size()); std::vector reverse_cpu_vec; int32 cur_index = 0; for (size_t i = 0; i < sizes.size(); i++) { KALDI_ASSERT(sizes[i] > 0); cpu_vec[i].first = cur_index; cpu_vec[i].second = cur_index + sizes[i]; cur_index += sizes[i]; for (int32 j = cpu_vec[i].first; j < cpu_vec[i].second; j++) reverse_cpu_vec.push_back(i); } this->indexes_ = cpu_vec; this->reverse_indexes_ = reverse_cpu_vec; this->input_dim_ = cur_index; this->output_dim_ = sizes.size(); } void SumGroupComponent::Init(int32 input_dim, int32 output_dim) { const int32 num_groups = output_dim; KALDI_ASSERT(input_dim % num_groups == 0); const int32 group_size = input_dim / num_groups; std::vector cpu_vec(num_groups); std::vector reverse_cpu_vec; int32 cur_index = 0; for (size_t i = 0; i < num_groups; i++) { cpu_vec[i].first = cur_index; cpu_vec[i].second = cur_index + group_size; cur_index += group_size; for (int32 j = cpu_vec[i].first; j < cpu_vec[i].second; j++) reverse_cpu_vec.push_back(i); } this->indexes_ = cpu_vec; this->reverse_indexes_ = reverse_cpu_vec; this->input_dim_ = input_dim; this->output_dim_ = num_groups; } void SumGroupComponent::InitFromConfig(ConfigLine *cfl) { std::vector sizes; bool has_sizes = cfl->GetValue("sizes", &sizes); if (has_sizes) { if (cfl->HasUnusedValues() || sizes.empty()) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; this->Init(sizes); } else { // each group has the same size int32 input_dim = -1, output_dim = -1; if (!cfl->GetValue("input-dim", &input_dim) || !cfl->GetValue("output-dim", &output_dim) || cfl->HasUnusedValues()) { KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; } Init(input_dim, output_dim); } } Component* SumGroupComponent::Copy() const { SumGroupComponent *ans = new SumGroupComponent(); ans->indexes_ = indexes_; ans->reverse_indexes_ = reverse_indexes_; ans->input_dim_ = input_dim_; ans->output_dim_ = output_dim_; return ans; } void SumGroupComponent::Read(std::istream &is, bool binary) { ExpectOneOrTwoTokens(is, binary, "", ""); std::vector sizes; ReadIntegerVector(is, binary, &sizes); std::string token; ReadToken(is, binary, &token); if (!(token == "" || token == "")) { KALDI_ERR << "Expected , got " << token; } this->Init(sizes); } void SumGroupComponent::GetSizes(std::vector *sizes) const { std::vector indexes; indexes_.CopyToVec(&indexes); sizes->resize(indexes.size()); for (size_t i = 0; i < indexes.size(); i++) { (*sizes)[i] = indexes[i].second - indexes[i].first; if (i == 0) { KALDI_ASSERT(indexes[i].first == 0); } else { KALDI_ASSERT(indexes[i].first == indexes[i-1].second); } KALDI_ASSERT(indexes[i].second > indexes[i].first); (*sizes)[i] = indexes[i].second - indexes[i].first; } } void SumGroupComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); std::vector sizes; this->GetSizes(&sizes); WriteIntegerVector(os, binary, sizes); WriteToken(os, binary, ""); } void* SumGroupComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { out->SumColumnRanges(in, indexes_); return NULL; } void SumGroupComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, // in_value, const CuMatrixBase &, // out_value const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { in_deriv->CopyCols(out_deriv, reverse_indexes_); } void* SoftmaxComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { // Apply softmax function to each row of the output... // for that row, we do // x_i = exp(x_i) / sum_j exp(x_j). out->SoftMaxPerRow(in); // This floor on the output helps us deal with // almost-zeros in a way that doesn't lead to overflow. out->ApplyFloor(1.0e-20); return NULL; } void SoftmaxComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, // in_value, const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { if (to_update_in) { SoftmaxComponent *to_update = dynamic_cast(to_update_in); to_update->StoreBackpropStats(out_deriv); } if (in_deriv == NULL) return; /* Note on the derivative of the softmax function: let it be p_i = exp(x_i) / sum_i exp_i The [matrix-valued] Jacobian of this function is diag(p) - p p^T Let the derivative vector at the output be e, and at the input be d. We have d = diag(p) e - p (p^T e). d_i = p_i e_i - p_i (p^T e). */ in_deriv->DiffSoftmaxPerRow(out_value, out_deriv); } void SoftmaxComponent::StoreStats(const CuMatrixBase &in_value, const CuMatrixBase &out_value, void *memo) { // We don't store derivative stats for this component type, just activation // stats. StoreStatsInternal(out_value, NULL); } void* LogSoftmaxComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { // Applies log softmax function to each row of the output. For each row, we do // x_i = x_i - log(sum_j exp(x_j)) out->LogSoftMaxPerRow(in); return NULL; } void LogSoftmaxComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, // in_value const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { if (to_update_in) { LogSoftmaxComponent *to_update = dynamic_cast(to_update_in); to_update->StoreBackpropStats(out_deriv); } if (in_deriv == NULL) return; in_deriv->DiffLogSoftmaxPerRow(out_value, out_deriv); } void FixedScaleComponent::Init(const CuVectorBase &scales) { KALDI_ASSERT(scales.Dim() != 0); scales_ = scales; } void FixedScaleComponent::InitFromConfig(ConfigLine *cfl) { std::string filename; // Accepts "scales" config (for filename) or "dim" -> random init, for testing. if (cfl->GetValue("scales", &filename)) { if (cfl->HasUnusedValues()) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; CuVector vec; ReadKaldiObject(filename, &vec); Init(vec); } else { int32 dim; BaseFloat scale = 1.0; bool scale_is_set = cfl->GetValue("scale", &scale); if (!cfl->GetValue("dim", &dim) || cfl->HasUnusedValues()) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; KALDI_ASSERT(dim > 0); CuVector vec(dim); if (scale_is_set) { vec.Set(scale); } else { vec.SetRandn(); } Init(vec); } } std::string FixedScaleComponent::Info() const { std::ostringstream stream; stream << Component::Info(); PrintParameterStats(stream, "scales", scales_, true); return stream.str(); } void* FixedScaleComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { out->CopyFromMat(in); // does nothing if same matrix. out->MulColsVec(scales_); return NULL; } void FixedScaleComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, // in_value const CuMatrixBase &, // out_value const CuMatrixBase &out_deriv, void *memo, Component *, // to_update CuMatrixBase *in_deriv) const { in_deriv->CopyFromMat(out_deriv); // does nothing if same memory. in_deriv->MulColsVec(scales_); } Component* FixedScaleComponent::Copy() const { FixedScaleComponent *ans = new FixedScaleComponent(); ans->scales_ = scales_; return ans; } void FixedScaleComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); scales_.Write(os, binary); WriteToken(os, binary, ""); } void FixedScaleComponent::Read(std::istream &is, bool binary) { ExpectOneOrTwoTokens(is, binary, "", ""); scales_.Read(is, binary); ExpectToken(is, binary, ""); } void FixedBiasComponent::Init(const CuVectorBase &bias) { KALDI_ASSERT(bias.Dim() != 0); bias_ = bias; } void FixedBiasComponent::InitFromConfig(ConfigLine *cfl) { std::string filename; // Accepts "bias" config (for filename) or "dim" -> random init, for testing. if (cfl->GetValue("bias", &filename)) { if (cfl->HasUnusedValues()) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; CuVector vec; ReadKaldiObject(filename, &vec); Init(vec); } else { int32 dim; if (!cfl->GetValue("dim", &dim) || cfl->HasUnusedValues()) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; KALDI_ASSERT(dim > 0); CuVector vec(dim); vec.SetRandn(); Init(vec); } } std::string FixedBiasComponent::Info() const { std::ostringstream stream; stream << Component::Info(); PrintParameterStats(stream, "bias", bias_, true); return stream.str(); } void* FixedBiasComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { out->CopyFromMat(in); // will do nothing if in and out have same memory. out->AddVecToRows(1.0, bias_, 1.0); return NULL; } void FixedBiasComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, // in_value const CuMatrixBase &, // out_value const CuMatrixBase &out_deriv, void *memo, Component *, // to_update CuMatrixBase *in_deriv) const { // the following statement will do nothing if in_deriv and out_deriv have same // memory. in_deriv->CopyFromMat(out_deriv); } Component* FixedBiasComponent::Copy() const { FixedBiasComponent *ans = new FixedBiasComponent(); ans->bias_ = bias_; return ans; } void FixedBiasComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); bias_.Write(os, binary); WriteToken(os, binary, ""); } void FixedBiasComponent::Read(std::istream &is, bool binary) { ExpectOneOrTwoTokens(is, binary, "", ""); bias_.Read(is, binary); ExpectToken(is, binary, ""); } void NaturalGradientPerElementScaleComponent::Read( std::istream &is, bool binary) { ReadUpdatableCommon(is, binary); // Read the opening tag and learning rate ExpectToken(is, binary, ""); scales_.Read(is, binary); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &is_gradient_); int32 rank, update_period; ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank); preconditioner_.SetRank(rank); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &update_period); preconditioner_.SetUpdatePeriod(update_period); BaseFloat num_samples_history, alpha; ExpectToken(is, binary, ""); ReadBasicType(is, binary, &num_samples_history); preconditioner_.SetNumSamplesHistory(num_samples_history); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &alpha); preconditioner_.SetAlpha(alpha); std::string token; ReadToken(is, binary, &token); if (token == "") { // back compatibility; this was removed, it's now handled by the // 'max-change' config variable. BaseFloat temp; ReadBasicType(is, binary, &temp); ReadToken(is, binary, &token); } KALDI_ASSERT(token == ""); } void NaturalGradientPerElementScaleComponent::Write(std::ostream &os, bool binary) const { WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate WriteToken(os, binary, ""); scales_.Write(os, binary); WriteToken(os, binary, ""); WriteBasicType(os, binary, is_gradient_); WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_.GetRank()); WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_.GetUpdatePeriod()); WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_.GetNumSamplesHistory()); WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_.GetAlpha()); WriteToken(os, binary, ""); } std::string NaturalGradientPerElementScaleComponent::Info() const { std::ostringstream stream; stream << PerElementScaleComponent::Info() << ", rank=" << preconditioner_.GetRank() << ", update-period=" << preconditioner_.GetUpdatePeriod() << ", num-samples-history=" << preconditioner_.GetNumSamplesHistory() << ", alpha=" << preconditioner_.GetAlpha(); return stream.str(); } void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) { // First set various configuration values that have defaults. int32 rank = 8, // Use a small rank because in this case the amount of memory // for the preconditioner actually exceeds the memory for the // parameters (by "rank"). update_period = 10; BaseFloat num_samples_history = 2000.0, alpha = 4.0; cfl->GetValue("rank", &rank); cfl->GetValue("update-period", &update_period); cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); InitLearningRatesFromConfig(cfl); std::string filename; // Accepts "scales" config (for filename) or "dim" -> random init, for testing. if (cfl->GetValue("scales", &filename)) { if (cfl->HasUnusedValues()) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; Init(filename, rank, update_period, num_samples_history, alpha); } else { BaseFloat param_mean = 1.0, param_stddev = 0.0; cfl->GetValue("param-mean", ¶m_mean); cfl->GetValue("param-stddev", ¶m_stddev); int32 dim; if (!cfl->GetValue("dim", &dim) || cfl->HasUnusedValues()) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; KALDI_ASSERT(dim > 0); Init(dim, param_mean, param_stddev, rank, update_period, num_samples_history, alpha); } } void NaturalGradientPerElementScaleComponent::Init( int32 dim, BaseFloat param_mean, BaseFloat param_stddev, int32 rank, int32 update_period, BaseFloat num_samples_history, BaseFloat alpha) { PerElementScaleComponent::Init(dim, param_mean, param_stddev); preconditioner_.SetRank(rank); preconditioner_.SetUpdatePeriod(update_period); preconditioner_.SetNumSamplesHistory(num_samples_history); preconditioner_.SetAlpha(alpha); } void NaturalGradientPerElementScaleComponent::Init( std::string vector_filename, int32 rank, int32 update_period, BaseFloat num_samples_history, BaseFloat alpha) { PerElementScaleComponent::Init(vector_filename); preconditioner_.SetRank(rank); preconditioner_.SetUpdatePeriod(update_period); preconditioner_.SetNumSamplesHistory(num_samples_history); preconditioner_.SetAlpha(alpha); } NaturalGradientPerElementScaleComponent::NaturalGradientPerElementScaleComponent( const NaturalGradientPerElementScaleComponent &other): PerElementScaleComponent(other), preconditioner_(other.preconditioner_) { } Component* NaturalGradientPerElementScaleComponent::Copy() const { return new NaturalGradientPerElementScaleComponent(*this); } void NaturalGradientPerElementScaleComponent::Update( const std::string &debug_info, const CuMatrixBase &in_value, const CuMatrixBase &out_deriv) { CuMatrix derivs_per_frame(in_value); derivs_per_frame.MulElements(out_deriv); // the non-natural-gradient update would just do // scales_.AddRowSumMat(learning_rate_, derivs_per_frame). BaseFloat scale; preconditioner_.PreconditionDirections(&derivs_per_frame, &scale); CuVector delta_scales(scales_.Dim()); delta_scales.AddRowSumMat(scale * learning_rate_, derivs_per_frame); scales_.AddVec(1.0, delta_scales); } void NaturalGradientPerElementScaleComponent::FreezeNaturalGradient(bool freeze) { preconditioner_.Freeze(freeze); } void NaturalGradientPerElementScaleComponent::ConsolidateMemory() { OnlineNaturalGradient temp(preconditioner_); preconditioner_.Swap(&temp); } void PermuteComponent::ComputeReverseColumnMap() { int32 dim = column_map_.Dim(); KALDI_ASSERT(dim > 0); std::vector reverse_column_map_cpu(dim, -1), column_map_cpu(dim); column_map_.CopyToVec(&column_map_cpu); for (int32 i = 0; i < dim; i++) { int32 &dest = reverse_column_map_cpu[column_map_cpu[i]]; if (dest != -1) KALDI_ERR << "Column map does not represent a permutation."; dest = i; } reverse_column_map_.Resize(dim); reverse_column_map_.CopyFromVec(reverse_column_map_cpu); } Component* PermuteComponent::Copy() const { PermuteComponent *ans = new PermuteComponent(); ans->column_map_ = column_map_; ans->reverse_column_map_ = reverse_column_map_; return ans; } void* PermuteComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { out->CopyCols(in, column_map_); return NULL; } void PermuteComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, //in_value const CuMatrixBase &, // out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update, CuMatrixBase *in_deriv) const { in_deriv->CopyCols(out_deriv, reverse_column_map_); } void PermuteComponent::InitFromConfig(ConfigLine *cfl) { bool ok = true; std::string column_map_str; ok = ok && cfl->GetValue("column-map", &column_map_str); std::vector column_map; if (!SplitStringToIntegers(column_map_str, ",", true, &column_map)) KALDI_ERR << "Bad initializer in PermuteComponent: column-map=" << column_map_str; if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Invalid initializer for layer of type " << Type() << ": \"" << cfl->WholeLine() << "\""; Init(column_map); } void PermuteComponent::Init(const std::vector &column_map) { KALDI_ASSERT(column_map.size() > 0); column_map_.CopyFromVec(column_map); ComputeReverseColumnMap(); } void PermuteComponent::Read(std::istream &is, bool binary) { ExpectOneOrTwoTokens(is, binary, "", ""); std::vector column_map; if (binary && is.peek() == 'F') { // back-compatibility code [temporary] Vector float_map; float_map.Read(is, binary); column_map.resize(float_map.Dim()); for (int32 i = 0; i < float_map.Dim(); i++) { // note: casting truncates toward zero: add 0.5 to approximate rounding. column_map[i] = static_cast(float_map(i) + 0.5); } // the next line is a workaround for a bug in the old // writing code, which now causes an assert failure. it's only // valid for the permutations we're currently using. anyway all this // code is only temporary. column_map.back() = float_map.Dim() - 1; } else { ReadIntegerVector(is, binary, &column_map); } column_map_.CopyFromVec(column_map); ExpectToken(is, binary, ""); ComputeReverseColumnMap(); } void PermuteComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); std::ostringstream buffer; std::vector column_map; column_map_.CopyToVec(&column_map); WriteIntegerVector(os, binary, column_map); WriteToken(os, binary, ""); } std::string PermuteComponent::Info() const { std::ostringstream stream; stream << Type() << ", dim=" << column_map_.Dim(); stream << " , column-map=[ "; std::vector column_map(column_map_.Dim()); column_map_.CopyToVec(&column_map); int32 max_size = 5; for (size_t i = 0; i < column_map.size() && i < max_size; i++) stream << column_map[i] << ' '; if (static_cast(column_map.size()) > max_size) stream << "... "; stream << "]"; return stream.str(); } bool CompositeComponent::IsUpdatable() const { for (std::vector::const_iterator iter = components_.begin(), end = components_.end(); iter != end; ++iter) if (((*iter)->Properties() & kUpdatableComponent) != 0) return true; return false; } // virtual int32 CompositeComponent::InputDim() const { KALDI_ASSERT(!components_.empty()); return components_.front()->InputDim(); } // virtual int32 CompositeComponent::OutputDim() const { KALDI_ASSERT(!components_.empty()); return components_.back()->OutputDim(); } // virtual int32 CompositeComponent::Properties() const { KALDI_ASSERT(!components_.empty()); int32 last_component_properties = components_.back()->Properties(), first_component_properties = components_.front()->Properties(); // We always assume backprop needs the input, as this would be necessary to // get the activations at intermediate layers, if these were not needed in // backprop, there would be no reason to use a CompositeComponent. int32 ans = kSimpleComponent | kBackpropNeedsInput | (last_component_properties & (kPropagateAdds|kBackpropNeedsOutput|kOutputContiguous)) | (first_component_properties & (kBackpropAdds|kInputContiguous)) | (IsUpdatable() ? kUpdatableComponent : 0); // note, we don't return the kStoresStats property because that function is // not implemented; instead, for efficiency, we call StoreStats() on any // sub-components as part of the backprop phase. if (last_component_properties & kStoresStats) ans |= kBackpropNeedsOutput; return ans; } MatrixStrideType CompositeComponent::GetStrideType(int32 i) const { int32 num_components = components_.size(); if ((components_[i]->Properties() & kOutputContiguous) || (i + 1 < num_components && (components_[i + 1]->Properties() & kInputContiguous))) return kStrideEqualNumCols; else return kDefaultStride; } // virtual void* CompositeComponent::Propagate( const ComponentPrecomputedIndexes *, // indexes const CuMatrixBase &in, CuMatrixBase *out) const { KALDI_ASSERT(in.NumRows() == out->NumRows() && in.NumCols() == InputDim() && out->NumCols() == OutputDim()); int32 num_rows = in.NumRows(), num_components = components_.size(); if (max_rows_process_ > 0 && num_rows > max_rows_process_) { // recurse and process smaller parts of the data, to save memory. for (int32 row_offset = 0; row_offset < num_rows; row_offset += max_rows_process_) { int32 this_num_rows = std::min(max_rows_process_, num_rows - row_offset); const CuSubMatrix in_part(in, row_offset, this_num_rows, 0, in.NumCols()); CuSubMatrix out_part(*out, row_offset, this_num_rows, 0, out->NumCols()); this->Propagate(NULL, in_part, &out_part); } return NULL; } std::vector > intermediate_outputs(num_components - 1); for (int32 i = 0; i < num_components; i++) { if (i + 1 < num_components) { MatrixResizeType resize_type = ((components_[i]->Properties() & kPropagateAdds) ? kSetZero : kUndefined); intermediate_outputs[i].Resize(num_rows, components_[i]->OutputDim(), resize_type, GetStrideType(i)); } const CuMatrixBase &this_in = (i == 0 ? in : intermediate_outputs[i-1]); CuMatrixBase *this_out = (i + 1 == num_components ? out : &(intermediate_outputs[i])); void *memo = components_[i]->Propagate(NULL, this_in, this_out); // we'll re-do the forward propagation in the backprop, and we can // regenerate any memos there, so no need to keep them. if (memo != NULL) components_[i]->DeleteMemo(memo); if (i > 0) intermediate_outputs[i-1].Resize(0, 0); } return NULL; } void CompositeComponent::Init(const std::vector &components, int32 max_rows_process) { DeletePointers(&components_); // clean up. components_ = components; KALDI_ASSERT(!components.empty()); max_rows_process_ = max_rows_process; for (size_t i = 0; i < components_.size(); i++) { // make sure all constituent components are simple. KALDI_ASSERT(components_[i]->Properties() & kSimpleComponent); if (i > 0) { // make sure all the internal dimensions match up. KALDI_ASSERT(components_[i]->InputDim() == components_[i-1]->OutputDim()); } } } // virtual void CompositeComponent::Read(std::istream &is, bool binary) { // Because we didn't previously write out the learning rate, // we need some temporary code. int32 max_rows_process; if (false) { ReadUpdatableCommon(is, binary); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &max_rows_process); } else { // temporary code. std::string token; ReadToken(is, binary, &token); if (token == "") { // if the first token is the opening tag, then // ignore it and get the next tag. ReadToken(is, binary, &token); } if (token == "") { ReadBasicType(is, binary, &learning_rate_factor_); ReadToken(is, binary, &token); } else { learning_rate_factor_ = 1.0; } if (token == "") { ReadBasicType(is, binary, &is_gradient_); ReadToken(is, binary, &token); } else { is_gradient_ = false; } if (token == "") { ReadBasicType(is, binary, &learning_rate_); ReadToken(is, binary, &token); } if (token != "") { KALDI_ERR << "Expected token , got " << token; } ReadBasicType(is, binary, &max_rows_process); } ExpectToken(is, binary, ""); int32 num_components; ReadBasicType(is, binary, &num_components); // Read dimension. if (num_components < 0 || num_components > 100000) KALDI_ERR << "Bad num-components"; std::vector components(num_components); for (int32 i = 0; i < num_components; i++) components[i] = ReadNew(is, binary); Init(components, max_rows_process); ExpectToken(is, binary, ""); } // virtual void CompositeComponent::ZeroStats() { // we call ZeroStats() on all components without checking their flags; this // will do nothing if the component doesn't store stats. (components like // ReLU and sigmoid and tanh store stats on activations). for (size_t i = 0; i < components_.size(); i++) components_[i]->ZeroStats(); } // virtual void CompositeComponent::Write(std::ostream &os, bool binary) const { WriteUpdatableCommon(os, binary); // Write opening tag and learning rate. WriteToken(os, binary, ""); WriteBasicType(os, binary, max_rows_process_); WriteToken(os, binary, ""); int32 num_components = components_.size(); WriteBasicType(os, binary, num_components); for (int32 i = 0; i < num_components; i++) components_[i]->Write(os, binary); WriteToken(os, binary, ""); } // virtual void CompositeComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update, CuMatrixBase *in_deriv) const { KALDI_ASSERT(in_value.NumRows() == out_deriv.NumRows() && in_value.NumCols() == InputDim() && out_deriv.NumCols() == OutputDim()); int32 num_rows = in_value.NumRows(), num_components = components_.size(); if (max_rows_process_ > 0 && num_rows > max_rows_process_) { KALDI_ASSERT(max_rows_process_ > 0); // recurse and process smaller parts of the data, to save memory. for (int32 row_offset = 0; row_offset < num_rows; row_offset += max_rows_process_) { bool have_output_value = (out_value.NumRows() != 0); int32 this_num_rows = std::min(max_rows_process_, num_rows - row_offset); // out_value_part will only be used if out_value is nonempty; otherwise we // make it a submatrix of 'out_deriv' to avoid errors in the constructor. const CuSubMatrix out_value_part(have_output_value ? out_value : out_deriv, row_offset, this_num_rows, 0, out_deriv.NumCols()); // in_deriv_value_part will only be used if in_deriv != NULL; otherwise we // make it a submatrix of 'in_value' to avoid errors in the constructor. CuSubMatrix in_deriv_part(in_deriv != NULL ? *in_deriv : in_value, row_offset, this_num_rows, 0, in_value.NumCols()); CuSubMatrix in_value_part(in_value, row_offset, this_num_rows, 0, in_value.NumCols()); const CuSubMatrix out_deriv_part(out_deriv, row_offset, this_num_rows, 0, out_deriv.NumCols()); CuMatrix empty_mat; this->Backprop(debug_info, NULL, in_value_part, (have_output_value ? static_cast&>(out_value_part) : static_cast&>(empty_mat)), out_deriv_part, NULL, to_update, in_deriv != NULL ? &in_deriv_part : NULL); } return; } // For now, assume all intermediate values and derivatives need to be // computed. in_value and out_deriv will always be supplied. // intermediate_outputs[i] contains the output of component i. std::vector > intermediate_outputs(num_components); // intermediate_derivs[i] contains the deriative at the output of component i. std::vector > intermediate_derivs(num_components - 1); KALDI_ASSERT(memo == NULL); // note: only a very few components use memos, but we need to support them. std::vector memos(num_components, NULL); int32 num_components_to_propagate = num_components; if (!(components_[num_components - 1]->Properties() & kUsesMemo)) { // we only need to propagate the very last component if it uses a memo. num_components_to_propagate--; if (num_components > 1) { // skip the last-but-one component's propagate if the last component's // backprop doesn't need the input and the last-but-one component's // backprop doesn't need the output. This is the lowest hanging fruit for // optimization; other propagates might also be skippable. int32 properties = components_[num_components - 2]->Properties(), next_properties = components_[num_components - 1]->Properties(); if (!(properties & (kBackpropNeedsOutput | kUsesMemo)) && !(next_properties & kBackpropNeedsInput)) { num_components_to_propagate--; } } } // Do the propagation again. for (int32 i = 0; i < num_components_to_propagate; i++) { MatrixResizeType resize_type = ((components_[i]->Properties() & kPropagateAdds) ? kSetZero : kUndefined); intermediate_outputs[i].Resize(num_rows, components_[i]->OutputDim(), resize_type, GetStrideType(i)); memos[i] = components_[i]->Propagate(NULL, (i == 0 ? in_value : intermediate_outputs[i-1]), &(intermediate_outputs[i])); } for (int32 i = num_components - 1; i >= 0; i--) { const CuMatrixBase &this_in_value = (i == 0 ? in_value : intermediate_outputs[i-1]), &this_out_value = (i == num_components - 1 ? out_value : intermediate_outputs[i]); Component *component_to_update = (to_update == NULL ? NULL : dynamic_cast(to_update)->components_[i]); if (component_to_update != NULL && components_[i]->Properties() & kStoresStats) component_to_update->StoreStats(this_in_value, this_out_value, memos[i]); if (i > 0) { MatrixResizeType resize_type = ((components_[i]->Properties() & kBackpropAdds) ? kSetZero : kUndefined); intermediate_derivs[i-1].Resize(num_rows, components_[i]->InputDim(), resize_type, GetStrideType(i - 1)); } // skip the first component's backprop if it's not updatable and in_deriv is // not requested. Again, this is the lowest-hanging fruit to optimize. if (!(i == 0 && !(components_[0]->Properties() & kUpdatableComponent) && in_deriv == NULL)) { components_[i]->Backprop(debug_info, NULL, this_in_value, this_out_value, (i + 1 == num_components ? out_deriv : intermediate_derivs[i]), memos[i], component_to_update, (i == 0 ? in_deriv : &(intermediate_derivs[i-1]))); } if (memos[i] != NULL) components_[i]->DeleteMemo(memos[i]); } } // virtual std::string CompositeComponent::Info() const { std::ostringstream stream; stream << Type() << " "; for (size_t i = 0; i < components_.size(); i++) { if (i > 0) stream << ", "; stream << "sub-component" << (i+1) << " = { " << components_[i]->Info() << " }"; } return stream.str(); } // virtual void CompositeComponent::Scale(BaseFloat scale) { for (size_t i = 0; i < components_.size(); i++) components_[i]->Scale(scale); } // virtual void CompositeComponent::Add(BaseFloat alpha, const Component &other_in) { const CompositeComponent *other = dynamic_cast( &other_in); KALDI_ASSERT(other != NULL && other->components_.size() == components_.size() && "Mismatching nnet topologies"); for (size_t i = 0; i < components_.size(); i++) components_[i]->Add(alpha, *(other->components_[i])); } // virtual void CompositeComponent::PerturbParams(BaseFloat stddev) { KALDI_ASSERT(this->IsUpdatable()); // or should not be called. for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = dynamic_cast(components_[i]); uc->PerturbParams(stddev); } } } void CompositeComponent::SetUnderlyingLearningRate(BaseFloat lrate) { KALDI_ASSERT(this->IsUpdatable()); // or should not be called. UpdatableComponent::SetUnderlyingLearningRate(lrate); // apply any learning-rate-factor that's set at this level (ill-advised, but // we'll do it.) BaseFloat effective_lrate = LearningRate(); for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = dynamic_cast(components_[i]); uc->SetUnderlyingLearningRate(effective_lrate); } } } void CompositeComponent::SetActualLearningRate(BaseFloat lrate) { KALDI_ASSERT(this->IsUpdatable()); // or should not be called. UpdatableComponent::SetActualLearningRate(lrate); for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = dynamic_cast(components_[i]); uc->SetActualLearningRate(lrate); } } } // virtual void CompositeComponent::SetAsGradient() { KALDI_ASSERT(this->IsUpdatable()); // or should not be called. UpdatableComponent::SetAsGradient(); for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = dynamic_cast(components_[i]); uc->SetAsGradient(); } } } // virtual int32 CompositeComponent::NumParameters() const { KALDI_ASSERT(this->IsUpdatable()); // or should not be called. int32 ans = 0; for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = dynamic_cast(components_[i]); ans += uc->NumParameters(); } } return ans; } // virtual void CompositeComponent::Vectorize(VectorBase *params) const { int32 cur_offset = 0; KALDI_ASSERT(this->IsUpdatable()); // or should not be called. for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = dynamic_cast(components_[i]); int32 this_size = uc->NumParameters(); SubVector params_range(*params, cur_offset, this_size); uc->Vectorize(¶ms_range); cur_offset += this_size; } } KALDI_ASSERT(cur_offset == params->Dim()); } // virtual void CompositeComponent::UnVectorize(const VectorBase ¶ms) { int32 cur_offset = 0; KALDI_ASSERT(this->IsUpdatable()); // or should not be called. for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = dynamic_cast(components_[i]); int32 this_size = uc->NumParameters(); SubVector params_range(params, cur_offset, this_size); uc->UnVectorize(params_range); cur_offset += this_size; } } KALDI_ASSERT(cur_offset == params.Dim()); } // virtual BaseFloat CompositeComponent::DotProduct( const UpdatableComponent &other_in) const { const CompositeComponent *other = dynamic_cast( &other_in); KALDI_ASSERT(other != NULL && other->components_.size() == components_.size() && "Mismatching nnet topologies"); BaseFloat ans = 0.0; for (size_t i = 0.0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = dynamic_cast(components_[i]); const UpdatableComponent *uc_other = dynamic_cast(other->components_[i]); KALDI_ASSERT(uc != NULL && uc_other != NULL); ans += uc->DotProduct(*uc_other); } } return ans; } /// virtual void CompositeComponent::FreezeNaturalGradient(bool freeze) { for (size_t i = 0; i < components_.size(); i++) { if (components_[i]->Properties() & kUpdatableComponent) { UpdatableComponent *uc = dynamic_cast(components_[i]); KALDI_ASSERT(uc != NULL); uc->FreezeNaturalGradient(freeze); } } } // virtual Component* CompositeComponent::Copy() const { std::vector components(components_.size()); for (size_t i = 0; i < components_.size(); i++) components[i] = components_[i]->Copy(); CompositeComponent *ans = new CompositeComponent(); ans->Init(components, max_rows_process_); return ans; } // virtual void CompositeComponent::InitFromConfig(ConfigLine *cfl) { int32 max_rows_process = 4096, num_components = -1; cfl->GetValue("max-rows-process", &max_rows_process); if (!cfl->GetValue("num-components", &num_components) || num_components < 1) KALDI_ERR << "Expected num-components to be defined in " << "CompositeComponent config line '" << cfl->WholeLine() << "'"; std::vector components; for (int32 i = 1; i <= num_components; i++) { std::ostringstream name_stream; name_stream << "component" << i; std::string component_config; if (!cfl->GetValue(name_stream.str(), &component_config)) { DeletePointers(&components); KALDI_ERR << "Expected '" << name_stream.str() << "' to be defined in " << "CompositeComponent config line '" << cfl->WholeLine() << "'"; } ConfigLine nested_line; // note: the nested line may not contain comments. std::string component_type; Component *this_component = NULL; if (!nested_line.ParseLine(component_config) || !nested_line.GetValue("type", &component_type) || !(this_component = NewComponentOfType(component_type)) || nested_line.FirstToken() != "") { DeletePointers(&components); KALDI_ERR << "Could not parse config line for '" << name_stream.str() << "(or undefined or bad component type [type=xxx]), in " << "CompositeComponent config line '" << cfl->WholeLine() << "'"; } if(this_component->Type() == "CompositeComponent") { DeletePointers(&components); delete this_component; // This is not allowed. If memory is too much with just one // CompositeComponent, try decreasing max-rows-process instead. KALDI_ERR << "Found CompositeComponent nested within CompositeComponent." << "Nested line: '" << nested_line.WholeLine() << "'\n" << "Toplevel CompositeComponent line '" << cfl->WholeLine() << "'"; } this_component->InitFromConfig(&nested_line); int32 props = this_component->Properties(); if ((props & kRandomComponent) != 0 || (props & kSimpleComponent) == 0) { KALDI_ERR << "CompositeComponent contains disallowed component type: " << nested_line.WholeLine(); } components.push_back(this_component); } if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); this->Init(components, max_rows_process); } const Component* CompositeComponent::GetComponent(int32 i) const { KALDI_ASSERT(static_cast(i) < components_.size()); return components_[i]; } void CompositeComponent::SetComponent(int32 i, Component *component) { KALDI_ASSERT(static_cast(i) < components_.size()); delete components_[i]; components_[i] = component; } SumBlockComponent::SumBlockComponent(const SumBlockComponent &other): input_dim_(other.input_dim_), output_dim_(other.output_dim_), scale_(other.scale_) { } void SumBlockComponent::InitFromConfig(ConfigLine *cfl) { scale_ = 1.0; bool ok = cfl->GetValue("input-dim", &input_dim_) && cfl->GetValue("output-dim", &output_dim_); if (!ok) KALDI_ERR << "input-dim and output-dim must both be provided."; if (input_dim_ <= 0 || input_dim_ % output_dim_ != 0) KALDI_ERR << "Invalid values input-dim=" << input_dim_ << " output-dim=" << output_dim_; cfl->GetValue("scale", &scale_); if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); } void SumBlockComponent::Read(std::istream &is, bool binary) { ExpectOneOrTwoTokens(is, binary, "", ""); ReadBasicType(is, binary, &input_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &output_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &scale_); ExpectToken(is, binary, ""); } void SumBlockComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteToken(os, binary, ""); WriteBasicType(os, binary, input_dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, output_dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, scale_); WriteToken(os, binary, ""); } std::string SumBlockComponent::Info() const { std::ostringstream stream; stream << Type() << ", input-dim=" << input_dim_ << ", output-dim=" << output_dim_ << ", scale=" << scale_; return stream.str(); } void* SumBlockComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { KALDI_ASSERT(out->NumRows() == in.NumRows() && out->NumCols() == output_dim_ && in.NumCols() == input_dim_); out->AddMatBlocks(scale_, in, kNoTrans); return NULL; } void SumBlockComponent::Backprop( const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, //in_value const CuMatrixBase &, // out_value, const CuMatrixBase &out_deriv, void *memo, Component *to_update, CuMatrixBase *in_deriv) const { if (in_deriv) { in_deriv->AddMatBlocks(scale_, out_deriv, kNoTrans); } } } // namespace nnet3 } // namespace kaldi