// nnet/nnet-frame-pooling-component.h // Copyright 2014 Brno University of Technology (author: Karel Vesely) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #ifndef KALDI_NNET_NNET_FRAME_POOLING_COMPONENT_H_ #define KALDI_NNET_NNET_FRAME_POOLING_COMPONENT_H_ #include #include #include #include #include "nnet/nnet-component.h" #include "nnet/nnet-utils.h" #include "cudamatrix/cu-math.h" namespace kaldi { namespace nnet1 { /** * FramePoolingComponent : * The input/output matrices are split to frames of width 'feature_dim_'. * Here we do weighted pooling of frames along the temporal axis, * given a frame-offset of leftmost frame, the pool-size is defined * by weight-vector size. */ class FramePoolingComponent : public UpdatableComponent { public: FramePoolingComponent(int32 dim_in, int32 dim_out): UpdatableComponent(dim_in, dim_out), feature_dim_(0), normalize_(false) { } ~FramePoolingComponent() { } Component* Copy() const { return new FramePoolingComponent(*this); } ComponentType GetType() const { return kFramePoolingComponent; } /** * Here the offsets are w.r.t. central frames, which has offset 0. * Note.: both the offsets and pool sizes can be negative. */ void InitData(std::istream &is) { // temporary, for initialization, std::vector pool_size; std::vector central_offset; Vector pool_weight; float learn_rate_coef = 0.01; // parse config std::string token; while (is >> std::ws, !is.eof()) { ReadToken(is, false, &token); /**/ if (token == "") ReadBasicType(is, false, &feature_dim_); else if (token == "") ReadIntegerVector(is, false, ¢ral_offset); else if (token == "") ReadIntegerVector(is, false, &pool_size); else if (token == "") pool_weight.Read(is, false); else if (token == "") ReadBasicType(is, false, &learn_rate_coef); else if (token == "") ReadBasicType(is, false, &normalize_); else KALDI_ERR << "Unknown token " << token << ", a typo in config?" << " (FeatureDim|CentralOffset |PoolSize |LearnRateCoef|Normalize)"; } // check inputs: KALDI_ASSERT(feature_dim_ > 0); KALDI_ASSERT(central_offset.size() > 0); KALDI_ASSERT(central_offset.size() == pool_size.size()); // initialize: int32 num_frames = InputDim() / feature_dim_; int32 central_frame = (num_frames -1) / 2; int32 num_pools = central_offset.size(); offset_.resize(num_pools); weight_.resize(num_pools); for (int32 p = 0; p < num_pools; p++) { offset_[p] = central_frame + central_offset[p] + std::min(0, pool_size[p]+1); weight_[p].Resize(std::abs(pool_size[p])); weight_[p].Set(1.0/std::abs(pool_size[p])); } learn_rate_coef_ = learn_rate_coef; if (pool_weight.Dim() != 0) { KALDI_LOG << "Initializing from pool-weight vector"; int32 num_weights = 0; for (int32 p = 0; p < num_pools; p++) { weight_[p].CopyFromVec(pool_weight.Range(num_weights, weight_[p].Dim())); num_weights += weight_[p].Dim(); } KALDI_ASSERT(num_weights == pool_weight.Dim()); } // check that offsets are within the splice we had, for (int32 p = 0; p < num_pools; p++) { KALDI_ASSERT(offset_[p] >= 0); KALDI_ASSERT(offset_[p] + weight_[p].Dim() <= num_frames); } } /** * Here the offsets are w.r.t. leftmost frame from splice, its offset is 0. * If we spliced +/- 15 frames, the central frames has index '15'. */ void ReadData(std::istream &is, bool binary) { // get the input dimension before splicing ExpectToken(is, binary, ""); ReadBasicType(is, binary, &feature_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &learn_rate_coef_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &normalize_); // read the offsets w.r.t. central frame ExpectToken(is, binary, ""); ReadIntegerVector(is, binary, &offset_); // read the frame-weights ExpectToken(is, binary, ""); int32 num_pools = offset_.size(); weight_.resize(num_pools); for (int32 p = 0; p < num_pools; p++) { weight_[p].Read(is, binary); } // // Sanity checks: // KALDI_ASSERT(input_dim_ % feature_dim_ == 0); KALDI_ASSERT(output_dim_ % feature_dim_ == 0); KALDI_ASSERT(output_dim_ / feature_dim_ == num_pools); KALDI_ASSERT(offset_.size() == weight_.size()); // check the shifts don't exceed the splicing int32 total_frame = InputDim() / feature_dim_; for (int32 p = 0; p < num_pools; p++) { KALDI_ASSERT(offset_[p] >= 0); KALDI_ASSERT(offset_[p] + (weight_[p].Dim()-1) < total_frame); } // } void WriteData(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteBasicType(os, binary, feature_dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, learn_rate_coef_); WriteToken(os, binary, ""); WriteBasicType(os, binary, normalize_); WriteToken(os, binary, ""); WriteIntegerVector(os, binary, offset_); // write pooling weights of individual frames WriteToken(os, binary, ""); int32 num_pools = offset_.size(); for (int32 p = 0; p < num_pools; p++) { weight_[p].Write(os, binary); } } int32 NumParams() const { int32 ans = 0; for (int32 p = 0; p < weight_.size(); p++) { ans += weight_[p].Dim(); } return ans; } void GetGradient(VectorBase *gradient) const { KALDI_ERR << "Unimplemented."; } void GetParams(VectorBase* params) const { KALDI_ASSERT(params->Dim() == NumParams()); int32 offset = 0; for (int32 p = 0; p < weight_.size(); p++) { params->Range(offset, weight_[p].Dim()).CopyFromVec(weight_[p]); offset += weight_[p].Dim(); } KALDI_ASSERT(offset == params->Dim()); } void SetParams(const VectorBase& params) { KALDI_ERR << "Unimplemented."; } std::string Info() const { std::ostringstream oss; oss << "\n (offset,weights) : "; for (int32 p = 0; p < weight_.size(); p++) { oss << "(" << offset_[p] << "," << weight_[p] << "), "; } return oss.str(); } std::string InfoGradient() const { std::ostringstream oss; oss << "\n lr-coef " << ToString(learn_rate_coef_); oss << "\n (offset,weights_grad) : "; for (int32 p = 0; p < weight_diff_.size(); p++) { oss << "(" << offset_[p] << ","; // pass the weight vector, remove '\n' as last char oss << weight_diff_[p]; oss.seekp(-1, std::ios_base::cur); oss << "), "; } return oss.str(); } void PropagateFnc(const CuMatrixBase &in, CuMatrixBase *out) { // check dims KALDI_ASSERT(in.NumCols() % feature_dim_ == 0); KALDI_ASSERT(out->NumCols() % feature_dim_ == 0); // useful dims int32 num_pools = offset_.size(); // compute the output pools for (int32 p = 0; p < num_pools; p++) { CuSubMatrix tgt(out->ColRange(p*feature_dim_, feature_dim_)); tgt.SetZero(); // reset for (int32 i = 0; i < weight_[p].Dim(); i++) { tgt.AddMat(weight_[p](i), in.ColRange((offset_[p]+i) * feature_dim_, feature_dim_)); } } } void BackpropagateFnc(const CuMatrixBase &in, const CuMatrixBase &out, const CuMatrixBase &out_diff, CuMatrixBase *in_diff) { KALDI_ERR << "Unimplemented."; } void Update(const CuMatrixBase &input, const CuMatrixBase &diff) { // useful dims int32 num_pools = offset_.size(); // lazy init if (weight_diff_.size() != num_pools) weight_diff_.resize(num_pools); // get the derivatives for (int32 p = 0; p < num_pools; p++) { weight_diff_[p].Resize(weight_[p].Dim(), kSetZero); // reset for (int32 i = 0; i < weight_[p].Dim(); i++) { // multiply matrices element-wise, and sum to get the derivative CuSubMatrix in_frame( input.ColRange((offset_[p]+i) * feature_dim_, feature_dim_) ); CuSubMatrix diff_frame( diff.ColRange(p * feature_dim_, feature_dim_) ); CuMatrix mul_elems(in_frame); mul_elems.MulElements(diff_frame); weight_diff_[p](i) = mul_elems.Sum(); } } // update for (int32 p = 0; p < num_pools; p++) { weight_[p].AddVec(- learn_rate_coef_ * opts_.learn_rate, weight_diff_[p]); } // force to be positive, re-normalize the sum if (normalize_) { for (int32 p = 0; p < num_pools; p++) { weight_[p].ApplyFloor(0.0); weight_[p].Scale(1.0/weight_[p].Sum()); } } } private: int32 feature_dim_; // feature dimension before splicing std::vector offset_; // vector of pooling offsets /// Vector of pooling weight vectors, std::vector > weight_; /// detivatives of weight vectors, std::vector > weight_diff_; bool normalize_; // apply normalization after each update }; } // namespace nnet1 } // namespace kaldi #endif // KALDI_NNET_NNET_FRAME_POOLING_COMPONENT_H_