// nnet/nnet-activation.h // Copyright 2011-2016 Brno University of Technology (author: Karel Vesely) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #ifndef KALDI_NNET_NNET_ACTIVATION_H_ #define KALDI_NNET_NNET_ACTIVATION_H_ #include #include #include #include "nnet/nnet-component.h" #include "nnet/nnet-utils.h" #include "cudamatrix/cu-math.h" #include "cudamatrix/cu-rand.h" #include "util/text-utils.h" namespace kaldi { namespace nnet1 { class Softmax : public Component { public: Softmax(int32 dim_in, int32 dim_out): Component(dim_in, dim_out) { } ~Softmax() { } Component* Copy() const { return new Softmax(*this); } ComponentType GetType() const { return kSoftmax; } void PropagateFnc(const CuMatrixBase &in, CuMatrixBase *out) { // y = e^x_j/sum_j(e^x_j) out->SoftMaxPerRow(in); } void BackpropagateFnc(const CuMatrixBase &in, const CuMatrixBase &out, const CuMatrixBase &out_diff, CuMatrixBase *in_diff) { // simply copy the error derivative // (ie. assume crossentropy error function, // while in_diff contains (net_output-target) : // this is already derivative of the error with // respect to activations of last layer neurons) in_diff->CopyFromMat(out_diff); } }; class HiddenSoftmax : public Component { public: HiddenSoftmax(int32 dim_in, int32 dim_out) : Component(dim_in, dim_out) { } ~HiddenSoftmax() { } Component* Copy() const { return new HiddenSoftmax(*this); } ComponentType GetType() const { return kHiddenSoftmax; } void PropagateFnc(const CuMatrixBase &in, CuMatrixBase *out) { // y = e^x_j/sum_j(e^x_j) out->SoftMaxPerRow(in); } void BackpropagateFnc(const CuMatrixBase &in, const CuMatrixBase &out, const CuMatrixBase &out_diff, CuMatrixBase *in_diff) { // This Softmax should be used for a hidden layer, it calculates // the true Jacobian of Softmax: J = diag(out) - out*out^T // The backpropagation formual is: // in_diff = out_diff \odot out - out(out_diff^T * out) // (where \odot is Hadamard product) // 1st term, out_diff \odot out, in_diff->CopyFromMat(out_diff); in_diff->MulElements(out); // 2nd term, -out(out_diff^T * out), diag_out_diff_out_.Resize(out.NumRows()); diag_out_diff_out_.AddDiagMatMat(1.0, out_diff, kNoTrans, out, kTrans, 0.0); in_diff->AddDiagVecMat(-1.0, diag_out_diff_out_, out, kNoTrans, 1.0); } private: /// buffer for dot-products in BackpropagateFnc, CuVector diag_out_diff_out_; }; class BlockSoftmax : public Component { public: BlockSoftmax(int32 dim_in, int32 dim_out): Component(dim_in, dim_out) { } ~BlockSoftmax() { } Component* Copy() const { return new BlockSoftmax(*this); } ComponentType GetType() const { return kBlockSoftmax; } void InitData(std::istream &is) { // parse config std::string token, dims_str; while (is >> std::ws, !is.eof()) { ReadToken(is, false, &token); /**/ if (token == "") is >> dims_str; else KALDI_ERR << "Unknown token " << token << ", a typo in config?" << " (BlockDims)"; } // parse dims, if (!kaldi::SplitStringToIntegers(dims_str, ",:", false, &block_dims)) KALDI_ERR << "Invalid block-dims " << dims_str; // sanity check int32 sum = 0; for (int32 i = 0; i < block_dims.size(); i++) { sum += block_dims[i]; } KALDI_ASSERT(sum == OutputDim()); } void ReadData(std::istream &is, bool binary) { ReadIntegerVector(is, binary, &block_dims); block_offset.resize(block_dims.size()+1, 0); for (int32 i = 0; i < block_dims.size(); i++) { block_offset[i+1] = block_offset[i] + block_dims[i]; } // check KALDI_ASSERT(OutputDim() == block_offset[block_offset.size()-1]); } void WriteData(std::ostream &os, bool binary) const { WriteIntegerVector(os, binary, block_dims); } void PropagateFnc(const CuMatrixBase &in, CuMatrixBase *out) { // perform softmax per block: for (int32 bl = 0; bl < block_dims.size(); bl++) { // get the blocks, CuSubMatrix in_bl = in.ColRange(block_offset[bl], block_dims[bl]); CuSubMatrix out_bl = out->ColRange(block_offset[bl], block_dims[bl]); // y = e^x_j/sum_j(e^x_j), out_bl.SoftMaxPerRow(in_bl); } } void BackpropagateFnc(const CuMatrixBase &in, const CuMatrixBase &out, const CuMatrixBase &out_diff, CuMatrixBase *in_diff) { // copy the error derivative: // (assuming we already got softmax-cross-entropy derivative in out_diff) in_diff->CopyFromMat(out_diff); // Set the derivatives to zero for the matrix-lines in which // the sum of 'derivatives' was 1.0 (i.e. there was no target): for (int32 bl = 0; bl < block_dims.size(); bl++) { // get the block, CuSubMatrix diff_bl = in_diff->ColRange(block_offset[bl], block_dims[bl]); // get the sum of each row, CuVector row_sum(diff_bl.NumRows()); row_sum.AddColSumMat(1.0, diff_bl, 0.0); // 0: keep as-is, 1: zero-out // we'll scale rows by 0/1 masks, CuVector row_diff_mask(row_sum); row_diff_mask.Scale(-1.0); // 0: keep as-is, -1: zero-out row_diff_mask.Add(1.0); // 1: keep as-is, 0: zero-out // here we should have only 0's and 1's, diff_bl.MulRowsVec(row_diff_mask); } } std::string Info() const { return "\n softmax-dims " + ToString(block_dims); } std::vector block_dims; std::vector block_offset; }; class Sigmoid : public Component { public: Sigmoid(int32 dim_in, int32 dim_out): Component(dim_in, dim_out) { } ~Sigmoid() { } Component* Copy() const { return new Sigmoid(*this); } ComponentType GetType() const { return kSigmoid; } void PropagateFnc(const CuMatrixBase &in, CuMatrixBase *out) { // y = 1/(1+e^-x) out->Sigmoid(in); } void BackpropagateFnc(const CuMatrixBase &in, const CuMatrixBase &out, const CuMatrixBase &out_diff, CuMatrixBase *in_diff) { // ey = y(1-y)ex, in_diff->DiffSigmoid(out, out_diff); } }; class Tanh : public Component { public: Tanh(int32 dim_in, int32 dim_out): Component(dim_in, dim_out) { } ~Tanh() { } Component* Copy() const { return new Tanh(*this); } ComponentType GetType() const { return kTanh; } void PropagateFnc(const CuMatrixBase &in, CuMatrixBase *out) { // y = (e^x - e^(-x)) / (e^x + e^(-x)), out->Tanh(in); } void BackpropagateFnc(const CuMatrixBase &in, const CuMatrixBase &out, const CuMatrixBase &out_diff, CuMatrixBase *in_diff) { // ey = (1 - y^2)ex in_diff->DiffTanh(out, out_diff); } }; class Dropout : public Component { public: Dropout(int32 dim_in, int32 dim_out): Component(dim_in, dim_out), dropout_rate_(0.5) { } ~Dropout() { } Component* Copy() const { return new Dropout(*this); } ComponentType GetType() const { return kDropout; } void InitData(std::istream &is) { is >> std::ws; // eat-up whitespace // parse config std::string token; while (is >> std::ws, !is.eof()) { ReadToken(is, false, &token); /**/ if (token == "") ReadBasicType(is, false, &dropout_rate_); else KALDI_ERR << "Unknown token " << token << ", a typo in config?" << " (DropoutRate)"; } KALDI_ASSERT(dropout_rate_ >= 0.0 && dropout_rate_ < 1.0); } void ReadData(std::istream &is, bool binary) { // Read all the '' in arbitrary order, bool finished = false; while ('<' == Peek(is, binary) && !finished) { std::string token; int first_char = PeekToken(is, binary); switch (first_char) { case 'D': ReadToken(is, false, &token); /**/ if (token == "") ReadBasicType(is, binary, &dropout_rate_); else if (token == "") { /* compatibility */ BaseFloat dropout_retention; ReadBasicType(is, binary, &dropout_retention); dropout_rate_ = 1.0 - dropout_retention; } else KALDI_ERR << "Unknown token: " << token; break; case '!': ExpectToken(is, binary, ""); finished = true; break; default: ReadToken(is, false, &token); KALDI_ERR << "Unknown token: " << token; } } KALDI_ASSERT(dropout_rate_ >= 0.0 && dropout_rate_ < 1.0); } void WriteData(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteBasicType(os, binary, dropout_rate_); } std::string Info() const { return std::string(" ") + ToString(dropout_rate_); } void PropagateFnc(const CuMatrixBase &in, CuMatrixBase *out) { out->CopyFromMat(in); // set N inputs to zero, according to the 'dropout_rate_' ... dropout_mask_.Resize(out->NumRows(), out->NumCols()); rand_.RandUniform(&dropout_mask_); // [0..1] dropout_mask_.Add(-dropout_rate_); // [(-rate)..(1-rate)] dropout_mask_.Heaviside(dropout_mask_); // (x > 0.0 ? 1 : 0) out->MulElements(dropout_mask_); // rescale to keep the same dynamic range as w/o dropout, out->Scale(1.0 / (1.0 - dropout_rate_)); } void BackpropagateFnc(const CuMatrixBase &in, const CuMatrixBase &out, const CuMatrixBase &out_diff, CuMatrixBase *in_diff) { in_diff->CopyFromMat(out_diff); // use same mask on the error derivatives... in_diff->MulElements(dropout_mask_); // enlarge the output to fit same dynamic range as w/o dropout in_diff->Scale(1.0 / (1.0 - dropout_rate_)); } BaseFloat GetDropoutRate() { return dropout_rate_; } void SetDropoutRate(BaseFloat dr) { dropout_rate_ = dr; KALDI_ASSERT(dropout_rate_ >= 0.0 && dropout_rate_ < 1.0); } private: BaseFloat dropout_rate_; ///< probability that a neuron is dropped, CuRand rand_; ///< generator of random numbers, CuMatrix dropout_mask_; // random binary mask, // 1 = keep neuron, 0 = drop neuron, }; } // namespace nnet1 } // namespace kaldi #endif // KALDI_NNET_NNET_ACTIVATION_H_