// nnet3/nnet-normalize-component.h

// Copyright 2011-2013  Karel Vesely
//           2012-2015  Johns Hopkins University (author: Daniel Povey)
//                2013  Xiaohui Zhang
//           2014-2015  Vijayaditya Peddinti
//           2014-2015  Guoguo Chen
//                2015  Daniel Galvez
//                2015  Tom Ko

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_NNET3_NNET_NORMALIZE_COMPONENT_H_
#define KALDI_NNET3_NNET_NORMALIZE_COMPONENT_H_

#include "nnet3/nnet-common.h"
#include "nnet3/nnet-component-itf.h"
#include "nnet3/natural-gradient-online.h"
#include <iostream>

namespace kaldi {
namespace nnet3 {

/// @file  nnet-normalize-component.h
///
/// This file contains declarations of components that in one way or
/// another normalize their input: NormalizeComponent and BatchNormComponent.

/*
   NormalizeComponent implements the function:

         y = x * (sqrt(dim(x)) * target-rms) / |x|

   where |x| is the 2-norm of the vector x.  I.e. the output is the input
   scaled so that the root-mean-square of its elements equals target-rms.
   (As a special case, if the input is zero, it outputs zero.)  This is like
   Hinton's layer-norm, except that it normalizes only the variance, not the
   mean.  Note: if you specify add-log-stddev=true, it adds an extra element
   to y which equals log(|x| / sqrt(dim(x))).

   Configuration values accepted:

      dim, or input-dim   Input dimension of this component, e.g. 1024.
                          Will be the same as the output dimension if
                          add-log-stddev=false.
      block-dim           Defaults to 'dim', but you may specify a divisor
                          of 'dim'.  In this case the input dimension will
                          be interpreted as blocks of dimension 'block-dim',
                          to which the nonlinearity described above is
                          applied separately.
      add-log-stddev      You can set this to true to add an extra output
                          dimension which will equal log(|x| / sqrt(dim(x))).
                          If block-dim is specified, this is done per block.
      target-rms          This defaults to 1.0, but if you set it to another
                          (nonzero) value, the output will be scaled by that
                          factor.
*/
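// A hedged illustration (an editorial addition, not part of the original
// header): in an nnet3 config file this component would typically be
// declared with a line like
//
//   component name=norm1 type=NormalizeComponent dim=1024 target-rms=1.0
//
// Worked example of the formula above: for x = (3, 4), so dim(x) = 2 and
// |x| = 5, with target-rms = 1.0 we get y = x * sqrt(2) / 5, approximately
// (0.849, 1.131); the root-mean-square of y's elements is then exactly 1.0.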
class NormalizeComponent: public Component {
 public:
  explicit NormalizeComponent(const NormalizeComponent &other);

  virtual int32 Properties() const {
    return kSimpleComponent|kBackpropNeedsInput|kBackpropAdds|
        (add_log_stddev_ ? 0 : kPropagateInPlace|kBackpropInPlace) |
        (block_dim_ != input_dim_ ? kInputContiguous|kOutputContiguous : 0);
  }
  NormalizeComponent() { }
  virtual std::string Type() const { return "NormalizeComponent"; }
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual Component* Copy() const { return new NormalizeComponent(*this); }
  virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
                          const CuMatrixBase<BaseFloat> &in,
                          CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        void *memo,
                        Component *to_update,
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  virtual void Read(std::istream &is, bool binary);
  virtual void Write(std::ostream &os, bool binary) const;
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const {
    return (input_dim_ + (add_log_stddev_ ? (input_dim_ / block_dim_) : 0));
  }
  virtual std::string Info() const;
 private:
  NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow.
  enum { kExpSquaredNormFloor = -66 };
  // kSquaredNormFloor is 2^-66, about 1.4e-20.  We need a value that's exactly
  // representable in float and whose inverse square root is also exactly
  // representable in float (hence, an even power of two).
  static const BaseFloat kSquaredNormFloor;
  int32 input_dim_;
  int32 block_dim_;
  BaseFloat target_rms_;  // The target rms for outputs, default 1.0.

  bool add_log_stddev_;  // If true, log(max(epsilon, sqrt(row_in^T row_in / D)))
                         // is an extra dimension of the output.
};


/*
  BatchNormComponent

  This implements batch normalization; for each dimension of the input it
  normalizes the data to be zero-mean, unit-variance.  You can set the
  block-dim configuration value to implement spatial batch normalization; see
  the comment for the variable.  If you want to combine this with the
  trainable offset and scale that the original BatchNorm paper used, then
  follow this with a ScaleAndOffsetComponent.

  It's a simple component (it uses the kSimpleComponent flag), but it is
  unusual in that it will give different results if you call it on half the
  matrix at a time.  Most of the time this would be pretty harmless, so we
  still return the kSimpleComponent flag.  We may have to modify the test code
  a little to account for this, or possibly remove the kSimpleComponent flag.
  In some sense each output Index depends on every input Index, but putting
  those dependencies explicitly into the dependency-tracking framework as a
  GeneralComponent would be very impractical and might lead to a lot of
  unnecessary things being computed.  You have to be a bit careful where you
  put this component, and understand what you're doing: e.g. putting it in
  the path of a recurrence is a bit problematic if the minibatch size is
  small.

  Accepted configuration values:

     dim          Dimension of the input and output.
     block-dim    Defaults to 'dim', but may be set to a divisor of 'dim'.
                  In this case, each block of dimension 'block-dim' is
                  treated like a separate row of the input matrix, which
                  means that the stats from the n'th element of each block
                  are pooled into one class, for each n.
     epsilon      Small term added to the variance that is used to prevent
                  division by zero.
     target-rms   This defaults to 1.0, but if set, for instance, to 2.0, it
                  will normalize the standard deviation of the output to 2.0.
                  'target-stddev' might be a more suitable name, but this was
                  chosen for consistency with NormalizeComponent.
*/
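// A hedged illustration (an editorial addition, not part of the original
// header): a typical nnet3 config declaration, with a following
// ScaleAndOffsetComponent to recover the trainable offset and scale of the
// original BatchNorm paper, might look like
//
//   component name=batchnorm1 type=BatchNormComponent dim=1024 epsilon=1.0e-03
//   component name=scaleoffset1 type=ScaleAndOffsetComponent dim=1024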
class BatchNormComponent: public Component {
 public:

  BatchNormComponent() { }

  // Call this with 'true' to set 'test mode', where the batch normalization
  // is done with stored stats.  There won't normally be any need to specially
  // accumulate these stats; they are stored as a matter of course on each
  // iteration of training, as for NonlinearComponents, and we'll use the
  // stats from the most recent [script-level] iteration.
  // (Note: it will refuse to actually set test-mode to true if there
  // are no stats stored.)
  void SetTestMode(bool test_mode);

  // Constructor using another component (i.e. a copy constructor).
  BatchNormComponent(const BatchNormComponent &other);

  virtual int32 InputDim() const { return dim_; }
  virtual int32 OutputDim() const { return dim_; }

  virtual std::string Info() const;
  virtual void InitFromConfig(ConfigLine *cfl);
  virtual std::string Type() const { return "BatchNormComponent"; }
  virtual int32 Properties() const {
    // If the block-dim is less than the dim, we need the input and output
    // matrices to be contiguous (stride==num-cols), as we'll be reshaping
    // internally.  This is not much of a cost, because this will be used
    // in convnets where we have to do this anyway.
    return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|
        kBackpropInPlace|
        (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0)|
        (test_mode_ ? 0 : kUsesMemo|kStoresStats);
  }
  virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
                          const CuMatrixBase<BaseFloat> &in,
                          CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const std::string &debug_info,
                        const ComponentPrecomputedIndexes *indexes,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        void *memo,
                        Component *, // to_update
                        CuMatrixBase<BaseFloat> *in_deriv) const;

  // This Read function requires that the Component has the correct type.
  virtual void Read(std::istream &is, bool binary);

  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual Component* Copy() const { return new BatchNormComponent(*this); }

  virtual void Scale(BaseFloat scale);
  virtual void Add(BaseFloat alpha, const Component &other);
  virtual void ZeroStats();

  virtual void DeleteMemo(void *memo) const { delete static_cast<Memo*>(memo); }

  virtual void StoreStats(const CuMatrixBase<BaseFloat> &in_value,
                          const CuMatrixBase<BaseFloat> &out_value,
                          void *memo);

  // Members specific to this component type.
  // Note: the offset and scale will only be nonempty in 'test mode'.
  const CuVector<BaseFloat> &Offset() const { return offset_; }
  const CuVector<BaseFloat> &Scale() const { return scale_; }

 private:

  struct Memo {
    // number of frames (after any reshaping).
    int32 num_frames;
    // 'mean_uvar_scale' is of dimension 5 by block_dim_:
    // Row 0 = mean = the mean of the rows of the input
    // Row 1 = uvar = the uncentered variance of the input
    //         (= sumsq / num_frames).
    // Row 2 = scale = the scale of the renormalization.
    // Rows 3 and 4 are used as temporaries in Backprop.
    CuMatrix<BaseFloat> mean_uvar_scale;
  };

  void Check() const;

  // This function is used in a couple of places; it turns the raw stats into
  // the offset/scale term of a normalizing transform.
  static void ComputeOffsetAndScale(double count,
                                    BaseFloat epsilon,
                                    const Vector<double> &stats_sum,
                                    const Vector<double> &stats_sumsq,
                                    Vector<BaseFloat> *offset,
                                    Vector<BaseFloat> *scale);
  // Computes derived parameters offset_ and scale_.
  void ComputeDerived();
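  // A hedged sketch (an editorial assumption, not from the original header)
  // of how ComputeOffsetAndScale() plausibly derives the test-mode transform
  // out = in * scale + offset from the accumulated stats, following the
  // standard batch-norm equations:
  //
  //   mean   = stats_sum / count
  //   var    = stats_sumsq / count - mean^2
  //   scale  = target-rms / sqrt(var + epsilon)
  //   offset = -mean * scale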
  // Dimension of the input and output.
  int32 dim_;
  // This would normally be the same as dim_, but if it's less (and it must be
  // > 0 and must divide dim_), then each separate block of the input, of
  // dimension 'block_dim_', is treated like a separate frame for the purposes
  // of normalization.  This can be used to implement spatial batch
  // normalization for convolutional setups, assuming the filter-dim has
  // stride 1, which it always will in the new code in
  // nnet-convolutional-component.h.
  int32 block_dim_;

  // Used to avoid exact-zero variances; epsilon has the dimension of a
  // covariance.
  BaseFloat epsilon_;

  // This value will normally be 1.0, which is the default, but you can set it
  // to other values as a way to control how fast the following layer learns
  // (smaller -> slower).  The same config exists in NormalizeComponent.
  BaseFloat target_rms_;

  // This is true if we want the batch normalization to operate in 'test mode',
  // meaning the data mean and stddev used for the normalization are fixed
  // quantities based on previously accumulated stats.  Note: the stats we use
  // for this are based on the same 'StoreStats' mechanism as we use for
  // components like SigmoidComponent and ReluComponent; we'll be using
  // the stats from the most recent [script-level] iteration of training.
  bool test_mode_;

  // Total count of stats stored by StoreStats().
  double count_;
  // Sum-of-data component of stats of input data.
  CuVector<double> stats_sum_;
  // Sum-of-squared component of stats of input data.
  CuVector<double> stats_sumsq_;

  // offset_ and scale_ are derived from stats_sum_ and stats_sumsq_; they
  // dictate the transform that is done in 'test mode'.  They are set only when
  // reading the model from disk and when calling SetTestMode(true); they are
  // resized to empty when the stats are updated, to ensure that out-of-date
  // values are not kept around.
  CuVector<BaseFloat> offset_;
  CuVector<BaseFloat> scale_;
};

} // namespace nnet3
} // namespace kaldi

#endif  // KALDI_NNET3_NNET_NORMALIZE_COMPONENT_H_