Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/utils/nnet/make_nnet_proto.py 12.1 KB
  #!/usr/bin/env python
  
  # Copyright 2014-2016  Brno University of Technology (author: Karel Vesely)
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  # Generates an Nnet prototype (printed to stdout), to be initialized by 'nnet-initialize'.
  
  from __future__ import division
  from __future__ import print_function
  import math, random, sys, re
  
  ###
  ### Parse options
  ###
  from optparse import OptionParser
  usage = "%prog [options] <feat-dim> <num-leaves> <num-hid-layers> <num-hid-neurons> >nnet-proto-file"
  parser = OptionParser(usage)

  # All command-line switches as (flag, destination, help text, extra keyword
  # arguments of 'add_option'), listed in the order in which they get registered,
  option_table = [
    # Softmax related,
    ('--no-softmax', 'with_softmax',
     'Do not put <SoftMax> in the prototype [default: %default]',
     dict(default=True, action='store_false')),
    ('--block-softmax-dims', 'block_softmax_dims',
     'Generate <BlockSoftmax> with dims D1:D2:D3 [default: %default]',
     dict(default="", type='string')),
    # Activation related,
    ('--activation-type', 'activation_type',
     'Select type of activation function : (<Sigmoid>|<Tanh>|<ParametricRelu>) [default: %default]',
     dict(default='<Sigmoid>', type='string')),
    ('--activation-opts', 'activation_opts',
     'Additional options for protoype of activation function [default: %default]',
     dict(default='', type='string')),
    # Affine-transform related,
    ('--hid-bias-mean', 'hid_bias_mean',
     'Set bias for hidden activations [default: %default]',
     dict(default=-2.0, type='float')),
    ('--hid-bias-range', 'hid_bias_range',
     'Set bias range for hidden activations (+/- 1/2 range around mean) [default: %default]',
     dict(default=4.0, type='float')),
    ('--param-stddev-factor', 'param_stddev_factor',
     'Factor to rescale Normal distriburtion for initalizing weight matrices [default: %default]',
     dict(default=0.1, type='float')),
    ('--no-glorot-scaled-stddev', 'with_glorot',
     'Generate normalized weights according to X.Glorot paper, but mapping U->N with same variance (factor sqrt(x/(dim_in+dim_out)))',
     dict(action='store_false', default=True)),
    ('--no-smaller-input-weights', 'smaller_input_weights',
     'Disable 1/12 reduction of stddef in input layer [default: %default]',
     dict(action='store_false', default=True)),
    ('--no-bottleneck-trick', 'bottleneck_trick',
     'Disable smaller initial weights and learning rate around bottleneck',
     dict(action='store_false', default=True)),
    ('--max-norm', 'max_norm',
     'Max radius of neuron-weights in L2 space (if longer weights get shrinked, not applied to last layer, 0.0 = disable) [default: %default]',
     dict(default=0.0, type='float')),
    ('--affine-opts', 'affine_opts',
     'Additional options for protoype of affine tranform [default: %default]',
     dict(default='', type='string')),
    # Topology related,
    ('--bottleneck-dim', 'bottleneck_dim',
     'Make bottleneck network with desired bn-dim (0 = no bottleneck) [default: %default]',
     dict(default=0, type='int')),
    ('--with-dropout', 'with_dropout',
     'Add <Dropout> after the non-linearity of hidden layer.',
     dict(action='store_true', default=False)),
    ('--dropout-opts', 'dropout_opts',
     'Extra options for dropout [default: %default]',
     dict(default='', type='string')),
  ]
  for (opt_flag, opt_dest, opt_help, opt_extra) in option_table:
    parser.add_option(opt_flag, dest=opt_dest, help=opt_help, **opt_extra)


  # 'o' holds the option values, 'args' the positional arguments;
  # exactly 4 positional arguments are required, otherwise print help and quit,
  (o, args) = parser.parse_args()
  if len(args) != 4:
    parser.print_help()
    sys.exit(1)

  # A HACK TO PASS MULTI-WORD OPTIONS, WORDS ARE CONNECTED BY UNDERSCORES '_',
  # (every '_' in these three option strings is replaced by a space),
  for opt_dest in ('activation_opts', 'affine_opts', 'dropout_opts'):
    setattr(o, opt_dest, getattr(o, opt_dest).replace("_", " "))

  # The four positional arguments are converted to integers,
  feat_dim, num_leaves, num_hid_layers, num_hid_neurons = [int(arg) for arg in args]
  ### End parse options
  
  
  # Check the arguments.
  # Explicit tests are used rather than 'assert': asserts are removed when python
  # runs with '-O', and a failed assert gives no hint about what was wrong.
  # 'sys.exit(<string>)' prints the message to stderr and exits with status 1.
  for (arg_name, arg_value, arg_min) in [('<feat-dim>', feat_dim, 1),
                                         ('<num-leaves>', num_leaves, 1),
                                         ('<num-hid-layers>', num_hid_layers, 0),
                                         ('<num-hid-neurons>', num_hid_neurons, 1)]:
    if arg_value < arg_min:
      sys.exit("ERROR: %s must be >= %d, got %d" % (arg_name, arg_min, arg_value))
  if o.block_softmax_dims:
    # The block dims have to be integers that sum up to the number of outputs,
    # possible separators : ',' ':'
    try:
      block_dims = [int(dim) for dim in re.split("[,:]", o.block_softmax_dims)]
    except ValueError:
      sys.exit("ERROR: --block-softmax-dims must be integers separated by ':' or ',', got '%s'" %
               o.block_softmax_dims)
    if sum(block_dims) != num_leaves:
      sys.exit("ERROR: --block-softmax-dims '%s' sum up to %d, which differs from <num-leaves> %d" %
               (o.block_softmax_dims, sum(block_dims), num_leaves))
  
  # Optionally scale the stddev of the initial weights (Glorot-style normalization),
  def Glorot(dim1, dim2):
    """Return the scaling factor for the stddev of a weight matrix.

    'dim1' and 'dim2' are the two dimensions of the layer. When the Glorot
    scaling is switched off ('o.with_glorot' is False), the factor is 1.0,
    otherwise it is 35.0 * sqrt(2 / (dim1 + dim2)).
    """
    if not o.with_glorot:
      return 1.0
    # 35.0 = magic number, gives ~1.0 in inner layers for hid-dim 1024dim,
    return 35.0 * math.sqrt(2.0/(dim1+dim2))
  
  
  ###
  ### Print prototype of the network
  ###
  
  # ZERO HIDDEN LAYERS, BUT A BOTTLENECK IS REQUESTED!
  # In this case the prototype consists of:
  # - bottleneck layer + hidden layer + output layer
  if num_hid_layers == 0 and o.bottleneck_dim != 0:
    assert o.bottleneck_dim > 0
    assert num_hid_layers == 0

    # Linear projection from the input features into the bottleneck,
    bn_stddev = o.param_stddev_factor * Glorot(feat_dim, o.bottleneck_dim)
    if o.bottleneck_trick:
      # 25% smaller stddev -> small bottleneck range, 10x smaller learning rate
      print("<LinearTransform> <InputDim> %d <OutputDim> %d <ParamStddev> %f <LearnRateCoef> %f" %
            (feat_dim, o.bottleneck_dim, bn_stddev * 0.75, 0.1))
    else:
      print("<LinearTransform> <InputDim> %d <OutputDim> %d <ParamStddev> %f" %
            (feat_dim, o.bottleneck_dim, bn_stddev))

    # Affine transform from the bottleneck into the hidden layer,
    hid_stddev = o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons)
    if o.bottleneck_trick:
      # 25% smaller stddev -> smaller gradient in prev. layer, 10x smaller learning rate for weights & biases
      print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f <MaxNorm> %f" %
            (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range,
             hid_stddev * 0.75, 0.1, 0.1, o.max_norm))
    else:
      print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <MaxNorm> %f" %
            (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range,
             hid_stddev, o.max_norm))

    # Non-linearity of the hidden layer,
    print("%s <InputDim> %d <OutputDim> %d %s" %
          (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts))

    # Last AffineTransform (10x smaller learning rate on bias)
    out_stddev = o.param_stddev_factor * Glorot(num_hid_neurons, num_leaves)
    print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f" %
          (num_hid_neurons, num_leaves, 0.0, 0.0, out_stddev, 1.0, 0.1))

    # Optionally append softmax (block-softmax, if the block dims were given),
    if o.with_softmax:
      if o.block_softmax_dims != "":
        print("<BlockSoftmax> <InputDim> %d <OutputDim> %d <BlockDims> %s" %
              (num_leaves, num_leaves, o.block_softmax_dims))
      else:
        print("<Softmax> <InputDim> %d <OutputDim> %d" % (num_leaves, num_leaves))

    print("</NnetProto>")
    # The prototype is complete, nothing more to do,
    sys.exit(0)
  
  # ZERO HIDDEN LAYERS!
  # The network is just the output layer (logistic regression)
  if num_hid_layers == 0:
    out_stddev = o.param_stddev_factor * Glorot(feat_dim, num_leaves)
    print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f" %
          (feat_dim, num_leaves, 0.0, 0.0, out_stddev))

    # Optionally append softmax (block-softmax, if the block dims were given),
    if o.with_softmax:
      if o.block_softmax_dims != "":
        print("<BlockSoftmax> <InputDim> %d <OutputDim> %d <BlockDims> %s" %
              (num_leaves, num_leaves, o.block_softmax_dims))
      else:
        print("<Softmax> <InputDim> %d <OutputDim> %d" % (num_leaves, num_leaves))

    print("</NnetProto>")
    # The prototype is complete, nothing more to do,
    sys.exit(0)
  
  
  # THE USUAL DNN PROTOTYPE STARTS HERE!
  # From this point on there is at least one hidden layer,
  assert num_hid_layers > 0

  # Every hidden layer is printed as: AffineTransform, non-linearity, optional Dropout.
  # The first layer reads the input features, the other ones are hidden-to-hidden.
  for layer_index in range(num_hid_layers):
    if layer_index == 0:
      # First AffineTransform,
      # Note.: compensating dynamic range mismatch between input features and Sigmoid-hidden layers,
      # i.e. mapping the std-dev of N(0,1) (input features) to std-dev of U[0,1] (sigmoid-outputs).
      # This is done by multiplying with stddev(U[0,1]) = sqrt(1/12).
      # The stddev of weights is consequently reduced with scale 0.29,
      layer_in_dim = feat_dim
      input_scale = math.sqrt(1.0/12.0) if o.smaller_input_weights else 1.0
      layer_stddev = o.param_stddev_factor * Glorot(feat_dim, num_hid_neurons) * input_scale
    else:
      # Internal AffineTransforms,
      layer_in_dim = num_hid_neurons
      layer_stddev = o.param_stddev_factor * Glorot(num_hid_neurons, num_hid_neurons)

    print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <MaxNorm> %f %s" %
          (layer_in_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range,
           layer_stddev, o.max_norm, o.affine_opts))
    print("%s <InputDim> %d <OutputDim> %d %s" %
          (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts))
    if o.with_dropout:
      print("<Dropout> <InputDim> %d <OutputDim> %d %s" %
            (num_hid_neurons, num_hid_neurons, o.dropout_opts))
  
  # Optionally add bottleneck (linear projection to 'bottleneck_dim' + one more hidden layer),
  if o.bottleneck_dim != 0:
    assert o.bottleneck_dim > 0

    # Linear projection from the hidden layer into the bottleneck,
    bn_stddev = o.param_stddev_factor * Glorot(num_hid_neurons, o.bottleneck_dim)
    if o.bottleneck_trick:
      # 25% smaller stddev -> small bottleneck range, 10x smaller learning rate
      print("<LinearTransform> <InputDim> %d <OutputDim> %d <ParamStddev> %f <LearnRateCoef> %f" %
            (num_hid_neurons, o.bottleneck_dim, bn_stddev * 0.75, 0.1))
    else:
      # Same learning-rate and stddev-formula everywhere,
      print("<LinearTransform> <InputDim> %d <OutputDim> %d <ParamStddev> %f" %
            (num_hid_neurons, o.bottleneck_dim, bn_stddev))

    # Affine transform from the bottleneck back to the hidden-layer dimension,
    hid_stddev = o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons)
    if o.bottleneck_trick:
      # 25% smaller stddev -> smaller gradient in prev. layer, 10x smaller learning rate for weights & biases
      print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f <MaxNorm> %f %s" %
            (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range,
             hid_stddev * 0.75, 0.1, 0.1, o.max_norm, o.affine_opts))
    else:
      print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <MaxNorm> %f %s" %
            (o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range,
             hid_stddev, o.max_norm, o.affine_opts))

    # Non-linearity and optional dropout, same as in the other hidden layers,
    print("%s <InputDim> %d <OutputDim> %d %s" %
          (o.activation_type, num_hid_neurons, num_hid_neurons, o.activation_opts))
    if o.with_dropout:
      print("<Dropout> <InputDim> %d <OutputDim> %d %s" %
            (num_hid_neurons, num_hid_neurons, o.dropout_opts))
  
  # Last AffineTransform (10x smaller learning rate on bias)
  out_stddev = o.param_stddev_factor * Glorot(num_hid_neurons, num_leaves)
  print("<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f" %
        (num_hid_neurons, num_leaves, 0.0, 0.0, out_stddev, 1.0, 0.1))

  # Optionally append softmax (block-softmax, if the block dims were given),
  if o.with_softmax:
    if o.block_softmax_dims != "":
      print("<BlockSoftmax> <InputDim> %d <OutputDim> %d <BlockDims> %s" %
            (num_leaves, num_leaves, o.block_softmax_dims))
    else:
      print("<Softmax> <InputDim> %d <OutputDim> %d" % (num_leaves, num_leaves))

  # The prototype is complete, exit with success,
  sys.exit(0)