# composite_layers.py
# Copyright 2018 Johns Hopkins University (Dan Povey)
# Apache 2.0.
""" This module contains some composite layers, which is basically a catch-all
term for things like TDNN-F that contain several affine or linear comopnents.
"""
from __future__ import print_function
import math
import re
import sys
from libs.nnet3.xconfig.basic_layers import XconfigLayerBase
# This class is intended to implement an extension of the factorized TDNN
# (TDNN-F) that supports resnet-type 'bypass' connections. It is for lines like
# the following:
#
# tdnnf-layer name=tdnnf2 dim=1024 bottleneck-dim=128 dropout-proportion=0.0 time-stride=3
#
# The line above would be roughly equivalent to the following lines (except
# for different naming, and the use of TdnnComponent, for efficiency, in place
# of AffineComponent).  Assume that the previous layer (the default input) was tdnnf1:
#
# linear-component name=tdnnf2.linear dim=128 orthonormal-constraint=-1.0 input=Append(Offset(-3, tdnnf1), tdnnf1)
# relu-batchnorm-dropout-layer name=tdnnf2.affine dim=1024 dropout-proportion=0.0 \
#    dropout-per-dim-continuous=true input=Append(0,3)
# no-op-component name=tdnnf2 input=Sum(Scale(0.66,tdnnf1), tdnnf2.affine)
#
# Documentation of some of the important options:
#
# - dropout-proportion
# This gets passed through to the dropout component. If you don't set
# 'dropout-proportion', no dropout component will be included; it would be like
# using a relu-batchnorm-layer in place of a relu-batchnorm-dropout-layer. You
# should only set 'dropout-proportion' if you intend to use dropout (it would
# usually be combined with the --dropout-schedule option to train.py). If you
# use the --dropout-schedule option, the value doesn't really matter since it
# will be changed during training, and 0 is recommended.
#
# - time-stride
# Controls the time offsets in the splicing, e.g. if you set time-stride to
# 1 instead of the 3 in the example, the time-offsets would be -1 and 1 instead
# of -3 and 3.
# If you set time-stride=0, as a special case no splicing over time will be
# performed (no Append() expressions in the equivalent expansion above): both
# TdnnComponents just get time-offsets=0, so the layer has no temporal context.
# You can set time-stride to a negative number which will negate all the
# time indexes; it might potentially be useful to alternate negative and positive
# time-stride if you wanted to force the overall network to have symmetric
# context, since with positive time stride, this layer has more negative
# than positive time context (i.e. more left than right).
#
# - bypass-scale
# A scale on the previous layer's output, used in bypass (resnet-type)
# connections.  Should not exceed 1.0.  The default is 0.66.  If you set it to
# zero, the layer won't use a bypass connection at all, so it would be like a
# conventional TDNN-F layer (but we don't recommend this).  Note: the layer
# outputs are added together after the batchnorm, so the model cannot control
# their relative magnitudes, and this does actually affect what it can model.
# When we experimented with making this scale trainable it did not seem to
# give an advantage.
#
# - l2-regularize
# This is passed through to the linear and affine components. You'll normally
# want this to be set to a nonzero value, e.g. 0.004.
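#
# For concreteness, a sketch of what _generate_config() below would emit for the
# example line above (assuming the previous layer tdnnf1 also has dim=1024, and
# abbreviating the l2-regularize / max-change / self-repair-scale options with '...'):
#
#   component name=tdnnf2.linear type=TdnnComponent input-dim=1024 output-dim=128 \
#       use-bias=false time-offsets=-3,0 orthonormal-constraint=-1.0 ...
#   component name=tdnnf2.affine type=TdnnComponent input-dim=128 output-dim=1024 time-offsets=0,3 ...
#   component name=tdnnf2.relu type=RectifiedLinearComponent dim=1024 ...
#   component name=tdnnf2.batchnorm type=BatchNormComponent dim=1024
#   component name=tdnnf2.dropout type=GeneralDropoutComponent dim=1024 dropout-proportion=0.0 continuous=true
#   component name=tdnnf2.noop type=NoOpComponent dim=1024
#
# plus the corresponding component-node lines, with the noop node's input being
# Sum(Scale(0.66, tdnnf1), tdnnf2.dropout).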
class XconfigTdnnfLayer(XconfigLayerBase):

    def __init__(self, first_token, key_to_value, prev_names=None):
        assert first_token == "tdnnf-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        self.config = {'input': '[-1]',
                       'dim': -1,
                       'bottleneck-dim': -1,
                       'bypass-scale': 0.66,
                       'dropout-proportion': -1.0,
                       'time-stride': 1,
                       'l2-regularize': 0.0,
                       'max-change': 0.75,
                       'self-repair-scale': 1.0e-05}

    def set_derived_configs(self):
        pass

    def check_configs(self):
        if self.config['bottleneck-dim'] <= 0:
            raise RuntimeError("bottleneck-dim must be set and >0.")
        if self.config['dim'] <= self.config['bottleneck-dim']:
            raise RuntimeError("dim must be greater than bottleneck-dim")
        dropout = self.config['dropout-proportion']
        if dropout != -1.0 and not (dropout >= 0.0 and dropout < 1.0):
            raise RuntimeError("invalid value for dropout-proportion")
        if abs(self.config['bypass-scale']) > 1.0:
            raise RuntimeError("bypass-scale has invalid value")
        input_dim = self.descriptors['input']['dim']
        output_dim = self.config['dim']
        if output_dim != input_dim and self.config['bypass-scale'] != 0.0:
            raise RuntimeError('bypass-scale is nonzero but output-dim != input-dim: '
                               '{0} != {1}'.format(output_dim, input_dim))

    def output_name(self, auxiliary_output=None):
        assert auxiliary_output is None
        output_component = ''
        if self.config['bypass-scale'] != 0.0:
            # the no-op component is used to cache something that we don't want
            # to have to recompute.
            output_component = 'noop'
        elif self.config['dropout-proportion'] != -1.0:
            output_component = 'dropout'
        else:
            output_component = 'batchnorm'
        return '{0}.{1}'.format(self.name, output_component)

    def output_dim(self, auxiliary_output=None):
        return self.config['dim']

    def get_full_config(self):
        ans = []
        config_lines = self._generate_config()
        for line in config_lines:
            for config_name in ['ref', 'final']:
                ans.append((config_name, line))
        return ans

    def _generate_config(self):
        configs = []
        name = self.name
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        output_dim = self.config['dim']
        bottleneck_dim = self.config['bottleneck-dim']
        bypass_scale = self.config['bypass-scale']
        dropout_proportion = self.config['dropout-proportion']
        time_stride = self.config['time-stride']
        if time_stride != 0:
            time_offsets1 = '{0},0'.format(-time_stride)
            time_offsets2 = '0,{0}'.format(time_stride)
        else:
            time_offsets1 = '0'
            time_offsets2 = '0'
        l2_regularize = self.config['l2-regularize']
        max_change = self.config['max-change']
        self_repair_scale = self.config['self-repair-scale']

        # The first linear layer, from input-dim (spliced x2) to bottleneck-dim
        configs.append('component name={0}.linear type=TdnnComponent input-dim={1} '
                       'output-dim={2} l2-regularize={3} max-change={4} use-bias=false '
                       'time-offsets={5} orthonormal-constraint=-1.0'.format(
                           name, input_dim, bottleneck_dim, l2_regularize,
                           max_change, time_offsets1))
        configs.append('component-node name={0}.linear component={0}.linear '
                       'input={1}'.format(name, input_descriptor))

        # The affine layer, from bottleneck-dim (spliced x2) to output-dim
        configs.append('component name={0}.affine type=TdnnComponent '
                       'input-dim={1} output-dim={2} l2-regularize={3} max-change={4} '
                       'time-offsets={5}'.format(
                           name, bottleneck_dim, output_dim, l2_regularize,
                           max_change, time_offsets2))
        configs.append('component-node name={0}.affine component={0}.affine '
                       'input={0}.linear'.format(name))

        # The ReLU layer
        configs.append('component name={0}.relu type=RectifiedLinearComponent dim={1} '
                       'self-repair-scale={2}'.format(
                           name, output_dim, self_repair_scale))
        configs.append('component-node name={0}.relu component={0}.relu '
                       'input={0}.affine'.format(name))

        # The BatchNorm layer
        configs.append('component name={0}.batchnorm type=BatchNormComponent '
                       'dim={1}'.format(name, output_dim))
        configs.append('component-node name={0}.batchnorm component={0}.batchnorm '
                       'input={0}.relu'.format(name))

        if dropout_proportion != -1:
            # This is not normal dropout.  It's dropout where the mask is shared
            # across time, and (thanks to continuous=true), instead of a
            # zero-or-one scale, it's a continuously varying scale whose
            # expected value is 1, drawn from a uniform distribution over an
            # interval of a size that varies with dropout-proportion.
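            # (As an illustration only: for dropout-proportion p, we believe the
            # per-dim scale is drawn uniformly from roughly [1 - 2p, 1 + 2p], so
            # e.g. p=0.5 would give scales somewhere around [0, 2] rather than
            # the usual zero-or-one mask; see GeneralDropoutComponent in Kaldi
            # for the exact formula.)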
            configs.append('component name={0}.dropout type=GeneralDropoutComponent '
                           'dim={1} dropout-proportion={2} continuous=true'.format(
                               name, output_dim, dropout_proportion))
            configs.append('component-node name={0}.dropout component={0}.dropout '
                           'input={0}.batchnorm'.format(name))
            cur_component_type = 'dropout'
        else:
            cur_component_type = 'batchnorm'

        if bypass_scale != 0.0:
            # Add a NoOpComponent to cache the weighted sum of the input and the
            # output.  We could easily have the output of the layer be a
            # Descriptor like 'Sum(Scale(0.66, tdnnf1.batchnorm), tdnnf2.batchnorm)',
            # but if we did that and you used many of these layers in sequence,
            # the weighted sums would have more and more terms as you went deeper
            # in the network.
            configs.append('component name={0}.noop type=NoOpComponent '
                           'dim={1}'.format(name, output_dim))
            configs.append('component-node name={0}.noop component={0}.noop '
                           'input=Sum(Scale({1}, {2}), {0}.{3})'.format(
                               name, bypass_scale, input_descriptor,
                               cur_component_type))
        return configs

# This is for lines like the following:
# prefinal-layer name=prefinal-chain input=prefinal-l l2-regularize=0.02 big-dim=1024 small-dim=256
#
# which is equivalent to the following sequence of components (except for
# name differences):
# relu-batchnorm-layer name=prefinal-chain input=prefinal-l l2-regularize=0.02 dim=1024
# linear-component name=prefinal-chain-l dim=256 l2-regularize=0.02 orthonormal-constraint=-1.0
# batchnorm-component name=prefinal-chain-batchnorm
#
# This layer is really just for convenience in writing config files: it doesn't
# do anything that's particularly hard or unusual, but it encapsulates a commonly
# repeated pattern.
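#
# A typical usage pattern (illustrative; the exact names and dims depend on the
# recipe) is to have two of these near the end of a 'chain' model, one per
# output branch, e.g.:
#
#   prefinal-layer name=prefinal-chain input=prefinal-l big-dim=1024 small-dim=192
#   output-layer name=output include-log-softmax=false dim=$num_targets
#   prefinal-layer name=prefinal-xent input=prefinal-l big-dim=1024 small-dim=192
#   output-layer name=output-xent dim=$num_targets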
class XconfigPrefinalLayer(XconfigLayerBase):

    def __init__(self, first_token, key_to_value, prev_names=None):
        assert first_token == "prefinal-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        self.config = {'input': '[-1]',
                       'big-dim': -1,
                       'small-dim': -1,
                       'l2-regularize': 0.0,
                       'max-change': 0.75,
                       'self-repair-scale': 1.0e-05}

    def set_derived_configs(self):
        pass

    def check_configs(self):
        if self.config['small-dim'] <= 0:
            raise RuntimeError("small-dim must be set and >0.")
        if self.config['big-dim'] <= self.config['small-dim']:
            raise RuntimeError("big-dim must be greater than small-dim")

    def output_name(self, auxiliary_output=None):
        assert auxiliary_output is None
        return '{0}.batchnorm2'.format(self.name)

    def output_dim(self, auxiliary_output=None):
        return self.config['small-dim']

    def get_full_config(self):
        ans = []
        config_lines = self._generate_config()
        for line in config_lines:
            for config_name in ['ref', 'final']:
                ans.append((config_name, line))
        return ans

    def _generate_config(self):
        configs = []
        name = self.name
        input_dim = self.descriptors['input']['dim']
        input_descriptor = self.descriptors['input']['final-string']
        small_dim = self.config['small-dim']
        big_dim = self.config['big-dim']
        l2_regularize = self.config['l2-regularize']
        max_change = self.config['max-change']
        self_repair_scale = self.config['self-repair-scale']

        # The affine layer, from input-dim to big-dim.
        configs.append('component name={0}.affine type=NaturalGradientAffineComponent '
                       'input-dim={1} output-dim={2} l2-regularize={3} max-change={4}'.format(
                           name, input_dim, big_dim, l2_regularize, max_change))
        configs.append('component-node name={0}.affine component={0}.affine '
                       'input={1}'.format(name, input_descriptor))

        # The ReLU layer
        configs.append('component name={0}.relu type=RectifiedLinearComponent dim={1} '
                       'self-repair-scale={2}'.format(
                           name, big_dim, self_repair_scale))
        configs.append('component-node name={0}.relu component={0}.relu '
                       'input={0}.affine'.format(name))

        # The first BatchNorm layer
        configs.append('component name={0}.batchnorm1 type=BatchNormComponent '
                       'dim={1}'.format(name, big_dim))
        configs.append('component-node name={0}.batchnorm1 component={0}.batchnorm1 '
                       'input={0}.relu'.format(name))

        # The linear layer, from big-dim to small-dim, with orthonormal-constraint=-1
        # ("floating" orthonormal constraint).
        configs.append('component name={0}.linear type=LinearComponent '
                       'input-dim={1} output-dim={2} l2-regularize={3} max-change={4} '
                       'orthonormal-constraint=-1 '.format(
                           name, big_dim, small_dim,
                           l2_regularize, max_change))
        configs.append('component-node name={0}.linear component={0}.linear '
                       'input={0}.batchnorm1'.format(name))

        # The second BatchNorm layer
        configs.append('component name={0}.batchnorm2 type=BatchNormComponent '
                       'dim={1}'.format(name, small_dim))
        configs.append('component-node name={0}.batchnorm2 component={0}.batchnorm2 '
                       'input={0}.linear'.format(name))
        return configs
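

# For concreteness, a sketch of what _generate_config() above would emit for the
# example line "prefinal-layer name=prefinal-chain input=prefinal-l
# l2-regularize=0.02 big-dim=1024 small-dim=256" (abbreviating the max-change /
# self-repair-scale options with '...', and writing D for whatever dimension the
# input 'prefinal-l' has):
#
#   component name=prefinal-chain.affine type=NaturalGradientAffineComponent input-dim=D output-dim=1024 l2-regularize=0.02 ...
#   component name=prefinal-chain.relu type=RectifiedLinearComponent dim=1024 ...
#   component name=prefinal-chain.batchnorm1 type=BatchNormComponent dim=1024
#   component name=prefinal-chain.linear type=LinearComponent input-dim=1024 output-dim=256 l2-regularize=0.02 orthonormal-constraint=-1 ...
#   component name=prefinal-chain.batchnorm2 type=BatchNormComponent dim=256
#
# plus the corresponding component-node lines chaining them together; the
# layer's output is prefinal-chain.batchnorm2.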