convolution.py 59.7 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

1001

1002

1003

1004

1005

1006

1007

1008

1009

1010

1011

1012

1013

1014

1015

1016

1017

1018

1019

1020

1021

1022

1023

1024

1025

1026

1027

1028

1029

1030

1031

1032

1033

1034

1035

1036

1037

1038

1039

1040

1041

1042

1043

1044

1045

1046

1047

1048

1049

1050

1051

1052

1053

1054

1055

1056

1057

1058

1059

1060

1061

1062

1063

1064

1065

1066

1067

1068

1069

1070

1071

1072

1073

1074

1075

1076

1077

1078

1079

1080

1081

1082

1083

1084

1085

1086

1087

1088

1089

1090

1091

1092

1093

1094

1095

1096

1097

1098

1099

1100

1101

1102

1103

1104

1105

1106

1107

1108

1109

1110

1111

1112

1113

1114

1115

1116

1117

1118

1119

1120

1121

1122

1123

1124

1125

1126

1127

1128

1129

1130

1131

1132

1133

1134

1135

1136

1137

1138

1139

1140

1141

1142

1143

1144

1145

1146

1147

1148

1149

1150

1151

1152

1153

1154

1155

1156

1157

1158

1159

1160

1161

1162

1163

1164

1165

1166

1167

1168

1169

1170

1171

1172

1173

1174

1175

1176

1177

1178

1179

1180

1181

1182

1183

1184

1185

1186

1187

1188

1189

1190

1191

1192

1193

1194

1195

1196

1197

1198

1199

1200

1201

1202

1203


# Copyright 2018    Johns Hopkins University (Author: Dan Povey)
#           2016    Vijayaditya Peddinti
# Apache 2.0.


""" This module has the implementation of convolutional layers.
"""
from __future__ import print_function
from __future__ import division
import math
import re
import sys
from libs.nnet3.xconfig.basic_layers import XconfigLayerBase


# This class is for lines like the following:
#

#  conv-batchnorm-layer name=conv2 height-in=40 height-out=40 \
#      num-filters-out=64 height-offsets=-1,0,1 time-offsets=-1,0,1 \
#      required-time-offsets=0
#  or (with NormalizeComponent instead of batch-norm, and with subsampling on the height axis):
#  conv-renorm-layer name=conv3 height-in=40 height-out=20 \
#      height-subsample-out=2 num-filters-out=128 height-offsets=-1,0,1 \
#       time-offsets=-1,0,1 required-time-offsets=0
#
# You don't specify subsampling on the time axis explicitly; it's implicit
# in the 'time-offsets', which are the same as the splicing indexes in a TDNN,
# and which, unlike the height offsets, operate relative to a fixed clock,
# so that after subsampling by a factor of 2, we'd expect all time-offsets
# of subsequent layers to be multiples of 2.  You don't specify the input
# num-filters either; it's worked out from the input height and the input dim.
#
# The layer-name encodes the use (or not) of batch normalization, so that if you
# want to skip batch normalization you could just call it 'conv-layer'.
#
# If batch-normalization is used, it's *spatial* batch-normalization, meaning
# that the offset and scale is specific to the output filter, but shared across
# all time and height offsets.
#
# Most of the configuration values mirror same-named values in class
# TimeHeightConvolutionComponent, and for a deeper understanding of what's going
# on you should look at the comment by its declaration, in
# src/nnet3/nnet-convolutional-component.h.
#
# Parameters of the class, and their defaults if they have defaults:
#
#   input='[-1]'             Descriptor giving the input of the layer.
#   height-in                The height of the input image, e.g. 40 if the input
#                            is MFCCs.  The num-filters-in is worked out as
#                            (dimension of input) / height-in.  If the preceding
#                            layer is a convolutional layer, height-in should be
#                            the same as the height-out of the preceding layer.
#   height-subsample-out=1   The height subsampling factor, will be e.g. 2 if you
#                            want to subsample by a factor of 2 on the height
#                            axis.
#   height-out               The height of the output image.  This will normally
#                            be <= (height-in / height-subsample-out).
#                            Zero-padding on the height axis may be implied by a
#                            combination of this and height-offsets, e.g. if
#                            height-out==height-in and height-subsample-out=1
#                            and height-offsets=-2,-1,0,1 then we'd be padding
#                            by 2 pixels on the bottom and 1 on the top; see
#                            comments in nnet-convolutional-component.h for more
#                            details.
#   height-offsets           The offsets on the height axis that define which
#                            inputs are required for each output pixel; will
#                            often be something like -1,0,1 (if zero-padding
#                            on height axis) or 0,1,2 otherwise.  These are
#                            comparable to TDNN splicing offsets; e.g. if
#                            height-offsets=-1,0,1 then height 10 at the output
#                            would take input from heights 9,10,11 at the input.
#   num-filters-out          The number of output filters.  The output dimension
#                            of this layer is num-filters-out * height-out; the
#                            filter dim varies the fastest (filter-stride == 1).
#   time-offsets             The input offsets on the time axis; these are
#                            interpreted just like the splicing indexes in TDNNs.
#                            E.g. if time-offsets=-2,0,2 then time 100 at the
#                            output would require times 98,100,102 at the input.
#   required-time-offsets    The subset of 'time-offsets' that are required in
#                            order to produce an output; if the set has fewer
#                            elements than 'time-offsets' then it implies some
#                            kind of zero-padding on the time axis is allowed.
#                            Defaults to the same as 'time-offsets'.  For speech
#                            tasks we recommend not to set this, as the normal
#                            padding approach is to pad with copies of the
#                            first/last frame, which is handled automatically in
#                            the calling code.
#   target-rms=1.0           Only applicable if the layer type includes
#                            'batchnorm' or 'renorm', e.g. conv-batchnorm-layer
#                            or conv-renorm-layer.  This will affect the
#                            scaling of the output features (larger -> larger),
#                            and sometimes we set target-rms=0.5 for the layer
#                            prior to the final layer to make the final layer
#                            train more slowly.
#   self-repair-scale=2.0e-05  This affects the ReLu's.  It is a scale on the
#                            'self-repair' mechanism that nudges the inputs to the
#                            ReLUs into the appropriate range in cases where
#                            the unit is active either too little of the time
#                            (<10%) or too much of the time (>90%).
#
# The following initialization and natural-gradient related options are, if
# provided, passed through to the config file; if not, they are left at the
# defaults in the code.  See nnet-convolutional-component.h for more information.
#
#  param-stddev, bias-stddev, max-change, learning-rate-factor (float)
#  use-natural-gradient (bool)
#  rank-in, rank-out    (int)
#  num-minibatches-history (float)
#  alpha-in, alpha-out (float)
# the following is also passed into the convolution components, if specified:
#  l2-regularize (float)

class XconfigConvLayer(XconfigLayerBase):
    """Xconfig layer for lines such as 'conv-relu-batchnorm-layer name=...'.

    The layer type is a dash-separated list of operations followed by a final
    token (normally 'layer').  The operations handled here are 'conv',
    'relu', 'batchnorm', 'renorm', 'dropout' and 'so'; a trailing 'noconv'
    marks a layer without a convolution and is dropped before the config
    lines are generated.  Each remaining operation becomes one nnet3
    'component' line plus one 'component-node' line, chained in the order
    given.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        """Validate the operations named in first_token, then defer to
        XconfigLayerBase.__init__ with the same arguments.

        Raises:
            AssertionError: if first_token names an unknown operation.
        """
        # The last dash-separated token (normally 'layer') is not checked.
        for operation in first_token.split('-')[:-1]:
            assert operation in ['conv', 'renorm', 'batchnorm', 'relu',
                                 'noconv', 'dropout', 'so'], \
                "Unknown operation '{0}' in layer type '{1}'".format(
                    operation, first_token)
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Populate self.config with the default value of every option."""
        self.config = {'input':'[-1]',
                       'height-in':-1,
                       'height-subsample-out':1,
                       'height-out':-1,
                       'height-offsets':'',
                       'num-filters-out':-1,
                       'time-offsets':'',
                       'required-time-offsets':'',
                       'target-rms':1.0,
                       'self-repair-scale': 2.0e-05,
                       'self-repair-lower-threshold': 0.05,
                       # the following are not really inspected by this level of
                       # code, just passed through (but not if left at '').
                       'param-stddev':'', 'bias-stddev':'',
                       'max-change': 0.75, 'learning-rate-factor':'',
                       'use-natural-gradient':'',
                       'rank-in':'', 'rank-out':'', 'num-minibatches-history':'',
                       'alpha-in':'', 'alpha-out':'', 'l2-regularize':'',
                       'dropout-proportion': 0.5}

    def set_derived_configs(self):
        """Set self.config['num-filters-in'] to (input dim) // height-in.

        Raises:
            RuntimeError: if height-in is not positive, or does not divide
                the dimension of the input descriptor.
        """
        input_dim = self.descriptors['input']['dim']
        height_in = self.config['height-in']
        if height_in <= 0:
            raise RuntimeError("height-in must be specified")
        if input_dim % height_in != 0:
            raise RuntimeError("Input dimension {0} is not a multiple of height-in={1}".format(
                input_dim, height_in))
        self.config['num-filters-in'] = input_dim // height_in

    def check_offsets_var(self, str):
        """Return True if 'str' is a comma-separated list of strictly
        increasing integers, like '-1,0,1'; return False otherwise
        (including for the empty string or a non-string argument).

        Note: the parameter name shadows the builtin 'str'; it is kept
        unchanged so that existing callers keep working.
        """
        try:
            offsets = [int(x) for x in str.split(",")]
        except (AttributeError, TypeError, ValueError):
            # Not a string, or some element is not an integer.
            return False
        if len(offsets) == 0:
            return False
        # Strictly increasing implies both sorted and unique.
        return all(a < b for a, b in zip(offsets, offsets[1:]))

    def check_configs(self):
        """Do some basic checking of the configs.

        The component-level code does some more thorough checking, but if you
        set the height-out too small it prints it as a warning, which the user
        may not see, so at a minimum we want to check for that here.

        Raises:
            RuntimeError: if any of the checked config values is invalid.
        """
        height_subsample_out = self.config['height-subsample-out']
        height_in = self.config['height-in']
        height_out = self.config['height-out']
        if height_subsample_out <= 0:
            raise RuntimeError("height-subsample-out has invalid value {0}.".format(
                height_subsample_out))
        # we already checked height-in in set_derived_configs.
        if height_out <= 0:
            raise RuntimeError("height-out has invalid value {0}.".format(
                height_out))
        if height_out * height_subsample_out > height_in:
            raise RuntimeError("The combination height-in={0}, height-out={1} and "
                               "height-subsample-out={2} does not look right "
                               "(height-out too large).".format(
                                   height_in, height_out, height_subsample_out))
        height_offsets = self.config['height-offsets']
        time_offsets = self.config['time-offsets']
        required_time_offsets = self.config['required-time-offsets']

        if 'noconv' not in self.layer_type.split('-'):
            # only check height-offsets, time-offsets and required-time-offsets if there
            # is actually a convolution in this layer.
            if not self.check_offsets_var(height_offsets):
                raise RuntimeError("height-offsets={0} is not valid".format(height_offsets))
            if not self.check_offsets_var(time_offsets):
                raise RuntimeError("time-offsets={0} is not valid".format(time_offsets))
            if required_time_offsets != "" and not self.check_offsets_var(required_time_offsets):
                raise RuntimeError("required-time-offsets={0} is not valid".format(
                    required_time_offsets))

        # Heuristic lower bound on the output height, based on the number of
        # height offsets.
        if height_out * height_subsample_out < \
           height_in - len(height_offsets.split(',')):
            raise RuntimeError("The combination height-in={0}, height-out={1} and "
                               "height-subsample-out={2} and height-offsets={3} "
                               "does not look right (height-out too small).".format(
                                   height_in, height_out, height_subsample_out,
                                   height_offsets))

        if self.config['target-rms'] <= 0.0:
            raise RuntimeError("Config value target-rms={0} is not valid".format(
                self.config['target-rms']))

    def auxiliary_outputs(self):
        """Return the list of auxiliary output names (always empty here)."""
        return []

    def output_name(self, auxiliary_output = None):
        """Return the name of the node that produces this layer's output,
        i.e. '<layer-name>.<last-operation>', e.g. 'layer1.batchnorm'.

        auxiliary_output must be None.
        """
        assert auxiliary_output is None
        # note: the [:-1] is to remove the '-layer'.
        operations = self.layer_type.split('-')[:-1]
        if operations[-1] == 'noconv':
            operations = operations[:-1]
        assert len(operations) >= 1
        last_operation = operations[-1]
        assert last_operation in ['relu', 'conv', 'renorm', 'batchnorm', 'dropout', 'so']
        # we'll return something like 'layer1.batchnorm'.
        return '{0}.{1}'.format(self.name, last_operation)

    def output_dim(self, auxiliary_output = None):
        """Return the output dimension, num-filters-out * height-out.

        auxiliary_output must be None.
        """
        assert auxiliary_output is None
        return self.config['num-filters-out'] * self.config['height-out']

    def get_full_config(self):
        """Return a list of (config-name, line) pairs, with every generated
        line listed under both 'ref' and 'final'."""
        # we do not support user specified matrices in CNN initialization
        # so 'ref' and 'final' configs are the same.
        return [(config_name, line)
                for line in self._generate_cnn_config()
                for config_name in ['ref', 'final']]

    # convenience function to generate the CNN config
    def _generate_cnn_config(self):
        """Return the list of config lines (strings) for this layer: one
        'component' line and one 'component-node' line per operation.

        Raises:
            RuntimeError: if the layer type contains an operation that is not
                handled below.
        """
        configs = []

        name = self.name

        # These 3 variables will be updated as we add components.
        cur_num_filters = self.config['num-filters-in']
        cur_height = self.config['height-in']
        cur_descriptor = self.descriptors['input']['final-string']

        # note: the [:-1] is to remove the '-layer'.
        operations = self.layer_type.split('-')[:-1]
        if operations[-1] == 'noconv':
            operations = operations[:-1]
        # e.g.:
        # operations = [ 'conv', 'relu', 'batchnorm' ]
        # or:
        # operations = [ 'relu', 'conv', 'renorm' ]

        for operation in operations:
            if operation == 'conv':
                # Collect 'key=value' options, skipping any left at ''.
                conv_opt_strings = []
                for opt_name in [
                        'param-stddev', 'bias-stddev', 'use-natural-gradient',
                        'max-change', 'rank-in', 'rank-out', 'num-minibatches-history',
                        'alpha-in', 'alpha-out', 'num-filters-in', 'num-filters-out',
                        'height-in','height-out', 'height-subsample-out',
                        'height-offsets', 'time-offsets', 'required-time-offsets',
                        'learning-rate-factor', 'l2-regularize' ]:
                    value = self.config[opt_name]
                    if value != '':
                        conv_opt_strings.append('{0}={1}'.format(opt_name, value))
                conv_opts = ' '.join(conv_opt_strings)

                configs.append('component name={0}.conv type=TimeHeightConvolutionComponent '
                               '{1}'.format(name, conv_opts))
                configs.append('component-node name={0}.conv component={0}.conv '
                               'input={1}'.format(name, cur_descriptor))
                # Components after the convolution see its output shape.
                cur_num_filters = self.config['num-filters-out']
                cur_height = self.config['height-out']
            elif operation == 'batchnorm':
                configs.append('component name={0}.batchnorm  type=BatchNormComponent dim={1} '
                               'block-dim={2} target-rms={3}'.format(
                                   name, cur_num_filters * cur_height, cur_num_filters,
                                   self.config['target-rms']))
                configs.append('component-node name={0}.batchnorm component={0}.batchnorm '
                               'input={1}'.format(name, cur_descriptor))
            elif operation == 'renorm':
                configs.append('component name={0}.renorm type=NormalizeComponent '
                           'dim={1} target-rms={2}'.format(
                               name, cur_num_filters * cur_height,
                               self.config['target-rms']))
                configs.append('component-node name={0}.renorm component={0}.renorm '
                               'input={1}'.format(name, cur_descriptor))
            elif operation == 'relu':
                configs.append('component name={0}.relu type=RectifiedLinearComponent '
                               'dim={1} block-dim={2} self-repair-scale={3} '
                               'self-repair-lower-threshold={4}'.format(
                                   name, cur_num_filters * cur_height, cur_num_filters,
                                   self.config['self-repair-scale'],
                                   self.config['self-repair-lower-threshold']))
                configs.append('component-node name={0}.relu component={0}.relu '
                               'input={1}'.format(name, cur_descriptor))
            elif operation == 'dropout':
                configs.append('component name={0}.dropout type=DropoutComponent '
                           'dim={1} dropout-proportion={2}'.format(
                               name, cur_num_filters * cur_height,
                               self.config['dropout-proportion']))
                configs.append('component-node name={0}.dropout component={0}.dropout '
                               'input={1}'.format(name, cur_descriptor))
            elif operation == 'so':
                configs.append('component name={0}.so type=ScaleAndOffsetComponent '
                           'dim={1} block-dim={2}'.format(
                               name, cur_num_filters * cur_height, cur_num_filters))
                configs.append('component-node name={0}.so component={0}.so '
                               'input={1}'.format(name, cur_descriptor))
            else:
                raise RuntimeError("Un-handled operation type: " + operation)

            # The node just added is the input of the next operation.
            cur_descriptor = '{0}.{1}'.format(name, operation)

        return configs


# This class is for lines like the following:
#
# res-block name=res1 num-filters=64 height=32 time-period=1
#
# It implements a residual block as in ResNets, with pre-activation, and with
# some small differences-- basically, instead of adding the input to the output,
# we put a convolutional layer in there but initialize it to the unit matrix and
# if you want you can give it a relatively small (or even zero) learning rate
# and max-change.  And there is batch-norm in that path also.
#
# The number of filters is the same on the input and output; it is actually
# redundant to write it in the config file, because given that we know the
# height, we can work it out from the dimension of the input (as dimension =
# height * num-filters).  But we allow it to be specified anyway, for clarity.
#
# Note: the res-block does not support subsampling or changing the number of
# filters.  If you want to do that, we recommend that you should do it with a
# single relu-batchnorm-conv-layer.
#
# Here are the most important configuration values, with defaults shown if
# defaults exist:
#
# input='[-1]'    Descriptor giving the input of the layer.
# height          The input and output height of the image, e.g. 40.  Note: the width
#                 is associated with the time dimension and is dealt with
#                 implicitly, so it's not specified here.
# num-filters     The number of filters on the input and output, e.g. 64.
#                 It does not have to be specified; if it is not specified,
#                 we work it out from the input dimension.
# num-bottleneck-filters   If specified then this will be a 'bottleneck'
#                 ResBlock, in which there is a 1x1 convolution from
#                 num-filters->num-bottleneck-filters, a 3x3 convolution
#                 from num-bottleneck-filters->num-bottleneck-filters, and
#                 a 1x1 convolution from num-bottleneck-filters->num-filters.
#
# time-period=1   Think of this as the stride in the time dimension.  At the
#                 input of the network we will always have time-period=1; then
#                 after subsampling once in time we'd have time-period=2; then
#                 after subsampling again we'd have time-period=4.  Because of
#                 the way nnet3 works, subsampling on the time axis is an
#                 implicit, not explicit, operation.
# height-period=1  This will almost always be left at the default (1).  It is
#                 analogous to time-period, but because the height, unlike the
#                 time, is explicitly subsampled, in normal topologies this should
#                 be left at 1.
#
# bypass-source=noop
#                       The output of this component is Sum(convolution, x), and
#                       this option controls what 'x' is.  There are 4 options
#                       here: 'noop', 'input', 'relu' or 'batchnorm'.  'noop' is
#                       equivalent to 'input' in what it computes; it just
#                       inserts a 'noop' component in order to make the
#                       computation more efficient.  For both 'noop' and
#                       'input', x is the input to this component.  If
#                       bypass-source=relu then we use the relu of the
#                       input; if 'batchnorm', then we use the relu+batchnorm of
#                       the input.
# allow-zero-padding=true By default this will allow zero-padding in the time
#                       dimension, meaning that you don't need extra frames at
#                       the input to compute the output.  There may be ASR
#                       applications where you want to pad in the time dimension
#                       with repeats of the first or last frame (as we do for
#                       TDNNs), where it would be appropriate to write
#                       allow-zero-padding=false.  Note: the way we have
#                       set it up, it does zero-padding on the height axis
#                       regardless
#
# Less important config variables:
#  self-repair-scale=2.0e-05  This affects the ReLu's.  It is a scale on the
#                            'self-repair' mechanism that nudges the inputs to the
#                            ReLUs into the appropriate range in cases where
#                            the unit is active either too little of the time
#                            (<10%) or too much of the time (>90%).
#  max-change=0.75           Max-parameter-change constant (per minibatch)
#                            used for convolutional components.
#
#
# The following natural-gradient-related configuration variables are passed in
# to the convolution components, if specified:
#  use-natural-gradient (bool)
#  rank-in, rank-out    (int)
#  num-minibatches-history (float)
#  alpha-in, alpha-out (float)
# the following is also passed into the convolution components, if specified:
#  l2-regularize (float)
#

class XconfigResBlock(XconfigLayerBase):
    """Layer class for 'res-block' xconfig lines.

    The block keeps the image height and the number of filters unchanged, so
    its output dimension equals its input dimension.  If
    'num-bottleneck-filters' is positive the bottleneck variant
    (1x1 -> 3x3 -> 1x1 convolutions) is generated, otherwise the normal
    variant with two 3x3 convolutions.  Every convolution is preceded by a
    ReLU and a batch-norm.  'bypass-source' selects what gets summed with the
    output of the last convolution.
    """

    # Options that are not interpreted by this class; each one is copied
    # verbatim onto the convolution component lines unless it is left at ''.
    _CONV_PASSTHROUGH_OPTS = (
        'param-stddev', 'bias-stddev', 'use-natural-gradient',
        'max-change', 'rank-in', 'rank-out', 'num-minibatches-history',
        'alpha-in', 'alpha-out', 'l2-regularize')

    def __init__(self, first_token, key_to_value, prev_names = None):
        assert first_token == 'res-block'
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default value of every option this layer accepts."""
        self.config = {'input':'[-1]',
                       'height':-1,
                       'num-filters':-1,
                       'num-bottleneck-filters':-1,
                       'time-period':1,
                       'height-period':1,
                       'self-repair-scale': 2.0e-05,
                       'self-repair-lower-threshold1': 0.05,
                       'self-repair-lower-threshold2': 0.05,
                       'self-repair-lower-threshold3': 0.05,
                       'max-change': 0.75,
                       'allow-zero-padding': True,
                       'bypass-source' : 'noop',
                       # the following are not really inspected by this level of
                       # code, just passed through (but not if left at '').
                       'param-stddev':'', 'bias-stddev':'',
                       'use-natural-gradient':'',
                       'rank-in':'', 'rank-out':'',
                       'num-minibatches-history':'',
                       'alpha-in':'', 'alpha-out':'', 'l2-regularize':'' }

    def set_derived_configs(self):
        """Derive 'num-filters' from the input dim, or check it if it was given.

        Raises:
            RuntimeError: if 'height' is not positive, if it does not divide
                the input dimension, or if an explicitly given 'num-filters'
                is inconsistent with height and input dimension.
        """
        input_dim = self.descriptors['input']['dim']
        height = self.config['height']
        # 'height' defaults to -1; without this check an unset height would
        # silently yield a negative filter count (and 0 would divide by zero).
        if height <= 0:
            raise RuntimeError("You must specify a positive height for "
                               "res-block, got height={0}".format(height))

        cur_num_filters = self.config['num-filters']
        if cur_num_filters == -1:
            if input_dim % height != 0:
                raise RuntimeError("Specified image height {0} does not "
                                   "divide the input dim {1}".format(
                                       height, input_dim))
            # Floor division keeps this an int under Python 3; a float here
            # would be printed as e.g. '64.0' in the generated config lines.
            self.config['num-filters'] = input_dim // height
        elif input_dim != cur_num_filters * height:
            raise RuntimeError("Expected the input-dim to equal "
                               "height={0} * num-filters={1} = {2}, but "
                               "it is {3}".format(
                                   height, cur_num_filters,
                                   height * cur_num_filters,
                                   input_dim))

    def check_configs(self):
        """Validate 'bypass-source'; dimensions were checked in set_derived_configs.

        Raises:
            RuntimeError: if 'bypass-source' is not one of the known values.
        """
        bypass_source = self.config['bypass-source']
        if bypass_source not in ['input', 'noop', 'relu', 'batchnorm']:
            raise RuntimeError("Expected bypass-source to be input, noop, "
                               "relu or batchnorm, got: {0}".format(
                                   bypass_source))

    def auxiliary_outputs(self):
        """This layer has no auxiliary outputs."""
        return []

    def output_name(self, auxiliary_output = None):
        """Return the descriptor string for the output of this layer."""
        bypass_source = self.config['bypass-source']
        b = self.config['num-bottleneck-filters']
        conv = ('{0}.conv2' if b <= 0 else '{0}.conv3').format(self.name)
        if bypass_source == 'input':
            residual = self.descriptors['input']['final-string']
        elif bypass_source == 'noop':
            # we let the noop be the sum of the convolutional part and the
            # input, so just return the output of the no-op component.
            return '{0}.noop'.format(self.name)
        elif bypass_source == 'relu':
            residual = '{0}.relu1'.format(self.name)
        else:
            assert bypass_source == 'batchnorm'
            residual = '{0}.batchnorm1'.format(self.name)

        return 'Sum({0}, {1})'.format(conv, residual)

    def output_dim(self, auxiliary_output = None):
        """Return the output dimension, which equals the input dimension."""
        assert auxiliary_output is None
        input_dim = self.descriptors['input']['dim']
        return input_dim

    def get_full_config(self):
        """Return the list of (config-name, config-line) pairs for this layer."""
        b = self.config['num-bottleneck-filters']
        if b <= 0:
            config_lines = self._generate_normal_resblock_config()
        else:
            config_lines = self._generate_bottleneck_resblock_config()

        # we do not support user specified matrices in CNN initialization
        # so 'ref' and 'final' configs are the same.
        return [(config_name, line)
                for line in config_lines
                for config_name in ['ref', 'final']]

    def _misc_conv_opts(self):
        """Return the pass-through convolution options as one 'k=v ...' string."""
        return ' '.join('{0}={1}'.format(opt_name, self.config[opt_name])
                        for opt_name in self._CONV_PASSTHROUGH_OPTS
                        if self.config[opt_name] != '')

    def _required_time_offsets_opt(self):
        """Return the option string that permits zero-padding in time, or ''."""
        return 'required-time-offsets=0' if self.config['allow-zero-padding'] else ''

    def _relu_batchnorm_lines(self, n, num_filters, input_descriptor):
        """Return the config lines for relu<n> followed by batchnorm<n>.

        relu<n> reads from 'input_descriptor'; batchnorm<n> reads from relu<n>.
        """
        name = self.name
        dim = num_filters * self.config['height']
        return [
            'component name={0}.relu{1} type=RectifiedLinearComponent '
            'dim={2} block-dim={3} self-repair-scale={4} '
            'self-repair-lower-threshold={5}'.format(
                name, n, dim, num_filters,
                self.config['self-repair-scale'],
                self.config['self-repair-lower-threshold{0}'.format(n)]),
            'component-node name={0}.relu{1} component={0}.relu{1} '
            'input={2}'.format(name, n, input_descriptor),
            'component name={0}.batchnorm{1}  type=BatchNormComponent dim={2} '
            'block-dim={3}'.format(name, n, dim, num_filters),
            'component-node name={0}.batchnorm{1} component={0}.batchnorm{1} '
            'input={0}.relu{1}'.format(name, n),
        ]

    def _conv_lines(self, n, conv_opts, input_descriptor):
        """Return the config lines for the convolution conv<n>."""
        name = self.name
        return [
            'component name={0}.conv{1} type=TimeHeightConvolutionComponent '
            '{2}'.format(name, n, conv_opts),
            'component-node name={0}.conv{1} component={0}.conv{1} '
            'input={2}'.format(name, n, input_descriptor),
        ]

    def _noop_lines(self, last_conv_descriptor):
        """Return the NoOpComponent lines if bypass-source is 'noop', else [].

        The no-op node caches Sum(input, last convolution), which is then
        what output_name() returns for this bypass mode.
        """
        if self.config['bypass-source'] != 'noop':
            return []
        name = self.name
        return [
            'component name={0}.noop dim={1} type=NoOpComponent'.format(
                name, self.descriptors['input']['dim']),
            'component-node name={0}.noop component={0}.noop '
            'input=Sum({1}, {2})'.format(
                name, self.descriptors['input']['final-string'],
                last_conv_descriptor),
        ]

    def _generate_normal_resblock_config(self):
        """Generate the config lines of the non-bottleneck res-block.

        The main path inside the res-block is:

          input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2

        We put the relu before the batchnorm because we think it makes more
        sense; because the Torch people seemed to find that this works better
        (https://github.com/gcr/torch-residual-networks/issues/5); and because
        in our batchnorm component we haven't implemented the beta and gamma;
        these would be essential to having it work before relu, but when
        before a convolution or linear component, they add no extra modeling
        power.

        The output of the res-block is the sum of conv2 with the input, its
        relu, or its relu+batchnorm, as selected by 'bypass-source'.  For
        bypass-source=noop the sum with the raw input is cached via a
        NoOpComponent, because due to how nnet3 works redundant summing
        operations would otherwise take place.

        Note: output_name() is responsible for returning the descriptor
        corresponding to the output of the block.
        """
        name = self.name
        num_filters = self.config['num-filters']
        # get_full_config() dispatches here for any value <= 0.
        assert self.config['num-bottleneck-filters'] <= 0

        # Both convolutions are 3x3 and use identical options.
        conv_opts = ('height-in={h} height-out={h} height-offsets=-{hp},0,{hp} '
                     'time-offsets=-{p},0,{p} '
                     'num-filters-in={f} num-filters-out={f} {r} {o}'.format(
                         h=self.config['height'],
                         hp=self.config['height-period'],
                         p=self.config['time-period'], f=num_filters,
                         r=self._required_time_offsets_opt(),
                         o=self._misc_conv_opts()))

        configs = []
        cur_descriptor = self.descriptors['input']['final-string']
        for n in [1, 2]:
            configs.extend(self._relu_batchnorm_lines(n, num_filters,
                                                      cur_descriptor))
            configs.extend(self._conv_lines(
                n, conv_opts, '{0}.batchnorm{1}'.format(name, n)))
            cur_descriptor = '{0}.conv{1}'.format(name, n)

        configs.extend(self._noop_lines(cur_descriptor))
        return configs

    def _generate_bottleneck_resblock_config(self):
        """Generate the config lines of the bottleneck res-block.

        This is the version where a 3x3 kernel with a smaller number of
        filters than at the input and output is sandwiched between two 1x1
        kernels.  The main path inside the res-block is:

          input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2 ->
            relu3 -> batchnorm3 -> conv3

        The output of the res-block is the sum of conv3 with the input, its
        relu, or its relu+batchnorm, as selected by 'bypass-source'.  For
        bypass-source=noop the sum with the raw input is cached via a
        NoOpComponent, because due to how nnet3 works redundant summing
        operations would otherwise take place.

        Note: output_name() is responsible for returning the descriptor
        corresponding to the output of the block.
        """
        name = self.name
        num_filters = self.config['num-filters']
        num_bottleneck_filters = self.config['num-bottleneck-filters']
        assert num_bottleneck_filters > 0
        height = self.config['height']
        required_opt = self._required_time_offsets_opt()
        misc_conv_opts = self._misc_conv_opts()

        # Only the middle convolution is 3x3; the outer two are 1x1.
        offsets_3x3 = ('-{hp},0,{hp}'.format(hp=self.config['height-period']),
                       '-{t},0,{t}'.format(t=self.config['time-period']))

        configs = []
        cur_descriptor = self.descriptors['input']['final-string']
        cur_num_filters = num_filters
        for n in [1, 2, 3]:
            configs.extend(self._relu_batchnorm_lines(n, cur_num_filters,
                                                      cur_descriptor))

            height_offsets, time_offsets = (offsets_3x3 if n == 2
                                            else ('0', '0'))
            # The last convolution maps back to the full number of filters.
            next_num_filters = (num_filters if n == 3
                                else num_bottleneck_filters)
            conv_opts = ('height-in={h} height-out={h} height-offsets={ho} time-offsets={to} '
                         'num-filters-in={fi} num-filters-out={fo} {r} {o}'.format(
                             h=height, ho=height_offsets, to=time_offsets,
                             fi=cur_num_filters, fo=next_num_filters,
                             r=required_opt, o=misc_conv_opts))
            configs.extend(self._conv_lines(
                n, conv_opts, '{0}.batchnorm{1}'.format(name, n)))

            cur_descriptor = '{0}.conv{1}'.format(name, n)
            cur_num_filters = next_num_filters

        configs.extend(self._noop_lines(cur_descriptor))
        return configs


# This class is for lines like the following:
#
# res2-block name=res1 num-filters=64 height=32 time-period=1
#
# It is a residual block with post-activations, which does not support
# downsampling (strided convolution) or changing the number of filters;
# for that, see res2-downsample-block.
# It's a pretty standard res-block, more standard than "res-block" (XconfigResBlock).
#
# The number of filters is the same on the input and output; it is actually
# redundant to write it in the config file, because given that we know the
# height, we can work it out from the dimension of the input (as dimension =
# height * num-filters).  But we allow it to be specified anyway, for clarity.
#

# Here are the most important configuration values, with defaults shown if
# defaults exist:
#
# input='[-1]'    Descriptor giving the input of the layer.
# height          The input and output height of the image, e.g. 40.  Note: the width
#                 is associated with the time dimension and is dealt with
#                 implicitly, so it's not specified here.
# num-filters     The number of filters on the input and output, e.g. 64.
#                 It does not have to be specified; if it is not specified,
#                 we work it out from the input dimension.
# num-bottleneck-filters   If specified then this will be a 'bottleneck'
#                 ResBlock, in which there is a 1x1 convolution from
#                 num-filters->num-bottleneck-filters, a 3x3 convolution
#                 from num-bottleneck-filters->num-bottleneck-filters, and
#                 a 1x1 convolution from num-bottleneck-filters->num-filters.
# time-period=1   Think of this as the stride in the time dimension.  At the
#                 input of the network we will always have time-period=1; then
#                 after subsampling once in time we'd have time-period=2; then
#                 after subsampling again we'd have time-period=4.  Because of
#                 the way nnet3 works, subsampling on the time axis is an
#                 implicit, not explicit, operation.
# allow-zero-padding=true By default this will allow zero-padding in the time
#                       dimension, meaning that you don't need extra frames at
#                       the input to compute the output.  There may be ASR
#                       applications where you want to pad in the time dimension
#                       with repeats of the first or last frame (as we do for
#                       TDNNs), where it would be appropriate to write
#                       allow-zero-padding=false.  Note: the way we have
#                       set it up, it does zero-padding on the height axis
#                       regardless
#
# Less important config variables:
#  self-repair-scale=2.0e-05  This affects the ReLu's.  It is a scale on the
#                            'self-repair' mechanism that nudges the inputs to the
#                            ReLUs into the appropriate range in cases where
#                            the unit is active either too little of the time
#                            (<10%) or too much of the time (>90%).
#  max-change=0.75           Max-parameter-change constant (per minibatch)
#                            used for convolutional components.
#
#
# The following natural-gradient-related configuration variables are passed in
# to the convolution components, if specified:
#  use-natural-gradient (bool)
#  rank-in, rank-out    (int)
#  num-minibatches-history (float)
#  alpha-in, alpha-out (float)
# the following is also passed into the convolution components, if specified:
#  l2-regularize (float)

class XconfigRes2Block(XconfigLayerBase):
    def __init__(self, first_token, key_to_value, prev_names = None):
        """Create the layer object for a 'res2-block' xconfig line.

        All arguments are forwarded unchanged to XconfigLayerBase.__init__;
        'first_token' must be the string 'res2-block'.
        """
        # This class only handles 'res2-block' lines.
        assert first_token == 'res2-block'
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        self.config = {'input':'[-1]',
                       'height':-1,  # sets height-in and height-out
                       'height-in':-1,
                       'height-out':-1,
                       'num-filters':-1, # interpreted as num-filters-out.
                       'num-bottleneck-filters':-1,
                       'time-period':1,
                       'self-repair-scale': 2.0e-05,
                       'self-repair-lower-threshold1': 0.05,
                       'self-repair-lower-threshold2': 0.05,
                       'self-repair-lower-threshold3': 0.05,
                       'max-change': 0.75,
                       'allow-zero-padding': True,
                       # the following are not really inspected by this level of
                       # code, just passed through (but not if left at '').
                       'param-stddev':'', 'bias-stddev':'',
                       'use-natural-gradient':'',
                       'rank-in':'', 'rank-out':'',
                       'num-minibatches-history':'',
                       'alpha-in':'', 'alpha-out':'',
                       'l2-regularize':'' }

    def set_derived_configs(self):
        input_dim = self.descriptors['input']['dim']

        if not ((self.config['height'] > 0  and self.config['height-in'] == -1 and
                 self.config['height-out'] == -1) or
                (self.config['height-out'] > 0 and self.config['height-in'] > 0)):
            raise RuntimeError("You must specify height, or height-in and height-out, for res2-block.")

        if not (self.config['height-in'] > 0 and self.config['height-out'] > 0):
            height = self.config['height']
            if not height > 0:
                raise RuntimeError("You must specify either height, or height-in and height-out, for "
                                   "res2-block.")
            self.config['height-in'] = height
            self.config['height-out'] = height

        height_in = self.config['height-in']
        if input_dim % height_in != 0:
            raise RuntimeError("Specified input image height {0} does not "
                                   "divide the input dim {1}".format(
                                       height_in, input_dim))
            self.config['num-filters'] = input_dim / height

    def check_configs(self):
        if self.config['num-filters'] == -1:
            raise RuntimeError("You must specify num-filters for res2-block.")

    def auxiliary_outputs(self):
        return []

    def output_name(self, auxiliary_output = None):
        b = self.config['num-bottleneck-filters']
        return ('{0}.relu2' if b <= 0 else '{0}.relu3').format(self.name)

    def output_dim(self, auxiliary_output = None):
        assert auxiliary_output is None
        return self.config['height-out'] * self.config['num-filters']

    def get_full_config(self):
        ans = []
        b = self.config['num-bottleneck-filters']
        if b <= 0:
            config_lines = self._generate_normal_resblock_config()
        else:
            config_lines = self._generate_bottleneck_resblock_config()

        for line in config_lines:
            for config_name in ['ref', 'final']:
                # we do not support user specified matrices in CNN initialization
                # so 'ref' and 'final' configs are the same.
                ans.append((config_name, line))
        return ans

    # _generate_normal_resblock_config is a convenience function to generate the
    # res-block config (the non-bottleneck version).
    #
    # The main path inside the res-block in the non-bottleneck case is as
    # follows:
    #
    # input -> conv1 -> batchnorm1 -> scaleoffset1 -> relu1 -> conv2 -> batchnorm2 -> scaleoffset2 -> relu2
    #
    # where the 'scaleoffsetN' are ScaleAndOffsetComponent, which conventionally would be
    # considered part of the BatchNorm.
    #
    # The relu2 actually sees the sum of the input and  'scaleoffset2'-- which gives us the bypass
    # connection.
    def _generate_normal_resblock_config(self):
        configs = []
        name = self.name
        assert self.config['num-bottleneck-filters'] == -1
        input_dim = self.descriptors['input']['dim']
        height_in = self.config['height-in']
        height_out = self.config['height-out']
        time_period_out = self.config['time-period']
        if not input_dim % height_in == 0:
            raise RuntimeError("input-dim {0} does not divide height-in {1}".format(
                input_dim, height_in))
        num_filters_in = input_dim / height_in
        num_filters_out = self.config['num-filters']

        if height_out != height_in:
            if height_out < height_in / 2 - 1 or height_out > height_in / 2 + 1:
                raise RuntimeError("Expected height-out to be about half height-in, or the same: "
                                   "height-in={0} height-out={1}".format(height_in, height_out))
            if not time_period_out % 2 == 0:
                raise RuntimeError("Expected time-period to be a multiple of 2 if you are subsampling "
                                   "on height.")
            time_period_in = time_period_out / 2
            height_subsample = 2
        else:
            time_period_in = time_period_out
            height_subsample = 1


        cur_time_period = time_period_in
        cur_num_filters = num_filters_in
        cur_height = height_in

        input_descriptor = self.descriptors['input']['final-string']
        allow_zero_padding = self.config['allow-zero-padding']
        if height_subsample == 1 and num_filters_in == num_filters_out:
            bypass_descriptor = input_descriptor
        else:
            bypass_descriptor = '{0}.conv_bypass'.format(name)

        cur_descriptor = input_descriptor

        # get miscellaneous convolution options passed in from the xconfig line
        a = []
        for opt_name in [
                'param-stddev', 'bias-stddev', 'use-natural-gradient',
                'max-change', 'rank-in', 'rank-out', 'num-minibatches-history',
                'alpha-in', 'alpha-out', 'l2-regularize' ]:
            value = self.config[opt_name]
            if value != '':
                a.append('{0}={1}'.format(opt_name, value))
        misc_conv_opts = ' '.join(a)

        for n in [1, 2]:
            # the convolution.
            conv_opts = ('height-in={hi} height-out={ho} height-offsets=-1,0,1 '
                         'height-subsample-out={hs} '
                         'time-offsets=-{p},0,{p} '
                         'num-filters-in={fi} num-filters-out={fo} {r} {o}'.format(
                             hi=cur_height, ho=height_out,
                             p=cur_time_period,
                             hs=(height_subsample if n == 1 else 1),
                             fi=cur_num_filters,
                             fo=num_filters_out,
                             r=('required-time-offsets=0' if allow_zero_padding else ''),
                             o=misc_conv_opts))

            configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent '
                           '{2}'.format(name, n, conv_opts))
            configs.append('component-node name={0}.conv{1} component={0}.conv{1} '
                           'input={2}'.format(name, n, cur_descriptor))
            cur_descriptor = '{0}.conv{1}'.format(name, n)

            cur_num_filters = num_filters_out
            cur_height = height_out
            cur_time_period = time_period_out

            # the batch-norm
            configs.append('component name={0}.batchnorm{1}  type=BatchNormComponent dim={2} '
                               'block-dim={3}'.format(
                                   name, n, cur_num_filters * cur_height,
                                   cur_num_filters))
            configs.append('component-node name={0}.batchnorm{1} component={0}.batchnorm{1} '
                           'input={2}'.format(name, n, cur_descriptor))
            cur_descriptor = '{0}.batchnorm{1}'.format(name, n)

            # the scale-and-offset
            configs.append('component name={0}.scaleoffset{1}  type=ScaleAndOffsetComponent dim={2} '
                               'block-dim={3}'.format(
                                   name, n, cur_num_filters * cur_height,
                                   cur_num_filters))
            configs.append('component-node name={0}.scaleoffset{1} component={0}.scaleoffset{1} '
                           'input={2}'.format(name, n, cur_descriptor))
            cur_descriptor = '{0}.scaleoffset{1}'.format(name, n)


            if n == 2:
                # the bypass connection
                cur_descriptor = 'Sum({0}, {1})'.format(cur_descriptor, bypass_descriptor)


            # the ReLU
            configs.append('component name={0}.relu{1} type=RectifiedLinearComponent '
                           'dim={2} block-dim={3} self-repair-scale={4} '
                           'self-repair-lower-threshold={5}'.format(
                               name, n, cur_num_filters * cur_height, cur_num_filters,
                               self.config['self-repair-scale'],
                               self.config['self-repair-lower-threshold{0}'.format(n)]))
            configs.append('component-node name={0}.relu{1} component={0}.relu{1} '
                           'input={2}'.format(name, n, cur_descriptor))

            cur_descriptor = '{0}.relu{1}'.format(name, n)

        if bypass_descriptor != input_descriptor:
            # We need to add the 1x1 bypass convolution because we're either doing height
            # subsampling or changing the number of filters.
            conv_opts = ('height-in={hi} height-out={ho} height-offsets=0 '
                         'time-offsets=0 height-subsample-out={hs} '
                         'num-filters-in={fi} num-filters-out={fo} {o}'.format(
                             hi=height_in, ho=height_out, hs=height_subsample,
                             fi=num_filters_in, fo=num_filters_out, o=misc_conv_opts))
            configs.append('component name={0}.conv_bypass type=TimeHeightConvolutionComponent '
                           '{1}'.format(name, conv_opts))
            configs.append('component-node name={0}.conv_bypass component={0}.conv_bypass '
                           'input={1}'.format(name, input_descriptor))


        # Note: the function 'output_name' is responsible for returning the
        # descriptor corresponding to the output of the network, which in
        # this case would be '{0}.relu2'.format(name).
        return configs


    # _generate_bottleneck_resblock_config is a convenience function to generate the
    # res-block config (this is the bottleneck version, where there is
    # a 3x3 kernel with a smaller number of filters than at the input and output,
    # sandwiched between two 1x1 kernels.
    #
    # The main path inside the res-block in the bottleneck case is as follows:
    #
    # input -> conv1 -> batchnorm1 -> scaleoffset1 -> relu1 ->
    #          conv2 -> batchnorm2 -> scaleoffset2 -> relu2 ->
    #          conv3 -> batchnorm3 -> scaleoffset3 -> relu3
    #
    #  but the relu3 takes as its input the sum of 'input' and 'scaleoffset3'.
    #
    def _generate_bottleneck_resblock_config(self):
        configs = []

        name = self.name
        num_bottleneck_filters = self.config['num-bottleneck-filters']
        assert num_bottleneck_filters > 0
        input_dim = self.descriptors['input']['dim']
        height_in = self.config['height-in']
        height_out = self.config['height-out']
        input_descriptor = self.descriptors['input']['final-string']
        allow_zero_padding = self.config['allow-zero-padding']
        time_period_out = self.config['time-period']
        if not input_dim % height_in == 0:
            raise RuntimeError("input-dim={0} does not divide height-in={1}".format(
                input_dim, height_in))
        num_filters_in = input_dim / height_in
        num_filters_out = self.config['num-filters']

        if height_out != height_in:
            if height_out < height_in / 2 - 1 or height_out > height_in / 2 + 1:
                raise RuntimeError("Expected height-out to be about half height-in, or the same: "
                                   "height-in={0} height-out={1}".format(height_in, height_out))
            height_subsample = 2
        else:
            height_subsample = 1

        cur_descriptor = input_descriptor
        cur_num_filters = num_filters_in
        cur_height = height_in
        if height_subsample == 1 and num_filters_in == num_filters_out:
            bypass_descriptor = input_descriptor
        else:
            bypass_descriptor = '{0}.conv_bypass'.format(name)

        # get miscellaneous convolution options passed in from the xconfig line
        a = []
        for opt_name in [
                'param-stddev', 'bias-stddev', 'use-natural-gradient',
                'max-change', 'rank-in', 'rank-out', 'num-minibatches-history',
                'alpha-in', 'alpha-out', 'l2-regularize' ]:
            value = self.config[opt_name]
            if value != '':
                a.append('{0}={1}'.format(opt_name, value))
        misc_conv_opts = ' '.join(a)


        for n in [1, 2, 3]:
            # the convolution.
            height_offsets = ('-1,0,1' if n == 2 else '0')
            this_height_subsample = height_subsample if n == 1 else 1
            time_offsets = ('-{t},0,{t}'.format(t=time_period_out) if n == 2 else '0')
            next_num_filters = (num_filters_out if n == 3 else num_bottleneck_filters)

            conv_opts = ('height-in={h_in} height-out={h_out} height-offsets={ho} time-offsets={to} '
                         'num-filters-in={fi} num-filters-out={fo} height-subsample-out={hs} '
                         '{r} {o}'.format(
                             h_in=cur_height, h_out=height_out,
                             to=time_offsets, ho=height_offsets,
                             hs=this_height_subsample,
                             fi=cur_num_filters, fo=next_num_filters,
                             r=('required-time-offsets=0' if allow_zero_padding else ''),
                             o=misc_conv_opts))

            configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent '
                           '{2}'.format(name, n, conv_opts))
            configs.append('component-node name={0}.conv{1} component={0}.conv{1} '
                           'input={2}'.format(name, n, cur_descriptor))

            cur_num_filters = next_num_filters
            cur_height = height_out
            cur_descriptor = '{0}.conv{1}'.format(name, n)

            # the batch-norm
            configs.append('component name={0}.batchnorm{1}  type=BatchNormComponent dim={2} '
                               'block-dim={3}'.format(
                                   name, n, cur_num_filters * cur_height,
                                   cur_num_filters))
            configs.append('component-node name={0}.batchnorm{1} component={0}.batchnorm{1} '
                           'input={2}'.format(name, n, cur_descriptor))
            cur_descriptor = '{0}.batchnorm{1}'.format(name, n)

            # the scale and offset
            configs.append('component name={0}.scaleoffset{1}  type=ScaleAndOffsetComponent dim={2} '
                               'block-dim={3}'.format(
                                   name, n, cur_num_filters * cur_height,
                                   cur_num_filters))
            configs.append('component-node name={0}.scaleoffset{1} component={0}.scaleoffset{1} '
                           'input={2}'.format(name, n, cur_descriptor))
            cur_descriptor = '{0}.scaleoffset{1}'.format(name, n)

            if n == 3:
                # the bypass connection
                cur_descriptor = 'Sum({0}, {1})'.format(cur_descriptor, bypass_descriptor)

            # the ReLU
            configs.append('component name={0}.relu{1} type=RectifiedLinearComponent '
                           'dim={2} block-dim={3} self-repair-scale={4} '
                           'self-repair-lower-threshold={5}'.format(
                               name, n, cur_num_filters * cur_height, cur_num_filters,
                               self.config['self-repair-scale'],
                               self.config['self-repair-lower-threshold{0}'.format(n)]))
            configs.append('component-node name={0}.relu{1} component={0}.relu{1} '
                           'input={2}'.format(name, n, cur_descriptor))

            cur_descriptor = '{0}.relu{1}'.format(name, n)

        if bypass_descriptor != input_descriptor:
            # We need to add the 1x1 bypass convolution because we're either doing height
            # subsampling or changing the number of filters.
            conv_opts = ('height-in={hi} height-out={ho} height-offsets=0 '
                         'time-offsets=0 height-subsample-out={hs} '
                         'num-filters-in={fi} num-filters-out={fo} {o}'.format(
                             hi=height_in, ho=height_out, hs=height_subsample,
                             fi=num_filters_in, fo=num_filters_out, o=misc_conv_opts))
            configs.append('component name={0}.conv_bypass type=TimeHeightConvolutionComponent '
                           '{1}'.format(name, conv_opts))
            configs.append('component-node name={0}.conv_bypass component={0}.conv_bypass '
                           'input={1}'.format(name, input_descriptor))

        # Note: the function 'output_name' is responsible for returning the
        # descriptor corresponding to the output of the network, which
        # in this case will be '{0}.relu3'.format(name).
        return configs


# This layer just maps to a single component, a SumBlockComponent.  It's for
# doing channel averaging at the end of neural networks.  See scripts for
# examples of how to use it.
# An example line using this layer is:
# channel-average-layer name=channel-average input=Append(2, 4, 6, 8) dim=64

# The configuration value 'dim' is the output dimension of this layer.
# The input dimension is expected to be a multiple of 'dim'.  The output
# will be the average of 'dim'-sized blocks of the input.
class ChannelAverageLayer(XconfigLayerBase):
    """Xconfig layer 'channel-average-layer', emitted as one SumBlockComponent.

    Config values:
      input: descriptor of the layer input (default '[-1]').
      dim:   output dimension of the layer; must be > 0 and must divide the
             input dimension exactly.

    The generated component is given scale = dim / input-dim, chosen so that
    the result is an average rather than a sum.
    """

    def __init__(self, first_token, key_to_value, prev_names = None):
        """Check the layer keyword, then defer to the base-class constructor."""
        assert first_token == "channel-average-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        """Install the default config; 'dim' = -1 marks it as not yet set."""
        defaults = {}
        defaults['input'] = '[-1]'
        defaults['dim'] = -1
        self.config = defaults

    def set_derived_configs(self):
        """Nothing is derived from other config values for this layer."""
        return None

    def check_configs(self):
        """Validate 'dim' against the input dimension.

        Raises:
            RuntimeError: if 'dim' is not positive, or if the input dimension
                is not a multiple of 'dim'.
        """
        in_dim = self.descriptors['input']['dim']
        out_dim = self.config['dim']
        if dim_is_unset_or_invalid(out_dim):
            raise RuntimeError("dim must be specified and > 0.")
        # A non-zero remainder means the input cannot be split into whole
        # blocks of size 'dim'.
        if in_dim % out_dim:
            raise RuntimeError("input-dim={0} is not a multiple of dim={1}".format(
                in_dim, out_dim))

    def auxiliary_outputs(self):
        """This layer exposes no auxiliary outputs."""
        return list()

    def output_name(self, auxiliary_output = None):
        """Return the node name of the layer output (the layer's own name)."""
        assert auxiliary_output is None
        return self.name

    def output_dim(self, auxiliary_output = None):
        """Return the output dimension, i.e. the configured 'dim'."""
        assert auxiliary_output is None
        return self.config['dim']

    def get_full_config(self):
        """Return (config-name, line) pairs for the 'ref' and 'final' configs.

        Every generated line is emitted twice, first for 'ref' and then for
        'final', keeping the lines in generation order.
        """
        return [(target, text)
                for text in self._generate_channel_average_config()
                for target in ('ref', 'final')]

    def _generate_channel_average_config(self):
        """Build the component and component-node lines for this layer."""
        layer_name = self.name
        in_dim = self.descriptors['input']['dim']
        in_descriptor = self.descriptors['input']['final-string']
        out_dim = self.config['dim']
        # choose the scale that makes it an average rather than a sum.
        averaging_scale = out_dim * 1.0 / in_dim
        component_line = ('component name={0} type=SumBlockComponent input-dim={1} '
                          'output-dim={2} scale={3}').format(
                              layer_name, in_dim, out_dim, averaging_scale)
        node_line = 'component-node name={0} component={0} input={1}'.format(
            layer_name, in_descriptor)
        return [component_line, node_line]


def dim_is_unset_or_invalid(dim):
    """Return True when 'dim' is not a usable (strictly positive) dimension."""
    return dim <= 0