  #!/bin/bash
  
  
  # 1b is like 1a, but uses a single lstmb-layer instead of three
  #  fast-lstm-layers.  Caution: although it's better than run_tdnn_lstm_1a.sh,
  #  it's still not better than run_tdnn_1f.sh, and my experience with this
  #  LSTMB layer on larger-scale setups like Switchboard has not been good.  So
  #  I *don't particularly recommend* this setup.
  
  
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1a_sp exp/chain/tdnn_lstm1b_sp
  # System                tdnn_lstm1a_sp  tdnn_lstm1b_sp
  #WER dev93 (tgpr)                7.64       7.24
  #WER dev93 (tg)                  7.29       7.03
  #WER dev93 (big-dict,tgpr)       5.53       5.04
  #WER dev93 (big-dict,fg)         5.14       4.92
  #WER eval92 (tgpr)               5.62       5.23
  #WER eval92 (tg)                 5.30       4.78
  #WER eval92 (big-dict,tgpr)      3.62       3.17
  #WER eval92 (big-dict,fg)        3.31       2.73
  # Final train prob        -0.0344    -0.0403
  # Final valid prob        -0.0518    -0.0526
  # Final train prob (xent)   -0.5589    -0.7406
  # Final valid prob (xent)   -0.6620    -0.7766
  # Num-params                 9106252    4216524
  
  # 1b22 is as 1b21 but setting chain.l2-regularize to zero.
  
  # 1b21 is as 1b20 but with half the learning rate.

  # 1b20 is as 1b19b but reducing the dimension of the TDNN layers from 512 to 448.
  # 1b19b is as 1b19 but with more epochs (4->6).
  # 1b19 is a rerun of 1b18d3 (a fairly small LSTM+TDNN setup).
  #
  #
  # 1b18d3 is as 1b18d2 but reducing lstm bottleneck dim from 304 to 256.
  # [1b18d2 is just a rerun of 1b18d as I merged various code changes and
  #  I want to make sure nothing bad happened.]
  #
  # Results below show it's probably slightly better than the average of 18d and 18d2
  #   (which are supposed to be the same experiment)...
  #
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1b18d_sp exp/chain/tdnn_lstm1b18d2_sp exp/chain/tdnn_lstm1b18d3_sp
  # System                tdnn_lstm1b18d_sp tdnn_lstm1b18d2_sp tdnn_lstm1b18d3_sp
  #WER dev93 (tgpr)                7.78      7.46      7.46
  #WER dev93 (tg)                  7.29      7.30      7.04
  #WER dev93 (big-dict,tgpr)       5.56      5.51      5.55
  #WER dev93 (big-dict,fg)         5.32      5.08      5.05
  #WER eval92 (tgpr)               5.33      5.40      5.39
  #WER eval92 (tg)                 5.05      5.03      4.96
  #WER eval92 (big-dict,tgpr)      3.42      3.26      3.35
  #WER eval92 (big-dict,fg)        2.91      2.64      2.82
  # Final train prob        -0.0529   -0.0536   -0.0543
  # Final valid prob        -0.0633   -0.0630   -0.0636
  # Final train prob (xent)   -0.8327   -0.8330   -0.8415
  # Final valid prob (xent)   -0.8693   -0.8672   -0.8695
  # Num-params                 4922060   4922060   4805324
  
  #
  # 1b18d is as 1b18c, but adding 'self-scale=2.0' to scale up the m_trunc when it is given
  # as input to the affine projections (I had previously found this to be helpful).
  # Interesting: the objf improves but the WER is not better.
  #
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1b18c_sp exp/chain/tdnn_lstm1b18d_sp
  # System                tdnn_lstm1b18c_sp tdnn_lstm1b18d_sp
  #WER dev93 (tgpr)                7.77      7.78
  #WER dev93 (tg)                  7.40      7.29
  #WER dev93 (big-dict,tgpr)       5.39      5.56
  #WER dev93 (big-dict,fg)         5.25      5.32
  #WER eval92 (tgpr)               5.48      5.33
  #WER eval92 (tg)                 4.98      5.05
  #WER eval92 (big-dict,tgpr)      3.07      3.42
  #WER eval92 (big-dict,fg)        2.69      2.91
  # Final train prob        -0.0546   -0.0529
  # Final valid prob        -0.0641   -0.0633
  # Final train prob (xent)   -0.8679   -0.8327
  # Final valid prob (xent)   -0.8954   -0.8693
  # Num-params                 4922060   4922060
  
  # 1b18c is as 1b18b, but fixing a bug in the script whereby c instead of m had been used
  # as input to the affine projections.
  
  # 1b18b is as 1b18, but doubling the l2 regularization on the output
  #  and lstm layers, since parts of them were training too slowly.
  #
  # 1b18 is as 1b17, but (via a script change) not using memory-norm; actually
  #   this is the same as 1b17d.
  #  I don't see any WER change, but the objf is worse.
  
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1b17_sp exp/chain/tdnn_lstm1b17d_sp exp/chain/tdnn_lstm1b18_sp
  # System                tdnn_lstm1b17_sp tdnn_lstm1b17d_sp tdnn_lstm1b18_sp
  #WER dev93 (tgpr)                7.49      7.44      7.48
  #WER dev93 (tg)                  7.18      7.13      7.19
  #WER dev93 (big-dict,tgpr)       5.50      5.34      5.48
  #WER dev93 (big-dict,fg)         5.11      5.15      5.04
  #WER eval92 (tgpr)               5.26      5.32      5.32
  #WER eval92 (tg)                 5.00      4.94      5.03
  #WER eval92 (big-dict,tgpr)      3.24      3.28      3.26
  #WER eval92 (big-dict,fg)        2.82      2.80      2.84
  # Final train prob        -0.0489   -0.0486   -0.0496
  # Final valid prob        -0.0583   -0.0599   -0.0612
  # Final train prob (xent)   -0.7550   -0.7809   -0.7749
  # Final valid prob (xent)   -0.7988   -0.8121   -0.8131
  # Num-params                 4922060   4922060   4922060
  
  # 1b17 is as 1b13m; it's just a rerun after some code changes (adding
  # diagonal natural-gradient stuff) which should make no difference.
  # It still seems to be working.
  
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13d_sp exp/chain/tdnn_lstm1b13m_sp exp/chain/tdnn_lstm1b17_sp
  # System                tdnn_lstm1b13d_sp tdnn_lstm1b13m_sp tdnn_lstm1b17_sp
  #WER dev93 (tgpr)                7.86      7.43      7.49
  #WER dev93 (tg)                  7.40      7.00      7.18
  #WER dev93 (big-dict,tgpr)       5.65      5.21      5.50
  #WER dev93 (big-dict,fg)         5.11      4.76      5.11
  #WER eval92 (tgpr)               5.64      5.39      5.26
  #WER eval92 (tg)                 5.17      5.00      5.00
  #WER eval92 (big-dict,tgpr)      3.21      3.30      3.24
  #WER eval92 (big-dict,fg)        2.84      2.62      2.82
  # Final train prob        -0.0469   -0.0516   -0.0489
  # Final valid prob        -0.0601   -0.0607   -0.0583
  # Final train prob (xent)   -0.7424   -0.7593   -0.7550
  # Final valid prob (xent)   -0.7920   -0.7982   -0.7988
  # Num-params                 5456076   4922060   4922060
  
  # 1b13m is as 1b13l, but reverting the LSTM script "fix" (which actually
  #  made things worse), so the baseline is 1b13{c,d} (and the change versus
  # c,d is to add bottleneck-dim=256).
  #
  # It's helpful:
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13c_sp exp/chain/tdnn_lstm1b13d_sp exp/chain/tdnn_lstm1b13m_sp
  # System                tdnn_lstm1b13c_sp tdnn_lstm1b13d_sp tdnn_lstm1b13m_sp
  #WER dev93 (tgpr)                7.68      7.86      7.43
  #WER dev93 (tg)                  7.34      7.40      7.00
  #WER dev93 (big-dict,tgpr)       5.42      5.65      5.21
  #WER dev93 (big-dict,fg)         5.05      5.11      4.76
  #WER eval92 (tgpr)               5.48      5.64      5.39
  #WER eval92 (tg)                 5.26      5.17      5.00
  #WER eval92 (big-dict,tgpr)      3.23      3.21      3.30
  #WER eval92 (big-dict,fg)        2.82      2.84      2.62
  # Final train prob        -0.0490   -0.0469   -0.0516
  # Final valid prob        -0.0597   -0.0601   -0.0607
  # Final train prob (xent)   -0.7549   -0.7424   -0.7593
  # Final valid prob (xent)   -0.7910   -0.7920   -0.7982
  # Num-params                 5456076   5456076   4922060
  #
  #
  # 1b13l is as 1b13k, but adding bottleneck-dim=256 to the output layers.
  #  Definitely helpful:
  
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13k_sp exp/chain/tdnn_lstm1b13l_sp
  # System                tdnn_lstm1b13k_sp tdnn_lstm1b13l_sp
  #WER dev93 (tgpr)                7.94      7.46
  #WER dev93 (tg)                  7.68      7.09
  #WER dev93 (big-dict,tgpr)       5.91      5.39
  #WER dev93 (big-dict,fg)         5.56      4.94
  #WER eval92 (tgpr)               5.65      5.44
  #WER eval92 (tg)                 5.32      5.09
  #WER eval92 (big-dict,tgpr)      3.49      3.15
  #WER eval92 (big-dict,fg)        3.07      2.94
  # Final train prob        -0.0491   -0.0513
  # Final valid prob        -0.0600   -0.0599
  # Final train prob (xent)   -0.7395   -0.7490
  # Final valid prob (xent)   -0.7762   -0.7860
  # Num-params                 5456076   4922060
  
  # 1b13k is as 1b13d, but after a script fix: previously we were using the 'c'
  # for the full-matrix part of the recurrence instead of the 'm'.
  
  # 1b13d is as 1b13c, but a rerun after fixing a code bug whereby the natural gradient
  # for the LinearComponent was turned off by default when initializing from config.
  #   **Update: it turns out there was no difference here; the code had been ignoring
  #     that config variable.**
  #
  # It seems to optimize better, although the WER change is unclear.  However, it's
  # interesting that the average objf in the individual training jobs (train.*.log) is not better,
  # but in compute_prob_train.*.log it is.  It seems that the natural gradient interacts
  # well with model averaging, which is what we found previously in the NG paper.
  
  
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13c_sp exp/chain/tdnn_lstm1b13d_sp
  # System                tdnn_lstm1b13c_sp tdnn_lstm1b13d_sp
  #WER dev93 (tgpr)                7.68      7.86
  #WER dev93 (tg)                  7.34      7.40
  #WER dev93 (big-dict,tgpr)       5.42      5.65
  #WER dev93 (big-dict,fg)         5.05      5.11
  #WER eval92 (tgpr)               5.48      5.64
  #WER eval92 (tg)                 5.26      5.17
  #WER eval92 (big-dict,tgpr)      3.23      3.21
  #WER eval92 (big-dict,fg)        2.82      2.84
  # Final train prob        -0.0490   -0.0469
  # Final valid prob        -0.0597   -0.0601
  # Final train prob (xent)   -0.7549   -0.7424
  # Final valid prob (xent)   -0.7910   -0.7920
  # Num-params                 5456076   5456076
  #
  #
  # 1b13c is as 1b13b, but after a script change in which the lstmb layer was
  # rewritten, adding memnorm and removing the scale of 4.0, along with some
  #  other minor changes and streamlining/removal of options.
  #
  # 1b13b is as 1b13, but a rerun after merging with the memnorm-and-combine
  #   branch.  The slight difference in num-params is because of 300 vs 304.
  
  # 1b13 is as 1b10 but reducing the bottleneck dim to 304
  # (because I want to get in the habit of using multiples of 8).
  #  WER seems improved.
  #
  #
  
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1b10_sp exp/chain/tdnn_lstm1b13_sp
  # System                tdnn_lstm1b10_sp tdnn_lstm1b13_sp
  #WER dev93 (tgpr)                7.87      7.63
  #WER dev93 (tg)                  7.48      7.46
  #WER dev93 (big-dict,tgpr)       5.55      5.56
  #WER dev93 (big-dict,fg)         5.25      5.09
  #WER eval92 (tgpr)               5.44      5.48
  #WER eval92 (tg)                 5.05      5.12
  #WER eval92 (big-dict,tgpr)      3.24      3.17
  #WER eval92 (big-dict,fg)        2.73      2.60
  # Final train prob        -0.0463   -0.0470
  # Final valid prob        -0.0561   -0.0565
  # Final train prob (xent)   -0.7362   -0.7588
  # Final valid prob (xent)   -0.7730   -0.7831
  # Num-params                 5650636   5446348
  
  # 1b10 is as 1b9 but reducing the cell and bottleneck dimension of the LSTM layer from 512 to 384.
  # Seems helpful on average-- nice!
  
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1b9_sp exp/chain/tdnn_lstm1b10_sp
  # System                tdnn_lstm1b9_sp tdnn_lstm1b10_sp
  #WER dev93 (tgpr)                7.74      7.87
  #WER dev93 (tg)                  7.46      7.48
  #WER dev93 (big-dict,tgpr)       5.67      5.55
  #WER dev93 (big-dict,fg)         5.31      5.25
  #WER eval92 (tgpr)               5.60      5.44
  #WER eval92 (tg)                 5.42      5.05
  #WER eval92 (big-dict,tgpr)      3.47      3.24
  #WER eval92 (big-dict,fg)        3.07      2.73
  # Final train prob        -0.0413   -0.0463
  # Final valid prob        -0.0543   -0.0561
  # Final train prob (xent)   -0.6786   -0.7362
  # Final valid prob (xent)   -0.7249   -0.7730
  # Num-params                 7021644   5650636
  
  # 1b9 is as 1b8 but adding batchnorm after the LSTM layer; this is
  #  to correct an oversight.
  # 1b8 is as 1b7 but with quite a few layers removed.  The WER effect is unclear.
  
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1b7_sp exp/chain/tdnn_lstm1b8_sp
  # System                tdnn_lstm1b7_sp tdnn_lstm1b8_sp
  #WER dev93 (tgpr)                7.31      7.60
  #WER dev93 (tg)                  7.10      7.25
  #WER dev93 (big-dict,tgpr)       5.26      5.26
  #WER dev93 (big-dict,fg)         4.64      4.93
  #WER eval92 (tgpr)               5.48      5.32
  #WER eval92 (tg)                 5.00      5.07
  #WER eval92 (big-dict,tgpr)      3.35      3.31
  #WER eval92 (big-dict,fg)        2.99      2.84
  # Final train prob        -0.0483   -0.0533
  # Final valid prob        -0.0573   -0.0627
  # Final train prob (xent)   -0.7207   -0.8234
  # Final valid prob (xent)   -0.7467   -0.8466
  # Num-params                11752524   7021644
  
  # 1b7 is as 1b6 but adding self-stabilize=true and normalize-type=none;
  # and after a script-level change that scales 'c' by 4 before giving it
  # to the W_all_a matrix (to see where all this came from, look at run_tdnn_lstm_1b16.sh
  # in the mini_librispeech setup, although by the time you see this, that may no longer exist).
  #
  # 1b6 is as 1b3 but replacing renorm with batchnorm for the TDNN layers,
  # and adding batchnorm to the LSTMB layers.  The effect on WER is unclear, but
  # generally it's better.
  
  
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1{a2,a3,b3,b6}_sp
  # local/chain/compare_wer.sh exp/chain/tdnn_lstm1a2_sp exp/chain/tdnn_lstm1a3_sp exp/chain/tdnn_lstm1b3_sp exp/chain/tdnn_lstm1b6_sp
  # System                tdnn_lstm1a2_sp tdnn_lstm1a3_sp tdnn_lstm1b3_sp tdnn_lstm1b6_sp
  #WER dev93 (tgpr)                7.47      7.65      7.26      7.32
  #WER dev93 (tg)                  7.29      7.24      6.96      6.98
  #WER dev93 (big-dict,tgpr)       5.44      5.60      5.43      5.22
  #WER dev93 (big-dict,fg)         4.98      5.04      4.97      4.86
  #WER eval92 (tgpr)               5.78      5.21      5.30      5.14
  #WER eval92 (tg)                 5.44      5.00      4.87      4.82
  #WER eval92 (big-dict,tgpr)      3.35      3.23      3.42      3.24
  #WER eval92 (big-dict,fg)        2.99      2.96      3.03      2.82
  # Final train prob        -0.0447   -0.0410   -0.0484   -0.0503
  # Final valid prob        -0.0566   -0.0518   -0.0594   -0.0599
  # Final train prob (xent)   -0.6859   -0.6676   -0.7528   -0.7415
  # Final valid prob (xent)   -0.7378   -0.7230   -0.8078   -0.7804
  # Num-params                 9106252   9106252  11747916  11746380
  
  # 1b3 is as 1a2 but with the same change as in a->b, i.e. replacing lstmp with lstmb.
  # 1a2 is as 1a but adding l2 regularization.
  
  # This is a TDNN+LSTM chain system.
  # It was modified from local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh with
  # reference to ../../tedlium/s5_r2/local/chain/run_tdnn_lstm_1e.sh.
  # Note: we're using the same hidden-layer sizes as
  # ../../tedlium/s5_r2/local/chain/run_tdnn_lstm_1e.sh despite the
  # fact that we'd normally choose a smaller model for a setup with
  # less data, because the Tedlium model was probably on the small side.
  # Note: we normally use more parameters for LSTM-containing systems than for
  # TDNN-only systems.
  
  # steps/info/chain_dir_info.pl exp/chain/tdnn_lstm1a_sp
  # exp/chain/tdnn_lstm1a_sp: num-iters=120 nj=2..10 num-params=9.1M dim=40+100->2889 combine=-0.047->-0.045 xent:train/valid[79,119,final]=(-0.684,-0.569,-0.564/-0.742,-0.668,-0.665) logprob:train/valid[79,119,final]=(-0.045,-0.035,-0.034/-0.058,-0.051,-0.051)
  
  # The following compares three systems:
  # (nnet3 TDNN+LSTM, chain TDNN, this experiment == chain TDNN+LSTM).
  # This is consistently better than the nnet3 TDNN+LSTM, but the
  # difference from the chain TDNN is inconsistent.
  
  # local/chain/compare_wer.sh --online exp/nnet3/tdnn_lstm1a_sp exp/chain/tdnn1a_sp exp/chain/tdnn_lstm1a_sp
  # System                tdnn_lstm1a_sp tdnn1a_sp tdnn_lstm1a_sp
  #WER dev93 (tgpr)                8.54      7.87      7.48
  #             [online:]          8.57      8.02      7.49
  #WER dev93 (tg)                  8.25      7.61      7.41
  #             [online:]          8.34      7.70      7.40
  #WER dev93 (big-dict,tgpr)       6.24      5.71      5.64
  #             [online:]          6.40      5.60      5.70
  #WER dev93 (big-dict,fg)         5.70      5.10      5.40
  #             [online:]          5.77      5.21      5.19
  #WER eval92 (tgpr)               6.52      5.23      5.67
  #             [online:]          6.56      5.44      5.60
  #WER eval92 (tg)                 6.13      4.87      5.46
  #             [online:]          6.24      4.87      5.53
  #WER eval92 (big-dict,tgpr)      3.88      3.24      3.69
  #             [online:]          3.88      3.31      3.63
  #WER eval92 (big-dict,fg)        3.38      2.71      3.28
  #             [online:]          3.53      2.92      3.31
  # Final train prob                  -0.0414   -0.0341
  # Final valid prob                  -0.0634   -0.0506
  # Final train prob (xent)             -0.8216   -0.5643
  # Final valid prob (xent)             -0.9208   -0.6648
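
  # You would typically run this script from egs/wsj/s5, something like the
  # following (adjust options such as --stage and --nj to your setup):
  #   local/chain/tuning/run_tdnn_lstm_1b.sh --stage 0 --nj 30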
  
  
  set -e -o pipefail
  
  # First the options that are passed through to run_ivector_common.sh
  # (some of which are also used in this script directly).
  stage=0
  nj=30
  train_set=train_si284
  test_sets="test_dev93 test_eval92"
  gmm=tri4b        # this is the source gmm-dir that we'll use for alignments; it
                   # should have alignments for the specified training data.
  num_threads_ubm=32
  nnet3_affix=       # affix for exp dirs, e.g. it was _cleaned in tedlium.
  
  # Options which are not passed through to run_ivector_common.sh
  affix=1b  # affix for the TDNN+LSTM directory, e.g. "1a" or "1b", in case we change the configuration.
  common_egs_dir=
  reporting_email=
  
  # LSTM/chain options
  train_stage=-10
  label_delay=8
  xent_regularize=0.1
  
  # training chunk-options
  chunk_width=140,100,160
  chunk_left_context=40
  chunk_right_context=0
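  # note: chunk_width is a comma-separated list of chunk sizes; the first one is
  # the principal chunk size and the others are alternatives that help to pack
  # utterances into egs with less padding.  The left-context of 40 frames gives
  # the LSTM some acoustic context to build up its recurrent state at chunk
  # boundaries.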
  
  # training options
  srand=0
  remove_egs=true
  
  # decode options
  test_online_decoding=false  # if true, it will run the last decoding stage.
  
  # End configuration section.
  echo "$0 $@"  # Print the command line for logging
  
  
  . ./cmd.sh
  . ./path.sh
  . ./utils/parse_options.sh
  
  
  if ! cuda-compiled; then
    cat <<EOF && exit 1
  This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
  If you want to use GPUs (and have them), go to src/, and configure and make on a machine
  where "nvcc" is installed.
  EOF
  fi
  
  local/nnet3/run_ivector_common.sh \
    --stage $stage --nj $nj \
    --train-set $train_set --gmm $gmm \
    --num-threads-ubm $num_threads_ubm \
    --nnet3-affix "$nnet3_affix"
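
  # local/nnet3/run_ivector_common.sh takes care of the speed perturbation, the
  # high-resolution MFCC features and the i-vector extraction; the directories
  # it creates are the ones referenced just below.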
  
  
  
  gmm_dir=exp/${gmm}
  ali_dir=exp/${gmm}_ali_${train_set}_sp
  lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
  dir=exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp
  train_data_dir=data/${train_set}_sp_hires
  train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
  lores_train_data_dir=data/${train_set}_sp
  
  # note: you don't necessarily have to change the treedir name
  # each time you do a new experiment-- only if you change the
  # configuration in a way that affects the tree.
  tree_dir=exp/chain${nnet3_affix}/tree_a_sp
  # the 'lang' directory is created by this script.
  # If you create such a directory with a non-standard topology
  # you should probably name it differently.
  lang=data/lang_chain
  
  for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
      $lores_train_data_dir/feats.scp $gmm_dir/final.mdl \
      $ali_dir/ali.1.gz; do
    [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
  done
  
  
  if [ $stage -le 12 ]; then
    echo "$0: creating lang directory $lang with chain-type topology"
    # Create a version of the lang/ directory that has one state per phone in the
    # topo file.  [Note: it really has two states; the first one is only repeated
    # once, the second one has zero or more repeats.]
    if [ -d $lang ]; then
      if [ $lang/L.fst -nt data/lang/L.fst ]; then
        echo "$0: $lang already exists, not overwriting it; continuing"
      else
        echo "$0: $lang already exists and seems to be older than data/lang..."
        echo " ... not sure what to do.  Exiting."
        exit 1;
      fi
    else
      cp -r data/lang $lang
      silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
      nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
      # Use our special topology... note that later on we may have to tune this
      # topology.
      steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
    fi
  fi
  
  if [ $stage -le 13 ]; then
    # Get the alignments as lattices (gives the chain training more freedom).
    # use the same num-jobs as the alignments
    steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \
      data/lang $gmm_dir $lat_dir
    rm $lat_dir/fsts.*.gz # save space
  fi
  
  if [ $stage -le 14 ]; then
    # Build a tree using our new topology.  We know we have alignments for the
    # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
    # those.  The num-leaves is always somewhat less than the num-leaves from
    # the GMM baseline.
    if [ -f $tree_dir/final.mdl ]; then
      echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
      exit 1;
    fi
    steps/nnet3/chain/build_tree.sh \
      --frame-subsampling-factor 3 \
      --context-opts "--context-width=2 --central-position=1" \
      --cmd "$train_cmd" 3500 ${lores_train_data_dir} \
      $lang $ali_dir $tree_dir
  fi
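
  # (optional) you can check how many leaves the tree-building actually produced
  # with, for example:
  #   tree-info $tree_dir/tree | grep num-pdfs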
  
  
  if [ $stage -le 15 ]; then
    mkdir -p $dir
    echo "$0: creating neural net configs using the xconfig parser";
  
    num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
    learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
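    # e.g. with the default xent_regularize=0.1 this evaluates to 5.0.  (If you
    # prefer not to depend on python, something like
    #   learning_rate_factor=$(awk "BEGIN{print 0.5/$xent_regularize}")
    # should give the same value.)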
    tdnn_opts="l2-regularize=0.01"
    output_opts="l2-regularize=0.005 bottleneck-dim=256"
    lstm_opts="l2-regularize=0.005 self-scale=2.0"
  
  
    mkdir -p $dir/configs
    cat <<EOF > $dir/configs/network.xconfig
    input dim=100 name=ivector
    input dim=40 name=input
  
    # please note that it is important to have an input layer with the name=input
    # as the layer immediately preceding the fixed-affine-layer, to enable
    # the use of the short notation for the descriptor
    fixed-affine-layer name=lda delay=5 input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
  
    # the first splicing is moved before the lda layer, so no splicing here
    relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=448
    relu-batchnorm-layer name=tdnn2 $tdnn_opts dim=448 input=Append(-1,0,1)
    relu-batchnorm-layer name=tdnn3 $tdnn_opts dim=448 input=Append(-3,0,3)
    relu-batchnorm-layer name=tdnn4 $tdnn_opts dim=448 input=Append(-3,0,3)
    lstmb-layer name=lstm3 $lstm_opts cell-dim=384 bottleneck-dim=256 decay-time=20 delay=-3
  
    ## adding the layers for chain branch
    output-layer name=output input=lstm3 $output_opts output-delay=$label_delay include-log-softmax=false dim=$num_targets
  
    # adding the layers for xent branch
    # This block prints the configs for a separate output that will be
    # trained with a cross-entropy objective in the 'chain' models... this
    # has the effect of regularizing the hidden parts of the model.  We use
    # 0.5 / args.xent_regularize as the learning-rate factor; this value is
    # suitable because it means the xent final-layer learns at a rate
    # independent of the regularization constant, and the 0.5 was tuned so as
    # to make the relative progress similar in the xent and regular final
    # layers.
    output-layer name=output-xent input=lstm3 $output_opts output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor
  EOF
    steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
  fi
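
  # (optional) to sanity-check the network that was generated from the xconfig,
  # something like the following should work:
  #   nnet3-info $dir/configs/ref.raw | head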
  
  
  if [ $stage -le 16 ]; then
    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
      utils/create_split_dir.pl \
       /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
    fi
  
    steps/nnet3/chain/train.py --stage=$train_stage \
      --cmd="$decode_cmd" \
      --feat.online-ivector-dir=$train_ivector_dir \
      --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
      --chain.xent-regularize $xent_regularize \
      --chain.leaky-hmm-coefficient=0.1 \
      --chain.l2-regularize=0.0 \
      --chain.apply-deriv-weights=false \
      --chain.lm-opts="--num-extra-lm-states=2000" \
      --trainer.srand=$srand \
      --trainer.max-param-change=2.0 \
      --trainer.num-epochs=6 \
      --trainer.deriv-truncate-margin=10 \
      --trainer.frames-per-iter=1500000 \
      --trainer.optimization.num-jobs-initial=2 \
      --trainer.optimization.num-jobs-final=10 \
      --trainer.optimization.initial-effective-lrate=0.0005 \
      --trainer.optimization.final-effective-lrate=0.00005 \
      --trainer.num-chunk-per-minibatch=128,64 \
      --trainer.optimization.momentum=0.0 \
      --egs.chunk-width=$chunk_width \
      --egs.chunk-left-context=$chunk_left_context \
      --egs.chunk-right-context=$chunk_right_context \
      --egs.chunk-left-context-initial=0 \
      --egs.chunk-right-context-final=0 \
      --egs.dir="$common_egs_dir" \
      --egs.opts="--frames-overlap-per-eg 0" \
      --cleanup.remove-egs=$remove_egs \
      --use-gpu=true \
      --reporting.email="$reporting_email" \
      --feat-dir=$train_data_dir \
      --tree-dir=$tree_dir \
      --lat-dir=$lat_dir \
      --dir=$dir  || exit 1;
  fi
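
  # (optional) once training has finished, a one-line summary of the model and
  # the training/validation objectives can be obtained with, e.g.:
  #   steps/info/chain_dir_info.pl exp/chain/tdnn_lstm1b_sp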
  
  if [ $stage -le 17 ]; then
    # The reason we are using data/lang here, instead of $lang, is just to
    # emphasize that it's not actually important to give mkgraph.sh the
    # lang directory with the matched topology (since it gets the
    # topology file from the model).  So you could give it a different
    # lang directory, one that contained a wordlist and LM of your choice,
    # as long as phones.txt was compatible.
  
    utils/lang/check_phones_compatible.sh \
      data/lang_test_tgpr/phones.txt $lang/phones.txt
    utils/mkgraph.sh \
      --self-loop-scale 1.0 data/lang_test_tgpr \
      $tree_dir $tree_dir/graph_tgpr || exit 1;
  
    utils/lang/check_phones_compatible.sh \
      data/lang_test_bd_tgpr/phones.txt $lang/phones.txt
    utils/mkgraph.sh \
      --self-loop-scale 1.0 data/lang_test_bd_tgpr \
      $tree_dir $tree_dir/graph_bd_tgpr || exit 1;
  fi
  
  if [ $stage -le 18 ]; then
    frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
    rm $dir/.error 2>/dev/null || true
  
    for data in $test_sets; do
      (
        data_affix=$(echo $data | sed s/test_//)
        nspk=$(wc -l <data/${data}_hires/spk2utt)
        for lmtype in tgpr bd_tgpr; do
          steps/nnet3/decode.sh \
            --acwt 1.0 --post-decode-acwt 10.0 \
            --extra-left-context $chunk_left_context \
            --extra-right-context $chunk_right_context \
            --extra-left-context-initial 0 \
            --extra-right-context-final 0 \
            --frames-per-chunk $frames_per_chunk \
            --nj $nspk --cmd "$decode_cmd"  --num-threads 4 \
            --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
            $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_${lmtype}_${data_affix} || exit 1
        done
        steps/lmrescore.sh \
          --self-loop-scale 1.0 \
          --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
          data/${data}_hires ${dir}/decode_{tgpr,tg}_${data_affix} || exit 1
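        # note: $lmtype still has the value 'bd_tgpr' from the last iteration of
        # the loop above, so this rescores the big-dictionary decode with the
        # const-arpa 4-gram LM.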
        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
          data/lang_test_bd_{tgpr,fgconst} \
          data/${data}_hires ${dir}/decode_${lmtype}_${data_affix}{,_fg} || exit 1
      ) || touch $dir/.error &
    done
    wait
    [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
  fi
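
  # (optional) after decoding, you can get a summary of the WERs with, for example:
  #   for x in $dir/decode_*; do grep WER $x/wer_* | utils/best_wer.sh; done
  # or compare against the 1a baseline with:
  #   local/chain/compare_wer.sh exp/chain/tdnn_lstm1a_sp exp/chain/tdnn_lstm1b_sp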
  
  if [ $stage -le 19 ]; then
    # 'looped' decoding.
    # Note: you should NOT do this decoding step for setups that have bidirectional
    # recurrence, like BLSTMs-- it doesn't make sense and will give bad results.
    # We didn't write a -parallel version of this program yet,
    # so it will take a bit longer, as the --num-threads option is not supported.
    # We just hardcode the --frames-per-chunk option, as it doesn't have to
    # match any value used in training and it won't affect the results (unlike
    # in regular decoding).
    rm $dir/.error 2>/dev/null || true
  
    for data in $test_sets; do
      (
        data_affix=$(echo $data | sed s/test_//)
        nspk=$(wc -l <data/${data}_hires/spk2utt)
        for lmtype in tgpr bd_tgpr; do
          steps/nnet3/decode_looped.sh \
            --acwt 1.0 --post-decode-acwt 10.0 \
            --frames-per-chunk 30 \
            --nj $nspk --cmd "$decode_cmd" \
            --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
            $tree_dir/graph_${lmtype} data/${data}_hires ${dir}/decode_looped_${lmtype}_${data_affix} || exit 1
        done
        steps/lmrescore.sh \
          --self-loop-scale 1.0 \
          --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
          data/${data}_hires ${dir}/decode_looped_{tgpr,tg}_${data_affix} || exit 1
        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
          data/lang_test_bd_{tgpr,fgconst} \
          data/${data}_hires ${dir}/decode_looped_${lmtype}_${data_affix}{,_fg} || exit 1
      ) || touch $dir/.error &
    done
    wait
    [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
  fi
  
  if $test_online_decoding && [ $stage -le 20 ]; then
    # note: if the features change (e.g. you add pitch features), you will have to
    # change the options of the following command line.
    steps/online/nnet3/prepare_online_decoding.sh \
      --mfcc-config conf/mfcc_hires.conf \
      $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online
  
    rm $dir/.error 2>/dev/null || true
  
    for data in $test_sets; do
      (
        data_affix=$(echo $data | sed s/test_//)
        nspk=$(wc -l <data/${data}_hires/spk2utt)
        # note: we just give it "data/${data}" as it only uses the wav.scp; the
        # feature type does not matter.
        for lmtype in tgpr bd_tgpr; do
          steps/online/nnet3/decode.sh \
            --acwt 1.0 --post-decode-acwt 10.0 \
            --nj $nspk --cmd "$decode_cmd" \
            $tree_dir/graph_${lmtype} data/${data} ${dir}_online/decode_${lmtype}_${data_affix} || exit 1
        done
        steps/lmrescore.sh \
          --self-loop-scale 1.0 \
          --cmd "$decode_cmd" data/lang_test_{tgpr,tg} \
          data/${data}_hires ${dir}_online/decode_{tgpr,tg}_${data_affix} || exit 1
        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
          data/lang_test_bd_{tgpr,fgconst} \
          data/${data}_hires ${dir}_online/decode_${lmtype}_${data_affix}{,_fg} || exit 1
      ) || touch $dir/.error &
    done
    wait
    [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
  fi
  
  
  exit 0;