run_e2e_cnn.sh
5.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/bin/bash
# Copyright 2017 Hossein Hadian
# This script does end2end chain training (i.e. from scratch)
# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/
# System e2e_cnn_1a
# score_basic rescoring + nomalized
# WER 16.24 11.0
# WER (rescored) 15.63 10.5
# CER 5.98 5.6
# CER (rescored) 5.66 5.3
# Final train prob 0.1376
# Final valid prob 0.1913
# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a
# exp/chain/e2e_cnn_1a: num-iters=27 nj=5..8 num-params=3.0M dim=40->470 combine=0.091->0.091 (over 1) logprob:train/valid[17,26,final]=(0.135,0.137,0.138/0.191,0.191,0.191)
set -e
# configs for 'chain'
stage=0
nj=30
train_stage=-10
get_egs_stage=-10
affix=1a
# training options
tdnn_dim=450
minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4
cmvn_opts="--norm-means=false --norm-vars=false"
train_set=train
# End configuration section.
echo "$0 $@" # Print the command line for logging
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
lang=data/lang_e2e
treedir=exp/chain/e2e_monotree # it's actually just a trivial tree (no tree building)
dir=exp/chain/e2e_cnn_${affix}
if [ $stage -le 0 ]; then
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states.. the first one is only repeated
# once, the second one has zero or more repeats.]
rm -rf $lang
cp -r data/lang $lang
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi
if [ $stage -le 1 ]; then
steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \
--shared-phones true \
--type mono \
data/$train_set $lang $treedir
$cmd $treedir/log/make_phone_lm.log \
cat data/$train_set/text \| \
steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \
utils/sym2int.pl -f 2- data/lang/phones.txt \| \
chain-est-phone-lm --num-extra-lm-states=500 \
ark:- $treedir/phone_lm.fst
fi
if [ $stage -le 2 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
cnn_opts="l2-regularize=0.075"
tdnn_opts="l2-regularize=0.075"
output_opts="l2-regularize=0.1"
common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36"
common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70"
common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70"
mkdir -p $dir/configs
cat <<EOF > $dir/configs/network.xconfig
input dim=40 name=input
conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1
conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2
conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3
conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3
relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
## adding the layers for chain branch
relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts
output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts
EOF
steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs
fi
if [ $stage -le 3 ]; then
steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \
--cmd "$cmd" \
--feat.cmvn-opts "$cmvn_opts" \
--chain.leaky-hmm-coefficient 0.1 \
--chain.apply-deriv-weights true \
--egs.stage $get_egs_stage \
--egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \
--chain.frame-subsampling-factor 4 \
--chain.alignment-subsampling-factor 4 \
--trainer.add-option="--optimization.memory-compression-level=2" \
--trainer.num-chunk-per-minibatch $minibatch_size \
--trainer.frames-per-iter 1500000 \
--trainer.num-epochs 3 \
--trainer.optimization.momentum 0 \
--trainer.optimization.num-jobs-initial 5 \
--trainer.optimization.num-jobs-final 8 \
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.shrink-value 1.0 \
--trainer.max-param-change 2.0 \
--cleanup.remove-egs true \
--feat-dir data/${train_set} \
--tree-dir $treedir \
--dir $dir || exit 1;
fi