Blame view
egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh
6.46 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
#!/bin/bash

# this is run_nnet2_ms_disc.sh but with 4 jobs not 2 (and double the learning rate).
# NOTE(review): the default below is num_jobs_nnet=12, so the "4 jobs" remark
# above looks stale -- confirm against the sibling script.

# This script does discriminative training on top of the online, multi-splice
# system trained in run_nnet2_ms.sh (the one with extra-wide context).
# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
# since the lattice generation runs in about real-time, so takes of the order of
# 1000 hours of CPU time.
#
# Note: rather than using any features we have dumped on disk, this script
# regenerates them from the wav data three times-- when we do lattice
# generation, numerator alignment and discriminative training.  This made the
# script easier to write and more generic, because we don't have to know where
# the features and the iVectors are, but of course it's a little inefficient.
# The time taken is dominated by the lattice generation anyway, so this isn't
# a huge deal.
#
# Stages:
#   1  generate denominator lattices   (steps/nnet2/make_denlats.sh)
#   2  generate numerator alignments   (steps/nnet2/align.sh)
#   3  dump discriminative examples    (steps/nnet2/get_egs_discriminative2.sh)
#   4  discriminative training         (steps/nnet2/train_discriminative2.sh)
#   5  decode each epoch and print the best WER per decode directory
#   6  optional cleanup (only when --cleanup true)

# Defaults; every variable defined before parse_options.sh is sourced can be
# overridden on the command line (e.g. --stage 4 --use-gpu false).
stage=0
train_stage=-10   # can be used to start training in the middle.
use_gpu=true
srcdir=exp/nnet2_multicondition/nnet_ms_a
criterion=smbr
drop_frames=false  # only matters for MMI.
learning_rate=0.00015
num_jobs_nnet=12
decode_start_epoch=0  # can be used to avoid decoding all epochs, e.g. if we decided to run more.
num_epochs=4
cleanup=false  # run with --cleanup true --stage 6 to clean up (remove large things like denlats,
               # alignments and degs).

set -e
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

# $use_gpu is executed as a command, so it is expected to be "true" or "false".
if $use_gpu; then
  if ! cuda-compiled; then
    cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
Otherwise, call this script with --use-gpu false
EOF
  fi
  parallel_opts="--gpu 1"
  #parallel_opts="$parallel_opts --config conf/queue_no_k20.conf --allow-k20 false"
  num_threads=1
else
  # Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
  # almost the same, but this may be a little bit slow.
  num_threads=16
  parallel_opts="--num-threads $num_threads"
fi

# NOTE(review): the header comment calls the prerequisite script
# run_nnet2_ms.sh, while the message below says run_nnet2_multisplice.sh --
# confirm which name is right before changing the message.
if [ ! -f "${srcdir}/final.mdl" ]; then
  echo "$0: expected ${srcdir}/final.mdl to exist; first run run_nnet2_multisplice.sh."
  exit 1;
fi

# Output directory of the discriminatively trained model; written by stage 4
# and read by stage 5.
dir=${srcdir}_${criterion}_${learning_rate}_nj${num_jobs_nnet}

if [ "$stage" -le 1 ]; then
  nj=250 # this doesn't really affect anything strongly, except the num-jobs for one of
         # the phases of get_egs_discriminative2.sh below.
  num_threads_denlats=6
  subsplit=70 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
              # total slots = 80 * 6 = 480.
              # NOTE(review): the figures in this comment do not match
              # subsplit=70 -- confirm the intended numbers.
  steps/nnet2/make_denlats.sh --cmd "$decode_cmd --mem 1G --num-threads $num_threads_denlats" \
    --online-ivector-dir exp/nnet2_multicondition/ivectors_train \
    --nj "$nj" --sub-split "$subsplit" --num-threads "$num_threads_denlats" --config conf/decode.config \
    data/train_rvb_hires data/lang "$srcdir" "${srcdir}_denlats" || exit 1;

  # the command below is a more generic, but slower, way to do it.
  #steps/online/nnet2/make_denlats.sh --cmd "$decode_cmd --mem 1G --num-threads $num_threads_denlats" \
  #    --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
  #   data/train_960 data/lang ${srcdir}_online ${srcdir}_denlats || exit 1;
fi

if [ "$stage" -le 2 ]; then
  # hardcode no-GPU for alignment, although you could use GPU [you wouldn't
  # get excellent GPU utilization though.]
  nj=1500 # this is 6k hours, use more jobs and control the speed dynamically using
          # throttle control option (--max-jobs-run with qalter)
          # have a high number of jobs because this could take a while, and we might
          # have some stragglers.
  max_jobs_run=200
  # Kept in stage-local variables so the user-facing $use_gpu option is not
  # overwritten.
  align_use_gpu=no
  align_gpu_opts=
  steps/nnet2/align.sh --cmd "$decode_cmd --max-jobs-run $max_jobs_run $align_gpu_opts" --use-gpu "$align_use_gpu" \
    --online-ivector-dir exp/nnet2_multicondition/ivectors_train \
    --nj "$nj" data/train_rvb_hires data/lang "$srcdir" "${srcdir}_ali" || exit 1;

  # the command below is a more generic, but slower, way to do it.
  # steps/online/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
  #    --nj $nj data/train_960 data/lang ${srcdir}_online ${srcdir}_ali || exit 1;
fi

if [ "$stage" -le 3 ]; then
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d "${srcdir}_degs/storage" ]; then
    # Evaluate the timestamp once so that all four brace-expanded storage
    # paths share the same directory name.
    storage_stamp=$(date +'%m_%d_%H_%M')
    utils/create_split_dir.pl \
      /export/b0{1,2,5,6}/"$USER"/kaldi-data/egs/fisher_reverb-"$storage_stamp"/s5/"${srcdir}_degs"/storage \
      "${srcdir}_degs/storage"
  fi
  # have a higher maximum num-jobs if the degs directory has a storage/
  # sub-directory (created above via create_split_dir.pl, or pre-existing).
  if [ -d "${srcdir}_degs/storage" ]; then max_jobs=10; else max_jobs=5; fi

  steps/nnet2/get_egs_discriminative2.sh \
    --cmd "$decode_cmd --max-jobs-run $max_jobs" \
    --online-ivector-dir exp/nnet2_multicondition/ivectors_train \
    --criterion "$criterion" --drop-frames "$drop_frames" \
    data/train_rvb_hires data/lang \
    "${srcdir}_ali" "${srcdir}_denlats" "${srcdir}/final.mdl" "${srcdir}_degs" || exit 1;

  # the command below is a more generic, but slower, way to do it.
  #steps/online/nnet2/get_egs_discriminative2.sh \
  #  --cmd "$decode_cmd --max-jobs-run $max_jobs" \
  #  --criterion $criterion --drop-frames $drop_frames \
  #   data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1;
fi

if [ "$stage" -le 4 ]; then
  # $parallel_opts and $num_threads were chosen above according to $use_gpu.
  steps/nnet2/train_discriminative2.sh --cmd "$decode_cmd $parallel_opts" \
    --stage "$train_stage" \
    --learning-rate "$learning_rate" \
    --one-silence-class true \
    --criterion "$criterion" --drop-frames "$drop_frames" \
    --num-epochs "$num_epochs" \
    --num-jobs-nnet "$num_jobs_nnet" --num-threads "$num_threads" \
    "${srcdir}_degs" "$dir" || exit 1;
fi

if [ "$stage" -le 5 ]; then
  #ln -sf $(utils/make_absolute.sh ${srcdir}_multicondition/conf) $dir/conf # so it acts like an online-decoding directory
  graph_dir=exp/tri5a/graph
  for epoch in $(seq "$decode_start_epoch" "$num_epochs"); do
    for data_dir in dev_rvb test_rvb dev_aspire dev test; do
      steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
        --iter "epoch$epoch" "$graph_dir" "data/${data_dir}" "$dir/decode_epoch${epoch}_${data_dir}" || exit 1
    done
  done
  # Print the best WER for every decode directory; a separate loop variable
  # is used so that $dir keeps pointing at the experiment directory.
  for decode_dir in "$dir"/decode*; do
    grep WER "$decode_dir"/wer_* | utils/best_wer.sh
  done
fi

if [ "$stage" -le 6 ] && $cleanup; then
  # if you run with "--cleanup true --stage 6" you can clean up.
  # Best-effort: a missing file must not abort the script under set -e.
  rm -- "${srcdir}_denlats"/lat.*.gz || true
  rm -- "${srcdir}_ali"/ali.*.gz || true
  steps/nnet2/remove_egs.sh "${srcdir}_degs" || true
fi

exit 0;