egs/rm/s5/local/nnet/run_multilingual.sh

#!/bin/bash

# Copyright 2015  University of Illinois (Author: Amit Das)
# Copyright 2012-2015  Brno University of Technology (Author: Karel Vesely)
# Apache 2.0

# This example script trains a multilingual DNN with <BlockSoftmax> output, using FBANK features.
# The network is trained on multiple languages simultaneously, creating a separate softmax layer
# per language while sharing the hidden layers across all languages.
# The script supports an arbitrary number of languages.

. ./cmd.sh
. ./path.sh

# Example setup, the options are in 'csl' format, they must have the same number of elements,
lang_code_csl="rm,wsj" # One label for each language,
lang_weight_csl="1.0,0.1" # Per-language weights, they scale loss-function and gradient, 1.0 for each language is good,
ali_dir_csl="exp/tri3b_ali,../../wsj/s5/exp/tri4b_ali_si284" # One ali-dir per language,
data_dir_csl="data/train,../../wsj/s5/data/train_si284" # One train-data-dir per language (features will be re-computed),

nnet_type=dnn_small # dnn_small | dnn | bn | sbn

stage=0
. utils/parse_options.sh || exit 1;

set -euxo pipefail

# Convert 'csl' to bash array (accept separators ',' ':'),
lang_code=($(echo $lang_code_csl | tr ',:' ' '))
ali_dir=($(echo $ali_dir_csl | tr ',:' ' '))
data_dir=($(echo $data_dir_csl | tr ',:' ' '))

# Make sure we have the same number of items in the lists,
! [ ${#lang_code[@]} -eq ${#ali_dir[@]} -a ${#lang_code[@]} -eq ${#data_dir[@]} ] && \
  echo "Non-matching number of 'csl' items: lang_code ${#lang_code[@]}, ali_dir ${#ali_dir[@]}, data_dir ${#data_dir[@]}" && \
  exit 1
num_langs=${#lang_code[@]}

# Check that all the input directories exist,
for i in $(seq 0 $[num_langs-1]); do
  echo "lang = ${lang_code[$i]}, alidir = ${ali_dir[$i]}, datadir = ${data_dir[$i]}"
  [ ! -d ${ali_dir[$i]} ] && echo "Missing ${ali_dir[$i]}" && exit 1
  [ ! -d ${data_dir[$i]} ] && echo "Missing ${data_dir[$i]}" && exit 1
done
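
# For illustration (the values follow from the example setup above): with
# lang_code_csl="rm,wsj" the 'csl' strings expand to
#   lang_code=(rm wsj)   num_langs=2
# and each 'wsj' utterance-id will later receive the suffix '_wsj'.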
# Make the features,
data=data-fbank-multilingual${num_langs}-$(echo $lang_code_csl | tr ',' '-')
data_tr90=$data/combined_tr90
data_cv10=$data/combined_cv10
if [ $stage -le 0 ]; then
  # Make a local copy of the data-dirs (while adding the language-code),
  tr90=""
  cv10=""
  for i in $(seq 0 $[num_langs-1]); do
    code=${lang_code[$i]}
    dir=${data_dir[$i]}
    tgt_dir=$data/${code}_$(basename $dir)
    utils/copy_data_dir.sh --utt-suffix _$code --spk-suffix _$code $dir $tgt_dir
    rm $tgt_dir/{feats,cmvn}.scp || true # remove features,
    # extract features, get cmvn stats,
    steps/make_fbank_pitch.sh --nj 30 --cmd "$train_cmd --max-jobs-run 10" $tgt_dir{,/log,/data}
    steps/compute_cmvn_stats.sh $tgt_dir{,/log,/data}
    # split lists 90% train / 10% held-out,
    utils/subset_data_dir_tr_cv.sh $tgt_dir ${tgt_dir}_tr90 ${tgt_dir}_cv10
    tr90="$tr90 ${tgt_dir}_tr90"
    cv10="$cv10 ${tgt_dir}_cv10"
  done
  # Merge the datasets,
  utils/combine_data.sh $data_tr90 $tr90
  utils/combine_data.sh $data_cv10 $cv10
  # Validate,
  utils/validate_data_dir.sh $data_tr90
  utils/validate_data_dir.sh $data_cv10
fi

# Extract the tied-state numbers from the transition models,
for i in $(seq 0 $[num_langs-1]); do
  ali_dim[i]=$(hmm-info ${ali_dir[i]}/final.mdl | grep pdfs | awk '{ print $NF }')
done
ali_dim_csl=$(echo ${ali_dim[@]} | tr ' ' ',')

# Total number of DNN outputs (sum of all per-language blocks),
output_dim=$(echo ${ali_dim[@]} | tr ' ' '\n' | awk '{ sum += $1; } END{ print sum; }')
echo "Total number of DNN outputs: $output_dim = $(echo ${ali_dim[@]} | sed 's: : + :g')"

# Objective function string (per-language weights are imported from '$lang_weight_csl'),
objective_function="multitask$(echo ${ali_dim[@]} | tr ' ' '\n' | \
  awk -v w=$lang_weight_csl 'BEGIN{ split(w,w_arr,/[,:]/); } { printf(",xent,%d,%s", $1, w_arr[NR]); }')"
echo "Multitask objective function: $objective_function"
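
# For illustration (hypothetical pdf counts, say 1483 for 'rm' and 3362 for
# 'wsj'), the string built above would read:
#   multitask,xent,1483,1.0,xent,3362,0.1
# i.e. one weighted cross-entropy term per language, one per <BlockSoftmax> block.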
# DNN training will be in $dir, the alignments are prepared beforehand,
dir=exp/dnn4g-multilingual${num_langs}-$(echo $lang_code_csl | tr ',' '-')-${nnet_type}
[ ! -e $dir ] && mkdir -p $dir
echo "$lang_code_csl" >$dir/lang_code_csl
echo "$ali_dir_csl" >$dir/ali_dir_csl
echo "$data_dir_csl" >$dir/data_dir_csl
echo "$ali_dim_csl" >$dir/ali_dim_csl
echo "$objective_function" >$dir/objective_function

# Prepare the merged targets,
if [ $stage -le 1 ]; then
  [ ! -e $dir/ali-post ] && mkdir -p $dir/ali-post
  # re-saving the alignments in posterior format, indexed by 'scp',
  for i in $(seq 0 $[num_langs-1]); do
    code=${lang_code[$i]}
    ali=${ali_dir[$i]}
    # the utterance suffix is added by 'awk',
    ali-to-pdf $ali/final.mdl "ark:gunzip -c ${ali}/ali.*.gz |" ark,t:- | \
      awk -v c=$code '{ $1=$1"_"c; print $0; }' | \
      ali-to-post ark:- ark,scp:$dir/ali-post/$code.ark,$dir/ali-post/$code.scp
  done
  # pasting the alignments, adding language-specific offsets to the posteriors,
  featlen="ark:feat-to-len 'scp:cat $data_tr90/feats.scp $data_cv10/feats.scp |' ark,t:- |" # get the number of frames for every utterance,
  post_scp_list=$(echo ${lang_code[@]} | tr ' ' '\n' | awk -v d=$dir '{ printf(" scp:%s/ali-post/%s.scp", d, $1); }')
  paste-post --allow-partial=true "$featlen" "${ali_dim_csl}" ${post_scp_list} \
    ark,scp:$dir/ali-post/combined.ark,$dir/ali-post/combined.scp
fi
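
# Optional sanity check (a sketch, not part of the recipe; 'copy-post' is a
# standard Kaldi binary): print a few merged target rows; a pdf-id 'p' from
# the 2nd language should appear shifted by the 1st language's block size,
# i.e. as ${ali_dim[0]}+p,
#   copy-post scp:$dir/ali-post/combined.scp ark,t:- | head -n 3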
# Train the <BlockSoftmax> system, 1st stage of a Stacked-Bottleneck-Network,
if [ $stage -le 2 ]; then
  case $nnet_type in
    bn)
      # Bottleneck network (40 dimensional bottleneck is good for fMLLR),
      $cuda_cmd $dir/log/train_nnet.log \
        steps/nnet/train.sh --learn-rate 0.008 \
          --hid-layers 2 --hid-dim 1500 --bn-dim 40 \
          --cmvn-opts "--norm-means=true --norm-vars=false" \
          --feat-type "traps" --splice 5 --traps-dct-basis 6 \
          --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
          --proto-opts "--block-softmax-dims=${ali_dim_csl}" \
          --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
          ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
      ;;
    sbn)
      # Stacked Bottleneck Network, no fMLLR in between,
      bn1_dim=80
      bn2_dim=30
      # Train the 1st part,
      dir_part1=${dir}_part1
      $cuda_cmd ${dir}_part1/log/train_nnet.log \
        steps/nnet/train.sh --learn-rate 0.008 \
          --hid-layers 2 --hid-dim 1500 --bn-dim $bn1_dim \
          --cmvn-opts "--norm-means=true --norm-vars=false" \
          --feat-type "traps" --splice 5 --traps-dct-basis 6 \
          --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
          --proto-opts "--block-softmax-dims=${ali_dim_csl}" \
          --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
          ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir_part1
      # Compose the feature_transform for the 2nd part,
      nnet-initialize <(echo "<Splice> <InputDim> $bn1_dim <OutputDim> $((13*bn1_dim)) <BuildVector> -10 -5:5 10 </BuildVector>") \
        $dir_part1/splice_for_bottleneck.nnet
      nnet-concat $dir_part1/final.feature_transform "nnet-copy --remove-last-components=4 $dir_part1/final.nnet - |" \
        $dir_part1/splice_for_bottleneck.nnet $dir_part1/final.feature_transform.part1
      # Train the 2nd part,
      $cuda_cmd $dir/log/train_nnet.log \
        steps/nnet/train.sh --learn-rate 0.008 \
          --feature-transform $dir_part1/final.feature_transform.part1 \
          --hid-layers 2 --hid-dim 1500 --bn-dim $bn2_dim \
          --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
          --proto-opts "--block-softmax-dims=${ali_dim_csl}" \
          --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
          ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
      ;;
    dnn_small)
      # 4 hidden layers, 1024 sigmoid neurons,
      $cuda_cmd $dir/log/train_nnet.log \
        steps/nnet/train.sh --learn-rate 0.008 \
          --cmvn-opts "--norm-means=true --norm-vars=true" \
          --delta-opts "--delta-order=2" --splice 5 \
          --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
          --proto-opts "--block-softmax-dims=${ali_dim_csl}" \
          --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
          ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
      ;;
    dnn)
      # 6 hidden layers, 2048 sigmoid neurons,
      $cuda_cmd $dir/log/train_nnet.log \
        steps/nnet/train.sh --learn-rate 0.008 \
          --hid-layers 6 --hid-dim 2048 \
          --cmvn-opts "--norm-means=true --norm-vars=false" \
          --delta-opts "--delta-order=2" --splice 5 \
          --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \
          --proto-opts "--block-softmax-dims=${ali_dim_csl}" \
          --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \
          ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir
      ;;
    *)
      echo "Unknown --nnet-type $nnet_type"; exit 1;
      ;;
  esac
fi

exit 0