Blame view
egs/babel_multilang/s5/local/nnet3/prepare_multilingual_egs.sh
4.73 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
#!/bin/bash
#
# This script generates a separate egs directory for each input language in a
# multilingual setup; each directory contains both egs.*.ark and egs.*.scp.
#
# This script will generally be called from the nnet3 multilingual training
# script.
#
# Usage:
#   prepare_multilingual_egs.sh [opts] N <data-dir1> .. <data-dirN> \
#       <ali-dir1> .. <ali-dirN> <egs-out1> .. <egs-outN>
#
# For every language index i, steps/nnet3/get_egs.sh is run on
# (<data-dir_i>, <ali-dir_i>) and writes into <egs-out_i>.  A language whose
# egs directory already exists is skipped.
#
# Exit status: 1 on a usage error or when get_egs.sh fails for any language.

echo "$0 $*"  # Print the command line for logging (printed once only)
. ./cmd.sh
set -e

# Begin configuration section
cmd=
stage=0
left_context=13
right_context=9
online_multi_ivector_dirs= # space-separated list of iVector dirs, one per
                           # language, in the same order as the data dirs.
                           # Can be used if we are including speaker
                           # information as iVectors.
                           # e.g. "exp/lang1/train-ivector exp/lang2/train-ivector"
samples_per_iter=400000 # this is the target number of egs in each archive of egs
                        # (prior to merging egs).  We probably should have called
                        # it egs_per_iter.  This is just a guideline; it will pick
                        # a number that divides the number of samples in the
                        # entire data.

# Configuration to allocate egs.
# NOTE: minibatch_size, num_archives and num_jobs are not referenced further
# down in this script; they are kept so the variables stay defined for callers
# that already set them.
minibatch_size=512
num_archives=100
num_jobs=10
cmvn_opts=
# End configuration section

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1

if [ $# -lt 4 ]; then
  # The options listed here are the ones declared in the configuration section
  # above, with the defaults that section actually sets.
  cat <<EOF
Usage: $0 [opts] N <data-dir1> .. <data-dirN> <ali-dir1> .. <ali-dirN>
                   <egs-out1> .. <egs-outN>
 e.g.: $0 2 data/lang1/train data/lang2/train exp/lang1/tri5_ali
       exp/lang2/tri5_ali exp/lang1/nnet3/egs exp/lang2/nnet3/egs

Main options (for others, see top of script file)
  --config <config-file>                           # config file containing options
  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs.
  --stage <stage|0>                                # stage handed on to steps/nnet3/get_egs.sh
  --samples-per-iter <#samples;400000>             # Target number of egs per archive (option is badly named)
  --left-context <width;13>                        # Number of frames on left side to append for feature input
  --right-context <width;9>                        # Number of frames on right side to append for feature input
  --cmvn-opts <opts>                               # handed on to get_egs.sh as --cmvn-opts when non-empty
  --online-multi-ivector-dirs "<dir1> .. <dirN>"   # iVector dir per language (same order as data dirs)
  --num-jobs <nj;10>                               # accepted, but currently not used by this script
EOF
  exit 1
fi

num_lang=$1
shift
args=("$@")

# N must be a plain non-negative integer, otherwise the arithmetic below would
# silently treat it as a variable name.
if ! [[ "$num_lang" =~ ^[0-9]+$ ]]; then
  echo "$0: first argument (number of languages) must be an integer, got '$num_lang'."
  exit 1
fi
num_lang=$((10#$num_lang))  # force base 10 so a leading zero is not read as octal

if [ "${#args[@]}" -ne $((num_lang * 3)) ]; then
  echo "$0: num of input dirs provided for all langs is not compatible with num-langs in input."
  exit 1
fi

# Read input data, ali and egs dir per lang.  The positional arguments are laid
# out as three consecutive groups of num_lang entries each.
multi_data_dirs=()
multi_ali_dirs=()
multi_egs_dirs=()
for ((l = 0; l < num_lang; l++)); do
  multi_data_dirs[l]=${args[l]}
  multi_ali_dirs[l]=${args[l + num_lang]}
  multi_egs_dirs[l]=${args[l + 2 * num_lang]}
done

echo "$0: Generate separate egs directory per language for multilingual training."

# Intentional word-splitting: turn the space-separated option string into an
# array indexed by language.
# shellcheck disable=SC2206
online_multi_ivector_dirs=(${online_multi_ivector_dirs[@]})

for ((lang_index = 0; lang_index < num_lang; lang_index++)); do
  data=${multi_data_dirs[lang_index]}
  ali_dir=${multi_ali_dirs[lang_index]}
  egs_dir=${multi_egs_dirs[lang_index]}
  # Empty when no iVector dir was supplied for this language.
  online_ivector_dir=${online_multi_ivector_dirs[lang_index]:-}
  echo "online_ivector_dir = $online_ivector_dir"

  # lang_list is not set anywhere in this script, so fall back to a
  # positional label; a value provided from outside is still honoured.
  lang_name=${lang_list[lang_index]:-lang${lang_index}}

  if [ -d "$egs_dir" ]; then
    echo "$0: $egs_dir already exists, skipping egs generation for $lang_name"
    continue
  fi

  echo "$0: Generate egs for $lang_name (data dir: $data)"
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d "$egs_dir/storage" ]; then
    # Compute the timestamp once: inside the brace expansion it would be
    # evaluated separately for each of the four generated paths.
    stamp=$(date +'%m_%d_%H_%M')
    utils/create_split_dir.pl \
      /export/b0{3,4,5,6}/"$USER"/kaldi-data/egs/"${lang_name}-${stamp}"/s5/"$egs_dir"/storage \
      "$egs_dir/storage"
  fi

  extra_opts=()
  if [ -n "$cmvn_opts" ]; then
    extra_opts+=(--cmvn-opts "$cmvn_opts")
  fi
  if [ -n "$online_ivector_dir" ]; then
    extra_opts+=(--online-ivector-dir "$online_ivector_dir")
  fi
  extra_opts+=(--left-context "$left_context")
  extra_opts+=(--right-context "$right_context")

  echo "$0: calling get_egs.sh"
  # egs_opts is not set anywhere in this script; it expands to nothing unless
  # provided from outside (e.g. the environment).  It is passed once, in the
  # later of its two original positions, and left unquoted on purpose so that
  # it can carry several options.
  # shellcheck disable=SC2086
  steps/nnet3/get_egs.sh "${extra_opts[@]}" \
    --samples-per-iter "$samples_per_iter" --stage "$stage" \
    --cmd "$cmd" $egs_opts \
    --generate-egs-scp true \
    "$data" "$ali_dir" "$egs_dir" || exit 1
done