Blame view

egs/babel_multilang/s5/local/nnet3/prepare_multilingual_egs.sh 4.73 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
  #!/bin/bash
  #
  # This script generates a separate egs directory for each input language
  # in a multilingual setup; each directory contains both egs.*.ark and egs.*.scp.
  #
  # This script will generally be called from nnet3 multilingual training script.
  
  # Log the exact invocation once, before anything else runs.  "$*" joins the
  # arguments with single spaces (default IFS), which is what the log line needs.
  echo "$0 $*"
  . ./cmd.sh
  set -e


  # Begin configuration section
  # Each variable below is a default.  parse_options.sh (sourced at the end of
  # this section) is expected to let callers override it with the matching
  # --option-name flag, e.g. --left-context for $left_context.
  cmd=                           # how to run jobs; forwarded to get_egs.sh as --cmd
  stage=0                        # forwarded to get_egs.sh as --stage
  left_context=13                # forwarded to get_egs.sh as --left-context
  right_context=9                # forwarded to get_egs.sh as --right-context
  online_multi_ivector_dirs=     # list of iVector dirs for all languages;
                                 # can be used if we are including speaker information as iVectors.
                                 # e.g. "exp/lang1/train-ivector exp/lang2/train-ivector"
  samples_per_iter=400000 # this is the target number of egs in each archive of egs
                          # (prior to merging egs).  We probably should have called
                          # it egs_per_iter. This is just a guideline; it will pick
                          # a number that divides the number of samples in the
                          # entire data.
  # Configuration to allocate egs.
  # NOTE(review): minibatch_size, num_archives and num_jobs are not referenced
  # anywhere else in this script; they are kept so that callers passing the
  # corresponding options keep working.
  minibatch_size=512
  num_archives=100
  num_jobs=10
  cmvn_opts=                     # if non-empty, forwarded to get_egs.sh as --cmvn-opts

  if [ -f path.sh ]; then . ./path.sh; fi
  . parse_options.sh || exit 1;
  
  # At least four positional arguments are needed: N plus one data dir, one
  # alignment dir and one egs output dir (the N=1 case).  Otherwise print the
  # usage text and stop.
  # The option list below is kept in sync with the configuration section at the
  # top of the script: only options that have a variable there are advertised,
  # and the quoted defaults are the values assigned there.
  if [ $# -lt 4 ]; then
    echo "Usage: $0 [opts] N <data-dir1> .. <data-dirN> <ali-dir1> .. <ali-dirN>"
    echo " <egs-out1> .. <egs-outN>"
    echo " e.g.: $0 2 data/lang1/train data/lang2/train exp/lang1/tri5_ali"
    echo " exp/lang2/tri5_ali exp/lang1/nnet3/egs exp/lang2/nnet3/egs"
    echo ""
    echo "Main options (for others, see top of script file)"
    echo "  --config <config-file>                           # config file containing options"
    echo "  --num-jobs <nj>                                  # The maximum number of jobs you want to run in"
    echo "                                                   # parallel (increase this only if you have good disk and"
    echo "                                                   # network speed).  default=10"
    echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
    echo "  --samples-per-iter <#samples;400000>             # Target number of egs per archive (option is badly named)"
    echo "  --left-context <width;13>                        # Number of frames on left side to append for feature input"
    echo "  --right-context <width;9>                        # Number of frames on right side to append for feature input"
    echo "  --cmvn-opts <opts>                               # CMVN options passed on to get_egs.sh"
    echo "  --online-multi-ivector-dirs <dirs>               # Space-separated list of iVector dirs, one per language"
    echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
    echo "                                                   # the middle."

    exit 1;
  fi
  
  # Positional layout: N, then N data dirs, N alignment dirs, N egs output dirs.
  num_lang=$1
  shift
  args=("$@")

  # Reject a non-numeric N up front; otherwise the arithmetic below fails with
  # an obscure shell error instead of a usable message.
  if ! [[ $num_lang =~ ^[0-9]+$ ]]; then
    echo "$0: number of languages must be a non-negative integer, got '$num_lang'."
    exit 1
  fi
  # Force base 10 so that a value with leading zeros is not read as octal.
  num_lang=$((10#$num_lang))

  if [ "${#args[@]}" -ne $((num_lang * 3)) ]; then
    echo "$0: num of input dirs provided for all langs is not compatible with num-langs in input." && exit 1;
  fi

  # read input data, ali and egs dir per lang
  # Start from empty arrays so that index l always refers to language l.
  multi_data_dirs=()
  multi_ali_dirs=()
  multi_egs_dirs=()
  for ((l = 0; l < num_lang; l++)); do
    multi_data_dirs[l]=${args[l]}
    multi_ali_dirs[l]=${args[l + num_lang]}
    multi_egs_dirs[l]=${args[l + 2 * num_lang]}
  done
  
  # For every language, create its egs directory with steps/nnet3/get_egs.sh
  # unless that directory already exists.
  echo "$0: Generate separate egs directory per language for multilingual training."
  # The iVector dirs arrive as one space-separated string; word splitting is
  # used on purpose to turn it into an array indexed like the language lists.
  # shellcheck disable=SC2206
  online_multi_ivector_dirs=(${online_multi_ivector_dirs[@]})
  for ((lang_index = 0; lang_index < num_lang; lang_index++)); do
    data=${multi_data_dirs[lang_index]}
    ali_dir=${multi_ali_dirs[lang_index]}
    egs_dir=${multi_egs_dirs[lang_index]}
    # Stays empty when no iVector dir was supplied for this language.
    online_ivector_dir=${online_multi_ivector_dirs[lang_index]:-}
    echo "online_ivector_dir = $online_ivector_dir"
    # An existing egs dir is taken as "already generated" and is skipped.
    if [ ! -d "$egs_dir" ]; then
      # lang_list is not assigned anywhere in this script, so fall back to the
      # data dir to keep the log line informative.
      lang_label=${lang_list[lang_index]:-$data}
      echo "$0: Generate egs for $lang_label"
      if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d "$egs_dir/storage" ]; then
        # Evaluate the timestamp once so that all brace-expanded storage paths
        # below share the same value.
        stamp=$(date +'%m_%d_%H_%M')
        # NOTE(review): the path component built from lang_list is empty unless
        # something outside this script defines that array - confirm intent.
        utils/create_split_dir.pl \
         /export/b0{3,4,5,6}/"$USER"/kaldi-data/egs/"${lang_list[lang_index]}"-"$stamp"/s5/"$egs_dir"/storage "$egs_dir/storage"
      fi

      extra_opts=()
      if [ -n "$cmvn_opts" ]; then
        extra_opts+=(--cmvn-opts "$cmvn_opts")
      fi
      if [ -n "$online_ivector_dir" ]; then
        extra_opts+=(--online-ivector-dir "$online_ivector_dir")
      fi
      extra_opts+=(--left-context "$left_context")
      extra_opts+=(--right-context "$right_context")
      echo "$0: calling get_egs.sh"
      # $egs_opts is left unquoted on purpose so that it can expand to several
      # options.  It is passed once, in the later of its two original
      # positions, so anything it sets still comes after (and so can override)
      # the options placed before it.
      # shellcheck disable=SC2086
      steps/nnet3/get_egs.sh "${extra_opts[@]}" \
          --samples-per-iter "$samples_per_iter" --stage "$stage" \
          --cmd "$cmd" $egs_opts \
          --generate-egs-scp true \
          "$data" "$ali_dir" "$egs_dir" || exit 1;

    fi
  done