Yannick Estève / ONTRAC-Kaldi

Blame view

egs/gp/s5/run.sh 12.4 KB
  #!/bin/bash -u
  
  # Copyright 2012  Arnab Ghoshal
  
  #
  # Copyright 2016 by Idiap Research Institute, http://www.idiap.ch
  #
  # See the file COPYING for the licence associated with this software.
  #
  # Author(s):
  #   Bogdan Vlasenko, February 2016
  #
  
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  # This script shows the steps needed to build a recognizer for certain languages
  # of the GlobalPhone corpus.
  # !!! NOTE: The current recipe assumes that you have pre-built LMs.
  # Tell the user up front that the recipe is easier to follow and debug when
  # its stages are pasted into a shell one at a time.
  echo "This shell script may run as-is on your system, but it is recommended
  that you run the commands one by one by copying and pasting into the shell."
  #exit 1;

  # Load the job-dispatch settings (presumably where $train_cmd, $decode_cmd
  # and $highmem_cmd come from -- they are used below but never set here).
  # An explicit if/else replaces the `test && source || echo` chain so that the
  # "not found" warning is printed only when cmd.sh is really absent, and not
  # also when cmd.sh exists but its last command returns non-zero.
  if [ -f cmd.sh ]; then
    source ./cmd.sh
  else
    echo "cmd.sh not found. Jobs may not execute properly."
  fi

  # CHECKING FOR AND INSTALLING REQUIRED TOOLS:
  #  This recipe requires shorten (3.6.1) and sox (14.3.2).
  #  If they are not found, the local/gp_install.sh script will install them.
  #  (The check is currently disabled.)
  #local/gp_check_tools.sh $PWD path.sh || exit 1;

  # path.sh is mandatory: abort if it cannot be sourced.
  . ./path.sh || { echo "Cannot source path.sh"; exit 1; }
  
  # Where the GlobalPhone corpus and the pre-built language models live.
  GP_CORPUS=/idiap/resource/database/GLOBALPHONE
  GP_LM=$PWD/language_models

  # Languages to process; exported so that child scripts can read the list.
  GP_LANGUAGES="FR GE RU"
  export GP_LANGUAGES

  # Data preparation: converts the audio files from shorten to WAV, which also
  # weeds out empty files and files with compression errors. Abort on failure.
  if ! local/gp_data_prep.sh \
      --config-dir=$PWD/conf \
      --corpus-dir=$GP_CORPUS \
      --languages="$GP_LANGUAGES"; then
    exit 1
  fi

  # Dictionary preparation. $GP_LANGUAGES is deliberately left unquoted so
  # that each language code is passed as its own argument. Abort on failure.
  if ! local/gp_dict_prep.sh --config-dir $PWD/conf $GP_CORPUS $GP_LANGUAGES; then
    exit 1
  fi
  
  # Build data/<lang>/lang from data/<lang>/local/dict for every language,
  # one language at a time; output goes to data/<lang>/prepare_lang.log.
  # The whole recipe stops at the first failure.
  for L in $GP_LANGUAGES; do
    lang_root=data/$L
    if ! utils/prepare_lang.sh --position-dependent-phones true \
        $lang_root/local/dict "<unk>" $lang_root/local/lang_tmp $lang_root/lang \
        > $lang_root/prepare_lang.log 2>&1; then
      exit 1
    fi
  done

  # Convert the available language models to FSTs and create a separate
  # decoding configuration for each. One background job per language; the
  # `wait` is the barrier. Exit statuses of the jobs are not checked.
  for L in $GP_LANGUAGES; do
    { local/gp_format_lm.sh --filter-vocab-sri true $GP_LM $L; } &
  done
  wait
  
  # Feature extraction: MFCCs followed by CMVN statistics for the train, dev
  # and eval sets of every language. Each (language, set) pair runs in its own
  # background subshell; the closing `wait` blocks until all have finished.
  for L in $GP_LANGUAGES; do
    mfccdir=mfcc/$L
    for part in train dev eval; do
      (
        data_dir=data/$L/$part
        log_dir=exp/$L/make_mfcc/$part
        steps/make_mfcc.sh --nj 6 --cmd "$train_cmd" $data_dir $log_dir $mfccdir
        steps/compute_cmvn_stats.sh $data_dir $log_dir $mfccdir
      ) &
    done
  done
  wait
  
  
  # Monophone training, one background job per language. stdout and stderr of
  # each job are captured in exp/<lang>/mono/train.log.
  for L in $GP_LANGUAGES; do
    mono_dir=exp/$L/mono
    mkdir -p $mono_dir
    steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
      data/$L/train data/$L/lang $mono_dir > $mono_dir/train.log 2>&1 &
  done
  wait
  
  
  # Monophone decoding: build the decoding graph for each language/LM pair and
  # decode dev, then eval. The subshells are backgrounded and there is no
  # `wait` right after this loop, so the next stage starts while these
  # decodes are still running; they are reaped by the next bare `wait`.
  for L in $GP_LANGUAGES; do
    for lm_suffix in tgpr_sri; do
      (
        model=exp/$L/mono
        graph=$model/graph_${lm_suffix}
        mkdir -p $graph
        utils/mkgraph.sh data/$L/lang_test_${lm_suffix} $model $graph

        for part in dev eval; do
          steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph data/$L/$part \
            $model/decode_${part}_${lm_suffix}
        done
      ) &
    done
  done
  
  # tri1: the first triphone pass. For each language (in parallel):
  #   1. align the training data with the monophone model -> exp/<lang>/mono_ali
  #   2. train a delta-feature triphone system on those alignments.
  # The two model-size arguments are tab-separated fields 2 and 3 of the
  # line(s) in conf/tri.conf that start with the language code.
  for L in $GP_LANGUAGES; do
    (
      train_data=data/$L/train
      lang_dir=data/$L/lang
      ali_dir=exp/$L/mono_ali
      out_dir=exp/$L/tri1

      mkdir -p $ali_dir
      steps/align_si.sh --nj 10 --cmd "$train_cmd" \
        $train_data $lang_dir exp/$L/mono $ali_dir \
        > $ali_dir/align.log 2>&1

      n_states=$(grep "^$L" conf/tri.conf | cut -f2)
      n_gauss=$(grep "^$L" conf/tri.conf | cut -f3)
      mkdir -p $out_dir
      steps/train_deltas.sh --cmd "$train_cmd" --cluster-thresh 100 \
        $n_states $n_gauss $train_data $lang_dir $ali_dir $out_dir \
        > $out_dir/train.log 2>&1
    ) &
  done
  # Bare `wait` blocks on every outstanding background job of this shell.
  wait
  
  # tri1 decoding: graph construction followed by dev and eval decoding, one
  # background subshell per language/LM pair. No `wait` follows this loop, so
  # the decodes overlap with the next stage.
  for L in $GP_LANGUAGES; do
    for lm_suffix in tgpr_sri; do
      (
        model=exp/$L/tri1
        graph=$model/graph_${lm_suffix}
        mkdir -p $graph
        utils/mkgraph.sh data/$L/lang_test_${lm_suffix} $model $graph

        for part in dev eval; do
          steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph data/$L/$part \
            $model/decode_${part}_${lm_suffix}
        done
      ) &
    done
  done
  
  
  # tri2a: a second delta-feature triphone system, trained (with
  # steps/train_deltas.sh, like tri1) on alignments produced by tri1.
  # For each language (in parallel):
  #   1. align the training data with tri1 -> exp/<lang>/tri1_ali
  #   2. train tri2a on those alignments, sized from conf/tri.conf.
  for L in $GP_LANGUAGES; do
    (
      train_data=data/$L/train
      lang_dir=data/$L/lang
      ali_dir=exp/$L/tri1_ali
      out_dir=exp/$L/tri2a

      mkdir -p $ali_dir
      steps/align_si.sh --nj 10 --cmd "$train_cmd" \
        $train_data $lang_dir exp/$L/tri1 $ali_dir \
        > $ali_dir/tri1_ali.log 2>&1

      n_states=$(grep "^$L" conf/tri.conf | cut -f2)
      n_gauss=$(grep "^$L" conf/tri.conf | cut -f3)
      mkdir -p $out_dir
      steps/train_deltas.sh --cmd "$train_cmd" --cluster-thresh 100 \
        $n_states $n_gauss $train_data $lang_dir $ali_dir $out_dir \
        > $out_dir/train.log 2>&1
    ) &
  done
  wait
  
  # tri2a decoding: graph construction followed by dev and eval decoding, one
  # background subshell per language/LM pair. No `wait` follows this loop, so
  # the decodes overlap with the next stage.
  for L in $GP_LANGUAGES; do
    for lm_suffix in tgpr_sri; do
      (
        model=exp/$L/tri2a
        graph=$model/graph_${lm_suffix}
        mkdir -p $graph
        utils/mkgraph.sh data/$L/lang_test_${lm_suffix} $model $graph

        for part in dev eval; do
          steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph data/$L/$part \
            $model/decode_${part}_${lm_suffix}
        done
      ) &
    done
  done
  
  # tri2b: LDA+MLLT system. It is trained on the tri1 alignments
  # (exp/<lang>/tri1_ali), not on alignments from tri2a, with features spliced
  # over 3 frames of context on each side.
  # NOTE(review): the training output is written to a file called
  # tri2_ali.log even though no alignment is done here -- name kept as is.
  for L in $GP_LANGUAGES; do
    (
      out_dir=exp/$L/tri2b
      n_states=$(grep "^$L" conf/tri.conf | cut -f2)
      n_gauss=$(grep "^$L" conf/tri.conf | cut -f3)
      mkdir -p $out_dir
      steps/train_lda_mllt.sh --cmd "$train_cmd" \
        --splice-opts "--left-context=3 --right-context=3" \
        $n_states $n_gauss data/$L/train data/$L/lang exp/$L/tri1_ali $out_dir \
        > $out_dir/tri2_ali.log 2>&1
    ) &
  done
  wait
  
  # for L in $GP_LANGUAGES; do
  #   mode=4
  # # Doing this only for the LMs whose vocabs were limited using SRILM, since the
  # # other approach didn't yield LMs for all languages.
  #   steps/lmrescore.sh --mode $mode --cmd "$decode_cmd" \
  #     data/$L/lang_test_tgpr_sri data/$L/lang_test_tg_sri data/$L/dev \
  #     exp/$L/tri2a/decode_dev_tgpr_sri exp/$L/tri2a/decode_dev_tg_sri$mode
  # done
  
  # tri2b decoding: graph construction followed by dev and eval decoding, one
  # background subshell per language/LM pair. The trailing `wait` blocks until
  # every background job of this shell has finished.
  for L in $GP_LANGUAGES; do
    for lm_suffix in tgpr_sri; do
      (
        model=exp/$L/tri2b
        graph=$model/graph_${lm_suffix}
        mkdir -p $graph
        utils/mkgraph.sh data/$L/lang_test_${lm_suffix} $model $graph

        for part in dev eval; do
          steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph data/$L/$part \
            $model/decode_${part}_${lm_suffix}
        done
      ) &
    done
  done
  wait
  
  # tri3b: LDA+MLLT+SAT system. For each language (in parallel):
  #   1. align the training data with tri2b, reusing its training graphs
  #      (--use-graphs true) -> exp/<lang>/tri2b_ali
  #   2. train the SAT system on those alignments, sized from conf/tri.conf.
  for L in $GP_LANGUAGES; do
    (
      train_data=data/$L/train
      lang_dir=data/$L/lang
      ali_dir=exp/$L/tri2b_ali
      out_dir=exp/$L/tri3b

      mkdir -p $ali_dir
      steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
        $train_data $lang_dir exp/$L/tri2b $ali_dir \
        > $ali_dir/align.log 2>&1

      n_states=$(grep "^$L" conf/tri.conf | cut -f2)
      n_gauss=$(grep "^$L" conf/tri.conf | cut -f3)
      mkdir -p $out_dir
      steps/train_sat.sh --cmd "$train_cmd" --cluster-thresh 100 \
        $n_states $n_gauss $train_data $lang_dir $ali_dir $out_dir \
        > $out_dir/train.log 2>&1
    ) &
  done
  wait
  
  # tri3b decoding with steps/decode_fmllr.sh: graph construction followed by
  # dev and eval decoding, one background subshell per language/LM pair.
  for L in $GP_LANGUAGES; do
    for lm_suffix in tgpr_sri; do
      (
        model=exp/$L/tri3b
        graph=$model/graph_${lm_suffix}
        mkdir -p $graph
        utils/mkgraph.sh data/$L/lang_test_${lm_suffix} $model $graph

        # Only the dev output directory is pre-created here; the eval one is
        # not.
        mkdir -p $model/decode_dev_${lm_suffix}
        for part in dev eval; do
          steps/decode_fmllr.sh --nj 5 --cmd "$decode_cmd" \
            $graph data/$L/$part $model/decode_${part}_${lm_suffix}
        done
      ) &
    done
  done
  wait
  
  ## sgmm2_4a: SGMM built on top of the LDA+MLLT+SAT (tri3b) system.
  ## For each language (in parallel):
  ##   1. fMLLR-align the training data with tri3b -> exp/<lang>/tri3b_ali
  ##   2. train a UBM (first positional argument 600) -> exp/<lang>/ubm4a
  ##   3. train the SGMM from that UBM -> exp/<lang>/sgmm2_4a
  ## The two SGMM size arguments are tab-separated fields 2 and 3 of the
  ## line(s) in conf/sgmm.conf that start with the language code.
  ## Output of these steps is not redirected to log files.
  for L in $GP_LANGUAGES; do
    (
      train_data=data/$L/train
      lang_dir=data/$L/lang
      ali_dir=exp/$L/tri3b_ali
      ubm_dir=exp/$L/ubm4a
      sgmm_dir=exp/$L/sgmm2_4a

      mkdir -p $ali_dir
      steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \
        $train_data $lang_dir exp/$L/tri3b $ali_dir

      n_states=$(grep "^$L" conf/sgmm.conf | cut -f2)
      n_substates=$(grep "^$L" conf/sgmm.conf | cut -f3)

      mkdir -p $ubm_dir
      steps/train_ubm.sh --cmd "$train_cmd" \
        600 $train_data $lang_dir $ali_dir $ubm_dir

      mkdir -p $sgmm_dir
      steps/train_sgmm2.sh --cmd "$train_cmd" \
        $n_states $n_substates $train_data $lang_dir $ali_dir \
        $ubm_dir/final.ubm $sgmm_dir
    ) &
  done
  wait
  
  ## sgmm2_4a decoding. The matching tri3b decode directory is passed as
  ## --transform-dir for each of dev and eval.
  ## NOTE(review): unlike the other decoding stages, the subshell here is NOT
  ## sent to the background, so languages are decoded one after another and the
  ## trailing `wait` has nothing from this loop to wait for -- confirm whether
  ## that is intentional.
  for L in $GP_LANGUAGES; do
    for lm_suffix in tgpr_sri; do
      (
        model=exp/$L/sgmm2_4a
        graph=$model/graph_${lm_suffix}
        mkdir -p $graph
        utils/mkgraph.sh data/$L/lang_test_${lm_suffix} $model $graph

        for part in dev eval; do
          steps/decode_sgmm2.sh --use-fmllr true --nj 5 --cmd "$decode_cmd" \
            --transform-dir exp/$L/tri3b/decode_${part}_${lm_suffix} \
            $graph data/$L/$part $model/decode_${part}_${lm_suffix}
        done
      )
    done
  done
  wait
  
  
  # Discriminative (MMI) training of the sgmm2_4a system. For each language
  # (in parallel):
  #   1. align the training data with sgmm2_4a -> exp/<lang>/sgmm2_4a_ali
  #   2. generate denominator lattices       -> exp/<lang>/sgmm2_4a_denlats
  #   3. MMI training with --boost 0.1        -> exp/<lang>/sgmm2_4a_mmi_b0.1
  # All three steps take exp/<lang>/tri3b_ali as --transform-dir.
  # The lm_suffix loop has a single value and the variable is not used in the
  # body.
  for L in $GP_LANGUAGES; do
    for lm_suffix in tgpr_sri; do
      (
        train_data=data/$L/train
        lang_dir=data/$L/lang
        xform_dir=exp/$L/tri3b_ali
        ali_dir=exp/$L/sgmm2_4a_ali
        denlat_dir=exp/$L/sgmm2_4a_denlats
        mmi_dir=exp/$L/sgmm2_4a_mmi_b0.1

        mkdir -p $ali_dir
        steps/align_sgmm2.sh --nj 10 --cmd "$train_cmd" \
          --transform-dir $xform_dir --use-graphs true --use-gselect true \
          $train_data $lang_dir exp/$L/sgmm2_4a $ali_dir

        mkdir -p $denlat_dir
        steps/make_denlats_sgmm2.sh --nj 10 --sub-split 10 --cmd "$decode_cmd" \
          --transform-dir $xform_dir $train_data $lang_dir $ali_dir $denlat_dir

        mkdir -p $mmi_dir
        steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" \
          --transform-dir $xform_dir --boost 0.1 $train_data $lang_dir \
          $ali_dir $denlat_dir $mmi_dir
      ) &
    done
  done
  wait
  
  # Decode with the sgmm2_4a_mmi_b0.1 models. For MMI iterations 1-4 and for
  # both dev and eval, steps/decode_sgmm2_rescore.sh is given the existing
  # sgmm2_4a decode directory as input and writes its results to
  # exp/<lang>/sgmm2_4a_mmi_b0.1/decode_<set>_<lm>_it<iter>. The matching tri3b
  # decode directory is passed as --transform-dir.
  # One background subshell per language/LM pair; iterations and sets run
  # sequentially inside it.
  # (The former graph_dir assignment was dropped: nothing in this stage reads
  # it, as the rescoring script is handed the lang directory instead.)
  for L in $GP_LANGUAGES; do
   for lm_suffix in tgpr_sri; do
    (
      for iter in 1 2 3 4; do
       for test in dev eval; do
        steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" \
          --iter $iter --transform-dir exp/$L/tri3b/decode_${test}_${lm_suffix} data/$L/lang_test_${lm_suffix} \
          data/$L/${test} exp/$L/sgmm2_4a/decode_${test}_${lm_suffix} \
          exp/$L/sgmm2_4a_mmi_b0.1/decode_${test}_${lm_suffix}_it$iter
       done
      done
    ) &
   done
  done
  wait;
  
  
  # SGMMs starting from the non-SAT triphone system (tri1 alignments), both
  # without speaker vectors (sgmm2a, --spk-dim 0) and with them (sgmm2b).
  # For each language (in parallel):
  #   1. train a UBM (first positional argument 400) -> exp/<lang>/ubm2a
  #   2. train sgmm2a and sgmm2b from that UBM.
  # Both SGMMs are sized with tab-separated fields 2 and 3 of the line(s) in
  # conf/sgmm.conf that start with the language code.
  for L in $GP_LANGUAGES; do
    (
      mkdir -p exp/$L/ubm2a
      steps/train_ubm.sh --cmd "$train_cmd" \
          400 data/$L/train data/$L/lang exp/$L/tri1_ali exp/$L/ubm2a \
          >& exp/$L/ubm2a/train.log

      num_states=$(grep "^$L" conf/sgmm.conf | cut -f2)
      num_substates=$(grep "^$L" conf/sgmm.conf | cut -f3)
      mkdir -p exp/$L/sgmm2a
      steps/train_sgmm2.sh --cmd "$train_cmd" --cluster-thresh 100 --spk-dim 0 \
        $num_states $num_substates data/$L/train data/$L/lang exp/$L/tri1_ali \
        exp/$L/ubm2a/final.ubm exp/$L/sgmm2a >& exp/$L/sgmm2a/train.log

      # Use $num_substates here as well. This stage used to pass $num_gauss,
      # which is never assigned in this subshell (earlier assignments were made
      # inside other subshells and do not survive them). Under `bash -u` that
      # aborts the subshell with "unbound variable"; without -u it silently
      # drops a positional argument.
      mkdir -p exp/$L/sgmm2b
      steps/train_sgmm2.sh --cmd "$train_cmd" --cluster-thresh 100 \
        $num_states $num_substates data/$L/train data/$L/lang exp/$L/tri1_ali \
        exp/$L/ubm2a/final.ubm exp/$L/sgmm2b >& exp/$L/sgmm2b/train.log
    ) &
  done
  wait
  
  # Decode the dev set with sgmm2a and sgmm2b. One background subshell per
  # (language, model, LM) combination.
  for L in $GP_LANGUAGES; do
    # Need separate decoding graphs for models with and without speaker vectors,
    # since the trees may be different.
    for sgmm in sgmm2a sgmm2b; do
      for lm_suffix in tgpr_sri; do
        (
          graph_dir=exp/$L/$sgmm/graph_${lm_suffix}
          mkdir -p $graph_dir
          # Graph building is dispatched through $highmem_cmd with
          # $graph_dir/mkgraph.log as its first argument.
          # NOTE(review): $highmem_cmd is not set anywhere in this script;
          # presumably cmd.sh provides it. If it is unset, `bash -u` aborts
          # this subshell -- confirm against cmd.sh.
          $highmem_cmd $graph_dir/mkgraph.log \
            utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/$sgmm $graph_dir

          steps/decode_sgmm2.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \
            exp/$L/$sgmm/decode_dev_${lm_suffix}
        ) &
      done  # loop over LMs
    done    # loop over model with and without speaker vecs
  done      # loop over languages
  # Barrier for the decoding jobs launched above, so that the script does not
  # return while they are still running in the background.
  wait