Yannick Estève / ONTRAC-Kaldi

Blame view

egs/zeroth_korean/s5/run.sh 9.8 KB
  #!/bin/bash
  #
  # Based mostly on the WSJ/Librispeech recipe. 
  # The training/testing database is described in http://www.openslr.org/40/
  # This corpus consists of 51hrs korean speech with cleaned automatic transcripts:
  #
  # Copyright  2018  Atlas Guide (Author : Lucas Jo)
  #            2018  Gridspace Inc. (Author: Wonkyum Lee)
  #
  # Apache 2.0
  #
  
  # Check list before start
  # 1. required software: Morfessor-2.0.1 (see tools/extras/install_morfessor.sh)
  
  stage=0
  db_dir=./db
  nj=16
  
  chain_train=true
  decode=true # set false if you don't want to decode each GMM model
  decode_rescoring=true # set false if you don't want to rescore with large language model
  test_set="test_clean"
  
  . ./cmd.sh
  . ./path.sh
  . utils/parse_options.sh  # e.g. this parses the --stage option if supplied.
  
  # you might not want to do this for interactive shells.
  set -e
  
  if [ $stage -le 0 ]; then
    # download the data.  
    local/download_and_untar.sh $db_dir
  fi
  
  if [ $stage -le 1 ]; then
    # format the data as Kaldi data directories
    for part in train_data_01 test_data_01; do
    	# use underscore-separated names in data directories.
    	local/data_prep.sh $db_dir $part
    done
  fi
  
  if [ $stage -le 2 ]; then
    # update segmentation of transcripts
    for part in train_data_01 test_data_01; do
    	local/update_segmentation.sh data/$part data/local/lm
    done
  fi
  
  if [ $stage -le 3 ]; then
    # prepare dictionary and language model 
    local/prepare_dict.sh data/local/lm data/local/dict_nosp
    
    utils/prepare_lang.sh data/local/dict_nosp \
    	"<UNK>" data/local/lang_tmp_nosp data/lang_nosp
  fi
  
  if [ $stage -le 4 ]; then
    # build testing language model
    local/format_lms.sh --src-dir data/lang_nosp data/local/lm
  
    # re-scoring language model
    if $decode_rescoring ; then
      utils/build_const_arpa_lm.sh data/local/lm/zeroth.lm.tg.arpa.gz \
      	data/lang_nosp data/lang_nosp_test_tglarge
      utils/build_const_arpa_lm.sh data/local/lm/zeroth.lm.fg.arpa.gz \
      	  data/lang_nosp data/lang_nosp_test_fglarge
    fi
  fi
  
  
  if [ $stage -le 5 ]; then
    # Feature extraction (MFCC)
    mfccdir=mfcc
    for part in train_data_01 test_data_01; do
    	steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj data/$part exp/make_mfcc/$part $mfccdir
    	steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir
    done
    
    # ... and then combine data sets into one (for later extension)
    utils/combine_data.sh \
      data/train_clean data/train_data_01
    
    utils/combine_data.sh \
      data/test_clean data/test_data_01
    
    # Make some small data subsets for early system-build stages.
    utils/subset_data_dir.sh --shortest data/train_clean 2000 data/train_2kshort
    utils/subset_data_dir.sh data/train_clean 5000 data/train_5k
    utils/subset_data_dir.sh data/train_clean 10000 data/train_10k
  fi
  
  if [ $stage -le 6 ]; then
    echo "$0: #### Monophone Training ###########"
    # train a monophone system with 2k short utts
    steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
    	data/train_2kshort data/lang_nosp exp/mono
    if $decode; then
      utils/mkgraph.sh data/lang_nosp_test_tgsmall exp/mono exp/mono/graph_nosp_tgsmall
      nspk=$(wc -l <data/${test_set}/spk2utt)
      steps/decode.sh --nj $nspk --cmd "$decode_cmd" \
        exp/mono/graph_nosp_tgsmall data/${test_set} exp/mono/decode_nosp_tgsmall_${test_set}
      if $decode_rescoring; then
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
          data/$test_set exp/mono/decode_nosp_{tgsmall,tglarge}_$test_set
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \
          data/$test_set exp/mono/decode_nosp_{tgsmall,fglarge}_$test_set
      fi 
    fi
  fi
  
  if [ $stage -le 7 ]; then
    echo "$0: #### Triphone Training, delta + delta-delta ###########"
    steps/align_si.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
    	data/train_5k data/lang_nosp exp/mono exp/mono_ali_5k
    # train a first delta + delta-delta triphone system on a subset of 5000 utterances
    steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
        2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1
    if $decode; then
      utils/mkgraph.sh data/lang_nosp_test_tgsmall exp/tri1 exp/tri1/graph_nosp_tgsmall
      nspk=$(wc -l <data/${test_set}/spk2utt)
      steps/decode.sh --nj $nspk --cmd "$decode_cmd" \
        exp/tri1/graph_nosp_tgsmall data/${test_set} exp/tri1/decode_nosp_tgsmall_${test_set}
      if $decode_rescoring; then
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
          data/$test_set exp/tri1/decode_nosp_{tgsmall,tglarge}_$test_set
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \
          data/$test_set exp/tri1/decode_nosp_{tgsmall,fglarge}_$test_set
      fi
    fi
  fi
  
  if [ $stage -le 8 ]; then
    echo "$0: #### Triphone Training, LDA+MLLT ###########"
    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k
    # train an LDA+MLLT system.
    steps/train_lda_mllt.sh --cmd "$train_cmd" \
       --splice-opts "--left-context=3 --right-context=3" 2500 15000 \
       data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2
    if $decode; then
      utils/mkgraph.sh data/lang_nosp_test_tgsmall exp/tri2 exp/tri2/graph_nosp_tgsmall
      nspk=$(wc -l <data/${test_set}/spk2utt)
      steps/decode.sh --nj $nspk --cmd "$decode_cmd" \
        exp/tri2/graph_nosp_tgsmall data/${test_set} exp/tri2/decode_nosp_tgsmall_${test_set}
      if $decode_rescoring; then
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
          data/$test_set exp/tri2/decode_nosp_{tgsmall,tglarge}_$test_set
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \
          data/$test_set exp/tri2/decode_nosp_{tgsmall,fglarge}_$test_set
      fi
    fi
  fi
  
  
  if [ $stage -le 9 ]; then
    echo "$0: #### Triphone Training, LDA+MLLT+SAT ###########"
    # Align the entire train_clean using the tri2 model
    steps/align_si.sh  --nj $nj --cmd "$train_cmd" --use-graphs true \
      data/train_clean data/lang_nosp exp/tri2 exp/tri2_ali_train_clean
    
    # Train tri3, which is LDA+MLLT+SAT on the entire train_clean
    steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
      data/train_clean data/lang_nosp exp/tri2_ali_train_clean exp/tri3
    if $decode; then
      utils/mkgraph.sh data/lang_nosp_test_tgsmall exp/tri3 exp/tri3/graph_nosp_tgsmall
      nspk=$(wc -l <data/${test_set}/spk2utt)
      steps/decode_fmllr.sh --nj $nspk --cmd "$decode_cmd" \
        exp/tri3/graph_nosp_tgsmall data/${test_set} exp/tri3/decode_nosp_tgsmall_${test_set}
      if $decode_rescoring; then
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \
          data/$test_set exp/tri3/decode_nosp_{tgsmall,tglarge}_$test_set
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \
          data/$test_set exp/tri3/decode_nosp_{tgsmall,fglarge}_$test_set
      fi
    fi
  fi 
  
  if [ $stage -le 10 ]; then
    echo "$0: #### Re-computing pronunciation model using tri3 model ###########"
    # Now we compute the pronunciation and silence probabilities from training data,
    # and re-create the lang directory.
    # silence transition probability ...
    steps/get_prons.sh --cmd "$train_cmd" \
          data/train_clean data/lang_nosp exp/tri3
    
    utils/dict_dir_add_pronprobs.sh --max-normalize true \
          data/local/dict_nosp \
            exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \
              exp/tri3/pron_bigram_counts_nowb.txt data/local/dict
    
    utils/prepare_lang.sh data/local/dict \
          "<UNK>" data/local/lang_tmp data/lang
    
    local/format_lms.sh --src-dir data/lang data/local/lm
    
    utils/build_const_arpa_lm.sh \
          data/local/lm/zeroth.lm.tg.arpa.gz data/lang data/lang_test_tglarge
    utils/build_const_arpa_lm.sh \
          data/local/lm/zeroth.lm.fg.arpa.gz data/lang data/lang_test_fglarge
  
    if $decode; then
      utils/mkgraph.sh data/lang_test_tgsmall exp/tri3 exp/tri3/graph_tgsmall
      nspk=$(wc -l <data/${test_set}/spk2utt)
      steps/decode_fmllr.sh --nj $nspk --cmd "$decode_cmd" \
        exp/tri3/graph_tgsmall data/${test_set} exp/tri3/decode_tgsmall_${test_set}
      if $decode_rescoring; then
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
          data/$test_set exp/tri3/decode_{tgsmall,tglarge}_$test_set
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
          data/$test_set exp/tri3/decode_{tgsmall,fglarge}_$test_set
      fi
    fi
  fi
  
  if [ $stage -le 11 ]; then
  
    echo "$0: #### SAT again on train_clean ###########"
    # align the entire train_clean using the tri3 model
    steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
      data/train_clean data/lang exp/tri3 exp/tri3_ali_train_clean
    
    # train another LDA+MLLT+SAT system on the entire train_clean
    steps/train_sat.sh  --cmd "$train_cmd" 4200 40000 \
      data/train_clean data/lang exp/tri3_ali_train_clean exp/tri4
   
    if $decode; then
      utils/mkgraph.sh data/lang_test_tgsmall exp/tri4 exp/tri4/graph_tgsmall
      nspk=$(wc -l <data/${test_set}/spk2utt)
      steps/decode_fmllr.sh --nj $nspk --cmd "$decode_cmd" \
        exp/tri4/graph_tgsmall data/${test_set} exp/tri4/decode_tgsmall_${test_set}
      if $decode_rescoring; then
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
          data/$test_set exp/tri4/decode_{tgsmall,tglarge}_$test_set
        steps/lmrescore_const_arpa.sh \
          --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
          data/$test_set exp/tri4/decode_{tgsmall,fglarge}_$test_set
      fi
    fi 
  fi 
  
  echo "$0: GMM trainig is Done"
  
  if $chain_train; then
    ## Training Chain Acoustic model using clean data set
    echo "$0: #### chain training  ###########"
    local/chain/run_tdnn.sh
  fi 
  
  exit 0;