Yannick Estève / ONTRAC-Kaldi

Blame view

egs/hub4_english/s5/run.sh 8.12 KB
  #!/bin/bash
  
  # Copyright 2016   Vimal Manohar
  # Apache 2.0.
  
  # See README.txt for more info on data required.
  
  . ./cmd.sh
  . ./path.sh
  
  set -o pipefail
  set -e
  
  mfccdir=`pwd`/mfcc
  nj=40
  stage=-1
  
  . utils/parse_options.sh
  
  # Training corpora
  
  # 1996 English Broadcast News Train (HUB4)
  hub4_96_train_transcripts=/export/corpora/LDC/LDC97T22/hub4_eng_train_trans
  hub4_96_train_speech=/export/corpora/LDC/LDC97S44/data
  # 1997 English Broadcast News Train (HUB4)
  hub4_97_train_transcripts=/export/corpora/LDC/LDC98T28/hub4e97_trans_980217
  hub4_97_train_speech=/export/corpora/LDC/LDC98S71/97_eng_bns_hub4
  # 1996 CSR HUB4 Language Model
  csr_hub4_lm=/export/corpora/LDC/LDC98T31/1996_csr_hub4_model
  # 1995 CSR-IV HUB4 corpus
  csr95_hub4=/export/corpora/LDC/LDC96S31/csr95_hub4
  # North American News Text Corpus
  NA_text=/export/corpora/LDC/LDC95T21
  # North American News Text Supplement Corpus
  NA_text_supp=/export/corpora/LDC/LDC98T30/northam_news_txt_sup
  
  # Test corpora
  
  # 1996 English Broadcast News Dev and Eval (HUB4)
  hub4_96_eval=/export/corpora/LDC/LDC97S66/1996_eng_bcast_dev_eval
  # 1997 HUB4 English Evaluation corpus
  hub4_97_eval=/export/corpora/LDC/LDC2002S11/hub4e_97
  # 1998 HUB4 Broadcast News Evaluation English Test Material
  hub4_98_eval=/export/corpora/LDC/LDC2000S86
  # 1999 HUB4 Broadcast News Evaluation English Test Material
  hub4_99_eval=/export/corpora5/LDC/LDC2000S88/hub4_1999
  
  # Test sets used -- Uncomment and keep only test sets needed
  test_sets="eval97.pem"
  # test_sets="dev96ue dev96pe eval96 eval96.pem eval97 eval97.pem eval98 eval98.pem eval99_1 eval99_1.pem eval99_2 eval99_2.pem"
  
  if [ $stage -le 0 ]; then
    # Prepare 1996 English Broadcast News Train (HUB4)
    local/data_prep/prepare_1996_bn_data.sh \
      $hub4_96_train_transcripts \
      $hub4_96_train_speech \
      data/local/data/train_bn96
  
    # Prepare 1997 English Broadcast News Train (HUB4)
    local/data_prep/prepare_1997_bn_data.sh \
      $hub4_97_train_transcripts \
      $hub4_97_train_speech \
      data/local/data/train_bn97
  fi
  
  # Install Beautiful Soup 4 python package for parsing SGML-like files
  # in CSR-IV HUB4 corpus
  if [ ! -d tools/beautifulsoup4 ]; then
    mkdir -p tools
    pip install -t tools/beautifulsoup4 beautifulsoup4
  fi
  export PYTHONPATH=$PWD/tools/beautifulsoup4:$PYTHONPATH
  
  if [ $stage -le 1 ]; then
    if [ ! -f $csr_hub4_lm/utils.tar ]; then
      echo "Expected CSR-IV utils.tar to be found"
      exit 1
    fi
  
    mkdir -p tools/csr4_utils
    (
      cd tools/csr4_utils
      tar -xvf $csr_hub4_lm/utils.tar
    )
  
    chmod a+w tools/csr4_utils
    patch -u -d tools/csr4_utils -p3 < local/data_prep/csr4_utils.patch
  fi
  
  if [ $stage -le 2 ]; then
    # Prepare 1995 CSR-IV HUB4 corpus
    local/data_prep/prepare_1995_csr_hub4_corpus.sh \
      $csr95_hub4 data/local/data/csr95_hub4
  fi
  
  if [ $stage -le 3 ]; then
    # Prepare North American News Text Corpus
    local/data_prep/prepare_na_news_text_corpus.sh --nj 40 --cmd "$train_cmd" \
       $NA_text data/local/data/na_news
  
    # Prepare North American News Text Supplement Corpus
    local/data_prep/prepare_na_news_text_supplement.sh --nj 10 --cmd "$train_cmd" \
      $NA_text_supp data/local/data/na_news_supp
  fi
  
  if [ $stage -le 4 ]; then
    # Prepare 1996 CSR HUB4 Language Model
    local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh --nj 10 --cmd "$train_cmd" \
       $csr_hub4_lm data/local/data/csr96_hub4
  fi
  
  if [ $stage -le 5 ]; then
    # Prepare 1996 English Broadcast News Dev and Eval (HUB4)
    local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh \
      $hub4_96_eval \
      data/local/data/hub4_96_dev_eval
  
    # Prepare 1997 HUB4 English Evaluation corpus
    local/data_prep/prepare_1997_hub4_bn_eng_eval.sh \
      $hub4_97_eval data/local/data/eval97
  
    # Prepare 1998 HUB4 Broadcast News Evaluation English Test Material
    local/data_prep/prepare_1998_hub4_bn_eng_eval.sh \
      $hub4_98_eval data/local/data/eval98
  
    # Prepare 1999 HUB4 Broadcast News Evaluation English Test Material
    local/data_prep/prepare_1999_hub4_bn_eng_eval.sh \
      $hub4_99_eval data/local/data/eval99
  fi
  
  if [ $stage -le 6 ]; then
    local/format_data.sh
  fi
  
  if [ $stage -le 7 ]; then
    local/train_lm.sh
  fi
  
  if [ $stage -le 8 ]; then
    local/prepare_dict.sh --dict-suffix "_nosp" \
      data/local/local_lm/data/work/wordlist
  
    utils/prepare_lang.sh data/local/dict_nosp \
      "<unk>" data/local/lang_tmp_nosp data/lang_nosp
  fi
  
  if [ $stage -le 9 ]; then
    local/format_lms.sh --local-lm-dir data/local/local_lm
  fi
  
  if [ $stage -le 10 ]; then
    for x in train $test_sets; do
      this_nj=$(cat data/$x/utt2spk | wc -l)
      if [ $this_nj -gt 30 ]; then
        this_nj=30
      fi
  
      steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj $this_nj \
        --cmd "$train_cmd" \
        data/$x exp/make_mfcc $mfccdir
      steps/compute_cmvn_stats.sh data/$x exp/make_mfcc $mfccdir
      utils/fix_data_dir.sh data/$x
    done
  fi
  
  if [ $stage -le 15 ]; then
    utils/subset_data_dir.sh --shortest data/train 1000 data/train_1kshort
    utils/subset_data_dir.sh data/train 2000 data/train_2k
  
    # Note: the --boost-silence option should probably be omitted by default
    # for normal setups.  It doesn't always help. [it's to discourage non-silence
    # models from modeling silence.]
    steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
      data/train_1kshort data/lang_nosp exp/mono0a
  fi
  
  if [ $stage -le 16 ]; then
    steps/align_si.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
      data/train_2k data/lang_nosp exp/mono0a exp/mono0a_ali
  
    steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \
      data/train_2k data/lang_nosp exp/mono0a_ali exp/tri1
  fi
  
  if [ $stage -le 17 ]; then
    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      data/train data/lang_nosp exp/tri1 exp/tri1_ali
  
    steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 15000 \
      data/train data/lang_nosp exp/tri1_ali exp/tri2
  fi
  
  if [ $stage -le 18 ]; then
    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      data/train data/lang_nosp exp/tri2 exp/tri2_ali
  
    steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
      data/train data/lang_nosp exp/tri2_ali exp/tri3
  fi
  
  if [ $stage -le 19 ]; then
    utils/mkgraph.sh data/lang_nosp_test exp/tri3 exp/tri3/graph_nosp
  
    for dset in $test_sets; do
      (
      this_nj=`cat data/$dset/spk2utt | wc -l`
      if [ $this_nj -gt 20 ]; then
        this_nj=20
      fi
      steps/decode_fmllr.sh --nj $this_nj --cmd "$decode_cmd" --num-threads 4 \
        exp/tri3/graph_nosp data/$dset exp/tri3/decode_nosp_${dset} || touch exp/tri3/.error
      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
        data/lang_nosp_test data/lang_nosp_test_rescore \
        data/${dset} exp/tri3/decode_nosp_${dset} \
        exp/tri3/decode_nosp_${dset}_rescore || touch exp/tri3/.error
      ) &
    done
    wait
  
    if [ -f exp/tri3/.error ]; then
      echo "Decode failed in exp/tri3/decode*"
      exit 1
    fi
  fi
  
  if [ $stage -le 20 ]; then
    steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
      data/train data/lang_nosp exp/tri3 exp/tri3_ali
  
    steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
      data/train data/lang_nosp exp/tri3_ali exp/tri4
  fi
  
  if [ $stage -le 21 ]; then
    utils/mkgraph.sh data/lang_nosp_test exp/tri4 exp/tri4/graph_nosp
  
    for dset in $test_sets; do
      (
      this_nj=`cat data/$dset/spk2utt | wc -l`
      if [ $this_nj -gt 20 ]; then
        this_nj=20
      fi
      steps/decode_fmllr.sh --nj $this_nj --cmd "$decode_cmd" --num-threads 4 \
        exp/tri4/graph_nosp data/$dset exp/tri4/decode_nosp_${dset}
      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
        data/lang_nosp_test data/lang_nosp_test_rescore \
        data/${dset} exp/tri4/decode_nosp_${dset} \
        exp/tri4/decode_nosp_${dset}_rescore
      ) &
    done
    wait
  
    if [ -f exp/tri4/.error ]; then
      echo "Decode failed in exp/tri4/decode*"
      exit 1
    fi
  fi
  
  wait
  
  # %WER 18.0 | 728 32834 | 83.9 11.7 4.3 2.0 18.0 85.9 | exp/tri4/decode_nosp_eval97.pem_rescore/score_14_0.0/eval97.pem.ctm.filt.sys
  # %WER 19.3 | 728 32834 | 82.9 12.6 4.6 2.2 19.3 86.8 | exp/tri4/decode_nosp_eval97.pem/score_13_0.0/eval97.pem.ctm.filt.sys
  
  # The following demonstrates how to use out-of-domain WSJ models to segment long
  # audio recordings of HUB4 with raw unaligned transcripts into short segments
  # with aligned transcripts for training new ASR models.
  
  # local/run_segmentation_wsj.sh
  exit 0