Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/local/run_segmentation.sh 3.05 KB
  #!/bin/bash
  
  # Copyright 2014  Guoguo Chen
  # Apache 2.0
  
  # This script demonstrates how to re-segment long audios into short segments.
  # The basic idea is to decode with an existing in-domain acoustic model, and a
  # bigram language model built from the reference, and then work out the
  # segmentation from a ctm like file.
  # See the script local/run_segmentation_long_utts.sh for
  # a more sophisticated approach using Smith-Waterman alignment
  # to align decoded hypothesis and parts of imperfect long-transcripts
  # retrieved using TF-IDF document similarities.
  
  # First stage to run: each section below is guarded by
  # `[ $stage -le N ]`, so sections whose N is smaller than this value are
  # skipped and everything from this value onwards is executed.
  stage=0
  
  # Sourced after the default above is set and before cmd.sh / path.sh.
  # NOTE(review): presumably this lets the caller override the default with
  # `--stage N` on the command line -- confirm in utils/parse_options.sh.
  . utils/parse_options.sh
  
  # NOTE(review): cmd.sh presumably defines $train_cmd, $decode_cmd and
  # $mkgraph_cmd, which the stages below pass via --cmd; path.sh presumably
  # sets up the tool paths -- verify against those files.
  . ./cmd.sh
  . ./path.sh
  
  # Stage 0: create data/train_si284_long from data/train_si284 and then cut
  # it into overlapping pieces in data/train_si284_split.
  # NOTE(review): --seg-length 30 / --overlap-length 5 are presumably in
  # seconds -- confirm in steps/cleanup/split_long_utterance.sh.
  if [ $stage -le 0 ]; then
    # Abort on failure, following the `|| exit 1` convention used elsewhere in
    # this script, so the splitter never runs on a missing or half-written
    # data/train_si284_long and later stages never see a broken split dir.
    local/append_utterances.sh data/train_si284 data/train_si284_long || exit 1;
    steps/cleanup/split_long_utterance.sh \
      --seg-length 30 --overlap-length 5 \
      data/train_si284_long data/train_si284_split || exit 1;
  fi
  
  # Stage 1: run MFCC extraction and then CMVN statistics on the split data.
  # Either step failing terminates the script with status 1.
  if [ $stage -le 1 ]; then
    split_data=data/train_si284_split
    split_mfcc_exp=exp/make_mfcc/train_si284_split

    steps/make_mfcc.sh --cmd "$train_cmd" --nj 64 \
      "$split_data" "$split_mfcc_exp" mfcc || exit 1
    steps/compute_cmvn_stats.sh \
      "$split_data" "$split_mfcc_exp" mfcc || exit 1
  fi
  
  # Stage 2: build the segmentation decoding graph for the split data from
  # data/lang_nosp and the exp/tri2b model directory.
  if [ $stage -le 2 ]; then
    seg_graph=exp/tri2b/graph_train_si284_split

    steps/cleanup/make_segmentation_graph.sh --cmd "$mkgraph_cmd" --nj 32 \
      data/train_si284_split/ data/lang_nosp exp/tri2b/ \
      "$seg_graph" || exit 1
  fi
  
  # Stage 3: decode the split data with the stage-2 graph; scoring is turned
  # off (--skip-scoring true).
  if [ $stage -le 3 ]; then
    tri2b=exp/tri2b

    steps/cleanup/decode_segmentation.sh \
      --nj 64 --cmd "$decode_cmd" --skip-scoring true \
      "$tri2b/graph_train_si284_split" data/train_si284_split \
      "$tri2b/decode_train_si284_split" || exit 1
  fi
  
  # Stage 4: run steps/get_ctm.sh on the decode directory of the split data
  # to obtain the CTM output that the segmentation step consumes.
  if [ $stage -le 4 ]; then
    # Abort on failure: the following stage reads
    # exp/tri2b/decode_train_si284_split/score_10/train_si284_split.ctm and
    # would otherwise work from a missing or stale file.
    steps/get_ctm.sh --cmd "$decode_cmd" data/train_si284_split \
      exp/tri2b/graph_train_si284_split exp/tri2b/decode_train_si284_split \
      || exit 1;
  fi
  
  # Stage 5: derive the re-segmented data directory data/train_si284_reseg_a
  # from the CTM of the decoded split data.
  # NOTE(review): the CTM is taken from the hard-coded score_10 directory,
  # presumably the LM-weight-10 output of steps/get_ctm.sh -- confirm.
  # NOTE(review): the length options are presumably in seconds -- confirm in
  # steps/cleanup/make_segmentation_data_dir.sh.
  if [ $stage -le 5 ]; then
    # Abort on failure so that feature extraction and training never start
    # on a missing or partially written data/train_si284_reseg_a.
    steps/cleanup/make_segmentation_data_dir.sh --wer-cutoff 0.9 \
      --min-sil-length 0.5 --max-seg-length 15 --min-seg-length 1 \
      exp/tri2b/decode_train_si284_split/score_10/train_si284_split.ctm \
      data/train_si284_split data/train_si284_reseg_a || exit 1;
  fi
  
  # From here on the re-segmented data is used for training.
  # Stage 6: MFCC extraction and CMVN statistics for data/train_si284_reseg_a.
  if [ $stage -le 6 ]; then
    reseg_data=data/train_si284_reseg_a
    reseg_mfcc_exp=exp/make_mfcc/train_si284_reseg_a

    steps/make_mfcc.sh --cmd "$train_cmd" --nj 64 \
      "$reseg_data" "$reseg_mfcc_exp" mfcc || exit 1
    steps/compute_cmvn_stats.sh \
      "$reseg_data" "$reseg_mfcc_exp" mfcc || exit 1
  fi
  
  # Stage 7: align the re-segmented data with steps/align_fmllr.sh using the
  # exp/tri3b model; alignments are written to exp/tri3b_ali_si284_reseg_a.
  if [ $stage -le 7 ]; then
    reseg=si284_reseg_a

    steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
      "data/train_$reseg" data/lang_nosp \
      exp/tri3b "exp/tri3b_ali_$reseg" || exit 1
  fi
  
  # Stage 8: train exp/tri4c with steps/train_sat.sh on the re-segmented data
  # and the alignments in exp/tri3b_ali_si284_reseg_a.
  # NOTE(review): 4200 / 40000 are presumably the leaf and Gaussian counts --
  # confirm against steps/train_sat.sh usage.
  if [ $stage -le 8 ]; then
    reseg=si284_reseg_a

    steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
      "data/train_$reseg" data/lang_nosp \
      "exp/tri3b_ali_$reseg" exp/tri4c || exit 1
  fi
  
  
  # Stage 9: build the tgpr graph for exp/tri4c, then decode test_dev93
  # (10 jobs) and test_eval92 (8 jobs), in that order. Any failure ends the
  # script with status 1.
  if [ $stage -le 9 ]; then
    tri4c_graph=exp/tri4c/graph_tgpr

    utils/mkgraph.sh data/lang_nosp_test_tgpr exp/tri4c "$tri4c_graph" || exit 1

    # Each entry is <test set>:<number of decoding jobs>.
    for set_and_nj in dev93:10 eval92:8; do
      test_set=${set_and_nj%%:*}
      num_jobs=${set_and_nj##*:}
      steps/decode_fmllr.sh --nj "$num_jobs" --cmd "$decode_cmd" \
        "$tri4c_graph" "data/test_$test_set" \
        "exp/tri4c/decode_tgpr_$test_set" || exit 1
    done
  fi