Blame view

egs/tedlium/s5_r2_wsj/run.sh 4.39 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
  #!/bin/bash
  #
  # This recipe uses WSJ models and TED-LIUM audio with un-aligned transcripts.
  #
  # http://www-lium.univ-lemans.fr/en/content/ted-lium-corpus
  # http://www.openslr.org/resources (Mirror).
  #
  # The data is distributed under 'Creative Commons BY-NC-ND 3.0' license,
  # which allow free non-commercial use, while only a citation is required.
  #
  # Copyright  2014  Nickolay V. Shmyrev
  #            2014  Brno University of Technology (Author: Karel Vesely)
  #            2016  Vincent Nguyen
  #            2016  Johns Hopkins University (Author: Daniel Povey)
  #
  # Apache 2.0
  #
  
  # Source cluster/job-submission settings ($train_cmd, $decode_cmd, ...) and
  # the Kaldi tool PATH setup.  Both files are project-local.
  . ./cmd.sh
  . ./path.sh


  # Abort on any command failure, on failures inside pipelines, and on use of
  # unset variables.
  set -e -o pipefail -u

  # Default parallelism; both can be overridden on the command line, e.g.
  #   ./run.sh --nj 20 --decode-nj 10
  nj=35
  decode_nj=30   # note: should not be >38 which is the number of speakers in the dev set
                 # after applying --seconds-per-spk-max 180.  We decode with 4 threads, so
                 # this will be too many jobs if you're using run.pl.

  . utils/parse_options.sh # accept options
  
  # Data preparation
  # Fetch the TED-LIUM corpus (project-local script; presumably downloads into
  # db/ or data/ -- see local/download_data.sh).
  local/download_data.sh

  # Paths to the LDC WSJ corpora (site-specific; adjust for your installation).
  wsj0=/export/corpora5/LDC/LDC93S6B
  wsj1=/export/corpora5/LDC/LDC94S13B
  # The globs expand to the per-disc directories of each corpus distribution.
  local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.?  || exit 1;

  local/wsj_format_data.sh

  # TED-LIUM data preparation; expected to create data/{dev,test,train}.orig
  # consumed by the loop below -- TODO confirm against local/prepare_data.sh.
  local/prepare_data.sh

  # Split speakers up into 3-minute chunks.  This doesn't hurt adaptation, and
  # lets us use more jobs for decoding etc.
  # [we chose 3 minutes because that gives us 38 speakers for the dev data, which is
  #  more than our normal 30 jobs.]
  for dset in dev test train; do
    utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}.orig data/${dset}
  done
    
  # Train the language model, then build a pronunciation dictionary ("_nosp" =
  # no silence/pronunciation probabilities yet) from the LM's word list.
  local/train_lm.sh

  local/prepare_dict.sh --dict-suffix "_nosp" \
    data/local/local_lm/data/work/wordlist

  # Compile the dictionary into a Kaldi lang directory; "<unk>" is the
  # spoken-noise / OOV word.
  utils/prepare_lang.sh data/local/dict_nosp \
    "<unk>" data/local/lang_nosp data/lang_nosp

  local/format_lms.sh
  
  # Feature extraction: MFCC features plus per-speaker CMVN stats, then
  # fix_data_dir.sh to drop any utterances that failed extraction.
  # NOTE(review): the original loop covered only train_si284, but the decoding
  # stage at the bottom of this script runs steps/decode_fmllr.sh on data/dev
  # and data/test, which also require features; extract them here as well
  # (assuming local/prepare_data.sh does not already do so -- TODO confirm).
  # Also renamed the loop variable from 'set' (name of a shell builtin, and
  # inconsistent with the 'dset' convention used elsewhere in this file).
  for dset in train_si284 dev test; do
    dir=data/$dset
    steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" $dir || exit 1
    steps/compute_cmvn_stats.sh $dir || exit 1
    utils/fix_data_dir.sh $dir || exit 1
  done
  
  # SI-84 is (by WSJ convention) the first 7138 utterances of SI-284; used for
  # the early, cheaper training stages below.
  utils/subset_data_dir.sh --first data/train_si284 7138 data/train_si84 || exit 1

  # Now make subset with the shortest 2k utterances from si-84.
  # [short utterances make flat-start monophone training converge more easily.]
  utils/subset_data_dir.sh --shortest data/train_si84 2000 data/train_si84_2kshort || exit 1;

  # Now make subset with half of the data from si-84.
  utils/subset_data_dir.sh data/train_si84 3500 data/train_si84_half || exit 1;
  
  # Standard WSJ GMM build-up: each stage aligns the data with the previous
  # model and trains a larger/stronger model on progressively more data
  # (2kshort -> si84_half -> si84 -> si284).  The stage order matters; do not
  # reorder.  Numeric pairs are (num-leaves, tot-gauss).
  #
  # Note: the --boost-silence option should probably be omitted by default
  # for normal setups.  It doesn't always help. [it's to discourage non-silence
  # models from modeling silence.]
  steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
    data/train_si84_2kshort data/lang_nosp exp/wsj_mono0a || exit 1;

  steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
    data/train_si84_half data/lang_nosp exp/wsj_mono0a exp/wsj_mono0a_ali || exit 1;

  # tri1: first triphone system (delta + delta-delta features).
  steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \
    data/train_si84_half data/lang_nosp exp/wsj_mono0a_ali exp/wsj_tri1 || exit 1;

  steps/align_si.sh --nj 10 --cmd "$train_cmd" \
    data/train_si84 data/lang_nosp exp/wsj_tri1 exp/wsj_tri1_ali_si84 || exit 1;

  # tri2b: LDA+MLLT on spliced frames (+-3 frames of context).
  steps/train_lda_mllt.sh --cmd "$train_cmd" \
    --splice-opts "--left-context=3 --right-context=3" 2500 15000 \
    data/train_si84 data/lang_nosp exp/wsj_tri1_ali_si84 exp/wsj_tri2b || exit 1;

  # Align tri2b system with si84 data.
  steps/align_si.sh  --nj 10 --cmd "$train_cmd" \
    --use-graphs true data/train_si84 \
    data/lang_nosp exp/wsj_tri2b exp/wsj_tri2b_ali_si84  || exit 1;

  # From 2b system, train 3b which is LDA + MLLT + SAT.
  steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \
    data/train_si84 data/lang_nosp exp/wsj_tri2b_ali_si84 exp/wsj_tri3b || exit 1;

  # From 3b system, align all si284 data.
  steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
    data/train_si284 data/lang_nosp exp/wsj_tri3b exp/wsj_tri3b_ali_si284 || exit 1;

  # From 3b system, train another SAT system (tri4a) with all the si284 data.
  steps/train_sat.sh  --cmd "$train_cmd" 4200 40000 \
    data/train_si284 data/lang_nosp exp/wsj_tri3b_ali_si284 exp/wsj_tri4a || exit 1;

  # Build the decoding graph (HCLG) for the final SAT model.  No explicit
  # '|| exit 1' here, but 'set -e' above aborts the script on failure.
  utils/mkgraph.sh data/lang_nosp exp/wsj_tri4a exp/wsj_tri4a/graph_nosp
  
  # Decode the TED-LIUM dev and test sets with the tri4a SAT system (fMLLR
  # decoding), then rescore the lattices with the larger const-arpa LM.
  #
  # NOTE(review): the original wrapped this loop in a backgrounded subshell
  # '( ... ) &' followed by a bare 'wait'.  In bash, 'wait' with no operands
  # always returns status 0, so a decode failure was silently ignored and the
  # script still reported success.  Since nothing else ran concurrently, the
  # backgrounding served no purpose; run in the foreground with explicit
  # error checks so failures propagate.
  for dset in dev test; do
    steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd"  --num-threads 4 \
      exp/wsj_tri4a/graph_nosp data/${dset} exp/wsj_tri4a/decode_nosp_${dset} || exit 1
    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \
      data/${dset} exp/wsj_tri4a/decode_nosp_${dset} exp/wsj_tri4a/decode_nosp_${dset}_rescore || exit 1
  done

  echo "$0: success."
  exit 0