  #!/bin/bash
  
  # Recipe for Mozilla Common Voice corpus v1
  #
  # Copyright 2017   Ewald Enzinger
  # Apache 2.0
  
  data=$HOME/cv_corpus_v1
  data_url=https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz
  
  . ./cmd.sh
  . ./path.sh
  
  stage=0
  
  . ./utils/parse_options.sh
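  # utils/parse_options.sh lets command-line flags override the variables
  # defined above, e.g. to resume from the feature-extraction stage:
  #   ./run.sh --stage 2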
  
  set -euo pipefail
  
  if [ $stage -le 0 ]; then
    mkdir -p $data
  
    local/download_and_untar.sh $(dirname $data) $data_url
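    # (the tarball unpacks to a cv_corpus_v1/ directory, so untarring in the
    # parent of $data leaves the corpus at $data itself)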
  fi
  
  if [ $stage -le 1 ]; then
    for part in valid-train valid-dev valid-test; do
      # use underscore-separated names in data directories.
      local/data_prep.pl $data cv-$part data/$(echo $part | tr - _)
    done
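    # (e.g. the cv-valid-train portion ends up in data/valid_train)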
    
    # Prepare ARPA LM and vocabulary using SRILM
    local/prepare_lm.sh data/valid_train
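    # (this writes the ARPA LM to data/local/lm.gz, which format_lm.sh
    # compiles into G.fst below)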
    # Prepare the lexicon and various phone lists
    # Pronunciations for OOV words are obtained using a pre-trained Sequitur model
    local/prepare_dict.sh
  
    # Prepare data/lang and data/local/lang directories
    utils/prepare_lang.sh data/local/dict \
      '<unk>' data/local/lang data/lang || exit 1
  
    utils/format_lm.sh data/lang data/local/lm.gz data/local/dict/lexicon.txt data/lang_test/
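    # data/lang_test now contains the grammar G.fst alongside the lexicon;
    # it is what utils/mkgraph.sh consumes in the decoding stages below.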
  fi
  
  if [ $stage -le 2 ]; then
    mfccdir=mfcc
    # spread the mfccs over various machines, as this data-set is quite large.
    if [[  $(hostname -f) ==  *.clsp.jhu.edu ]]; then
      mfcc=$(basename $mfccdir) # in case it was an absolute pathname (unlikely), get the basename.
      utils/create_split_dir.pl /export/b{07,14,16,17}/$USER/kaldi-data/mfcc/commonvoice/s5/$mfcc/storage \
        $mfccdir/storage
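      # create_split_dir.pl makes those per-host directories and sets up
      # $mfccdir/storage as a set of symlinks into them, so make_mfcc.sh
      # spreads its output across machines.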
    fi
  
    for part in valid_train valid_dev valid_test; do
      steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/$part exp/make_mfcc/$part $mfccdir
      steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir
    done
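    # (CMVN stats are accumulated per speaker, using the spk2utt maps from
    # data preparation)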
  
    # Get the shortest 10000 utterances first because those are more likely
    # to have accurate alignments.
    utils/subset_data_dir.sh --shortest data/valid_train 10000 data/train_10kshort || exit 1;
    utils/subset_data_dir.sh data/valid_train 20000 data/train_20k || exit 1;
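    # (without a selection option, subset_data_dir.sh picks the 20k
    # utterances at random)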
  fi
  
  # train a monophone system
  if [ $stage -le 3 ]; then
    steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \
      data/train_10kshort data/lang exp/mono || exit 1;
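    # Decode in a backgrounded subshell so the next training step can start
    # right away; the 'wait' at the bottom of the script collects these jobs.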
    (
      utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph
      for testset in valid_dev; do
        steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph \
          data/$testset exp/mono/decode_$testset
      done
    )&
    steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \
      data/train_20k data/lang exp/mono exp/mono_ali_train_20k
  fi
  
  # train a first delta + delta-delta triphone system
  if [ $stage -le 4 ]; then
    steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
      2000 10000 data/train_20k data/lang exp/mono_ali_train_20k exp/tri1
  
    # decode using the tri1 model
    (
      utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph
      for testset in valid_dev; do
        steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph \
          data/$testset exp/tri1/decode_$testset
      done
    )&
  
    steps/align_si.sh --nj 10 --cmd "$train_cmd" \
      data/train_20k data/lang exp/tri1 exp/tri1_ali_train_20k
  fi
  
  # train an LDA+MLLT system.
  if [ $stage -le 5 ]; then
    steps/train_lda_mllt.sh --cmd "$train_cmd" \
      --splice-opts "--left-context=3 --right-context=3" 2500 15000 \
      data/train_20k data/lang exp/tri1_ali_train_20k exp/tri2b
  
    # decode using the LDA+MLLT model
    utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph
    (
      for testset in valid_dev; do
        steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph \
          data/$testset exp/tri2b/decode_$testset
      done
    )&
  
    # Align utts using the tri2b model
    steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \
      data/train_20k data/lang exp/tri2b exp/tri2b_ali_train_20k
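    # (--use-graphs true reuses the training graphs already compiled in
    # exp/tri2b; this is valid because we align the same data with the same
    # lang directory)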
  fi
  
  # Train tri3b, which is LDA+MLLT+SAT
  if [ $stage -le 6 ]; then
    steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \
      data/train_20k data/lang exp/tri2b_ali_train_20k exp/tri3b
  
    # decode using the tri3b model
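    # (decode_fmllr.sh is a two-pass decode: a speaker-independent first pass,
    # then per-speaker fMLLR estimation and a final adapted pass)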
    (
      utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph
      for testset in valid_dev; do
        steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \
          exp/tri3b/graph data/$testset exp/tri3b/decode_$testset
      done
    )&
  fi
  
  if [ $stage -le 7 ]; then
    # Align utts in the full training set using the tri3b model
    steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \
      data/valid_train data/lang \
      exp/tri3b exp/tri3b_ali_valid_train
  
    # train another LDA+MLLT+SAT system on the entire training set
    steps/train_sat.sh  --cmd "$train_cmd" 4200 40000 \
      data/valid_train data/lang \
      exp/tri3b_ali_valid_train exp/tri4b
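    # (the tree grows from 2500 leaves / 15k Gaussians to 4200 / 40k, since
    # the full training set can support more parameters than the 20k subset)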
  
    # decode using the tri4b model
    (
      utils/mkgraph.sh data/lang_test exp/tri4b exp/tri4b/graph
      for testset in valid_dev; do
        steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
          exp/tri4b/graph data/$testset \
          exp/tri4b/decode_$testset
      done
    )&
  fi
  
  # Train a chain model
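  # local/chain/run_tdnn.sh is self-contained (typically i-vector extraction,
  # lattice generation, and TDNN training with the LF-MMI chain objective);
  # --stage 0 runs it from the beginning.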
  if [ $stage -le 8 ]; then
    local/chain/run_tdnn.sh --stage 0
  fi
  
  # Don't finish until all background decoding jobs are finished.
  wait