Blame view

egs/sprakbanken/s5/run.sh 6.18 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
  #!/bin/bash
  
  . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
             ## This relates to the queue.
  . ./path.sh # so python3 is on the path if not on the system (we made a link to utils/).a
  
  nj=12
  stage=0
  . utils/parse_options.sh
  
  if [ $stage -le 0 ]; then
    # Download the corpus and prepare parallel lists of sound files and text files
    # Divide the corpus into train, dev and test sets
    local/sprak_data_prep.sh  || exit 1;
  fi
  
  if [ $stage -le 1 ]; then
    # Perform text normalisation, prepare dict folder and LM data transcriptions
    # This setup uses previsously prepared data. eSpeak must be installed and in PATH to use dict_prep.sh
    # local/dict_prep.sh || exit 1;
    local/copy_dict.sh || exit 1;
  fi
  
  if [ $stage -le 2 ]; then
    utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang_tmp data/lang || exit 1;
  fi
  
  if [ $stage -le 3 ]; then
    # Extract mfccs 
    # p was added to the rspecifier (scp,p:$logdir/wav.JOB.scp) in make_mfcc.sh because some 
    # wave files are corrupt 
    # Will return a warning message because of the corrupt audio files, but compute them anyway
    # If this step fails and prints a partial diff, rerun from sprak_data_prep.sh
    for dataset in train test dev; do
      steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/$dataset || exit 1;
  
      # Compute cepstral mean and variance normalisation
      steps/compute_cmvn_stats.sh data/$dataset || exit 1;
  
      # Repair data set (remove corrupt data points with corrupt audio)
      utils/fix_data_dir.sh data/$dataset || exit 1;
  
    done
    # Make a subset of the training data with the shortest 120k utterances. 
    utils/subset_data_dir.sh --shortest data/train 120000 data/train_120kshort || exit 1;
  fi
  
  if [ $stage -le 4 ]; then
    # Train LM with irstlm
    local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 3 "tg" data/lang data/local/train3_lm &> data/local/tg.log || exit 1;
    local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 4 "fg" data/lang data/local/train4_lm &> data/local/fg.log || exit 1;
  fi
  
  if [ $stage -le 5 ]; then
    # Train monophone model on short utterances
    steps/train_mono.sh --nj $nj --cmd "$train_cmd" \
      data/train_120kshort data/lang exp/mono0a || exit 1;
    utils/mkgraph.sh --mono data/lang_test_tg exp/mono0a exp/mono0a/graph_tg || exit 1;
    steps/decode.sh --nj 12 --cmd "$decode_cmd" \
      exp/mono0a/graph_tg data/dev exp/mono0a/decode_tg_dev || exit 1;
  fi
  
  if [ $stage -le 6 ]; then
    # Train tri1 (delta+delta-delta)
    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
    steps/train_deltas.sh --cmd "$train_cmd" \
      3000 40000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1;
  
    # Decode dev set with both LMs
    utils/mkgraph.sh data/lang_test_tg exp/tri1 exp/tri1/graph_tg || exit 1;
    utils/mkgraph.sh data/lang_test_fg exp/tri1 exp/tri1/graph_fg || exit 1; 
    steps/decode.sh --nj 12 --cmd "$decode_cmd" \
      exp/tri1/graph_fg data/dev exp/tri1/decode_fg_dev || exit 1;
    steps/decode.sh --nj 12 --cmd "$decode_cmd" \
      exp/tri1/graph_tg data/dev exp/tri1/decode_tg_dev || exit 1;
  fi
  
  if [ $stage -le 7 ]; then
    # Train tri2a (delta + delta-delta)
    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
    steps/train_deltas.sh --cmd "$train_cmd" \
      5000 60000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1;
    utils/mkgraph.sh data/lang_test_tg exp/tri2a exp/tri2a/graph_tg || exit 1;
    steps/decode.sh --nj 12 --cmd "$decode_cmd" \
      exp/tri2a/graph_tg data/dev exp/tri2a/decode_tg_dev || exit 1;
  fi
  
  if [ $stage -le 8 ]; then
    # Train tri2b (LDA+MLLT)
    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      data/train data/lang exp/tri2a exp/tri2a_ali || exit 1;
    steps/train_lda_mllt.sh --cmd "$train_cmd" \
      --splice-opts "--left-context=5 --right-context=5" \
      6500 75000 data/train data/lang exp/tri2a_ali exp/tri2b || exit 1;
    utils/mkgraph.sh data/lang_test_tg exp/tri2b exp/tri2b/graph_tg || exit 1;
    steps/decode.sh --nj 12 --cmd "$decode_cmd" \
      exp/tri2b/graph_tg data/dev exp/tri2b/decode_tg_dev || exit 1;
  fi
  
  if [ $stage -le 9 ]; then
    # From 2b system, train 3b which is LDA + MLLT + SAT.
    steps/align_si.sh  --nj $nj --cmd "$train_cmd" \
      --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali  || exit 1;
    steps/train_sat.sh --cmd "$train_cmd" \
      7500 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1;
  
    # Decode dev with 4gram and 3gram LMs
    utils/mkgraph.sh data/lang_test_tg exp/tri3b exp/tri3b/graph_tg || exit 1;
    steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 12 \
      exp/tri3b/graph_tg data/dev exp/tri3b/decode_tg_dev || exit 1;
    utils/mkgraph.sh data/lang_test_fg exp/tri3b exp/tri3b/graph_fg || exit 1;
    steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 12 \
      exp/tri3b/graph_fg data/dev exp/tri3b/decode_fg_dev || exit 1;
  
    # Decode test with 4gram and 3gram LMs
    # there are fewer speaker (n=7) and decoding usually ends up waiting
    # for a single job so we use --num-threads 2 to speed up
    steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 --num-threads 2 \
      exp/tri3b/graph_tg data/test exp/tri3b/decode_tg_test || exit 1;
    steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 --num-threads 2 \
      exp/tri3b/graph_fg data/test exp/tri3b/decode_fg_test || exit 1;
  fi
  
  if [ $stage -le 10 ]; then
    # Alignment used to train nnets and sgmms
    steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
      data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
  fi
  
  
  ## Works
  #local/sprak_run_nnet_cpu.sh tg dev 
  
  ## Works
  #local/sprak_run_sgmm2.sh dev
  
  
  # Run neural network setups based in the TEDLIUM recipe
  
  # Running the nnet3-tdnn setup will train an ivector extractor that
  # is used by the subsequent nnet3 and chain systems (why --stage is
  # specified)
  #local/nnet3/run_tdnn.sh --tdnn-affix "0" --nnet3-affix ""
  
  # nnet3 LSTM
  #local/nnet3/run_lstm.sh --stage 13 --affix "0"
  
  # nnet3 bLSTM
  #local/nnet3/run_blstm.sh --stage 12
  
  
  
  # chain TDNN
  # This setup creates a new lang directory that is also used by the
  # TDNN-LSTM system
  #local/chain/run_tdnn.sh --stage 14
  
  # chain TDNN-LSTM
  local/chain/run_tdnn_lstm.sh --stage 17
  
  
  # Getting results [see RESULTS file]
  local/generate_results_file.sh 2> /dev/null > RESULTS