Blame view

egs/wsj/s5/local/e2e/run_end2end_char.sh 3.87 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
  #!/bin/bash
  # Copyright 2017    Hossein Hadian
  
  # This top-level script demonstrates character-based end-to-end LF-MMI training
  # (specifically single-stage flat-start LF-MMI models) on WSJ. It is exactly
  # like "run_end2end_phone.sh" except it uses a trivial grapheme-based
  # (i.e. character-based) lexicon and a stronger neural net (i.e. TDNN-LSTM).
  
  # Strict mode: abort on any failing command (-e), treat the use of an unset
  # variable as an error (-u), and make a pipeline fail when any of its
  # stages fails (pipefail).
  set -euo pipefail
  
  
  # Defaults. They are assigned before utils/parse_options.sh is sourced
  # further down; presumably that script lets command-line options (e.g.
  # --stage) override them -- confirm against utils/parse_options.sh.
  # stage:    every stage N below runs only when "$stage -le N" holds.
  # trainset: name of the training data directory under data/.
  stage=0
  trainset=train_si284
  # NOTE(review): $train_cmd, used by the feature-extraction stages, is
  # presumably defined by cmd.sh -- it is not set anywhere in this script.
  . ./cmd.sh ## You'll want to change cmd.sh to something that will work
             ## on your system. This relates to the queue.
  
  # Corpus locations handed to local/wsj_data_prep.sh (both) and
  # local/wsj_extend_char_dict.sh (wsj1 only). The commented-out assignments
  # are alternative locations; note that a value may itself contain shell
  # glob characters (see the first pair).
  #wsj0=/ais/gobi2/speech/WSJ/csr_?_senn_d?
  #wsj1=/ais/gobi2/speech/WSJ/csr_senn_d?
  
  #wsj0=/mnt/matylda2/data/WSJ0
  #wsj1=/mnt/matylda2/data/WSJ1
  
  #wsj0=/data/corpora0/LDC93S6B
  #wsj1=/data/corpora0/LDC94S13B
  
  wsj0=/export/corpora5/LDC/LDC93S6B
  wsj1=/export/corpora5/LDC/LDC94S13B
  
  # Sourced after the defaults above and before any stage runs.
  # NOTE(review): path.sh presumably sets up PATH for the Kaldi tools -- confirm.
  . ./utils/parse_options.sh
  . ./path.sh
  
  # Naming convention: the phoneme-based dictionary and lang directories carry
  # the suffix _nosp (for consistency with run.sh); their character-based
  # counterparts carry the suffix _char.
  
  # Stage 0: corpus data preparation plus the character dictionary and the
  # character lang directory.
  if [ "$stage" -le 0 ]; then
    # Run the corpus data prep only when data/local/data does not exist yet.
    # $wsj0 / $wsj1 are intentionally left unquoted: both the ??-{?,??}.?
    # patterns and any glob characters inside the variables themselves must
    # be expanded by the shell.
    if [[ ! -d data/local/data ]]; then
      local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.?
    fi
  
    # Build the phoneme-based dictionary only when its lexicon is missing.
    if [[ ! -f data/local/dict_nosp/lexicon.txt ]]; then
      local/wsj_prepare_dict.sh --dict-suffix "_nosp"
    fi
  
    # The character dictionary, lang directory and formatted data are always
    # (re)generated when this stage runs.
    local/wsj_prepare_char_dict.sh
    utils/prepare_lang.sh data/local/dict_char "<SPOKEN_NOISE>" \
      data/local/lang_tmp_char data/lang_char
    local/wsj_format_data.sh --lang-suffix "_char"
    echo "$0: Done preparing data & lang."
  fi
  
  # Stage 1: extend the character dictionary (data/local/dict_char ->
  # data/local/dict_char_larger), build the corresponding lang directory
  # data/lang_char_bd, then train and format the local language models.
  if [ "$stage" -le 1 ]; then
    # $wsj1 is left unquoted on purpose: its value may contain glob characters
    # that the shell has to expand.
    local/wsj_extend_char_dict.sh $wsj1/13-32.1 data/local/dict_char \
                                data/local/dict_char_larger
    utils/prepare_lang.sh data/local/dict_char_larger \
                          "<SPOKEN_NOISE>" data/local/lang_larger_tmp \
                          data/lang_char_bd
    # Note: this will overwrite data/local/local_lm:
    local/wsj_train_lms.sh --dict-suffix "_char"
    local/wsj_format_local_lms.sh --lang-suffix "_char"
    echo "$0: Done extending the vocabulary."
    # Deliberately no "exit" here: control must fall through to the later
    # stages (feature extraction and the chain recipe), otherwise a default
    # run starting at stage 0 or 1 would stop before any training happens.
  fi
  
  # Stage 2: MFCC features (conf/mfcc_hires.conf) and CMVN stats for the test
  # sets. Only hires features are made since training is flat-start.
  if [ "$stage" -le 2 ]; then
    # The presence of feats.scp for test_eval92 is used as the marker that
    # this stage has already been done for all test sets.
    if [ ! -f data/test_eval92_hires/feats.scp ]; then
      echo "$0: extracting MFCC features for the test sets"
      for test_set in test_eval92 test_eval93 test_dev93; do
        # The data directory is moved (not copied) to its _hires name, so
        # data/$test_set is gone once this line has run.
        mv data/$test_set data/${test_set}_hires
        steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 \
          --mfcc-config conf/mfcc_hires.conf data/${test_set}_hires
        steps/compute_cmvn_stats.sh data/${test_set}_hires
      done
    else
      echo "$0: It seems that features for the test sets already exist."
      echo "skipping this stage..."
    fi
  fi
  
  # Stages 3-4: speed-perturbed training data and its MFCC features.
  #
  # Everything is written to data/${trainset}_spe2e_hires. If feats.scp is
  # already present there, both stages are skipped; the check must therefore
  # look at that same directory, which is why the path is kept in a single
  # variable.
  spe2e_dir=data/${trainset}_spe2e_hires
  
  if [ -f "$spe2e_dir/feats.scp" ]; then
    echo "$0: It seems that features for the perturbed training data already exist."
    echo "If you want to extract them anyway, remove them first and run this"
    echo "stage again. Skipping this stage..."
  else
    if [ "$stage" -le 3 ]; then
      echo "$0: perturbing the training data to allowed lengths..."
      utils/data/get_utt2dur.sh "data/$trainset"  # necessary for the next command
  
      # 12 in the following command means the allowed lengths are spaced
      # by 12% change in length.
      utils/data/perturb_speed_to_allowed_lengths.py 12 "data/$trainset" \
                                                     "$spe2e_dir"
      # utt2uniq maps every utterance id listed in utt2dur to the same id
      # with its first four characters removed (presumably the prefix added
      # by the perturbation script -- confirm against that script).
      awk '{print $1 " " substr($1,5)}' "$spe2e_dir/utt2dur" \
        >"$spe2e_dir/utt2uniq"
      utils/fix_data_dir.sh "$spe2e_dir"
    fi
  
    if [ "$stage" -le 4 ]; then
      echo "$0: extracting MFCC features for the training data..."
      steps/make_mfcc.sh --nj 50 --mfcc-config conf/mfcc_hires.conf \
                         --cmd "$train_cmd" "$spe2e_dir"
      steps/compute_cmvn_stats.sh "$spe2e_dir"
    fi
  fi
  
  # Stage 5: announce and run the flat-start chain recipe script.
  if [ "$stage" -le 5 ]; then
    printf '%s\n' "$0: calling the flat-start chain recipe..."
    local/chain/e2e/run_tdnnf_flatstart_char.sh
  fi