Blame view

egs/gale_arabic/s5c/run.sh 4.34 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
  #!/bin/bash

  # Copyright 2014 QCRI (author: Ahmed Ali)
  #           2019 Dongji Gao
  # Apache 2.0

  # This is an example script for subword implementation.
  #
  # The variables assigned below are defaults; they can be overridden on the
  # command line, which is handled by sourcing utils/parse_options.sh.

  # Abort on the first unguarded command failure.  errexit is enabled here
  # rather than on the shebang line so that it also takes effect when the
  # script is started as `bash run.sh` (shebang options are ignored then).
  set -e
  
  # --- Pipeline control -------------------------------------------------------
  # First stage to execute; every "if [ $stage -le N ]" block with N below this
  # value is skipped.
  stage="0"
  # Stage 0 refuses to run when data/train/text already exists unless this is
  # "true" (the value is executed as a command, so use true/false only).
  overwrite="false"
  # Gates the GMM decoding stages; also executed as a command (true/false).
  decode_gmm="true"

  # --- Parallelism ------------------------------------------------------------
  # Passed as --nj to feature extraction and alignment steps.
  num_jobs="120"
  # Passed as --nj to the decoding steps.
  num_decode_jobs="40"

  # --- Subword settings -------------------------------------------------------
  # Forwarded to local/prepare_dict_subword.sh as --num_merges.
  num_merges="1000"

  # --- Corpus locations (forwarded to local/prepare_data.sh) ------------------
  dir1="/export/corpora/LDC/LDC2013S02/"
  dir2="/export/corpora/LDC/LDC2013S07/"
  dir3="/export/corpora/LDC/LDC2014S07/"
  text1="/export/corpora/LDC/LDC2013T17/"
  text2="/export/corpora/LDC/LDC2013T04/"
  text3="/export/corpora/LDC/LDC2014T17/"

  # Not referenced again in this script; presumably read by a sourced or
  # called helper -- confirm before removing.
  galeData="GALE"
  # Load the site-specific environment.  The order is significant: the option
  # parser must come last so that it can override the defaults assigned above.
  #
  # cmd.sh relates to the job queue and will need adapting to your system
  # (presumably it is where $train_cmd and $decode_cmd come from -- they are
  # not assigned anywhere in this script).
  source ./cmd.sh
  source ./path.sh
  # Parses the options defined above, if any were supplied on the command line.
  source ./utils/parse_options.sh
  
  # Stage 0: prepare the data directories, the subword dictionary, the lang
  # directories and the language model.
  #
  # Every step is guarded with "|| exit 1", matching the training stages, so a
  # failed preparation step can never be followed by later steps working on
  # half-written data -- even if errexit is not active.
  if [ "$stage" -le 0 ]; then

    # Safety net: data/train/text is rewritten in place further down, so
    # re-running this stage over existing data requires an explicit
    # --overwrite true.  ($overwrite is executed as a command: true/false.)
    if [ -f data/train/text ] && ! $overwrite; then
      echo "$0: Not processing, probably script have run from wrong stage"
      echo "Exiting with status 1 to avoid data corruption"
      exit 1;
    fi

    echo "$0: preparing data..."
    # Corpus paths can be overridden by the user, so they are quoted to
    # survive spaces and glob characters.
    local/prepare_data.sh --dir1 "$dir1" --dir2 "$dir2" --dir3 "$dir3" \
                          --text1 "$text1" --text2 "$text2" --text3 "$text3" \
      || exit 1

    echo "$0: Preparing lexicon and LM..."
    local/prepare_dict_subword.sh --num_merges "$num_merges" || exit 1

    utils/subword/prepare_lang_subword.sh \
      data/local/dict "<UNK>" data/local/lang data/lang || exit 1

    # The same path is given as first and last argument: the text file of
    # each set is replaced by its processed version.
    for set in train test; do
      utils/subword/prepare_subword_text.sh \
        "data/$set/text" data/local/pair_code.txt "data/$set/text" || exit 1
    done

    local/prepare_lm_subword.sh || exit 1

    utils/format_lm.sh data/lang data/local/lm/lm.gz \
                       data/local/dict/lexicon.txt data/lang_test || exit 1
  fi
  
  # Stage 1: MFCC extraction and CMVN statistics for the train and test sets.
  # mfccdir is assigned after option parsing, so it is a fixed location rather
  # than a command-line option.
  mfccdir=mfcc
  if [ "$stage" -le 1 ]; then
    echo "$0: Preparing the test and train feature files..."
    for x in train test ; do
      # Each step is guarded explicitly so a failure stops the recipe even
      # when errexit is not active.
      steps/make_mfcc.sh --cmd "$train_cmd" --nj "$num_jobs" \
        "data/$x" "exp/make_mfcc/$x" "$mfccdir" || exit 1
      # some files fail to get mfcc for many reasons
      utils/fix_data_dir.sh "data/$x" || exit 1
      steps/compute_cmvn_stats.sh "data/$x" "exp/make_mfcc/$x" "$mfccdir" || exit 1
    done
  fi
  
  # Stage 2: build a reduced training set (data/train.10K) and train the
  # monophone model on it.
  if [ $stage -le 2 ]; then
    echo "$0: creating sub-set and training monophone system"

    # The subset must exist before training, which reads data/train.10K.
    utils/subset_data_dir.sh \
      data/train \
      10000 \
      data/train.10K \
      || exit 1

    steps/train_mono.sh \
      --nj 40 \
      --cmd "$train_cmd" \
      data/train.10K \
      data/lang \
      exp/mono_subword \
      || exit 1
  fi
  
  # Stage 3: align the full training set with the monophone model, then train
  # the first triphone model (exp/tri1_subword) on those alignments.
  if [ $stage -le 3 ]; then
    echo "$0: Aligning data using monophone system"
    steps/align_si.sh \
      --nj $num_jobs \
      --cmd "$train_cmd" \
      data/train \
      data/lang \
      exp/mono_subword \
      exp/mono_ali_subword \
      || exit 1

    echo "$0: training triphone system with delta features"
    steps/train_deltas.sh \
      --cmd "$train_cmd" \
      2500 \
      30000 \
      data/train \
      data/lang \
      exp/mono_ali_subword \
      exp/tri1_subword \
      || exit 1
  fi
  
  # Stage 4 (optional): build the decoding graph for exp/tri1_subword and
  # decode the test set.  Runs only when $decode_gmm, executed as a command,
  # succeeds (i.e. it is "true").
  # Both steps are guarded like the training stages so that a failed graph
  # build is never followed by a decode against a missing graph.
  if [ "$stage" -le 4 ] && $decode_gmm; then
    utils/mkgraph.sh data/lang_test exp/tri1_subword exp/tri1_subword/graph \
      || exit 1
    steps/decode.sh --nj "$num_decode_jobs" --cmd "$decode_cmd" \
      exp/tri1_subword/graph data/test exp/tri1_subword/decode || exit 1
  fi
  
  # Stage 5: realign the training data with exp/tri1_subword and train the
  # LDA+MLLT model (exp/tri2b_subword) on the new alignments.
  if [ $stage -le 5 ]; then
    echo "$0: Aligning data and retraining and realigning with lda_mllt"

    steps/align_si.sh \
      --nj $num_jobs \
      --cmd "$train_cmd" \
      data/train \
      data/lang \
      exp/tri1_subword \
      exp/tri1_ali_subword \
      || exit 1

    steps/train_lda_mllt.sh \
      --cmd "$train_cmd" \
      4000 \
      50000 \
      data/train \
      data/lang \
      exp/tri1_ali_subword \
      exp/tri2b_subword \
      || exit 1
  fi
  
  # Stage 6 (optional): build the decoding graph for exp/tri2b_subword and
  # decode the test set.  Runs only when $decode_gmm, executed as a command,
  # succeeds (i.e. it is "true").
  # Both steps are guarded like the training stages so that a failed graph
  # build is never followed by a decode against a missing graph.
  if [ "$stage" -le 6 ] && $decode_gmm; then
    utils/mkgraph.sh data/lang_test exp/tri2b_subword exp/tri2b_subword/graph \
      || exit 1
    steps/decode.sh --nj "$num_decode_jobs" --cmd "$decode_cmd" \
      exp/tri2b_subword/graph data/test exp/tri2b_subword/decode || exit 1
  fi
  
  # Stage 7: realign with exp/tri2b_subword, train the SAT-basis model
  # (exp/tri3b_subword), then produce fMLLR alignments with that model
  # (exp/tri3b_ali_subword).
  if [ $stage -le 7 ]; then
    echo "$0: Aligning data and retraining and realigning with sat_basis"

    steps/align_si.sh \
      --nj $num_jobs \
      --cmd "$train_cmd" \
      data/train \
      data/lang \
      exp/tri2b_subword \
      exp/tri2b_ali_subword \
      || exit 1

    steps/train_sat_basis.sh \
      --cmd "$train_cmd" \
      5000 \
      100000 \
      data/train \
      data/lang \
      exp/tri2b_ali_subword \
      exp/tri3b_subword \
      || exit 1

    steps/align_fmllr.sh \
      --nj $num_jobs \
      --cmd "$train_cmd" \
      data/train \
      data/lang \
      exp/tri3b_subword \
      exp/tri3b_ali_subword \
      || exit 1
  fi
  
  # Stage 8 (optional): build the decoding graph for exp/tri3b_subword and
  # decode the test set with the fMLLR decoder.  Runs only when $decode_gmm,
  # executed as a command, succeeds (i.e. it is "true").
  # Both steps are guarded like the training stages so that a failed graph
  # build is never followed by a decode against a missing graph.
  if [ "$stage" -le 8 ] && $decode_gmm; then
    utils/mkgraph.sh data/lang_test exp/tri3b_subword exp/tri3b_subword/graph \
      || exit 1
    steps/decode_fmllr.sh --nj "$num_decode_jobs" --cmd "$decode_cmd" \
      exp/tri3b_subword/graph data/test exp/tri3b_subword/decode || exit 1
  fi
  
  # Stage 9: hand over to the chain TDNN recipe, telling it to use the
  # tri3b_subword GMM system.
  # NOTE(review): the log message below talks about "e2e alignments" while the
  # call passes --gmm tri3b_subword; the wording looks inherited from another
  # recipe -- confirm and adjust the message if so.
  if [ "$stage" -le 9 ]; then
    echo "$0: Training a regular chain model using the e2e alignments..."
    # Guarded explicitly so a failed chain run can never fall through to the
    # success message at the end of the script.
    local/chain/run_tdnn.sh --gmm tri3b_subword || exit 1
  fi
  
  # Reaching this point means no stage above exited early.
  printf '%s\n' "$0: training succeed"
  exit 0