Blame view

egs/yomdle_russian/v1/run_end2end.sh 6.89 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
  #!/bin/bash
  
  # Copyright 2018    Hossein Hadian
  #                   Ashish Arora
  #                   Jonathan Chang
  # Apache 2.0
  
  # Abort the whole recipe as soon as any command fails.
  set -e

  # Option defaults. They are assigned before utils/parse_options.sh is sourced
  # so that they can be overridden from the command line; Kaldi's option parser
  # is expected to accept --<name> only for variables that already exist at
  # that point (confirm against the local copy of utils/parse_options.sh).
  stage=0
  nj=30
  # Switches for the two decoding stages (10 and 11). They are executed as
  # commands ("[ ... ] && $decode_e2e"), so the value must be the word true or
  # false. They used to be undefined, which made them impossible to set.
  decode_e2e=true
  decode_chain=true

  language_main=Russian
  slam_dir=/export/corpora5/slam/SLAM/
  yomdle_dir=/export/corpora5/slam/YOMDLE/
  corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/ru/
  . ./cmd.sh
  . ./path.sh
  . ./utils/parse_options.sh

  # Run the recipe's tool check before starting any stage.
  ./local/check_tools.sh
  # Start from stage=-2 for data preparation. This stage stores line images,
  # csv files and the {train,test,train_unsup} splits in
  # data/download/truth_line_image, data/download/truth_csv and
  # data/local/splits respectively.
  if [ "$stage" -le -2 ]; then
    echo "$0: $(date): preparing data, obtaining line images and csv files..."
    # The directories are user-overridable options, so quote them to survive
    # paths that contain spaces or glob characters.
    local/yomdle/create_download_dir.sh --language_main "$language_main" \
      --slam_dir "$slam_dir" --yomdle_dir "$yomdle_dir"
  fi
  
  # Stage -1: build the language-model text from the raw corpus files.
  # The first 20000 lines of the concatenated corpus become the validation
  # text, everything after them becomes the LM training corpus.
  if [ "$stage" -le -1 ]; then
    echo "$0: $(date): getting corpus text for language modelling..."
    mkdir -p data/local/text/cleaned
    cat "$corpus_dir"/* > data/local/text/ru.txt
    head -n 20000 data/local/text/ru.txt > data/local/text/cleaned/val.txt
    # Start at line 20001: "tail -n +20000" would also emit line 20000, which
    # is already the last validation line, leaking it into the training corpus.
    tail -n +20001 data/local/text/ru.txt > data/local/text/cleaned/corpus.txt
  fi
  
  # The image directories are created unconditionally, whichever stage the
  # recipe is started from.
  mkdir -p data/train/data data/test/data

  # Stage 0: turn the downloaded data and the split lists into data
  # directories for the train and test sets.
  if [ $stage -le 0 ]; then
    echo "$0: stage 0: Processing train and test data.$(date)"
    echo "$0: creating text, images.scp, utt2spk and spk2utt"
    for split in train test; do
      split_dir=data/$split
      local/process_data.py data/download/ data/local/splits/$split.txt $split_dir
      image/fix_data_dir.sh $split_dir
    done
  fi
  
  # Stage 1: compute the allowed image widths on the training set, then
  # extract features and CMVN statistics for both sets.
  if [ $stage -le 1 ]; then
    echo "$0: $(date) stage 1: getting allowed image widths for e2e training..."
    train_dir=data/train
    image/get_image2num_frames.py --feat-dim 40 $train_dir
    image/get_allowed_lengths.py --frame-subsampling-factor 4 10 $train_dir

    for split in train test; do
      echo "$0: $(date) Extracting features, creating feats.scp file"
      local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/$split
      steps/compute_cmvn_stats.sh data/$split || exit 1
    done

    # Only the training directory is re-validated after feature extraction.
    image/fix_data_dir.sh $train_dir
  fi
  
  # Stage 3: collect the character inventory of the training transcripts and
  # learn the BPE codes (written to data/local/bpe.txt).
  if [ "$stage" -le 3 ]; then
    echo "$0: $(date) stage 3: BPE preparation"
    # This directory is otherwise only created by stage -1; create it here as
    # well so the stage does not fail when the recipe is started later.
    mkdir -p data/local/text/cleaned

    # Training transcripts without the utterance id (first field of each line).
    cut -d' ' -f2- data/train/text > data/local/text/cleaned/train.txt

    # getting non-silence phones: every distinct character that occurs in a
    # training word, one per line, in order of first appearance. The streams
    # are wrapped explicitly so the text is decoded as UTF-8 whatever the
    # locale is. The quoted "END" delimiter keeps the shell from expanding
    # anything inside the Python program (in particular the '\n' escape).
    python3 <(
  cat << "END"
  import io, sys
  infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
  output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
  phone_dict = dict()
  for line in infile:
      for word in line.split():
          for phone in word:
              phone_dict[phone] = phone
  for phone in phone_dict:
      output.write(phone + '\n')
  END
     ) < data/local/text/cleaned/train.txt > data/local/text/cleaned/phones.txt

    echo "$0: learning BPE..."
    # it is currently learned with only training text but we can also use all corpus text
    # to learn BPE. phones are added so that one isolated occurrence of every phone exists.
    cat data/local/text/cleaned/phones.txt data/local/text/cleaned/train.txt | \
      utils/lang/bpe/prepend_words.py | utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt || exit 1;
  fi
  
  # Stage 4: re-write the train/test transcripts and the LM text as BPE units.
  if [ "$stage" -le 4 ]; then
    echo "$0: $(date) stage 4: applying BPE..."

    # Filter (stdin -> stdout): runs the text through prepend_words.py and
    # apply_bpe.py with the codes in data/local/bpe.txt, then removes the "@@"
    # markers from the result.
    bpe_encode() {
      utils/lang/bpe/prepend_words.py | \
        utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \
        sed 's/@@//g'
    }

    echo "$0: applying BPE on train, test text..."
    # NOTE: this loop is not idempotent. It replaces data/<set>/text with the
    # BPE version and keeps the previous file as text.old, so running it twice
    # in a row encodes already-encoded text and overwrites text.old.
    for set in test train; do
      cut -d' ' -f1 data/$set/text > data/$set/ids
      cut -d' ' -f2- data/$set/text | bpe_encode > data/$set/bpe_text
      mv data/$set/text data/$set/text.old
      paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text
      rm -f data/$set/bpe_text data/$set/ids
    done

    echo "$0: applying BPE to corpus text..."
    # Input is redirected rather than piped from cat: a missing input file then
    # fails the command (and, under set -e, the script) instead of silently
    # producing an empty output file.
    bpe_encode < data/local/text/cleaned/corpus.txt \
      > data/local/text/cleaned/bpe_corpus.txt
    bpe_encode < data/local/text/cleaned/val.txt \
      > data/local/text/cleaned/bpe_val.txt
  fi
  
  # Stage 5: create the dictionary directory and build data/lang from it.
  if [ $stage -le 5 ]; then
    echo "$0: $(date) stage 5: Preparing dictionary and lang..."
    dict_dir=data/local/dict
    lang_dir=data/lang

    local/prepare_dict.sh --dir $dict_dir

    lang_opts=(
      --num-sil-states 4
      --num-nonsil-states 4
      --sil-prob 0.0
      --position-dependent-phones false
    )
    utils/prepare_lang.sh "${lang_opts[@]}" \
      $dict_dir "<sil>" $lang_dir/temp $lang_dir

    utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 $lang_dir
  fi
  
  # Stage 6: hand over to the flat-start chain recipe script.
  if [ $stage -le 6 ]; then
    printf '%s\n' "$0: $(date) stage 6: Calling the flat-start chain recipe..."
    local/chain/run_e2e_cnn.sh
  fi
  
  # Stage 7: align data/train with the model in exp/chain/e2e_cnn_1a; the
  # alignments are written to exp/chain/e2e_ali_train.
  if [ $stage -le 7 ]; then
    echo "$0: $(date) stage 7: Aligning the training data using the e2e chain model..."
    e2e_model_dir=exp/chain/e2e_cnn_1a
    ali_scale_opts='--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0'
    steps/nnet3/align.sh --nj $nj --cmd "$cmd" --scale-opts "$ali_scale_opts" \
      data/train data/lang $e2e_model_dir exp/chain/e2e_ali_train
  fi
  
  # Settings shared by chain training (stage 8) and the decoding stages:
  # the lang directory used to build decoding graphs, the lang directory used
  # for rescoring, and the chunk widths handed to the chain recipe.
  lang_decode=data/lang
  lang_rescore=${lang_decode}_rescore_6g
  chunk_width='340,300,200,100'

  # Stage 8: tree building and regular chain training, delegated to the
  # run_cnn_e2eali.sh recipe script.
  if [ $stage -le 8 ]; then
    echo "$0: $(date) stage 8: Building a tree and training a regular chain model using the e2e alignments..."
    local/chain/run_cnn_e2eali.sh --chunk_width $chunk_width
  fi
  
  # Stage 9: train the language model, install the small 6-gram into data/lang
  # for decoding and build the rescoring directory from the unpruned 6-gram.
  if [ $stage -le 9 ]; then
    echo "$0: $(date) stage 9: Estimating a language model for decoding..."
    local/train_lm.sh

    arpa_dir=data/local/local_lm/data/arpa
    # data/lang is both the input and the output directory here.
    utils/format_lm.sh data/lang $arpa_dir/6gram_small.arpa.gz \
      data/local/dict/lexicon.txt data/lang
    utils/build_const_arpa_lm.sh $arpa_dir/6gram_unpruned.arpa.gz \
      data/lang data/lang_rescore_6g
  fi
  
  # Stage 10: decode data/test with the end-to-end model (exp/chain/e2e_cnn_1a),
  # rescore the result with the $lang_rescore LM and print the results.
  # decode_e2e is executed as a command, so it must be the word true or false;
  # the ":-true" default keeps the stage enabled when the variable is not set
  # (an empty expansion used to succeed only by accident).
  if [ "$stage" -le 10 ] && ${decode_e2e:-true}; then
    echo "$0: $(date) stage 10: decoding end2end setup..."

    utils/mkgraph.sh \
      --self-loop-scale 1.0 "$lang_decode" \
      exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1;

    # Use the script-level job count (default 30) instead of a hard-coded 30
    # so that --nj also applies to decoding.
    steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
      --nj "$nj" --cmd "$cmd" --beam 12 \
      exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1;

    # The brace expansion yields the input dir (decode_test) followed by the
    # output dir (decode_test_rescored).
    steps/lmrescore_const_arpa.sh --cmd "$cmd" "$lang_decode" "$lang_rescore" \
                                  data/test exp/chain/e2e_cnn_1a/decode_test{,_rescored} || exit 1

    echo "$0: Done. Date: $(date). Results:"
    local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/
  fi
  
  # Stage 11: decode data/test with the chain model trained on the e2e
  # alignments (exp/chain/cnn_e2eali_1a), rescore with the $lang_rescore LM and
  # print the results.
  # decode_chain is executed as a command, so it must be the word true or
  # false; the ":-true" default keeps the stage enabled when it is not set.
  if [ "$stage" -le 11 ] && ${decode_chain:-true}; then
    echo "$0: $(date) stage 11: decoding chain alignment setup..."

    utils/mkgraph.sh \
      --self-loop-scale 1.0 "$lang_decode" \
      exp/chain/cnn_e2eali_1a/ exp/chain/cnn_e2eali_1a/graph || exit 1;

    # First entry of the comma-separated chunk_width list.
    frames_per_chunk=${chunk_width%%,*}
    # frames_per_chunk used to be computed and then dropped; pass it on so the
    # decoder does not fall back to its own default chunk size. The job count
    # follows the script-level --nj option (default 30).
    steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
      --nj "$nj" --cmd "$cmd" --beam 12 --frames-per-chunk "$frames_per_chunk" \
      exp/chain/cnn_e2eali_1a/graph data/test exp/chain/cnn_e2eali_1a/decode_test || exit 1;

    # The brace expansion yields the input dir (decode_test) followed by the
    # output dir (decode_test_rescored).
    steps/lmrescore_const_arpa.sh --cmd "$cmd" "$lang_decode" "$lang_rescore" \
                                  data/test exp/chain/cnn_e2eali_1a/decode_test{,_rescored} || exit 1

    echo "$0: Done. Date: $(date). Results:"
    local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a
  fi