Blame view

egs/yomdle_tamil/v1/run_end2end.sh 6.09 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
  #!/bin/bash

  # Copyright 2018    Hossein Hadian
  #                   Ashish Arora
  #                   Jonathan Chang
  # Apache 2.0

  # End-to-end OCR recipe for YOMDLE Tamil: data preparation, feature
  # extraction, BPE segmentation, lang/LM preparation and chain training.
  # Resume from a given point with: ./run_end2end.sh --stage N

  set -e
  stage=0   # first stage to run; earlier stages are skipped
  nj=30     # number of parallel jobs for feature extraction / alignment

  # Site-specific corpus locations (overridable via command-line options).
  language_main=Tamil
  slam_dir=/export/corpora5/slam/SLAM/
  yomdle_dir=/export/corpora5/slam/YOMDLE/
  corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/ta/

  . ./cmd.sh   # presumably defines $cmd used by the stages below — per Kaldi convention
  . ./path.sh  # sets up PATH so Kaldi binaries/scripts are found
  . ./utils/parse_options.sh  # turns --name value args into the variables above

  ./local/check_tools.sh
  # Start from stage=-2 for data preparation. This stage stores line images,
  # csv files and splits{train,test,train_unsup} data/download/truth_line_image,
  # data/download/truth_csv and data/local/splits respectively.
  if [ $stage -le -2 ]; then
    echo "$(date): preparing data, obtaining line images and csv files..."
    # Quote the directory arguments so paths containing spaces or glob
    # characters are passed through intact (shellcheck SC2086).
    local/yomdle/create_download_dir.sh --language_main "$language_main" \
      --slam_dir "$slam_dir" --yomdle_dir "$yomdle_dir"
  fi
  
  if [ $stage -le -1 ]; then
    echo "$(date): getting corpus text for language modelling..."
    mkdir -p data/local/text/cleaned
    # Concatenate all corpus files into a single text file.
    cat "$corpus_dir"/* > data/local/text/ta.txt
    # Split into a 20000-line validation set and the remaining LM corpus.
    # Use 'tail -n +20001' (start at line 20001): the previous '+20000'
    # started AT line 20000, duplicating that line in both val.txt and
    # corpus.txt (off-by-one overlap).
    head -20000 data/local/text/ta.txt > data/local/text/val.txt
    tail -n +20001 data/local/text/ta.txt > data/local/text/corpus.txt
  fi
  
  mkdir -p data/{train,test}/data
  if [ $stage -le 0 ]; then
    echo "$(date) stage 0: Processing train and test data."
    echo " creating text, images.scp, utt2spk and spk2utt"
    # Build the Kaldi data directories; fix_data_dir.sh then drops line
    # images with empty transcriptions, which would otherwise break the
    # BPE step later on.
    for dataset in train test; do
      local/process_data.py data/download/ \
        "data/local/splits/${dataset}.txt" "data/${dataset}"
      image/fix_data_dir.sh "data/${dataset}"
    done
  fi
  
  if [ $stage -le 1 ]; then
    echo "$(date) stage 1: getting allowed image widths for e2e training..."
    image/get_image2num_frames.py --feat-dim 40 data/train
    image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train
    # Extract 40-dim features and compute CMVN stats for each data set.
    for dataset in train test; do
      echo "$(date) Extracting features, creating feats.scp file"
      local/extract_features.sh --nj "$nj" --cmd "$cmd" --feat-dim 40 "data/${dataset}"
      steps/compute_cmvn_stats.sh "data/${dataset}" || exit 1;
    done
    image/fix_data_dir.sh data/train
  fi
  
  if [ $stage -le 2 ]; then
    # Only the training set is augmented (the original single-iteration
    # loop over "train" is unrolled here); augmentation doubles its size.
    echo "$(date) stage 2: Performing augmentation, it will double training data"
    local/augment_data.sh --nj "$nj" --cmd "$cmd" --feat-dim 40 data/train data/train_aug data
    steps/compute_cmvn_stats.sh data/train_aug || exit 1;
  fi
  
  if [ $stage -le 3 ]; then
    echo "$(date) stage 3: BPE preparation"
    # getting non-silence phones: every distinct character occurring in the
    # training transcripts is emitted once, one per line. The write call
    # below uses the escape sequence '\n'; the string literal had been
    # broken across two physical lines, which is a Python SyntaxError.
    cut -d' ' -f2- data/train/text | \
  python3 <(
  cat << "END"
  import os, sys, io;
  infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8');
  output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8');
  phone_dict = dict();
  for line in infile:
      line_vect = line.strip().split();
      for word in line_vect:
          for phone in word:
              phone_dict[phone] = phone;

  for phone in phone_dict.keys():
        output.write(phone + '\n');
  END
     ) > data/local/text/cleaned/phones.txt

    cut -d' ' -f2- data/train/text > data/local/text/cleaned/train.txt

    echo "Processing corpus text..."
    # we are removing the lines from the corpus which have
    # phones other than the phones in data/local/text/cleaned/phones.txt.
    cat data/local/text/corpus.txt | \
      local/process_corpus.py > data/local/text/cleaned/corpus.txt
    cat data/local/text/val.txt | \
      local/process_corpus.py > data/local/text/cleaned/val.txt

    echo "learning BPE..."
    # it is currently learned with only training text but we can also use all corpus text
    # to learn BPE. phones are added so that one isolated occurrence of every phone exists.
    cat data/local/text/cleaned/phones.txt data/local/text/cleaned/train.txt | \
      utils/lang/bpe/prepend_words.py | utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt || exit 1;
  fi
  
  if [ $stage -le 4 ]; then
    echo "$(date) stage 4: applying BPE..."
    echo "applying BPE on train, test text..."
    # Rewrite each data set's text file with BPE-segmented transcripts while
    # keeping the utterance ids in column 1; the original text is preserved
    # as text.old.
    for dataset in test train train_aug; do
      cut -d' ' -f1 "data/$dataset/text" > "data/$dataset/ids"
      cut -d' ' -f2- "data/$dataset/text" | utils/lang/bpe/prepend_words.py | \
        utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \
        sed 's/@@//g' > "data/$dataset/bpe_text"
      mv "data/$dataset/text" "data/$dataset/text.old"
      paste -d' ' "data/$dataset/ids" "data/$dataset/bpe_text" > "data/$dataset/text"
      rm -f "data/$dataset/bpe_text" "data/$dataset/ids"
    done

    echo "applying BPE to corpus text..."
    # Segment the LM corpus and validation text with the same BPE codes.
    cat data/local/text/cleaned/corpus.txt | utils/lang/bpe/prepend_words.py | \
      utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \
      sed 's/@@//g' > data/local/text/cleaned/bpe_corpus.txt
    cat data/local/text/cleaned/val.txt | utils/lang/bpe/prepend_words.py | \
      utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \
      sed 's/@@//g' > data/local/text/cleaned/bpe_val.txt
  fi
  
  if [ $stage -le 5 ]; then
    echo "$(date) stage 5: Preparing dictionary and lang..."
    # Build the lexicon and phone lists under data/local/dict.
    local/prepare_dict.sh --dir data/local/dict
    # Create the lang directory; "<sil>" is passed as the second positional
    # argument (the OOV symbol, per Kaldi's prepare_lang.sh convention).
    utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \
      data/local/dict "<sil>" data/lang/temp data/lang
    # Allow an optional silence at the end of each utterance (prob 0.5).
    utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang
  fi
  
  if [ $stage -le 6 ]; then
    echo "$(date) stage 6: Estimating a language model for decoding..."
    local/train_lm.sh
    # Small (pruned) 6-gram LM goes into data/lang for first-pass decoding.
    utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \
                       data/local/dict/lexicon.txt data/lang
    # Unpruned 6-gram is compiled to const-arpa format for rescoring.
    utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \
                                 data/lang data/lang_rescore_6g
  fi
  
  if [ $stage -le 7 ]; then
    echo "$(date) stage 7: Calling the flat-start chain recipe..."
    # Trains the end-to-end CNN chain model on the augmented training set.
    local/chain/run_e2e_cnn.sh --train_set train_aug
  fi
  
  if [ $stage -le 8 ]; then
    echo "$(date) stage 8: Aligning the training data using the e2e chain model..."
    # All three scales are set to 1.0 when aligning with the chain model;
    # alignment runs on CPU (--use-gpu false) across $nj jobs.
    steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
      --use-gpu false \
      --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \
      data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
  fi
  
  if [ $stage -le 9 ]; then
    echo "$(date) stage 9: Building a tree and training a regular chain model using the e2e alignments..."
    # Uses the alignments from stage 8 (exp/chain/e2e_ali_train) to train
    # a regular CNN chain model on the augmented training set.
    local/chain/run_cnn_e2eali.sh --train_set train_aug
  fi