#!/bin/bash
# Copyright 2018  Hossein Hadian
#                 Ashish Arora
#                 Jonathan Chang
# Apache 2.0
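# Example invocation (the default paths below point at JHU grid locations;
# override them for your own setup):
#   ./run_end2end.sh --nj 30 --slam_dir /path/to/SLAM --yomdle_dir /path/to/YOMDLE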
set -e
stage=0
nj=30
decode_e2e=true
decode_chain=true
language_main=Russian
slam_dir=/export/corpora5/slam/SLAM/
yomdle_dir=/export/corpora5/slam/YOMDLE/
corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/ru/
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
./local/check_tools.sh
# Start from stage=-2 for data preparation. This stage stores line images in
# data/download/truth_line_image, CSV files in data/download/truth_csv, and the
# {train,test,train_unsup} splits in data/local/splits.
if [ $stage -le -2 ]; then
echo "$0: $(date): preparing data, obtaining line images and csv files..."
local/yomdle/create_download_dir.sh --language_main $language_main \
--slam_dir $slam_dir --yomdle_dir $yomdle_dir
fi
if [ $stage -le -1 ]; then
echo "$0: $(date): getting corpus text for language modelling..."
mkdir -p data/local/text/cleaned
cat $corpus_dir/* > data/local/text/ru.txt
head -20000 data/local/text/ru.txt > data/local/text/cleaned/val.txt
tail -n +20000 data/local/text/ru.txt > data/local/text/cleaned/corpus.txt
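  # Optional sanity check: the two pieces should add back up to ru.txt:
  #   wc -l data/local/text/cleaned/{val,corpus}.txt data/local/text/ru.txt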
fi
mkdir -p data/{train,test}/data
if [ $stage -le 0 ]; then
echo "$0: stage 0: Processing train and test data.$(date)"
echo "$0: creating text, images.scp, utt2spk and spk2utt"
#local/prepare_data.sh data/download/
for set in train test; do
local/process_data.py data/download/ \
data/local/splits/${set}.txt data/${set}
image/fix_data_dir.sh data/${set}
done
fi
if [ $stage -le 1 ]; then
echo "$0: $(date) stage 1: getting allowed image widths for e2e training..."
image/get_image2num_frames.py --feat-dim 40 data/train
image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train
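  # get_allowed_lengths.py writes data/train/allowed_lengths.txt: a set of
  # permitted frame lengths spaced roughly 10% apart and compatible with the
  # frame-subsampling factor of 4; feature extraction below pads each line
  # image up to the nearest allowed length.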
  for set in train test; do
    echo "$0: $(date) Extracting features, creating feats.scp file"
    local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set}
    steps/compute_cmvn_stats.sh data/${set} || exit 1;
  done
  image/fix_data_dir.sh data/train
fi
if [ $stage -le 3 ]; then
echo "$0: $(date) stage 3: BPE preparation"
# getting non-silence phones.
cut -d' ' -f2- data/train/text | \
  python3 <(
cat << "END"
import sys, io

infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Collect every distinct character (each one becomes a "phone"); a dict is
# used so that first-seen order is preserved.
phone_dict = dict()
for line in infile:
    for word in line.strip().split():
        for phone in word:
            phone_dict[phone] = phone
for phone in phone_dict.keys():
    output.write(phone + '\n')
END
  ) > data/local/text/cleaned/phones.txt
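  # phones.txt now holds one distinct character per line (Cyrillic letters,
  # digits and any punctuation seen in the training transcripts).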
  cut -d' ' -f2- data/train/text > data/local/text/cleaned/train.txt
  echo "$0: learning BPE..."
  # BPE is currently learned from the training text only, but the corpus text
  # could be used as well. The phones are prepended so that at least one
  # isolated occurrence of every phone exists.
  cat data/local/text/cleaned/phones.txt data/local/text/cleaned/train.txt | \
    utils/lang/bpe/prepend_words.py | utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt || exit 1;
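  # Roughly: prepend_words.py rewrites "привет мир" as "|привет |мир" (with
  # '|' marking the start of each word), and learn_bpe.py -s 700 learns 700
  # merge operations over the resulting character sequences. (Example words
  # are illustrative only.)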
fi
if [ $stage -le 4 ]; then
echo "$0: $(date) stage 4: applying BPE..."
echo "$0: applying BPE on train, test text..."
  for set in test train; do
    cut -d' ' -f1 data/$set/text > data/$set/ids
    cut -d' ' -f2- data/$set/text | utils/lang/bpe/prepend_words.py | \
      utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \
      sed 's/@@//g' > data/$set/bpe_text
    mv data/$set/text data/$set/text.old
    paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text
    rm -f data/$set/bpe_text data/$set/ids
  done
  echo "$0: applying BPE to corpus text..."
  cat data/local/text/cleaned/corpus.txt | utils/lang/bpe/prepend_words.py | \
    utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \
    sed 's/@@//g' > data/local/text/cleaned/bpe_corpus.txt
  cat data/local/text/cleaned/val.txt | utils/lang/bpe/prepend_words.py | \
    utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \
    sed 's/@@//g' > data/local/text/cleaned/bpe_val.txt
fi
if [ $stage -le 5 ]; then
echo "$0: $(date) stage 5: Preparing dictionary and lang..."
local/prepare_dict.sh --dir data/local/dict
utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 4 --sil-prob 0.0 --position-dependent-phones false \
data/local/dict "<sil>" data/lang/temp data/lang
utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang
fi
if [ $stage -le 6 ]; then
echo "$0: $(date) stage 6: Calling the flat-start chain recipe..."
local/chain/run_e2e_cnn.sh
fi
if [ $stage -le 7 ]; then
echo "$0: $(date) stage 7: Aligning the training data using the e2e chain model..."
  steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
    --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \
    data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
fi
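# chunk_width is the frames-per-chunk spec for chain training and decoding:
# 340 is the principal chunk width and the remaining values are alternatives
# used to fit shorter utterances. lang_decode is the lang directory used to
# build the decoding graph; lang_rescore holds the const-arpa 6-gram LM used
# for lattice rescoring.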
chunk_width='340,300,200,100'
lang_decode=data/lang
lang_rescore=data/lang_rescore_6g
if [ $stage -le 8 ]; then
echo "$0: $(date) stage 8: Building a tree and training a regular chain model using the e2e alignments..."
local/chain/run_cnn_e2eali.sh --chunk_width $chunk_width
fi
if [ $stage -le 9 ]; then
echo "$0: $(date) stage 9: Estimating a language model for decoding..."
local/train_lm.sh
utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \
data/local/dict/lexicon.txt data/lang
utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \
data/lang data/lang_rescore_6g
fi
if [ $stage -le 10 ] && $decode_e2e; then
echo "$0: $(date) stage 10: decoding end2end setup..."
utils/mkgraph.sh \
--self-loop-scale 1.0 $lang_decode \
exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1;
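  # Chain decoding convention: decode with --acwt 1.0, then scale scores back
  # up with --post-decode-acwt 10.0 so the usual LM-weight range applies when
  # scoring the lattices.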
  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
    --nj $nj --cmd "$cmd" --beam 12 \
    exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1;
  steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
    data/test exp/chain/e2e_cnn_1a/decode_test{,_rescored} || exit 1
  echo "$0: Done. Date: $(date). Results:"
  local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/
fi
if [ $stage -le 11 ] && $decode_chain; then
echo "$0: $(date) stage 11: decoding chain alignment setup..."
utils/mkgraph.sh \
--self-loop-scale 1.0 $lang_decode \
exp/chain/cnn_e2eali_1a/ exp/chain/cnn_e2eali_1a/graph || exit 1;
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--nj 30 --cmd "$cmd" --beam 12 \
exp/chain/cnn_e2eali_1a/graph data/test exp/chain/cnn_e2eali_1a/decode_test || exit 1;
steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
data/test exp/chain/cnn_e2eali_1a/decode_test{,_rescored} || exit 1
echo "$0: Done. Date: $(date). Results:"
local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a
fi