run_end2end.sh
#!/bin/bash
# Copyright 2017 Hossein Hadian
set -e
stage=0
nj=20
username=
password=
process_aachen_split=false
overwrite=false
# iam_database points to the database path on the JHU grid. If you have not
# already downloaded the database you can set it to a local directory
# like "data/download" and follow the instructions
# in "local/prepare_data.sh" to download the database:
iam_database=/export/corpora5/handwriting_ocr/IAM
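# For example, to download the database to a local directory instead, you could
# set the following (illustrative; see local/prepare_data.sh for the download
# steps, which also need the --username and --password options above):
# iam_database=data/download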
# wellington_database points to the database path on the JHU grid. The Wellington
# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus).
# This corpus is of written NZ English that can be purchased here:
# "https://www.victoria.ac.nz/lals/resources/corpora-default"
wellington_database=/export/corpora5/Wellington/WWC/
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh
. ./utils/parse_options.sh # e.g. this parses the above options
# if supplied.
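# For example, assuming the standard Kaldi utils/parse_options.sh behavior,
# any of the variables defined above can be overridden on the command line,
# e.g. (illustrative values; --stage resumes the script from a later stage):
#   ./run_end2end.sh --stage 4 --nj 30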
./local/check_tools.sh
if [ $stage -le 0 ]; then
if [ -f data/train/text ] && ! $overwrite; then
echo "$0: Not processing, probably script have run from wrong stage"
echo "Exiting with status 1 to avoid data corruption"
exit 1;
fi
echo "$0: Preparing data..."
local/prepare_data.sh --download-dir "$iam_database" \
--wellington-dir "$wellington_database" \
--username "$username" --password "$password" \
--process_aachen_split $process_aachen_split
fi
mkdir -p data/{train,test}/data
if [ $stage -le 1 ]; then
echo "$(date) stage 1: getting allowed image widths for e2e training..."
image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command
# The next command creates an "allowed_lengths.txt" file in data/train,
# which will be used by local/make_features.py to ensure the images have
# one of the allowed lengths. The allowed lengths are spaced roughly 10% apart.
image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train
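# For reference, allowed_lengths.txt simply lists the permitted frame counts,
# one per line and spaced roughly 10% apart. A hypothetical excerpt (the actual
# values depend on your data and the subsampling factor):
#   head -3 data/train/allowed_lengths.txt
#   104
#   116
#   128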
echo "$(date) Extracting features, creating feats.scp file"
local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train
steps/compute_cmvn_stats.sh data/train || exit 1;
for set in val test; do
local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \
--feat-dim 40 data/${set}
steps/compute_cmvn_stats.sh data/${set} || exit 1;
done
utils/fix_data_dir.sh data/train
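# utils/fix_data_dir.sh above re-sorts and filters data/train so that all of
# its files remain consistent after feature extraction; an optional,
# illustrative sanity check:
#   utils/validate_data_dir.sh data/train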
fi
if [ $stage -le 2 ]; then
for set in train; do
echo "$(date) stage 2: Performing augmentation, it will double training data"
local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data
steps/compute_cmvn_stats.sh data/${set}_aug || exit 1;
done
fi
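# After this stage data/train_aug should contain roughly twice as many
# utterances as data/train; a quick, illustrative check:
#   wc -l data/train/text data/train_aug/text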
if [ $stage -le 3 ]; then
echo "$0: Preparing BPE..."
# getting non-silence phones.
cut -d' ' -f2- data/train/text | \
python3 <(
cat << "END"
import sys, io
infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
phone_dict = dict()
for line in infile:
    line_vect = line.strip().split()
    for word in line_vect:
        for phone in word:
            phone_dict[phone] = phone
for phone in phone_dict.keys():
    output.write(phone + '\n')
END
) > data/local/phones.txt
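# data/local/phones.txt now lists every distinct character (grapheme) seen in
# the training transcripts, one per line; it is prepended to the BPE training
# text below, presumably so that each character is available as a fallback unit:
#   wc -l data/local/phones.txt   # number of distinct characters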
cut -d' ' -f2- data/train/text > data/local/train_data.txt
cat data/local/phones.txt data/local/train_data.txt | \
utils/lang/bpe/prepend_words.py | \
utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt
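# The learned merges in data/local/bpe.txt are applied to every data set in the
# loop below. As an illustrative example (the actual segmentation depends on
# the learned merges), a single line can be encoded with the same pipeline:
#   echo "looking forward" | utils/lang/bpe/prepend_words.py | \
#     utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | sed 's/@@//g'
# prepend_words.py marks word boundaries with a '|' prefix, apply_bpe.py splits
# words into subword units joined by '@@ ', and the sed command strips the '@@'
# markers, leaving space-separated subword tokens such as '|look ing |for ward'.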
for set in test train val train_aug; do
cut -d' ' -f1 data/$set/text > data/$set/ids
cut -d' ' -f2- data/$set/text | \
utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \
| sed 's/@@//g' > data/$set/bpe_text
mv data/$set/text data/$set/text.old
paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text
done
fi
if [ $stage -le 4 ]; then
echo "$0: Estimating a language model for decoding..."
local/train_lm.sh
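# local/train_lm.sh is expected to leave ARPA language models under
# data/local/local_lm/data/arpa/ (6gram_big.arpa.gz and 6gram_unpruned.arpa.gz),
# which stage 5 below turns into the decoding and rescoring lang directories.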
fi
if [ $stage -le 5 ]; then
echo "$0: Preparing dictionary and lang..."
local/prepare_dict.sh
# This recipe uses byte-pair encoding; the silences are part of the words'
# pronunciations, so we set --sil-prob to 0.0.
utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \
data/local/dict "<sil>" data/lang/temp data/lang
silphonelist=`cat data/lang/phones/silence.csl`
nonsilphonelist=`cat data/lang/phones/nonsilence.csl`
local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo
utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang
utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \
data/local/dict/lexicon.txt data/lang
utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \
data/lang data/lang_rescore_6g
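# After this stage, data/lang contains a G.fst compiled from 6gram_big.arpa.gz
# for first-pass decoding, while data/lang_rescore_6g holds a const-ARPA form
# of 6gram_unpruned.arpa.gz, typically used for lattice rescoring.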
fi
if [ $stage -le 6 ]; then
echo "$0: Calling the flat-start chain recipe..."
local/chain/run_e2e_cnn.sh --train_set train_aug
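# The flat-start (end-to-end) chain system is expected to end up in
# exp/chain/e2e_cnn_1b, which stage 7 below uses to align the training data.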
fi
if [ $stage -le 7 ]; then
echo "$0: Aligning the training data using the e2e chain model..."
steps/nnet3/align.sh --nj 50 --cmd "$cmd" \
--use-gpu false \
--scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \
data/train_aug data/lang exp/chain/e2e_cnn_1b exp/chain/e2e_ali_train
fi
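# The alignments in exp/chain/e2e_ali_train are the "e2e alignments" that the
# tree building and regular chain training in the next stage rely on (see
# local/chain/run_cnn_e2eali.sh for the exact directories it expects).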
if [ $stage -le 8 ]; then
echo "$0: Building a tree and training a regular chain model using the e2e alignments..."
local/chain/run_cnn_e2eali.sh --train_set train_aug
fi