run_text_localization.sh
5.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/bin/bash
# Copyright 2017 Hossein Hadian
# 2018 Ashish Arora
# This script performs full page text recognition on automatically extracted line images
# from madcat arabic data. It is created as a separate script, because it performs
# data augmentation, uses a smaller language model and calls process_waldo_data for
# test images (automatically extracted line images). Data augmentation increases image
# height and hence requires a different DNN architecture and different chain scripts.
# Stop at the first command that returns a non-zero status.
set -e

# ---- Run-control options ------------------------------------------------------
# All variables assigned before utils/parse_options.sh is sourced (below) can be
# given on the command line; parse_options.sh parses them if supplied.
stage=0
nj=70
overwrite=false
subset=true
augment=true
verticle_shift=16

# ---- Corpus locations -----------------------------------------------------------
# download_dir{1,2,3} point to the database path on the JHU grid. If the database
# has not been downloaded yet, they can be set to a local directory instead.
# The corpus can be purchased here:
# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/}
download_dir1=/export/corpora/LDC/LDC2012T15/data
download_dir2=/export/corpora/LDC/LDC2013T09/data
download_dir3=/export/corpora/LDC/LDC2013T15/data
writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab

# ---- Working directories --------------------------------------------------------
data_splits_dir=data/download/data_splits
images_scp_dir=data/local

# cmd.sh relates to the queue; change it to something that works on your system.
. ./cmd.sh
. ./path.sh
# Parses the options above, if supplied.
. ./utils/parse_options.sh

./local/check_tools.sh

# Create the per-split output directories used by the stages below.
mkdir -p data/train/data data/test/data data/dev/data
mkdir -p data/local/train data/local/test data/local/dev
# Stage 0: download the data splits, extract line images for train and dev,
# run local/process_data.py on both sets, and prepare data/test from the
# line-image transcription mapping file.
if [ "$stage" -le 0 ]; then
  # Refuse to run when data/train/text already exists, unless overwrite is
  # "true". A string comparison is used so the option value is never executed
  # as a command.
  if [ -f data/train/text ] && [ "$overwrite" != true ]; then
    echo "$0: Not processing, probably script have run from wrong stage"
    echo "Exiting with status 1 to avoid data corruption"
    exit 1;
  fi

  echo "$0: Downloading data splits...$(date)"
  local/download_data.sh --data_splits "$data_splits_dir" --download_dir1 "$download_dir1" \
    --download_dir2 "$download_dir2" --download_dir3 "$download_dir3"

  for set in train dev; do
    data_split_file=$data_splits_dir/madcat.$set.raw.lineid
    # "$cmd" has to be quoted: a value containing spaces (for example a queue
    # command followed by its own options) would otherwise be split into
    # several arguments. The paths are quoted for the same reason.
    local/extract_lines.sh --nj "$nj" --cmd "$cmd" --data_split_file "$data_split_file" \
      --download_dir1 "$download_dir1" --download_dir2 "$download_dir2" \
      --download_dir3 "$download_dir3" --writing_condition1 "$writing_condition1" \
      --writing_condition2 "$writing_condition2" --writing_condition3 "$writing_condition3" \
      --data "data/local/$set" --subset "$subset" --augment "$augment" || exit 1
  done

  echo "$0: Preparing data..."
  for set in dev train; do
    local/process_data.py "$download_dir1" "$download_dir2" "$download_dir3" \
      "$data_splits_dir/madcat.$set.raw.lineid" "data/$set" "$images_scp_dir/$set/images.scp" \
      "$writing_condition1" "$writing_condition2" "$writing_condition3" \
      --augment "$augment" --subset "$subset"
    image/fix_data_dir.sh "data/${set}"
  done

  # The test set is not extracted here; it is built from the hypothesis line
  # mapping file under lines/, then spk2utt is derived from its utt2spk.
  local/tl/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test
  utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt
fi
# Stage 1: compute image-length information on the training set, then extract
# features and CMVN statistics for dev, train and test.
if [ "$stage" -le 1 ]; then
  echo "$0: Obtaining image groups. calling get_image2num_frames $(date)."
  image/get_image2num_frames.py data/train
  image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train

  for set in dev train test; do
    echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. $(date)"
    # "$cmd" is quoted so that a multi-word value reaches the script as one
    # argument, matching how the later stages pass it.
    local/extract_features.sh --nj "$nj" --cmd "$cmd" --feat-dim 40 \
      --verticle_shift "$verticle_shift" "data/$set"
    steps/compute_cmvn_stats.sh "data/$set" || exit 1;
  done

  echo "$0: Fixing data directory for train dataset $(date)."
  image/fix_data_dir.sh data/train
fi
# Stage 2: augment the training set into data/train_aug and compute CMVN
# statistics for the augmented set.
if [ "$stage" -le 2 ]; then
  for set in train; do
    echo "$(date) stage 2: Performing augmentation, it will double training data"
    # Option values come from the command line, so they are quoted to keep each
    # one a single argument.
    local/tl/augment_data.sh --nj "$nj" --cmd "$cmd" --feat-dim 40 \
      --verticle_shift "$verticle_shift" "data/${set}" "data/${set}_aug" data
    steps/compute_cmvn_stats.sh "data/${set}_aug" || exit 1;
  done
fi
# Stage 3: learn a BPE model from the training transcripts, rewrite the text
# file of every data set with its BPE-segmented version, then build the
# dictionary and the lang directory.
# NOTE: this stage replaces data/<set>/text in place and moves the previous
# file to text.old, so running it a second time overwrites text.old.
if [ "$stage" -le 3 ]; then
  echo "$0: Preparing BPE..."
  # Field 1 of each text line (the id) is dropped before learning BPE.
  cut -d' ' -f2- data/train/text \
    | utils/lang/bpe/reverse.py \
    | utils/lang/bpe/prepend_words.py \
    | utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt

  for part in test train dev train_aug; do
    part_dir=data/$part
    # Split each line into its id (field 1) and its transcript (the rest),
    # segment the transcript, then glue the two columns back together.
    cut -d' ' -f1 $part_dir/text > $part_dir/ids
    cut -d' ' -f2- $part_dir/text \
      | utils/lang/bpe/reverse.py \
      | utils/lang/bpe/prepend_words.py \
      | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \
      | sed 's/@@//g' > $part_dir/bpe_text
    mv $part_dir/text $part_dir/text.old
    paste -d' ' $part_dir/ids $part_dir/bpe_text > $part_dir/text
    rm -f $part_dir/bpe_text $part_dir/ids
  done

  echo "$0:Preparing dictionary and lang..."
  local/prepare_dict.sh
  lang_opts=(--num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false)
  utils/prepare_lang.sh "${lang_opts[@]}" \
    data/local/dict "<sil>" data/lang/temp data/lang
  utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang
fi
# Stage 4: train the language model (order 3) and format it into data/lang.
if [ "$stage" -le 4 ]; then
  echo "$0: Estimating a language model for decoding..."
  local/tl/train_lm.sh --order 3

  # ARPA file passed on to utils/format_lm.sh.
  arpa_lm=data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz
  utils/format_lm.sh data/lang "$arpa_lm" data/local/dict/lexicon.txt data/lang
fi
# From this point on the job count is fixed at 30. This assignment replaces the
# earlier value of nj, including one given with --nj on the command line.
nj=30

# Stage 5: run the flat-start (end-to-end) chain recipe on the augmented
# training set.
if [ "$stage" -le 5 ]; then
  echo "$0: Calling the flat-start chain recipe... $(date)."
  e2e_args=(--nj "$nj" --train_set train_aug)
  local/tl/chain/run_e2e_cnn.sh "${e2e_args[@]}"
fi
# Stage 6: align data/train_aug with the model in exp/chain/e2e_cnn_1a and
# write the alignments to exp/chain/e2e_ali_train (GPU use is disabled).
if [ "$stage" -le 6 ]; then
  echo "$0: Aligning the training data using the e2e chain model...$(date)."
  # All three scales are passed to the aligner as one --scale-opts argument.
  scale_opts='--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0'
  steps/nnet3/align.sh --nj "$nj" --cmd "$cmd" --use-gpu false \
    --scale-opts "$scale_opts" \
    data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
fi
# Stage 7: build a tree and train a regular chain model on the augmented
# training set, using the alignments from the end-to-end model.
if [ "$stage" -le 7 ]; then
  echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)"
  chain_args=(--nj "$nj" --train_set train_aug)
  local/tl/chain/run_cnn_e2eali.sh "${chain_args[@]}"
fi