egs/madcat_ar/v1/local/tl/run_text_localization.sh
#!/bin/bash
# Copyright 2017  Hossein Hadian
#           2018  Ashish Arora

# This script performs full-page text recognition on automatically extracted line
# images from MADCAT Arabic data. It is kept as a separate script because it performs
# data augmentation, uses a smaller language model, and calls process_waldo_data for
# the test images (automatically extracted line images). Data augmentation increases
# the image height and hence requires a different DNN architecture and different
# chain scripts.

set -e
stage=0
nj=70
# download_dir{1,2,3} point to the database paths on the JHU grid. If you have not
# already downloaded the database you can set them to local directories.
# This corpus can be purchased here:
# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/}
download_dir1=/export/corpora/LDC/LDC2012T15/data
download_dir2=/export/corpora/LDC/LDC2013T09/data
download_dir3=/export/corpora/LDC/LDC2013T15/data
writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab
data_splits_dir=data/download/data_splits
images_scp_dir=data/local
overwrite=false
subset=true
augment=true
verticle_shift=16

. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
           ## This relates to the queue.
. ./path.sh
. ./utils/parse_options.sh  # e.g. this parses the above options
                            # if supplied.

./local/check_tools.sh

mkdir -p data/{train,test,dev}/data
mkdir -p data/local/{train,test,dev}

if [ $stage -le 0 ]; then
  if [ -f data/train/text ] && ! $overwrite; then
    echo "$0: Not processing; the script has probably been run from the wrong stage."
    echo "Exiting with status 1 to avoid data corruption."
    exit 1;
  fi

  echo "$0: Downloading data splits...$(date)"
  local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
    --download_dir2 $download_dir2 --download_dir3 $download_dir3

  for set in train dev; do
    data_split_file=$data_splits_dir/madcat.$set.raw.lineid
    local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \
      --download_dir1 $download_dir1 --download_dir2 $download_dir2 \
      --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \
      --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \
      --data data/local/$set --subset $subset --augment $augment || exit 1
  done

  echo "$0: Preparing data..."
  for set in dev train; do
    local/process_data.py $download_dir1 $download_dir2 $download_dir3 \
      $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \
      $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset
    image/fix_data_dir.sh data/${set}
  done

  local/tl/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test
  utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt
fi

if [ $stage -le 1 ]; then
  echo "$0: Obtaining image groups, calling get_image2num_frames $(date)."
  image/get_image2num_frames.py data/train
  image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train

  for set in dev train test; do
    echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. $(date)"
    local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 \
      --verticle_shift $verticle_shift data/$set
    steps/compute_cmvn_stats.sh data/$set || exit 1;
  done

  echo "$0: Fixing data directory for train dataset $(date)."
  image/fix_data_dir.sh data/train
fi

if [ $stage -le 2 ]; then
  for set in train; do
    echo "$(date) stage 2: Performing augmentation; this will double the training data."
    local/tl/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 \
      --verticle_shift $verticle_shift data/${set} data/${set}_aug data
    steps/compute_cmvn_stats.sh data/${set}_aug || exit 1;
  done
fi

if [ $stage -le 3 ]; then
  echo "$0: Preparing BPE..."
  cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \
    utils/lang/bpe/prepend_words.py | \
    utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt

  for set in test train dev train_aug; do
    cut -d' ' -f1 data/$set/text > data/$set/ids
    cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \
      utils/lang/bpe/prepend_words.py | \
      utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \
      | sed 's/@@//g' > data/$set/bpe_text

    mv data/$set/text data/$set/text.old
    paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text
    rm -f data/$set/bpe_text data/$set/ids
  done

  echo "$0: Preparing dictionary and lang..."
  local/prepare_dict.sh
  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \
    data/local/dict "<sil>" data/lang/temp data/lang
  utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang
fi

if [ $stage -le 4 ]; then
  echo "$0: Estimating a language model for decoding..."
  local/tl/train_lm.sh --order 3
  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \
    data/local/dict/lexicon.txt data/lang
fi

nj=30
if [ $stage -le 5 ]; then
  echo "$0: Calling the flat-start chain recipe... $(date)."
  local/tl/chain/run_e2e_cnn.sh --nj $nj --train_set train_aug
fi

if [ $stage -le 6 ]; then
  echo "$0: Aligning the training data using the e2e chain model...$(date)."
  steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
    --use-gpu false \
    --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \
    data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
fi

if [ $stage -le 7 ]; then
  echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)"
  local/tl/chain/run_cnn_e2eali.sh --nj $nj --train_set train_aug
fi
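
# Example invocation (a sketch, not part of the recipe itself): the corpus paths
# below are placeholders for local copies of the three LDC releases, and the
# --nj/--stage values are arbitrary. All of these options correspond to the
# variables defined above and are parsed by utils/parse_options.sh.
#
#   local/tl/run_text_localization.sh --nj 50 --stage 0 \
#     --download_dir1 /path/to/LDC2012T15/data \
#     --download_dir2 /path/to/LDC2013T09/data \
#     --download_dir3 /path/to/LDC2013T15/data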