Blame view
egs/iam/v2/local/prepare_data.sh
7.35 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
#!/bin/bash
# Copyright  2017  Chun Chieh Chang
#            2017  Ashish Arora
#            2017  Hossein Hadian
# Apache 2.0

# This script downloads the IAM handwriting database and prepares the training
# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py.
# It also downloads the LOB and Brown text corpora. It downloads the database files
# only if they do not already exist in download directory.

# Eg. local/prepare_data.sh
# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from
#     utt2spk file: 000_a01-000u-00 000
#     images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png
#     spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03

stage=0
download_dir=data/download
process_aachen_split=false
wellington_dir=
username=
password=
# username and password for downloading the IAM database
# if you have not already downloaded the database, please
# register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database
# and provide this script with your username and password.

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;

# Without credentials the IAM files cannot be fetched (the site requires
# registration), so bail out early with an explanatory message.
if [[ ! -f $download_dir/lines.tgz && -z $username ]]; then
  echo "$0: Warning: Couldn't find lines.tgz in $download_dir. Unless the extracted dataset files"
  echo "exist in your data/local directory this script will fail because the required files"
  echo "can't be downloaded automatically (it needs registration)."
  echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database"
  echo "... and then call this script again with --username <username> --password <password>"
  echo ""
  exit 1
fi

# Destination directories and download URLs for each piece of the corpus.
lines=data/local/lines
xml=data/local/xml
ascii=data/local/ascii
bcorpus=data/local/browncorpus
lobcorpus=data/local/lobcorpus
wcorpus=data/local/wellingtoncorpus
data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask
lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz
xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz
data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip
ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz
brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt
lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip
wellington_corpus_loc=/export/corpora5/Wellington/WWC/
aachen_split_url=http://www.openslr.org/resources/56/splits.zip
aachen_splits=data/local/aachensplits
mkdir -p "$download_dir" data/local

# download and extract images and transcription
if [ -d "$lines" ]; then
  echo "$0: Not downloading lines images as it is already there."
else
  if [ ! -f "$download_dir/lines.tgz" ]; then
    echo "$0: Trying to download lines images..."
    wget -P "$download_dir" --user "$username" --password "$password" "$lines_url" || exit 1;
  fi
  mkdir -p "$lines"
  tar -xzf "$download_dir/lines.tgz" -C "$lines" || exit 1;
  echo "$0: Done downloading and extracting lines images"
fi

if [ -d "$xml" ]; then
  echo "$0: Not downloading transcriptions as it is already there."
else
  if [ ! -f "$download_dir/xml.tgz" ]; then
    echo "$0: Trying to download transcriptions..."
    wget -P "$download_dir" --user "$username" --password "$password" "$xml_url" || exit 1;
  fi
  mkdir -p "$xml"
  tar -xzf "$download_dir/xml.tgz" -C "$xml" || exit 1;
  echo "$0: Done downloading and extracting transcriptions."
fi

if [ -d "$data_split_info" ]; then
  echo "$0: Not downloading data split information as it is already there."
else
  if [ ! -f "$download_dir/largeWriterIndependentTextLineRecognitionTask.zip" ]; then
    echo "$0: Trying to download training and testing data split information..."
    wget -P "$download_dir" --user "$username" --password "$password" "$data_split_info_url" || exit 1;
  fi
  mkdir -p "$data_split_info"
  unzip "$download_dir/largeWriterIndependentTextLineRecognitionTask.zip" -d "$data_split_info" || exit 1;
  echo "$0: Done downloading and extracting training and testing data split information"
fi

if [ -d "$ascii" ]; then
  echo "$0: Not downloading ascii.tgz as it is already there."
else
  if [ ! -f "$download_dir/ascii.tgz" ]; then
    echo "$0: trying to download ascii.tgz..."
    wget -P "$download_dir" --user "$username" --password "$password" "$ascii_url" || exit 1;
  fi
  mkdir -p "$ascii"
  tar -xzf "$download_dir/ascii.tgz" -C "$ascii" || exit 1;
  echo "$0: Done downloading and extracting ascii.tgz"
fi

# The LOB and Brown corpora are free downloads (no credentials needed); they
# are used later for language modeling.
if [ -d "$lobcorpus" ]; then
  echo "$0: Not downloading the LOB text corpus as it is already there."
else
  if [ ! -f "$lobcorpus/0167.zip" ]; then
    echo "$0: Downloading the LOB text corpus ..."
    mkdir -p "$lobcorpus"
    wget -P "$lobcorpus/" "$lob_corpus_url" || exit 1;
  fi
  unzip "$lobcorpus/0167.zip" -d "$lobcorpus" || exit 1;
  echo "$0: Done downloading and extracting LOB corpus"
fi

if [ -d "$bcorpus" ]; then
  echo "$0: Not downloading the Brown corpus as it is already there."
else
  if [ ! -f "$bcorpus/brown.txt" ]; then
    mkdir -p "$bcorpus"
    echo "$0: Downloading the Brown text corpus..."
    wget -P "$bcorpus" "$brown_corpus_url" || exit 1;
  fi
  echo "$0: Done downloading the Brown text corpus"
fi

# The Wellington corpus is not freely downloadable; it is only copied in if
# the caller points --wellington-dir at a local copy.
if [ -d "$wcorpus" ]; then
  echo "$0: Not copying Wellington corpus as it is already there."
elif [ -n "$wellington_dir" ]; then
  mkdir -p "$wcorpus"
  cp -r "$wellington_dir/." "$wcorpus"
  # Combine Wellington corpora and replace some of their annotations
  cat data/local/wellingtoncorpus/Section{A,B,C,D,E,F,G,H,J,K,L}.txt | \
    cut -d' ' -f3- | sed "s/^[ \t]*//" > data/local/wellingtoncorpus/Wellington_annotated.txt
  local/remove_wellington_annotations.py \
    < data/local/wellingtoncorpus/Wellington_annotated.txt \
    > data/local/wellingtoncorpus/Wellington_annotation_removed.txt
  echo "$0: Done copying Wellington corpus"
else
  echo "$0: Wellington Corpus not included because wellington_dir not provided"
fi

if [ -d "$aachen_splits" ]; then
  echo "$0: Not downloading the Aachen splits as it is already there."
else
  if [ ! -f "$aachen_splits/splits.zip" ]; then
    echo "$0: Downloading Aachen splits ..."
    mkdir -p "$aachen_splits"
    wget -P "$aachen_splits/" "$aachen_split_url" || exit 1;
  fi
  unzip "$aachen_splits/splits.zip" -d "$aachen_splits" || exit 1;
  echo "$0: Done downloading and extracting Aachen splits"
fi

# Build the train/test/validation utterance lists from the official IAM
# writer-independent split (the two validation sets are merged into one).
mkdir -p data/{train,test,val}
file_name=largeWriterIndependentTextLineRecognitionTask

train_old="data/local/$file_name/trainset.txt"
test_old="data/local/$file_name/testset.txt"
val1_old="data/local/$file_name/validationset1.txt"
val2_old="data/local/$file_name/validationset2.txt"

train_new="data/local/train.uttlist"
test_new="data/local/test.uttlist"
val_new="data/local/validation.uttlist"

cat "$train_old" > "$train_new"
cat "$test_old" > "$test_new"
cat "$val1_old" "$val2_old" > "$val_new"

# Generate text, images.scp, utt2spk (and spk2utt) for each data directory,
# using either the Aachen split or the official IAM split.
if $process_aachen_split; then
  local/process_aachen_splits.py data/local "$aachen_splits/splits" data/train --dataset train || exit 1
  local/process_aachen_splits.py data/local "$aachen_splits/splits" data/test --dataset test || exit 1
  local/process_aachen_splits.py data/local "$aachen_splits/splits" data/val --dataset validation || exit 1
else
  local/process_data.py data/local data/train --dataset train || exit 1
  local/process_data.py data/local data/test --dataset test || exit 1
  local/process_data.py data/local data/val --dataset validation || exit 1
fi

image/fix_data_dir.sh data/train
image/fix_data_dir.sh data/test
image/fix_data_dir.sh data/val