Blame view

egs/iam/v2/local/prepare_data.sh 7.35 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
  #!/bin/bash
  
  # Copyright      2017  Chun Chieh Chang
  #                2017  Ashish Arora
  #                2017  Hossein Hadian
  # Apache 2.0
  
  # This script downloads the IAM handwriting database and prepares the training
  # and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py.
  # It also downloads the LOB and Brown text corpora. It downloads the database files
  # only if they do not already exist in download directory.
  
  #  Eg. local/prepare_data.sh
  #  Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from
  #      utt2spk file: 000_a01-000u-00 000
  #      images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png
  #      spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03
  
  stage=0
  download_dir=data/download
  process_aachen_split=false
  wellington_dir=
  username=
  password=       # username and password for downloading the IAM database
                  # if you have not already downloaded the database, please
                  # register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database
                  # and provide this script with your username and password.
  
  . ./cmd.sh
  . ./path.sh
  . ./utils/parse_options.sh || exit 1;
  
  if [[ ! -f $download_dir/lines.tgz && -z $username ]]; then
    echo "$0: Warning: Couldn't find lines.tgz in $download_dir. Unless the extracted dataset files"
    echo "exist in your data/local directory this script will fail because the required files"
    echo "can't be downloaded automatically (it needs registration)."
    echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database"
    echo "... and then call this script again with --username <username> --password <password>"
    echo ""
    exit 1
  fi
  
  lines=data/local/lines
  xml=data/local/xml
  ascii=data/local/ascii
  bcorpus=data/local/browncorpus
  lobcorpus=data/local/lobcorpus
  wcorpus=data/local/wellingtoncorpus
  data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask
  lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz
  xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz
  data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip
  ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz
  brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt
  lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip
  wellington_corpus_loc=/export/corpora5/Wellington/WWC/
  aachen_split_url=http://www.openslr.org/resources/56/splits.zip
  aachen_splits=data/local/aachensplits
  mkdir -p $download_dir data/local
  
# download and extract the images and transcriptions
  if [ -d $lines ]; then
    echo "$0: Not downloading lines images as it is already there."
  else
    if [ ! -f $download_dir/lines.tgz ]; then
      echo "$0: Trying to download lines images..."
      wget -P $download_dir --user "$username" --password "$password" $lines_url || exit 1;
    fi
    mkdir -p $lines
    tar -xzf $download_dir/lines.tgz -C $lines || exit 1;
    echo "$0: Done downloading and extracting lines images"
  fi
  
  if [ -d $xml ]; then
    echo "$0: Not downloading transcriptions as it is already there."
  else
    if [ ! -f $download_dir/xml.tgz ]; then
      echo "$0: Trying to download transcriptions..."
      wget -P $download_dir --user "$username" --password "$password" $xml_url || exit 1;
    fi
    mkdir -p $xml
    tar -xzf $download_dir/xml.tgz -C $xml || exit 1;
    echo "$0: Done downloading and extracting transcriptions."
  fi
  
  if [ -d $data_split_info ]; then
    echo "$0: Not downloading data split information as it is already there."
  else
    if [ ! -f $download_dir/largeWriterIndependentTextLineRecognitionTask.zip ]; then
      echo "$0: Trying to download training and testing data split information..."
      wget -P $download_dir --user "$username" --password "$password" $data_split_info_url || exit 1;
    fi
    mkdir -p $data_split_info
    unzip $download_dir/largeWriterIndependentTextLineRecognitionTask.zip -d $data_split_info || exit 1;
    echo "$0: Done downloading and extracting training and testing data split information"
  fi
  
  if [ -d $ascii ]; then
    echo "$0: Not downloading ascii.tgz as it is already there."
  else
    if [ ! -f $download_dir/ascii.tgz ]; then
      echo "$0: trying to download ascii.tgz..."
      wget -P $download_dir --user "$username" --password "$password" $ascii_url || exit 1;
    fi
    mkdir -p $ascii
    tar -xzf $download_dir/ascii.tgz -C $ascii || exit 1;
    echo "$0: Done downloading and extracting ascii.tgz"
  fi
  
  if [ -d $lobcorpus ]; then
    echo "$0: Not downloading the LOB text corpus as it is already there."
  else
    if [ ! -f $lobcorpus/0167.zip ]; then
      echo "$0: Downloading the LOB text corpus ..."
      mkdir -p $lobcorpus
      wget -P $lobcorpus/ $lob_corpus_url || exit 1;
    fi
    unzip $lobcorpus/0167.zip -d $lobcorpus || exit 1;
    echo "$0: Done downloading and extracting LOB corpus"
  fi
  
  if [ -d $bcorpus ]; then
    echo "$0: Not downloading the Brown corpus as it is already there."
  else
    if [ ! -f $bcorpus/brown.txt ]; then
      mkdir -p $bcorpus
      echo "$0: Downloading the Brown text corpus..."
      wget -P $bcorpus $brown_corpus_url || exit 1;
    fi
    echo "$0: Done downloading the Brown text corpus"
  fi
  
  if [ -d $wcorpus ]; then
    echo "$0: Not copying Wellington corpus as it is already there."
  elif [ ! -z $wellington_dir ]; then
    mkdir -p $wcorpus
    cp -r $wellington_dir/. $wcorpus
  
    # Combine Wellington corpora and replace some of their annotations
    cat data/local/wellingtoncorpus/Section{A,B,C,D,E,F,G,H,J,K,L}.txt | \
      cut -d' ' -f3- | sed "s/^[ \t]*//" > data/local/wellingtoncorpus/Wellington_annotated.txt
  
    cat data/local/wellingtoncorpus/Wellington_annotated.txt | local/remove_wellington_annotations.py > data/local/wellingtoncorpus/Wellington_annotation_removed.txt
  
    echo "$0: Done copying Wellington corpus"
  else
    echo "$0: Wellington Corpus not included because wellington_dir not provided"
  fi
  
  if [ -d $aachen_splits ]; then
    echo "$0: Not downloading the Aachen splits as it is already there."
  else
    if [ ! -f $aachen_splits/splits.zip ]; then
      echo "$0: Downloading Aachen splits ..."
      mkdir -p $aachen_splits
      wget -P $aachen_splits/ $aachen_split_url || exit 1;
    fi
    unzip $aachen_splits/splits.zip -d $aachen_splits || exit 1;
    echo "$0: Done downloading and extracting Aachen splits"
  fi
  
  
  mkdir -p data/{train,test,val}
  file_name=largeWriterIndependentTextLineRecognitionTask
  
  train_old="data/local/$file_name/trainset.txt"
  test_old="data/local/$file_name/testset.txt"
  val1_old="data/local/$file_name/validationset1.txt"
  val2_old="data/local/$file_name/validationset2.txt"
  
  train_new="data/local/train.uttlist"
  test_new="data/local/test.uttlist"
  val_new="data/local/validation.uttlist"
  
  cat $train_old > $train_new
  cat $test_old > $test_new
  cat $val1_old $val2_old > $val_new
  
  if $process_aachen_split; then
      local/process_aachen_splits.py data/local $aachen_splits/splits data/train --dataset train || exit 1
      local/process_aachen_splits.py data/local $aachen_splits/splits data/test --dataset test || exit 1
      local/process_aachen_splits.py data/local $aachen_splits/splits data/val --dataset validation || exit 1
  else
      local/process_data.py data/local data/train --dataset train || exit 1
      local/process_data.py data/local data/test --dataset test || exit 1
      local/process_data.py data/local data/val --dataset validation || exit 1
  fi
  
  image/fix_data_dir.sh data/train
  image/fix_data_dir.sh data/test
  image/fix_data_dir.sh data/val