prepare_data.sh 7.35 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191


#!/bin/bash

# Copyright      2017  Chun Chieh Chang
#                2017  Ashish Arora
#                2017  Hossein Hadian
# Apache 2.0

# This script downloads the IAM handwriting database and prepares the training,
# validation and test data (i.e. text, images.scp, utt2spk and spk2utt) by calling
# process_data.py. It also downloads the LOB and Brown text corpora. It downloads
# the database files only if they do not already exist in the download directory.

#  E.g. local/prepare_data.sh
#  E.g. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from
#       utt2spk file: 000_a01-000u-00 000
#       images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png
#       spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03

# Default option values. They are assigned before utils/parse_options.sh is
# sourced below; presumably that script overrides them from command-line flags
# of the form --name value (the usage hint printed by this script mentions
# --username and --password) -- confirm against utils/parse_options.sh.
stage=0                       # not referenced in the visible part of this script
download_dir=data/download    # where the IAM archives (lines.tgz, xml.tgz, ...) are looked up / saved
process_aachen_split=false    # true: use local/process_aachen_splits.py; false: local/process_data.py
wellington_dir=               # optional: directory whose contents are copied to data/local/wellingtoncorpus
username=
password=       # username and password for downloading the IAM database
                # if you have not already downloaded the database, please
                # register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database
                # and provide this script with your username and password.
                # Both values are handed to wget via --user/--password.

# Paths are relative, so the script has to be started from the directory that
# contains cmd.sh, path.sh and utils/.
. ./cmd.sh
. ./path.sh
# Abort if option parsing reports an error.
. ./utils/parse_options.sh || exit 1;

# Fail early, with instructions, only when a download that needs IAM
# credentials is really required: no username was supplied AND at least one of
# the registration-protected archives is neither extracted under data/local nor
# already present in $download_dir. When everything is already extracted or
# downloaded, the script carries on without credentials.
# The extracted locations are spelled out literally because the path variables
# are only assigned further down in the script.
if [[ -z $username ]]; then
  missing_archives=
  for spec in lines:lines.tgz xml:xml.tgz ascii:ascii.tgz \
      largeWriterIndependentTextLineRecognitionTask:largeWriterIndependentTextLineRecognitionTask.zip; do
    extracted_dir=data/local/${spec%%:*}   # part before ':' = extraction directory name
    archive=$download_dir/${spec#*:}       # part after ':'  = archive file name
    if [[ ! -d $extracted_dir && ! -f $archive ]]; then
      missing_archives+=" ${spec#*:}"
    fi
  done
  if [[ -n $missing_archives ]]; then
    # Diagnostics go to stderr so they are not mixed into captured stdout.
    {
      echo "$0: Error: Couldn't find${missing_archives} in $download_dir and the corresponding"
      echo "extracted dataset files do not exist in your data/local directory. The required files"
      echo "can't be downloaded automatically (it needs registration)."
      echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database"
      echo "... and then call this script again with --username <username> --password <password>"
      echo ""
    } >&2
    exit 1
  fi
fi

# Where each resource ends up once it has been extracted / copied.
local_root=data/local
lines=$local_root/lines
xml=$local_root/xml
ascii=$local_root/ascii
bcorpus=$local_root/browncorpus
lobcorpus=$local_root/lobcorpus
wcorpus=$local_root/wellingtoncorpus
data_split_info=$local_root/largeWriterIndependentTextLineRecognitionTask
aachen_splits=$local_root/aachensplits

# Remote locations of the IAM database files (fetched with username/password).
iam_db_url=http://www.fki.inf.unibe.ch/DBs/iamDB
lines_url=$iam_db_url/data/lines/lines.tgz
xml_url=$iam_db_url/data/xml/xml.tgz
ascii_url=$iam_db_url/data/ascii/ascii.tgz
data_split_info_url=$iam_db_url/tasks/largeWriterIndependentTextLineRecognitionTask.zip

# Remote / local locations of the additional text corpora and the Aachen splits.
brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt
lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip
wellington_corpus_loc=/export/corpora5/Wellington/WWC/
aachen_split_url=http://www.openslr.org/resources/56/splits.zip

# Make sure the download directory and the local data root exist.
mkdir -p $download_dir $local_root

# Download (if necessary) and extract the IAM line images into $lines.
# An existing $lines directory is taken to mean the images are already in place.
if [[ -d "$lines" ]]; then
  echo "$0: Not downloading lines images as it is already there."
else
  if [[ ! -f "$download_dir/lines.tgz" ]]; then
    echo "$0: Trying to download lines images..."
    # The archive did not exist before this attempt, so whatever a failed wget
    # leaves behind is a partial download: delete it so a re-run fetches it again.
    wget -P "$download_dir" --user "$username" --password "$password" "$lines_url" \
      || { rm -f -- "$download_dir/lines.tgz"; exit 1; }
  fi
  mkdir -p -- "$lines" || exit 1
  # Remove the directory again when extraction fails; otherwise the -d test
  # above would make the next run skip this step and use incomplete data.
  tar -xzf "$download_dir/lines.tgz" -C "$lines" \
    || { rm -rf -- "${lines:?}"; exit 1; }
  echo "$0: Done downloading and extracting lines images"
fi

# Download (if necessary) and extract the IAM transcriptions into $xml.
# An existing $xml directory is taken to mean the transcriptions are in place.
if [[ -d "$xml" ]]; then
  echo "$0: Not downloading transcriptions as it is already there."
else
  if [[ ! -f "$download_dir/xml.tgz" ]]; then
    echo "$0: Trying to download transcriptions..."
    # The archive did not exist before this attempt, so whatever a failed wget
    # leaves behind is a partial download: delete it so a re-run fetches it again.
    wget -P "$download_dir" --user "$username" --password "$password" "$xml_url" \
      || { rm -f -- "$download_dir/xml.tgz"; exit 1; }
  fi
  mkdir -p -- "$xml" || exit 1
  # Remove the directory again when extraction fails; otherwise the -d test
  # above would make the next run skip this step and use incomplete data.
  tar -xzf "$download_dir/xml.tgz" -C "$xml" \
    || { rm -rf -- "${xml:?}"; exit 1; }
  echo "$0: Done downloading and extracting transcriptions."
fi

# Download (if necessary) and unzip the train/test/validation split lists into
# $data_split_info. An existing directory is taken to mean they are in place.
if [[ -d "$data_split_info" ]]; then
  echo "$0: Not downloading data split information as it is already there."
else
  if [[ ! -f "$download_dir/largeWriterIndependentTextLineRecognitionTask.zip" ]]; then
    echo "$0: Trying to download training and testing data split information..."
    # The archive did not exist before this attempt, so whatever a failed wget
    # leaves behind is a partial download: delete it so a re-run fetches it again.
    wget -P "$download_dir" --user "$username" --password "$password" "$data_split_info_url" \
      || { rm -f -- "$download_dir/largeWriterIndependentTextLineRecognitionTask.zip"; exit 1; }
  fi
  mkdir -p -- "$data_split_info" || exit 1
  # Remove the directory again when extraction fails; otherwise the -d test
  # above would make the next run skip this step and use incomplete data.
  unzip "$download_dir/largeWriterIndependentTextLineRecognitionTask.zip" -d "$data_split_info" \
    || { rm -rf -- "${data_split_info:?}"; exit 1; }
  echo "$0: Done downloading and extracting training and testing data split information"
fi

# Download (if necessary) and extract ascii.tgz into $ascii.
# An existing $ascii directory is taken to mean the files are already in place.
if [[ -d "$ascii" ]]; then
  echo "$0: Not downloading ascii.tgz as it is already there."
else
  if [[ ! -f "$download_dir/ascii.tgz" ]]; then
    echo "$0: trying to download ascii.tgz..."
    # The archive did not exist before this attempt, so whatever a failed wget
    # leaves behind is a partial download: delete it so a re-run fetches it again.
    wget -P "$download_dir" --user "$username" --password "$password" "$ascii_url" \
      || { rm -f -- "$download_dir/ascii.tgz"; exit 1; }
  fi
  mkdir -p -- "$ascii" || exit 1
  # Remove the directory again when extraction fails; otherwise the -d test
  # above would make the next run skip this step and use incomplete data.
  tar -xzf "$download_dir/ascii.tgz" -C "$ascii" \
    || { rm -rf -- "${ascii:?}"; exit 1; }
  echo "$0: Done downloading and extracting ascii.tgz"
fi

# Download and unzip the LOB text corpus into $lobcorpus.
# The archive is stored inside $lobcorpus itself, and the existence of that
# directory is what marks the corpus as "already there". Any failure therefore
# removes the directory again, so that a re-run retries instead of silently
# skipping a corpus that was never (fully) fetched.
if [[ -d "$lobcorpus" ]]; then
  echo "$0: Not downloading the LOB text corpus as it is already there."
else
  if [[ ! -f "$lobcorpus/0167.zip" ]]; then
    echo "$0: Downloading the LOB text corpus ..."
    mkdir -p -- "$lobcorpus" || exit 1
    wget -P "$lobcorpus/" "$lob_corpus_url" \
      || { rm -rf -- "${lobcorpus:?}"; exit 1; }
  fi
  unzip "$lobcorpus/0167.zip" -d "$lobcorpus" \
    || { rm -rf -- "${lobcorpus:?}"; exit 1; }
  echo "$0: Done downloading and extracting LOB corpus"
fi

# Download the Brown text corpus (a single brown.txt) into $bcorpus.
# The existence of $bcorpus is what marks the corpus as "already there", so a
# failed download removes the directory again; otherwise the next run would
# skip the download although brown.txt is missing or incomplete.
if [[ -d "$bcorpus" ]]; then
  echo "$0: Not downloading the Brown corpus as it is already there."
else
  if [[ ! -f "$bcorpus/brown.txt" ]]; then
    mkdir -p -- "$bcorpus" || exit 1
    echo "$0: Downloading the Brown text corpus..."
    wget -P "$bcorpus" "$brown_corpus_url" \
      || { rm -rf -- "${bcorpus:?}"; exit 1; }
  fi
  echo "$0: Done downloading the Brown text corpus"
fi

# Optionally copy the Wellington corpus from $wellington_dir into $wcorpus and
# derive two text files from it. Skipped when $wcorpus already exists or when
# no wellington_dir was given.
# The existence of $wcorpus marks the step as done, so every failure below
# removes the directory again before aborting; otherwise a re-run would skip
# the step and leave a partial corpus in place.
if [[ -d "$wcorpus" ]]; then
  echo "$0: Not copying Wellington corpus as it is already there."
elif [[ -n "$wellington_dir" ]]; then
  mkdir -p -- "$wcorpus" || exit 1
  cp -r -- "$wellington_dir/." "$wcorpus" \
    || { rm -rf -- "${wcorpus:?}"; exit 1; }

  # Combine Wellington corpora and replace some of their annotations:
  # concatenate sections A-L (there is no Section I in the list), drop the
  # first two space-separated fields of every line and strip leading blanks.
  # pipefail (scoped to the subshell) makes a missing section file an error.
  (
    set -o pipefail
    cat "$wcorpus"/Section{A,B,C,D,E,F,G,H,J,K,L}.txt \
      | cut -d' ' -f3- \
      | sed "s/^[ \t]*//" > "$wcorpus/Wellington_annotated.txt"
  ) || { rm -rf -- "${wcorpus:?}"; exit 1; }

  local/remove_wellington_annotations.py \
    < "$wcorpus/Wellington_annotated.txt" \
    > "$wcorpus/Wellington_annotation_removed.txt" \
    || { rm -rf -- "${wcorpus:?}"; exit 1; }

  echo "$0: Done copying Wellington corpus"
else
  echo "$0: Wellington Corpus not included because wellington_dir not provided"
fi

# Download and unzip the Aachen splits into $aachen_splits.
# The archive is stored inside $aachen_splits itself, and the existence of that
# directory is what marks the splits as "already there". Any failure therefore
# removes the directory again, so that a re-run retries instead of silently
# skipping splits that were never (fully) fetched.
if [[ -d "$aachen_splits" ]]; then
  echo "$0: Not downloading the Aachen splits as it is already there."
else
  if [[ ! -f "$aachen_splits/splits.zip" ]]; then
    echo "$0: Downloading Aachen splits ..."
    mkdir -p -- "$aachen_splits" || exit 1
    wget -P "$aachen_splits/" "$aachen_split_url" \
      || { rm -rf -- "${aachen_splits:?}"; exit 1; }
  fi
  unzip "$aachen_splits/splits.zip" -d "$aachen_splits" \
    || { rm -rf -- "${aachen_splits:?}"; exit 1; }
  echo "$0: Done downloading and extracting Aachen splits"
fi

# Create the output data directories and build the three utterance lists
# (train / test / validation) from the IAM split files under data/local.
# Each step aborts the script on failure, so a missing split file cannot
# silently produce an empty utterance list.
mkdir -p data/{train,test,val} || exit 1
file_name=largeWriterIndependentTextLineRecognitionTask

# Split lists as shipped in the IAM task archive.
train_old="data/local/$file_name/trainset.txt"
test_old="data/local/$file_name/testset.txt"
val1_old="data/local/$file_name/validationset1.txt"
val2_old="data/local/$file_name/validationset2.txt"

# Utterance lists written by this script.
train_new="data/local/train.uttlist"
test_new="data/local/test.uttlist"
val_new="data/local/validation.uttlist"

cat "$train_old" > "$train_new" \
  || { echo "$0: Error: could not create $train_new from $train_old" >&2; exit 1; }
cat "$test_old" > "$test_new" \
  || { echo "$0: Error: could not create $test_new from $test_old" >&2; exit 1; }
# The two validation sets are merged into a single list.
cat "$val1_old" "$val2_old" > "$val_new" \
  || { echo "$0: Error: could not create $val_new from $val1_old and $val2_old" >&2; exit 1; }

# Populate data/train, data/test and data/val, in that order.
# $process_aachen_split (true/false) is executed as a command to pick the
# helper script: the Aachen variant additionally receives the splits directory.
if $process_aachen_split; then
  prep_cmd=(local/process_aachen_splits.py data/local "$aachen_splits/splits")
else
  prep_cmd=(local/process_data.py data/local)
fi

# Each entry is <output dir under data/>:<value passed to --dataset>.
# The first failing call aborts the script.
for spec in train:train test:test val:validation; do
  "${prep_cmd[@]}" "data/${spec%%:*}" --dataset "${spec#*:}" || exit 1
done

# Run image/fix_data_dir.sh on each data directory, in the order train, test,
# val. Failures are not checked here; the exit status of the last call is what
# the script ends with.
for subset in train test val; do
  image/fix_data_dir.sh data/$subset
done