Blame view
egs/iam/v2/local/prepare_data.sh
7.35 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
#!/bin/bash
# Copyright  2017  Chun Chieh Chang
#            2017  Ashish Arora
#            2017  Hossein Hadian
# Apache 2.0

# This script downloads the IAM handwriting database and prepares the training
# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py.
# It also downloads the LOB and Brown text corpora. It downloads the database files
# only if they do not already exist in download directory.

# Eg. local/prepare_data.sh
# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from
#     utt2spk file: 000_a01-000u-00 000
#     images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png
#     spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03

stage=0
download_dir=data/download
process_aachen_split=false
wellington_dir=
username=
password=
# username and password for downloading the IAM database
# if you have not already downloaded the database, please
# register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database
# and provide this script with your username and password.

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;

# Without credentials the IAM files cannot be fetched (the site requires
# registration), so bail out early with an explanatory message.
if [[ ! -f $download_dir/lines.tgz && -z $username ]]; then
  echo "$0: Warning: Couldn't find lines.tgz in $download_dir. Unless the extracted dataset files"
  echo "exist in your data/local directory this script will fail because the required files"
  echo "can't be downloaded automatically (it needs registration)."
  echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database"
  echo "... and then call this script again with --username <username> --password <password>"
  echo ""
  exit 1
fi

# Destination directories and download URLs for each piece of the corpus.
lines=data/local/lines
xml=data/local/xml
ascii=data/local/ascii
bcorpus=data/local/browncorpus
lobcorpus=data/local/lobcorpus
wcorpus=data/local/wellingtoncorpus
data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask
lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz
xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz
data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip
ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz
brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt
lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip
wellington_corpus_loc=/export/corpora5/Wellington/WWC/
aachen_split_url=http://www.openslr.org/resources/56/splits.zip
aachen_splits=data/local/aachensplits
mkdir -p "$download_dir" data/local

# download and extract images and transcription
if [ -d "$lines" ]; then
  echo "$0: Not downloading lines images as it is already there."
else
  if [ ! -f "$download_dir/lines.tgz" ]; then
    echo "$0: Trying to download lines images..."
    wget -P "$download_dir" --user "$username" --password "$password" "$lines_url" || exit 1;
  fi
  mkdir -p "$lines"
  tar -xzf "$download_dir/lines.tgz" -C "$lines" || exit 1;
  echo "$0: Done downloading and extracting lines images"
fi

if [ -d "$xml" ]; then
  echo "$0: Not downloading transcriptions as it is already there."
else
  if [ ! -f "$download_dir/xml.tgz" ]; then
    echo "$0: Trying to download transcriptions..."
    wget -P "$download_dir" --user "$username" --password "$password" "$xml_url" || exit 1;
  fi
  mkdir -p "$xml"
  tar -xzf "$download_dir/xml.tgz" -C "$xml" || exit 1;
  echo "$0: Done downloading and extracting transcriptions."
fi

if [ -d "$data_split_info" ]; then
  echo "$0: Not downloading data split information as it is already there."
else
  if [ ! -f "$download_dir/largeWriterIndependentTextLineRecognitionTask.zip" ]; then
    echo "$0: Trying to download training and testing data split information..."
    wget -P "$download_dir" --user "$username" --password "$password" "$data_split_info_url" || exit 1;
  fi
  mkdir -p "$data_split_info"
  unzip "$download_dir/largeWriterIndependentTextLineRecognitionTask.zip" -d "$data_split_info" || exit 1;
  echo "$0: Done downloading and extracting training and testing data split information"
fi

if [ -d "$ascii" ]; then
  echo "$0: Not downloading ascii.tgz as it is already there."
else
  if [ ! -f "$download_dir/ascii.tgz" ]; then
    echo "$0: trying to download ascii.tgz..."
    wget -P "$download_dir" --user "$username" --password "$password" "$ascii_url" || exit 1;
  fi
  mkdir -p "$ascii"
  tar -xzf "$download_dir/ascii.tgz" -C "$ascii" || exit 1;
  echo "$0: Done downloading and extracting ascii.tgz"
fi

# The LOB and Brown corpora are free downloads (no credentials needed); they
# are used later for language modeling.
if [ -d "$lobcorpus" ]; then
  echo "$0: Not downloading the LOB text corpus as it is already there."
else
  if [ ! -f "$lobcorpus/0167.zip" ]; then
    echo "$0: Downloading the LOB text corpus ..."
    mkdir -p "$lobcorpus"
    wget -P "$lobcorpus/" "$lob_corpus_url" || exit 1;
  fi
  unzip "$lobcorpus/0167.zip" -d "$lobcorpus" || exit 1;
  echo "$0: Done downloading and extracting LOB corpus"
fi

if [ -d "$bcorpus" ]; then
  echo "$0: Not downloading the Brown corpus as it is already there."
else
  if [ ! -f "$bcorpus/brown.txt" ]; then
    mkdir -p "$bcorpus"
    echo "$0: Downloading the Brown text corpus..."
    wget -P "$bcorpus" "$brown_corpus_url" || exit 1;
  fi
  echo "$0: Done downloading the Brown text corpus"
fi

# The Wellington corpus is not freely downloadable; it is only copied in if
# the caller points --wellington-dir at a local copy.
if [ -d "$wcorpus" ]; then
  echo "$0: Not copying Wellington corpus as it is already there."
elif [ -n "$wellington_dir" ]; then
  mkdir -p "$wcorpus"
  cp -r "$wellington_dir/." "$wcorpus"
  # Combine Wellington corpora and replace some of their annotations
  cat data/local/wellingtoncorpus/Section{A,B,C,D,E,F,G,H,J,K,L}.txt | \
    cut -d' ' -f3- | sed "s/^[ \t]*//" > data/local/wellingtoncorpus/Wellington_annotated.txt
  local/remove_wellington_annotations.py \
    < data/local/wellingtoncorpus/Wellington_annotated.txt \
    > data/local/wellingtoncorpus/Wellington_annotation_removed.txt
  echo "$0: Done copying Wellington corpus"
else
  echo "$0: Wellington Corpus not included because wellington_dir not provided"
fi

if [ -d "$aachen_splits" ]; then
  echo "$0: Not downloading the Aachen splits as it is already there."
else
  if [ ! -f "$aachen_splits/splits.zip" ]; then
    echo "$0: Downloading Aachen splits ..."
    mkdir -p "$aachen_splits"
    wget -P "$aachen_splits/" "$aachen_split_url" || exit 1;
  fi
  unzip "$aachen_splits/splits.zip" -d "$aachen_splits" || exit 1;
  echo "$0: Done downloading and extracting Aachen splits"
fi

# Build the train/test/validation utterance lists from the official IAM
# writer-independent split (the two validation sets are merged into one).
mkdir -p data/{train,test,val}
file_name=largeWriterIndependentTextLineRecognitionTask

train_old="data/local/$file_name/trainset.txt"
test_old="data/local/$file_name/testset.txt"
val1_old="data/local/$file_name/validationset1.txt"
val2_old="data/local/$file_name/validationset2.txt"

train_new="data/local/train.uttlist"
test_new="data/local/test.uttlist"
val_new="data/local/validation.uttlist"

cat "$train_old" > "$train_new"
cat "$test_old" > "$test_new"
cat "$val1_old" "$val2_old" > "$val_new"

# Generate text, images.scp, utt2spk (and spk2utt) for each data directory,
# using either the Aachen split or the official IAM split.
if $process_aachen_split; then
  local/process_aachen_splits.py data/local "$aachen_splits/splits" data/train --dataset train || exit 1
  local/process_aachen_splits.py data/local "$aachen_splits/splits" data/test --dataset test || exit 1
  local/process_aachen_splits.py data/local "$aachen_splits/splits" data/val --dataset validation || exit 1
else
  local/process_data.py data/local data/train --dataset train || exit 1
  local/process_data.py data/local data/test --dataset test || exit 1
  local/process_data.py data/local data/val --dataset validation || exit 1
fi

image/fix_data_dir.sh data/train
image/fix_data_dir.sh data/test
image/fix_data_dir.sh data/val