Blame view

egs/madcat_ar/v1/local/prepare_data.sh 2.54 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
  #!/bin/bash
  
  # Copyright      2017  Chun Chieh Chang
  #                2017  Ashish Arora
  #                2017  Hossein Hadian
  # Apache 2.0
  
  # This script downloads the data splits for the MADCAT Arabic dataset and prepares the training,
  # validation, and test data (i.e. text, images.scp, utt2spk and spk2utt) by calling process_data.py.
  # It also uses the Arabic Gigaword text corpus for language modeling.
  
  #  Eg. local/prepare_data.sh
  #  Eg. text file: LDC0001_000399_NHR_ARB_20070113.0052_11_LDC0001_0z11 
  #                 وهناك تداخل بين الرأسمالية الإسرائيلية
  #      utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001
  #      images.scp file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 
  #                        data/local/train/1/NHR_ARB_20070113.0052_11_LDC0001_00z1.png
  
  # ---- Default settings --------------------------------------------------------

  # Locations of the three LDC releases holding the MADCAT Arabic data.
  download_dir1="/export/corpora/LDC/LDC2012T15/data"
  download_dir2="/export/corpora/LDC/LDC2013T09/data"
  download_dir3="/export/corpora/LDC/LDC2013T15/data"

  # Location of the Arabic Gigaword release (LDC2011T11) and of the local
  # working copy that the script makes of it.
  gigaword_loc="/export/corpora5/LDC/LDC2011T11"
  gigacorpus="data/local/gigawordcorpus"

  # URLs of the train/dev/test line-id lists, and the directory they are saved in.
  train_split_url="http://www.openslr.org/resources/48/madcat.train.raw.lineid"
  dev_split_url="http://www.openslr.org/resources/48/madcat.dev.raw.lineid"
  test_split_url="http://www.openslr.org/resources/48/madcat.test.raw.lineid"
  data_splits="data/download/data_splits"

  # Local download area created by this script.
  download_dir="data/download"

  # When "true", the Gigaword text is copied, decompressed and merged per newswire.
  use_extra_corpus_text="true"

  # NOTE(review): stage is not read anywhere in this script; presumably kept so
  # that a --stage option is accepted -- confirm against the caller.
  stage="0"
  
  # Source cmd.sh and path.sh into the current shell. Both paths are relative, so
  # the script has to be started from the directory that contains them.
  . ./cmd.sh
  . ./path.sh
  # Source the option parser and abort if it cannot be read or returns non-zero.
  # NOTE(review): presumably this lets "--name value" arguments override the
  # variables assigned above -- confirm against utils/parse_options.sh.
  . ./utils/parse_options.sh || exit 1;
  
  # Fetch the train/test/dev line-id split lists into $data_splits.
  #
  # Each list is checked on its own rather than inferring "already downloaded"
  # from the mere existence of the directory: the directory is created before
  # the first transfer starts, so an interrupted run would otherwise leave a
  # partial set of lists that every later run silently accepts.
  # A list is written to a ".part" file and renamed only after wget succeeds, so
  # a truncated transfer is never mistaken for a complete list.
  # Exits with status 1 if the directory cannot be created or a download fails.
  mkdir -p "$data_splits" || exit 1
  splits_downloaded=false
  for split_url in "$train_split_url" "$test_split_url" "$dev_split_url"; do
    # Local file name is the last path component of the URL.
    split_file="$data_splits/${split_url##*/}"
    if [ -f "$split_file" ]; then
      continue
    fi
    if ! $splits_downloaded; then
      echo "$0: Downloading the data splits..."
      splits_downloaded=true
    fi
    if ! wget -O "$split_file.part" "$split_url"; then
      rm -f "$split_file.part"
      exit 1
    fi
    mv "$split_file.part" "$split_file" || exit 1
  done
  if $splits_downloaded; then
    echo "$0: Done downloading the data splits"
  else
    echo "$0: Not downloading the data splits as it is already there."
  fi
  
  # Report whether the MADCAT Arabic data is available under $download_dir1.
  #
  # The data counts as present when at least one madcat/*.madcat.xml file exists.
  # The glob is walked in a loop instead of being handed to a single "[ -f ... ]",
  # which fails with "too many arguments" as soon as the pattern matches more
  # than one file. When nothing matches, the pattern stays literal and the -f
  # test rejects it, so a missing directory and an empty directory are both
  # reported as missing data.
  # The check is informational only: the script keeps running either way.
  madcat_xml_found=false
  for madcat_xml in "$download_dir1"/madcat/*.madcat.xml; do
    if [ -f "$madcat_xml" ]; then
      madcat_xml_found=true
      break
    fi
  done
  if $madcat_xml_found; then
    echo "$0: madcat arabic data directory is present."
  else
    echo "$0: please download madcat data..."
  fi
  
  # Create the local output directories and, when $use_extra_corpus_text is
  # "true", build one plain-text file per Gigaword newswire source.
  #
  # Steps for the extra text:
  #   1. copy the Gigaword release from $gigaword_loc into $gigacorpus;
  #   2. decompress every *.gz file of each newswire directory in place;
  #   3. strip lines that consist solely of a markup tag, turn `` and '' into a
  #      plain double quote, and write the result to
  #      $gigacorpus/arb_gw_5/data/<newswire>_combined.txt.
  #
  # The block is safe to re-run: gzip -f overwrites files decompressed by an
  # earlier run, and the combined file is rewritten rather than appended to, so
  # its content is not duplicated. Exits with status 1 if copying,
  # decompressing or filtering fails.
  mkdir -p "$download_dir" data/local
  if $use_extra_corpus_text; then
    mkdir -p "$gigacorpus"
    cp -r "$gigaword_loc/." "$gigacorpus" || exit 1
    for newswire in aaw_arb afp_arb ahr_arb asb_arb hyt_arb nhr_arb qds_arb umh_arb xin_arb; do
      newswire_dir="$gigacorpus/arb_gw_5/data/$newswire"
      for file in "$newswire_dir"/*.gz; do
        # An unmatched glob is left as the literal pattern; skip it.
        [ -e "$file" ] || continue
        gzip -df "$file" || exit 1
      done
      # A single redirection on the loop truncates the combined file once per run.
      for file in "$newswire_dir"/*; do
        [ -f "$file" ] || continue
        sed -e '/^<[^>]*>$/d; s/``/"/g; s/\x27\x27/"/g' "$file" || exit 1
      done > "$gigacorpus/arb_gw_5/data/${newswire}_combined.txt"
    done
  fi