Blame view

egs/yomdle_tamil/v1/local/yomdle/create_download_dir.sh 4.39 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
  #!/bin/bash
  
  # Copyright      2018  Chun Chieh Chang
  #                2018  Ashish Arora
  #                2018  Hossein Hadian
  # Apache 2.0
  
  # This script assumes that the SLAM and YOMDLE OCR databases are stored in slam_dir
  # and yomdle_dir. It reads the xml files and converts them to csv files. Then, with
  # the help of the csv files, it extracts line images from the page images. It can
  # create a dataset for any yomdle and slam language; with the default settings it
  # creates the dataset for Tamil OCR. It creates csv files for yomdle English, yomdle
  # Tamil, slam Tamil transcribed and slam Tamil boxed. It also creates the train, test
  # and train_unsup sets for training and testing. Yomdle (English and Tamil) is the
  # training set, slam Tamil transcribed is the test set, and slam Tamil boxed is the
  # semi-supervised set.
  
  # Abort the script as soon as any command exits with a non-zero status.
  set -e
  # Default settings. They are assigned before utils/parse_options.sh is sourced;
  # NOTE(review): presumably that script lets them be overridden from the command
  # line (e.g. --stage 2 --language_main Tamil) -- confirm against parse_options.sh.
  # stage: every "if [ $stage -le N ]" section below with N >= stage is executed.
  stage=0
  # language_main: used verbatim as the language sub-directory name under slam_dir.
  language_main=Tamil
  # Root directories of the SLAM and YOMDLE corpora.
  slam_dir=/export/corpora5/slam/SLAM/
  yomdle_dir=/export/corpora5/slam/YOMDLE/
  
  # Source the recipe's environment files and the option parser from the
  # current working directory.
  . ./cmd.sh
  . ./path.sh
  . ./utils/parse_options.sh
  
  # Directory that receives the final train/test/train_unsup split files.
  mkdir -p data/local/splits
  # Lower-cased language name (e.g. Tamil -> tamil); it is used for the YOMDLE
  # "final_<language>" directory and for the merged output directory.
  language_lower=$(echo "$language_main" | tr '[:upper:]' '[:lower:]')
  
  # Stage 0: YOMDLE data, used as the training set.
  # For English and for the target language (lower-cased name):
  #   1. yomdle2csv.py converts the annotations under final_<language>/ to csv
  #      files in data/download/<language>/truth_csv/.
  #   2. create_line_image_from_page_image.py uses those csv files to extract
  #      line images from the page images into truth_line_image/; the
  #      yomdle-<language>-train.list path is passed as its fourth argument
  #      and is later concatenated into data/local/splits/train.txt.
  # The banner uses ${language_lower}: the loop variable ${language} is not
  # assigned yet at this point, so it would expand to an empty string.
  echo "$0: extracting line images for english and ${language_lower} for shared model training"
  if [ "$stage" -le 0 ]; then
    for language in english "$language_lower"; do
      echo "$0: Processing YOMDLE ${language}"
      mkdir -p "data/download/${language}"/{truth_csv,truth_line_image}
      local/yomdle/yomdle2csv.py \
        --inputDir "$yomdle_dir/final_$language/" \
        --outputDir "data/download/${language}/truth_csv/" \
        --log "data/download/yomdle2csv.${language}.log"
      local/yomdle/create_line_image_from_page_image.py \
        "$yomdle_dir/final_$language/images/" \
        "data/download/${language}/truth_csv/" \
        "data/download/${language}/truth_line_image/" \
        "data/local/yomdle-${language}-train.list" \
        --filter
    done
  fi
  
  # Stage 1: SLAM transcribed data, used as the test set.
  # gedi2csv_enriched.py converts the annotations under
  # <slam_dir>/<language_main>/transcribed/ to csv files, then
  # create_line_image_from_page_image.py extracts the line images (page images
  # with the '.png' extension). The yomdle-<language_main>-test.list path is
  # later copied to data/local/splits/test.txt.
  # The banner uses ${language_main}: ${language} is only the loop variable of
  # stage 0 and is unset when the script is started with --stage 1 or higher.
  echo "$0: extracting line images for slam ${language_main} for testing"
  if [ "$stage" -le 1 ]; then
    echo "$0: Processing slam ${language_main}"
    mkdir -p "data/download/${language_main}"/{truth_csv,truth_line_image}
    local/yomdle/gedi2csv_enriched.py \
      --inputDir "$slam_dir/${language_main}/transcribed/" \
      --outputDir "data/download/${language_main}/truth_csv/" \
      --log "data/download/gedi2csv.${language_main}.log"
    local/yomdle/create_line_image_from_page_image.py \
      "$slam_dir/${language_main}/transcribed/" \
      "data/download/${language_main}/truth_csv/" \
      "data/download/${language_main}/truth_line_image/" \
      "data/local/yomdle-${language_main}-test.list" \
      --ext '.png'
  fi
  
  # Stage 2: SLAM boxed data, used as the semi-supervised (train_unsup) set.
  # Same two steps as for the transcribed data, but reading from
  # <slam_dir>/<language_main>/boxed with "--ftype boxed" and writing to
  # data/download/<language_main>_boxed/. The
  # yomdle-<language_main>-train_unsup.list path is later copied to
  # data/local/splits/train_unsup.txt.
  # The banner uses ${language_main}: ${language} is only the loop variable of
  # stage 0 and is unset when the script is started with --stage 1 or higher.
  echo "$0: extracting line images for semi supervised training for slam ${language_main}"
  if [ "$stage" -le 2 ]; then
    echo "$0: Processing slam ${language_main}"
    mkdir -p "data/download/${language_main}_boxed"/{truth_csv,truth_line_image}
    local/yomdle/gedi2csv_enriched.py \
      --inputDir "$slam_dir/${language_main}/boxed" \
      --ftype boxed \
      --outputDir "data/download/${language_main}_boxed/truth_csv/" \
      --log "data/download/gedi2csv.${language_main}_boxed.log"
    local/yomdle/create_line_image_from_page_image.py \
      "$slam_dir/${language_main}/boxed" \
      "data/download/${language_main}_boxed/truth_csv/" \
      "data/download/${language_main}_boxed/truth_line_image/" \
      "data/local/yomdle-${language_main}-train_unsup.list" \
      --ext '.png' \
      --filter
  fi
  
  # Stage 3: merge everything into the lower-cased language directory.
  # The line images and csv files of the boxed set, the transcribed set and
  # YOMDLE English are copied (in that order, images first) into
  # data/download/<language_lower>/{truth_line_image,truth_csv}/, which already
  # hold the YOMDLE data of the target language created in stage 0.
  # The glob is kept outside the quotes so that it still expands while the
  # variable parts of the path are protected from word splitting.
  echo "$0: storing english, given language(transcribed and untranscribed) line images together"
  if [ "$stage" -le 3 ]; then
    for subdir in truth_line_image truth_csv; do
      for source_set in "${language_main}_boxed" "$language_main" english; do
        cp -r "data/download/${source_set}/${subdir}"/* \
          "data/download/${language_lower}/${subdir}/"
      done
    done
  fi
  
  
  # Stage 4: move the merged truth_line_image/ and truth_csv/ directories from
  # data/download/<language_lower>/ up into data/download/.
  # NOTE(review): if data/download/truth_line_image or data/download/truth_csv
  # is left over from an earlier run, mv may fail or nest the directory --
  # confirm whether a clean data/download is expected before re-running.
  if [ "$stage" -le 4 ]; then
    for subdir in truth_line_image truth_csv; do
      mv "data/download/${language_lower}/${subdir}/" data/download/
    done
  fi
  
  # Stage 5: write the final split files into data/local/splits/.
  #   train.txt       = YOMDLE target-language list followed by YOMDLE English list
  #   test.txt        = list of the SLAM transcribed set
  #   train_unsup.txt = list of the SLAM boxed set
  echo "$0: storing train, test and train unsupervised splits"
  if [ "$stage" -le 5 ]; then
    cat "data/local/yomdle-${language_lower}-train.list" \
      data/local/yomdle-english-train.list > data/local/splits/train.txt
    cp "data/local/yomdle-${language_main}-test.list" data/local/splits/test.txt
    cp "data/local/yomdle-${language_main}-train_unsup.list" data/local/splits/train_unsup.txt
  fi