Blame view
egs/yomdle_tamil/v1/local/yomdle/create_download_dir.sh
4.39 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
#!/bin/bash

# Copyright      2018  Chun Chieh Chang
#                2018  Ashish Arora
#                2018  Hossein Hadian
# Apache 2.0

# This script assumes that the SLAM and Yomdle OCR databases are stored in
# slam_dir and yomdle_dir. It reads the xml files and converts them to csv
# files. Then, with the help of the csv files, it extracts line images from
# page images. It can create a dataset for any yomdle and slam language; the
# defaults below create the dataset for Tamil OCR. It creates csv files for
# yomdle English, yomdle Tamil, slam Tamil transcribed and slam Tamil boxed.
# It also creates train, test and train_unsup sets for training and testing.
# Yomdle (English and Tamil) is the training set, slam Tamil transcribed is the
# test set, and slam Tamil boxed is the semi-supervised set.
#
# Options (overridable on the command line via utils/parse_options.sh):
#   --stage          first stage to run (stages 0-5 below)
#   --language-main  language name as used in the SLAM directory layout
#   --slam-dir       root of the SLAM corpus
#   --yomdle-dir     root of the YOMDLE corpus

set -e
stage=0
language_main=Tamil
slam_dir=/export/corpora5/slam/SLAM/
yomdle_dir=/export/corpora5/slam/YOMDLE/

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

mkdir -p data/local/splits
# The YOMDLE directories are addressed with the lower-cased language name
# (final_<language>), while the SLAM paths use language_main as given.
language_lower=$(echo "$language_main" | tr '[:upper:]' '[:lower:]')

# Stage 0: YOMDLE (english + target language) -> csv + line images (train list).
# The message uses language_lower: the loop variable "language" is not set yet
# at this point.
echo "$0: extracting line images for english and ${language_lower} for shared model training"
if [ "$stage" -le 0 ]; then
  for language in english "$language_lower"; do
    echo "$0: Processing YOMDLE ${language}"
    # Brace expansion is kept outside the quotes so that it still expands.
    mkdir -p "data/download/${language}"/{truth_csv,truth_line_image}
    local/yomdle/yomdle2csv.py \
      --inputDir "$yomdle_dir/final_$language/" \
      --outputDir "data/download/${language}/truth_csv/" \
      --log "data/download/yomdle2csv.${language}.log"
    local/yomdle/create_line_image_from_page_image.py \
      "$yomdle_dir/final_$language/images/" \
      "data/download/${language}/truth_csv/" \
      "data/download/${language}/truth_line_image/" \
      "data/local/yomdle-${language}-train.list" \
      --filter
  done
fi

# Stage 1: SLAM transcribed data -> csv + line images (test list).
echo "$0: extracting line images for slam ${language_main} for testing"
if [ "$stage" -le 1 ]; then
  echo "$0: Processing slam ${language_main}"
  mkdir -p "data/download/${language_main}"/{truth_csv,truth_line_image}
  local/yomdle/gedi2csv_enriched.py \
    --inputDir "$slam_dir/${language_main}/transcribed/" \
    --outputDir "data/download/${language_main}/truth_csv/" \
    --log "data/download/gedi2csv.${language_main}.log"
  local/yomdle/create_line_image_from_page_image.py \
    "$slam_dir/${language_main}/transcribed/" \
    "data/download/${language_main}/truth_csv/" \
    "data/download/${language_main}/truth_line_image/" \
    "data/local/yomdle-${language_main}-test.list" \
    --ext '.png'
fi

# Stage 2: SLAM boxed data -> csv + line images (train_unsup list).
echo "$0: extracting line images for semi supervised training for slam ${language_main}"
if [ "$stage" -le 2 ]; then
  echo "$0: Processing slam ${language_main}"
  mkdir -p "data/download/${language_main}_boxed"/{truth_csv,truth_line_image}
  local/yomdle/gedi2csv_enriched.py \
    --inputDir "$slam_dir/${language_main}/boxed" \
    --ftype boxed \
    --outputDir "data/download/${language_main}_boxed/truth_csv/" \
    --log "data/download/gedi2csv.${language_main}_boxed.log"
  local/yomdle/create_line_image_from_page_image.py \
    "$slam_dir/${language_main}/boxed" \
    "data/download/${language_main}_boxed/truth_csv/" \
    "data/download/${language_main}_boxed/truth_line_image/" \
    "data/local/yomdle-${language_main}-train_unsup.list" \
    --ext '.png' \
    --filter
fi

# Stage 3: merge every line image and csv into the lower-cased language
# directory. The globs are left unquoted on purpose so that they expand.
echo "$0: storing english, given language(transcribed and untranscribed) line images together"
if [ "$stage" -le 3 ]; then
  cp -r "data/download/${language_main}_boxed/truth_line_image"/* "data/download/$language_lower/truth_line_image/"
  cp -r "data/download/$language_main/truth_line_image"/* "data/download/$language_lower/truth_line_image/"
  cp -r data/download/english/truth_line_image/* "data/download/$language_lower/truth_line_image/"
  cp -r "data/download/${language_main}_boxed/truth_csv"/* "data/download/$language_lower/truth_csv/"
  cp -r "data/download/$language_main/truth_csv"/* "data/download/$language_lower/truth_csv/"
  cp -r data/download/english/truth_csv/* "data/download/$language_lower/truth_csv/"
fi

# Stage 4: move the merged directories up to data/download/.
if [ "$stage" -le 4 ]; then
  mv "data/download/$language_lower/truth_line_image/" data/download/
  mv "data/download/$language_lower/truth_csv/" data/download/
fi

# Stage 5: write the final split lists. train = target-language + english
# YOMDLE lists; test and train_unsup come from the SLAM lists.
echo "$0: storing train, test and train unsupervised splits"
if [ "$stage" -le 5 ]; then
  cat "data/local/yomdle-${language_lower}-train.list" data/local/yomdle-english-train.list \
    > data/local/splits/train.txt
  cp "data/local/yomdle-${language_main}-test.list" data/local/splits/test.txt
  cp "data/local/yomdle-${language_main}-train_unsup.list" data/local/splits/train_unsup.txt
fi