Blame view

egs/madcat_ar/v1/run_end2end.sh 5.3 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
  #!/bin/bash
  # Copyright 2017    Hossein Hadian
  #           2018    Ashish Arora
  #
  # Top-level driver of this recipe. The work below is split into numbered
  # stages (0-5); a stage runs only when $stage is less than or equal to its
  # number. The variables assigned before utils/parse_options.sh is sourced are
  # defaults that it parses from the command line if supplied.
  #
  # Abort as soon as a command returns a non-zero status.
  set -e
  # First stage to run; stages with a smaller number are skipped.
  stage=0
  # Number of parallel jobs, passed as --nj to the line-extraction,
  # feature-extraction and decoding scripts.
  nj=70
  # download_dir{1,2,3} points to the database path on the JHU grid. If you have not
  # already downloaded the database you can set it to a local directory
  # This corpus can be purchased here:
  # https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/}
  download_dir1=/export/corpora/LDC/LDC2012T15/data
  download_dir2=/export/corpora/LDC/LDC2013T09/data
  download_dir3=/export/corpora/LDC/LDC2013T15/data
  # writing_conditions.tab file of each of the three corpora; passed to
  # local/extract_lines.sh and local/process_data.py in stage 0.
  writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
  writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
  writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab
  # Directory holding the madcat.<set>.raw.lineid split files used in stage 0.
  data_splits_dir=data/download/data_splits
  # Directory under which stage 0 expects <set>/images.scp.
  images_scp_dir=data/local
  # When false, stage 0 exits with status 1 if data/train/text already exists.
  overwrite=false
  # Forwarded as --subset / --augment to local/extract_lines.sh and
  # local/process_data.py.
  subset=false
  augment=false
  # Forwarded as --use_extra_corpus_text to local/prepare_data.sh.
  use_extra_corpus_text=true
  . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
             ## This relates to the queue.
  . ./path.sh
  . ./utils/parse_options.sh  # e.g. this parses the above options
                              # if supplied.
  # NOTE(review): presumably verifies that the required external tools are
  # installed -- confirm in local/check_tools.sh. Because of `set -e`, a
  # non-zero exit status here stops the script.
  ./local/check_tools.sh
  
  # Create the per-split output directories (train, test, dev) up front,
  # regardless of the stage the script is started from.
  for split in train test dev; do
    mkdir -p data/$split/data
  done
  for split in train test dev; do
    mkdir -p data/local/$split
  done
  # Stage 0: data preparation.
  #   1. local/prepare_data.sh is run with the three corpus directories.
  #   2. local/extract_lines.sh is run once per set (test, train, dev) and
  #      writes into data/local/<set>.
  #   3. local/process_data.py is run once per set (dev, train, test) to fill
  #      data/<set>, followed by image/fix_data_dir.sh on that directory.
  # All expansions are quoted; in particular "$cmd" must reach the called
  # script as ONE argument even when it carries options of its own
  # (e.g. "queue.pl --mem 4G"), otherwise the option list gets corrupted.
  if [ "$stage" -le 0 ]; then

    # Refuse to run over an existing training set unless overwrite is true.
    # $overwrite is executed as the command `true` or `false`.
    if [ -f data/train/text ] && ! $overwrite; then
      echo "$0: Not processing, probably script have run from wrong stage"
      echo "Exiting with status 1 to avoid data corruption"
      exit 1
    fi

    echo "$0: preparing data...$(date)"
    local/prepare_data.sh --data_splits "$data_splits_dir" --download_dir1 "$download_dir1" \
                           --download_dir2 "$download_dir2" --download_dir3 "$download_dir3" \
                           --use_extra_corpus_text "$use_extra_corpus_text"

    for set in test train dev; do
      data_split_file="$data_splits_dir/madcat.$set.raw.lineid"
      local/extract_lines.sh --nj "$nj" --cmd "$cmd" --data_split_file "$data_split_file" \
          --download_dir1 "$download_dir1" --download_dir2 "$download_dir2" \
          --download_dir3 "$download_dir3" --writing_condition1 "$writing_condition1" \
          --writing_condition2 "$writing_condition2" --writing_condition3 "$writing_condition3" \
          --data "data/local/$set" --subset "$subset" --augment "$augment" || exit 1
    done

    echo "$0: Processing data..."
    for set in dev train test; do
      local/process_data.py "$download_dir1" "$download_dir2" "$download_dir3" \
        "$data_splits_dir/madcat.$set.raw.lineid" "data/$set" "$images_scp_dir/$set/images.scp" \
        "$writing_condition1" "$writing_condition2" "$writing_condition3" \
        --augment "$augment" --subset "$subset"
      image/fix_data_dir.sh "data/${set}"
    done

  fi
  
  # Stage 1: feature extraction.
  # Runs image/get_image2num_frames.py and image/get_allowed_lengths.py on
  # data/train, then for each set (test, dev, train) runs
  # local/extract_features.sh and steps/compute_cmvn_stats.sh, and finally
  # utils/fix_data_dir.sh on data/train.
  # "$cmd" is quoted so that a job-dispatch command carrying its own options
  # (e.g. "queue.pl --mem 4G") is passed as a single --cmd value, matching the
  # way the decoding stage passes it.
  if [ "$stage" -le 1 ]; then
    echo "$0: Obtaining image groups. calling get_image2num_frames $(date)."
    image/get_image2num_frames.py data/train
    image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train

    for set in test dev train; do
      echo "$0: Extracting features and calling compute_cmvn_stats for dataset:  $set. $(date)"
      local/extract_features.sh --nj "$nj" --cmd "$cmd" --feat-dim 40 "data/$set"
      steps/compute_cmvn_stats.sh "data/$set" || exit 1
    done
    echo "$0: Fixing data directory for train dataset $(date)."
    utils/fix_data_dir.sh data/train
  fi
  
  # Stage 2: BPE segmentation of the transcripts, then dictionary and lang dir.
  #   - The BPE codes are written to data/local/bpe.txt from the training text
  #     (everything after the first space-separated field of data/train/text).
  #   - For each set, the text is run through the same pipeline with
  #     apply_bpe.py, the "@@" markers are removed, and the result is pasted
  #     back next to the ids (first field). The previous file is kept as
  #     data/<set>/text.old.
  #   - local/prepare_dict.sh, utils/prepare_lang.sh and
  #     utils/lang/bpe/add_final_optional_silence.sh then build data/lang.
  if [ "$stage" -le 2 ]; then
    # Make a pipeline fail when ANY of its commands fails, not only the last
    # one. Without this, a crash in one of the BPE scripts is hidden by the
    # final command of the pipeline (e.g. sed), and the truncated output would
    # replace data/<set>/text below. Together with `set -e` the script now
    # stops before the original text is moved away.
    set -o pipefail

    echo "$0: Preparing BPE..."
    cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \
      utils/lang/bpe/prepend_words.py | \
      utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt

    for set in test train dev; do
      # Utterance ids are the first field; the remainder is the transcript.
      cut -d' ' -f1 "data/$set/text" > "data/$set/ids"
      cut -d' ' -f2- "data/$set/text" | utils/lang/bpe/reverse.py | \
        utils/lang/bpe/prepend_words.py | \
        utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \
        | sed 's/@@//g' > "data/$set/bpe_text"

      # Keep the original transcript and install the segmented one.
      mv "data/$set/text" "data/$set/text.old"
      paste -d' ' "data/$set/ids" "data/$set/bpe_text" > "data/$set/text"
      rm -f "data/$set/bpe_text" "data/$set/ids"
    done

    echo "$0:Preparing dictionary and lang..."
    local/prepare_dict.sh
    utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \
                          data/local/dict "<sil>" data/lang/temp data/lang
    utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang
  fi
  
  # Stage 3: hand over to local/chain/run_e2e_cnn.sh (announced in the log
  # message as the flat-start chain recipe).
  if test $stage -le 3; then
    printf '%s\n' "$0: Calling the flat-start chain recipe... $(date)."
    local/chain/run_e2e_cnn.sh
  fi
  
  # Settings shared by stages 4 and 5. They are assigned after the options
  # were parsed, so they always have these values.
  decode_e2e=true
  lang_rescore=data/lang_rescore_6g
  lang_decode=data/lang

  # Stage 4: run local/train_lm.sh, then format the 6gram_big ARPA model into
  # $lang_decode and build a const-arpa LM from the 6gram_unpruned ARPA model
  # in $lang_rescore.
  if test $stage -le 4; then
    printf '%s\n' "$0: Estimating a language model for decoding..."
    local/train_lm.sh

    arpa_dir=data/local/local_lm/data/arpa
    utils/format_lm.sh data/lang $arpa_dir/6gram_big.arpa.gz \
      data/local/dict/lexicon.txt $lang_decode
    utils/build_const_arpa_lm.sh $arpa_dir/6gram_unpruned.arpa.gz \
      data/lang $lang_rescore
  fi
  
  # Stage 5 (skipped unless decode_e2e is true): run utils/mkgraph.sh on the
  # e2e_cnn_1a model, decode data/test with steps/nnet3/decode.sh, run
  # steps/lmrescore_const_arpa.sh on the result, and print the comparison
  # produced by local/chain/compare_wer.sh. Any failing step ends the script
  # with status 1.
  if test $stage -le 5 && $decode_e2e; then
    printf '%s\n' "$0: $(date) stage 5: decoding end2end setup..."
    e2e_dir=exp/chain/e2e_cnn_1a

    if ! utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \
        $e2e_dir/ $e2e_dir/graph; then
      exit 1
    fi

    if ! steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \
        $e2e_dir/graph data/test $e2e_dir/decode_test; then
      exit 1
    fi

    if ! steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
        data/test $e2e_dir/decode_test $e2e_dir/decode_test_rescored; then
      exit 1
    fi

    printf '%s\n' "$0: Done. Date: $(date). Results:"
    local/chain/compare_wer.sh $e2e_dir/
  fi