Blame view

egs/madcat_ar/v1/run.sh 5.61 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
  #!/bin/bash
  
  # Copyright      2017  Chun Chieh Chang
  #                2017  Ashish Arora
  #                2017  Hossein Hadian
  
  # Top-level recipe script. It runs a sequence of numbered stages (data
  # preparation, feature extraction, BPE/lang preparation, language model,
  # GMM training and alignment, then the local/chain/ scripts). Every stage
  # below is guarded by "[ $stage -le N ]", so raising $stage skips the
  # earlier stages.
  
  # Exit as soon as a command fails (subject to the usual `set -e` exceptions,
  # e.g. commands inside conditions and non-final pipeline members).
  set -e
  # First stage to execute; stages numbered lower than this are skipped.
  stage=0
  # Forwarded as --nj to the extraction, training, alignment and decoding scripts.
  nj=70
  # When true, stages 5, 7 and 9 also build a graph and decode data/test with
  # the corresponding GMM system; when false those stages are skipped.
  decode_gmm=false
  # download_dir{1,2,3} point to the database paths on the JHU grid. If you have
  # not already downloaded the database you can set them to a local directory.
  # This corpus can be purchased here:
  # https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/}
  download_dir1=/export/corpora/LDC/LDC2012T15/data
  download_dir2=/export/corpora/LDC/LDC2013T09/data
  download_dir3=/export/corpora/LDC/LDC2013T15/data
  # writing_conditions.tab of each of the three corpora; forwarded to
  # local/extract_lines.sh and local/process_data.py in stage 0.
  writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
  writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
  writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab
  # Directory holding the madcat.{train,dev,test}.raw.lineid split files.
  data_splits_dir=data/download/data_splits
  # Stage 0 reads $images_scp_dir/<set>/images.scp for each data set.
  images_scp_dir=data/local
  # When false, stage 0 refuses to run if data/train/text already exists.
  overwrite=false
  # subset and augment are forwarded unchanged to local/extract_lines.sh and
  # local/process_data.py; their exact meaning is defined by those scripts.
  subset=false
  augment=false
  # Forwarded to local/prepare_data.sh as --use_extra_corpus_text.
  use_extra_corpus_text=true
  . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
             ## This relates to the queue.
  . ./path.sh
  . ./utils/parse_options.sh  # e.g. this parses the above options
                              # if supplied.
  # Runs before any stage; with `set -e` a non-zero exit status stops the script here.
  ./local/check_tools.sh
  # Create the per-set output directories up front so every stage can write to them.
  mkdir -p data/{train,test,dev}/data
  mkdir -p data/local/{train,test,dev}
  
  # Stage 0: data preparation.
  #   1. local/prepare_data.sh is given the split directory and the three
  #      corpus directories.
  #   2. local/extract_lines.sh is run for each of test/train/dev, writing to
  #      data/local/<set>.
  #   3. local/process_data.py fills data/<set>, which is then passed through
  #      image/fix_data_dir.sh.
  # Every expansion is double-quoted so that a $cmd carrying options
  # (e.g. "queue.pl --mem 4G") or a corpus path containing spaces is passed to
  # the callee as ONE argument instead of being word-split.
  if [ "$stage" -le 0 ]; then
    # Guard against clobbering an existing data/train/text unless the caller
    # explicitly passed --overwrite true.
    if [ -f data/train/text ] && ! $overwrite; then
      echo "$0: Not processing, probably script have run from wrong stage"
      echo "Exiting with status 1 to avoid data corruption"
      exit 1;
    fi
    local/prepare_data.sh --data_splits "$data_splits_dir" --download_dir1 "$download_dir1" \
                           --download_dir2 "$download_dir2" --download_dir3 "$download_dir3" \
                           --use_extra_corpus_text "$use_extra_corpus_text"
  
    for set in test train dev; do
      data_split_file=$data_splits_dir/madcat.$set.raw.lineid
      local/extract_lines.sh --nj "$nj" --cmd "$cmd" --data_split_file "$data_split_file" \
          --download_dir1 "$download_dir1" --download_dir2 "$download_dir2" \
          --download_dir3 "$download_dir3" --writing_condition1 "$writing_condition1" \
          --writing_condition2 "$writing_condition2" --writing_condition3 "$writing_condition3" \
          --data "data/local/$set" --subset "$subset" --augment "$augment" || exit 1
    done
  
    echo "$0: Processing data..."
    for set in dev train test; do
      local/process_data.py "$download_dir1" "$download_dir2" "$download_dir3" \
        "$data_splits_dir/madcat.$set.raw.lineid" "data/$set" "$images_scp_dir/$set/images.scp" \
        "$writing_condition1" "$writing_condition2" "$writing_condition3" \
        --augment "$augment" --subset "$subset"
      image/fix_data_dir.sh "data/${set}"
    done
  fi
  
  
  # Stage 1: feature extraction and CMVN statistics for the test and train sets
  # (dev is not processed here), followed by a consistency fix of data/train.
  # "$cmd" is quoted so that a job-submission command with options stays a
  # single argument to --cmd.
  if [ "$stage" -le 1 ]; then
    for dataset in test train; do
      local/extract_features.sh --nj "$nj" --cmd "$cmd" --feat-dim 40 "data/$dataset"
      steps/compute_cmvn_stats.sh "data/$dataset" || exit 1;
    done
    utils/fix_data_dir.sh data/train
  fi
  
  # Stage 2: learn BPE codes on the training text, rewrite the text file of
  # every data set with its BPE-segmented form (the previous file is kept as
  # text.old), then build the dictionary and the lang directory.
  if [ $stage -le 2 ]; then
    echo "$0: Preparing BPE..."
  
    # Emit the text part (fields 2..end) of the file given as $1, passed
    # through the reverse.py and prepend_words.py filters.
    bpe_prep_text() {
      cut -d' ' -f2- "$1" \
        | utils/lang/bpe/reverse.py \
        | utils/lang/bpe/prepend_words.py
    }
  
    bpe_codes=data/local/bpe.txt
    bpe_prep_text data/train/text \
      | utils/lang/bpe/learn_bpe.py -s 700 > "$bpe_codes"
  
    for split in test train dev; do
      split_dir=data/$split
  
      # First field (the ids) goes to a scratch file; the remaining text is
      # BPE-encoded separately and the two are pasted back together below.
      cut -d' ' -f1 "$split_dir/text" > "$split_dir/ids"
      bpe_prep_text "$split_dir/text" \
        | utils/lang/bpe/apply_bpe.py -c "$bpe_codes" \
        | sed 's/@@//g' > "$split_dir/bpe_text"
  
      mv "$split_dir/text" "$split_dir/text.old"
      paste -d' ' "$split_dir/ids" "$split_dir/bpe_text" > "$split_dir/text"
      rm -f "$split_dir/bpe_text" "$split_dir/ids"
    done
  
    echo "$0:Preparing dictionary and lang..."
    local/prepare_dict.sh
    utils/prepare_lang.sh \
      --num-sil-states 4 \
      --num-nonsil-states 8 \
      --sil-prob 0.0 \
      --position-dependent-phones false \
      data/local/dict "<sil>" data/lang/temp data/lang
    utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang
  fi
  
  # Stage 3: train the language model, format the small 6-gram ARPA into
  # data/lang, and build a const-ARPA LM from the unpruned 6-gram in
  # data/lang_rescore_6g.
  if [ $stage -le 3 ]; then
    echo "$0: Estimating a language model for decoding..."
    local/train_lm.sh
  
    # Both ARPA files are read from the same directory.
    lm_arpa_dir=data/local/local_lm/data/arpa
  
    utils/format_lm.sh \
      data/lang "$lm_arpa_dir/6gram_small.arpa.gz" \
      data/local/dict/lexicon.txt data/lang
    utils/build_const_arpa_lm.sh \
      "$lm_arpa_dir/6gram_unpruned.arpa.gz" \
      data/lang data/lang_rescore_6g
  fi
  
  # Stage 4: train the "mono" GMM system on data/train into exp/mono.
  # "$cmd" is quoted so a job command with options is not word-split.
  if [ "$stage" -le 4 ]; then
    steps/train_mono.sh --nj "$nj" --cmd "$cmd" --totgauss 10000 data/train \
      data/lang exp/mono
  fi
  
  # Stage 5 (only when decode_gmm is true): build the decoding graph for
  # exp/mono and decode data/test with it.
  # "$cmd" is quoted so a job command with options is not word-split.
  if [ "$stage" -le 5 ] && $decode_gmm; then
    utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph
  
    steps/decode.sh --nj "$nj" --cmd "$cmd" exp/mono/graph data/test \
      exp/mono/decode_test
  fi
  
  # Stage 6: align data/train with the exp/mono model (-> exp/mono_ali), then
  # train the delta-feature system exp/tri on those alignments.
  # "$cmd" is quoted so a job command with options is not word-split.
  if [ "$stage" -le 6 ]; then
    steps/align_si.sh --nj "$nj" --cmd "$cmd" data/train data/lang \
      exp/mono exp/mono_ali
  
    steps/train_deltas.sh --cmd "$cmd" 500 20000 data/train data/lang \
      exp/mono_ali exp/tri
  fi
  
  # Stage 7 (only when decode_gmm is true): build the decoding graph for
  # exp/tri and decode data/test with it.
  # "$cmd" is quoted so a job command with options is not word-split.
  if [ "$stage" -le 7 ] && $decode_gmm; then
    utils/mkgraph.sh data/lang exp/tri exp/tri/graph
  
    steps/decode.sh --nj "$nj" --cmd "$cmd" exp/tri/graph data/test \
      exp/tri/decode_test
  fi
  
  # Stage 8: align data/train with the exp/tri model (-> exp/tri_ali), then run
  # steps/train_lda_mllt.sh with +/-3 frames of splicing context to produce
  # exp/tri3.
  # "$cmd" is quoted so a job command with options is not word-split.
  if [ "$stage" -le 8 ]; then
    steps/align_si.sh --nj "$nj" --cmd "$cmd" data/train data/lang \
      exp/tri exp/tri_ali
  
    steps/train_lda_mllt.sh --cmd "$cmd" \
      --splice-opts "--left-context=3 --right-context=3" 500 20000 \
      data/train data/lang exp/tri_ali exp/tri3
  fi
  
  # Stage 9 (only when decode_gmm is true): build the decoding graph for
  # exp/tri3 and decode data/test with it.
  # "$cmd" is quoted so a job command with options is not word-split.
  if [ "$stage" -le 9 ] && $decode_gmm; then
    utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph
  
    steps/decode.sh --nj "$nj" --cmd "$cmd" exp/tri3/graph \
      data/test exp/tri3/decode_test
  fi
  
  # Stage 10: align data/train with the exp/tri3 model via steps/align_fmllr.sh
  # (with --use-graphs true), writing the alignments to exp/tri3_ali.
  # "$cmd" is quoted so a job command with options is not word-split.
  if [ "$stage" -le 10 ]; then
    steps/align_fmllr.sh --nj "$nj" --cmd "$cmd" --use-graphs true \
      data/train data/lang exp/tri3 exp/tri3_ali
  fi
  
  # Stages 11 and 12 hand over to the scripts under local/chain/.
  
  # Stage 11: run local/chain/run_cnn.sh with its default options.
  if test $stage -le 11
  then
    local/chain/run_cnn.sh
  fi
  
  # Stage 12: run local/chain/run_cnn_chainali.sh, starting it at its own
  # internal stage 2.
  if test $stage -le 12
  then
    local/chain/run_cnn_chainali.sh --stage 2
  fi