Blame view

egs/cifar/v1/image/nnet3/get_egs.sh 5.21 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
  #!/usr/bin/env bash
  
  # This script is like steps/nnet3/get_egs.sh (it dumps examples for nnet3-based
  # neural net training), except it is specialized for classification of
  # fixed-size images (setups like MNIST, CIFAR and ImageNet); and you have to
  # provide the dev or test data in a separate directory.
  
  
  # Begin configuration section.
  # Defaults for the script's tunable settings.
  # NOTE(review): presumably parse_options.sh (sourced below) overrides these
  # from --dashed-name command-line options (the usage text advertises --cmd,
  # --test-mode, --egs-per-archive and --train-subset-egs) -- confirm against
  # that script, which is not part of this file.
  #
  # cmd: job launcher; used below as `$cmd [JOB=1:N] <log-file> <command ...>`.
  cmd=run.pl
  # egs_per_archive: target number of training images per archive; the number
  # of egs.JOB.ark archives is derived from it further down.
  egs_per_archive=25000
  # train_subset_egs: how many shuffled training ids are kept for building
  # train_diagnostic.egs.
  train_subset_egs=5000
  # test_mode: advertised in the usage message, but not read anywhere else in
  # this script.
  test_mode=false
  # stage: steps guarded by `[ $stage -le N ]` (N = 0, 1, 2) are skipped when
  # stage is greater than N.
  stage=0
  # end configuration section
  
  echo "$0 $@"  # Print the command line for logging
  
  # Source ./path.sh if it exists in the current working directory.
  if [ -f path.sh ]; then . ./path.sh; fi
  # Give up if parse_options.sh cannot be sourced or returns non-zero.
  . parse_options.sh || exit 1;
  
  
  # Exactly three positional arguments are required at this point; anything
  # else prints the usage message and exits with status 1.
  #
  # The text is kept in sync with what the script actually does:
  #  - the example uses a real option name and supplies all three arguments;
  #  - --test-mode is described as having no effect, because nothing in this
  #    script reads the test_mode variable;
  #  - --stage is listed, since the three steps below are guarded by it.
  if [ $# -ne 3 ]; then
    echo "Usage: $0 [opts] <train-data-dir> <test-or-dev-data-dir> <egs-dir>"
    echo " e.g.: $0 --egs-per-archive 25000 data/cifar10_train data/cifar10_test exp/cifar10_train_egs"
    echo "Options (with defaults):"
    echo "  --cmd 'run.pl'     How to run jobs (e.g. queue.pl)"
    echo "  --stage 0          Steps numbered below this value are skipped"
    echo "                     (0: train diagnostic egs, 1: test/dev egs,"
    echo "                     2: training archives)"
    echo "  --test-mode false  Currently has no effect: this script never reads it"
    echo "  --egs-per-archive 25000  Number of images to put in each training archive"
    echo "                     (this is a target; the actual number will be chosen"
    echo "                     as an integer fraction of the total)."
    echo "  --train-subset-egs 5000  Number of images to put in the subset of"
    echo "                     training examples that's used for diagnostics on"
    echo "                     each iteration and for combination at the end"
    echo "                     (note: there is no data held-out from training"
    echo "                     data; we use the test or dev set for that.)"
    exit 1;
  fi
  
  
  # From here on: abort on any unhandled command failure (-e) and on any use of
  # an unset variable (-u).
  set -eu
  
  # Positional arguments: training data dir, test-or-dev data dir, output dir.
  train=$1
  test=$2
  dir=$3
  
  # Fail early, with a clear message, if any input file that this script reads
  # later on is missing.  Besides images.scp and labels.txt of both data dirs,
  # that is num_channels of the training dir and classes.txt of both dirs;
  # without checking them here, a missing one would only surface further down
  # as a bare shell/cat error.
  for f in "$train/images.scp" "$train/labels.txt" "$train/classes.txt" \
           "$train/num_channels" \
           "$test/images.scp" "$test/labels.txt" "$test/classes.txt"; do
    if [ ! -f "$f" ]; then
      echo "$0: expected file $f to exist"
      exit 1
    fi
  done
  
  
  
  # Create the output directory (and any missing parents); report and give up
  # if that is not possible.
  mkdir -p $dir || {
    echo "$0: could not make directory $dir"
    exit 1
  }
  
  # Sub-directories that receive the metadata files and the job logs below.
  mkdir -p $dir/info $dir/log
  
  
  # Work out the image geometry from the first entry of the training images.scp:
  # num_cols is what feat-to-dim reports for that entry, and num_rows is the
  # second field of feat-to-len's text-mode output for it.
  quiet_opt="--print-args=false"
  num_channels=$(cat $train/num_channels)
  num_cols=$(head -n 1 $train/images.scp | feat-to-dim $quiet_opt scp:- -)
  num_rows=$(
    head -n 1 $train/images.scp \
      | feat-to-len $quiet_opt scp:- ark,t:- \
      | awk '{print $2}'
  )
  # The width of the image equals $num_rows; the height is the per-row
  # dimension divided by the number of channels.
  width=$num_rows
  height=$(( $num_cols / $num_channels ))
  
  
  # The label sits on t=0, while the input t values run from 0 to $width-1.
  # So, in a sense, the model has a left-context of 0 and a right-context of
  # $width-1.  That view is more natural for speech or
  # handwriting-recognition/OCR tasks than for images, but it is the
  # convention used here.
  echo 0 > $dir/info/left_context
  echo $((num_rows - 1)) > $dir/info/right_context
  echo $num_cols >$dir/info/feat_dim
  
  # Choose the training images used for the train-diagnostic egs: take the
  # first field of every line of the training labels file, shuffle the list and
  # keep the first $train_subset_egs ids.
  #
  # (No image counts are computed here: the test-image count was never used,
  # and the training-image count is computed further down, right before it is
  # first needed.)
  awk '{print $1}' "$train/labels.txt" | utils/shuffle_list.pl \
    | head -n "$train_subset_egs" > "$dir/train_subset_ids.txt"
  
  
  # The number of classes is the number of lines of classes.txt; the two data
  # directories have to agree on it.
  num_classes=$(wc -l <"$train/classes.txt")
  num_classes_test=$(wc -l <"$test/classes.txt")
  
  # Written as `! [ ... -eq ... ]` on purpose: if either value is not an
  # integer the test itself errors out, and the negation then still takes the
  # failure branch.
  if ! [ "$num_classes" -eq "$num_classes_test" ]; then
    echo "$0: training and test dirs $train and $test are not compatible"
    exit 1
  fi
  
  # Stage 0: dump the egs for the diagnostic subset of the training data into
  # $dir/train_diagnostic.egs.
  if [ $stage -le 0 ]; then
    # Both inputs are piped through filter_scp.pl so that only the entries
    # listed in train_subset_ids.txt are read.
    subset_ids=$dir/train_subset_ids.txt
    subset_labels="ark:filter_scp.pl $subset_ids $train/labels.txt|"
    subset_images="scp:filter_scp.pl $subset_ids $train/images.scp|"
    log=$dir/log/get_train_diagnostic_egs.log
  
    # The escaped bars reach $cmd as literal '|' arguments, so the
    # ali-to-post | post-to-smat | nnet3-get-egs-simple pipeline is put
    # together by $cmd rather than by this shell.
    $cmd $log \
      ali-to-post "$subset_labels" ark:- \| \
      post-to-smat --dim=$num_classes ark:- ark:- \| \
      nnet3-get-egs-simple input="$subset_images" output=ark:- \
        ark:$dir/train_diagnostic.egs
  fi
  
  
  
  # Stage 1: dump the egs for the whole test-or-dev directory.
  if [ $stage -le 1 ]; then
    # The output keeps the file name used by the regular training script
    # ('valid_diagnostic'), although here these egs actually serve as the test
    # or dev set.
    log=$dir/log/get_test_or_dev_egs.log
  
    # Same three-command pipeline as for the training data, fed with the
    # unfiltered labels and images of $test; the escaped bars are passed to
    # $cmd as literal '|' arguments.
    $cmd $log \
      ali-to-post ark:$test/labels.txt ark:- \| \
      post-to-smat --dim=$num_classes ark:- ark:- \| \
      nnet3-get-egs-simple input=scp:$test/images.scp output=ark:- \
        ark:$dir/valid_diagnostic.egs
  fi
  
  # Now work out the split of the training data.
  
  # Usage: num_archives_for <num-images> <egs-per-archive>
  # Prints the number of archives needed so that no archive holds more than
  # <egs-per-archive> images, i.e. ceil(num-images / egs-per-archive), but never
  # less than 1.  Unlike "n / e + 1", this does not create an extra archive when
  # the division is exact (e.g. 50000 images at 25000 per archive give 2, not 3).
  # Returns non-zero, with a message on stderr, if <egs-per-archive> is not a
  # positive integer (which would otherwise end in a division by zero).
  num_archives_for() {
    local n=$1 per=$2
    if ! [ "$per" -ge 1 ] 2>/dev/null; then
      echo "$0: --egs-per-archive must be a positive integer (got '$per')" >&2
      return 1
    fi
    local count=$(( (n + per - 1) / per ))
    if [ "$count" -lt 1 ]; then
      count=1
    fi
    echo "$count"
  }
  
  # One line of labels.txt per training image.
  num_train_images=$(wc -l <$train/labels.txt)
  
  num_archives=$(num_archives_for "$num_train_images" "$egs_per_archive")
  
  
  # Stage 2: split the training directory into $num_archives pieces and dump
  # one egs archive per piece.
  if [ $stage -le 2 ]; then
    echo "$0: creating $num_archives archives of egs"
  
    image/split_image_dir.sh $train $num_archives
  
    # The per-piece labels.txt and images.scp are read from
    # $train/split$num_archives/JOB.
    sdata=$train/split$num_archives
  
    # JOB=1:$num_archives is passed on to $cmd, and the literal JOB in the log
    # name and in the paths stands for the job index; the escaped bars reach
    # $cmd as literal '|' arguments.
    $cmd JOB=1:$num_archives $dir/log/get_egs.JOB.log \
         ali-to-post ark:$sdata/JOB/labels.txt ark:- \| \
         post-to-smat --dim=$num_classes ark:- ark:- \| \
         nnet3-get-egs-simple input=scp:$sdata/JOB/images.scp \
          output=ark:- ark:$dir/egs.JOB.ark
  fi
  
  # The list of subset ids is not needed any more; remove it, deliberately
  # ignoring any failure (for instance when the file is already gone).
  rm $dir/train_subset_ids.txt 2>/dev/null || :
  
  # combine.egs is a relative symlink to train_diagnostic.egs in the same
  # directory.
  ln -sf train_diagnostic.egs $dir/combine.egs
  
  info=$dir/info
  
  echo $num_archives >$info/num_archives
  
  # 'frames_per_eg' really means the number of supervised frames per example;
  # here the only supervision is on t=0 at the output, hence 1.
  echo 1 >$info/frames_per_eg
  
  # One "frame" per training image.
  echo $num_train_images >$info/num_frames
  
  # 'output_dim' would not normally be found in the 'info' directory for speech
  # tasks; it is written here as a convenience for the training script, so that
  # the number of classes is easy to obtain.
  echo $num_classes >$info/output_dim
  
  echo "$0: finished generating egs"
  exit 0