get_egs.sh
5.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env bash

# This script is like steps/nnet3/get_egs.sh (it dumps examples for nnet3-based
# neural net training), except it is specialized for classification of
# fixed-size images (setups like MNIST, CIFAR and ImageNet); and you have to
# provide the dev or test data in a separate directory.

# Begin configuration section.
cmd=run.pl            # how to run jobs, e.g. run.pl or queue.pl
egs_per_archive=25000 # target number of images per training archive
train_subset_egs=5000 # size of the training subset used for diagnostics
test_mode=false       # NOTE(review): parsed but not referenced later in this
                      # script -- TODO confirm whether it should select a
                      # single-archive code path
stage=0               # allows restarting partway through
# end configuration section

echo "$0 $@" # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  # Fixed vs. previous version: the option in the first example is
  # --egs-per-archive (the name actually parsed above), and both examples
  # now show all three required positional arguments.
  echo "Usage: $0 [opts] <train-data-dir> <test-or-dev-data-dir> <egs-dir>"
  echo " e.g.: $0 --egs-per-archive 25000 data/cifar10_train data/cifar10_test exp/cifar10_train_egs"
  echo " or: $0 --test-mode true data/cifar10_train data/cifar10_test exp/cifar10_test_egs"
  echo "Options (with defaults):"
  echo " --cmd 'run.pl' How to run jobs (e.g. queue.pl)"
  echo " --test-mode false Set this to true if you just want a single archive"
  echo " egs.ark to be created (useful for test data)"
  echo " --egs-per-archive 25000 Number of images to put in each training archive"
  echo " (this is a target; the actual number will be chosen"
  echo " as an integer fraction of the total."
  echo " --train-subset-egs 5000 Number of images to put in the subset of"
  echo " training examples that's used for diagnostics on"
  echo " each iteration and for combination at the end"
  echo " (note: there is no data held-out from training"
  echo " data; we use the test or dev set for that.)"
  exit 1;
fi
set -eu

train=$1
test=$2
dir=$3

# Check up-front for every input file this script reads: images.scp and
# labels.txt (used for the egs themselves), num_channels (read just below),
# and classes.txt (read further down to count the classes).  Previously
# num_channels and classes.txt were not checked, so a missing file produced
# a confusing failure later instead of a clear message here.
for f in $train/images.scp $train/labels.txt $train/num_channels $train/classes.txt \
         $test/images.scp $test/labels.txt $test/classes.txt; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

if ! mkdir -p $dir; then
  echo "$0: could not make directory $dir"
  exit 1
fi

mkdir -p $dir/info $dir/log

paf="--print-args=false"

# Work out the image geometry from the first feature matrix: it has
# $num_rows rows (the image width) and $num_cols columns (the image height
# times the number of color channels).
num_channels=$(cat $train/num_channels)
num_cols=$(head -n 1 $train/images.scp | feat-to-dim $paf scp:- -)
num_rows=$(head -n 1 $train/images.scp | feat-to-len $paf scp:- ark,t:- | awk '{print $2}')
width=$num_rows
height=$((num_cols/num_channels))  # NOTE(review): width/height are not used
                                   # below; kept for clarity/debugging.

# the width of the image equals $num_rows.
# We put the label on t=0, and on the input, the t values
# go from 0 to $width-1, so in a sense the left-context
# of the model is 0 and the right-context is $width-1.
# This way of looking at it is more natural for speech
# or handwriting-recognition/OCR tasks than it is for
# images, but it's the way we do it.
echo 0 > $dir/info/left_context
echo $((num_rows-1)) > $dir/info/right_context   # $((...)) replaces deprecated $[...]
echo $num_cols >$dir/info/feat_dim

num_train_images=$(wc -l < $train/labels.txt)
num_test_images=$(wc -l < $test/labels.txt)

# Choose a random subset of the training images for the diagnostic egs.
awk '{print $1}' $train/labels.txt | utils/shuffle_list.pl | \
  head -n $train_subset_egs > $dir/train_subset_ids.txt

num_classes=$(wc -l <$train/classes.txt)
num_classes_test=$(wc -l <$test/classes.txt)

if ! [ "$num_classes" -eq "$num_classes_test" ]; then
  echo "$0: training and test dirs $train and $test are not compatible"
  exit 1
fi
if [ $stage -le 0 ]; then
  # Dump 'train_diagnostic' egs: the random training subset chosen above,
  # used to monitor the training-set objective.  The label stream is turned
  # into posteriors (ali-to-post) and then into sparse matrices
  # (post-to-smat) that serve as the supervision on the network's output.
  $cmd $dir/log/get_train_diagnostic_egs.log \
    ali-to-post "ark:filter_scp.pl $dir/train_subset_ids.txt $train/labels.txt|" ark:- \| \
    post-to-smat --dim=$num_classes ark:- ark:- \| \
    nnet3-get-egs-simple input="scp:filter_scp.pl $dir/train_subset_ids.txt $train/images.scp|" \
    output=ark:- ark:$dir/train_diagnostic.egs
fi

if [ $stage -le 1 ]; then
  # we use the same filenames as the regular training script, but
  # the 'valid_diagnostic' egs are actually used as the test or dev
  # set.
  $cmd $dir/log/get_test_or_dev_egs.log \
    ali-to-post ark:$test/labels.txt ark:- \| \
    post-to-smat --dim=$num_classes ark:- ark:- \| \
    nnet3-get-egs-simple input=scp:$test/images.scp \
    output=ark:- ark:$dir/valid_diagnostic.egs
fi

# Now work out the split of the training data.
num_train_images=$(wc -l <$train/labels.txt)

# the + 1 is to round up, not down... we assume it doesn't divide exactly.
# ($((...)) replaces the deprecated $[...] arithmetic syntax.)
num_archives=$((num_train_images/egs_per_archive+1))

if [ $stage -le 2 ]; then
  # Split the training data num_archives ways and dump one archive of egs
  # per split, in parallel (one job per JOB index).
  echo "$0: creating $num_archives archives of egs"
  image/split_image_dir.sh $train $num_archives
  sdata=$train/split$num_archives
  $cmd JOB=1:$num_archives $dir/log/get_egs.JOB.log \
    ali-to-post ark:$sdata/JOB/labels.txt ark:- \| \
    post-to-smat --dim=$num_classes ark:- ark:- \| \
    nnet3-get-egs-simple input=scp:$sdata/JOB/images.scp \
    output=ark:- ark:$dir/egs.JOB.ark
fi
# Remove the temporary subset-id list; tolerate it already being absent.
rm "$dir/train_subset_ids.txt" 2>/dev/null || true

# For model combination at the end of training we simply reuse the
# train-diagnostic egs via a relative symlink.
ln -sf train_diagnostic.egs "$dir/combine.egs"

printf '%s\n' "$num_archives" > "$dir/info/num_archives"

# 'frames_per_eg' is actually the number of supervised frames per example, and
# in this case we only have supervision on t=0 at the output.
printf '%s\n' 1 > "$dir/info/frames_per_eg"
printf '%s\n' "$num_train_images" > "$dir/info/num_frames"

# actually 'output_dim' is not something that would be present in the
# 'info' directory for speech tasks; we add it here for the convenience of
# the training script, to make it easier to get the number of classes.
printf '%s\n' "$num_classes" > "$dir/info/output_dim"

printf '%s\n' "$0: finished generating egs"
exit 0