egs/cifar/v1/image/nnet3/get_egs.sh
#!/usr/bin/env bash

# This script is like steps/nnet3/get_egs.sh (it dumps examples for nnet3-based
# neural net training), except it is specialized for classification of
# fixed-size images (setups like MNIST, CIFAR and ImageNet); and you have to
# provide the dev or test data in a separate directory.


# Begin configuration section.
cmd=run.pl
egs_per_archive=25000
train_subset_egs=5000
test_mode=false
stage=0
# end configuration section

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;


if [ $# != 3 ]; then
  echo "Usage: $0 [opts] <train-data-dir> <test-or-dev-data-dir> <egs-dir>"
  echo " e.g.: $0 --egs-per-archive 25000 data/cifar10_train data/cifar10_test exp/cifar10_train_egs"
  echo "   or: $0 --test-mode true data/cifar10_test data/cifar10_test exp/cifar10_test_egs"
  echo "Options (with defaults):"
  echo "  --cmd 'run.pl'            How to run jobs (e.g. queue.pl)"
  echo "  --test-mode false         Set this to true if you just want a single archive"
  echo "                            egs.ark to be created (useful for test data)"
  echo "  --egs-per-archive 25000   Number of images to put in each training archive"
  echo "                            (this is a target; the actual number will be chosen"
  echo "                            as an integer fraction of the total)"
  echo "  --train-subset-egs 5000   Number of images to put in the subset of"
  echo "                            training examples that's used for diagnostics on"
  echo "                            each iteration and for combination at the end"
  echo "                            (note: no data is held out from the training"
  echo "                            data; we use the test or dev set for that)"
  exit 1
fi

set -eu

train=$1
test=$2
dir=$3

for f in $train/images.scp $train/labels.txt $test/images.scp $test/labels.txt; do
  if [ ! -f $f ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

if ! mkdir -p $dir; then
  echo "$0: could not make directory $dir"
  exit 1
fi

mkdir -p $dir/info $dir/log

paf="--print-args=false"

# Work out the image geometry from the feature matrices.  (E.g. for
# CIFAR-10's 32x32 RGB images: num_channels=3, num_rows=32, num_cols=96,
# so width=32 and height=32.)
num_channels=$(cat $train/num_channels)
num_cols=$(head -n 1 $train/images.scp | feat-to-dim $paf scp:- -)
num_rows=$(head -n 1 $train/images.scp | feat-to-len $paf scp:- ark,t:- | awk '{print $2}')
width=$num_rows
height=$[$num_cols/$num_channels]

# the width of the image equals $num_rows.
# We put the label on t=0, and on the input, the t values
# go from 0 to $width-1, so in a sense the left-context
# of the model is 0 and the right-context is $width-1.
# This way of looking at it is more natural for speech
# or handwriting-recognition/OCR tasks than it is for
# images, but it's the way we do it.
echo 0 > $dir/info/left_context
echo $[num_rows-1] > $dir/info/right_context
echo $num_cols >$dir/info/feat_dim

num_train_images=$(wc -l < $train/labels.txt)
num_test_images=$(wc -l < $test/labels.txt)

awk '{print $1}' $train/labels.txt | utils/shuffle_list.pl | \
  head -n $train_subset_egs > $dir/train_subset_ids.txt

num_classes=$(wc -l <$train/classes.txt)
num_classes_test=$(wc -l <$test/classes.txt)

if ! [ "$num_classes" -eq "$num_classes_test" ]; then
  echo "$0: training and test dirs $train and $test are not compatible"
  exit 1
fi

if [ $stage -le 0 ]; then
  $cmd $dir/log/get_train_diagnostic_egs.log \
    ali-to-post "ark:filter_scp.pl $dir/train_subset_ids.txt $train/labels.txt|" ark:- \| \
    post-to-smat --dim=$num_classes ark:- ark:- \| \
    nnet3-get-egs-simple input="scp:filter_scp.pl $dir/train_subset_ids.txt $train/images.scp|" \
    output=ark:- ark:$dir/train_diagnostic.egs
fi
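# A note on the ali-to-post | post-to-smat | nnet3-get-egs-simple pipeline
# used above (and again below): each line of labels.txt holds an image id
# plus one integer class label, which ali-to-post reads as a length-one
# alignment and converts to a one-frame posterior; post-to-smat then turns
# that into a 1 x num_classes sparse one-hot matrix, which
# nnet3-get-egs-simple pairs, as the 'output', with the 'input' image
# matrix from images.scp to make one example per image.  An illustrative
# labels.txt line (the id here is hypothetical):
#   train_000001 7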
if [ $stage -le 1 ]; then
  # we use the same filenames as the regular training script, but
  # the 'valid_diagnostic' egs are actually used as the test or dev
  # set.
  $cmd $dir/log/get_test_or_dev_egs.log \
    ali-to-post ark:$test/labels.txt ark:- \| \
    post-to-smat --dim=$num_classes ark:- ark:- \| \
    nnet3-get-egs-simple input=scp:$test/images.scp \
    output=ark:- ark:$dir/valid_diagnostic.egs
fi

# Now work out the split of the training data.
num_train_images=$(wc -l <$train/labels.txt)
# the + 1 is to round up, not down... we assume it doesn't divide exactly.
# (E.g. with CIFAR-10's 50000 training images and the default
# egs_per_archive=25000, this gives num_archives = 50000/25000 + 1 = 3.)
num_archives=$[num_train_images/egs_per_archive+1]

if [ $stage -le 2 ]; then
  echo "$0: creating $num_archives archives of egs"
  image/split_image_dir.sh $train $num_archives
  sdata=$train/split$num_archives
  $cmd JOB=1:$num_archives $dir/log/get_egs.JOB.log \
    ali-to-post ark:$sdata/JOB/labels.txt ark:- \| \
    post-to-smat --dim=$num_classes ark:- ark:- \| \
    nnet3-get-egs-simple input=scp:$sdata/JOB/images.scp \
    output=ark:- ark:$dir/egs.JOB.ark
fi

rm $dir/train_subset_ids.txt 2>/dev/null || true

ln -sf train_diagnostic.egs $dir/combine.egs

echo $num_archives >$dir/info/num_archives

# 'frames_per_eg' is actually the number of supervised frames per example, and
# in this case we only have supervision on t=0 at the output.
echo 1 >$dir/info/frames_per_eg
echo $num_train_images >$dir/info/num_frames

# actually 'output_dim' is not something that would be present in the
# 'info' directory for speech tasks; we add it here for the convenience of
# the training script, to make it easier to get the number of classes.
echo $num_classes >$dir/info/output_dim

echo "$0: finished generating egs"
exit 0
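# Illustrative invocation (a sketch; the data and egs directory names below
# just follow the examples in the usage message and are not special, and the
# script path assumes you are running from the recipe's top-level directory):
#
#   image/nnet3/get_egs.sh --egs-per-archive 25000 \
#     data/cifar10_train data/cifar10_test exp/cifar10_train_egs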
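# On success, per the commands above, the egs directory contains:
#   egs.1.ark ... egs.N.ark    training archives (N = info/num_archives)
#   train_diagnostic.egs       egs from a subset of the training images
#   valid_diagnostic.egs       egs made from the test or dev directory
#   combine.egs -> train_diagnostic.egs
#   info/{left_context,right_context,feat_dim,num_archives,frames_per_eg,
#         num_frames,output_dim}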