Yannick Estève / ONTRAC-Kaldi

Blame view

egs/cifar/v1/image/nnet3/get_egs.sh 5.21 KB
  #!/usr/bin/env bash
  
  # This script is like steps/nnet3/get_egs.sh (it dumps examples for nnet3-based
  # neural net training), except it is specialized for classification of
  # fixed-size images (setups like MNIST, CIFAR and ImageNet); and you have to
  # provide the dev or test data in a separate directory.
  
  
  # Begin configuration section.
  # Defaults for the command-line options.  They are assigned before
  # parse_options.sh is sourced further down (presumably so that a matching
  # --option on the command line can override them -- confirm against
  # parse_options.sh).
  stage=0                 # a numbered step below runs only if stage <= its number
  cmd=run.pl              # program used to launch each egs-dumping job
  egs_per_archive=25000   # target number of images per training archive
  train_subset_egs=5000   # number of training ids kept for the diagnostic subset
  test_mode=false         # NOTE(review): nothing else in this script reads this
  # end configuration section

  echo "$0 $*"  # Log the command line this script was invoked with

  # Pick up the environment from path.sh when it exists in the current directory.
  [ ! -f path.sh ] || . ./path.sh
  . parse_options.sh || exit 1
  
  
  # Exactly three positional arguments are required (after option parsing):
  # the training data dir, the test-or-dev data dir and the output egs dir.
  # Anything else prints the usage message and exits with status 1.
  if [ $# -ne 3 ]; then
    echo "Usage: $0 [opts] <train-data-dir> <test-or-dev-data-dir> <egs-dir>"
    echo " e.g.: $0 --egs-per-archive 25000 data/cifar10_train data/cifar10_test exp/cifar10_train_egs"
    echo "Options (with defaults):"
    echo "  --cmd 'run.pl'     How to run jobs (e.g. queue.pl)"
    echo "  --stage 0          Steps numbered below this value are skipped"
    echo "  --test-mode false  Accepted, but currently has no effect in this script"
    echo "  --egs-per-archive 25000  Number of images to put in each training archive"
    echo "                     (this is a target; the actual number will be chosen"
    echo "                     as an integer fraction of the total)."
    echo "  --train-subset-egs 5000  Number of images to put in the subset of"
    echo "                     training examples that's used for diagnostics on"
    echo "                     each iteration and for combination at the end"
    echo "                     (note: there is no data held-out from training"
    echo "                     data; we use the test or dev set for that.)"
    exit 1
  fi
  
  
  # From here on, abort on any failing command (-e) and on any use of an unset
  # variable (-u).
  set -eu

  train=$1   # training data directory
  test=$2    # test or dev data directory
  dir=$3     # output egs directory

  # Check up front for every input file this script reads directly, so that a
  # missing one is reported by name instead of surfacing later as a bare
  # 'cat' or 'wc' failure under 'set -e'.
  for f in "$train/images.scp" "$train/labels.txt" "$train/classes.txt" \
           "$train/num_channels" \
           "$test/images.scp" "$test/labels.txt" "$test/classes.txt"; do
    if [ ! -f "$f" ]; then
      echo "$0: expected file $f to exist"
      exit 1
    fi
  done
  
  
  
  # Create the output directory, reporting a failure explicitly rather than
  # relying on the 'set -e' abort alone.
  if ! mkdir -p "$dir"; then
    echo "$0: could not make directory $dir"
    exit 1
  fi

  # info/ receives the small metadata files written by this script and log/
  # receives the log files of the jobs it launches.
  mkdir -p "$dir/info" "$dir/log"
  
  
  # Work out the image geometry from the first entry of the training images.scp:
  # feat-to-dim gives the number of columns of the stored matrix and feat-to-len
  # its number of rows.
  paf="--print-args=false"
  num_channels=$(cat "$train/num_channels")
  num_cols=$(head -n 1 "$train/images.scp" | feat-to-dim $paf scp:- -)
  num_rows=$(head -n 1 "$train/images.scp" | feat-to-len $paf scp:- ark,t:- | awk '{print $2}')
  # the width of the image equals $num_rows; each row holds height * channels
  # values.  (width and height are not read again in this script.)
  width=$num_rows
  height=$((num_cols / num_channels))


  # We put the label on t=0, and on the input, the t values
  # go from 0 to $width-1, so in a sense the left-context
  # of the model is 0 and the right-context is $width-1.
  # This way of looking at it is more natural for speech
  # or handwriting-recognition/OCR tasks than it is for
  # images, but it's the way we do it.
  echo 0 > "$dir/info/left_context"
  echo "$((num_rows - 1))" > "$dir/info/right_context"
  echo "$num_cols" > "$dir/info/feat_dim"
  
  # Number of training images = number of lines in the training labels file.
  num_train_images=$(wc -l < "$train/labels.txt")

  # Choose the ids of the training subset used for the diagnostic egs: take the
  # first field of every label line, pass the list through
  # utils/shuffle_list.pl and keep at most $train_subset_egs of them.
  awk '{print $1}' "$train/labels.txt" | utils/shuffle_list.pl | \
     head -n "$train_subset_egs" > "$dir/train_subset_ids.txt"


  # The number of classes is the number of lines in classes.txt; the train and
  # test dirs must agree on it.
  num_classes=$(wc -l < "$train/classes.txt")
  num_classes_test=$(wc -l < "$test/classes.txt")

  # Written as '! [ -eq ]' rather than '[ -ne ]' on purpose: if either count is
  # not a valid integer the test itself errors out, and this form then still
  # takes the failure branch instead of silently skipping the check.
  if ! [ "$num_classes" -eq "$num_classes_test" ]; then
    echo "$0: training and test dirs $train and $test are not compatible"
    exit 1
  fi
  
  # Stage 0: dump the training-diagnostic egs to $dir/train_diagnostic.egs.
  # Only the ids listed in $dir/train_subset_ids.txt are used: filter_scp.pl
  # selects those entries from the training labels.txt (which then go through
  # ali-to-post and post-to-smat to become the 'output' stream) and from the
  # training images.scp (the 'input' stream) of nnet3-get-egs-simple.
  if [ $stage -le 0 ]; then
    # The pipes are written as '\|' so that this shell hands them to $cmd as
    # literal arguments instead of building the pipeline itself.  The first
    # argument is presumably the log file of the job -- confirm against $cmd.
    $cmd $dir/log/get_train_diagnostic_egs.log \
         ali-to-post "ark:filter_scp.pl $dir/train_subset_ids.txt $train/labels.txt|" ark:- \| \
         post-to-smat --dim=$num_classes ark:- ark:- \| \
         nnet3-get-egs-simple input="scp:filter_scp.pl $dir/train_subset_ids.txt $train/images.scp|" \
         output=ark:- ark:$dir/train_diagnostic.egs
  fi
  
  
  
  # Stage 1: dump egs for the whole test-or-dev directory to
  # $dir/valid_diagnostic.egs.  Same pipeline as stage 0, but reading the
  # unfiltered labels.txt and images.scp of $test.
  if [ $stage -le 1 ]; then
    # We use the same filenames as the regular training script, but
    # the 'valid_diagnostic' egs are actually used as the test or dev
    # set.
    # As in the other stages, '\|' passes a literal pipe character on to $cmd.
    $cmd $dir/log/get_test_or_dev_egs.log \
         ali-to-post ark:$test/labels.txt ark:- \| \
         post-to-smat --dim=$num_classes ark:- ark:- \| \
         nnet3-get-egs-simple input=scp:$test/images.scp \
         output=ark:- ark:$dir/valid_diagnostic.egs
  fi
  
  # Now work out the split of the training data.

  # ceil_div <count> <per-chunk>
  # Prints the number of chunks of at most <per-chunk> items needed to hold
  # <count> items, i.e. ceil(count / per-chunk), but never less than 1.
  # Returns non-zero (printing a message on stderr and nothing on stdout) if
  # <per-chunk> is not a positive integer.
  ceil_div() {
    local count=$1 per_chunk=$2 chunks
    if ! [ "$per_chunk" -gt 0 ] 2>/dev/null; then
      echo "$0: expected a positive integer but got '$per_chunk'" >&2
      return 1
    fi
    chunks=$(( (count + per_chunk - 1) / per_chunk ))
    if [ "$chunks" -lt 1 ]; then
      chunks=1
    fi
    echo "$chunks"
  }

  num_train_images=$(wc -l <"$train/labels.txt")

  # Round up rather than down, without overshooting when the count divides
  # exactly: 50000 images at 25000 per archive gives 2 archives, not 3.
  num_archives=$(ceil_div "$num_train_images" "$egs_per_archive")
  
  
  # Stage 2: dump the training egs, one archive per split of the training data.
  if [ $stage -le 2 ]; then
    echo "$0: creating $num_archives archives of egs"
  
    # Split the training dir into $num_archives pieces.  The paths used below
    # expect the pieces under $train/split$num_archives/<n>/, each with its own
    # labels.txt and images.scp.
    image/split_image_dir.sh $train $num_archives
  
    sdata=$train/split$num_archives
  
    # One job per piece (JOB=1:$num_archives), running the same
    # ali-to-post | post-to-smat | nnet3-get-egs-simple pipeline as the earlier
    # stages and writing $dir/egs.JOB.ark.  The word JOB in the arguments is
    # presumably replaced with the job index by $cmd -- confirm against $cmd.
    # '\|' passes a literal pipe character on to $cmd.
    $cmd JOB=1:$num_archives $dir/log/get_egs.JOB.log \
         ali-to-post ark:$sdata/JOB/labels.txt ark:- \| \
         post-to-smat --dim=$num_classes ark:- ark:- \| \
         nnet3-get-egs-simple input=scp:$sdata/JOB/images.scp \
          output=ark:- ark:$dir/egs.JOB.ark
  fi
  
  # The subset id list is not read again after this point; remove it on a
  # best-effort basis (any error from rm is deliberately ignored).
  rm "$dir/train_subset_ids.txt" 2>/dev/null || true

  # combine.egs is a relative symlink to train_diagnostic.egs in the same dir.
  ln -sf train_diagnostic.egs "$dir/combine.egs"

  echo "$num_archives" >"$dir/info/num_archives"

  # 'frames_per_eg' is actually the number of supervised frames per example, and
  # in this case we only have supervision on t=0 at the output.
  echo 1 >"$dir/info/frames_per_eg"

  # The two counts below come from 'wc -l'.  They are left unquoted on purpose:
  # some wc implementations pad the number with spaces, and word splitting
  # drops that padding before the value is written.
  echo $num_train_images >"$dir/info/num_frames"

  # actually 'output_dim' is not something that would be present in the
  # 'info' directory for speech tasks; we add it here for the convenience of
  # the training script, to make it easier to get the number of classes.
  echo $num_classes >"$dir/info/output_dim"

  echo "$0: finished generating egs"
  exit 0