Yannick Estève / ONTRAC-Kaldi

Blame view

egs/sre08/v1/sid/nnet3/xvector/get_egs.sh 11.9 KB
  #!/bin/bash
  
  # Copyright      2017 Johns Hopkins University (Author: Daniel Povey)
  #                2017 Johns Hopkins University (Author: Daniel Garcia-Romero)
  #                2017 David Snyder
  # Apache 2.0
  #
  # This script dumps training examples (egs) for multiclass xvector training.
  # These egs consist of a data chunk and a zero-based speaker label.
  # Each archive of egs has, in general, a different input chunk-size.
  # We don't mix together different lengths in the same archive, because it
  # would require us to repeatedly run the compilation process within the same
  # training job.
  #
  # This script, which will generally be called from other neural net training
  # scripts, extracts the training examples used to train the neural net (and
  # also the validation examples used for diagnostics), and puts them in
  # separate archives.
  
  
  # Begin configuration section.
  # Job launcher; see the --cmd entry of the usage message.
  cmd=run.pl

  # Bounds, in frames, on the length of the data chunks.  Each archive holds
  # chunks of a length randomly chosen between $min_frames_per_chunk and
  # $max_frames_per_chunk.
  min_frames_per_chunk=50
  max_frames_per_chunk=300

  # Target number of frames per training archive.
  frames_per_iter=10000000

  # Target number of frames per archive for the archives that are only used
  # for diagnostics.
  frames_per_iter_diagnostic=100000

  # How many diagnostic archives to produce.  Training likelihoods are tested
  # on a range of utterance lengths, and this number controls how many
  # archives are evaluated.
  num_diagnostic_archives=3


  # Compression of the dumped egs; set to false to disable it (e.g. to see
  # whether results are affected).
  compress=true

  # Number of utterances held out for validation; the same number of
  # utterances is also sampled from the remaining data as the training subset
  # used for diagnostics.
  num_heldout_utts=100

  # Number of times each speaker repeats per archive.
  num_repeats=1

  # First stage to run.
  stage=0

  # Maximum number of jobs you are comfortable running in parallel; it can be
  # increased if your disk speed is greater and you have more machines.
  nj=6
  
  # Record the exact invocation in the log output.
  echo "$0 $@"

  # Source path.sh from the current directory when it exists, then hand the
  # command line to parse_options.sh; give up if option parsing fails.
  [ ! -f path.sh ] || . ./path.sh
  if ! . parse_options.sh; then
    exit 1
  fi
  
  # Exactly two positional arguments are required: the input data directory
  # and the output egs directory.  Otherwise print the usage text and exit 1.
  # The option names and defaults listed here mirror the variables of the
  # configuration section (parse_options.sh is expected to map --foo-bar onto
  # $foo_bar), so keep both in sync when changing either.
  if [ $# -ne 2 ]; then
    echo "Usage: $0 [opts] <data> <egs-dir>"
    echo " e.g.: $0 data/train exp/xvector_a/egs"
    echo ""
    echo "Main options (for others, see top of script file)"
    echo "  --config <config-file>                           # config file containing options"
    echo "  --nj <nj>                                        # The maximum number of jobs you want to run in"
    echo "                                                   # parallel (increase this only if you have good disk and"
    echo "                                                   # network speed).  default=6"
    echo "  --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
    echo "  --min-frames-per-chunk <#frames;50>              # The minimum number of frames per chunk that we dump"
    echo "  --max-frames-per-chunk <#frames;300>             # The maximum number of frames per chunk that we dump"
    echo "  --num-repeats <#repeats;1>                       # The (approximate) number of times the training"
    echo "                                                   # data is repeated in the egs"
    echo "  --frames-per-iter <#frames;10000000>             # Target number of frames per archive"
    echo "  --num-heldout-utts <#utts;100>                   # Number of utterances held out for validation; the same"
    echo "                                                   # number is sampled from the rest as a training subset"
    echo "  --num-diagnostic-archives <#archives;3>          # Option that controls how many different versions of"
    echo "                                                   # the train and validation archives we create (e.g."
    echo "                                                   # train_diagnostic_egs.{1,2,3}.ark and"
    echo "                                                   # valid_egs.{1,2,3}.ark by default);"
    echo "                                                   # they contain different utterance lengths."
    echo "  --frames-per-iter-diagnostic <#frames;100000>    # Target number of frames for the diagnostic archives"
    echo "                                                   # {train_diagnostic_egs,valid_egs}.*.ark"
    echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
    echo "                                                   # the middle."

    exit 1;
  fi
  
  # Positional arguments: input data directory and output egs directory.
  data=$1
  dir=$2

  # Inputs that must exist before anything is written.  utt2spk and spk2utt
  # are only read while building the utterance lists and the speaker map
  # (stage 0), so they are only required when that stage is going to run.
  required_files=("$data/utt2num_frames" "$data/feats.scp")
  if [ "$stage" -le 0 ]; then
    required_files+=("$data/utt2spk" "$data/spk2utt")
  fi
  for f in "${required_files[@]}"; do
    if [ ! -f "$f" ]; then
      echo "$0: expected file $f"
      exit 1
    fi
  done

  feat_dim=$(feat-to-dim "scp:$data/feats.scp" -) || exit 1

  # $dir/info holds the metadata files written below; $dir/temp holds the
  # intermediate lists produced by the later stages.
  mkdir -p "$dir/info" "$dir/temp" || exit 1
  temp=$dir/temp

  echo "$feat_dim" > "$dir/info/feat_dim"
  echo '0' > "$dir/info/left_context"
  # The examples have at least min_frames_per_chunk right context.
  echo "$min_frames_per_chunk" > "$dir/info/right_context"
  echo '1' > "$dir/info/frames_per_eg"
  # Work on a private copy of the utterance lengths.
  cp "$data/utt2num_frames" "$temp/utt2num_frames" || exit 1
  
  # Stage 0: split the utterances into validation / training / training-subset
  # lists and build the integer speaker labels.  Every step is checked, so a
  # failed filter or symbol lookup stops the script here instead of passing a
  # truncated list on to the later stages.
  if [ "$stage" -le 0 ]; then
    echo "$0: Preparing train and validation lists"
    # Pick a list of heldout utterances for validation egs
    awk '{print $1}' "$data/utt2spk" | utils/shuffle_list.pl \
      | head -n "$num_heldout_utts" > "$temp/valid_uttlist" || exit 1;
    # The remaining utterances are used for training egs
    utils/filter_scp.pl --exclude "$temp/valid_uttlist" "$temp/utt2num_frames" \
      > "$temp/utt2num_frames.train" || exit 1;
    utils/filter_scp.pl "$temp/valid_uttlist" "$temp/utt2num_frames" \
      > "$temp/utt2num_frames.valid" || exit 1;
    # Pick a subset of the training list for diagnostics
    awk '{print $1}' "$temp/utt2num_frames.train" | utils/shuffle_list.pl \
      | head -n "$num_heldout_utts" > "$temp/train_subset_uttlist" || exit 1;
    utils/filter_scp.pl "$temp/train_subset_uttlist" <"$temp/utt2num_frames.train" \
      > "$temp/utt2num_frames.train_subset" || exit 1;
    # Number the speakers from zero in spk2utt order (spk2int), then replace
    # the speaker field of utt2spk with that integer (utt2int).
    awk -v id=0 '{print $1, id++}' "$data/spk2utt" > "$temp/spk2int" || exit 1;
    utils/sym2int.pl -f 2 "$temp/spk2int" "$data/utt2spk" > "$temp/utt2int" || exit 1;
    # Restrict the utterance-to-label map to each of the three lists.
    utils/filter_scp.pl "$temp/utt2num_frames.train" "$temp/utt2int" \
      > "$temp/utt2int.train" || exit 1;
    utils/filter_scp.pl "$temp/utt2num_frames.valid" "$temp/utt2int" \
      > "$temp/utt2int.valid" || exit 1;
    utils/filter_scp.pl "$temp/utt2num_frames.train_subset" "$temp/utt2int" \
      > "$temp/utt2int.train_subset" || exit 1;
  fi
  
  # Number of distinct integer speaker labels in utt2int; passed to
  # nnet3-xvector-get-egs as --num-pdfs.
  num_pdfs=$(awk '{print $2}' "$temp/utt2int" | sort -u | wc -l)
  # The script assumes you've prepared the features ahead of time.
  # "JOB" in $feats is replaced by the job index when the training egs are
  # dumped; the diagnostic sets always use ranges file 1.
  feats="scp,s,cs:utils/filter_scp.pl $temp/ranges.JOB $data/feats.scp |"
  train_subset_feats="scp,s,cs:utils/filter_scp.pl $temp/train_subset_ranges.1 $data/feats.scp |"
  valid_feats="scp,s,cs:utils/filter_scp.pl $temp/valid_ranges.1 $data/feats.scp |"

  # first for the training data... work out how many archives.
  # "n+0" makes awk print 0 rather than an empty string for an empty list.
  num_train_frames=$(awk '{n += $2} END{print n+0}' <"$temp/utt2num_frames.train")
  num_train_subset_frames=$(awk '{n += $2} END{print n+0}' <"$temp/utt2num_frames.train_subset")

  echo "$num_train_frames" >"$dir/info/num_frames"
  # Integer division, plus one so that there is always at least one archive.
  # $(( )) replaces the deprecated $[ ] arithmetic form.
  num_train_archives=$(( (num_train_frames * num_repeats) / frames_per_iter + 1 ))
  echo "$0: Producing $num_train_archives archives for training"
  echo "$num_train_archives" > "$dir/info/num_archives"
  echo "$num_diagnostic_archives" > "$dir/info/num_diagnostic_archives"

  # Never run more jobs than there are archives to produce.
  if [ "$nj" -gt "$num_train_archives" ]; then
    echo "$0: Reducing num-jobs $nj to number of training archives $num_train_archives"
    nj=$num_train_archives
  fi
  
  # Stage 1: when $dir/storage exists, create the soft links for the final and
  # the temporary training archives (see utils/create_split_dir.pl).
  if [ $stage -le 1 ] && [ -e $dir/storage ]; then
    echo "$0: creating data links"
    for stem in egs egs_temp; do
      # One argument per training archive: $dir/<stem>.1.ark, $dir/<stem>.2.ark, ...
      link_targets=
      for idx in $(seq $num_train_archives); do
        link_targets="$link_targets $dir/$stem.$idx.ark"
      done
      utils/create_data_link.pl $link_targets
    done
  fi
  
  # Stage 2: run allocate_egs.py once for the training data and once for each
  # of the two diagnostic sets (training subset and validation).
  if [ $stage -le 2 ]; then
    allocate=sid/nnet3/xvector/allocate_egs.py
    # Chunk-length options shared by all three invocations.
    chunk_opts="--min-frames-per-chunk=$min_frames_per_chunk --max-frames-per-chunk=$max_frames_per_chunk"

    echo "$0: Allocating training examples"
    $cmd $dir/log/allocate_examples_train.log \
      $allocate \
        --num-repeats=$num_repeats \
        $chunk_opts \
        --frames-per-iter=$frames_per_iter \
        --num-archives=$num_train_archives --num-jobs=$nj \
        --utt2len-filename=$temp/utt2num_frames.train \
        --utt2int-filename=$temp/utt2int.train --egs-dir=$dir  || exit 1

    # The diagnostic sets differ only in their name: both run as a single job
    # with one repeat, --randomize-chunk-length false and the diagnostic
    # archive size and count.
    for subset in train_subset valid; do
      case $subset in
        train_subset) description="training subset" ;;
        valid) description="validation" ;;
      esac
      echo "$0: Allocating $description examples"
      $cmd $dir/log/allocate_examples_$subset.log \
        $allocate \
          --prefix $subset \
          --num-repeats=1 \
          $chunk_opts \
          --randomize-chunk-length false \
          --frames-per-iter=$frames_per_iter_diagnostic \
          --num-archives=$num_diagnostic_archives --num-jobs=1 \
          --utt2len-filename=$temp/utt2num_frames.$subset \
          --utt2int-filename=$temp/utt2int.$subset --egs-dir=$dir  || exit 1
    done
  fi
  
  # By now the ranges files exist; they define how many egs there are and
  # where they come from.  On a first run you may want to put an "exit 1" here
  # and inspect $dir/temp/ranges.* before going on to the next stage.
  if [ $stage -le 3 ]; then
    # Prints every whitespace-separated field of the given file(s) as
    # "ark:<field> ", all on one line.
    ark_wspecifiers() {
      awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' "$@"
    }

    echo "$0: Generating training examples on disk"
    rm $dir/.error 2>/dev/null
    # One background job per ranges file; any failure leaves a $dir/.error
    # marker that is checked once everything has finished.
    for job in $(seq $nj); do
      outputs=$(ark_wspecifiers $temp/outputs.$job)
      $cmd $dir/log/train_create_examples.$job.log \
        nnet3-xvector-get-egs --compress=$compress --num-pdfs=$num_pdfs $temp/ranges.$job \
        "${feats//JOB/$job}" $outputs || touch $dir/.error &
    done

    # The training subset is dumped alongside the training jobs.
    train_subset_outputs=$(ark_wspecifiers $temp/train_subset_outputs.1)
    echo "$0: Generating training subset examples on disk"
    $cmd $dir/log/train_subset_create_examples.1.log \
      nnet3-xvector-get-egs --compress=$compress --num-pdfs=$num_pdfs $temp/train_subset_ranges.1 \
      "$train_subset_feats" $train_subset_outputs || touch $dir/.error &
    wait

    # The validation set is dumped after all of the above have finished.
    valid_outputs=$(ark_wspecifiers $temp/valid_outputs.1)
    echo "$0: Generating validation examples on disk"
    $cmd $dir/log/valid_create_examples.1.log \
      nnet3-xvector-get-egs --compress=$compress --num-pdfs=$num_pdfs $temp/valid_ranges.1 \
      "$valid_feats" $valid_outputs || touch $dir/.error &
    wait

    if [ -f $dir/.error ]; then
      echo "$0: Problem detected while dumping examples"
      exit 1
    fi
  fi
  
  # Stage 4: shuffle every temporary archive into its final ark/scp pair.
  if [ $stage -le 4 ]; then
    # shuffle_archives <num-archives> <log-stem> <input-stem> <output-stem>
    # Runs nnet3-shuffle-egs (seeded with the archive index) over
    # $dir/<input-stem>.JOB.ark for JOB=1..<num-archives> and writes
    # $dir/<output-stem>.JOB.{ark,scp}.
    shuffle_archives() {
      $cmd --max-jobs-run $nj JOB=1:$1 $dir/log/$2.JOB.log \
        nnet3-shuffle-egs --srand=JOB ark:$dir/$3.JOB.ark \
        ark,scp:$dir/$4.JOB.ark,$dir/$4.JOB.scp
    }

    echo "$0: Shuffling order of archives on disk"
    shuffle_archives $num_train_archives shuffle egs_temp egs || exit 1;
    shuffle_archives $num_diagnostic_archives train_subset_shuffle \
      train_subset_egs_temp train_diagnostic_egs || exit 1;
    shuffle_archives $num_diagnostic_archives valid_shuffle \
      valid_egs_temp valid_egs || exit 1;
  fi
  
  # Stage 5: delete the temporary archives and assemble the diagnostic scp
  # files.
  if [ $stage -le 5 ]; then
    # List the temporary archives: training subset, validation, then training.
    stale_archives=
    for idx in $(seq $num_diagnostic_archives); do
      stale_archives="$stale_archives $dir/train_subset_egs_temp.$idx.ark"
    done
    for idx in $(seq $num_diagnostic_archives); do
      stale_archives="$stale_archives $dir/valid_egs_temp.$idx.ark"
    done
    for idx in $(seq $num_train_archives); do
      stale_archives="$stale_archives $dir/egs_temp.$idx.ark"
    done

    for file in $stale_archives; do
      # For a soft link, remove the file it points to as well as the link.
      if [ -L $file ]; then
        rm $(readlink -f $file)
      fi
      rm $file
    done

    # Rebuild the combined scp files from scratch out of the per-archive ones.
    rm -rf $dir/valid_diagnostic.scp $dir/train_diagnostic.scp
    for idx in $(seq $num_diagnostic_archives); do
      cat $dir/train_diagnostic_egs.$idx.scp >> $dir/train_diagnostic.scp
      cat $dir/valid_egs.$idx.scp >> $dir/valid_diagnostic.scp
    done
    # combine.scp is a soft link to the training diagnostic list.
    ln -sf train_diagnostic.scp $dir/combine.scp
  fi

  echo "$0: Finished preparing training examples"