#!/bin/bash
# generate_uniformly_segmented_data_dir.sh

# Copyright Vijayaditya Peddinti, 2016.
# Apache 2.0.
# This script generates a uniformly segmented data dir. If the directory
# already has a segments file (e.g. data/dev_aspire), we first create a
# directory without segments and then uniformly segment it.
# It also extracts hires MFCC features.

# Abort on the first failing command and trace every command as it runs.
set -ex

# Defaults for the command-line options; they are defined before
# utils/parse_options.sh is sourced below.
stage=1
num_jobs=30
overlap=5 # size of the overlap
window=10 # size of the uniform segment

# Pull in the recipe environment: cmd.sh is required, path.sh is optional.
. ./cmd.sh
if [ -f ./path.sh ]; then
  . ./path.sh
fi
. utils/parse_options.sh || exit 1;

# Exactly two positional arguments are required; otherwise print the usage
# message (including the current option defaults) and exit with an error.
if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <data-set> <out-data-set>"
  echo " The input is read from data/<data-set> and the result is written to"
  echo " data/<out-data-set>_hires."
  echo " Options:"
  echo "    --stage (1|2|3)  # start the script from part-way through."
  echo "    --num-jobs <n>   # number of jobs for feature extraction (default: $num_jobs)."
  echo "    --window <n>     # size of the uniform segment (default: $window)."
  echo "    --overlap <n>    # size of the overlap (default: $overlap)."
  echo "e.g.:"
  echo "$0 dev_aspire dev_aspire_uniformsegmented"
  exit 1;
fi

# Both arguments are bare data-set names; the data/ prefix and the _hires
# suffix are added where the directories are used.
data_set=$1
segmented_data_set=$2

# Stage 1: create data/<data_set>_hires, the directory the later stages work on.
if [ "$data_set" == "dev_aspire" ]; then
  # dev_aspire is special-cased: it is first converted into a directory of
  # whole recordings, which is then uniformly segmented further below.
  if [ "$stage" -le 1 ]; then
    echo "$0: Creating the data dir with whole recordings without segmentation"
    # create a whole directory without the segments
    unseg_dir=data/${data_set}_whole_hires
    src_dir=data/${data_set}
    utils/data/convert_data_dir_to_whole.sh "$src_dir" "$unseg_dir"

    echo "$0: Creating the $unseg_dir/reco2file_and_channel file"
    # One line per wav.scp entry: <id> <id> A
    awk '{print $1, $1, "A";}' "$unseg_dir/wav.scp" \
      > "$unseg_dir/reco2file_and_channel"
  fi
  # Renamed outside the stage check so that the later stages refer to
  # data/dev_aspire_whole_hires even when the script is resumed with --stage > 1.
  data_set=${data_set}_whole
elif [ "$stage" -le 1 ]; then
  # Gated on the stage, like the branch above, so that resuming from a later
  # stage does not copy the source directory over an existing
  # data/<data_set>_hires (which would presumably overwrite the files produced
  # by the feature-extraction stage).
  utils/copy_data_dir.sh "data/$data_set" "data/${data_set}_hires"
fi

# Stage 2: extract features for data/<data_set>_hires using
# conf/mfcc_hires.conf, compute CMVN stats, then fix and validate the directory.
if [ "$stage" -le 2 ]; then
  echo "$0: Extracting features"
  feat_dir=data/${data_set}_hires

  steps/make_mfcc.sh --cmd "$train_cmd" --nj "$num_jobs" \
    --mfcc-config conf/mfcc_hires.conf "$feat_dir"

  steps/compute_cmvn_stats.sh "$feat_dir"

  utils/fix_data_dir.sh "$feat_dir"
  # --no-text: the directory is validated without requiring a text file.
  utils/validate_data_dir.sh --no-text "$feat_dir"
fi

# Stage 3: split data/<data_set>_hires into uniform, overlapping sub-segments
# and write the resulting data directory to data/<segmented_data_set>_hires.
if [ "$stage" -le 3 ]; then
  echo "$0: Generating uniform segments with length $window and overlap $overlap."
  in_dir=data/${data_set}_hires
  out_dir=data/${segmented_data_set}_hires

  # The output directory is wiped below, so it must not be the input directory.
  if [ "$in_dir" == "$out_dir" ]; then
    echo "$0: output directory $out_dir is the same as the input directory" >&2
    exit 1
  fi

  # Start from a clean output directory.
  if [ -d "$out_dir" ]; then
    rm -r "$out_dir"
  fi

  if [ ! -f "$in_dir/segments" ]; then
    # The redirection creates the file before the helper runs, so remove it if
    # the helper fails; otherwise a rerun would find an empty or partial
    # segments file and skip regenerating it.
    if ! utils/data/get_segments_for_data.sh "$in_dir" > "$in_dir/segments"; then
      rm -f "$in_dir/segments"
      echo "$0: failed to create $in_dir/segments" >&2
      exit 1
    fi
  fi

  mkdir -p "$out_dir"

  # --max-remaining-duration is set to half of the window size.
  utils/data/get_uniform_subsegments.py \
    --max-segment-duration="$window" \
    --overlap-duration="$overlap" \
    --max-remaining-duration="$(perl -e "print $window/ 2.0")" \
    "$in_dir/segments" > "$out_dir/sub_segments"

  utils/data/subsegment_data_dir.sh "$in_dir" \
    "$out_dir/sub_segments" "$out_dir"
  steps/compute_cmvn_stats.sh "$out_dir"

  utils/fix_data_dir.sh "$out_dir"
  # --no-text: the directory is validated without requiring a text file.
  utils/validate_data_dir.sh --no-text "$out_dir"
fi