subsegment_data_dir.sh 9.62 KB
#!/bin/bash

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0


# This script allows you to specify a 'segments' file with segments
# relative to existing utterances, with lines like
#  utterance_foo-1 utterance_foo 7.5 8.2
#  utterance_foo-2 utterance_foo 8.9 10.1
# and a 'text' file with sub-segmented text like
#  utterance_foo-1 hello there
#  utterance_foo-2 how are you
# and combine this with an existing data-dir that was all relative
# to the original utterance-ids like 'utterance_foo', producing
# a new subsegmented output directory.
#
# It does the right thing for you on the various files that the
# data directory contained (except you have to recreate
# the CMVN stats).


segment_end_padding=0.0
cmd=run.pl
nj=1

. utils/parse_options.sh

if [ $# != 4 ] && [ $# != 3 ]; then
  echo "Usage: "
  echo "  $0 [options] <srcdir> <subsegments-file> [<text-file>] <destdir>"
  echo "This script sub-segments a data directory.  <subsegments-file> is to"
  echo "have lines of the form <new-utt> <old-utt> <start-time-within-old-utt> <end-time-within-old-utt>"
  echo "and <text-file> is of the form <new-utt> <word1> <word2> ... <wordN>."
  echo "This script appropriately combines the <subsegments-file> with the original"
  echo "segments file, if necessary, and if not, creates a segments file."
  echo "e.g.:"
  echo " $0 data/train [options] exp/tri3b_resegment/segments exp/tri3b_resegment/text data/train_resegmented"
  echo " Options:"
  echo "  --segment-end-padding <padding-time>       # e.g. 0.02.  Default 0.0.  If provided,"
  echo "                                             # we will add this value to the end times of <destdir>/segments"
  echo "                                             # when creating it.  This can be useful to account for"
  echo "                                             # end effects in feature generation.  The reason this is"
  echo "                                             # not just applied to the input segments file, is that"
  echo "                                             # for purposes of computing the num-frames of the parts of"
  echo "                                             # matrices in feats.scp, the padding should not be done."
  echo "  See also: resolve_ctm_overlaps.py"
  exit 1;
fi


export LC_ALL=C

srcdir=$1
subsegments=$2

add_subsegment_text=false
if [ $# -eq 4 ]; then
  new_text=$3
  dir=$4
  add_subsegment_text=true

  if [ ! -f "$new_text" ]; then
    echo "$0: no such file $new_text"
    exit 1
  fi

else
  dir=$3
fi

for f in "$subsegments" "$srcdir/utt2spk"; do
  if [ ! -f "$f" ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

if ! mkdir -p $dir; then
  echo "$0: failed to create directory $dir"
fi

if $add_subsegment_text; then
  if ! cmp <(awk '{print $1}' <$subsegments)  <(awk '{print $1}' <$new_text); then
    echo "$0: expected the first fields of the files $subsegments and $new_text to be identical"
    exit 1
  fi
fi

# create the utt2spk in $dir
if ! awk '{if (NF != 4 || !($4 > $3)) { print("Bad line: " $0); exit(1) } }' <$subsegments; then
  echo "$0: failed checking subsegments file $subsegments"
  exit 1
fi

set -e
set -o pipefail

# Create a mapping from the new to old utterances.  This file will be deleted later.
awk '{print $1, $2}' < $subsegments > $dir/new2old_utt

# Create the new utt2spk file [just map from the second field
utils/apply_map.pl -f 2 $srcdir/utt2spk < $dir/new2old_utt >$dir/utt2spk
# .. and the new spk2utt file.
utils/utt2spk_to_spk2utt.pl  <$dir/utt2spk >$dir/spk2utt

if $add_subsegment_text; then
  # the new text file is just what the user provides.
  cp $new_text $dir/text
fi

# copy the source wav.scp
cp $srcdir/wav.scp $dir
if [ -f $srcdir/reco2file_and_channel ]; then
  cp $srcdir/reco2file_and_channel $dir
fi

# copy the source reco2dur
if [ -f $srcdir/reco2dur ]; then
  cp $srcdir/reco2dur $dir
fi

if [ -f $srcdir/segments ]; then
  # we have to map the segments file.
  # What's going on below is a little subtle.
  # $srcdir/segments has lines like: <old-utt-id> <recording-id> <start-time> <end-time>
  # and $subsegments has lines like: <new-utt-id> <old-utt-id> <start-time> <end-time>
  # The apply-map command replaces <old-utt-id> [the 2nd field of $subsegments]
  # with <recording-id> <start-time> <end-time>.
  # so after that first command we have lines like
  # <new-utt-id> <recording-id> <start-time-of-old-utt-within-recording> <end-time-old-utt-within-recording> \
  #   <start-time-of-new-utt-within-old-utt> <end-time-of-new-utt-within-old-utt>
  # which the awk command turns into:
  # <new-utt-id> <recording-id> <start-time-of-new-utt-within-recording> <end-time-of-new-utt-within-recording>
  utils/apply_map.pl -f 2 $srcdir/segments <$subsegments | \
    awk -v pad=$segment_end_padding '{ print $1, $2, $5+$3, $6+$3+pad; }' >$dir/segments
else
  # the subsegments file just becomes the segments file.
  awk -v pad=$segment_end_padding '{$4 += pad; print}' <$subsegments >$dir/segments
fi

if [ -f $srcdir/utt2uniq ]; then
  utils/apply_map.pl -f 2 $srcdir/utt2uniq <$dir/new2old_utt >$dir/utt2uniq
fi

if [ -f $srcdir/feats.scp ]; then
  # We want to avoid recomputing the features.   We'll use sub-matrices of the
  # original feature matrices, using the [] notation that is available for
  # matrices in Kaldi.
  if [ ! -s $srcdir/frame_shift ]; then
    frame_shift=$(utils/data/get_frame_shift.sh $srcdir) || exit 1
  else
    frame_shift=$(cat $srcdir/frame_shift)
  fi
  echo "$0: note: frame shift is $frame_shift [affects feats.scp]"

  # The subsegments format is <new-utt-id> <old-utt-id> <start-time> <end-time>.
  # e.g. 'utt_foo-1 utt_foo 7.21 8.93'
  # The first awk command replaces this with the format:
  # <new-utt-id> <old-utt-id> <first-frame> <last-frame>
  # e.g. 'utt_foo-1 utt_foo 721 893'
  # and the apply_map.pl command replaces 'utt_foo' (the 2nd field) with its corresponding entry
  # from the original wav.scp, so we get a line like:
  # e.g. 'utt_foo-1 foo-bar.ark:514231 721 892'
  # Note: the reason we subtract one from the last time is that it's going to
  # represent the 'last' frame, not the 'end' frame [i.e. not one past the last],
  # in the matlab-like, but zero-indexed [first:last] notion.  For instance, a segment with 1 frame
  # would have start-time 0.00 and end-time 0.01, which would become the frame range
  # [0:0]
  # The second awk command turns this into something like
  # utt_foo-1 foo-bar.ark:514231[721:892]
  # It has to be a bit careful because the format actually allows for more general things
  # like pipes that might contain spaces, so it has to be able to produce output like the
  # following:
  # utt_foo-1 some command|[721:892]
  # The 'end' frame is ensured to not exceed the feature archive size of
  # <old-utt-id>. This is done using the script fix_subsegment_feats.pl.
  # e.g if the number of frames in foo-bar.ark is 891, then the features are
  # truncated to that many frames.
  # utt_foo-1 foo-bar.ark:514231[721:890]
  # Lastly, utils/data/normalize_data_range.pl will only do something nontrivial if
  # the original data-dir already had data-ranges in square brackets.

  # Here, we computes the maximum 'end' frame allowed for each <new-utt-id>.
  # This is equal to the number of frames in the feature archive for <old-utt-id>.
  if [ ! -f $srcdir/utt2num_frames ]; then
    echo "$0: WARNING: Could not find $srcdir/utt2num_frames. It might take a long time to run get_utt2num_frames.sh."
    echo "Increase the number of jobs or write this file while extracting features by passing --write-utt2num-frames true to steps/make_mfcc.sh etc."
  fi
  utils/data/get_utt2num_frames.sh --cmd "$cmd" --nj $nj $srcdir
  awk '{print $1" "$2}' $subsegments | \
    utils/apply_map.pl -f 2 $srcdir/utt2num_frames > \
    $dir/utt2max_frames

  awk -v s=$frame_shift '{print $1, $2, int(($3/s)+0.5), int(($4/s)-0.5);}' <$subsegments| \
    utils/apply_map.pl -f 2 $srcdir/feats.scp | \
    awk '{p=NF-1; for (n=1;n<NF-2;n++) printf("%s ", $n); k=NF-2; l=NF-1; printf("%s[%d:%d]\n", $k, $l, $NF)}' | \
    utils/data/fix_subsegment_feats.pl $dir/utt2max_frames | \
    utils/data/normalize_data_range.pl >$dir/feats.scp || { echo "Failed to create $dir/feats.scp" && exit; }

  # Parse the frame ranges from feats.scp, which is in the form of [first-frame:last-frame]
  # and write the number-of-frames = last-frame - first-frame + 1 for the utterance.
  cat $dir/feats.scp | perl -ne 'm/^(\S+) .+\[(\d+):(\d+)\]$/; print "$1 " . ($3-$2+1) . "\n"' > \
    $dir/utt2num_frames

  # Here we add frame ranges to the elements of vad.scp, as we did for rows of feats.scp above.
  if [ -f $srcdir/vad.scp ]; then
    cat $subsegments | awk -v s=$frame_shift '{print $1, $2, int(($3/s)+0.5), int(($4/s)-0.5);}' | \
      utils/apply_map.pl -f 2 $srcdir/vad.scp | \
      awk '{p=NF-1; for (n=1;n<NF-2;n++) printf("%s ", $n); k=NF-2; l=NF-1; printf("%s[%d:%d]\n", $k, $l, $NF)}' | \
      utils/data/fix_subsegment_feats.pl $dir/utt2max_frames | \
      utils/data/normalize_data_range.pl >$dir/vad.scp
  fi
fi


if [ -f $dir/cmvn.scp ]; then
  rm $dir/cmvn.scp
  echo "$0: warning: removing $dir/cmvn.scp, you will have to regenerate it from the features."
fi

# remove the utt2dur file in case it's now invalid-- it be regenerated from the segments file.
rm $dir/utt2dur 2>/dev/null || true

if [ -f $srcdir/spk2gender ]; then
  cp $srcdir/spk2gender $dir
fi
if [ -f $srcdir/glm ]; then
  cp $srcdir/glm $dir
fi
if [ -f $srcdir/stm ]; then
  cp $srcdir/stm $dir
fi

for f in ctm; do
  if [ -f $srcdir/$f ]; then
    echo "$0: not copying $srcdir/$f to $dir because sub-segmenting it is "
    echo " ... not implemented yet (and probably it's not needed.)"
  fi
done

rm $dir/new2old_utt

echo "$0: subsegmented data from $srcdir to $dir"