#!/bin/bash

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script copies and modifies a data directory while combining
# segments whose duration is lower than a specified minimum segment
# length.
#
# Note: this does not work for the wav.scp, since there is no natural way to
# concatenate segments; you have to operate on directories that already have
# features extracted.
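
# For example (hypothetical utterances): with a minimum segment length of
# 1.55 seconds, two short segments utt1 (0.8s) and utt2 (0.9s) from the same
# speaker could be combined into a single new utterance 'utt1-comb2' of
# duration 1.7s, while segments already longer than the minimum are kept
# as they are.  The actual grouping is decided by choose_utts_to_combine.py
# below.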
  
  
# begin configuration section
cleanup=true
# end configuration section

. utils/parse_options.sh

if [ $# != 3 ]; then
  echo "Usage:"
  echo "  $0 [options] <srcdir> <min-segment-length-in-seconds> <dir>"
  echo "e.g.:"
  echo " $0 data/train 1.55 data/train_comb"
  echo "Options:"
  echo "  --cleanup <true|false>   # if true (the default), remove intermediate files."
  exit 1;
fi
  
  
export LC_ALL=C

srcdir=$1
min_seg_len=$2
dir=$3

if [ "$dir" == "$srcdir" ]; then
  echo "$0: this script requires <srcdir> and <dir> to be different."
  exit 1
fi

for f in $srcdir/utt2spk $srcdir/feats.scp; do
  [ ! -s $f ] && echo "$0: expected file $f to exist and be nonempty" && exit 1
done

if ! awk '{if (NF != 2) exit(1);}' <$srcdir/feats.scp; then
  echo "$0: could not combine short segments because $srcdir/feats.scp has"
  echo "  entries with too many fields"
  exit 1
fi
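
# (a normal feats.scp entry has exactly two fields, e.g. 'utt1 foo.ark:4315';
# extra fields generally mean the features are produced by a piped command,
# which this script cannot concatenate.)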
  
if ! mkdir -p $dir; then
  echo "$0: could not create directory $dir"
  exit 1;
fi

if ! utils/validate_data_dir.sh $srcdir; then
  echo "$0: failed to validate input directory $srcdir.  If needed, run utils/fix_data_dir.sh $srcdir"
  exit 1
fi

if ! python -c "x=float('$min_seg_len'); assert(x>0.0 and x<100.0);" 2>/dev/null; then
  echo "$0: bad <min-segment-length-in-seconds>: got '$min_seg_len'"
  exit 1
fi
  
set -e
set -o pipefail

# make sure $srcdir/utt2dur exists.
utils/data/get_utt2dur.sh $srcdir

utils/data/internal/choose_utts_to_combine.py --min-duration=$min_seg_len \
  $srcdir/spk2utt $srcdir/utt2dur $dir/utt2utts $dir/utt2spk $dir/utt2dur
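
# choose_utts_to_combine.py writes one line per new (combined) utterance.
# For instance (hypothetical names), a utt2utts line 'utt1-comb2 utt1 utt2'
# maps the new utterance to the original utterances it was built from, while
# utt2spk and utt2dur give the new utterance's speaker and total duration.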
  
utils/utt2spk_to_spk2utt.pl < $dir/utt2spk > $dir/spk2utt
  
# create the feats.scp.
# if a line of utt2utts is like 'utt2-comb2 utt2 utt3', then
# utils/apply_map.pl will create a line that looks like
# 'utt2-comb2 foo.ark:4315 foo.ark:431423'
# and the awk command turns that into a suitable command line like:
# 'utt2-comb2 concat-feats --print-args=false foo.ark:4315 foo.ark:431423 - |'
utils/apply_map.pl -f 2- $srcdir/feats.scp <$dir/utt2utts | \
  awk '{if (NF<=2){print;} else { $1 = $1 " concat-feats --print-args=false"; $NF = $NF " - |"; print; }}' > $dir/feats.scp
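# (concat-feats is a Kaldi binary that appends feature matrices in time, so
# the combined utterance's features are just the original utterances' features,
# one after the other.)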
  
# create $dir/text by concatenating the source 'text' entries for the original
# utts.
utils/apply_map.pl -f 2- $srcdir/text <$dir/utt2utts > $dir/text
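
# For instance (hypothetical entries): if utt2utts contains
# 'utt2-comb2 utt2 utt3', and the source text has 'utt2 HELLO THERE' and
# 'utt3 GOODBYE', the resulting line is 'utt2-comb2 HELLO THERE GOODBYE'.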
  
if [ -f $srcdir/utt2uniq ]; then
  # the utt2uniq file is such that if 2 utts were derived from the same original
  # utt (e.g. by speed perturbing) they map to the same 'uniq' value.  This is
  # so that we can properly hold out validation data for neural net training and
  # know that we're not training on perturbed versions of that utterance.  We
  # need to obtain the utt2uniq file so that if any 2 'new' utts contain any of
  # the same 'old' utts, their 'uniq' values are the same [but otherwise, as far
  # as possible, the 'uniq' values are different].
  #
  # we'll do this by arranging the old 'uniq' values into groups as necessary to
  # capture this property.

  # The following command creates 'uniq_sets', each line of which contains
  # a set of original 'uniq' values, and effectively we assert that they must
  # be grouped together to the same 'uniq' value.
  # the awk command prints the group of original utterance-ids that are
  # combined together into a single new utterance, and the apply_map command
  # converts those into a list of original 'uniq' values.
  awk '{$1 = ""; print;}' < $dir/utt2utts | \
    utils/apply_map.pl $srcdir/utt2uniq > $dir/uniq_sets
  
  # The next command creates $dir/uniq_to_orig_uniq, which is a map from the
  # original 'uniq' values to the 'merged' uniq values.
  # for example, if $dir/uniq_sets were to contain
  # a b
  # b c
  # d
  # then we'd obtain a uniq_to_orig_uniq file that looks like:
  # a a
  # b a
  # c a
  # d d
  # ... because a and b appear together, and b and c appear together,
  # they have to be merged into the same set, and we name that set 'a'
  # (in general, we take the lowest string in lexicographical order).
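
  # (This is effectively a connected-components computation: the inline python
  # script below repeatedly propagates the smallest 'uniq' value across each
  # pair until nothing changes.  A union-find structure would be asymptotically
  # faster, but this simple fixed-point iteration is adequate for typical
  # data-directory sizes.)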
  
  cat $dir/uniq_sets | LC_ALL=C python -c '
import sys
uniq2orig_uniq = dict()
equal_pairs = set()  # set of 2-tuples (a,b) which should have equal orig_uniq
while True:
    line = sys.stdin.readline()
    if line == "": break
    split_line = line.split() # list of uniq strings that should map to the same set
    # initialize uniq2orig_uniq to the identity mapping
    for uniq in split_line: uniq2orig_uniq[uniq] = uniq
    for a in split_line[1:]: equal_pairs.add((split_line[0], a))

changed = True
while changed:
    changed = False
    for a,b in equal_pairs:
        min_orig_uniq = min(uniq2orig_uniq[a], uniq2orig_uniq[b])
        for x in [a,b]:
            if uniq2orig_uniq[x] != min_orig_uniq:
                uniq2orig_uniq[x] = min_orig_uniq
                changed = True

for uniq in sorted(uniq2orig_uniq.keys()):
    print("{0} {1}".format(uniq, uniq2orig_uniq[uniq]))
' > $dir/uniq_to_orig_uniq
  rm $dir/uniq_sets
  
  
  # In the following command, suppose we have a line like:
  # utt1-comb2 utt1 utt2
  # .. the awk command retains only the first original utt, to give
  # utt1-comb2 utt1
  # [we can pick one arbitrarily since we know any of them would map to the same
  # orig_uniq value.]
  # the first apply_map.pl command maps 'utt1' to the 'uniq' value it mapped to
  # in $srcdir, and the second apply_map.pl command maps that to the grouped
  # 'uniq' value obtained by the inline python script above.
  awk '{print $1, $2}' < $dir/utt2utts | utils/apply_map.pl -f 2 $srcdir/utt2uniq | \
    utils/apply_map.pl -f 2 $dir/uniq_to_orig_uniq > $dir/utt2uniq
  rm $dir/uniq_to_orig_uniq
fi
  
# note: the user will have to recompute the cmvn, as the speakers may have changed.
rm $dir/cmvn.scp 2>/dev/null || true
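# (e.g. with steps/compute_cmvn_stats.sh, which regenerates cmvn.scp for the
# new directory.)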
  
utils/validate_data_dir.sh --no-wav $dir
  
if $cleanup; then
  rm $dir/utt2utts
fi