Yannick Estève / ONTRAC-Kaldi

Blame view

egs/lre07/v1/local/split_long_utts.sh 3.5 KB
  #!/bin/bash
  
  # Splits the long utterances of a data directory into shorter pieces and
  # writes the result, described by a segments file, to a new data directory.
  #
  # Usage: split_long_utts.sh [options] <in-data-dir> <out-data-dir>
  #
  # The defaults below can be overridden on the command line (for example
  # --max-utt-len 120); the option handling is done by utils/parse_options.sh.
  max_utt_len=60 # 60 seconds.
  stage=0        # 0: compute utterance lengths; 1: write segments/utt2spk/utt2lang.
  cleanup=true   # if true, <out-data-dir>/temp is removed at the end.
  
  # Stop if the option parser cannot be sourced: otherwise the script would
  # carry on with the defaults and treat the options as positional arguments.
  . utils/parse_options.sh || exit 1;
  
  if [ $# -ne 2 ]; then
    echo "Usage: $0 [options] <in-data-dir> <out-data-dir>"
    echo "e.g.: $0 --max-utt-len 120 data/train data/train_split"
    echo "This script splits up long utterances into smaller pieces."
    echo "It assumes the wav.scp has a certain form, with .sph"
    echo "files in it (so the script is not completely general)."
    exit 1;
  fi
  
  in_dir=$1
  dir=$2
  
  # All of these files are read (or validated) later on, so fail early with a
  # clear message when one of them is missing.
  for f in "$in_dir"/{utt2spk,spk2utt,wav.scp,utt2lang}; do
    if [ ! -f "$f" ]; then
      echo "$0: expected input file $f to exist";
      exit 1;
    fi
  done
  
  # Stage 0: validate the input directory and write $dir/temp/utt2len, with one
  # line "<utt-id> <length-in-seconds>" per wav.scp entry.
  if [ "$stage" -le 0 ]; then
    utils/validate_data_dir.sh --no-text --no-feats "$in_dir" || exit 1;
  
    mkdir -p "$dir/temp" || exit 1;
  
    # The length is taken from the text header of the .sph file named on each
    # wav.scp line: sample_count divided by the (required) 8000 Hz sample rate.
    perl -ane '
      # $1 is the utterance id (first field) and $2 the .sph path. The pattern
      # needs a space, anything, and another space between the two, so a line
      # consisting of just "<utt-id> <file>.sph" is rejected as a bad line.
      m:(^\S+) .* (\S+\.sph): || die "bad line $_";
      ($utt, $sph) = ($1, $2);
      open(F, "<", $sph) || die "Could not open sph file $sph";
      $samp_count = -1;
      while (<F>) {
        if (m/sample_rate -i (\d+)/) { $1 == 8000 || die "bad sample rate for $sph: $_"; }
        if (m/sample_count -i (\d+)/) { $samp_count = $1; }
        if (m/end_head/) { last; }  # stop at the end of the header.
      }
      close(F);
      $samp_count > 0 || die "Could not get sample count for sph file $sph\n";
      $secs = $samp_count / 8000;
      print "$utt $secs\n";
    ' < "$in_dir/wav.scp" > "$dir/temp/utt2len" || exit 1;
  fi
  
  # Sanity checks: utt2len must have exactly one line per line of utt2spk and
  # of utt2lang, because the files are combined line by line further down.
  if [ ! -f "$dir/temp/utt2len" ]; then
    # Happens e.g. when a later stage is requested but stage 0 was never run
    # (or its output was already cleaned up).
    echo "$0: expected file $dir/temp/utt2len to exist";
    exit 1;
  fi
  
  num_utt2len=$(wc -l < "$dir/temp/utt2len") || exit 1;
  num_utt2spk=$(wc -l < "$in_dir/utt2spk") || exit 1;
  num_utt2lang=$(wc -l < "$in_dir/utt2lang") || exit 1;
  
  if [ "$num_utt2len" -ne "$num_utt2spk" ]; then
    echo "utt2spk and utt2len files have mismatched lengths";
    exit 1;
  fi
  if [ "$num_utt2len" -ne "$num_utt2lang" ]; then
    echo "utt2len and utt2lang files have mismatched lengths";
    exit 1;
  fi
  
  
  # Stage 1: write $dir/segments, $dir/utt2spk and $dir/utt2lang.
  if [ "$stage" -le 1 ]; then
  
    # Create, in a pipe, a file with lines
    #   <utt-id> <length> <speaker-id> <language-id>
    # and pipe it into a perl script that outputs the segments, utt2spk and
    # utt2lang files.
    # NOTE: the columns are combined by line position (paste), not by
    # utterance id, so utt2len, utt2spk and utt2lang must list the same
    # utterances in the same order.
    awk '{print $2}' "$in_dir/utt2spk" | paste "$dir/temp/utt2len" - | \
      paste - <(awk '{print $2}' "$in_dir/utt2lang") | perl -e '
    ($max_utt_len, $out_dir) = @ARGV;
    $max_utt_len > 0 || die "max-utt-len must be positive, got $max_utt_len\n";
    open(UTT2SPK, ">", "$out_dir/utt2spk") || die "opening utt2spk file $out_dir/utt2spk";
    open(SEGMENTS, ">", "$out_dir/segments") || die "opening segments file $out_dir/segments";
    open(UTT2LANG, ">", "$out_dir/utt2lang") || die "opening utt2lang file $out_dir/utt2lang";
    while(<STDIN>) {
      ($utt, $len, $speaker, $language) = split(" ", $_);
      # All four fields are needed; the language is the last one, so it is
      # the one that is missing whenever the line is too short.
      (defined $speaker && defined $language) || die "Bad line $_";
      # The old utt-id becomes the recording-id (second field of segments).
      if ($len <= $max_utt_len) {
        # Short enough: a single piece covering the whole recording (written
        # with start 0 and end -1).
        print SEGMENTS "${utt}-1 ${utt} 0 -1\n";
        print UTT2SPK "${utt}-1 $speaker\n";
        print UTT2LANG "${utt}-1 $language\n";
      } else {
        # Cut into equal pieces. The number of pieces is $len / $max_utt_len
        # rounded up, except that an overshoot of less than 0.1% is rounded
        # down, so a piece may exceed $max_utt_len by at most that much.
        $num_split = int(($len + 0.999*$max_utt_len) / $max_utt_len);
        $num_split >= 1 || die;
        $split_len = $len / $num_split;
        for ($n = 1; $n <= $num_split; $n++) {
           # Zero-pad the piece number so that the new ids remain in
           # string-sorted order.
           if ($num_split >= 1000) {
             # The historical rules below would lose the sorted order here
             # (e.g. "1000" sorts before "101"), so pad to a common width.
             $n_text = sprintf("%0*d", length($num_split), $n);
           } else {
             # Historical scheme, kept so that existing ids do not change.
             # With 100..999 pieces numbers below 100 get four characters
             # (0005, 0050) and the others three (100); that still sorts
             # correctly.
             $n_text = $n;
             if ($num_split >= 10 && $n < 10) { $n_text = "0$n_text"; }
             if ($num_split >= 100 && $n < 100) { $n_text = "00$n_text"; }
           }
           $t_start = $split_len * ($n - 1); $t_end = $split_len * $n;
           print SEGMENTS "${utt}-$n_text ${utt} $t_start $t_end\n";
           print UTT2SPK "${utt}-$n_text $speaker\n";
           print UTT2LANG "${utt}-$n_text $language\n";
        }
      }
    }
    close(SEGMENTS)||die; close(UTT2SPK)||die; close(UTT2LANG)||die; ' "$max_utt_len" "$dir" || exit 1;
  fi
  
  # Complete the output directory. wav.scp is copied unchanged: its keys, the
  # old utterance ids, are the recording ids used in the segments file.
  cp "$in_dir/wav.scp" "$dir/" || exit 1;
  utils/utt2spk_to_spk2utt.pl "$dir/utt2spk" > "$dir/spk2utt" || exit 1;
  utils/validate_data_dir.sh --no-text --no-feats "$dir" || exit 1;
  
  # Remove the intermediate files (temp/utt2len) unless --cleanup false was
  # given. A failure to remove them does not change the exit status.
  if [ "$cleanup" = true ]; then
    # ${dir:?} aborts instead of running "rm -r /temp" if $dir is empty.
    rm -r -- "${dir:?}/temp"
  fi
  
  exit 0;