Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/steps/cleanup/make_segmentation_data_dir.sh 6.28 KB
  #!/bin/bash
  
  # Copyright 2014  Guoguo Chen
  # Apache 2.0
  
  # Begin configuration section.
  # Defaults for the command-line options listed in the usage text; each
  # variable corresponds to the option of the same name with "-" for "_"
  # (presumably overridden by parse_options.sh below -- confirm against that
  # script).
  max_seg_length=10       # --max-seg-length: maximum length of new segments
  min_seg_length=2        # --min-seg-length: minimum length of new segments
  min_sil_length=0.5      # --min-sil-length: minimum silence usable as split point
  time_precision=0.05     # --time-precision: tolerance for treating two times as equal
  special_symbol="<***>"  # --special-symbol: aligned with inserted or deleted words
  separator=";"           # --separator: separator for aligned pairs
  wer_cutoff=-1           # --wer-cutoff: -1 means no segment is ignored
  # End configuration section.
  
  # Abort as soon as a command fails.
  set -e
  
  # Log the full command line.
  echo "$0 $@"
  
  # path.sh is optional; it is sourced only when present in the current directory.
  [ -f ./path.sh ] && . ./path.sh
  # Option parsing; the defaults above must already be assigned at this point.
  . parse_options.sh || exit 1;
  
  # Exactly three positional arguments are required. Anything else prints the
  # help text on stdout and terminates with status 1.
  if [ "$#" -ne 3 ]; then
    # A single printf emits the help text, one argument per output line.
    printf '%s\n' \
      "This script takes the ctm file that corresponds to the data directory" \
      "created by steps/cleanup/split_long_utterance.sh, works out a new" \
      "segmentation and creates a new data directory for the new segmentation." \
      "" \
      "Usage: $0 [options] <ctm-file> <old-data-dir> <new-data-dir>" \
      " e.g.: $0 train_si284_split.ctm \\" \
      "                          data/train_si284_split data/train_si284_reseg" \
      "Options:" \
      "    --wer-cutoff            # ignore segments with WER higher than the" \
      "                            # specified value. -1 means no segment will" \
      "                            # be ignored." \
      "    --max-seg-length        # maximum length of new segments" \
      "    --min-seg-length        # minimum length of new segments" \
      "    --min-sil-length        # minimum length of silence as split point" \
      "    --time-precision        # precision for determining \"same time\"" \
      "    --special-symbol        # special symbol to be aligned with" \
      "                            # inserted or deleted words" \
      "    --separator             # separator for aligned pairs"
    exit 1
  fi
  
  # Positional arguments.
  ctm=$1            # ctm file for the old (split) data directory
  old_data_dir=$2   # existing data directory the ctm corresponds to
  new_data_dir=$3   # data directory to create for the new segmentation

  # Every input read further down must exist before any output is written.
  for f in "$ctm" "$old_data_dir/text.orig" "$old_data_dir/utt2spk" \
    "$old_data_dir/wav.scp" "$old_data_dir/segments"; do
    if [ ! -f "$f" ]; then
      echo "$0: expected $f to exist"
      exit 1;
    fi
  done

  # Intermediate files go to <new-data-dir>/tmp; wav.scp is reused unchanged.
  mkdir -p "$new_data_dir/tmp"
  cp -f "$old_data_dir/wav.scp" "$new_data_dir"
  # spk2gender is optional and is copied only when the old directory has one.
  # (The test used to look at the literal path "old_data_dir/spk2gender", so
  # the file was never carried over.)
  if [ -f "$old_data_dir/spk2gender" ]; then
    cp -f "$old_data_dir/spk2gender" "$new_data_dir"
  fi
  
  # Removes the overlapping regions from the ctm. The usage text names
  # steps/cleanup/split_long_utterance.sh as the script that created
  # <old-data-dir>; that segmentation has overlapping regions, so words near a
  # segment boundary can appear twice in the ctm.
  #
  # Note that for each audio file, we expect its segments have been sorted in time
  # ascending order (if we ignore the overlap).
  #
  # Input : $ctm; every line needs at least 5 fields, used below as
  #         <recording> <channel> <start> <duration> <word>.
  # Output: $new_data_dir/tmp/ctm with the same 5 fields per line.
  perl -e '
    # ARGV[0]: tolerance used when deciding whether two entries have the same
    # start time and the same duration.
    $precision = $ARGV[0];
    @ctm = ();            # entries kept so far for the current recording
    %processed_ids = ();  # recording ids already handled (detects unsorted input)
    $previous_id = "";
    while (<STDIN>) {
      chomp;
      my @current = split;
      @current >= 5 || die "Error: bad line $_\n";
      $id = join("_", ($current[0], $current[1]));

      # Start of a new audio file.
      if ($previous_id ne $id) {
        # Prints existing information.
        foreach $line (@ctm) {
          print "$line->[0] $line->[1] $line->[2] $line->[3] $line->[4]\n";
        }

        # Checks if the ctm file is sorted.
        if (defined($processed_ids{$id})) {
          die "Error: \"$current[0] $current[1]\" has already been processed\n";
        } else {
          $processed_ids{$id} = 1;
        }

        @ctm = ();
        push(@ctm, \@current);
        $previous_id = $id;
        next;
      }

      # @ctm is never empty here: it always holds at least the first entry of
      # the current recording.
      @previous = @{$ctm[-1]};

      # End time of the last kept entry, rounded to two decimals.
      $new_start = sprintf("%.2f", $previous[2] + $previous[3]);

      if ($new_start > $current[2]) {
        # Case 2: scans for a splice point.
        # Walks backwards over the kept entries that end after the current
        # entry starts. If one of them has the same word with start and
        # duration both within $precision of the current entry, every kept
        # entry after it is dropped. The current entry is not appended in this
        # branch, whether or not a splice point was found.
        $index = -1;
        while (defined($ctm[$index])
               && $ctm[$index]->[2] + $ctm[$index]->[3] > $current[2]) {
          if ($ctm[$index]->[4] eq $current[4]
              && abs($ctm[$index]->[2] - $current[2]) < $precision
              && abs($ctm[$index]->[3] - $current[3]) < $precision) {
            pop @ctm for 2..abs($index);
            last;
          } else {
            $index -= 1;
          }
        }
      } else {
        # No overlap with the last kept entry: keep the current one.
        push(@ctm, \@current);
      }
    }

    # Flushes the last recording.
    foreach $line (@ctm) {
      print "$line->[0] $line->[1] $line->[2] $line->[3] $line->[4]\n";
    }' "$time_precision" < "$ctm" > "$new_data_dir/tmp/ctm"
  
  # Creates a text file from the ctm, which will be used in Levenshtein alignment.
  # Note that we remove <eps> in the text file.
  #
  # Input : $new_data_dir/tmp/ctm (at least 5 fields per line; field 1 is the
  #         recording, field 2 the channel, field 5 the word).
  # Output: $new_data_dir/tmp/text, one "<recording> <word> <word> ..." line
  #         per recording that has at least one word other than <eps>.
  perl -e '
    $previous_wav = "";
    $previous_channel = "";
    @words = ();   # words collected for the recording being read
    while (<STDIN>) {
      chomp;
      @col = split;
      @col >= 5 || die "Error: bad line $_\n";
      if ($previous_wav eq $col[0]) {
        $previous_channel eq $col[1] ||
          die "Error: more than one channels detected\n";
      } else {
        # A new recording starts: flush the previous one.
        if (@words > 0) {
          print "$previous_wav " . join(" ", @words) . "\n";
        }
        @words = ();
        $previous_wav = $col[0];
        $previous_channel = $col[1];
      }
      # <eps> is dropped for every entry, including the first entry of a
      # recording (which used to be copied into the text unfiltered).
      if ($col[4] ne "<eps>") {
        push(@words, $col[4]);
      }
    }
    if (@words > 0) {
      print "$previous_wav " . join(" ", @words) . "\n";
    }' < "$new_data_dir/tmp/ctm" > "$new_data_dir/tmp/text"
  
  # Computes the Levenshtein alignment between the original transcripts
  # (<old-data-dir>/text.orig) and the text rebuilt from the ctm; the result is
  # written in text form to <new-data-dir>/tmp/aligned.txt.
  #
  # $special_symbol and $separator are quoted: the default symbol "<***>"
  # contains glob characters, and an unquoted expansion would be subject to
  # filename expansion (and would vanish entirely under "shopt -s nullglob").
  align-text --special-symbol="$special_symbol" --separator="$separator" \
    "ark:$old_data_dir/text.orig" "ark:$new_data_dir/tmp/text" \
    "ark,t:$new_data_dir/tmp/aligned.txt"

  # Creates new segmentation: reads the de-overlapped ctm and the alignment,
  # writes <new-data-dir>/segments and <new-data-dir>/text.
  steps/cleanup/create_segments_from_ctm.pl \
    --max-seg-length "$max_seg_length" --min-seg-length "$min_seg_length" \
    --min-sil-length "$min_sil_length" \
    --separator "$separator" --special-symbol "$special_symbol" \
    --wer-cutoff "$wer_cutoff" \
    "$new_data_dir/tmp/ctm" "$new_data_dir/tmp/aligned.txt" \
    "$new_data_dir/segments" "$new_data_dir/text"
  
  # Now creates the new utt2spk and spk2utt file.
  #
  # Each wav file of the old data directory is mapped to its speaker through
  # the old segments (fields 1 and 2 are used as utterance and wav file) and
  # the old utt2spk (read from stdin). Every utterance in the new segments file
  # then inherits the speaker of its wav file.
  #
  # Perl arguments: <old-segments> <new-segments> <utt2spk-out>
  perl -e '
    ($old_seg_file, $new_seg_file, $utt2spk_file_out) = @ARGV;
    open(OS, "<", $old_seg_file) ||
      die "Error: fail to open $old_seg_file\n";
    open(NS, "<", $new_seg_file) ||
      die "Error: fail to open $new_seg_file\n";
    open(UO, ">", $utt2spk_file_out) ||
      die "Error: fail to open $utt2spk_file_out\n";

    # Old utt2spk: <utterance> <speaker>
    while (<STDIN>) {
      chomp;
      @col = split;
      @col == 2 || die "Error: bad line $_\n";
      $utt2spk{$col[0]} = $col[1];
    }

    # Old segments: 4 fields per line.
    while (<OS>) {
      chomp;
      @col = split;
      @col == 4 || die "Error: bad line $_\n";
      $spk = $utt2spk{$col[0]};
      # An utterance without an utt2spk entry carries no speaker information.
      next unless defined($spk);
      if (defined($wav2spk{$col[1]})) {
        # Speaker ids are strings, so they are compared with "eq". A numeric
        # comparison treats all non-numeric ids as 0 and can never fail.
        $wav2spk{$col[1]} eq $spk ||
          die "Error: multiple speakers detected for wav file $col[1]\n";
      } else {
        $wav2spk{$col[1]} = $spk;
      }
    }

    # New segments: 4 fields per line; emits "<utterance> <speaker>".
    while (<NS>) {
      chomp;
      @col = split;
      @col == 4 || die "Error: bad line $_\n";
      defined($wav2spk{$col[1]}) ||
        die "Error: could not find speaker for wav file $col[1]\n";
      print UO "$col[0] $wav2spk{$col[1]}\n";
    }

    close(OS);
    close(NS);
    # A failed close means buffered output could not be written.
    close(UO) || die "Error: fail to close $utt2spk_file_out\n";
    ' "$old_data_dir/segments" "$new_data_dir/segments" \
    "$new_data_dir/utt2spk" < "$old_data_dir/utt2spk"
  utils/utt2spk_to_spk2utt.pl "$new_data_dir/utt2spk" > "$new_data_dir/spk2utt"

  utils/fix_data_dir.sh "$new_data_dir"

  exit 0;