Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/utils/data/normalize_data_range.pl 5.05 KB
  #!/usr/bin/env perl
  
  # This script is intended to read and write scp files possibly containing indexes for
  # sub-ranges of features, like
  # foo-123  bar.ark:431423[78:89]
  # meaning rows 78 through 89 of the matrix located at bar.ark:431423.
  #
  # Its purpose is to normalize lines which have ranges on top of ranges, like
  #
  # foo-123  bar.ark:431423[78:89][3:4]
  #
  # This program interprets the later [] expression as a sub-range of the matrix returned by the first []
  # expression; in this case, we'd get
  #
  # foo-123  bar.ark:431423[81:82]
  #
  # Note that these ranges are based on zero-indexing, and have a 'first:last'
  # interpretation, so the range [0:0] is a matrix with one row.  And also note
  # that column ranges are permitted, after row ranges, and the row range may be
  # empty, e.g.
  
  # foo-123  bar.ark:431423[81:82,0:13]
  # or, with the row range left empty,
  # foo-123  bar.ark:431423[,0:13]
  #
  
  # This program reads from the standard input (or command-line file or files),
  # and writes to the standard output.
  
  
  # Combine two ranges (both row ranges, or both column ranges) into one.
  #
  # Arguments:
  #   $row_or_column  "row" or "column"; only used in the diagnostic message.
  #   $start1, $end1  the first range; both may be empty ("" or undef).
  #   $start2, $end2  a sub-range of the first range, i.e. its indexes are
  #                   relative to $start1; both may be empty ("" or undef).
  #
  # Returns the (start, end) of the combined range as a two-element list.
  # If one of the two ranges is entirely empty, the other is returned as-is.
  #
  # Half-open ranges such as [20:] are not supported (even though they are
  # supported at the C++ level): the script prints an error and exits with
  # status 1.  If the second range extends past the end of the first one, a
  # message is printed to STDERR and the end is clamped to $end1.
  #
  # Diagnostics quote the current input line, which is read from the package
  # variable $line set by the main loop; $line itself is not modified.
  sub combine_ranges {
    my ($row_or_column, $start1, $end1, $start2, $end2) = @_;

    # Captures from a non-participating regex group arrive as undef; treat
    # them the same as the empty string.
    for ($start1, $end1, $start2, $end2) {
      $_ = "" unless defined $_;
    }

    if ($start1 eq "" && $end1 eq "") {
      return ($start2, $end2);
    }
    if ($start2 eq "" && $end2 eq "") {
      return ($start1, $end1);
    }

    # Work on a copy of the input line and strip only a trailing newline, so
    # that repeated diagnostics (row, then column) do not eat real characters.
    my $shown_line = defined $line ? $line : "";
    chomp $shown_line;

    # All four bounds must be present from here on.  The comparison has to be
    # a string comparison: a numeric '==' would treat the valid index 0 as "".
    if ($start1 eq "" || $start2 eq "" || $end1 eq "" || $end2 eq "") {
      print STDERR ("normalize_data_range.pl: could not make sense of line $shown_line\n");
      exit(1);
    }

    if ($start1 + $end2 > $end1) {
      print STDERR ("normalize_data_range.pl: could not make sense of line $shown_line " .
            "[second $row_or_column range too large vs first range, $start1 + $end2 > $end1]\n");
      # Deliberately not fatal: clamp the end of the range to the end of the
      # first range instead of exiting.
      # NOTE(review): nothing checks that $start2 + $start1 <= $end1 here.
      return ($start2 + $start1, $end1);
    }
    return ($start2 + $start1, $end2 + $start1);
  }
  
  
  # Main loop: copy every input line to standard output.  A line ending in two
  # consecutive ranges, "...[first][second]", is rewritten with the single
  # equivalent range; every other line is printed unchanged.

  # Matches one range expression: an optional "start:end" row range followed
  # by an optional ",start:end" column range.  Captures 2 and 3 are the row
  # start and end, captures 5 and 6 the column start and end; they are undef
  # when the corresponding part is absent.
  my $range_pattern = qr/^((\d*):(\d*)|)(,(\d*):(\d*)|)$/;

  while (<>) {
    # combine_ranges() reads the package variable $line for its diagnostics,
    # so this must remain a global rather than a 'my' variable.
    $line = $_;

    # We only need to do something if we detect two of these ranges.
    # The following regexp matches strings of the form ...[foo][bar]
    # where foo and bar have no square brackets in them.
    if (m/\[([^][]*)\]\[([^][]*)\]\s*$/) {
      my $before_range = $`;
      my $first_range = $1;   # e.g. '0:500,20:21', or '0:500', or ',0:13'.
      my $second_range = $2;  # has same general format as first_range.

      if ($_ =~ m/concat-feats /) {
        # Sometimes in scp files, we use the command concat-feats to splice
        # together two feature matrices.  Handling this correctly is
        # complicated and we don't anticipate needing it, so we just refuse to
        # process this type of data.
        print STDERR ("normalize_data_range.pl: this script cannot [yet] normalize the data ranges " .
          "if concat-feats was in the input data\n");
        exit(1);
      }
      # print STDERR "matched: $before_range $first_range $second_range\n";

      # In list context a successful match returns all six captures, so an
      # empty list means the range expression was malformed.
      my @first_parts = ($first_range =~ $range_pattern);
      if (!@first_parts) {
        print STDERR "normalize_data_range.pl: could not make sense of input line $_";
        exit(1);
      }
      my ($row_start1, $row_end1, $col_start1, $col_end1) = @first_parts[1, 2, 4, 5];

      my @second_parts = ($second_range =~ $range_pattern);
      if (!@second_parts) {
        print STDERR "normalize_data_range.pl: could not make sense of input line $_";
        exit(1);
      }
      my ($row_start2, $row_end2, $col_start2, $col_end2) = @second_parts[1, 2, 4, 5];

      my ($row_start, $row_end) =
        combine_ranges("row", $row_start1, $row_end1, $row_start2, $row_end2);
      my ($col_start, $col_end) =
        combine_ranges("column", $col_start1, $col_end1, $col_start2, $col_end2);

      # Emit "rows", "rows,cols" or ",cols" depending on which parts exist.
      my $range = "";
      if (defined $row_start && $row_start ne "") {
        $range = "$row_start:$row_end";
      }
      if (defined $col_start && $col_start ne "") {
        $range .= ",$col_start:$col_end";
      }
      print $before_range . "[" . $range . "]\n";
    } else {
      print;
    }
  }
  
  __END__
  
  # Testing
  # echo foo |  utils/data/normalize_data_range.pl -> foo
  # echo 'foo[bar:baz]' |  utils/data/normalize_data_range.pl -> foo[bar:baz]
  # echo 'foo[bar:baz][bin:bang]' |  utils/data/normalize_data_range.pl -> normalize_data_range.pl: could not make sense of input line foo[bar:baz][bin:bang]
  # echo 'foo[10:20][0:5]' |  utils/data/normalize_data_range.pl -> foo[10:15]
  # echo 'foo[,10:20][,0:5]' |  utils/data/normalize_data_range.pl -> foo[,10:15]
  # echo 'foo[,0:100][1:15]' |  utils/data/normalize_data_range.pl -> foo[1:15,0:100]
  # echo 'foo[1:15][,0:100]' |  utils/data/normalize_data_range.pl -> foo[1:15,0:100]
  # echo 'foo[10:20][0:11]' |  utils/data/normalize_data_range.pl -> (on stderr) normalize_data_range.pl: could not make sense of line foo[10:20][0:11] [second row range too large vs first range, 10 + 11 > 20]
  #   and, since this case is no longer fatal, the clamped line on stdout: foo[10:20]
  # echo 'foo[,10:20][,0:11]' |  utils/data/normalize_data_range.pl -> (on stderr) normalize_data_range.pl: could not make sense of line foo[,10:20][,0:11] [second column range too large vs first range, 10 + 11 > 20]
  #   and, since this case is no longer fatal, the clamped line on stdout: foo[,10:20]