Blame view

egs/wsj/s5/utils/data/normalize_data_range.pl 5.05 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
  #!/usr/bin/env perl
  
  # This script is intended to read and write scp files possibly containing indexes for
  # sub-ranges of features, like
  # foo-123  bar.ark:431423[78:89]
  # meaning rows 78 through 89 of the matrix located at bar.ark:431423.
  #
  # Its purpose is to normalize lines which have ranges on top of ranges, like
  #
  # foo-123  bar.ark:431423[78:89][3:4]
  #
  # This program interprets the later [] expression as a sub-range of the matrix returned by the first []
  # expression; in this case, we'd get
  #
  # foo-123  bar.ark:431423[81:82]
  #
  # Note that these ranges are based on zero-indexing, and have a 'first:last'
  # interpretation, so the range [0:0] is a matrix with one row.  And also note
  # that column ranges are permitted, after row ranges, and the row range may be
  # empty, e.g.
  
  # foo-123  bar.ark:431423[81:82,0:13]
  # or
  # foo-123  bar.ark:431423[81:82,0:13]
  #
  
  # This program reads from the standard input (or command-line file or files),
  # and writes to the standard output.
  
  
  # This function combines ranges, either row or column ranges.  start1 and end1
  # are the first range, and start2 and end2 are interpreted as a sub-range of the
  # first range.  It is acceptable for either start1 and end1, or start2 and end2, to
  # be empty.
  # This function returns the start and end of the range, as an array.
  sub combine_ranges {
    ($row_or_column, $start1, $end1, $start2, $end2) = @_;
  
    if ($start1 eq "" && $end1 eq "") {
      return ($start2, $end2);
    } elsif ($start2 eq "" && $end2 eq "") {
      return ($start1, $end1);
    } else {
      # For now this script doesn't support the case of ranges like [20:], even
      # though they are supported at the C++ level.
      if ($start1 eq "" || $start2 eq "" || $end1 eq "" || $end2 == "") {
        chop $line;
        print STDERR ("normalize_data_range.pl: could not make sense of line $line
  ");
        exit(1)
      }
      if ($start1 + $end2 > $end1) {
        chop $line;
        print STDERR ("normalize_data_range.pl: could not make sense of line $line " .
              "[second $row_or_column range too large vs first range, $start1 + $end2 > $end1]
  ");
            # exit(1);
        return ($start2+$start1, $end1);
      }
      return ($start2+$start1, $end2+$start1);
    }
  }
  
  
  while (<>) {
    $line = $_;
    # we only need to do something if we detect two of these ranges.
    # The following regexp matches strings of the form ...[foo][bar]
    # where foo and bar have no square brackets in them.
    if (m/\[([^][]*)\]\[([^][]*)\]\s*$/) {
      $before_range = $`;
      $first_range = $1;   # e.g. '0:500,20:21', or '0:500', or ',0:13'.
      $second_range = $2;  # has same general format as first_range.
      if ($_ =~ m/concat-feats /) {
        # sometimes in scp files, we use the command concat-feats to splice together
        # two feature matrices.  Handling this correctly is complicated and we don't
        # anticipate needing it, so we just refuse to process this type of data.
        print STDERR ("normalize_data_range.pl: this script cannot [yet] normalize the data ranges " .
          "if concat-feats was in the input data
  ");
        exit(1);
      }
      # print STDERR "matched: $before_range $first_range $second_range
  ";
      if ($first_range !~ m/^((\d*):(\d*)|)(,(\d*):(\d*)|)$/) {
        print STDERR "normalize_data_range.pl: could not make sense of input line $_";
        exit(1);
      }
      $row_start1 = $2;
      $row_end1 = $3;
      $col_start1 = $5;
      $col_end1 = $6;
  
      if ($second_range !~ m/^((\d*):(\d*)|)(,(\d*):(\d*)|)$/) {
        print STDERR "normalize_data_range.pl: could not make sense of input line $_";
        exit(1);
      }
      $row_start2 = $2;
      $row_end2 = $3;
      $col_start2 = $5;
      $col_end2 = $6;
  
      ($row_start, $row_end) = combine_ranges("row", $row_start1, $row_end1, $row_start2, $row_end2);
      ($col_start, $col_end) = combine_ranges("column", $col_start1, $col_end1, $col_start2, $col_end2);
  
  
      if ($row_start ne "") {
        $range = "$row_start:$row_end";
      } else {
        $range = "";
      }
      if ($col_start ne "") {
        $range .= ",$col_start:$col_end";
      }
      print $before_range . "[" . $range . "]
  ";
    } else {
      print;
    }
  }
  
  __END__
  
  # Testing
  # echo foo |  utils/data/normalize_data_range.pl -> foo
  # echo 'foo[bar:baz]' |  utils/data/normalize_data_range.pl -> foo[bar:baz]
  # echo 'foo[bar:baz][bin:bang]' |  utils/data/normalize_data_range.pl -> normalize_data_range.pl: could not make sense of input line foo[bar:baz][bin:bang]
  # echo 'foo[10:20][0:5]' |  utils/data/normalize_data_range.pl -> foo[10:15]
  # echo 'foo[,10:20][,0:5]' |  utils/data/normalize_data_range.pl -> foo[,10:15]
  # echo 'foo[,0:100][1:15]' |  utils/data/normalize_data_range.pl -> foo[1:15,0:100]
  # echo 'foo[1:15][,0:100]' |  utils/data/normalize_data_range.pl -> foo[1:15,0:100]
  # echo 'foo[10:20][0:11]' |  utils/data/normalize_data_range.pl -> normalize_data_range.pl: could not make sense of line foo[10:20][0:11] [second row range too large vs first range, 10 + 11 > 20]
  # echo 'foo[,10:20][,0:11]' |  utils/data/normalize_data_range.pl -> normalize_data_range.pl: could not make sense of line foo[,10:20][,0:11] [second column range too large vs first range, 10 + 11 > 20]