Blame view

egs/wsj/s5/utils/data/fix_subsegment_feats.pl 2.53 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
  #!/usr/bin/env perl
  
  # Copyright 2016  Vimal Manohar
  # Apache 2.0.
  
  use warnings;
  
  # This script reads from stdin a feats.scp file that contains frame ranges and
  # ensures that they don't exceed the maximum number of frames supplied in the
  # <utt2max-frames> file. 
  # <utt2max-frames> is usually computed using get_utt2num_frames.sh on the 
  # original directory which will be segmented using 
  # utils/data/subsegment_data_dir.sh.
  # 
  # e.g. feats.scp
  # utt_foo-1 foo-bar.ark:514231[721:892]
  # 
  # utt2max-frames
  # utt_foo-1 891
  # 
  # fixed_feats.scp
  # utt_foo-1 foo-bar.ark:514231[721:890]
  # 
  # Note: Here 891 is the number of frames in the archive foo-bar.ark
  # The frame end for utt_foo-1, i.e. 892 (0-indexed) exceeds the archive size
  # (891) by two frames. This script fixes that line by truncating the range 
  # to 890.
  
  # Exactly one positional argument (the <utt2max-frames> file) is required.
  # With any other argument count, abort and show the usage text on stderr.
  unless (@ARGV == 1) {
    my $usage_text = <<END;
  This script reads from stdin a feats.scp file that contains frame ranges and
  ensures that they don't exceed the maximum number of frames supplied in the
  <utt2max-frames> file. 
  
  Usage: $0 <utt2max-frames> < feats.scp > fixed_feats.scp
  END
    die $usage_text;
  }
  
  # Path of the <utt2max-frames> table given on the command line.
  my $utt2max_frames_file = $ARGV[0];

  # Three-argument open with a lexical handle: the file name comes from the
  # command line, and the two-argument form would interpret leading/trailing
  # mode characters ('>', '|', ...) in it. $! is reported so that the reason
  # for a failure (missing file, permissions, ...) is visible.
  open(my $max_frames_fh, '<', $utt2max_frames_file)
    or die "$0: Could not open file $utt2max_frames_file: $!";

  # Maps utterance-id -> maximum number of frames, one "<utt> <num-frames>"
  # pair per input line.
  my %utt2max_frames;

  while (my $entry = <$max_frames_fh>) {
    chomp $entry;
    my @F = split ' ', $entry;

    # Every line must consist of exactly two whitespace-separated fields.
    (scalar @F == 2) or die "$0: Invalid line $entry in $utt2max_frames_file";

    $utt2max_frames{$F[0]} = $F[1];
  }

  close($max_frames_fh);
  
  # Rewrite one feats.scp line so that the end of its row range does not
  # exceed the number of frames recorded for the utterance.
  #
  # Arguments:
  #   $line            one input line, including its trailing newline
  #   $max_frames_for  hash ref: utterance-id -> maximum number of frames
  #   $table_name      name of the utt2max-frames file (error messages only)
  #
  # Returns the line to print. A line that does not end in a "[...]" range
  # specifier is returned unchanged. Dies if the utterance is not in the
  # table; prints a message and exits with status 1 if the range cannot be
  # parsed. A notice is written to stderr whenever a row end is clamped.
  sub fix_feats_line {
    my ($line, $max_frames_for, $table_name) = @_;

    #if (m/\[([^][]*)\]\[([^][]*)\]\s*$/) {
    #  print STDERR ("fix_subsegment_feats.pl: this script only supports single indices");
    #  exit(1);
    #}

    # Only the last "[...]" on the line is treated as the range specifier.
    my ($before_range, $range) = ($line =~ m/^(.*)\[([^][]*)\]\s*$/);
    return $line unless defined $range;

    my @F = split(/ /, $before_range);
    my $utt = shift @F;
    $utt = "" unless defined $utt;
    defined $max_frames_for->{$utt}
      or die "fix_subsegment_feats.pl: Could not find key $utt in $table_name.\nError with line $line";
    my $max_frames = $max_frames_for->{$utt};

    # <row-start>:<row-end>, optionally followed by ",<column-range>".
    # Either row bound may be empty (":" alone selects all rows).
    my ($row_start, $row_end, $col_range) =
      ($range =~ m/^(\d*):(\d*)([,]?.*)$/);
    if (!defined $row_start) {
      print STDERR "fix_subsegment_feats.pl: could not make sense of input line $line";
      exit(1);
    }

    # Clamp only an explicit row end; an empty one has nothing to clamp and
    # must not be compared numerically.
    if ($row_end ne "" and $row_end >= $max_frames) {
      print STDERR "Fixed row_end for $utt from $row_end to ${max_frames}-1\n";
      $row_end = $max_frames - 1;
    }

    # Always keep the "start:end" form so that an open row spec such as ":"
    # survives instead of collapsing to an empty (invalid) specifier.
    $range = "$row_start:$row_end";

    # The captured column part still carries its leading comma; strip it
    # before re-adding the separator so the output has exactly one comma.
    $col_range =~ s/^,//;
    if ($col_range ne "") {
      $range .= ",$col_range";
    }

    return "$utt " . join(" ", @F) . "[" . $range . "]\n";
  }

  # Filter stdin to stdout, fixing one feats.scp line at a time.
  while (my $line = <STDIN>) {
    print fix_feats_line($line, \%utt2max_frames, $utt2max_frames_file);
  }