Blame view

egs/wsj/s5/utils/apply_map.pl 2.89 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
  #!/usr/bin/env perl
  use warnings; #sed replacement for -w perl parameter
  # Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
  # Apache 2.0.
  
  # This program is a bit like ./sym2int.pl in that it applies a map
  # to things in a file, but it's a bit more general in that it doesn't
  # assume the things being mapped to are single tokens, they could
  # be sequences of tokens.  See the usage message.
  
  
  $permissive = 0;
  
  for ($x = 0; $x <= 2; $x++) {
  
    if (@ARGV > 0 && $ARGV[0] eq "-f") {
      shift @ARGV;
      $field_spec = shift @ARGV;
      if ($field_spec =~ m/^\d+$/) {
        $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
      }
      if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10)
        if ($1 ne "") {
          $field_begin = $1 - 1;  # Change to zero-based indexing.
        }
        if ($2 ne "") {
          $field_end = $2 - 1;    # Change to zero-based indexing.
        }
      }
      if (!defined $field_begin && !defined $field_end) {
        die "Bad argument to -f option: $field_spec";
      }
    }
  
    if (@ARGV > 0 && $ARGV[0] eq '--permissive') {
      shift @ARGV;
      # Mapping is optional (missing key is printed to output)
      $permissive = 1;
    }
  }
  
  if(@ARGV != 1) {
    print STDERR "Invalid usage: " . join(" ", @ARGV) . "
  ";
    print STDERR <<'EOF';
  Usage: apply_map.pl [options] map <input >output
   options: [-f <field-range> ] [--permissive]
     This applies a map to some specified fields of some input text:
     For each line in the map file: the first field is the thing wae
     map from, and the remaining fields are the sequence we map it to.
     The -f (field-range) option says which fields of the input file the map
     map should apply to.
     If the --permissive option is supplied, fields which are not present
     in the map will be left as they were.
   Applies the map 'map' to all input text, where each line of the map
   is interpreted as a map from the first field to the list of the other fields
   Note: <field-range> can look like 4-5, or 4-, or 5-, or 1, it means the field
   range in the input to apply the map to.
   e.g.: echo A B | apply_map.pl a.txt
   where a.txt is:
   A a1 a2
   B b
   will produce:
   a1 a2 b
  EOF
    exit(1);
  }
  
  ($map_file) = @ARGV;
  open(M, "<$map_file") || die "Error opening map file $map_file: $!";
  
  while (<M>) {
    @A = split(" ", $_);
    @A >= 1 || die "apply_map.pl: empty line.";
    $i = shift @A;
    $o = join(" ", @A);
    $map{$i} = $o;
  }
  
  while(<STDIN>) {
    @A = split(" ", $_);
    for ($x = 0; $x < @A; $x++) {
      if ( (!defined $field_begin || $x >= $field_begin)
           && (!defined $field_end || $x <= $field_end)) {
        $a = $A[$x];
        if (!defined $map{$a}) {
          if (!$permissive) {
            die "apply_map.pl: undefined key $a in $map_file
  ";
          } else {
            print STDERR "apply_map.pl: warning! missing key $a in $map_file
  ";
          }
        } else {
          $A[$x] = $map{$a};
        }
      }
    }
    print join(" ", @A) . "
  ";
  }