Blame view
egs/mini_librispeech/s5/local/kws/keywords_to_indices.pl
2.98 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
#!/usr/bin/env perl
# Copyright 2012-2018  Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0.
#
# Converts keyword phrases into sequences of integer symbol ids.
#
# Input lines have the form "<id> <word1> <word2> ...".  The first field is
# copied through unchanged; every following word is looked up in the symbol
# table.  A symbol may occur on several lines of the symbol table, in which
# case all of its ids are collected, and one output line is written for every
# combination of ids (the cartesian product over the words of the phrase).
#
# Options:
#   --map-oov <sym-or-int>  substitute this symbol (or integer id) for words
#                           missing from the symbol table instead of dying.
#   -f <field-range>        validated for command-line compatibility only;
#                           the parsed range is not applied anywhere below.

use strict;
use warnings;
use Data::Dumper;
$Data::Dumper::Indent = 1;

binmode STDOUT, ":utf8";
binmode STDIN,  ":utf8";
binmode STDERR, ":utf8";   # diagnostics echo (possibly non-ASCII) symbols

# Cartesian product of a list of array refs.
#   permute([1,2], [3])  ->  ([1,3], [2,3])
# Each returned element is an array ref holding one choice from every input
# list, in input order.  The last list varies fastest.  Called with no lists
# it returns the empty list.
sub permute {
  my @lists = @_;
  return () unless @lists;

  my $last = pop @lists;
  return map([$_], @$last) unless @lists;

  return map {
    my $left = $_;
    map([@$left, $_], @$last)
  } permute(@lists);
}

my $oov_count   = 0;    # number of phrases containing at least one OOV word
my $num_warning = 0;    # OOV replacement warnings printed so far
my $max_warning = 20;   # cap on OOV warnings (was never set, so none printed)

my $map_oov;                    # value given to --map-oov, if any
my ($field_begin, $field_end);  # zero-based bounds parsed from -f (unused)

# Two passes so that --map-oov and -f are accepted in either order.
for my $pass (1 .. 2) {
  if (@ARGV && $ARGV[0] eq "--map-oov") {
    shift @ARGV;
    $map_oov = shift @ARGV;
  }
  if (@ARGV && $ARGV[0] eq "-f") {
    shift @ARGV;
    my $field_spec = shift @ARGV;
    if (!defined $field_spec) {
      die "Bad argument to -f option: ";
    }
    if ($field_spec =~ m/^\d+$/) {
      $field_begin = $field_spec - 1;
      $field_end   = $field_spec - 1;
    }
    if ($field_spec =~ m/^(\d*)[-:](\d*)/) {
      # accept e.g. 1:10 as a courtesy (properly, 1-10)
      if ($1 ne "") {
        $field_begin = $1 - 1;    # Change to zero-based indexing.
      }
      if ($2 ne "") {
        $field_end = $2 - 1;      # Change to zero-based indexing.
      }
    }
    if (!defined $field_begin && !defined $field_end) {
      die "Bad argument to -f option: $field_spec";
    }
  }
}

my $symtab = shift @ARGV;
if (!defined $symtab) {
  print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" .
    "options: [--map-oov <oov-symbol> ] [-f <field-range> ]\n" .
    "note: <field-range> can look like 4-5, or 4-, or 5-, or 1.\n";
  exit(1);   # nothing useful can be done without a symbol table
}

# symbol -> array ref of every integer id listed for it in the symbol table.
my %sym2int;
open(my $symtab_fh, "<:encoding(UTF-8)", $symtab)
  || die "Error opening symbol table file $symtab: $!";
while (my $entry = <$symtab_fh>) {
  my @A = split(" ", $entry);
  @A == 2 || die "bad line in symbol table file: $entry";
  if (not defined($sym2int{$A[0]})) {
    $sym2int{$A[0]} = [];
  }
  push @{ $sym2int{$A[0]} }, $A[1] + 0;
}
close($symtab_fh);
#print Dumper(\%sym2int);

# Id list substituted for OOV words.  A numeric --map-oov is used as given;
# a symbolic one is resolved through the table, whose values are already
# array refs and therefore must not be wrapped a second time.
my $oov_ids;
if (defined $map_oov) {
  if ($map_oov !~ m/^\d+$/) {   # not numeric -> look it up
    if (!defined $sym2int{$map_oov}) {
      die "OOV symbol $map_oov not defined.";
    }
    $oov_ids = $sym2int{$map_oov};
  } else {
    $oov_ids = [ $map_oov ];
  }
}

# Read the named files, or STDIN when none are given ("-" also means STDIN).
# Files are opened with an explicit UTF-8 layer so that their words compare
# equal to the decoded keys of the symbol table.
my @inputs = @ARGV ? @ARGV : ("-");
foreach my $input (@inputs) {
  my $in;
  if ($input eq "-") {
    $in = \*STDIN;
  } else {
    open($in, "<:encoding(UTF-8)", $input)
      || die "Error opening input file $input: $!";
  }

  while (my $line = <$in>) {
    my @A = split(" ", $line);
    my @B = ();                 # one array ref of candidate ids per word
    my $undefined_words = 0;

    for (my $n = 1; $n < @A; $n++) {
      my $word = $A[$n];
      my $ids  = $sym2int{$word};
      if (!defined $ids) {
        if (defined $oov_ids) {
          if ($num_warning++ < $max_warning) {
            print STDERR "sym2int.pl: replacing $word with " .
              join(" ", @$oov_ids) . "\n";
            if ($num_warning == $max_warning) {
              print STDERR "sym2int.pl: not warning for OOVs any more times\n";
            }
          }
          $ids = $oov_ids;
        } else {
          my $pos = $n + 1;
          die "sym2int.pl: undefined symbol $word (in position $pos)\n";
        }
        $undefined_words = $undefined_words + 1;
      }
      push @B, $ids;
    }

    if ($undefined_words > 0) {
      $oov_count = $oov_count + 1;
    }

    # One output line per combination of ids; a line with no words after
    # the id field produces no output.
    my @C = permute(@B);
    #print Dumper(\@B);
    #print Dumper(\@C);
    foreach my $phrase (@C) {
      print "$A[0] ";
      print join(" ", @{$phrase});
      print "\n";
    }
  }

  close($in) unless $input eq "-";
}

print STDERR "Found $oov_count phrases containing (at least one) OOV...\n";