swbd1_map_words.pl
1.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env perl
# Modified from swbd_map_words.pl in Kaldi s5 recipe to make pattern
# matches case-insensitive --Arnab (Jan 2013)
use strict;
use warnings;

# Usage: swbd1_map_words.pl [-f <field-spec>] [file ...]
#
# Reads lines from the named files (or standard input), splits each line on
# whitespace, rewrites Switchboard transcription mark-up in the selected
# fields, and prints the fields re-joined with single spaces.
#
# <field-spec> is one-based: "N" selects a single field, "N-M" a closed range,
# "N-" or "-M" an open-ended range.  Without -f every field is processed.

# Consume a leading "-f <field-spec>" from the argument list.
#   $args : reference to the argument array; the option and its value are
#           shifted off it when present.
# Returns ($begin, $end) as zero-based field indices; either may be undef to
# mean "unbounded on that side".  Both are undef when -f was not given.
# Dies if -f is given with a missing or unparseable specification.
sub parse_field_range {
    my ($args) = @_;

    # Guard on @$args so that running with no arguments reads no undef value.
    return (undef, undef) unless @$args && $args->[0] eq "-f";

    shift @$args;
    my $field_spec = shift @$args;
    die "Bad argument to -f option: missing field specification"
        unless defined $field_spec;

    my ($begin, $end);
    if ($field_spec =~ m/^\d+$/) {
        # A single field number: the range starts and ends on that field.
        $begin = $field_spec - 1;
        $end   = $field_spec - 1;
    }
    elsif ($field_spec =~ m/^(\d*)[-:](\d*)/) {
        # Accept e.g. 1:10 as a courtesy (properly, 1-10).
        my ($first, $last) = ($1, $2);
        $begin = $first - 1 if $first ne "";    # Change to zero-based indexing.
        $end   = $last - 1  if $last ne "";     # Change to zero-based indexing.
    }

    if (!defined $begin && !defined $end) {
        die "Bad argument to -f option: $field_spec";
    }
    return ($begin, $end);
}

# Apply the mark-up rewrite rules to one token and return the result.
# The rules run in this fixed order; later rules clean up what earlier ones
# leave behind (e.g. partial-word brackets exposed by the "/" rule).
sub map_word {
    my ($word) = @_;

    # e.g. [LAUGHTER-STORY] -> STORY (case-insensitive).  $1 and $3 preserve
    # an optional leading or trailing "-".  The "^" anchor has to come before
    # the optional leading dash, otherwise a leading dash can never match.
    $word =~ s:^(|\-)\[LAUGHTER-(.+)\](|\-)$:$1$2$3:i;

    # e.g. [IT'N/ISN'T] -> IT'N.  The first part may itself contain
    # partial-word mark-up, which is processed further below,
    # e.g. [LEM[GUINI]-/LINGUINI].  The (|\-) at the end accepts and
    # preserves a trailing "-".
    $word =~ s:^\[(.+)/.+\](|\-)$:$1$2:;

    # e.g. -[AN]Y -> -Y.  The leading "-" is optional on input, as it is
    # sometimes omitted, but is always present on output.
    $word =~ s:^(|\-)\[[^][]+\](.+)$:-$2:;

    # e.g. AB[SOLUTE]- -> AB-.  The trailing "-" is optional on input, as it
    # is sometimes omitted, but is always present on output.
    $word =~ s:^(.+)\[[^][]+\](|\-)$:$1-:;

    # e.g. EX[SPECIALLY]-/ESPECIALLY] -> EX, which is a mistake in the input.
    $word =~ s:([^][]+)\[.+\]$:$1:;

    # e.g. {YUPPIEDOM} -> YUPPIEDOM
    $word =~ s:^\{(.+)\}$:$1:;

    # e.g. AMMU[N]IT- -> AMMU-IT-.  The letters on either side of the
    # bracketed part are captured so that they are kept in the output.
    $word =~ s:([A-Z])\[[^][]+\]([A-Z]):$1-$2:i;

    # e.g. THEM_1 -> THEM
    $word =~ s:_\d$::;

    return $word;
}

# Filter every input line, rewriting only the fields inside the -f range.
sub main {
    my ($field_begin, $field_end) = parse_field_range(\@ARGV);

    while (my $line = <>) {
        my @fields = split(" ", $line);
        for my $n (0 .. $#fields) {
            next if defined $field_begin && $n < $field_begin;
            next if defined $field_end   && $n > $field_end;
            $fields[$n] = map_word($fields[$n]);
        }
        print join(" ", @fields) . "\n";
    }
    return;
}

# Run the filter when executed as a script; stay quiet when loaded by a test.
main() unless caller;