Blame view
LIA_kaldiUtils/swbd_map_words.pl
1.86 KB
ec85f8892 first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
#!/usr/bin/perl
#
# Rewrites annotation markup on whitespace-separated transcript tokens.
#
# Usage: swbd_map_words.pl [-f FIELDS] [file ...]
#
#   -f N     rewrite only field N (fields are numbered from 1)
#   -f N-M   rewrite fields N through M; "N:M" is accepted as well, and
#            either end may be left out ("3-" or "-5")
#
# Without -f every token on the line is rewritten.  Input comes from the
# named files, or from standard input when none are given.  Each input line
# is split on whitespace and printed back as one output line with the
# tokens joined by single spaces.

use strict;
use warnings;

# Parse the argument of -f.
# Returns ($begin, $end) as zero-based, inclusive field indices; either one
# is undef when that end of the range is open.  Dies when the spec yields
# neither a begin nor an end.
sub parse_field_spec {
    my ($spec) = @_;
    $spec = '' unless defined $spec;    # "-f" given as the last argument

    my ($begin, $end);
    if ($spec =~ m/^\d+$/) {
        # A single field number: begin and end coincide.
        $begin = $end = $spec - 1;
    }
    # A range; ":" is tolerated as separator in addition to the proper "-".
    if (my ($lo, $hi) = $spec =~ m/^(\d*)[-:](\d*)/) {
        $begin = $lo - 1 if $lo ne "";    # change to zero-based indexing
        $end   = $hi - 1 if $hi ne "";
    }
    if (!defined $begin && !defined $end) {
        die "Bad argument to -f option: $spec";
    }
    return ($begin, $end);
}

# Apply the rewrite rules to a single token and return the result.
# The rules run in a fixed order and later rules see the output of earlier
# ones, so the order must not be changed.
sub map_word {
    my ($w) = @_;

    # [LAUGHTER-STORY] -> STORY.  $1 and $3 preserve an optional leading
    # and trailing "-".  The "^" anchor has to come before the optional
    # leading dash, otherwise that dash can never match.
    $w =~ s:^(|\-)\[LAUGHTER-(.+)\](|\-)$:$1$2$3:;

    # (%HESITATION) -> euh
    $w =~ s/\(%HESITATION\)/euh/;

    # [IT'N/ISN'T] -> IT'N.  The first alternative may itself contain
    # partial-word markup, e.g. [LEM[GUINI]-/LINGUINI], which the rules
    # below process further.  The trailing (|\-) accepts and preserves a
    # trailing "-".
    $w =~ s:^\[(.+)/.+\](|\-)$:$1$2:;

    # -[AN]Y -> -Y.  The leading "-" is optional on input because it is
    # sometimes omitted; the output always has it.
    $w =~ s:^(|\-)\[[^][]+\](.+)$:-$2:;

    # AB[SOLUTE]- -> AB-.  The trailing "-" is optional on input because it
    # is sometimes omitted; the output always has it.
    $w =~ s:^(.+)\[[^][]+\](|\-)$:$1-:;

    # EX[SPECIALLY]-/ESPECIALLY] -> EX.  This form is a mistake in the
    # input; everything from the first "[" onward is dropped.
    $w =~ s:([^][]+)\[.+\]$:$1:;

    # {YUPPIEDOM} -> YUPPIEDOM
    $w =~ s:^\{(.+)\}$:$1:;

    # AMMU[N]IT- -> AMMU-IT-.  A bracketed part in the middle of a word is
    # replaced by "-"; the letters on either side are captured and put back.
    $w =~ s:([A-Z])\[[^][]+\]([A-Z]):$1-$2:;

    # THEM_1 -> THEM
    $w =~ s:_\d$::;

    return $w;
}

# Script entry point: handle an optional leading "-f FIELDS", then filter
# the input line by line.
sub main {
    my ($field_begin, $field_end);
    if (@ARGV && $ARGV[0] eq "-f") {
        shift @ARGV;
        ($field_begin, $field_end) = parse_field_spec(shift @ARGV);
    }

    while (my $line = <>) {
        my @words = split(" ", $line);
        for my $n (0 .. $#words) {
            # Tokens outside the selected field range pass through as-is.
            next if defined $field_begin && $n < $field_begin;
            next if defined $field_end   && $n > $field_end;
            $words[$n] = map_word($words[$n]);
        }
        # One output line per input line keeps the field layout intact.
        print join(" ", @words) . "\n";
    }
    return;
}

# Run only when executed as a script, so the file can also be loaded with
# require/do (e.g. from a test) without consuming input.
main() unless caller;