Blame view
egs/gale_mandarin/s5/local/gale_normalize.pl
1.11 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
#!/usr/bin/env perl
use strict;
use warnings;    # replacement for the -w perl parameter

# Copyright Chao Weng

# Normalizations for HKUST-style transcripts.
# See docs/trans-guidelines.pdf for details.
#
# Reads transcript lines on STDIN and writes one normalized line per input
# line to STDOUT.  Every whitespace-separated token is normalized on its own
# and printed followed by a single space; each output line ends with "\n".
#
# The script deliberately works on raw bytes (no I/O encoding layers and no
# "use utf8"), so split(" ") only splits on ASCII whitespace and upper-casing
# only touches ASCII letters.  It assumes the transcripts are UTF-8 encoded
# -- TODO confirm for any new corpus.  Non-ASCII characters are written as
# UTF-8 byte escapes so this file stays pure ASCII: a width-folding or
# re-encoding step cannot silently turn a full-width pattern into its ASCII
# look-alike (which yields no-op substitutions and invalid regexes such as
# /?/).

# Non-speech tags and the symbols that replace them.
my %NOISE_SYMBOL_FOR = (
    '{breath}'   => '[VOCALIZED-NOISE]',
    '{cough}'    => '[VOCALIZED-NOISE]',
    '{sneeze}'   => '[VOCALIZED-NOISE]',
    '{lipsmack}' => '[VOCALIZED-NOISE]',
    '{laugh}'    => '[LAUGHTER]',
    '<noise>'    => '[NOISE]',
);

# Full-width Latin letters that are folded to their ASCII counterparts.
# NOTE(review): the original mapped full-width N to "D", which looks like a
# copy-paste slip from the line above it; it is mapped to "N" here.
my %ASCII_FOR_FULLWIDTH = (
    "\xEF\xBC\xA1" => 'A',    # U+FF21 full-width A
    "\xEF\xBC\xA4" => 'D',    # U+FF24 full-width D
    "\xEF\xBC\xAE" => 'N',    # U+FF2E full-width N
);

# CJK / full-width punctuation that is deleted wherever it occurs.
my $DELETED_CHARS_RE = qr{
      \xE3\x80\x82    # U+3002 ideographic full stop
    | \xE2\x85\xA1    # U+2161 roman numeral two
    | \xE3\x80\x80    # U+3000 ideographic space
    | \xE3\x80\x81    # U+3001 ideographic comma
    | \xE3\x80\x91    # U+3011 right black lenticular bracket
    | \xEF\xBC\x9F    # U+FF1F full-width question mark
    | \xEF\xBC\x81    # U+FF01 full-width exclamation mark
    | \xEF\xBC\x8C    # U+FF0C full-width comma
}x;

# normalize_token($token)
#   Normalizes one whitespace-free token (a byte string) and returns the
#   result.  Noise tags are replaced by their bracketed symbol and returned
#   as-is; any other token has punctuation and markers stripped and its ASCII
#   letters upper-cased.  The result may be the empty string.
sub normalize_token {
    my ($token) = @_;

    return $NOISE_SYMBOL_FOR{$token} if exists $NOISE_SYMBOL_FOR{$token};

    # Drop the first run of ASCII punctuation but keep the text before it.
    # Only the first run is removed: the original pattern carried no /g.
    $token =~ s/[.,?+-]+//;

    $token =~ s/$DELETED_CHARS_RE//g;
    $token =~ s/(\xEF\xBC[\xA1\xA4\xAE])/$ASCII_FOR_FULLWIDTH{$1}/g;

    # Strip the "~" and "%" markers while keeping the character they prefix
    # (first occurrence only, as in the original patterns).
    $token =~ s/~(?=[A-Z])//;
    $token =~ s/%(?=\S)//;

    # Upper-case ASCII letters only; multi-byte characters are left alone.
    $token =~ tr/a-z/A-Z/;

    return $token;
}

# normalize_line($line)
#   Splits $line on ASCII whitespace, normalizes every token and returns the
#   tokens joined back together, each followed by one space, terminated by a
#   newline.  An empty or all-whitespace line yields just "\n".
sub normalize_line {
    my ($line) = @_;

    my @normalized = map { normalize_token($_) } split(' ', $line);
    return join('', map { "$_ " } @normalized) . "\n";
}

# main()
#   Filters STDIN to STDOUT one line at a time.
sub main {
    while (my $line = <STDIN>) {
        print normalize_line($line);
    }
    return;
}

# Run as a filter when executed directly; stay quiet when loaded via
# require/do so the subs above can be exercised on their own.
main() unless caller;