Blame view
egs/hkust/s5/local/hkust_normalize.pl
823 Bytes
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
#!/usr/bin/env perl
use strict;
use warnings;  # sed replacement for -w perl parameter

# Copyright Chao Weng

# Normalizations for HKUST transcripts.
# See docs/trans-guidelines.pdf for details.
#
# Reads transcript lines from STDIN and writes the normalized lines to
# STDOUT.  Each input line is split on whitespace; the first field is copied
# through untouched and every following field is normalized as a token.
# Every emitted field is followed by a single space, so each output line ends
# with a space before its newline.

# Transcript event markers that are replaced wholesale by a bracketed tag.
my %NOISE_TAG_FOR = (
    '{breath}'   => '[VOCALIZED-NOISE]',
    '{cough}'    => '[VOCALIZED-NOISE]',
    '{sneeze}'   => '[VOCALIZED-NOISE]',
    '{lipsmack}' => '[VOCALIZED-NOISE]',
    '{laugh}'    => '[LAUGHTER]',
    '<noise>'    => '[NOISE]',
);

# Normalize a single whitespace-free transcript token and return the result.
#   - Event markers listed in %NOISE_TAG_FOR are replaced by their tag.
#   - Otherwise the first run of the characters . , ? + - is removed, the
#     first "~" directly followed by an upper-case ASCII letter is removed,
#     the first "%" directly followed by a non-space character is removed,
#     and tokens containing an ASCII letter are upper-cased.
# The result may be the empty string (e.g. for a token that is only ",").
sub normalize_token {
    my ($token) = @_;

    return $NOISE_TAG_FOR{$token} if exists $NOISE_TAG_FOR{$token};

    # Deliberately not /g: only the first occurrence of each pattern is
    # stripped, which is what the script has always done.
    $token =~ s/[.,?+-]+//;
    $token =~ s/~([A-Z])/$1/;
    $token =~ s/%(\S)/$1/;

    $token = uc $token if $token =~ /[a-zA-Z]/;
    return $token;
}

# Normalize one transcript line (with or without its trailing newline).
# Returns the normalized line WITHOUT a newline, or undef when the line holds
# no fields at all (empty or whitespace-only).
sub normalize_line {
    my ($line) = @_;

    # split " " skips leading whitespace and splits on whitespace runs.
    my ($first_field, @tokens) = split " ", $line;
    return unless defined $first_field;

    my $normalized = "$first_field ";
    $normalized .= normalize_token($_) . " " for @tokens;
    return $normalized;
}

while (my $line = <STDIN>) {
    my $normalized = normalize_line($line);

    # Blank lines carry no fields; skip them instead of printing an
    # undefined first field.
    next unless defined $normalized;

    print $normalized, "\n";
}