Blame view
egs/hkust/s5/local/character_tokenizer
630 Bytes
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
#!/usr/bin/env perl
# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0
#
# Character tokenizer for whitespace-separated transcript lines.
#
# Input is read line by line from the files named on the command line, or
# from STDIN when none are given.  For every line:
#   * the first field is echoed unchanged (presumably an utterance id --
#     confirm against the callers);
#   * a field that contains "[...]" or "<...>", or the substring "!SIL",
#     is kept whole;
#   * in any other field a space is inserted in front of every character
#     of the CJK Unified Ideographs block; all other characters stay
#     attached to whatever precedes them.
# Each field is followed by a single space, and each input line yields
# exactly one output line.

use strict;
use warnings;
use utf8;

use open qw(:encoding(utf8));
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# Tokenize one input line.
#
# Parameters:
#   $line - one line of text; a trailing newline is ignored because the
#           line is split on whitespace.
# Returns:
#   the tokenized line as a string WITHOUT a trailing newline.  A line
#   with no fields yields a single space.
sub tokenize_line {
    my ($line) = @_;

    my @fields = split ' ', $line;

    # A blank line has no first field; emit an empty one rather than
    # concatenating undef.
    my $first = @fields ? shift @fields : '';
    my $out   = $first . ' ';

    for my $token (@fields) {
        if (   $token =~ /\[.*\]/
            || $token =~ /\<.*\>/
            || $token =~ /!SIL/ )
        {
            # Bracketed markers and !SIL are passed through untouched.
            $out .= " $token";
        }
        else {
            # Put a space in front of every CJK ideograph.
            ( my $spaced = $token ) =~ s/(\p{InCJK_Unified_Ideographs})/ $1/g;
            $out .= $spaced;
        }
        $out .= ' ';
    }

    return $out;
}

# Filter loop: tokenize every input line and write it out.
sub main {
    while ( my $line = <> ) {
        # Terminate each record with a newline so the output keeps one
        # line per input line.
        print tokenize_line($line), "\n";
    }
    return;
}

# Run only when executed as a script, so the file can also be loaded
# (e.g. by a test) without consuming STDIN.
main() unless caller();