#!/usr/bin/env perl
# character_tokenizer
# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0
use utf8;
use open qw(:encoding(utf8));
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
# Enable strictures from this point on; every variable below is lexical.
use strict;
use warnings;

# tokenize_line($line) -> string
#
# Splits $line on whitespace and rebuilds it as an output line:
#   * the first field is emitted unchanged, followed by one space
#     (presumably an utterance/record ID -- confirm against the caller);
#   * a later field containing "[...]", "<...>" or "!SIL" anywhere in it is
#     emitted whole, preceded by one space;
#   * in any other field, each character in the Unicode block
#     CJK Unified Ideographs gets a space inserted in front of it, while all
#     other characters are copied through unchanged;
#   * every field after the first is followed by one space.
# The result always ends with "\n".  A blank input line yields " \n", so the
# number of output lines equals the number of input lines.
sub tokenize_line {
    my ($line) = @_;

    my ($first_field, @tokens) = split ' ', $line;

    # A blank line has no fields at all; treat the missing first field as
    # the empty string instead of concatenating undef.
    my $out = (defined $first_field ? $first_field : '') . ' ';

    for my $token (@tokens) {
        if ($token =~ m{ \[ .* \] | < .* > | !SIL }x) {
            # Bracketed / angle-bracketed / !SIL tokens are never split.
            $out .= " $token";
        }
        else {
            # Put a space in front of every CJK ideograph; runs of other
            # characters stay glued together.
            (my $spaced = $token) =~ s/(\p{InCJK_Unified_Ideographs})/ $1/g;
            $out .= $spaced;
        }
        $out .= ' ';
    }

    return "$out\n";
}

# Main loop: transform each input line (files named in @ARGV, else STDIN)
# and write the result to STDOUT.
while (my $line = <>) {
    print tokenize_line($line);
}