character_tokenizer 630 Bytes
#!/usr/bin/env perl
# Copyright 2012-2014  Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0
use utf8;

use open qw(:encoding(utf8));
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# Strictures are enabled here (lexically, for the rest of the file) so that
# the tokenizer below is checked for undeclared variables and undefined values.
use strict;
use warnings;

# tokenize_line($line)
#
# Converts one input line into its character-tokenized form and returns the
# resulting string, including the trailing newline.
#
#   * The line is split on whitespace.  The first field is copied through
#     unchanged (presumably the utterance id of a Kaldi-style text file --
#     confirm against the caller), followed by a single space.
#   * A field that contains a "[...]" or "<...>" span, or the string "!SIL",
#     anywhere inside it is emitted whole, preceded by a space.
#   * In every other field, each character in the CJK Unified Ideographs
#     block gets a space inserted in front of it; all other characters are
#     copied as they are.
#   * Every field is followed by a space, so the output contains doubled and
#     trailing spaces.  That spacing is reproduced exactly as the script has
#     always produced it.
#
# A blank input line yields " \n".
#
# NOTE(review): a non-CJK character that directly follows a CJK character
# stays attached to it ("ab<CJK>cd" becomes "ab <CJK>cd").  Left as is because
# existing data may rely on it -- confirm whether that is intended.
sub tokenize_line {
  my ($line) = @_;

  my ($first_field, @fields) = split ' ', $line;

  # A blank line has no fields at all; fall back to the empty string so the
  # output stays " \n" without tripping an "uninitialized value" warning.
  $first_field = '' unless defined $first_field;

  my $out = "$first_field ";
  for my $field (@fields) {
    if ($field =~ /\[.*\]/ || $field =~ /<.*>/ || $field =~ /!SIL/) {
      # Tags and silence markers must not be broken into characters.
      $out .= " $field";
    }
    else {
      # Put a space in front of every CJK ideograph; leave the rest alone.
      (my $spaced = $field) =~ s/(\p{InCJK_Unified_Ideographs})/ $1/g;
      $out .= $spaced;
    }
    $out .= ' ';
  }

  return "$out\n";
}

# Main loop: read every line from the files named on the command line (or
# from STDIN when there are none) and print its tokenized form.
while (my $line = <>) {
  print tokenize_line($line);
}