character_tokenizer 630 Bytes
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32


#!/usr/bin/env perl
# Copyright 2012-2014  Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0
use strict;
use warnings;
use utf8;

use open qw(:encoding(utf8));
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# tokenize_line($line) -> string
#
# Rewrites one input line and returns the text to print for it.
#
#   * The line is split on whitespace.  The first field is copied through
#     unchanged and followed by a single space (presumably it is an
#     utterance/record key -- confirm against the callers of this script).
#   * Every remaining token that contains "[...]", "<...>" or "!SIL" is
#     emitted untouched, preceded by a space.
#   * In every other token a space is inserted in front of each character
#     belonging to the CJK Unified Ideographs block; all other characters are
#     copied as they are.
#   * Each token is followed by a space and the result always ends in "\n",
#     whether or not the input line had a trailing newline.
#
# The exact spacing (including the doubled spaces that result from the
# leading + trailing separators) is kept as-is on purpose so that the output
# stays byte-compatible with earlier versions of this script.
sub tokenize_line {
  my ($line) = @_;

  my @tokens = split " ", $line;

  # A blank line yields no fields; emit an empty first field instead of
  # concatenating undef (which would warn under "use warnings").
  my $head = @tokens ? shift @tokens : "";
  my $out  = $head . " ";

  foreach my $token (@tokens) {
    if ($token =~ /\[.*\]/ || $token =~ /\<.*\>/ || $token =~ /!SIL/) {
      # Bracketed/special token: keep it in one piece.
      $out .= " $token";
    } else {
      # Put a space in front of every CJK ideograph, leave the rest alone.
      (my $spaced = $token) =~ s/(\p{InCJK_Unified_Ideographs})/ $1/g;
      $out .= $spaced;
    }
    $out .= " ";
  }

  return $out . "\n";
}

# Act as a stdin/ARGV -> stdout filter only when executed directly, so that
# the file can also be loaded (e.g. from a test) without consuming any input.
unless (caller) {
  while (my $line = <>) {
    print tokenize_line($line);
  }
}

1;