hkust_word2ch_tran.pl
3.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env perl
# Copyright 2013 Hong Kong University of Science and Technology (Author: Ricky Chan Ho Yin)
#
# Apache 2.0.
#
# A script to convert Kaldi Chinese words transcription to Chinese characters transcription.
# This is helpful for Chinese character error rate scoring.
#
# If no option is applied, by default the script converts the Chinese words transcription to Chinese characters transcription \
# by assuming the input Chinese words/characters are 3 bytes UTF8 code.
# Continuous English/ASCII characters without space are treated as single token.
#
# When --useword2charmap option is applied, an input Chinese words to Chinese characters mapping table \
# (e.g. a word2char_map like "195k_chinese_word2char_map") is used for converting the corresponding Chinese words \
# to separate Chinese characters.
#
# When --encodeoutput option is applied, the script runs like default mode w/o applying option except the \
# output Chinese characters are in readable encoded format. The output Chinese characters are encoded in a way \
# the same as the opensource HTK toolkit from the Cambridge University Engineering Department.
use POSIX();
# Print the command-line usage summary and terminate the script.
#
# Takes no arguments and never returns. Every call site in this script is
# an argument-validation failure, so the text goes to STDERR (STDOUT carries
# the converted transcription and is normally redirected into a file) and
# the process exits with a non-zero status so that calling shell scripts
# can detect the misuse.
sub printUsage {
    print STDERR "usage: perl hkust_word2ch_tran.pl [--useword2charmap chinese_word2char_map|--encodeoutput] tran_file \n";
    print STDERR "e.g. perl hkust_word2ch_tran.pl tran_file \n";
    print STDERR "e.g. perl hkust_word2ch_tran.pl --useword2charmap 195k_chinese_word2char_map tran_file \n";
    print STDERR "e.g. perl hkust_word2ch_tran.pl --encodeoutput tran_file \n";
    exit 1;
}
# Encode a single byte as a backslash followed by three octal digits,
# e.g. the byte 0xE4 becomes the four characters \344. (The file header
# describes this as the readable encoding used by the HTK toolkit.)
#
# Parameter: a string; only the ordinal of its first character is used.
# Returns:   the 4-character escape. Only the low 9 bits of the ordinal
#            are represented, so the result is always exactly 3 digits.
#
# All intermediate values are lexical, so calling this sub no longer
# overwrites package globals.
sub encodeByteCharacter {
    my ($byte) = @_;
    my $code = ord($byte);
    return sprintf("\\%d%d%d", ($code >> 6) & 7, ($code >> 3) & 7, $code & 7);
}
# Command-line handling. The number of arguments selects the mode:
#   1 argument : tran_file                               (default mode)
#   2 arguments: --encodeoutput tran_file
#   3 arguments: --useword2charmap word2char_map tran_file
# Any other count, or an unexpected option name, prints the usage text
# via printUsage(), which terminates the script.
my $argCount = scalar(@ARGV);
printUsage() unless $argCount >= 1 && $argCount <= 3;

# Mode flags and file names read by the rest of the script.
$useMapping      = 0;
$useEncodeoutput = 0;

if ($argCount == 3) {
    printUsage() if $ARGV[0] ne "--useword2charmap";
    ($useMapping, $word2charfile, $tranfile) = (1, $ARGV[1], $ARGV[2]);
}
elsif ($argCount == 2) {
    printUsage() if $ARGV[0] ne "--encodeoutput";
    ($useEncodeoutput, $tranfile) = (1, $ARGV[1]);
}
else {
    $tranfile = $ARGV[0];
}
# If a Chinese word to character map is provided, read it.
#
# Each non-blank line of the map is "word char1 char2 ...". The word is
# stored in %word2charlist with its characters pre-joined as
# " char1 char2 ..." (note the leading space) so that the transcription
# loop can print the value directly after the previous token.
if ($useMapping) {
    %word2charlist = ();
    # Three-argument open with a lexical handle: the file name is taken
    # literally (no mode or pipe characters are interpreted) and the handle
    # cannot collide with the one used for the transcription file.
    open(my $mapfh, '<', $word2charfile)
        || die("Can't open Chinese word to char map: ".$word2charfile." ($!)\n");
    while (my $entry = <$mapfh>) {
        chomp $entry;
        # split(' ') ignores leading whitespace, so an indented line no
        # longer produces an empty key with the word pushed into its value.
        my ($word, @chars) = split(' ', $entry);
        next unless defined $word;    # blank line: nothing to record
        $word2charlist{$word} = join("", map { " " . $_ } @chars);
    }
    close($mapfh);
}
# Return the number of bytes in the UTF-8 sequence introduced by the given
# lead byte: 2 for 0xC0-0xDF, 3 for 0xE0-0xEF, 4 for 0xF0-0xF7. Any other
# byte (control character, stray continuation byte, invalid lead byte) is
# reported as a 1-byte unit so the scanner always advances and never
# swallows the bytes that follow it.
sub utf8SequenceLength {
    my ($leadByte) = @_;
    my $code = ord($leadByte);
    return 2 if $code >= 0xC0 && $code <= 0xDF;
    return 3 if $code >= 0xE0 && $code <= 0xEF;
    return 4 if $code >= 0xF0 && $code <= 0xF7;
    return 1;
}

# process kaldi transcription
#
# Every input line is "utt_id word1 word2 ...". The utt_id is echoed
# unchanged; each word is then expanded, and every emitted token is
# preceded by a single space:
#   * with $useMapping, a word found in %word2charlist is replaced by its
#     stored expansion (which already starts with a space); an unknown
#     word is printed as is;
#   * otherwise the word is scanned byte by byte: a run of printable ASCII
#     bytes forms one token, and each multi-byte UTF-8 character forms its
#     own token, printed raw or, with $useEncodeoutput, as backslash-octal
#     escapes produced by encodeByteCharacter().
#
# "-" is still accepted as the transcription file name and means STDIN,
# which is what the former two-argument open did with it.
my $tranfh;
if ($tranfile eq '-') {
    $tranfh = \*STDIN;
}
else {
    open($tranfh, '<', $tranfile)
        || die("Can't open transcription file ".$tranfile." ($!)\n");
}
while (my $utt = <$tranfh>) {
    chomp $utt;
    my ($uttId, @words) = split(/\s+/, $utt);
    ## utt_id (an empty input line yields an empty output line)
    my $outLine = defined($uttId) ? $uttId : "";
    ## utt_character_word
    for my $word (@words) {
        if ($useMapping) {
            $outLine .= exists($word2charlist{$word}) ? $word2charlist{$word} : " ".$word;
            next;
        }
        my @bytes = split(//, $word);
        my $inAsciiRun = 0;    # true while inside a run of printable ASCII
        my $pos = 0;
        while ($pos < @bytes) {
            my $byte = $bytes[$pos];
            # Explicit printable-ASCII range instead of POSIX::isprint, which
            # was removed from the POSIX module in Perl 5.24 and depended on
            # the process locale.
            if ($byte =~ /\A[\x20-\x7E]\z/) {
                $outLine .= " " unless $inAsciiRun;
                $outLine .= $byte;
                $inAsciiRun = 1;
                $pos++;
                next;
            }
            ## here we find a non-ASCII (e.g. chinese) character
            my $last = $pos + utf8SequenceLength($byte) - 1;
            # Clamp a truncated sequence to the bytes actually present, so
            # nothing beyond the end of the token is read or encoded.
            $last = $#bytes if $last > $#bytes;
            my @seq = @bytes[$pos .. $last];
            if ($useEncodeoutput) {
                ## print the character's bytes in readable encoded format
                $outLine .= " " . join("", map { encodeByteCharacter($_) } @seq);
            }
            else {
                ## print the raw utf8 character
                $outLine .= " " . join("", @seq);
            }
            $pos = $last + 1;
            $inAsciiRun = 0;
        }
    }
    print $outLine, "\n";
}
close($tranfh);