egs/hkust/s5/local/ext/hkust_word2ch_tran.pl
#!/usr/bin/env perl

# Copyright 2013  Hong Kong University of Science and Technology (Author: Ricky Chan Ho Yin)
#
# Apache 2.0.
#
# A script to convert a Kaldi Chinese word transcription into a Chinese character transcription.
# This is helpful for Chinese character error rate scoring.
#
# If no option is given, the script converts the Chinese word transcription to a Chinese character \
# transcription by assuming the input Chinese words/characters are encoded as 3-byte UTF-8.
# Consecutive English/ASCII characters without spaces are treated as a single token.
#
# When the --useword2charmap option is given, an input Chinese word to Chinese character mapping table \
# (e.g. a word2char_map like "195k_chinese_word2char_map") is used to convert the corresponding Chinese \
# words into separate Chinese characters.
#
# When the --encodeoutput option is given, the script behaves like the default mode except that the \
# output Chinese characters are printed in a readable encoded format, the same encoding used by the \
# open-source HTK toolkit from the Cambridge University Engineering Department.

use POSIX();

sub printUsage {
  print "usage: perl hkust_word2ch_tran.pl [--useword2charmap chinese_word2char_map|--encodeoutput] tran_file\n";
  print "e.g. perl hkust_word2ch_tran.pl tran_file\n";
  print "e.g. perl hkust_word2ch_tran.pl --useword2charmap 195k_chinese_word2char_map tran_file\n";
  print "e.g. perl hkust_word2ch_tran.pl --encodeoutput tran_file\n";
  exit;
}

# Encode a single byte as a backslash followed by its 3-digit octal value (HTK-style escaping).
sub encodeByteCharacter {
  $enbc = "\\";
  $uchar = ord($_[0]);
  $encrypt1 = (($uchar>>6)&7)+'0';
  $encrypt2 = (($uchar>>3)&7)+'0';
  $encrypt3 = ($uchar&7)+'0';
  $enbc = $enbc."$encrypt1"."$encrypt2"."$encrypt3";
  return $enbc;
}

if(@ARGV < 1 || @ARGV > 3) {
  printUsage();
}

$useMapping=0;
$useEncodeoutput=0;
if(@ARGV == 2) {
  if($ARGV[0] ne "--encodeoutput") {
    printUsage();
  }
  $useEncodeoutput=1;
  $tranfile=$ARGV[1];
}
elsif(@ARGV == 3) {
  if($ARGV[0] ne "--useword2charmap") {
    printUsage();
  }
  $useMapping=1;
  $word2charfile=$ARGV[1];
  $tranfile=$ARGV[2];
}
else {
  $tranfile=$ARGV[0];
}

# If a Chinese word to character map is provided, read it.
if($useMapping) {
  %word2charlist=();
  open(INFILE, $word2charfile) || die("Can't open Chinese word to char map: ".$word2charfile."\n");
  while(<INFILE>) {
    chomp;
    @line=split(/\s+/);
    $a=$line[0];
    $b="";
    for($i=1; $i<scalar(@line); $i++) {
      $b=$b . " " . $line[$i];
    }
    $word2charlist{$a}=$b;
  }
  close(INFILE);
}

# Process the Kaldi transcription.
open(INFILE, $tranfile) || die("Can't open transcription file ".$tranfile."\n");
while(<INFILE>) {
  chomp;
  @line = split(/\s+/);

  ## utt_id
  print $line[0];

  ## utt_character_word
  for($i=1; $i<scalar(@line); $i++) {
    if($useMapping) {
      if(!exists($word2charlist{$line[$i]})) {
        print " ".$line[$i];
      }
      else {
        print $word2charlist{$line[$i]};
      }
    }
    else {
      @carray = split(//, $line[$i]);
      $wspace=0;
      $l=0;
      while($l<@carray) {
        $c = $carray[$l];
        if(POSIX::isprint($c)) {
          if($wspace) {
            print $c;
          }
          else {
            print " ".$c;
            $wspace=1;
          }
          $l=$l+1;
        }
        else {  ## here we have found a Chinese character
          if(!$useEncodeoutput) {  ## print the UTF-8 Chinese character, which should be 3 bytes
            print " ".$carray[$l].$carray[$l+1].$carray[$l+2];
          }
          else {  ## print the 3-byte UTF-8 Chinese character in readable encoded format
            $enbc1 = encodeByteCharacter($carray[$l]);
            $enbc2 = encodeByteCharacter($carray[$l+1]);
            $enbc3 = encodeByteCharacter($carray[$l+2]);
            print " ".$enbc1.$enbc2.$enbc3;
          }
          $l=$l+3;
          $wspace=0;
        }
      }
    }
  }
  print "\n";
}
close(INFILE);
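
# Illustrative example of the three modes (hypothetical utterance id and text,
# assuming the input is 3-byte UTF-8 as described above):
#
#   input line        : utt001 你好 HELLO 世界
#   default output    : utt001 你 好 HELLO 世 界
#   --encodeoutput    : utt001 \344\275\240 \345\245\275 HELLO \344\270\226 \347\225\214
#                       (each byte of a Chinese character becomes a backslash plus
#                        its 3-digit octal value, as produced by encodeByteCharacter)
#   --useword2charmap : words found in the mapping table are replaced by their mapped
#                       characters; tokens not in the table are passed through unchanged.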