hkust_word2ch_tran.pl 3.91 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140


#!/usr/bin/env perl
# Copyright 2013  Hong Kong University of Science and Technology (Author: Ricky Chan Ho Yin) 
#                 
# Apache 2.0.
#
# A script to convert Kaldi Chinese words transcription to Chinese characters transcription.
# This is helpful for Chinese character error rate scoring. 
# 
# If no option is applied, by default the script converts the Chinese words transcription to Chinese characters transcription \
# by assuming the input Chinese words/characters are 3 bytes UTF8 code.
# Continuous English/ASCII characters without space are treated as single token.
# 
# When --useword2charmap option is applied, an input Chinese words to Chinese characters mapping table \
# (e.g. a word2char_map like "195k_chinese_word2char_map") is used for converting the corresponding Chinese words \
# to separate Chinese characters.
#
# When --encodeoutput option is applied, the script runs as in the default mode (no option) except that the \
# output Chinese characters are in a readable encoded format. The output Chinese characters are encoded in \
# the same way as in the open-source HTK toolkit from the Cambridge University Engineering Department.

use POSIX();

# Print the command-line usage summary and terminate the script.
#
# This is only called for invalid invocations (wrong argument count or an
# unrecognised option), so the text goes to STDERR -- STDOUT carries the
# converted transcription and must not be polluted -- and the process exits
# with a non-zero status so that calling shell scripts can detect the error.
#
# Takes no arguments and never returns.
sub printUsage {
  print STDERR "usage: perl hkust_word2ch_tran.pl [--useword2charmap chinese_word2char_map|--encodeoutput] tran_file \n";
  print STDERR "e.g. perl hkust_word2ch_tran.pl tran_file \n";
  print STDERR "e.g. perl hkust_word2ch_tran.pl --useword2charmap 195k_chinese_word2char_map tran_file \n";
  print STDERR "e.g. perl hkust_word2ch_tran.pl --encodeoutput tran_file \n";
  exit(1);
}

# Encode a single byte as a readable escape: a backslash followed by the
# byte's value written as exactly three octal digits (e.g. 0xE4 -> \344).
#
# Parameter: a string; only its first character is looked at (via ord).
# Returns:   the four-character escape string.
sub encodeByteCharacter {
  my ($byte) = @_;
  # Three octal digits can hold 9 bits, so mask to 0777 to keep the result
  # at a fixed width whatever ord() returns.
  return sprintf("\\%03o", ord($byte) & 0777);
}

# Command-line handling. The three accepted forms are told apart purely by
# the number of arguments:
#   1 argument : tran_file
#   2 arguments: --encodeoutput tran_file
#   3 arguments: --useword2charmap chinese_word2char_map tran_file
# Anything else prints the usage text and stops.
printUsage() if (@ARGV < 1 || @ARGV > 3);

# Mode flags read by the rest of the script; both off means default mode.
$useMapping      = 0;
$useEncodeoutput = 0;

if (@ARGV == 1) {
  $tranfile = $ARGV[0];
}
elsif (@ARGV == 2) {
  printUsage() unless ($ARGV[0] eq "--encodeoutput");
  ($useEncodeoutput, $tranfile) = (1, $ARGV[1]);
}
else {
  printUsage() unless ($ARGV[0] eq "--useword2charmap");
  $useMapping = 1;
  ($word2charfile, $tranfile) = @ARGV[1, 2];
}

# if Chinese word to character map is provided, read it

# Read a Chinese word to character map file.
#
# Each non-blank line has the form "word char1 char2 ...", separated by
# whitespace. Blank lines are skipped and leading whitespace is ignored.
#
# Parameter: path of the map file.
# Returns:   a flat key/value list (assign it to a hash) mapping each word
#            to its characters, each character preceded by one space, e.g.
#            "AB A B" yields AB => " A B". A word with no characters maps
#            to the empty string.
# Dies if the file cannot be opened.
sub _readWord2CharMap {
  my ($map_file) = @_;
  my %chars_of;

  # 3-arg open with a lexical handle: the file name can never be taken as
  # an open mode, and the handle cannot clash with other handles in the file.
  open(my $map_fh, '<', $map_file)
    or die("Can't open Chinese word to char map: ".$map_file.": $!\n");
  while (my $entry = <$map_fh>) {
    chomp($entry);
    # split(' ') drops leading whitespace, so the word is always field 0.
    my ($word, @chars) = split(' ', $entry);
    next unless defined($word);
    $chars_of{$word} = join("", map { " ".$_ } @chars);
  }
  close($map_fh);

  return %chars_of;
}

if($useMapping) {
  %word2charlist = _readWord2CharMap($word2charfile);
}

# process kaldi transcription

# Split one transcription word (a raw byte string) into output tokens.
#
# - A maximal run of printable ASCII bytes (0x20-0x7E) is one token, so
#   continuous English/ASCII text stays together.
# - Any other byte starts a multi-byte character that becomes its own token.
#   Its length is taken from the UTF-8 lead byte (2, 3 or 4 bytes; Chinese
#   characters are normally 3) and is clamped to the end of the word, so a
#   truncated sequence never reads past the available bytes. A byte that is
#   not a valid lead byte is emitted as a one-byte token.
#
# Parameters: $word   - the word to split
#             $encode - true to replace every byte of a multi-byte character
#                       by the escape returned by encodeByteCharacter()
# Returns:    the list of tokens, in order.
sub _wordToCharTokens {
  my ($word, $encode) = @_;
  my @bytes = split(//, $word);
  my @tokens;
  my $in_ascii_run = 0;
  my $pos = 0;

  while ($pos < @bytes) {
    my $code = ord($bytes[$pos]);

    # Explicit range test instead of POSIX::isprint(), which was removed
    # from the POSIX module in Perl 5.24.
    if ($code >= 0x20 && $code <= 0x7e) {
      if ($in_ascii_run) {
        $tokens[-1] .= $bytes[$pos];
      }
      else {
        push(@tokens, $bytes[$pos]);
        $in_ascii_run = 1;
      }
      $pos++;
      next;
    }

    ## here we find a multi-byte (e.g. chinese) character
    my $len = 1;
    if    ($code >= 0xc0 && $code <= 0xdf) { $len = 2; }
    elsif ($code >= 0xe0 && $code <= 0xef) { $len = 3; }
    elsif ($code >= 0xf0 && $code <= 0xf7) { $len = 4; }

    my $last = $pos + $len - 1;
    $last = $#bytes if ($last > $#bytes);

    my @char_bytes = @bytes[$pos .. $last];
    if ($encode) {
      ## readable encoded format, one escape per byte
      @char_bytes = map { encodeByteCharacter($_) } @char_bytes;
    }
    push(@tokens, join("", @char_bytes));

    $pos = $last + 1;
    $in_ascii_run = 0;
  }

  return @tokens;
}

# 3-arg open with a lexical handle: the file name can never be taken as an
# open mode, and the failure reason ($!) is reported.
open(my $tran_fh, '<', $tranfile)
  or die("Can't open transcription file ".$tranfile.": $!\n");
while (my $tran_line = <$tran_fh>) {
  chomp($tran_line);
  my @fields = split(/\s+/, $tran_line);

  ## utt_id (an empty input line yields an empty output line)
  my $utt_id = shift(@fields);
  print(defined($utt_id) ? $utt_id : "");

  ## utt_character_word
  for my $word (@fields) {
    if ($useMapping) {
      # Mapped values already start with a space; unknown words are
      # passed through unchanged.
      if (exists($word2charlist{$word})) {
        print $word2charlist{$word};
      }
      else {
        print " ".$word;
      }
    }
    else {
      for my $token (_wordToCharTokens($word, $useEncodeoutput)) {
        print " ".$token;
      }
    }
  }
  print "\n";
}
close($tran_fh);