Blame view
egs/aidatatang_200zh/s5/local/create_oov_char_lexicon.pl
1.14 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
#!/usr/bin/env perl
# Copyright 2016 Alibaba Robotics Corp. (Author: Xingyu Na)
#
# A script for char-based Chinese OOV lexicon generation.
#
# Input 1: char-based dictionary, example
#   CHAR1 ph1 ph2
#   CHAR2 ph3
#   CHAR3 ph2 ph4
#
# Input 2: OOV word list, example
#   WORD1
#   WORD2
#   WORD3
#
# where WORD1 is in the format of "CHAR1CHAR2".
#
# Output: OOV lexicon, in the format of normal lexicon: one line per OOV
# word, holding the word followed by the pronunciation of each of its
# characters, all separated by single spaces.
#
# Both inputs are read as UTF-8 and the lexicon is written to STDOUT as UTF-8.
# If a character occurs more than once in the dictionary, the last entry wins.
# A character that has no dictionary entry contributes an empty pronunciation
# and is reported on STDERR.

use strict;
use warnings;
use utf8;

if (@ARGV != 2) {
  print STDERR "usage: perl create_oov_char_lexicon.pl chardict oovwordlist > oovlex\n";
  print STDERR "### chardict: a dict in which each line contains the pronunciation of one Chinese char\n";
  print STDERR "### oovwordlist: OOV word list\n";
  print STDERR "### oovlex: output OOV lexicon\n";
  exit 1;  # wrong usage is an error, so do not report success to the caller
}

my ($dict_path, $words_path) = @ARGV;

# The inputs are decoded into characters, so STDOUT needs a matching layer;
# otherwise Perl emits "Wide character" warnings and writes code points
# below 256 as Latin-1 bytes instead of UTF-8.
binmode(STDOUT, ":encoding(utf8)");

# Map each character to its complete pronunciation (all phones on the line).
my %pron_of;
open(my $dict_fh, '<:encoding(utf8)', $dict_path)
  or die "Can't open dict $dict_path: $!\n";
while (my $line = <$dict_fh>) {
  chomp $line;
  my ($char, @phones) = split(" ", $line);
  # Skip blank lines and entries that carry no pronunciation at all.
  next unless defined $char && @phones;
  $pron_of{$char} = join(" ", @phones);
}
close($dict_fh);

# Spell every OOV word out character by character.
open(my $words_fh, '<:encoding(utf8)', $words_path)
  or die "Can't open oov word list $words_path: $!\n";
while (my $word = <$words_fh>) {
  chomp $word;
  my @fields = ($word);
  for my $char (split(//, $word)) {
    my $pron = $pron_of{$char};
    if (!defined $pron) {
      # Keep the output shape (an empty field) but make the gap visible.
      # The code point is printed rather than the character itself because
      # STDERR has no encoding layer.
      warn sprintf("%s line %d: no pronunciation for character U+%04X\n",
                   $words_path, $., ord($char));
      $pron = "";
    }
    push @fields, $pron;
  }
  print join(" ", @fields), "\n";
}
close($words_fh);