Blame view

egs/aidatatang_200zh/s5/local/create_oov_char_lexicon.pl 1.14 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
  #!/usr/bin/env perl
  # Copyright 2016 Alibaba Robotics Corp. (Author: Xingyu Na)
  #
  # A script for char-based Chinese OOV lexicon generation.
  #
  # Input 1: char-based dictionary, example
  # CHAR1 ph1 ph2
  # CHAR2 ph3
  # CHAR3 ph2 ph4
  #
  # Input 2: OOV word list, example
  # WORD1
  # WORD2
  # WORD3
  #
  # where WORD1 is in the format of "CHAR1CHAR2".
  #
  # Output: OOV lexicon, in the format of normal lexicon
  
  use strict;
  use warnings;
  use utf8;

  # Builds a lexicon for out-of-vocabulary (OOV) words by concatenating the
  # pronunciations of the individual characters of each word.
  #
  # Arguments (exactly two):
  #   chardict    - one entry per line: a character followed by its phone(s),
  #                 whitespace-separated, e.g. "CHAR1 ph1 ph2"
  #   oovwordlist - one OOV word per line, e.g. "CHAR1CHAR2"
  #
  # Output (STDOUT): one line per word, consisting of the word followed by the
  # pronunciation of each of its characters, all separated by single spaces.
  #
  # Exit status: non-zero on a usage error or when an input file cannot be
  # opened.

  if (@ARGV != 2) {
    print STDERR "usage: perl create_oov_char_lexicon.pl chardict oovwordlist > oovlex\n\n";
    print STDERR "### chardict: a dict in which each line contains the pronunciation of one Chinese char\n";
    print STDERR "### oovwordlist: OOV word list\n";
    print STDERR "### oovlex: output OOV lexicon\n";
    # A usage error is a failure; report it to the calling shell script.
    exit 1;
  }

  my ($dict_path, $words_path) = @ARGV;

  # Input is decoded into characters, so output must be encoded again;
  # otherwise printing the words triggers "Wide character" warnings.
  binmode(STDOUT, ":encoding(utf8)");
  binmode(STDERR, ":encoding(utf8)");

  # Maps a single character to its full pronunciation (all phones joined by
  # single spaces). If a character occurs several times in the dictionary the
  # last entry wins.
  my %prons;

  # Three-argument open: the file name is never interpreted as a mode or pipe.
  open(my $dict_fh, "<:encoding(utf8)", $dict_path)
    || die("Can't open dict " . $dict_path . "\n");
  while (my $entry = <$dict_fh>) {
    chomp $entry;
    my ($char, @phones) = split(" ", $entry);
    # Skip blank lines and entries that carry no pronunciation at all.
    next unless defined $char && @phones;
    # Keep every phone of the entry, not just the first one.
    $prons{$char} = join(" ", @phones);
  }
  close $dict_fh;

  open(my $words_fh, "<:encoding(utf8)", $words_path)
    || die("Can't open oov word list " . $words_path . "\n");
  while (my $word = <$words_fh>) {
    chomp $word;
    # An empty line would only yield an empty lexicon entry.
    next if $word eq "";

    my @fields = ($word);
    foreach my $char (split("", $word)) {
      if (defined $prons{$char}) {
        push @fields, $prons{$char};
      }
      else {
        # Keep the (empty) slot so the output layout is unchanged, but make
        # the gap visible instead of failing silently.
        print STDERR "create_oov_char_lexicon.pl: no pronunciation for character '$char' in word '$word'\n";
        push @fields, "";
      }
    }
    print join(" ", @fields), "\n";
  }
  close $words_fh;