Blame view

egs/hkust/s5/local/hkust_extract_subdict.pl 2.81 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
  #!/usr/bin/env perl
  # Copyright Hong Kong University of Science and Technology (Author: Ricky Chan) 2013.
  # 
  # A script for dictionary generation with an input dict and a wordlist 
  #
  # Example of the dict format (one pronunciation per line):
  # WORD1 ph1 ph2
  # WORD2 ph1 ph2 ph3
  # WORDX ph4
  # WORDY ph4 ph5
  # WORDZ ph3 ph1
  #
  # Example of the wordlist format (phrases of several words are supported):
  # WORD1
  # WORD2
  # WORDX WORDY 
  # WORDX WORDY WORDZ
  
  use strict;
  use warnings;

  # ---------------------------------------------------------------------------
  # Command line: dict wordlist [--spron]
  #   dict      pronunciation dictionary, "WORD ph1 ph2 ..." per line
  #   wordlist  one word or whitespace-separated phrase per line
  #   --spron   emit only the first (top) pronunciation of every word
  # A usage error exits with a non-zero status so calling shell scripts can
  # detect the failure.
  # ---------------------------------------------------------------------------
  if (@ARGV != 2 && @ARGV != 3) {
    printUsage();
    exit 1;
  }

  my $usespron = 0;
  if (@ARGV == 3) {
    if ($ARGV[2] ne "--spron") {
      printUsage();
      exit 1;
    }
    $usespron = 1;
  }

  my ($dictfile, $inputfile) = @ARGV[0, 1];

  # ---------------------------------------------------------------------------
  # Load the dictionary.
  # %dictionarylist maps a word to the list of its pronunciations in file
  # order.  Every pronunciation is stored with a space in front of each phone
  # (" ph1 ph2"); the output code below relies on that leading space.
  # ---------------------------------------------------------------------------
  my %dictionarylist;
  open(my $dict_fh, '<', $dictfile)
    or die("Can't open dict $dictfile: $!\n");
  while (my $entry = <$dict_fh>) {
    chomp $entry;
    # split(' ') ignores leading whitespace, so an indented entry does not
    # end up filed under an empty word.
    my ($word, @phones) = split(' ', $entry);
    next unless defined $word;    # blank line
    push @{ $dictionarylist{$word} }, join('', map { " $_" } @phones);
  }
  close($dict_fh);

  # ---------------------------------------------------------------------------
  # Look up every word/phrase of the wordlist and print its pronunciation(s).
  # ---------------------------------------------------------------------------
  open(my $list_fh, '<', $inputfile)
    or die("Can't open wordlist $inputfile: $!\n");
  while (my $phrase = <$list_fh>) {
    chomp $phrase;
    my @words = split(' ', $phrase);
    next unless @words;           # nothing to look up on an empty line

    ## single pronunciation handling
    # Output: "w1 w2 <TAB> p1 p2 ...", using the first pronunciation of
    # each word and " _NOT_FOUND_" for words missing from the dictionary.
    if ($usespron) {
      my $out = join('', map { "$_ " } @words) . "\t";
      for my $word (@words) {
        $out .= exists($dictionarylist{$word})
              ? $dictionarylist{$word}[0]
              : " _NOT_FOUND_";
      }
      print $out, "\n";
      next;
    }

    ## multiple pronunciations handling
    # Build the cross product of the pronunciations of all words in the
    # phrase, keeping dictionary order.  The first word is always $words[0]
    # (the original indexed with $0, the program name, which only worked
    # because that string usually numifies to 0).
    # The separators are kept exactly as before so the output format does
    # not change: pronunciations of later words are joined with an extra " ".
    my @pronlist = exists($dictionarylist{ $words[0] })
                 ? @{ $dictionarylist{ $words[0] } }
                 : ('_NOT_FOUND_');

    for my $word (@words[1 .. $#words]) {
      if (!exists($dictionarylist{$word})) {
        $_ .= " _NOT_FOUND_" for @pronlist;
        next;
      }
      my @alternatives = @{ $dictionarylist{$word} };
      @pronlist = map {
        my $prefix = $_;
        map { $prefix . " " . $_ } @alternatives;
      } @pronlist;
    }

    # One output line per pronunciation variant: "phrase <TAB> pronunciation".
    print $phrase . "\t" . $_ . "\n" for @pronlist;
  }
  close($list_fh);
  
  # Print the command-line synopsis and a short note about --spron.
  # Takes no arguments.  The text goes to STDERR so that it can never be
  # mixed into the dictionary this script writes to STDOUT; the caller is
  # responsible for exiting afterwards.
  sub printUsage {
      # Newlines are written as "\n" escapes: a raw line break inside the
      # literal would also print the source indentation that follows it.
      print STDERR "usage: perl hkust_extract_subdict.pl dict wordlist [--spron]\n\n";
      print STDERR "### this script handle multiple pronunciations for dict in default\n";
      print STDERR "### if you want to extract single(top) pronunciation from dict, please use the option --spron\n\n";
      return;
  }