hkust_extract_subdict.pl
2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env perl
# Copyright Hong Kong University of Science and Technology (Author: Ricky Chan) 2013.
#
# Extracts a sub-dictionary: looks up every entry of a wordlist in an input dict and prints its pronunciation(s)
#
# Example of the dict format (a word followed by its phones; one pronunciation per line):
# WORD1 ph1 ph2
# WORD2 ph1 ph2 ph3
# WORDX ph4
# WORDY ph4 ph5
# WORDZ ph3 ph1
#
# Example of the wordlist format (each line is a single word or a phrase of several words):
# WORD1
# WORD2
# WORDX WORDY
# WORDX WORDY WORDZ
# ---- Command-line handling ----
# Expected invocation: dict wordlist [--spron]
#   $dictfile  - path of the pronunciation dictionary
#   $inputfile - path of the wordlist (one word or phrase per line)
#   $usespron  - 1 when --spron was given, 0 otherwise
# A wrong argument count, or a third argument other than --spron, is a usage
# error: print the help text and exit with a non-zero status so that a calling
# script can detect the failure (a bare "exit" would report success).
if (@ARGV != 2 && @ARGV != 3) {
    printUsage();
    exit 1;
}

$usespron = 0;
if (@ARGV == 3) {
    if ($ARGV[2] ne "--spron") {
        printUsage();
        exit 1;
    }
    $usespron = 1;
}

$dictfile  = $ARGV[0];
$inputfile = $ARGV[1];
# ---- Load the pronunciation dictionary ----
# %dictionarylist maps each word to an array ref holding its pronunciations in
# file order.  A pronunciation is stored as its phones, each preceded by one
# space (e.g. " ph1 ph2"); a word listed without phones gets the empty string.
%dictionarylist = ();

# Three-argument open with a lexical handle: the file name is taken literally
# and can never be interpreted as an open mode or a pipe.
open(my $dict_fh, '<', $dictfile) || die("Can't open dict ".$dictfile."\n");
while (my $entry = <$dict_fh>) {
    chomp $entry;
    # split ' ' skips leading whitespace, so an indented entry still yields
    # the word as its first field instead of an empty string.
    my ($word, @phones) = split(' ', $entry);
    # A blank line has no word; do not create a bogus "" entry for it.
    next unless defined $word;
    push(@{ $dictionarylist{$word} }, join('', map(" $_", @phones)));
}
close($dict_fh);
# ---- Look up every wordlist entry and print its pronunciation(s) ----
# Each non-blank line of the wordlist is a word or a phrase of several words.
# Reads $inputfile, $usespron and %dictionarylist; writes to STDOUT.
#
# With --spron ($usespron == 1) one line is printed per phrase:
#   every word followed by a space, a tab, then the first stored
#   pronunciation of each word (" _NOT_FOUND_" for a word missing from
#   %dictionarylist).
# Otherwise one line is printed per combination of the words' pronunciations:
#   the phrase exactly as read, a tab, then the combined pronunciation.
#   A missing word contributes "_NOT_FOUND_" to every combination.
open(my $wordlist_fh, '<', $inputfile) || die("Can't open wordlist ".$inputfile."\n");
while (my $phrase = <$wordlist_fh>) {
    chomp $phrase;
    # split ' ' skips leading whitespace, so an indented phrase does not
    # produce an empty first "word" that would be reported as not found.
    my @words = split(' ', $phrase);
    next if !@words;    # blank line: nothing to print

    ## single pronunciation handling
    if ($usespron == 1) {
        print join('', map("$_ ", @words)), "\t";
        foreach my $word (@words) {
            if (exists($dictionarylist{$word})) {
                # Stored pronunciations already start with a space.
                print $dictionarylist{$word}[0];
            }
            else {
                print " _NOT_FOUND_";
            }
        }
        print "\n";
        next;
    }

    ## multiple pronunciations handling
    # Seed the list with the pronunciations of the first word.  The index is
    # the literal 0: indexing with $0 (the program name) only works while the
    # script path happens to numify to zero.
    my @pronlist;
    if (exists($dictionarylist{ $words[0] })) {
        @pronlist = @{ $dictionarylist{ $words[0] } };
    }
    else {
        @pronlist = ('_NOT_FOUND_');
    }

    # Extend every pronunciation collected so far with each pronunciation of
    # the next word.  Earlier prefixes stay ahead of later ones, and for one
    # prefix the variants follow dictionary order.
    foreach my $word (@words[1 .. $#words]) {
        my @suffixes;
        if (exists($dictionarylist{$word})) {
            # The extra separator is kept so the output format is unchanged.
            @suffixes = map(" " . $_, @{ $dictionarylist{$word} });
        }
        else {
            @suffixes = (" _NOT_FOUND_");
        }

        my @extended;
        foreach my $prefix (@pronlist) {
            push(@extended, map($prefix . $_, @suffixes));
        }
        @pronlist = @extended;
    }

    foreach my $pron (@pronlist) {
        print $phrase."\t".$pron."\n";
    }
}
close($wordlist_fh);
# Print the command-line help text to standard output.
# Takes no arguments; the caller decides whether to exit afterwards.
sub printUsage {
    my @usage_text = (
        "usage: perl hkust_extract_subdict.pl dict wordlist [--spron]\n\n",
        "### this script handle multiple pronunciations for dict in default\n",
        "### if you want to extract single(top) pronunciation from dict, please use the option --spron\n\n",
    );
    print join('', @usage_text);
}