#!/usr/bin/perl -w
# make_lexicon_fst.pl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# makes lexicon FST (no pron-probs involved).
# Command-line handling.
#   lexicon.txt       : lexicon file; each line is a word followed by its phones.
#   silprob           : probability of optional silence (0 disables it).
#   silphone          : phone symbol used for the optional silence.
#   sil_disambig_sym  : optional disambiguation symbol emitted after silence.
# The variables set here ($lexfn, $silprob, $silphone, $sildisambig, $silcost,
# $nosilcost) and the filehandle L are package globals used by the rest of
# the script.
if(@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {
  die "Usage: make_lexicon_fst.pl lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt"
}
$lexfn = shift @ARGV;
if(@ARGV == 0) {
  # Only the lexicon was given: no optional silence.
  $silprob = 0.0;
} elsif (@ARGV == 2) {
  ($silprob,$silphone) = @ARGV;
} else {
  ($silprob,$silphone,$sildisambig) = @ARGV;
}
if($silprob != 0.0) {
  # Both log() calls below need a strictly positive argument, so the
  # probability must lie strictly between 0 and 1.  Check explicitly rather
  # than letting log() abort with its own, less helpful, message.
  $silprob > 0.0 || die "Sil prob cannot be negative";
  $silprob < 1.0 || die "Sil prob cannot be >= 1.0";
  $silcost = -log($silprob);            # cost of taking the optional silence.
  $nosilcost = -log(1.0 - $silprob);    # cost of skipping it.
}
# Two-argument open is kept deliberately: the handle name L is shared with the
# code that reads the lexicon, and "<-" continues to mean standard input.
open(L, "<$lexfn") || die "Error opening lexicon $lexfn: $!";
sub is_sil {
  # Return true (1) if the phone sequence passed in @_ means silence,
  # false (0) otherwise.
  #
  # A sequence counts as silence when it is either
  #   ( $silphone )                     -- the silence phone on its own, or
  #   ( "#N", $silphone, "#M" )         -- the silence phone surrounded by
  #                                        disambiguation symbols, i.e. "#"
  #                                        followed by one or more digits.
  # $silphone is the global set from the command line.
  my @phones = @_;
  return 1 if @phones == 1 && $phones[0] eq $silphone;
  # Both the first AND the last symbol must be disambiguation symbols.
  return 1 if @phones == 3 && $phones[1] eq $silphone &&
              $phones[0] =~ m/^\#\d+$/ &&
              $phones[2] =~ m/^\#\d+$/;
  return 0;
}
# Emit the lexicon FST on standard output, one arc per line:
#   source-state <TAB> dest-state <TAB> phone <TAB> word-or-<eps> [<TAB> cost]
# followed by a final-state line "state <TAB> 0".
# The word label is placed on the first arc of each pronunciation; the
# remaining arcs of that pronunciation carry <eps>.
if( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
  $loopstate = 0;
  $nextstate = 1; # next unallocated state.
  while(<L>) {
    @A = split(" ", $_);
    $w = shift @A;
    $s = $loopstate;
    $word_or_eps = $w;
    while (@A > 0) {
      $p = shift @A;
      if(@A > 0) {
        $ns = $nextstate++;   # intermediate phone: allocate a fresh state.
      } else {
        $ns = $loopstate;     # last phone: return to the loop state.
      }
      print "$s\t$ns\t$p\t$word_or_eps\n";
      $word_or_eps = "<eps>";
      $s = $ns;
    }
  }
  print "$loopstate\t0\n"; # final-cost.
} else { # have silence probs.
  $startstate = 0;
  $loopstate = 1;
  $silstate = 2; # state from where we go to loopstate after emitting silence.
  print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
  if (!defined $sildisambig) {
    print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
    print "$silstate\t$loopstate\t$silphone\t<eps>\n"; # no cost.
    $nextstate = 3;
  } else {
    # With a silence disambiguation symbol, silence passes through an extra
    # state that emits that symbol before reaching the loop state.
    $disambigstate = 3;
    $nextstate = 4;
    print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
    print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
    print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
  }
  while(<L>) {
    @A = split(" ", $_);
    $w = shift @A;
    # Classify the pronunciation now, while @A still holds all of its phones.
    # The loop below consumes @A, so by the time the last phone is reached
    # the array is empty and could no longer be tested.
    my $pron_is_sil = is_sil(@A);
    $s = $loopstate;
    $word_or_eps = $w;
    while (@A > 0) {
      $p = shift @A;
      if(@A > 0) {
        $ns = $nextstate++;
        print "$s\t$ns\t$p\t$word_or_eps\n";
        $word_or_eps = "<eps>";
        $s = $ns;
      } elsif (!$pron_is_sil) {
        # This is non-deterministic but relatively compact,
        # and avoids epsilons.
        print "$s\t$loopstate\t$p\t$word_or_eps\t$nosilcost\n";
        print "$s\t$silstate\t$p\t$word_or_eps\t$silcost\n";
      } else {
        # no point putting opt-sil after silence word.
        print "$s\t$loopstate\t$p\t$word_or_eps\n";
      }
    }
  }
  print "$loopstate\t0\n"; # final-cost.
}