isolate_phones.pl
1.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env perl
# Once the phonetic representation for words has been generated by the LDC
# lexicon, this script converts it into a Kaldi-compatible format.
# In addition, it extends the list of phonemes to consider, based on the
# orthographic representations of those words which do not have stressed vowels.
use strict;
use warnings;
use utf8;

# Usage: isolate_phones.pl <tmpdir>
#
# Reads:
#   <tmpdir>/phones              one known phone per line
#   <tmpdir>/lexicon_raw         one unsegmented pronunciation per line;
#                                lines starting with '#' are comments
# Writes:
#   <tmpdir>/lexicon_one_column  each pronunciation split into
#                                space-separated phones (comment lines are
#                                copied through unchanged)
#   <tmpdir>/phones_extended     the known phones plus every single
#                                character that had to be added while
#                                splitting, one per line, sorted

# Split one pronunciation string into phones and return them joined by
# single spaces, with leading and trailing whitespace removed.
#
#   $text    the pronunciation, without its trailing newline
#   $phones  hash ref of known phones; any single character that is not
#            already a key is added to it as a side effect
sub split_into_phones {
    my ($text, $phones) = @_;

    my @pieces;
    my $length = length $text;
    my $pos    = 0;
    while ($pos < $length) {
        # First check for two-character codes; they take precedence over
        # the single character at the same position.
        my $pair = substr($text, $pos, 2);
        if (exists $phones->{$pair}) {
            push @pieces, $pair;
            $pos += 2;
            next;
        }

        # Otherwise consume one character, registering it as a phone if
        # it is not known yet.
        my $char = substr($text, $pos, 1);
        $phones->{$char} = 1;
        push @pieces, $char;
        $pos += 1;
    }

    my $joined = join(' ', @pieces);
    $joined =~ s/\A\s+//;
    $joined =~ s/\s+\z//;
    return $joined;
}

my ($tmpdir) = @ARGV;
die "Usage: $0 <tmpdir>\n" unless defined $tmpdir;

# All handles carry the :utf8 layer so that length/substr below operate on
# characters rather than bytes.
open(my $lexicon_in, '<:utf8', "$tmpdir/lexicon_raw")
    or die "Can't open raw lexicon $tmpdir/lexicon_raw: $!";
open(my $phones_in, '<:utf8', "$tmpdir/phones")
    or die "Can't open phone file $tmpdir/phones: $!";
open(my $one_column_out, '>:utf8', "$tmpdir/lexicon_one_column")
    or die "Can't open text file $tmpdir/lexicon_one_column for writing: $!";
open(my $extended_out, '>:utf8', "$tmpdir/phones_extended")
    or die "Can't open ex-phone file $tmpdir/phones_extended for writing: $!";

# Get all phones
my %phones;
while (my $phone = <$phones_in>) {
    chomp $phone;
    $phones{$phone} = 1;
}
close($phones_in);

while (my $line = <$lexicon_in>) {
    # Comment lines are passed through verbatim.
    if (substr($line, 0, 1) eq '#') {
        print {$one_column_out} $line;
        next;
    }

    # Strip the line terminator first; otherwise the newline itself would
    # be registered as a phone and end up in phones_extended.
    chomp $line;
    print {$one_column_out} split_into_phones($line, \%phones), "\n";
}
close($lexicon_in);

# Now write the phones to the extended phone file. Sorting keeps the
# output identical from run to run (hash key order is not stable).
foreach my $phone (sort keys %phones) {
    print {$extended_out} $phone, "\n";
}

# Buffered write errors only surface at close, so check the output handles.
close($one_column_out)
    or die "Error closing $tmpdir/lexicon_one_column: $!";
close($extended_out)
    or die "Error closing $tmpdir/phones_extended: $!";