#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2012 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# 'phoneset_mono' (the -m output) contains sets of phones that are shared when
# building the monophone system and when asking questions based on an automatic
# clustering of phones, for the triphone system.
# 'roots' (the -r output) contains the information about which phones share a
# common root in the phonetic decision tree and which have distinct pdfs. It
# also states whether the tree-building should split the roots or not.
use strict;
use Getopt::Long;

# Usage text, printed (via die) when a required argument is missing or the
# command line cannot be parsed. It ends in a newline so that die does not
# append a "at FILE line N" suffix.
my $usage = <<"END_USAGE";
Usage: gp_make_questions.pl [-p] -i phones -m phoneset_mono -r roots
Creates shared phonesets for monophone and context-dependent training.
Required arguments:
-i\tInput list of phones (can contain stress/position markers)
-m\tOutput shared phoneset for use in monophone training
-r\tOutput sharing and splitting info for context-dependent training
Options:
-p\tSignal that input phone list contains position markers
END_USAGE

# %phoneset maps a root phone to the list of full phone symbols (in input
# order) that share that root.
my ($in_phones, $mono, $roots, $posdep, %phoneset);

GetOptions(
    "p"   => \$posdep,       # Using position-dependent phones
    "i=s" => \$in_phones,    # Input list of phones
    "m=s" => \$mono,         # Shared phone-set for monophone system
    "r=s" => \$roots,        # roots file for context-dependent systems
) or die $usage;             # unknown/malformed options are a usage error

die $usage
    unless defined($in_phones) && defined($mono) && defined($roots);

# Three-argument open with lexical handles: the file name is never interpreted
# as a mode, and the handles are scoped to this script.
open(my $phones_fh, '<', $in_phones)
    or die "Cannot read from file '$in_phones': $!";
open(my $mono_fh, '>', $mono)
    or die "Cannot write to file '$mono': $!";
open(my $roots_fh, '>', $roots)
    or die "Cannot write to file '$roots': $!";

# Each input line must be "<phone> <field>": exactly two whitespace-separated
# tokens, of which only the first is used.
while (my $line = <$phones_fh>) {
    # Epsilon, silence and spoken-noise entries are handled separately below.
    # NOTE(review): this is a substring match, so any line that merely
    # contains "eps", "SIL" or "SPN" is skipped too; kept as-is so existing
    # outputs do not change.
    next if $line =~ m/eps|SIL|SPN/;
    chomp $line;

    $line =~ m/^(\S+)\s+\S+$/ or die "Bad line: $line\n";
    my $full_phone = $1;

    # With -p, variants that differ only in a trailing "_X" position marker
    # (one character after the last underscore) are grouped under the phone
    # without that marker. The previous pattern /^(\S+)(_.)?\s+\S+$/ could
    # never capture the marker because the greedy \S+ consumed it, so no
    # grouping ever took place. Without -p every phone is its own root, which
    # is what the script has always produced.
    my $root = $full_phone;
    if ($posdep && $full_phone =~ m/^(\S+)_\S$/) {
        $root = $1;
    }
    push @{ $phoneset{$root} }, $full_phone;
}
close($phones_fh);

# Silence and spoken noise always come first: they form one shared set for
# monophone training and a root that is neither shared nor split.
print {$mono_fh} "SIL SPN\n";
print {$roots_fh} "not-shared not-split SIL SPN\n";

# One line per root, in sorted root order; members keep their input order.
foreach my $p (sort keys %phoneset) {
    my $members = join(" ", @{ $phoneset{$p} });
    print {$mono_fh} $members, "\n";
    print {$roots_fh} "shared split ", $members, "\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close($mono_fh)  or die "Cannot close file '$mono': $!";
close($roots_fh) or die "Cannot close file '$roots': $!";