Blame view

egs/gp/s5/local/gp_make_questions.pl 2.55 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
  #!/usr/bin/env perl
  use warnings; #sed replacement for -w perl parameter
  
  # Copyright 2012  Arnab Ghoshal
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  # 'phoneset_mono' contains the sets of phones that are shared when building
  # the monophone system and when asking questions based on an automatic
  # clustering of phones, for the triphone system.
  # 'roots' contains the information about which phones share a common root in
  # the phonetic decision tree and which have distinct pdfs. It also states
  # whether the tree-building should split the roots or not.
  
  # Help text shown when the command line cannot be parsed or a required
  # argument is missing. It is assembled from one string per line so that the
  # indentation of this source file can never end up inside the message, and it
  # ends with a newline so that die() does not append a file/line suffix.
  my $usage = join("\n",
    "Usage: gp_make_questions.pl [-p] -i phones -m phoneset_mono -r roots",
    "Creates shared phonesets for monophone and context-dependent training.",
    "Required arguments:",
    "  -i\tInput list of phones (can contain stress/position markers)",
    "  -m\tOutput shared phoneset for use in monophone training",
    "  -r\tOutput sharing and splitting info for context-dependent training",
    "Options:",
    "  -p\tSignal that input phone list contains position markers",
  ) . "\n";

  use strict;
  use Getopt::Long;

  # Command-line state shared with the rest of the script:
  #   $in_phones - path of the input phone list            (-i, required)
  #   $mono      - path of the monophone phone-set output  (-m, required)
  #   $roots     - path of the roots output                (-r, required)
  #   $posdep    - true when -p was given
  #   %phoneset  - hash of array refs, filled while the phone list is read
  my ($in_phones, $mono, $roots, $posdep, %phoneset);

  # GetOptions() returns false when it meets an unknown or malformed option.
  # Stop with the usage text in that case instead of continuing with a command
  # line that was only partially understood.
  GetOptions("p"   => \$posdep,     # Using position-dependent phones
             "i=s" => \$in_phones,  # Input list of phones
             "m=s" => \$mono,       # Shared phone-set for monophone system
             "r=s" => \$roots)      # roots file for context-dependent systems
    or die $usage;

  # All three file arguments are mandatory.
  die $usage unless (defined($in_phones) && defined($mono) && defined($roots));
  
  # Opens $path in the given mode ('<' to read, '>' to write) and returns a
  # lexical file handle; dies with a message built from $action on failure.
  # A path of '-' selects standard input (for '<') or standard output (for
  # '>'), which is what two-argument open() does for that special name. Every
  # other path is passed through three-argument open(), so characters such as
  # '|', '<' or '>' in a file name are never interpreted as a mode.
  sub open_or_die {
    my ($mode, $path, $action) = @_;
    my $fh;
    if ($path eq '-') {
      # Duplicate the standard stream so that closing $fh later is harmless.
      my $std = ($mode eq '<') ? \*STDIN : \*STDOUT;
      open($fh, $mode . '&', $std)
        or die "Cannot $action file '$path': $!";
    }
    else {
      open($fh, $mode, $path)
        or die "Cannot $action file '$path': $!";
    }
    return $fh;
  }

  my $phones_fh = open_or_die('<', $in_phones, 'read from');
  my $mono_fh   = open_or_die('>', $mono,      'write to');
  my $roots_fh  = open_or_die('>', $roots,     'write to');

  # Read the phone list. Every line that is not skipped must consist of exactly
  # two whitespace-separated fields; only the first one (the phone) is used.
  # NOTE(review): the second field is presumably the integer id of a
  # phones.txt-style symbol table -- confirm against the caller.
  while (my $line = <$phones_fh>) {
    # Silence/noise symbols get a fixed entry of their own in both outputs.
    next if $line =~ m/eps|SIL|SPN/;
    chomp $line;
    $line =~ m/^(\S+)\s+\S+$/ or die "Bad line: $line\n";
    my $full_phone = $1;

    # With -p, variants that differ only in a trailing "_X" marker are
    # collected under the same base phone. The previous single pattern
    # (\S+)(_.)? could never do this: the greedy \S+ always consumed the
    # marker, so the optional group stayed undefined. Without -p every phone
    # keeps its own entry, exactly as before.
    my $root = $full_phone;
    if ($posdep && $full_phone =~ m/^(\S+)_.$/) {
      $root = $1;
    }
    push @{$phoneset{$root}}, $full_phone;
  }
  close($phones_fh);

  # The silence and spoken-noise phones form one set that is neither shared
  # nor split in the tree.
  print $mono_fh  "SIL SPN\n";
  print $roots_fh "not-shared not-split SIL SPN\n";

  # One output line per root, in sorted order of the root name.
  foreach my $p (sort keys %phoneset) {
    my $members = join(" ", @{$phoneset{$p}});
    print $mono_fh  "$members\n";
    print $roots_fh "shared split $members\n";
  }

  # Buffered write errors (e.g. a full disk) only surface at close().
  close($mono_fh)  or die "Cannot write to file '$mono': $!";
  close($roots_fh) or die "Cannot write to file '$roots': $!";