parse_trees.pl 4.08 KB
#!/usr/bin/perl

# Usage: parse_trees.pl < htk_tree > pre_kaldi_tree
# e.g. parse_trees.pl < /mnt/matylda5/jhu09/setup/CH1/English/exp/xwrd.R0_800_TB500/hmm10_800_500/cluster.trees
# [the output is not a standard Kaldi format but one that
# I will write a Kaldi tool to parse].
 
# This program converts a tree in HTK format (e.g. 
# /mnt/matylda5/jhu09/setup/CH1/English/exp/xwrd.R0_800_TB500/hmm10_800_500/cluster.trees
# on BUT system) to trees in a format described below that's more convenient
# to convert to Kaldi files.

# (2)
# The trees for each of the positions of each of the phones 
# (the positions are numbered from zero (offset of -2 from HTK state), 
# i.e. 0, 1, 2.

# A few lines of the output are:
# H 2  SE 2 [ i e E W @ ] {  CE H_s4_3   SE 2 [ A x ] {  CE H_s4_2   CE H_s4_1  }  } 
# N 0 CE N_s2_1
# N 1  SE 0 [ d ] {  CE N_s3_2   CE N_s3_1  } 


while(<>) {
    if(m/^QS (\S+) \{ (\S+) \}/) {
        # a line like this:
        # QS 'L_Vowel' { "A-*","x-*","u-*","U-*","X-*","I-*","i-*","o-*","e-*","O-*","E-*","Y-*","c-*","a-*","W-*","@-*" }
        $qname = $1;
        @set = split(",", $2);
        # e.g. of $2:
        #"A-*","x-*","u-*","U-*","X-*","I-*","i-*","o-*","e-*","O-*","E-*","Y-*","c-*","a-*","W-*","@-*"
        @set > 0 || die "Bad line (1) $_";
        @phoneset = ( ); 
        if($set[0] =~ m:\".+\-\*\":) { # left-context.
            $pos = 0; # position of context we're asking about, in Kaldi format.
            foreach $a (@set) {
                $a =~ m:\"(.+)\-\*\": || die "Bad line (2) $_";
                push @phoneset, $1;
            }
        } else { # Right-context.
            $pos = 2; # position of context we're asking about, in Kaldi format
            foreach $a (@set) {
                $a =~ m:\"\*\+(.+)\": || die "Bad line (3) $_";
                push @phoneset, $1;
            }
        }
        $question{$qname} = $pos . " [ " . join(" ",@phoneset) . " ]";
        # print STDERR "$qname -> $question{$qname}\n";
    } elsif(m/^$/) {
        next;
    } elsif(m/(.+)\[(\d+)\]/) {
        $phone = $1; $position = $2 - 2;
        $nextline = <>;
        if($nextline =~ m/^\s*\"(.+)\"\s*$/) { # no splits.
            $leaf = $1;
#e.g.:
#N[2]
#   "N_s2_1"
            print "$phone $position CE $leaf\n";
#e.g.:
#H[4]
#{
#   0                'R_Front'      -1      "H_s4_3" 
#  -1               'R_Middle'   "H_s4_1"   "H_s4_2" 
#}
        } elsif($nextline =~ /\{/) { # just "{" on a line by itself...
            $m = "";
            @pos2line = ( );
            while(1) { 
                $m = <>;
                if($m =~ m/\s*\}\s*$/) { #  "}" on its own line..
                    last;
                }
                @A = split(" ", $m);
                @A == 4 || die "Bad line $m: line $.\n";
                $pos2line[-$A[0]] = $m;
            }
            @pos2str = ( ); # Recursive, parenthesis-based representation of each line.
            # HERE.
            for($x = @pos2line-1; $x >= 0; $x--) {
                @A = split(" ", $pos2line[$x]);
                @A==4 || die "bad line [code error]\n"; 
                ($n,  $qname, $no, $yes) = @A;
                if($no =~ m:\-(\d+):) { # e.g. -1
                    $no_str = $pos2str[$1];
                } elsif ($no =~ m:\"(.+)\":) {
                    $no_str = " CE $1 ";  # e.g. "H_s3_2"
                } else { die "Bad line $pos2line[$x] or code error: before line $."; }
                if($yes =~ m:\-(\d+):) { # e.g. -1
                    $yes_str = $pos2str[$1];
                } elsif ($yes =~ m:\"(.+)\":) {
                    $yes_str = " CE $1 ";  # e.g. "CE H_s3_2"
                } else { die "Bad line $pos2line[$x] or code error: before line $."; }
                defined $question{$qname} || die "No such question $qname\n";
                $pos2str[$x] = "SE $question{$qname} { $yes_str $no_str } "; # yes before no in format we print.
            }
            $treestr = $pos2str[0]; 
            print "$phone $position $treestr\n"
        } else {
            die "Could not parse line $_ (1): line $.\n";
        }
    } else {
        die "Could not parse line $_ (2): line $.\n";
    }
}