Blame view
Scripts/utils/convert_slf.pl
3.41 KB
ec85f8892 first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
#!/usr/bin/perl

# Copyright 2013 Korbinian Riedhammer

# Convert a kaldi-lattice (text form) to HTK SLF format; if given an output
# directory, each lattice will be put in an individual gzipped file
# (<out-dir>/<utterance>.lat.gz), otherwise everything is written to stdout.
#
# Usage: convert_slf.pl [options] lat-file.txt [out-dir]
#   lat-file.txt may be "-" to read the lattices from stdin.
#
# Input format, as consumed by the loop below (whitespace separated):
#   <utt-id>                        first 1-field line starts a new lattice
#   <start> <end> <word> <g,a,seq>  4-field line is an arc; <g,a,seq> is
#                                   graph cost, acoustic cost, state sequence
#   <state>                         other 1-field lines (accepting states)
#                                   are ignored
#   (empty line)                    ends the lattice and triggers the output
#
# Notes:
#   * --lmscale and --acscale are only written to the SLF header; the scores
#     themselves are not rescaled here.
#   * --wdpenalty is accepted but not used anywhere in the output.

use strict;
use warnings;
use utf8;

binmode(STDIN,  ":encoding(utf8)");
binmode(STDOUT, ":encoding(utf8)");

# defaults
my $framerate = 0.01;
my $lmscale   = 1.0;
my $acscale   = 1.0;
my $wdpenalty = 0.0;

# Build the usage text before parsing options so that it shows the real
# defaults and not values overridden on the command line.
my $usage =
    "Convert kaldi lattices to HTK SLF (v1.1) format.\n"
  . "Usage: convert_slf.pl [options] lat-file.txt [out-dir]\n"
  . " e.g. lattice-word-align 'ark:gunzip -c lat.gz |' ark,t:- | $0 - slf/\n"
  . "Options regarding the SLF output:\n"
  . "  --lmscale x     LM weight (default: lmscale=$lmscale)\n"
  . "  --acscale x     Acoustic weight (default: acscale=$acscale)\n"
  . "  --wdpenalty x   Word insertion penalty (default: $wdpenalty)\n"
  . "  --framerate x   Frame rate to compute timing information (default: $framerate)\n";

# Map each option name to the variable it sets.
my %option_target = (
    "--lmscale"   => \$lmscale,
    "--acscale"   => \$acscale,
    "--wdpenalty" => \$wdpenalty,
    "--framerate" => \$framerate,
);

# Options must be consumed BEFORE the positional-argument count is checked;
# otherwise any invocation that uses an option has more than two arguments
# and is rejected with the usage message.
while (@ARGV > 0 and $ARGV[0] =~ m/^--/) {
    my $param = shift @ARGV;
    unless (exists $option_target{$param}) {
        print STDERR "Unknown option $param\n";
        exit 1;
    }
    unless (@ARGV > 0) {
        print STDERR "Option $param requires a value\n";
        exit 1;
    }
    ${ $option_target{$param} } = shift @ARGV;
}

if (@ARGV < 1 || @ARGV > 2) {
    print STDERR $usage;
    exit 1;
}

my $outdir = "";
if (@ARGV == 2) {
    $outdir = pop @ARGV;
    unless (-d $outdir) {
        print STDERR "Could not find directory $outdir\n";
        exit 1;
    }
}

# Open the input.  Three-argument open does not give "-" its special meaning,
# so stdin is selected explicitly (it already carries the utf8 layer).
my $infile = $ARGV[0];
my $in;
if ($infile eq "-") {
    $in = \*STDIN;
} else {
    open($in, "<", $infile) or die "Could not read from file $infile: $!\n";
    binmode($in, ":encoding(utf8)");
}

my $utt   = "";    # id of the lattice currently being read ("" = none)
my @links = ();    # SLF link records (without the leading J=<index>)
my %nodes = ();    # node id => time stamp in seconds

while (my $line = <$in>) {
    chomp $line;
    my @fields = split /\s+/, $line;

    if (@fields == 1 and $utt eq "") {
        # new lattice; node 0 is placed at time zero
        $utt = $fields[0];
        $nodes{0} = 0.0;
    } elsif (@fields == 1) {
        # do nothing with an accepting state
    } elsif (@fields == 4) {
        # FSA arc
        my ($s, $e, $w, $info) = @fields;
        my ($gs, $as, $ss) = split(/,/, $info);
        $gs = 0  unless defined $gs;
        $as = 0  unless defined $as;
        $ss = "" unless defined $ss;

        # kaldi saves -log, but HTK does it the other way round
        $gs *= -1;
        $as *= -1;

        # The state sequence is something like 1_2_4_56_45: the number of
        # frames is the number of non-digit separators plus one.  An empty
        # sequence spans no frames at all.
        my $num_frames = ($ss eq "") ? 0 : 1 + ($ss =~ tr/0-9//c);

        # A start node without a known time is treated as time zero.
        my $start_time = defined $nodes{$s} ? $nodes{$s} : 0;

        # do not overwrite the timing of a node that was already reached
        $nodes{$e} = $start_time + $num_frames * $framerate
            unless defined $nodes{$e};

        push @links, "S=$s\tE=$e\tW=$w\tv=0\ta=$as\tl=$gs";
    } elsif (@fields == 0) {
        # An empty line ends a lattice; stray empty lines outside of a
        # lattice would otherwise produce a bogus, nameless lattice.
        next if $utt eq "";

        # print out the lattice; open file handle first
        my $out;
        my $path = "";
        if ($outdir eq "") {
            $out = \*STDOUT;
        } else {
            $path = "$outdir/$utt.lat.gz";
            # Single-quote the path for the shell so that spaces or shell
            # metacharacters in the directory or utterance id are harmless.
            (my $quoted = $path) =~ s/'/'\\''/g;
            open($out, "|-", "gzip -c > '$quoted'")
                or die "Could not write to $path\n";
            binmode($out, ":encoding(utf8)");
        }

        # header
        print $out "VERSION=1.1\n";
        print $out "UTTERANCE=$utt\n";
        print $out "lmscale=$lmscale\n";
        print $out "acscale=$acscale\n";
        print $out "N=" . scalar(keys %nodes) . "\tL=" . scalar(@links) . "\n";

        # nodes, in numeric id order
        for my $n (sort { $a <=> $b } keys %nodes) {
            printf $out "I=%d\tt=%.2f\n", $n, $nodes{$n};
        }

        # links/arcs
        for my $i (0 .. $#links) {
            print $out "J=$i\t" . $links[$i] . "\n";
        }

        print $out "\n";

        # close handle if it was a file; a failing close on the pipe means
        # gzip could not write the file
        if ($outdir ne "") {
            close($out) or die "Could not write to $path\n";
        }

        # clear data
        $utt   = "";
        @links = ();
        %nodes = ();
    }
}

close($in) unless $infile eq "-";

# String comparison: a numeric "!=" would treat nearly every utterance id as
# 0 and never report the problem.
if ($utt ne "") {
    print STDERR "Last lattice was not printed as it might be incomplete? Missing empty line?\n";
}