Blame view

Scripts/utils/convert_slf.pl 3.41 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
  #!/usr/bin/perl
  
  # Copyright 2013  Korbinian Riedhammer
  
  # Read kaldi lattices in text format and convert them to HTK SLF format;  if
  # given an output directory, each lattice is written to its own gzipped file.
  
  use utf8;
  
  binmode(STDIN, ":encoding(utf8)");
  binmode(STDOUT, ":encoding(utf8)");

  # Command-line handling.
  #
  #   convert_slf.pl [options] lat-file.txt [out-dir]
  #
  # Options are "--name value" pairs and must precede the positional
  # arguments.  After this section:
  #   $ARGV[0]  is the lattice file to read,
  #   $outdir   is the output directory, or "" if none was given,
  #   $lmscale, $acscale, $wdpenalty, $framerate hold defaults or overrides.

  # defaults
  $framerate=0.01;
  $lmscale=1.0;
  $acscale=1.0;
  $wdpenalty=0.0;

  # The usage text is built before the options are parsed so that it reports
  # the built-in defaults rather than values overridden on the command line.
  my $usage =
      "Convert kaldi lattices to HTK SLF (v1.1) format.\n"
    . "Usage: convert_slf.pl [options] lat-file.txt [out-dir]\n"
    . "  e.g. lattice-word-align 'ark:gunzip -c lat.gz |' ark,t:- | $0 - slf/\n"
    . "Options regarding the SLF output:\n"
    . "  --lmscale x    LM weight (default: lmscale=$lmscale)\n"
    . "  --acscale x    Acoustic weight (default: acscale=$acscale)\n"
    . "  --wdpenalty x  Word insertion penalty (default: $wdpenalty)\n"
    . "  --framerate x  Frame rate to compute timing information (default: $framerate)\n";

  # Maps each recognised option to the variable it sets.
  my %option_target = (
    "--lmscale"   => \$lmscale,
    "--acscale"   => \$acscale,
    "--wdpenalty" => \$wdpenalty,
    "--framerate" => \$framerate,
  );

  # Strip the leading options first; only afterwards can the number of
  # positional arguments be checked (checking @ARGV before this loop would
  # reject every invocation that uses options with both positional arguments).
  while (@ARGV > 0 and $ARGV[0] =~ m/^--/) {
    $param = shift @ARGV;
    unless (exists $option_target{$param}) {
      print STDERR "Unknown option $param\n";
      exit 1;
    }
    if (@ARGV == 0) {
      # without this check the option would silently be set to undef
      print STDERR "Option $param requires a value\n";
      exit 1;
    }
    ${ $option_target{$param} } = shift @ARGV;
  }

  # exactly one or two positional arguments must remain
  if (@ARGV < 1 || @ARGV > 2) {
    print STDERR $usage;
    exit 1;
  }

  # optional second positional argument: an existing output directory
  $outdir = "";
  if (@ARGV == 2) {
    $outdir = pop @ARGV;
    unless (-d $outdir) {
      print STDERR "Could not find directory $outdir\n";
      exit 1;
    }
  }
  
  
  # State of the lattice currently being read; cleared again after each
  # lattice has been printed.
  $utt = "";     # utterance id, "" while no lattice is in progress
  @links = ();   # one formatted SLF link line (without the J= index) per arc
  %nodes = ();   # node id => node time (accumulated frames * $framerate)
  %trace = ();   # end node => start node of an arc leading into it

  # Without an output directory every lattice goes to standard output.
  if ($outdir eq "") {
    open(FH, ">-") or die "Could not write to stdout: $!\n";
  }

  # The two-argument form of open is kept on purpose: it makes "-" read
  # standard input, as in the usage example.
  # NOTE(review): the two-argument form also interprets mode and pipe
  # characters in the file name, so the argument must come from a trusted
  # caller.
  open (FI, $ARGV[0]) or die "Could not read from $ARGV[0]: $!\n";
  binmode(FI, ":encoding(utf8)");
  
  # Main conversion loop.  Each input line is classified by its number of
  # whitespace-separated fields:
  #   1 field, no lattice in progress  -> utterance id, starts a new lattice
  #   1 field, lattice in progress     -> accepting state, ignored
  #   4 fields                         -> arc: start end word graph,acoustic,states
  #   0 fields (blank line)            -> end of lattice, print it as SLF
  # Lines with any other number of fields are ignored.
  while(<FI>) {
    chomp;

    @A = split /\s+/;

    if (@A == 1 and $utt eq "") {
      # new lattice
      $utt = $A[0];
      $nodes{0} = 0.0;
      $trace{0} = 0;
    } elsif (@A == 1) {
      # do nothing with an accepting state
    } elsif (@A == 4) {
      # FSA arc
      ($s, $e, $w, $info) = @A;
      ($gs, $as, $ss) = split(/,/, $info);

      # kaldi saves -log, but HTK does it the other way round
      $gs *= -1;
      $as *= -1;

      # the state sequence is something like 1_2_4_56_45 so we remove all
      # digits and count the _+1;  an empty sequence spans no frames at all
      # (it must not be counted as one frame, which would shift the timing
      # of every node behind it)
      if (defined $ss and $ss ne "") {
        $ss =~ s/[0-9]*//g;
        $ss = 1 + length $ss;
      } else {
        $ss = 0;
      }

      # the trace records the predecessor node; it is not read in this loop
      $trace{$e} = $s;
      $nodes{$e} = $nodes{$s} + $ss * $framerate unless defined $nodes{$e}; # do not overwrite timing

      push @links, "S=$s\tE=$e\tW=$w\tv=0\ta=$as\tl=$gs";
    } elsif (@A == 0) {
      # a blank line with no lattice in progress (e.g. two consecutive blank
      # lines) must not produce an empty lattice or a file named ".lat.gz"
      if ($utt eq "") {
        @links = ();
        %nodes = ();
        %trace = ();
        next;
      }

      # print out the lattice;  open file handle first
      my $path = "$outdir/$utt.lat.gz";
      unless ($outdir eq "") {
        # the command runs through the shell, so single-quote the target path
        # to keep spaces or metacharacters in it from being interpreted
        (my $quoted = $path) =~ s/'/'\\''/g;
        open(FH, "|-", "gzip -c > '$quoted'") or die "Could not write to $path: $!\n";
        binmode(FH, ":encoding(utf8)");
      }

      # header
      print FH "VERSION=1.1\n";
      print FH "UTTERANCE=$utt\n";
      print FH "lmscale=$lmscale\n";
      print FH "acscale=$acscale\n";
      print FH "N=".scalar(keys %nodes)."\tL=".scalar(@links)."\n";

      # nodes, in numeric order of their ids
      for my $n (sort { $a <=> $b } keys %nodes) {
        printf FH "I=%d\tt=%.2f\n", $n, $nodes{$n};
      }

      # links/arcs
      for my $i (0 .. $#links) {
        print FH "J=$i\t".$links[$i]."\n";
      }

      print FH "\n";

      # close handle if it was a file;  closing a pipe returns false when the
      # child failed, which would otherwise leave a broken file unnoticed
      unless ($outdir eq "") {
        close(FH) or die "gzip failed while writing $path (exit status $?)\n";
      }

      # clear data
      $utt = "";
      @links = ();
      %nodes = ();
      %trace = ();
    }
  }
  
  # A lattice is only printed when its terminating blank line is read, so a
  # non-empty $utt at end of input means the last lattice was never written.
  # The comparison has to be the string operator "ne": with the numeric "!="
  # any non-numeric utterance id evaluates to 0, equals "" (also 0), and the
  # warning would never be shown.
  if ($utt ne "") {
    print STDERR "Last lattice was not printed as it might be incomplete?  Missing empty line?\n";
  }