Blame view

Scripts/utils/make_phone_bigram_lang.sh 3.14 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
  #!/bin/bash
  
  # Apache 2.0.  Copyright 2012, Johns Hopkins University (author: Daniel Povey)
  
  # This script creates a "lang" directory of the "testing" type (including G.fst)
  # given an existing "alignment" directory and an existing "lang" directory.
  # The directory contains only single-phone words, and a bigram language model that
  # is built without smoothing, on top of single phones.  The point of no smoothing
  # is to limit the number of transitions, so we can decode reasonably fast, and the
  # graph won't blow up.  This is probably going to be most useful for things like
  # language-id.
  
  
  # We might later have options here; if not, I'll remove this.
  
  # Log the complete command line.
  echo "$0 $@"

  # Source ./path.sh when the current directory has one.
  if [ -f ./path.sh ]; then
    . ./path.sh
  fi
  # Source parse_options.sh; abort if that fails.
  . parse_options.sh || exit 1


  # Exactly three positional arguments must be present at this point;
  # otherwise print the usage text and stop.
  if [ $# -ne 3 ]; then
    printf '%s\n' \
      "Usage: utils/make_phone_bigram_lang.sh [options] <lang-dir> <ali-dir> <output-lang-dir>" \
      "e.g.: utils/make_phone_bigram_lang.sh data/lang exp/tri3b_ali data/lang_phone_bg"
    exit 1
  fi

  lang=$1      # existing "lang" directory (input)
  alidir=$2    # existing alignment directory (input)
  lang_out=$3  # "lang" directory to create (output)
  
  # Check up front for every input that this script reads, so that a missing
  # file is reported here rather than after a partial output directory has
  # been written.
  for f in "$lang/phones.txt" "$lang/topo" "$alidir/final.mdl" "$alidir/ali.1.gz"; do
    if [ ! -f "$f" ]; then
      echo "Expected file $f to exist"
      exit 1
    fi
  done
  if [ ! -d "$lang/phones" ]; then
    echo "Expected directory $lang/phones to exist"
    exit 1
  fi

  mkdir -p "$lang_out" || exit 1;

  # Keep only the symbols without a '#' in them: no disambig symbols are
  # needed; G and L . G will be deterministic.
  grep -v '#' "$lang/phones.txt" > "$lang_out/phones.txt" || exit 1;
  cp "$lang/topo" "$lang_out" || exit 1;

  # Replace any phones/ sub-directory left over from a previous run.  It is
  # fine for it not to exist, hence the discarded error output.
  rm -r "$lang_out/phones" 2>/dev/null
  # No trailing slash on the source: with one, BSD cp copies the contents of
  # the directory instead of the directory itself.
  cp -r "$lang/phones" "$lang_out/" || exit 1;
  # The word-boundary files would no longer be valid; there may be none.
  rm "$lang_out"/phones/word_boundary.* 2>/dev/null

  # List of disambig symbols will be empty.
  : > "$lang_out/phones/disambig.txt" || exit 1;
  : > "$lang_out/phones/disambig.int" || exit 1;
  : > "$lang_out/phones/disambig.csl" || exit 1;
  
  # Get phone-level transcripts of training data and create a
  # language model.
  #
  # The perl program reads the text-form transcripts (one utterance per line:
  # the utterance-id followed by its phones), counts bigrams over
  # "<s> p1 ... pN </s>" without any smoothing, and prints an acceptor in text
  # form for fstcompile:
  #   - one state per history, state 0 being <s>;
  #   - one line "src dest label cost" per observed bigram (p,q), with
  #     cost = -log(count(p,q) / count(p));
  #   - one line "src cost" (final-prob) per history seen before </s>.
  ali-to-phones "$alidir/final.mdl" "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \
    perl -e 'while(<>) {
      @A = split(" ", $_);
      shift @A; # Remove the utterance-id.
      foreach $p ( @A ) { $phones{$p} = 1; } # assoc. array of phones.
      unshift @A, "<s>";
      push @A, "</s>";
      for ($n = 0; $n+1 < @A; $n++) {
        $p = $A[$n]; $q = $A[$n+1];
        $count{$p,$q}++;
        $histcount{$p}++;
      }
    }
    # Sort the phones so that the state numbering, and hence G.fst, is the
    # same on every run; the order returned by "keys" is not stable.
    @phones = sort { $a <=> $b || $a cmp $b } keys %phones;
    unshift @phones, "<s>";
    # @phones is now all real phones, plus <s> (which gets state 0, and whose
    # arcs are printed first).
    for ($n = 0; $n < @phones; $n++) {
      $phn2state{$phones[$n]} = $n;
    }
    foreach $p (@phones) {
      $src = $phn2state{$p};
      $hist = $histcount{$p};
      $hist > 0 || die "No bigram counts for history $p; is the alignment input empty?\n";
      foreach $q (@phones) {
        $c = $count{$p,$q};
        if (defined $c) {
          $cost = -log($c / $hist); # cost on FST arc.
          $dest = $phn2state{$q};
          print "$src $dest $q $cost\n";  # Note: q is actually numeric.
        }
      }
      $c = $count{$p,"</s>"};
      if (defined $c) {
        $cost = -log($c / $hist); # cost on FST arc.
        print "$src $cost\n"; # final-prob.
      }
    } ' | fstcompile --acceptor=true > "$lang_out/G.fst"
  # Fail if any stage of the pipeline above failed, not just the last one.
  # PIPESTATUS must be read immediately after the pipeline.
  for pipe_status in "${PIPESTATUS[@]}"; do
    if [ "$pipe_status" -ne 0 ]; then
      echo "$0: error creating $lang_out/G.fst"
      exit 1
    fi
  done
  
  # symbols for phones and words are the same.
  # Neither has disambig symbols.
  cp "$lang_out/phones.txt" "$lang_out/words.txt" || exit 1;

  # Build the lexicon FST as text: a single state 0 with, for every symbol
  # other than <eps>, a self-loop whose input and output labels are both the
  # second field of its phones.txt line; "0 0.0" then makes state 0 final with
  # cost 0.0.
  grep -v '<eps>' "$lang_out/phones.txt" | \
    awk '{printf("0 0 %s %s\n", $2, $2);} END{print("0 0.0");}' | \
    fstcompile > "$lang_out/L.fst" || exit 1;

  # L and L_disambig are the same.
  cp "$lang_out/L.fst" "$lang_out/L_disambig.fst" || exit 1;