Blame view
Scripts/utils/make_phone_bigram_lang.sh
3.14 KB
ec85f8892 first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
#!/bin/bash
# Apache 2.0.  Copyright 2012, Johns Hopkins University (author: Daniel Povey)

# This script creates a "lang" directory of the "testing" type (including G.fst)
# given an existing "alignment" directory and an existing "lang" directory.
# The directory contains only single-phone words, and a bigram language model that
# is built without smoothing, on top of single phones.  The point of no smoothing
# is to limit the number of transitions, so we can decode reasonably fast, and the
# graph won't blow up.  This is probably going to be most useful for things like
# language-id.
#
# Usage:
#   utils/make_phone_bigram_lang.sh [options] <lang-dir> <ali-dir> <output-lang-dir>
#
# Reads:   <lang-dir>/phones.txt, <lang-dir>/topo, <lang-dir>/phones/,
#          <ali-dir>/final.mdl, <ali-dir>/ali.*.gz
# Writes:  <output-lang-dir>/{phones.txt,words.txt,topo,G.fst,L.fst,L_disambig.fst}
#          and <output-lang-dir>/phones/ (with emptied disambig.* files and
#          without word_boundary.* files).
# Exits non-zero on bad usage, missing inputs, or failure of any build step.

# We might later have options here; if not, I'll remove this.

echo "$0 $*"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh  # source the path.
. parse_options.sh || exit 1

if [ $# != 3 ]; then
  echo "Usage: utils/make_phone_bigram_lang.sh [options] <lang-dir> <ali-dir> <output-lang-dir>"
  echo "e.g.: utils/make_phone_bigram_lang.sh data/lang exp/tri3b_ali data/lang_phone_bg"
  exit 1
fi

lang=$1
alidir=$2
lang_out=$3

# final.mdl is checked here as well because ali-to-phones below cannot run
# without it.
for f in "$lang/phones.txt" "$alidir/ali.1.gz" "$alidir/final.mdl"; do
  if [ ! -f "$f" ]; then
    echo "Expected file $f to exist"
    exit 1
  fi
done

mkdir -p "$lang_out" || exit 1

# Drop every line containing '#': no disambig symbols are needed, since G and
# L o G will be deterministic.
grep -v '#' "$lang/phones.txt" > "$lang_out/phones.txt" || exit 1

cp "$lang/topo" "$lang_out" || exit 1

# Replace any stale phones/ directory with a fresh copy.  The source path has no
# trailing slash so that BSD cp also copies the directory itself rather than
# its contents.
rm -rf "$lang_out/phones"
cp -r "$lang/phones" "$lang_out/" || exit 1

# The word-boundary files would no longer be valid.
rm -f "$lang_out"/phones/word_boundary.*

# List of disambig symbols will be empty.
: > "$lang_out/phones/disambig.txt"
: > "$lang_out/phones/disambig.int"
: > "$lang_out/phones/disambig.csl"

# From here on a failure in any stage of a pipeline fails the whole pipeline,
# so a broken upstream stage cannot leave a silently empty or truncated FST.
set -o pipefail

# Get phone-level transcripts of training data and create a
# language model.
#
# The perl script accumulates bigram counts count{p,q} and history counts
# histcount{p} over the phone sequences (padded with <s> and </s>), then prints
# an acceptor in fstcompile text format with one state per history:
#   "src dest label cost"  for each seen bigram p -> q, cost = -log(c(p,q)/c(p))
#   "src cost"             final cost for each seen bigram p -> </s>
# State 0 is the <s> history; because it is printed first it becomes the start
# state.  The remaining phones are sorted so that state numbering is
# reproducible from run to run.
# NOTE: the perl code is inside single quotes, so it must not contain any.
ali-to-phones "$alidir/final.mdl" "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \
  perl -e 'while (<>) {
    @A = split(" ", $_);
    shift @A;  # Remove the utterance-id.
    foreach $p (@A) { $phones{$p} = 1; }  # assoc. array of phones.
    unshift @A, "<s>";
    push @A, "</s>";
    for ($n = 0; $n+1 < @A; $n++) {
      $p = $A[$n]; $q = $A[$n+1];
      $count{$p,$q}++;
      $histcount{$p}++;
    }
  }
  @phones = sort { $a <=> $b || $a cmp $b } keys %phones;
  unshift @phones, "<s>";
  # @phones is now all real phones, plus <s>.
  for ($n = 0; $n < @phones; $n++) {
    $phn2state{$phones[$n]} = $n;
  }
  foreach $p (@phones) {
    $src = $phn2state{$p};
    $hist = $histcount{$p};
    $hist > 0 || die "make_phone_bigram_lang.sh: no counts for history $p (empty alignments?)\n";
    foreach $q (@phones) {
      $c = $count{$p,$q};
      if (defined $c) {
        $cost = -log($c / $hist);  # cost on FST arc.
        $dest = $phn2state{$q};
        print "$src $dest $q $cost\n";  # Note: q is actually numeric.
      }
    }
    $c = $count{$p,"</s>"};
    if (defined $c) {
      $cost = -log($c / $hist);  # cost on FST arc.
      print "$src $cost\n";  # final-prob.
    }
  } ' | fstcompile --acceptor=true > "$lang_out/G.fst" || exit 1

# symbols for phones and words are the same.
# Neither has disambig symbols.
cp "$lang_out/phones.txt" "$lang_out/words.txt" || exit 1

# L.fst: a single state (both initial and final) with one self-loop per
# non-<eps> entry of phones.txt, whose input and output label are both the
# integer in the second column.
grep -v '<eps>' "$lang_out/phones.txt" | \
  awk '{printf("0 0 %s %s\n", $2, $2);} END{print("0 0.0");}' | \
  fstcompile > "$lang_out/L.fst" || exit 1

# L and L_disambig are the same.
cp "$lang_out/L.fst" "$lang_out/L_disambig.fst"