Blame view
egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh
4.26 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
#!/bin/bash
# Apache 2.0.  Copyright 2012, Johns Hopkins University (author: Daniel Povey)

# This script creates a "lang" directory of the "testing" type (including G.fst)
# given an existing "alignment" directory and an existing "lang" directory.
# The directory contains only single-phone words, and a bigram language model that
# is built without smoothing, on top of single phones.  The point of no smoothing
# is to limit the number of transitions, so we can decode reasonably fast, and the
# graph won't blow up.  This is probably going to be most useful for things like
# language-id.
#
# Usage: make_phone_bigram_lang.sh [options] <lang-dir> <ali-dir> <output-lang-dir>
#   <lang-dir>         existing lang directory (phones.txt, topo, phones/ are read)
#   <ali-dir>          alignment directory (final.mdl and ali.*.gz are read)
#   <output-lang-dir>  directory to create; any existing phones/ inside it is replaced
# Exits non-zero if an input is missing or any construction step fails.
#
# See also steps/make_phone_graph.sh

echo "$0 $*"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh  # source the path.
. parse_options.sh || exit 1

# Without pipefail, a failure of ali-to-phones or fstcompile in the middle of a
# pipeline would be masked by the exit status of the last stage.
set -o pipefail

if [ $# -ne 3 ]; then
  echo "Usage: $0: [options] <lang-dir> <ali-dir> <output-lang-dir>" >&2
  echo "e.g.: $0: data/lang exp/tri3b_ali data/lang_phone_bg" >&2
  exit 1
fi

lang=$1
alidir=$2
lang_out=$3

# Check every input that is read further down, so we fail before writing anything.
for f in "$lang/phones.txt" "$lang/topo" "$alidir/ali.1.gz" "$alidir/final.mdl"; do
  if [ ! -f "$f" ]; then
    echo "Expected file $f to exist" >&2
    exit 1
  fi
done
if [ ! -d "$lang/phones" ]; then
  echo "Expected directory $lang/phones to exist" >&2
  exit 1
fi

mkdir -p "$lang_out" || exit 1

# No disambig symbols needed; G and L . G will be deterministic.
grep -v '#' "$lang/phones.txt" > "$lang_out/phones.txt" || exit 1
cp "$lang/topo" "$lang_out" || exit 1

# Replace any stale phones/ directory in the output with a fresh copy.
rm -rf -- "$lang_out/phones"
cp -r "$lang/phones/" "$lang_out/" || exit 1

# Best-effort removals: these files may legitimately be absent.
rm -f -- "$lang_out"/phones/word_boundary.*  # these would no longer be valid.
rm -f -- "$lang_out"/phones/wdisambig*       # ditto this.

# List of disambig symbols will be empty: not needed, since G.fst and L.fst * G.fst
# are determinizable without any.
for name in disambig.txt disambig.int disambig.csl \
            wdisambig.txt wdisambig_phones.int wdisambig_words.int; do
  : > "$lang_out/phones/$name" || exit 1
done

# Let OOV symbol be the first phone (second line of phones.txt).  This is
# arbitrary, it's just so that validate_lang.pl succeeds.  We should never
# actually use this.
oov_sym=$(awk 'NR == 2 { print $1; exit }' "$lang_out/phones.txt")
oov_int=$(awk 'NR == 2 { print $2; exit }' "$lang_out/phones.txt")
if [ -z "$oov_sym" ] || [ -z "$oov_int" ]; then
  echo "$0: could not read a phone from line 2 of $lang_out/phones.txt" >&2
  exit 1
fi
printf '%s\n' "$oov_sym" > "$lang_out/oov.txt"
printf '%s\n' "$oov_int" > "$lang_out/oov.int"

# Get phone-level transcripts of training data and create a
# language model.  The perl script counts phone bigrams (with <s> and </s>
# padding) and prints an unsmoothed bigram acceptor in OpenFst text format:
# one state per history, arc cost = -log(count(p,q) / count(p)).
# The glob inside the rspecifier is expanded by the shell that Kaldi spawns
# for the pipe, so $alidir itself must not contain whitespace.
ali-to-phones "$alidir/final.mdl" "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \
  perl -e '
    while (<>) {
      @A = split(" ", $_);
      shift @A;                                # Remove the utterance-id.
      foreach $p (@A) { $phones{$p} = 1; }     # assoc. array of phones.
      unshift @A, "<s>";
      push @A, "</s>";
      for ($n = 0; $n + 1 < @A; $n++) {
        $p = $A[$n]; $q = $A[$n + 1];
        $count{$p,$q}++;
        $histcount{$p}++;
      }
    }
    # Sort so that state numbering is the same on every run; the order of
    # "keys" on a hash is not stable.  Phones are numeric ids here.
    @phones = sort { $a <=> $b } keys %phones;
    unshift @phones, "<s>";
    # @phones is now all real phones, plus <s>, which gets state 0 and is
    # printed first, making it the start state.
    for ($n = 0; $n < @phones; $n++) { $phn2state{$phones[$n]} = $n; }
    foreach $p (@phones) {
      $src = $phn2state{$p};
      $hist = $histcount{$p};
      (defined $hist && $hist > 0) || die "no bigram counts for history $p (empty input?)\n";
      foreach $q (@phones) {
        $c = $count{$p,$q};
        if (defined $c) {
          $cost = -log($c / $hist);            # cost on FST arc.
          $dest = $phn2state{$q};
          print "$src $dest $q $cost\n";       # Note: q is actually numeric.
        }
      }
      $c = $count{$p,"</s>"};
      if (defined $c) {
        $cost = -log($c / $hist);              # cost on FST arc.
        print "$src $cost\n";                  # final-prob.
      }
    }
  ' | fstcompile --acceptor=true | \
  fstarcsort --sort_type=ilabel > "$lang_out/G.fst" \
  || { echo "$0: error creating $lang_out/G.fst" >&2; exit 1; }

# symbols for phones and words are the same.
# Neither has disambig symbols.
cp "$lang_out/phones.txt" "$lang_out/words.txt" || exit 1

# L.fst: a single state (also final) with one self-loop per phone that maps
# the phone id to the identical word id.
grep -v '<eps>' "$lang_out/phones.txt" | \
  awk '{printf("0 0 %s %s\n", $2, $2);} END{print("0 0.0");}' | \
  fstcompile > "$lang_out/L.fst" \
  || { echo "$0: error creating $lang_out/L.fst" >&2; exit 1; }

# note: first two fields of align_lexicon.txt are interpreted as the word; the remaining
# fields are the phones that are in the pron of the word.  These are all the same, for us.
grep -v '<eps>' "$lang_out/phones.txt" | \
  awk '{print $1, $1, $1}' > "$lang_out/phones/align_lexicon.txt" || exit 1

# just use one sym2int.pl command, since phones.txt and words.txt are identical.
utils/sym2int.pl "$lang_out/phones.txt" \
  < "$lang_out/phones/align_lexicon.txt" \
  > "$lang_out/phones/align_lexicon.int" || exit 1

# L and L_disambig are the same.
cp "$lang_out/L.fst" "$lang_out/L_disambig.fst" || exit 1

utils/validate_lang.pl --skip-disambig-check "$lang_out" || exit 1