make_phone_bigram_lang.sh 4.26 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123


#!/bin/bash

# Apache 2.0.  Copyright 2012, Johns Hopkins University (author: Daniel Povey)

# This script creates a "lang" directory of the "testing" type (including G.fst)
# given an existing "alignment" directory and an existing "lang" directory.
# The directory contains only single-phone words, and a bigram language model that
# is built without smoothing, on top of single phones.  The point of no smoothing
# is to limit the number of transitions, so we can decode reasonably fast, and the
# graph won't blow up.  This is probably going to be most useful for things like
# language-id.
#
#  See also steps/make_phone_graph.sh


echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;


if [ $# != 3 ]; then
  echo "Usage: $0: [options] <lang-dir> <ali-dir> <output-lang-dir>"
  echo "e.g.: $0: data/lang exp/tri3b_ali data/lang_phone_bg"
  exit 1;
fi

lang=$1
alidir=$2
lang_out=$3

for f in $lang/phones.txt $alidir/ali.1.gz; do
  [ ! -f $f ] && echo "Expected file $f to exist" && exit 1;
done

mkdir -p $lang_out || exit 1;

grep -v '#' $lang/phones.txt >  $lang_out/phones.txt # no disambig symbols
      # needed; G and L . G will be deterministic.
cp $lang/topo $lang_out
rm -r $lang_out/phones 2>/dev/null
cp -r $lang/phones/ $lang_out/
rm $lang_out/phones/word_boundary.* 2>/dev/null # these would
  # no longer be valid.
rm $lang_out/phones/wdisambig* 2>/dev/null  # ditto this.

# List of disambig symbols will be empty: not needed, since G.fst and L.fst * G.fst
# are determinizable without any.
echo -n > $lang_out/phones/disambig.txt
echo -n > $lang_out/phones/disambig.int
echo -n > $lang_out/phones/disambig.csl
echo -n > $lang_out/phones/wdisambig.txt
echo -n > $lang_out/phones/wdisambig_phones.int
echo -n > $lang_out/phones/wdisambig_words.int

# Let OOV symbol be the first phone.  This is arbitrary, it's just
# so that validate_lang.pl succeeds.  We should never actually use
# this.
oov_sym=$(tail -n +2 $lang_out/phones.txt | head -n 1 | awk '{print $1}')
oov_int=$(tail -n +2 $lang_out/phones.txt | head -n 1 | awk '{print $2}')
echo $oov_sym > $lang_out/oov.txt
echo $oov_int > $lang_out/oov.int


# Get phone-level transcripts of training data and create a
# language model.
ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \
  perl -e 'while(<>) {
    @A = split(" ", $_);
    shift @A; # Remove the utterance-id.
    foreach $p ( @A ) { $phones{$p} = 1; } # assoc. array of phones.
    unshift @A, "<s>";
    push @A, "</s>";
    for ($n = 0; $n+1 < @A; $n++) {
      $p = $A[$n]; $q = $A[$n+1];
      $count{$p,$q}++;
      $histcount{$p}++;
    }
  }
  @phones = keys %phones;
  unshift @phones, "<s>";
  # @phones is now all real phones, plus <s>.
  for ($n = 0; $n < @phones; $n++) {
    $phn2state{$phones[$n]} = $n;
  }
  foreach $p (@phones) {
    $src = $phn2state{$p};
    $hist = $histcount{$p};
    $hist > 0 || die;
    foreach $q (@phones) {
      $c = $count{$p,$q};
      if (defined $c) {
        $cost = -log($c / $hist); # cost on FST arc.
        $dest = $phn2state{$q};
        print "$src $dest $q $cost\n";  # Note: q is actually numeric.
      }
    }
    $c = $count{$p,"</s>"};
    if (defined $c) {
      $cost = -log($c / $hist); # cost on FST arc.
      print "$src $cost\n"; # final-prob.
    }
  } ' | fstcompile --acceptor=true | \
    fstarcsort --sort_type=ilabel > $lang_out/G.fst

# symbols for phones and words are the same.
# Neither has disambig symbols.
cp $lang_out/phones.txt $lang_out/words.txt

grep -v '<eps>' $lang_out/phones.txt | awk '{printf("0 0 %s %s\n", $2, $2);} END{print("0 0.0");}' | \
   fstcompile  > $lang_out/L.fst

# note: first two fields of align_lexicon.txt are interpreted as the word; the remaining
# fields are the phones that are in the pron of the word.  These are all the same, for us.
for p in $(grep -v '<eps>' $lang_out/phones.txt | awk '{print $1}'); do echo $p $p $p; done > $lang_out/phones/align_lexicon.txt

# just use one sym2int.pl command, since phones.txt and words.txt are identical.
utils/sym2int.pl $lang_out/phones.txt <$lang_out/phones/align_lexicon.txt >$lang_out/phones/align_lexicon.int

# L and L_disambig are the same.
cp $lang_out/L.fst $lang_out/L_disambig.fst

utils/validate_lang.pl --skip-disambig-check $lang_out || exit 1;