make_phone_bigram_lang.sh 3.14 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98


#!/bin/bash

# Apache 2.0.  Copyright 2012, Johns Hopkins University (author: Daniel Povey)

# This script creates a "lang" directory of the "testing" type (including G.fst)
# given an existing "alignment" directory and an existing "lang" directory.
# The directory contains only single-phone words, and a bigram language model that
# is built without smoothing, on top of single phones.  The point of no smoothing
# is to limit the number of transitions, so we can decode reasonably fast, and the
# graph won't blow up.  This is probably going to be most useful for things like
# language-id.


# We might later have options here; if not, I'llr emove this.

echo "$0 $@"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;


if [ $# != 3 ]; then
  echo "Usage: utils/make_phone_bigram_lang.sh [options] <lang-dir> <ali-dir> <output-lang-dir>"
  echo "e.g.: utils/make_phone_bigram_lang.sh data/lang exp/tri3b_ali data/lang_phone_bg"
  exit 1;
fi

lang=$1
alidir=$2
lang_out=$3

for f in $lang/phones.txt $alidir/ali.1.gz; do
  [ ! -f $f ] && echo "Expected file $f to exist" && exit 1;
done

mkdir -p $lang_out || exit 1;

grep -v '#' $lang/phones.txt >  $lang_out/phones.txt # no disambig symbols
   # needed; G and L . G will be deterministic.
cp $lang/topo $lang_out
rm -r $lang_out/phones 2>/dev/null
cp -r $lang/phones/ $lang_out/
rm $lang_out/phones/word_boundary.* 2>/dev/null # these would
  # no longer be valid.
# List of disambig symbols will be empty.
echo -n > $lang_out/phones/disambig.txt
echo -n > $lang_out/phones/disambig.int
echo -n > $lang_out/phones/disambig.csl

# Get phone-level transcripts of training data and create a
# language model.
ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \
  perl -e 'while(<>) {
    @A = split(" ", $_);
    shift @A; # Remove the utterance-id.
    foreach $p ( @A ) { $phones{$p} = 1; } # assoc. array of phones.
    unshift @A, "<s>";
    push @A, "</s>";
    for ($n = 0; $n+1 < @A; $n++) {
      $p = $A[$n]; $q = $A[$n+1];
      $count{$p,$q}++;
      $histcount{$p}++;
    }
  }
  @phones = keys %phones;
  unshift @phones, "<s>";
  # @phones is now all real phones, plus <s>.
  for ($n = 0; $n < @phones; $n++) {
    $phn2state{$phones[$n]} = $n;
  }
  foreach $p (@phones) {
    $src = $phn2state{$p};
    $hist = $histcount{$p};
    $hist > 0 || die;    
    foreach $q (@phones) {
      $c = $count{$p,$q};
      if (defined $c) {
        $cost = -log($c / $hist); # cost on FST arc.
        $dest = $phn2state{$q};
        print "$src $dest $q $cost\n";  # Note: q is actually numeric.
      }
    }
    $c = $count{$p,"</s>"};
    if (defined $c) {
      $cost = -log($c / $hist); # cost on FST arc.      
      print "$src $cost\n"; # final-prob.
    }
  } ' | fstcompile --acceptor=true > $lang_out/G.fst

# symbols for phones and words are the same.
# Neither has disambig symbols.
cp $lang_out/phones.txt $lang_out/words.txt
  
grep -v '<eps>' $lang_out/phones.txt | awk '{printf("0 0 %s %s\n", $2, $2);} END{print("0 0.0");}' | \
   fstcompile  > $lang_out/L.fst

# L and L_disambig are the same.
cp $lang_out/L.fst $lang_out/L_disambig.fst