apply_unk_lm.sh 3.49 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91


#!/bin/bash

# Copyright      2016 Johns Hopkins University (Author: Daniel Povey);

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Begin configuration section.

# end configuration sections

# Print the command line for logging.  "$*" joins the arguments with spaces,
# which is the same text echo produced from "$0 $@".
echo "$0 $*"

# path.sh is optional: source it only when it exists in the current directory.
# NOTE(review): presumably it puts the OpenFst binaries used below on PATH.
if [ -f path.sh ]; then . ./path.sh; fi

# Source the shared option parser from utils/.  Abort if it cannot be sourced
# rather than carrying on with the command line unprocessed.
. utils/parse_options.sh || exit 1

# Exactly two positional arguments are required.  Anything else prints the
# help text on stdout and terminates with status 1.
if [ $# -ne 2 ]; then
  # Unquoted delimiter on purpose: $0 must be expanded inside the text.
  cat <<EOF
Usage: $0 [options] <input-unk-lm-fst> <lang-dir>
e.g.: $0 exp/make_unk/unk_fst.txt data/lang_unk

This script, which is called from the end of prepare_lang.sh,
inserts the unknown-word LM FST into the lexicon FSTs
<lang-dir>/L.fst and <lang-dir>/L_disambig.fst in place of
the special disambiguation symbol #2 (which was inserted by
add_lex_disambig.pl as a placeholder for this FST).

  <input-unk-lm-fst>:  A text-form FST, typically with the name
                unk_fst.txt.  We will remove all symbols from the
                output before applying it.
  <lang-dir>:  A partially built lang/ directory.  We modify
               L.fst and L_disambig.fst, and read only words.txt.
EOF
  exit 1
fi


# Positional arguments: the text-form unknown-word LM FST and the lang
# directory whose lexicon FSTs are rewritten in place.
unk_lm_fst=$1
lang=$2

# From here on, stop at the first command that fails.
set -e

# Check up front that every file this script relies on is present.
# phones.txt is included because the phone labels are read from it below;
# without this check a missing phones.txt would surface later as a less
# specific error.
for f in "$unk_lm_fst" "$lang/L.fst" "$lang/L_disambig.fst" \
         "$lang/words.txt" "$lang/phones.txt" "$lang/oov.int"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

# Pick a label that no phone uses: one more than the largest integer id found
# in the second column of phones.txt.  Scanning every line (instead of trusting
# the last line to hold the largest id) keeps the result collision-free even if
# the table is not sorted by id or ends with a blank line.  Nothing is printed
# when no numeric id is found, which the validation below reports as an error.
unused_phone_label=$(awk 'BEGIN { max = -1 }
                          NF >= 2 && $2 ~ /^[0-9]+$/ && $2 + 0 > max { max = $2 + 0 }
                          END { if (max >= 0) print max + 1 }' "$lang/phones.txt")

# Integer id of the "#2" symbol in phones.txt; this is the label that gets
# replaced by the unknown-word LM.
label_to_replace=$(awk '$1 == "#2" { print $2 }' "$lang/phones.txt")

# Both values must be non-empty strings of digits.  An empty value, a
# non-numeric value, or several matches joined by newlines is rejected here.
for label in "$unused_phone_label" "$label_to_replace"; do
  case "$label" in
    '' | *[!0-9]*)
      echo "$0: error getting unused phone label or label for #2"
      exit 1
      ;;
  esac
done


# The compiled unknown-word FST is a scratch file inside the lang directory.
# Remove it on every exit path so that a failure further down does not leave
# it behind.  (The trap body is single-quoted, so $lang is expanded when the
# trap runs; rm -f makes it a no-op if the file is already gone.)
trap 'rm -f "$lang/unk_temp.fst"' EXIT

# OK, now fstreplace works based on olabels, but we actually want to deal with ilabels,
# so we need to invert all the FSTs before and after doing fstreplace.
# The awk step overwrites the 4th field with <eps> on every line that has at
# least four fields, so the compiled FST carries no output symbols.
awk '{ if (NF >= 4) $4 = "<eps>"; print }' < "$unk_lm_fst" |
  fstcompile --isymbols="$lang/phones.txt" --osymbols="$lang/words.txt" |
  fstinvert > "$lang/unk_temp.fst"

# State count of the unknown-word FST, taken from the last field of fstinfo's
# "# of states" line; only used in the log message printed for each lexicon.
num_states_unk=$(fstinfo "$lang/unk_temp.fst" | awk '/# of states/ { print $NF }')

# fstreplace usage is:
# Usage: fstreplace root.fst rootlabel [rule1.fst label1 ...] [out.fst]
# ... the rootlabel should just be an otherwise unused symbol.
# all the labels are olabels (word labels).. that is hardcoded in fstreplace.

# Splice the unknown-word FST into both lexicon FSTs, one after the other.
for f in L.fst L_disambig.fst; do

  # with OpenFst tools, to refer to the standard input/output you need to use
  # the empty string '' and not '-'.
  # The result goes to a .temp file first; the original is only overwritten
  # after this pipeline and the fstinfo calls below have succeeded.
  fstinvert "$lang/$f" |
    fstreplace '' "$unused_phone_label" "$lang/unk_temp.fst" "$label_to_replace" |
    fstinvert > "$lang/${f}.temp"

  # Report how many states the substitution added, for the log.
  num_states_old=$(fstinfo "$lang/$f" | grep '# of states' | awk '{print $NF}')
  num_states_new=$(fstinfo "$lang/${f}.temp" | grep '# of states' | awk '{print $NF}')
  # $(( )) replaces the deprecated $[ ] arithmetic form.
  num_states_added=$((num_states_new - num_states_old))
  echo "$0: in $f, substituting in the unknown-word LM (which had $num_states_unk states) added $num_states_added new FST states."
  mv -f "$lang/${f}.temp" "$lang/$f"
done

# The scratch FST is no longer needed.  -f keeps this from failing the script
# if the file has already been removed.
rm -f "$lang/unk_temp.fst"

exit 0