apply_lexicon_edits.py 4.44 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110


#!/usr/bin/env python

# Copyright 2016  Xiaohui Zhang
# Apache 2.0.

from __future__ import print_function
import argparse
import sys

def GetArgs():
    parser = argparse.ArgumentParser(description = "Apply an lexicon edits file (output from steps/dict/select_prons_bayesian.py)to an input lexicon"
                                     "to produce a learned lexicon.",
                                     epilog = "See steps/dict/learn_lexicon_greedy.sh for example")

    parser.add_argument("in_lexicon", metavar='<in-lexicon>', type = str,
                        help = "Input lexicon. Each line must be <word> <phones>.")
    parser.add_argument("lexicon_edits_file", metavar='<lexicon-edits-file>', type = str,
                        help = "Input lexicon edits file containing human-readable & editable"
                               "pronounciation info.  The info for each word is like:"
                         "------------ an 4086.0 --------------"
                         "R  | Y |  2401.6 |  AH N"
                         "R  | Y |  640.8 |  AE N"
                         "P  | Y |  1035.5 |  IH N"
                         "R(ef), P(hone-decoding) represents the pronunciation source"
                         "Y/N means the recommended decision of including this pron or not"
                         "and the numbers are soft counts accumulated from lattice-align-word outputs. See steps/dict/select_prons_bayesian.py for more details.")
    parser.add_argument("out_lexicon", metavar='<out-lexicon>', type = str,
                        help = "Output lexicon to this file.")

    print (' '.join(sys.argv), file=sys.stderr)

    args = parser.parse_args()
    args = CheckArgs(args)

    return args

def CheckArgs(args):
    if args.in_lexicon == "-":
        args.in_lexicon = sys.stdin
    else:
        args.in_lexicon_handle = open(args.in_lexicon)
    args.lexicon_edits_file_handle = open(args.lexicon_edits_file)

    if args.out_lexicon == "-":
        args.out_lexicon_handle = sys.stdout
    else:
        args.out_lexicon_handle = open(args.out_lexicon, "w")

    return args

def ReadLexicon(lexicon_file_handle):
    lexicon = set()
    if lexicon_file_handle:
        for line in lexicon_file_handle.readlines():
            splits = line.strip().split()
            if len(splits) == 0:
                continue
            if len(splits) < 2:
                raise Exception('Invalid format of line ' + line
                                    + ' in lexicon file.')
            word = splits[0]
            phones = ' '.join(splits[1:])
            lexicon.add((word, phones))
    return lexicon

def ApplyLexiconEdits(lexicon, lexicon_edits_file_handle):
    if lexicon_edits_file_handle:
        for line in lexicon_edits_file_handle.readlines():
            # skip all commented lines
            if line.startswith('#'):
                continue
            # read a word from a line like "---- MICROPHONES 200.0 ----".
            if line.startswith('---'):
                splits = line.strip().strip('-').strip().split()
                if len(splits) != 2:
                    print(splits, file=sys.stderr)
                    raise Exception('Invalid format of line ' + line
                                        + ' in lexicon edits file.')
                word = splits[0].strip()
            else:
            # parse the pron and decision 'Y/N' of accepting the pron or not,
            # from a line like: 'P  | Y |  42.0 |  M AY K R AH F OW N Z'
                splits = line.split('|')
                if len(splits) != 4:
                    raise Exception('Invalid format of line ' + line
                                        + ' in lexicon edits file.')
                pron = splits[3].strip()
                if splits[1].strip() == 'Y':
                    lexicon.add((word, pron))
                elif splits[1].strip() == 'N':
                    lexicon.discard((word, pron))
                else:
                    raise Exception('Invalid format of line ' + line
                                        + ' in lexicon edits file.')
    return lexicon


def WriteLexicon(lexicon, out_lexicon_handle):
    for word, pron in lexicon:
        print('{0} {1}'.format(word, pron), file=out_lexicon_handle)
    out_lexicon_handle.close()

def Main():
    args = GetArgs()
    lexicon = ReadLexicon(args.in_lexicon_handle)
    ApplyLexiconEdits(lexicon, args.lexicon_edits_file_handle)
    WriteLexicon(lexicon, args.out_lexicon_handle)

if __name__ == "__main__":
    Main()