compute_tf_idf.py 5.87 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145


#! /usr/bin/env python

from __future__ import print_function
import argparse
import logging
import sys

import tf_idf
sys.path.insert(0, 'steps')

# Module-level logging setup: records sent to the 'tf_idf' logger at INFO
# level or above go to a stream handler, prefixed with the timestamp,
# file name, line number, function name and level.
formatter = logging.Formatter(
    "%(asctime)s [%(filename)s:%(lineno)s - "
    "%(funcName)s - %(levelname)s ] %(message)s")

handler = logging.StreamHandler()
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

logger = logging.getLogger('tf_idf')
logger.addHandler(handler)
logger.setLevel(logging.INFO)


def _get_args():
    parser = argparse.ArgumentParser(
        description="""This script takes in a set of documents and computes the
        TF-IDF for each n-gram up to the specified order.  The script can also
        load IDF stats from a different file instead of computing them from the
        input set of documents.""")

    parser.add_argument("--tf-weighting-scheme", type=str, default="raw",
                        choices=["binary", "raw", "log", "normalized"],
                        help="""The function applied on the raw
                        term-frequencies f(t,d) when computing tf(t,d).
                        TF weighting schemes:-
                        binary : tf(t,d) = 1 if t in d else 0
                        raw    : tf(t,d) = f(t,d)
                        log    : tf(t,d) = 1 + log(f(t,d))
                        normalized : tf(t,d) = K + (1-K) * """
                        """f(t,d) / max{f(t',d): t' in d}""")
    parser.add_argument("--tf-normalization-factor", type=float, default=0.5,
                        help="K value for normalized TF weighting scheme")
    parser.add_argument("--idf-weighting-scheme", type=str, default="log",
                        choices=["unary", "log", "log-smoothed",
                                 "probabilistic"],
                        help="""The function applied on the raw
                        inverse-document frequencies n(t) = |d in D: t in d|
                        when computing idf(t,d).
                        IDF weighting schemes:-
                        unary  : idf(t,D) = 1
                        log    : idf(t,D) = log (N / 1 + n(t))
                        log-smoothed : idf(t,D) = log(1 + N / n(t))
                        probabilistic: idf(t,D) = log((N - n(t)) / n(t))""")
    parser.add_argument("--ngram-order", type=int, default=2,
                        help="Accumulate for terms upto this n-grams order")

    parser.add_argument("--input-idf-stats", type=argparse.FileType('r'),
                        help="If provided, IDF stats are loaded from this "
                        "file")
    parser.add_argument("--output-idf-stats", type=argparse.FileType('w'),
                        help="If providied, IDF stats are written to this "
                        "file")
    parser.add_argument("--accumulate-over-docs", type=str, default="true",
                        choices=["true", "false"],
                        help="If true, the stats are accumulated over all the "
                        "documents and a single tf-idf-file is written out.")
    parser.add_argument("docs", type=argparse.FileType('r'),
                        help="Input documents in kaldi text format i.e. "
                        "<document-id> <text>")
    parser.add_argument("tf_idf_file", type=argparse.FileType('w'),
                        help="Output tf-idf for each (t,d) pair in the "
                        "input documents written in the format "
                        "<terms> <document-id> <tf-idf>")

    args = parser.parse_args()

    if args.tf_normalization_factor >= 1.0 or args.tf_normalization_factor < 0:
        raise ValueError("--tf-normalization-factor must be in [0,1)")

    args.accumulate_over_docs = bool(args.accumulate_over_docs == "true")

    if not args.accumulate_over_docs and args.input_idf_stats is None:
        raise TypeError(
            "If --accumulate-over-docs=false is provided, "
            "then --input-idf-stats must be provided.")

    return args


def _run(args):
    """Read the input documents and write their TF-IDF values.

    Each non-blank line of args.docs is split on whitespace; the first
    field is the document id and the remaining fields are its words.

    If args.accumulate_over_docs is false, the TF-IDF values are written to
    args.tf_idf_file after every document (preceded by the document id) and
    the TF stats are reset, so the IDF stats used are the ones read from
    args.input_idf_stats.  Otherwise the TF stats are accumulated over all
    documents, term stats are computed once at the end, the IDF stats are
    optionally written to args.output_idf_stats, and a single set of TF-IDF
    values is written.

    Args:
        args: The namespace returned by _get_args().

    Raises:
        RuntimeError: If no document was read from args.docs.
    """
    tf_stats = tf_idf.TFStats()
    idf_stats = tf_idf.IDFStats()

    if args.input_idf_stats is not None:
        idf_stats.read(args.input_idf_stats)

    num_done = 0
    for line in args.docs:
        parts = line.split()
        if not parts:
            # A blank or whitespace-only line has no document id; skip it
            # instead of failing with IndexError on parts[0].
            continue
        doc = parts[0]
        tf_stats.accumulate(doc, parts[1:], args.ngram_order)

        if not args.accumulate_over_docs:
            # Write the document-id and the corresponding tf-idf values.
            print(doc, file=args.tf_idf_file, end=' ')
            tf_idf.write_tfidf_from_stats(
                tf_stats, idf_stats, args.tf_idf_file,
                tf_weighting_scheme=args.tf_weighting_scheme,
                idf_weighting_scheme=args.idf_weighting_scheme,
                tf_normalization_factor=args.tf_normalization_factor,
                expected_document_id=doc)
            # Start from fresh TF stats for the next document.
            tf_stats = tf_idf.TFStats()
        num_done += 1

    if args.accumulate_over_docs:
        # idf_stats is only handed over when no IDF stats were loaded from a
        # file -- presumably so that they get accumulated from these
        # documents (TODO confirm against tf_idf.TFStats.compute_term_stats).
        tf_stats.compute_term_stats(idf_stats=idf_stats
                                    if args.input_idf_stats is None
                                    else None)

        if args.output_idf_stats is not None:
            idf_stats.write(args.output_idf_stats)
            args.output_idf_stats.close()

        tf_idf.write_tfidf_from_stats(
            tf_stats, idf_stats, args.tf_idf_file,
            tf_weighting_scheme=args.tf_weighting_scheme,
            idf_weighting_scheme=args.idf_weighting_scheme,
            tf_normalization_factor=args.tf_normalization_factor)

    if num_done == 0:
        raise RuntimeError("Could not compute TF-IDF for any query documents")

def main():
    """Parse the command line, run the computation and close all files.

    The files opened during argument parsing are closed whether or not
    _run() raises.
    """
    args = _get_args()

    try:
        _run(args)
    finally:
        # The IDF stats files are optional and may not have been opened.
        for optional_stream in (args.input_idf_stats, args.output_idf_stats):
            if optional_stream is not None:
                optional_stream.close()
        args.docs.close()
        args.tf_idf_file.close()


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()