egs/wsj/s5/steps/cleanup/internal/compute_tf_idf.py
#!/usr/bin/env python

from __future__ import print_function
import argparse
import logging
import sys

# Extend the path before importing tf_idf, so the import works even when the
# script is run from a directory where tf_idf is not already importable.
sys.path.insert(0, 'steps')

import tf_idf

logger = logging.getLogger('tf_idf')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


def _get_args():
    parser = argparse.ArgumentParser(
        description="""This script takes in a set of documents and computes
        the TF-IDF for each n-gram up to the specified order. The script can
        also load IDF stats from a separate file instead of computing them
        from the input set of documents.""")

    parser.add_argument("--tf-weighting-scheme", type=str, default="raw",
                        choices=["binary", "raw", "log", "normalized"],
                        help="""The function applied to the raw
                        term-frequencies f(t,d) when computing tf(t,d).
                        TF weighting schemes:
                        binary : tf(t,d) = 1 if t in d else 0
                        raw : tf(t,d) = f(t,d)
                        log : tf(t,d) = 1 + log(f(t,d))
                        normalized : tf(t,d) = K + (1-K) * """
                        """f(t,d) / max{f(t',d): t' in d}""")
    parser.add_argument("--tf-normalization-factor", type=float, default=0.5,
                        help="K value for the normalized TF weighting scheme")
    parser.add_argument("--idf-weighting-scheme", type=str, default="log",
                        choices=["unary", "log", "log-smoothed",
                                 "probabilistic"],
                        help="""The function applied to the raw
                        inverse document frequencies
                        n(t) = |d in D: t in d| when computing idf(t,d).
                        IDF weighting schemes:
                        unary : idf(t,D) = 1
                        log : idf(t,D) = log(N / (1 + n(t)))
                        log-smoothed : idf(t,D) = log(1 + N / n(t))
                        probabilistic: idf(t,D) = log((N - n(t)) / n(t))""")
    parser.add_argument("--ngram-order", type=int, default=2,
                        help="Accumulate stats for terms up to this n-gram "
                        "order")
    parser.add_argument("--input-idf-stats", type=argparse.FileType('r'),
                        help="If provided, IDF stats are loaded from this "
                        "file")
    parser.add_argument("--output-idf-stats", type=argparse.FileType('w'),
                        help="If provided, IDF stats are written to this "
                        "file")
    parser.add_argument("--accumulate-over-docs", type=str, default="true",
                        choices=["true", "false"],
                        help="If true, the stats are accumulated over all the "
                        "documents and a single tf-idf-file is written out.")
    parser.add_argument("docs", type=argparse.FileType('r'),
                        help="Input documents in Kaldi text format, i.e. "
                        "<document-id> <text>")
    parser.add_argument("tf_idf_file", type=argparse.FileType('w'),
                        help="Output tf-idf for each (t,d) pair in the "
                        "input documents, written in the format "
                        "<terms> <document-id> <tf-idf>")

    args = parser.parse_args()

    if args.tf_normalization_factor >= 1.0 or args.tf_normalization_factor < 0:
        raise ValueError("--tf-normalization-factor must be in [0,1)")

    args.accumulate_over_docs = args.accumulate_over_docs == "true"

    if not args.accumulate_over_docs and args.input_idf_stats is None:
        raise TypeError("If --accumulate-over-docs=false is provided, "
                        "then --input-idf-stats must be provided.")

    return args
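# For reference, the weighting schemes documented in the options above,
# written out as plain Python. This is an illustrative sketch only: the names
# count (the raw frequency f(t,d), assumed > 0 for the log scheme), max_count
# (max{f(t',d): t' in d}), K (--tf-normalization-factor), N (the number of
# documents) and doc_freq (n(t) = |d in D: t in d|) are not part of tf_idf.py.
#
#   binary:        tf = 1.0 if count > 0 else 0.0
#   raw:           tf = count
#   log:           tf = 1.0 + math.log(count)
#   normalized:    tf = K + (1.0 - K) * count / max_count
#
#   unary:         idf = 1.0
#   log:           idf = math.log(N / (1.0 + doc_freq))
#   log-smoothed:  idf = math.log(1.0 + N / doc_freq)
#   probabilistic: idf = math.log((N - doc_freq) / doc_freq)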
def _run(args):
    tf_stats = tf_idf.TFStats()
    idf_stats = tf_idf.IDFStats()

    if args.input_idf_stats is not None:
        idf_stats.read(args.input_idf_stats)

    num_done = 0
    for line in args.docs:
        parts = line.strip().split()
        if not parts:
            continue  # skip blank lines
        doc = parts[0]
        tf_stats.accumulate(doc, parts[1:], args.ngram_order)

        if not args.accumulate_over_docs:
            # Write the document-id and the corresponding tf-idf values.
            print(doc, file=args.tf_idf_file, end=' ')
            tf_idf.write_tfidf_from_stats(
                tf_stats, idf_stats, args.tf_idf_file,
                tf_weighting_scheme=args.tf_weighting_scheme,
                idf_weighting_scheme=args.idf_weighting_scheme,
                tf_normalization_factor=args.tf_normalization_factor,
                expected_document_id=doc)
            # Start afresh for the next document.
            tf_stats = tf_idf.TFStats()
        num_done += 1

    if args.accumulate_over_docs:
        # Compute IDF stats from the input documents only if they were not
        # already loaded from --input-idf-stats.
        tf_stats.compute_term_stats(
            idf_stats=idf_stats if args.input_idf_stats is None else None)

        if args.output_idf_stats is not None:
            idf_stats.write(args.output_idf_stats)
            args.output_idf_stats.close()

        tf_idf.write_tfidf_from_stats(
            tf_stats, idf_stats, args.tf_idf_file,
            tf_weighting_scheme=args.tf_weighting_scheme,
            idf_weighting_scheme=args.idf_weighting_scheme,
            tf_normalization_factor=args.tf_normalization_factor)

    if num_done == 0:
        raise RuntimeError("Could not compute TF-IDF for any query documents")


def main():
    args = _get_args()
    try:
        _run(args)
    finally:
        if args.input_idf_stats is not None:
            args.input_idf_stats.close()
        if args.output_idf_stats is not None:
            args.output_idf_stats.close()
        args.docs.close()
        args.tf_idf_file.close()


if __name__ == '__main__':
    main()
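# Example invocation (a sketch; the input and output paths below are
# hypothetical, not taken from any recipe):
#
#   steps/cleanup/internal/compute_tf_idf.py \
#       --ngram-order=2 --tf-weighting-scheme=normalized \
#       --idf-weighting-scheme=log \
#       --output-idf-stats=exp/cleanup/idf_stats.txt \
#       data/train/text exp/cleanup/tf_idf.txt
#
# Each line of data/train/text must be "<document-id> <text>"; the output
# file gets one "<terms> <document-id> <tf-idf>" entry per (term, document)
# pair seen in the input.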