Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/steps/cleanup/internal/compute_tf_idf.py 5.87 KB
  #! /usr/bin/env python
  
  from __future__ import print_function
  import argparse
  import logging
  import sys
  
  import tf_idf
  sys.path.insert(0, 'steps')
  
  logger = logging.getLogger('tf_idf')
  logger.setLevel(logging.INFO)
  handler = logging.StreamHandler()
  handler.setLevel(logging.INFO)
  formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - "
                                "%(funcName)s - %(levelname)s ] %(message)s")
  handler.setFormatter(formatter)
  logger.addHandler(handler)
  
  
  def _get_args():
      parser = argparse.ArgumentParser(
          description="""This script takes in a set of documents and computes the
          TF-IDF for each n-gram up to the specified order.  The script can also
          load IDF stats from a different file instead of computing them from the
          input set of documents.""")
  
      parser.add_argument("--tf-weighting-scheme", type=str, default="raw",
                          choices=["binary", "raw", "log", "normalized"],
                          help="""The function applied on the raw
                          term-frequencies f(t,d) when computing tf(t,d).
                          TF weighting schemes:-
                          binary : tf(t,d) = 1 if t in d else 0
                          raw    : tf(t,d) = f(t,d)
                          log    : tf(t,d) = 1 + log(f(t,d))
                          normalized : tf(t,d) = K + (1-K) * """
                          """f(t,d) / max{f(t',d): t' in d}""")
      parser.add_argument("--tf-normalization-factor", type=float, default=0.5,
                          help="K value for normalized TF weighting scheme")
      parser.add_argument("--idf-weighting-scheme", type=str, default="log",
                          choices=["unary", "log", "log-smoothed",
                                   "probabilistic"],
                          help="""The function applied on the raw
                          inverse-document frequencies n(t) = |d in D: t in d|
                          when computing idf(t,d).
                          IDF weighting schemes:-
                          unary  : idf(t,D) = 1
                          log    : idf(t,D) = log (N / 1 + n(t))
                          log-smoothed : idf(t,D) = log(1 + N / n(t))
                          probabilistic: idf(t,D) = log((N - n(t)) / n(t))""")
      parser.add_argument("--ngram-order", type=int, default=2,
                          help="Accumulate for terms upto this n-grams order")
  
      parser.add_argument("--input-idf-stats", type=argparse.FileType('r'),
                          help="If provided, IDF stats are loaded from this "
                          "file")
      parser.add_argument("--output-idf-stats", type=argparse.FileType('w'),
                          help="If providied, IDF stats are written to this "
                          "file")
      parser.add_argument("--accumulate-over-docs", type=str, default="true",
                          choices=["true", "false"],
                          help="If true, the stats are accumulated over all the "
                          "documents and a single tf-idf-file is written out.")
      parser.add_argument("docs", type=argparse.FileType('r'),
                          help="Input documents in kaldi text format i.e. "
                          "<document-id> <text>")
      parser.add_argument("tf_idf_file", type=argparse.FileType('w'),
                          help="Output tf-idf for each (t,d) pair in the "
                          "input documents written in the format "
                          "<terms> <document-id> <tf-idf>")
  
      args = parser.parse_args()
  
      if args.tf_normalization_factor >= 1.0 or args.tf_normalization_factor < 0:
          raise ValueError("--tf-normalization-factor must be in [0,1)")
  
      args.accumulate_over_docs = bool(args.accumulate_over_docs == "true")
  
      if not args.accumulate_over_docs and args.input_idf_stats is None:
          raise TypeError(
              "If --accumulate-over-docs=false is provided, "
              "then --input-idf-stats must be provided.")
  
      return args
  
  
  def _run(args):
      tf_stats = tf_idf.TFStats()
      idf_stats = tf_idf.IDFStats()
  
      if args.input_idf_stats is not None:
          idf_stats.read(args.input_idf_stats)
  
      num_done = 0
      for line in args.docs:
          parts = line.strip().split()
          doc = parts[0]
          tf_stats.accumulate(doc, parts[1:], args.ngram_order)
  
          if not args.accumulate_over_docs:
              # Write the document-id and the corresponding tf-idf values.
              print (doc, file=args.tf_idf_file, end=' ')
              tf_idf.write_tfidf_from_stats(
                  tf_stats, idf_stats, args.tf_idf_file,
                  tf_weighting_scheme=args.tf_weighting_scheme,
                  idf_weighting_scheme=args.idf_weighting_scheme,
                  tf_normalization_factor=args.tf_normalization_factor,
                  expected_document_id=doc)
              tf_stats = tf_idf.TFStats()
          num_done += 1
  
      if args.accumulate_over_docs:
          tf_stats.compute_term_stats(idf_stats=idf_stats
                                                if args.input_idf_stats is None
                                                else None)
  
          if args.output_idf_stats is not None:
              idf_stats.write(args.output_idf_stats)
              args.output_idf_stats.close()
  
          tf_idf.write_tfidf_from_stats(
              tf_stats, idf_stats, args.tf_idf_file,
              tf_weighting_scheme=args.tf_weighting_scheme,
              idf_weighting_scheme=args.idf_weighting_scheme,
              tf_normalization_factor=args.tf_normalization_factor)
  
      if num_done == 0:
          raise RuntimeError("Could not compute TF-IDF for any query documents")
  
  def main():
      args = _get_args()
  
      try:
          _run(args)
      finally:
          if args.input_idf_stats is not None:
              args.input_idf_stats.close()
          if args.output_idf_stats is not None:
              args.output_idf_stats.close()
          args.docs.close()
          args.tf_idf_file.close()
  
  
  if __name__ == '__main__':
      main()