Blame view

egs/hub4_english/s5/local/lm/merge_word_counts.py 766 Bytes
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
  #! /usr/bin/env python
  
  # Copyright 2016    Vimal Manohar
  # Apache 2.0.
  
  """This script merges pocolm word_counts and write a new word_counts file.
  A min-count argument is required to only write counts that are above the
  specified minimum count.
  """
  from __future__ import print_function
  
  import sys
  
  
  def main():
      """Merge word counts from stdin and print those meeting the minimum.

      Reads lines of the form ``<count> <word>`` from standard input, sums the
      counts of words that appear more than once, and prints
      ``<count> <word>`` to standard output for every word whose merged count
      is at least ``<min-count>`` (the single command-line argument).

      Blank lines are skipped. Other malformed lines still raise, as before:
      ``IndexError`` when the word field is missing and ``ValueError`` when
      the count is not an integer.

      Raises:
          SystemExit: with status 1 when the number of arguments is wrong.
          ValueError: if ``<min-count>`` or an input count is not an integer.
      """
      if len(sys.argv) != 2:
          sys.stderr.write("Usage: {0} <min-count>\n".format(sys.argv[0]))
          raise SystemExit(1)

      # Parse the threshold once, before consuming stdin, so an invalid value
      # fails immediately and is not re-parsed for every output word.
      min_count = int(sys.argv[1])

      words = {}
      # Iterate the stream directly rather than loading all lines into memory.
      for line in sys.stdin:
          parts = line.split()
          if not parts:
              # Blank line: nothing to merge.
              continue
          words[parts[1]] = words.get(parts[1], 0) + int(parts[0])

      for word, count in words.items():
          if count >= min_count:
              print("{0} {1}".format(count, word))
  
  
  # Run the merge only when executed as a script, not when imported.
  if __name__ == "__main__":
      main()