Blame view
egs/wsj/s5/steps/conf/parse_arpa_unigrams.py
1.13 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
#!/usr/bin/env python # Copyright 2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 from __future__ import print_function import sys, gzip, re # Parse options, if len(sys.argv) != 4: print("Usage: %s <words.txt> <arpa-gz> <unigrams>" % __file__) sys.exit(0) words_txt, arpa_gz, unigrams_out = sys.argv[1:] if arpa_gz == '-': arpa_gz = '/dev/stdin' if unigrams_out == '-': unigrams_out = '/dev/stdout' # Load the words.txt, words = [ l.split() for l in open(words_txt) ] # Load the unigram probabilities in 10log from ARPA, wrd_log10 = dict() with gzip.open(arpa_gz,'r') as f: read = False for l in f: if l.strip() == '\\1-grams:': read = True if l.strip() == '\\2-grams:': break if read and len(l.split())>=2: log10_p_unigram, wrd = re.split('[\t ]+',l.strip(),2)[:2] wrd_log10[wrd] = float(log10_p_unigram) # Create list, 'wrd id log_p_unigram', words_unigram = [[wrd, id, (wrd_log10[wrd] if wrd in wrd_log10 else -99)] for wrd,id in words ] print(words_unigram[0], file=sys.stderr) # Store, with open(unigrams_out,'w') as f: f.writelines(['%s %s %g ' % (w,i,p) for (w,i,p) in words_unigram]) |