Blame view
egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py
4.2 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
#!/usr/bin/env python
# Copyright 2014 Gaurav Kumar. Apache 2.0

# Extracts one best output for a set of files
# The list of files in the conversations for which 1 best output has to be extracted
# words.txt

from __future__ import print_function
import os
import sys
import subprocess

# Prefix under which the per-segment lattices (<segment>.lat) are looked up
latticeLocation = 'latjosh-2-callhome/lattices-pushed/'

# Scratch directory for the intermediate lattice / FST / PLF files
tmpdir = 'data/local/data/tmp/ch-d/lattmp'
# FSTs whose PLF fails the checkplf validation are copied here
invalidplfdir = 'data/local/data/tmp/ch-d/invalidplf'
# Symbol table passed to fsm2plf.sh
symtable = '/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt'

# Input : one conversation file name per line
conversationList = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/dev')
# Outputs ('w+' truncates whatever a previous run left behind)
#   provFile   : PLF lines that passed validation
#   invalidPLF : paths of the FSTs copied to invalidplfdir
#   blankPLF   : first segment id of every line for which no lattice was found
#   rmLines    : line numbers of the invalid and blank entries
provFile = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/asr.test.plf', 'w+')
invalidPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/invalidPLF', 'w+')
blankPLF = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/blankPLF', 'w+')
rmLines = open('/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome/ch-d/removeLines', 'w+')


def _removeFile(path):
    '''
    Deletes path if it exists. A delete that fails is reported on
    stderr but does not stop the script (the shell `rm` this replaces
    was equally non-fatal).
    '''
    if not os.path.lexists(path):
        return
    try:
        os.remove(path)
    except OSError as err:
        print("Could not remove {}: {}".format(path, err), file=sys.stderr)


if not os.path.exists(tmpdir):
    os.makedirs(tmpdir)
if not os.path.exists(invalidplfdir):
    os.makedirs(invalidplfdir)
else:
    # Clear the results of a previous run. Hidden entries are left
    # alone, matching what the shell glob `invalidplfdir/*` selects.
    for staleName in os.listdir(invalidplfdir):
        if not staleName.startswith('.'):
            _removeFile(os.path.join(invalidplfdir, staleName))


def latticeConcatenate(lat1, lat2):
    '''
    Concatenates lattices, writes temporary results to tmpdir

    lat1 -- path of the lattice accumulated so far, or "" when lat2 is
            the first lattice being added
    lat2 -- path of the lattice to append

    Returns the path of the lattice holding the result: lat2 itself
    when lat1 is "", otherwise tmpdir + '/tmp.lat'.
    '''
    mergedPath = tmpdir + '/tmp.lat'
    if lat1 == "":
        # Nothing accumulated yet: discard the merged lattice left over
        # from an earlier call and start from lat2
        _removeFile(mergedPath)
        return lat2

    # The exit status of fstconcat is not inspected; the merged path is
    # returned regardless of whether the tool succeeded
    proc = subprocess.Popen(['fstconcat', lat1, lat2, mergedPath])
    proc.wait()
    return mergedPath


def findLattice(timeDetail):
    '''
    Finds the lattice corresponding to a time segment

    timeDetail -- segment identifier; the lattice is looked up as
                  latticeLocation + timeDetail + '.lat'

    Returns that path when it is an existing file, otherwise -1.
    '''
    latticePath = latticeLocation + timeDetail + '.lat'
    if os.path.isfile(latticePath):
        return latticePath
    return -1


# Now read list of files in conversations
# The last four characters of every entry are dropped
# (presumably a 3-letter extension plus the dot -- TODO confirm)
fileList = [entry.strip()[:-4] for entry in conversationList]
conversationList.close()

# IN what order were the conversations added to the spanish files?
# Now get timing information to concatenate the ASR outputs


def _firstOutputLine(command):
    '''
    Runs command through the shell and returns the first line it
    writes to stdout as text, including the trailing newline if there
    is one. Returns "" when the command prints nothing.
    '''
    # universal_newlines=True makes the pipe deliver text (not bytes)
    # on Python 3 as well, so the result can be written to the text-mode
    # output files and compared against string literals
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True,
                            universal_newlines=True)
    # communicate() drains the pipe and waits for the child to exit
    output = proc.communicate()[0]
    lineEnd = output.find("\n")
    if lineEnd == -1:
        return output
    return output[:lineEnd + 1]


lineNo = 1
try:
    for item in fileList:
        with open('/export/a04/gkumar/corpora/fishcall/callhome/tim/' + item + '.es') as timingFile:
            for line in timingFile:
                # NOTE(review): a blank timing line leaves timeInfo empty and
                # timeInfo[0] below raises IndexError -- confirm the timing
                # files never contain blank lines
                timeInfo = line.split()

                # For utterances that are concatenated in the translation file,
                # the corresponding FSTs have to be translated as well
                mergedTranslation = ""
                for timeDetail in timeInfo:
                    tmp = findLattice(timeDetail)
                    if tmp != -1:
                        # Concatenate lattices
                        mergedTranslation = latticeConcatenate(mergedTranslation, tmp)

                print(mergedTranslation)
                if mergedTranslation != "":
                    # Sanjeev's Recipe : Remove epsilons and topo sort
                    finalFST = tmpdir + "/final.fst"
                    os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST)

                    # Now convert to PLF
                    PLFline = _firstOutputLine('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh ' + symtable + ' ' + finalFST)
                    finalPLFFile = tmpdir + "/final.plf"
                    with open(finalPLFFile, "w+") as finalPLF:
                        finalPLF.write(PLFline)

                    # now check if this is a valid PLF, if not write its ID in a
                    # file so it can be checked later
                    # (awk keeps only the second line of the checkplf report)
                    plfCheck = _firstOutputLine("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'")
                    print("{} {}".format(plfCheck, lineNo))
                    # NOTE(review): the entries written below are separated by a
                    # single space, as in the code under review -- confirm that a
                    # newline was not intended
                    if plfCheck.strip() != "PLF format appears to be correct.":
                        os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0])
                        invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + " ")
                        rmLines.write("{} ".format(lineNo))
                    else:
                        provFile.write(PLFline)
                else:
                    # No lattice was found for any segment on this line
                    blankPLF.write(timeInfo[0] + " ")
                    rmLines.write("{} ".format(lineNo))
                # Every timing line consumes one line number, whether or not a
                # PLF was written for it
                lineNo += 1
finally:
    # Flush and release the output files even if the loop above fails
    provFile.close()
    invalidPLF.close()
    blankPLF.close()
    rmLines.close()