get_lattices.py
4.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
# Copyright 2014 Gaurav Kumar. Apache 2.0
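# Builds a PLF lattice file from the ASR lattices of a set of conversations.
# For every line of a conversation's timing file, the lattices of the listed
# segments are concatenated, epsilon-removed, topologically sorted, converted
# to PLF and validated with checkplf.
# Inputs (paths are hard-coded below):
#   - the list of files in the conversations that have to be processed
#   - the symbol table (words.txt)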
from __future__ import print_function
import os
import sys
import subprocess
# ---------------------------------------------------------------------------
# Locations
# ---------------------------------------------------------------------------
# Prefix under which the per-segment lattices (<segment>.lat) are looked up
latticeLocation = 'latjosh-bmmi/lattices-pushed/'
# Scratch directory for intermediate lattices / FSTs / PLFs
tmpdir = 'data/local/data/tmp/bmmi-t/lattmp'
# FSTs whose PLF did not pass validation are copied here for inspection
invalidplfdir = 'data/local/data/tmp/bmmi-t/invalidplf'
# Symbol table handed to the FST -> PLF conversion script
symtable = '/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-matt/data/lang/words.clean.txt'

# ---------------------------------------------------------------------------
# Input list and output files (kept open for the whole run)
# ---------------------------------------------------------------------------
conversationList = open(
    '/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/test')
provFile = open(
    '/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/asr.test.plf',
    'w+')
invalidPLF = open(
    '/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/invalidPLF',
    'w+')
blankPLF = open(
    '/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/blankPLF',
    'w+')
rmLines = open(
    '/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/bmmi-t/removeLines',
    'w+')

# ---------------------------------------------------------------------------
# Working directories
# ---------------------------------------------------------------------------
if not os.path.exists(tmpdir):
    os.makedirs(tmpdir)

if os.path.exists(invalidplfdir):
    # Left over from a previous run: empty it so only this run's files remain
    os.system("rm " + invalidplfdir + "/*")
else:
    os.makedirs(invalidplfdir)
def latticeConcatenate(lat1, lat2, workDir=None):
    '''
    Concatenates lattices, writes temporary results to the working directory

    lat1    : path of the lattice built so far, or "" when lat2 is the first
              lattice of a new chain
    lat2    : path of the lattice to append
    workDir : directory that holds the intermediate 'tmp.lat'; defaults to
              the module-level tmpdir

    Returns lat2 unchanged when lat1 is "", otherwise the path of the
    concatenated lattice (<workDir>/tmp.lat).
    '''
    if workDir is None:
        workDir = tmpdir
    mergedPath = workDir + '/tmp.lat'

    if lat1 == "":
        # A new chain starts here: discard the result of the previous chain.
        # Done in-process and only when the file is really there, so that no
        # shell is spawned and no "cannot remove" error is printed on the
        # first chain.
        if os.path.isfile(mergedPath) or os.path.islink(mergedPath):
            os.remove(mergedPath)
        return lat2

    # lat1 may be mergedPath itself (third and later segments of a chain),
    # i.e. fstconcat is asked to overwrite one of its own inputs.
    status = subprocess.call(['fstconcat', lat1, lat2, mergedPath])
    if status != 0:
        # The exit status used to be dropped silently; keep going as before
        # but make the failure visible.
        sys.stderr.write("fstconcat exited with status {} for {} + {}\n".format(
            status, lat1, lat2))
    return mergedPath
def findLattice(timeDetail, latticeDir=None):
    '''
    Finds the lattice corresponding to a time segment

    timeDetail : identifier of the time segment; the lattice is expected in
                 a file named <timeDetail>.lat
    latticeDir : prefix (with trailing separator) prepended to the file
                 name; defaults to the module-level latticeLocation

    Returns the path of the lattice if it is an existing regular file,
    otherwise -1 (callers test the result against -1).
    '''
    if latticeDir is None:
        latticeDir = latticeLocation
    # Plain concatenation, exactly like the default prefix is used
    latticePath = latticeDir + timeDetail + '.lat'
    if os.path.isfile(latticePath):
        return latticePath
    return -1
def _firstOutputLine(command):
    '''
    Runs a shell command and returns the first line it writes to stdout
    (text, trailing newline included; "" when there is no output)
    '''
    # universal_newlines=True makes the pipe deliver text on Python 2 and 3
    # alike. Without it Python 3 hands back bytes, which can neither be
    # written to the text-mode output files nor be equal to a str message.
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True,
                            universal_newlines=True)
    firstLine = proc.stdout.readline()
    # Drain the rest of the output and reap the child, so that no zombie
    # process / open pipe is left behind for every utterance
    proc.communicate()
    return firstLine


# Now read list of files in conversations
fileList = []
for line in conversationList:
    line = line.strip()
    if not line:
        # A blank line names no conversation
        continue
    # Drop the last four characters (presumably the file extension)
    fileList.append(line[:-4])
conversationList.close()

# IN what order were the conversations added to the spanish files?
# Now get timing information to concatenate the ASR outputs
lineNo = 1
try:
    for item in fileList:
        with open('/export/a04/gkumar/corpora/fishcall/fisher/tim/' + item + '.es') as timingFile:
            for timingLine in timingFile:
                timeInfo = timingLine.split()

                if not timeInfo:
                    # Blank timing line: there is no segment ID to report and
                    # no PLF is produced, so only flag the line for removal
                    # (this used to crash on timeInfo[0]).
                    # NOTE(review): assumes such a line still counts as a line
                    # of the parallel files - confirm
                    rmLines.write("{}\n".format(lineNo))
                    lineNo += 1
                    continue

                # For utterances that are concatenated in the translation file,
                # the corresponding FSTs have to be concatenated as well
                mergedTranslation = ""
                for timeDetail in timeInfo:
                    tmp = findLattice(timeDetail)
                    if tmp != -1:
                        # Concatenate lattices
                        mergedTranslation = latticeConcatenate(mergedTranslation, tmp)
                print(mergedTranslation)

                if mergedTranslation != "":
                    # Sanjeev's Recipe : Remove epsilons and topo sort
                    finalFST = tmpdir + "/final.fst"
                    os.system("fstrmepsilon " + mergedTranslation + " | fsttopsort - " + finalFST)

                    # Now convert to PLF
                    PLFline = _firstOutputLine('/export/a04/gkumar/corpora/fishcall/bin/fsm2plf.sh ' + symtable + ' ' + finalFST)
                    finalPLFFile = tmpdir + "/final.plf"
                    with open(finalPLFFile, "w+") as finalPLF:
                        finalPLF.write(PLFline)

                    # now check if this is a valid PLF, if not write its ID in
                    # a file so it can be checked later. Only the second line
                    # of the checker's output is looked at.
                    checkResult = _firstOutputLine("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'")
                    print("{} {}".format(checkResult, lineNo))
                    if checkResult.strip() != "PLF format appears to be correct.":
                        # Keep the offending FST under the ID of the first segment
                        os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0])
                        invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n")
                        rmLines.write("{}\n".format(lineNo))
                    else:
                        provFile.write(PLFline)
                else:
                    # None of the segments on this line has a lattice
                    blankPLF.write(timeInfo[0] + "\n")
                    rmLines.write("{}\n".format(lineNo))

                # One timing line done; numbering runs across all conversations
                lineNo += 1
finally:
    # Flush and release the output files even if a conversation fails
    provFile.close()
    invalidPLF.close()
    blankPLF.close()
    rmLines.close()