Blame view
egs/fisher_callhome_spanish/s5/local/train_get_1_best.py
3.09 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
#!/usr/bin/env python
# Copyright 2014  Gaurav Kumar.   Apache 2.0
"""Write 1-best ASR transcripts for each training conversation.

For every conversation named in the conversation list, the matching timing
file is read line by line.  Each whitespace-separated key on a timing line is
looked up in the per-job transcription files, the integer word ids found are
mapped to words through ``words.txt``, and the resulting text is written both
to a per-conversation file and to one combined file.
"""

import os
import sys

# Transcription files of the ten decoding jobs (j-1 .. j-10).  Each line is a
# key followed by integer word ids.
TRA_PATHS = [
    '/export/a04/gkumar/kaldi-trunk/egs/fishcall_es/j-%d/exp/tri5a/decode_test/scoring/13.tra' % job
    for job in range(1, 11)
]

# Word table: each line is "<word> <integer id>".
WORDS_PATH = 'exp/tri5a/graph/words.txt'

# List of conversation file names, one per line.  For the provisional dev
# split, /export/a04/gkumar/corpora/fishcall/joshkal-splits/provisional_dev
# was used instead.
CONVERSATION_LIST_PATH = '/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/train'

# Directory holding one "<conversation>.es" timing file per conversation.
TIMING_DIR = '/export/a04/gkumar/corpora/fishcall/fisher/tim'

# Directory receiving one "<conversation>.es" output file per conversation.
ONE_BEST_DIR = 'exp/tri5a/one-best/train'

# Combined output over all conversations.  For the provisional dev split,
# /export/a04/gkumar/corpora/fishcall/fisher_provisional_dev.es was used.
COMBINED_OUTPUT_PATH = '/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt/asr.train'

# Lazily-built key -> transcription table used by findTranscription() when
# the caller does not supply one.
_transcriptionCache = None


def loadTranscriptions(paths):
    """Read transcription files into a dict mapping key -> word-id string.

    paths: iterable of file names, each line being a key followed by
        whitespace-separated word ids.

    Returns a dict whose values are the ids re-joined with single spaces.
    When a key occurs more than once, the first occurrence (in ``paths``
    order, then line order) wins, which is what a sequential scan of the
    files would return.  Blank lines are ignored.
    """
    table = {}
    for path in paths:
        with open(path) as traFile:
            for line in traFile:
                lineComp = line.split()
                if not lineComp:
                    continue
                table.setdefault(lineComp[0], " ".join(lineComp[1:]))
    return table


def findTranscription(timeDetail, transcriptions=None):
    """Return the word-id string stored for key ``timeDetail``.

    transcriptions: optional dict as returned by loadTranscriptions().  When
        omitted, the table for TRA_PATHS is loaded once and reused.

    Returns -1 when the key is unknown (an empty string is a valid result
    for a key that has no word ids).
    """
    global _transcriptionCache
    if transcriptions is None:
        if _transcriptionCache is None:
            _transcriptionCache = loadTranscriptions(TRA_PATHS)
        transcriptions = _transcriptionCache
    return transcriptions.get(timeDetail, -1)


def loadWords(path):
    """Read a "<word> <id>" table and return a dict mapping int id -> word.

    Blank lines are ignored.
    """
    words = {}
    with open(path) as wordsFile:
        for line in wordsFile:
            lineComp = line.split()
            if not lineComp:
                continue
            words[int(lineComp[1])] = lineComp[0]
    return words


def readConversationList(path):
    """Return the conversation names listed in ``path``.

    Each non-blank line is stripped of surrounding whitespace and of its last
    four characters (presumably a file extension such as ".sph" -- verify
    against the list file).
    """
    fileList = []
    with open(path) as conversationList:
        for line in conversationList:
            line = line.strip()
            if not line:
                continue
            fileList.append(line[:-4])
    return fileList


def mergeTranscription(timeInfo, transcriptions, words):
    """Concatenate the transcriptions of several keys into one text line.

    timeInfo: list of keys taken from one timing-file line.
    transcriptions: dict as returned by loadTranscriptions().
    words: dict as returned by loadWords().

    Keys without a transcription are skipped.  Raises KeyError if a word id
    is missing from ``words``.
    """
    wordIds = []
    for timeDetail in timeInfo:
        tmp = findTranscription(timeDetail, transcriptions)
        if tmp != -1:
            wordIds.extend(tmp.split())
    return " ".join(words[int(x)] for x in wordIds)


def writeOneBest(fileList, transcriptions, words, timingDir=TIMING_DIR,
                 outDir=ONE_BEST_DIR, combinedPath=COMBINED_OUTPUT_PATH):
    """Write the per-conversation and combined 1-best files.

    For every name in ``fileList`` the file ``<timingDir>/<name>.es`` is read
    and ``<outDir>/<name>.es`` is written; every line is also appended to
    ``combinedPath``.  ``outDir`` is created when it does not exist.
    """
    # In what order were the conversations added to the Spanish files?
    # TODO: Make sure they match the order in which these English files are
    # being written.
    if not os.path.exists(outDir):
        os.makedirs(outDir)

    with open(combinedPath, 'w') as provFile:
        for item in fileList:
            timingPath = os.path.join(timingDir, item + '.es')
            newPath = os.path.join(outDir, item + '.es')
            with open(timingPath) as timingFile, open(newPath, 'w') as newFile:
                for line in timingFile:
                    text = mergeTranscription(line.split(), transcriptions, words)
                    # One output line per timing line, so the output stays
                    # line-aligned with the timing file.
                    # NOTE(review): the whitespace-collapsed source showed " "
                    # as the separator here; a newline is assumed to be the
                    # intended value -- confirm against the upstream script.
                    newFile.write(text + "\n")
                    provFile.write(text + "\n")


def main():
    """Run the full pipeline with the paths configured at the top of the file."""
    transcriptions = loadTranscriptions(TRA_PATHS)
    words = loadWords(WORDS_PATH)
    fileList = readConversationList(CONVERSATION_LIST_PATH)
    writeOneBest(fileList, transcriptions, words)


if __name__ == "__main__":
    main()