Blame view
egs/callhome_egyptian/s5/local/splits/get_conversation.py
1.6 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
#!/usr/bin/env python
"""Count timestamped speaker lines per ``.scr`` transcript, grouped by split.

Walks ``transdir``, and for every file whose name contains ``.scr`` counts
the lines that begin with ``pattern`` (two decimal numbers followed by a
speaker label ``A`` or ``B``).  The counts are stored per file for the
``train``, ``devtest`` and ``evaltest`` directories and printed together
with per-split and corpus-wide totals.  The printed reports label these
counts "Conversations".
"""

from __future__ import print_function
import os
import re

transdir = '/export/a04/gkumar/corpora/ECA/callhome/LDC97T19/callhome_arabic_trans_970711/transcrp'

# Matched at the start of a line: "<number>.<digits> <number>.<digits> <A|B>".
pattern = re.compile(r"\d+\.\d*\s\d+\.\d*\s[AB]")


def countPatternLines(path):
    """Return how many lines of the file at *path* start with ``pattern``.

    The file is opened in text mode with the platform default encoding and
    is always closed, even if reading raises.
    """
    with open(path) as trans:
        return sum(1 for line in trans if pattern.match(line))


def tallySplits(top):
    """Walk *top* and count pattern lines for each ``.scr`` file, per split.

    Files are expected to sit at ``<top>/<split>/<subdir>/<name>``.  The
    first path component below *top* selects the split and the third one is
    used as the dictionary key.  For the default ``transdir`` these are
    components 10 and 12 of the absolute path, but reading them relative to
    *top* keeps the lookup valid for any root directory.

    Returns:
        A pair ``(perFile, totals)``; both are dicts keyed by ``'train'``,
        ``'devtest'`` and ``'evaltest'``.  ``perFile[split]`` maps the key
        described above to that file's count; ``totals[split]`` is the sum
        of the counts of every file visited in that split.

    Raises:
        IndexError: if a matching file sits fewer than two directories
            below a split directory, so there is no third component.
    """
    perFile = {'train': {}, 'devtest': {}, 'evaltest': {}}
    totals = {'train': 0, 'devtest': 0, 'evaltest': 0}

    for root, _, files in os.walk(top):
        for f in files:
            # Substring test kept as-is: any name containing ".scr" counts.
            if ".scr" not in f:
                continue

            fullpath = os.path.join(root, f)
            components = os.path.relpath(fullpath, top).split(os.sep)

            split = components[0]
            if split not in perFile:
                continue

            numberOfConversations = countPatternLines(fullpath)
            perFile[split][components[2]] = numberOfConversations
            # Accumulate per visited file so the total is right even if two
            # files end up sharing the same dictionary key.
            totals[split] += numberOfConversations

    return perFile, totals


_perFile, _totals = tallySplits(transdir)

# Module-level results, same names as before.
devtest = _perFile['devtest']
evaltest = _perFile['evaltest']
train = _perFile['train']

devConv = _totals['devtest']
testConv = _totals['evaltest']
trainConv = _totals['train']

print("==============Train===============")
print(train)
print("Total Conversations in train = {}".format(trainConv))
print("==============Dev===============")
print(devtest)
print("Total Conversations in dev = {}".format(devConv))
print("==============Test===============")
print(evaltest)
print("Total Conversations in test = {}".format(testConv))
print("=================================")
print("Total Conversations in Corpus = {}".format(trainConv + devConv + testConv))