Blame view

egs/callhome_egyptian/s5/local/splits/get_conversation.py 1.6 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
  #!/usr/bin/env python
  
  from __future__ import print_function
  import os
  import re
  
  # Root of the transcript tree.  Its immediate sub-directories are expected to
  # be the data splits ('devtest', 'train', 'evaltest'); the .scr files are
  # looked for anywhere below each split directory.
  transdir = '/export/a04/gkumar/corpora/ECA/callhome/LDC97T19/callhome_arabic_trans_970711/transcrp'

  # A counted line starts with "<start> <end> <speaker>", e.g. "12.34 15.6 A".
  # The pattern is a bytes pattern because the files are read undecoded: only
  # this ASCII prefix is inspected, so the result does not depend on the
  # locale's default text encoding.
  # NOTE(review): the transcripts are presumably not UTF-8 -- confirm.
  pattern = re.compile(br"\d+\.\d*\s\d+\.\d*\s[AB]")


  def countConversations(path):
    """Return the number of lines in `path` that start with a timestamp pair
    followed by a speaker id (A or B).

    The file is opened in binary mode and closed on exit, even if reading
    fails part-way through.
    """
    with open(path, "rb") as trans:
      return sum(1 for line in trans if pattern.match(line))


  def collectSplitCounts(topdir):
    """Walk `topdir` and tally the .scr files found under each data split.

    The split is the first path component below `topdir`, so the result does
    not depend on how deep `topdir` itself sits in the filesystem.  Files
    outside the three known splits, and files not ending in ".scr", are
    ignored.

    Returns a pair (tables, totals):
      tables -- {split: {file name: count}}
      totals -- {split: sum of the counts of every file seen in that split}
    """
    tables = {'devtest': {}, 'train': {}, 'evaltest': {}}
    totals = {'devtest': 0, 'train': 0, 'evaltest': 0}

    for root, _, files in os.walk(topdir):
      # relpath gives '.' for topdir itself, which is never a split name.
      split = os.path.relpath(root, topdir).split(os.sep)[0]
      if split not in tables:
        continue
      for f in files:
        if not f.endswith(".scr"):
          continue
        numberOfConversations = countConversations(os.path.join(root, f))
        tables[split][f] = numberOfConversations
        totals[split] += numberOfConversations

    return tables, totals


  splitTables, splitTotals = collectSplitCounts(transdir)

  devtest = splitTables['devtest']
  evaltest = splitTables['evaltest']
  train = splitTables['train']
  devConv = splitTotals['devtest']
  testConv = splitTotals['evaltest']
  trainConv = splitTotals['train']

  print("==============Train===============")
  print(train)
  print("Total Conversations in train = {}".format(trainConv))
  print("==============Dev===============")
  print(devtest)
  print("Total Conversations in dev = {}".format(devConv))
  print("==============Test===============")
  print(evaltest)
  print("Total Conversations in test = {}".format(testConv))
  print("=================================")
  print("Total Conversations in Corpus = {}".format(trainConv + devConv + testConv))