Blame view

egs/fisher_swbd/s5/local/map_acronyms_transcripts.py 1.81 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
  #!/usr/bin/env python
  
  # Copyright 2015  Minhua Wu
  # Apache 2.0
  
  # Convert acronyms in swbd transcripts to the fisher convention,
  # according to the first two columns of the input acronyms mapping.
  
  import argparse,re
  __author__ = 'Minhua Wu'
   
  # Matches a token that consists of exactly one capital ASCII letter.
  _SINGLE_CAPITAL = re.compile(r'^[A-Z]$')


  def _parse_args(argv=None):
      """Parse the command line (argparse reads sys.argv when argv is None)."""
      parser = argparse.ArgumentParser(description='format acronyms to a._b._c.')
      parser.add_argument('-i', '--input', help='Input transcripts', required=True)
      parser.add_argument('-o', '--output', help='Output transcripts', required=True)
      parser.add_argument('-M', '--Map', help='Input acronyms mapping', required=True)
      return parser.parse_args(argv)


  def _load_acronym_maps(map_path):
      """Read the tab-separated acronym mapping file at map_path.

      Only the first two columns of each line are used: the first is the
      token to look for, the second (stripped of surrounding whitespace) is
      its replacement.  Lines with fewer than two columns, such as blank
      lines, are skipped instead of raising IndexError.

      Returns a pair (dict_acronym, dict_acronym_noi): the full mapping and
      a copy of it without the keys 'i' and 'I'.
      """
      dict_acronym = {}
      with open(map_path, "r") as fin_map:
          for pair in fin_map:
              fields = pair.split('\t')
              if len(fields) < 2:
                  continue
              dict_acronym[fields[0]] = fields[1].strip()

      # Mapping of acronyms without I, i.  pop() with a default keeps this
      # from failing when the map file has no entry for one of them.
      dict_acronym_noi = dict(dict_acronym)
      for key in ('i', 'I'):
          dict_acronym_noi.pop(key, None)
      return dict_acronym, dict_acronym_noi


  def _map_tokens(items, dict_acronym, dict_acronym_noi):
      """Apply both mapping passes to the tokens of one line, in place.

      items is the whitespace-split transcript line (first token included).
      The same list object is returned for convenience.
      """
      total = len(items)

      # First pass: an 'i' token is only mapped when at least one directly
      # adjacent token is a single capital letter; then the whole contiguous
      # run (capitals on the left, the 'i', capitals on the right) is mapped
      # with the full dictionary.
      for pos in range(total):
          if items[pos] != 'i':
              continue

          left = 0
          while pos - 1 - left >= 0 and _SINGLE_CAPITAL.match(items[pos - 1 - left]):
              left += 1

          right = 0
          while pos + 1 + right < total and _SINGLE_CAPITAL.match(items[pos + 1 + right]):
              right += 1

          if left + right > 0:
              for k in range(pos - left, pos + right + 1):
                  # Tokens missing from the map are left unchanged rather
                  # than aborting the run with a KeyError.
                  items[k] = dict_acronym.get(items[k], items[k])

      # Second pass: map every remaining token, except 'i' and 'I', which
      # are absent from dict_acronym_noi and therefore never touched here.
      for pos, token in enumerate(items):
          if token in dict_acronym_noi:
              items[pos] = dict_acronym_noi[token]
      return items


  def main(argv=None):
      """Rewrite the input transcripts with acronyms mapped, one line each.

      For every non-blank input line the first token is written unchanged
      in case, followed by a space, the remaining (mapped) tokens joined by
      single spaces and lower-cased, and a newline.  Blank input lines are
      skipped because they have no first token to write.
      """
      args = _parse_args(argv)
      dict_acronym, dict_acronym_noi = _load_acronym_maps(args.Map)

      # 'with' closes both files even if a line fails part-way through.
      with open(args.input, "r") as fin_trans, open(args.output, "w") as fout_trans:
          for line in fin_trans:
              items = line.split()
              if not items:
                  continue
              _map_tokens(items, dict_acronym, dict_acronym_noi)
              sentence = ' '.join(items[1:])
              fout_trans.write(items[0] + ' ' + sentence.lower() + '\n')


  if __name__ == '__main__':
      main()