Blame view

egs/chime5/s5/local/json2text.py 2.81 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
  #!/usr/bin/env python3
  
  # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
  #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
  
  import json
  import argparse
  import logging
  import sys
  
  
  def hms_to_seconds(hms):
      """Convert an ``H:MM:SS[.ff]`` timestamp to a zero-padded centisecond string.

      The result is the total time expressed in units of 10 ms, formatted as a
      decimal integer left-padded with zeros to at least seven digits, e.g.
      ``'1:10:05.55'`` -> ``'0420555'`` (4205.55 s).

      Args:
          hms: Timestamp string with colon-separated hour, minute and second
              fields. The second field may carry a fractional part after a dot.

      Returns:
          The centisecond count as a string of at least seven digits.

      Raises:
          IndexError: If ``hms`` has fewer than three colon-separated fields.
          ValueError: If a field is not a valid integer.
      """
      fields = hms.split(':')
      hour, minute, second_field = fields[0], fields[1], fields[2]

      # Split "SS.ff" into whole seconds and the fractional digits; the
      # fraction is empty when the timestamp has no '.' at all.
      second, _, fraction = second_field.partition('.')

      # Normalise the fraction to exactly two digits (10 ms order): pad short
      # fractions with zeros on the right and truncate anything finer, so that
      # ".5" means 50 centiseconds and a missing fraction means 0.
      centiseconds = int((fraction + '00')[:2])

      # total seconds
      seconds = int(hour) * 3600 + int(minute) * 60 + int(second)

      return '{:07d}'.format(seconds * 100 + centiseconds)
  
  
  if __name__ == '__main__':
      # Command line: a JSON transcription file plus the microphone type whose
      # timestamps should be used for the utterance IDs.
      # NOTE(review): --mictype has no default and is not required; omitting it
      # reaches args.mictype.upper() below with None and raises AttributeError.
      parser = argparse.ArgumentParser()
      parser.add_argument('json', help='JSON transcription file')
      parser.add_argument('--mictype',
                          choices=['ref', 'worn', 'u01', 'u02', 'u03', 'u04', 'u05', 'u06'],
                          help='Type of microphones')
      args = parser.parse_args()

      # logging info
      log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s:%(message)s"
      logging.basicConfig(level=logging.INFO, format=log_format)

      # Debug-level message: not emitted with the INFO level configured above.
      logging.debug("reading %s", args.json)
      with open(args.json, 'rt', encoding="utf-8") as f:
          j = json.load(f)

      # Meta characters stripped from the transcription; built once and applied
      # in a single pass per segment.
      meta_chars = str.maketrans('', '', '".?,:;!')

      for x in j:
          # Segments containing redacted speech are dropped entirely.
          if '[redacted]' in x['words']:
              continue

          session_id = x['session_id']
          speaker_id = x['speaker']

          # Key used to look up this segment's start/end timestamps.
          if args.mictype == 'ref':
              mictype = x['ref']
          elif args.mictype == 'worn':
              mictype = 'original'
          else:
              mictype = args.mictype.upper()  # convert from u01 to U01

          # add location tag for scoring (only for dev and eval sets)
          location = x.get('location', 'NOLOCATION').upper()

          # remove meta chars, convert to lower and collapse multiple spaces
          words = " ".join(x['words'].translate(meta_chars).lower().split())

          # convert to zero-padded centisecond strings,
          # e.g., 1:10:05.55 -> (3600 + 600 + 5) s and 55 cs -> '0420555'
          start_time = hms_to_seconds(x['start_time'][mictype])
          end_time = hms_to_seconds(x['end_time'][mictype])

          uttid = speaker_id + '_' + session_id
          if args.mictype != 'worn':
              uttid += '_' + mictype
          uttid += '_' + location + '-' + start_time + '-' + end_time

          # Only emit segments with a positive duration. Compare numerically:
          # a plain string comparison is only correct while both values have
          # the same number of digits.
          if int(end_time) > int(start_time):
              sys.stdout.buffer.write((uttid + ' ' + words + '\n').encode("utf-8"))