json2text.py 2.81 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84


#!/usr/bin/env python3

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import json
import argparse
import logging
import sys


def hms_to_seconds(hms):
    """Convert an ``H:MM:SS[.ff]`` timestamp to a zero-padded 10 ms count.

    Despite the name, the result is not in seconds: it is the total time
    expressed in units of 10 ms (centiseconds), rendered as a decimal string
    padded with leading zeros to at least seven digits,
    e.g. ``'1:10:05.55'`` -> ``'0420555'``.

    The fractional part is normalised to exactly two digits before use: a
    missing or short fraction is right-padded with zeros (``'5.5'`` means
    550 ms, not 55 ms) and extra digits beyond 10 ms resolution are
    truncated.

    Args:
        hms: timestamp string with hour, minute and second fields separated
            by ``':'``; the second field may carry a ``'.'`` fraction.

    Returns:
        The time in 10 ms units as a string of at least seven digits.

    Raises:
        IndexError: if ``hms`` has fewer than three ``':'``-separated fields.
        ValueError: if any field is not a decimal integer.
    """
    fields = hms.split(':')
    hour, minute, second = fields[0], fields[1], fields[2]

    # split "SS.ff" into whole seconds and the (possibly empty) fraction
    whole, _, fraction = second.partition('.')

    # force the fraction to exactly two digits (10 ms order)
    ms10 = (fraction + '00')[:2]

    # total whole seconds
    seconds = int(hour) * 3600 + int(minute) * 60 + int(whole)

    return '{:07d}'.format(seconds * 100 + int(ms10))


if __name__ == '__main__':
    # Read a JSON transcription file and write one line per utterance to
    # stdout in the form "<uttid> <normalised words>", UTF-8 encoded.
    parser = argparse.ArgumentParser()
    parser.add_argument('json', help='JSON transcription file')
    # The microphone type selects which per-utterance timestamps are used,
    # so reject a missing option up front instead of failing later on None.
    parser.add_argument('--mictype', required=True,
                        choices=['ref', 'worn', 'u01', 'u02', 'u03', 'u04', 'u05', 'u06'],
                        help='Type of microphones')
    args = parser.parse_args()

    # logging info
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s:%(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)

    logging.debug("reading %s", args.json)
    with open(args.json, 'rt', encoding="utf-8") as f:
        j = json.load(f)

    # meta characters stripped from the transcription, deleted in one pass
    meta_chars = str.maketrans('', '', '".?,:;!')

    for x in j:
        # utterances containing redacted words are dropped entirely
        if '[redacted]' in x['words']:
            continue

        session_id = x['session_id']
        speaker_id = x['speaker']
        if args.mictype == 'ref':
            # the entry itself names the microphone key to use
            mictype = x['ref']
        elif args.mictype == 'worn':
            mictype = 'original'
        else:
            mictype = args.mictype.upper()  # convert from u01 to U01

        # add location tag for scoring (only for dev and eval sets)
        if 'location' in x:
            location = x['location'].upper()
        else:
            location = 'NOLOCATION'

        # start/end times are keyed by microphone type
        start_time = x['start_time'][mictype]
        end_time = x['end_time'][mictype]

        # remove meta chars and convert to lower
        words = x['words'].translate(meta_chars).lower()

        # remove multiple spaces
        words = " ".join(words.split())

        # convert to zero-padded 10 ms counts, e.g., 1:10:05.55 -> 0420555
        start_time = hms_to_seconds(start_time)
        end_time = hms_to_seconds(end_time)

        # uttid: <speaker>_<session>[_<mictype>]_<location>-<start>-<end>
        uttid = speaker_id + '_' + session_id
        if args.mictype != 'worn':
            uttid += '_' + mictype
        uttid += '_' + location + '-' + start_time + '-' + end_time

        # Compare numerically: the times are digit strings, and a plain
        # string comparison is only correct while both have equal width.
        if int(end_time) > int(start_time):
            sys.stdout.buffer.write((uttid + ' ' + words + '\n').encode("utf-8"))