hub4_utils.py
5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Copyright 2016 Vimal Manohar
# Apache 2.0.
"""This module contains utilities for preparing the HUB4 broadcast news
evaluation corpora.
"""
import os
import re
import sys
def parse_uem_line(reco, line):
    """Convert one line of a UEM file into a Kaldi segments line.

    The format of UEM is
        <file-id> <channel> <start-time> <end-time>
    The channel is required to be 1.

    Args:
        reco: Recording-id to use in the output. If None, the file-id
            (first field of the line) is used as the recording-id.
        line: A single line read from the UEM file.

    Returns:
        A string "<utt-id> <reco> <start-time> <end-time>", where the utt-id
        is "<reco>-<start>-<end>" with the times expressed in zero-padded
        centiseconds; or None if the line is empty or a ";;" comment.

    Raises:
        TypeError: If the channel field is not "1".
    """
    line = line.strip()
    if not line or line.startswith(";;"):
        return None

    parts = line.split()
    if reco is None:
        reco = parts[0]

    # The channel ID is expected to be 1.
    if parts[1] != "1":
        raise TypeError("Invalid line {0}".format(line))

    start_time = float(parts[2])
    end_time = float(parts[3])

    # Round before converting to int: a plain int() truncates, and products
    # such as 0.29 * 100 evaluate to 28.999999999999996, which would yield
    # an utterance-id that is one centisecond off from the actual time.
    start_cs = int(round(start_time * 100))
    end_cs = int(round(end_time * 100))

    utt = "{0}-{1:06d}-{2:06d}".format(reco, start_cs, end_cs)
    return "{0} {1} {2} {3}".format(utt, reco, start_time, end_time)
def parse_cmu_seg_line(line, prepend_reco_to_spk=False):
    """Parse one line of the CMU automatic segmentation of a recording.

    The CMU segmentation has the following format:
        <file> <channel> <speaker> <start-time> <end-time> <condition>
    e.g.:
        h4e_98_1 1 F0-0000 0.00 28.22 F0
    The channel is required to be 1 and the file-id is taken to be the
    recording-id.

    Args:
        line: A single line read from the segmentation file.
        prepend_reco_to_spk: If True, the speaker-id becomes
            "<reco>-<speaker>" and the utt-id is
            "<reco>-<speaker>-<start>-<end>". Otherwise the speaker-id is
            kept as is and the utt-id is "<speaker>-<reco>-<start>-<end>".

    Returns:
        A tuple (segment_line, utt2spk_line), where segment_line is
        "<utt-id> <reco> <start-time> <end-time>" with times printed to three
        decimals and utt2spk_line is "<utt-id> <speaker-id>"; or None if the
        line is empty or a ";;" comment. Times in the utt-id are zero-padded
        centiseconds.

    Raises:
        TypeError: If the channel field is not "1".
    """
    line = line.strip()
    if not line or line.startswith(";;"):
        return None

    parts = line.split()

    # Actually a file, but we assume a 1-1 mapping to recording and require
    # the channel to be 1.
    reco = parts[0]

    # The channel ID is expected to be 1.
    if parts[1] != "1":
        raise TypeError("Invalid line {0}".format(line))

    spk = parts[2]
    start_time = float(parts[3])
    end_time = float(parts[4])

    # Round before converting to int: a plain int() truncates, and products
    # such as 0.29 * 100 evaluate to 28.999999999999996, which would yield
    # an utterance-id that is one centisecond off from the actual time.
    start_cs = int(round(start_time * 100))
    end_cs = int(round(end_time * 100))

    if prepend_reco_to_spk:
        spk = reco + '-' + spk
        utt = "{spk}-{0:06d}-{1:06d}".format(start_cs, end_cs, spk=spk)
    else:
        utt = "{spk}-{reco}-{0:06d}-{1:06d}".format(start_cs, end_cs,
                                                    reco=reco, spk=spk)

    segment_line = "{0} {1} {st:.3f} {end:.3f}".format(
        utt, reco, st=start_time, end=end_time)
    utt2spk_line = "{0} {1}".format(utt, spk)

    return (segment_line, utt2spk_line)
def normalize_csr_transcript(text, noise_word, spoken_noise_word):
    """Normalize a broadcast news transcript for audio.

    The text is upper-cased, annotation markup is stripped or replaced, and
    every remaining word is reduced to the characters A-Z, a-z, 0-9,
    period, apostrophe, space, underscore and hyphen.

    Args:
        text: The raw transcript text.
        noise_word: Token substituted for bracketed non-speech events.
        spoken_noise_word: Token substituted for the speaker-made noises
            [INHALING], [COUGH], [THROAT_CLEARING] and [SIGN].

    Returns:
        The normalized transcript, with words joined by single spaces.
    """
    text = text.upper()

    # Remove long event markings, i.e. "[event/]" and "[/event]"
    text = re.sub(r"\[[^]/]+/\]|\[/[^]/]+\]", "", text)
    # Remove comments
    text = re.sub(r"\{\{[^}]*\}\}", "", text)
    # Replace alternative words with a single one (second alternative)
    text = re.sub(r"\{[^}/]+/([^}/]+)[^}]*\}", r"\1", text)
    # Remove partial word completions
    text = re.sub(r"\([^)]+\)-|-\([^)]+\)", "-", text)
    # Remove accent marks and diacritics
    text = re.sub(r"\\[3-8]", "", text)
    # Remove unclear speech markings
    text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text)
    # Remove overlapped speech markings
    text = re.sub(r"#", "", text)
    # Remove invented word markings
    text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)

    # Replace speaker-made noises with the spoken-noise word. The replacement
    # is given as a callable so that any backslash in the caller-supplied
    # token is inserted literally instead of being parsed as a regex escape.
    # NOTE(review): "[SIGN]" may have been meant to be "[SIGH]" -- confirm
    # against the corpus annotation guidelines before changing.
    text = re.sub(r"\[INHALING\]|\[COUGH\]|\[THROAT_CLEARING\]|\[SIGN\]",
                  lambda _match: spoken_noise_word, text)
    # Replace any remaining bracketed event with the noise word
    text = re.sub(r"\[[^]]+\]", lambda _match: noise_word, text)

    # Strip the '+' delimiters around a span, keeping the enclosed text
    text = re.sub(r"\+([^+]+)\+", r"\1", text)
    # Remove periods after letter.
    text = re.sub(r"([A-Z])\.( |$)", r"\1 ", text)
    # Replace \. with . (the period must be escaped in the pattern; an
    # unescaped '.' would turn a backslash followed by ANY character into '.')
    text = re.sub(r"\\\.", ".", text)

    words = []
    for word in text.split():
        # Noise tokens are passed through untouched
        if word in (spoken_noise_word, noise_word):
            words.append(word)
            continue
        # Remove mispronunciation markers
        word = re.sub(r"^@(\w+)$", r"\1", word)
        # Remove everything other than the standard ASCII symbols
        word = re.sub("[^A-Za-z0-9.' _-]", "", word)
        words.append(word)

    return " ".join(words)
def remove_punctuations(text):
    """Remove punctuations and do some other cleanup on a text sentence.

    Args:
        text: The input text; may contain newlines and SGML-style entities
            such as "&amp;".

    Returns:
        The text with newlines turned into spaces, entities, double dashes,
        quotation marks and parentheses removed, all characters other than
        A-Z, a-z, 0-9, period, apostrophe, space, underscore and hyphen
        dropped, runs of periods collapsed, isolated '.' and '-' removed and
        runs of spaces squeezed to one. Leading and trailing spaces are not
        stripped.
    """
    # Remove HTML new lines that are not end of sentences
    text1 = re.sub("\n", " ", text)

    # Remove some markers like double dash that are normally used to separate
    # name titles in newspapers.
    text1 = re.sub(r"(&[^;]+;|--)", " ", text1)

    # Remove quotation marks
    text1 = re.sub(r"''|``|\(|\)", " ", text1)

    # Remove everything other than the standard ASCII symbols
    text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1)

    # Replace multiple .'s with a single one. The trailing space is kept so
    # that the following word is not glued onto the period ("a... b" must
    # become "a. b", not "a.b") and an isolated "..." can be removed below.
    text1 = re.sub(r"\.[.]+ ", ". ", text1)

    # Remove isolated '.' and isolated '-'. The trailing space is matched
    # with a lookahead so that it stays available as the leading space of
    # the next isolated token ("a . . b" -> "a b").
    text1 = re.sub(r" \.(?= )", "", text1)
    text1 = re.sub(r" -(?= )", "", text1)

    # Replace multiple spaces with single.
    text1 = re.sub(r"[ ]+", " ", text1)

    return text1