Blame view

egs/hub4_english/s5/local/data_prep/hub4_utils.py 5 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
  # Copyright 2016    Vimal Manohar
  # Apache 2.0.
  
  """This module contains utilities for preparing the HUB4 broadcast news
  evaluation corpora.
  """
  
  import os
  import re
  import sys
  
  
  def parse_uem_line(reco, line):
      """Convert one UEM entry for recording 'reco' into a kaldi segments entry.

      A UEM entry has the form
      <file-id> <channel> <start-time> <end-time>

      Blank lines and lines beginning with ';;' yield None.  The channel
      field has to be "1", otherwise a TypeError is raised.  If 'reco' is
      None, the file-id of the entry is used as the recording-id.

      The returned string is "<utt-id> <reco> <start-time> <end-time>", where
      the utt-id is the recording-id followed by the start and end times in
      hundredths of a second (truncated), each zero-padded to six digits.
      """
      entry = line.strip()
      if not entry or entry[:2] == ";;":
          return None

      fields = entry.split()
      recording = fields[0] if reco is None else reco

      # Only channel 1 is accepted.
      if fields[1] != "1":
          raise TypeError("Invalid line {0}".format(entry))

      begin = float(fields[2])
      finish = float(fields[3])
      begin_cs = int(begin * 100)
      finish_cs = int(finish * 100)

      utt_id = "{0}-{1:06d}-{2:06d}".format(recording, begin_cs, finish_cs)
      return "{0} {1} {2} {3}".format(utt_id, recording, begin, finish)
  
  
  def parse_cmu_seg_line(line, prepend_reco_to_spk=False):
      """Convert one entry of the CMU automatic segmentation into kaldi lines.

      A CMU segmentation entry has the form
      <file> <channel> <speaker> <start-time> <end-time> <condition>

      e.g.:
      h4e_98_1 1 F0-0000     0.00    28.22 F0

      The file-id is used as the recording-id and the channel must be "1"
      (a TypeError is raised otherwise).  Blank lines and lines beginning
      with ';;' yield None.

      If 'prepend_reco_to_spk' is true, the speaker-id becomes
      <reco>-<speaker> and the utt-id is <speaker-id>-<start>-<end>;
      otherwise the utt-id is <speaker>-<reco>-<start>-<end>.  <start> and
      <end> are in hundredths of a second (truncated), zero-padded to six
      digits.

      Returns the tuple (segments line, utt2spk line).
      """
      entry = line.strip()
      if not entry or entry[:2] == ";;":
          return None

      fields = entry.split()

      # The first field is really a file name; it is treated as mapping 1-1
      # to a recording.
      recording = fields[0]

      # Only channel 1 is accepted.
      if fields[1] != "1":
          raise TypeError("Invalid line {0}".format(entry))

      speaker = fields[2]
      begin = float(fields[3])
      finish = float(fields[4])
      begin_cs = int(begin * 100)
      finish_cs = int(finish * 100)

      if not prepend_reco_to_spk:
          utt_id = "{spk}-{reco}-{0:06d}-{1:06d}".format(
              begin_cs, finish_cs, reco=recording, spk=speaker)
      else:
          speaker = recording + '-' + speaker
          utt_id = "{spk}-{0:06d}-{1:06d}".format(
              begin_cs, finish_cs, spk=speaker)

      segments_entry = "{0} {1} {st:.3f} {end:.3f}".format(
          utt_id, recording, st=begin, end=finish)
      utt2spk_entry = "{0} {1}".format(utt_id, speaker)
      return (segments_entry, utt2spk_entry)
  
  
  def normalize_csr_transcript(text, noise_word, spoken_noise_word):
      """Normalize a broadcast news transcript for audio.

      The text is upper-cased, transcription markup is removed or resolved,
      bracketed events are mapped to 'spoken_noise_word' / 'noise_word', and
      every other token is restricted to letters, digits, '.', "'", '_'
      and '-'.

      Arguments:
          text: the raw transcript text.
          noise_word: replacement for bracketed events that are not one of
              the speaker-made noises listed below (e.g. "<NOISE>").
          spoken_noise_word: replacement for the speaker-made noises
              [INHALING], [COUGH], [THROAT_CLEARING] and [SIGN].

      Returns:
          The normalized tokens joined by single spaces.  Tokens that end up
          empty after filtering are dropped.
      """
      text = text.upper()

      # Remove long event markings
      text = re.sub(r"\[[^]/]+/\]|\[/[^]/]+\]", "", text)
      # Remove comments
      text = re.sub(r"\{\{[^}]*\}\}", "", text)
      # Replace alternative words with a single one (second alternative)
      text = re.sub(r"\{[^}/]+/([^}/]+)[^}]*\}", r"\1", text)
      # Remove partial word completions
      text = re.sub(r"\([^)]+\)-|-\([^)]+\)", "-", text)
      # Remove accent marks and diacritics
      text = re.sub(r"\\[3-8]", "", text)

      # Remove unclear speech markings
      text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text)
      text = re.sub(r"#", "", text)   # Remove overlapped speech markings
      # Remove invented word markings
      text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
      # Replace speaker-made noises with the spoken noise word.  This must run
      # before the generic bracket rule below, which would otherwise claim them.
      text = re.sub(r"\[INHALING\]|\[COUGH\]|\[THROAT_CLEARING\]|\[SIGN\]",
                    spoken_noise_word, text)
      # Replace every remaining bracketed event with the noise word
      text = re.sub(r"\[[^]]+\]", noise_word, text)
      # Strip '+...+' markers, keeping the enclosed text
      text = re.sub(r"\+([^+]+)\+", r"\1", text)

      # Remove periods after letter.
      text = re.sub(r"([A-Z])\.( |$)", r"\1 ", text)
      # Replace \. with .  The period is escaped in the pattern so that only
      # an escaped period is rewritten; a backslash followed by any other
      # character is left for the per-word filter below, which drops just the
      # backslash and keeps the character.
      text = re.sub(r"\\\.", r".", text)

      words = []
      for word in text.split():
          # Noise words are passed through untouched so that characters such
          # as '<' and '>' in them survive the filtering.
          if word == spoken_noise_word or word == noise_word:
              words.append(word)
              continue

          # Remove mispronunciation brackets
          word = re.sub(r"^@(\w+)$", r"\1", word)
          # Remove everything other than the standard ASCII symbols
          word = re.sub("[^A-Za-z0-9.' _-]", "", word)
          # A token made only of removed symbols becomes empty; skip it so
          # the output has no doubled, leading or trailing spaces.
          if word:
              words.append(word)
      return " ".join(words)
  
  
  def remove_punctuations(text):
      """Remove punctuations and some other processing for text sentence.

      The following steps are applied in order:
        1. new lines are replaced by spaces;
        2. '&...;' entities and double dashes are replaced by spaces;
        3. the quotation marks '' and `` and parentheses are replaced by
           spaces;
        4. every character other than letters, digits, '.', "'", ' ', '_'
           and '-' is deleted;
        5. a run of two or more periods followed by a space is reduced to a
           single period followed by a space, and isolated periods are
           removed;
        6. isolated '-' are removed;
        7. runs of spaces are collapsed to a single space.

      Returns the processed text.  A single leading or trailing space may
      remain; the text is not stripped.
      """
      # Remove HTML new lines that are not end of sentences
      # NOTE(review): the pattern is written as the "\n" escape; confirm that
      # a plain line break is the intended match.
      text1 = re.sub("\n", " ", text)

      # Remove some markers like double dash that are normally used to separate
      # name titles in newspapers.
      text1 = re.sub(r"(&[^;]+;|--)", " ", text1)

      # Remove quotation marks
      text1 = re.sub(r"''|``|\(|\)", " ", text1)

      # Remove everything other than the standard ASCII symbols
      text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1)

      # Replace multiple .'s with single and then remove isolated '.'
      # The trailing space is kept in the replacement so that the neighbouring
      # words stay separated and a lone '.' can be matched by the next rule.
      text1 = re.sub(r"\.[.]+ ", ". ", text1)
      text1 = re.sub(r" \. ", " ", text1)

      # Remove isolated '-'
      text1 = re.sub(r" - ", " ", text1)

      # Replace multiple spaces with single.
      text1 = re.sub(r"[ ]+", " ", text1)

      return text1