Blame view
egs/hub4_english/s5/local/data_prep/hub4_utils.py
5 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
# Copyright 2016  Vimal Manohar
# Apache 2.0.

"""This module contains utilities for preparing the HUB4 broadcast news
evaluation corpora.
"""

import os
import re
import sys


def _centiseconds(seconds):
    """Return 'seconds' as the nearest whole number of hundredths."""
    # Round instead of truncating: 0.29 * 100 evaluates to
    # 28.999999999999996 in binary floating point, so a bare int() would
    # produce 28 and the utterance-id would be off by one.
    return int(round(seconds * 100))


def parse_uem_line(reco, line):
    """Convert one line of a UEM file to a kaldi segments-format line.

    The format of UEM is
    <file-id> <channel> <start-time> <end-time>

    We force the channel to be 1 and take the file-id to be the
    recording-id.

    Args:
        reco: recording-id to use; if None, the <file-id> field of the
            line is used instead.
        line: one raw line of the UEM file.

    Returns:
        None for an empty line or a ';;' comment line; otherwise the
        string "<utt-id> <reco> <start-time> <end-time>", where the
        utt-id is "<reco>-<start>-<end>" with both times expressed in
        zero-padded hundredths of a second.

    Raises:
        TypeError: if the channel field is not "1".
    """
    line = line.strip()
    if not line or line.startswith(";;"):
        return None

    parts = line.split()
    if reco is None:
        reco = parts[0]

    # The channel ID is expected to be 1.
    if parts[1] != "1":
        raise TypeError("Invalid line {0}".format(line))

    start_time = float(parts[2])
    end_time = float(parts[3])

    utt = "{0}-{1:06d}-{2:06d}".format(
        reco, _centiseconds(start_time), _centiseconds(end_time))
    return "{0} {1} {2} {3}".format(utt, reco, start_time, end_time)


def parse_cmu_seg_line(line, prepend_reco_to_spk=False):
    """Parse one line of the CMU automatic segmentation of a recording.

    The CMU segmentation has the following format:
    <file> <channel> <speaker> <start-time> <end-time> <condition>
    e.g.:
    h4e_98_1 1 F0-0000 0.00 28.22 F0

    We force the channel to be 1 and take the file-id to be the
    recording-id.

    Args:
        line: one raw line of the segmentation file.
        prepend_reco_to_spk: if True, the speaker-id becomes
            "<reco>-<speaker>" and the utt-id is
            "<reco>-<speaker>-<start>-<end>"; otherwise the speaker-id is
            left as is and the utt-id is "<speaker>-<reco>-<start>-<end>".

    Returns:
        None for an empty line or a ';;' comment line; otherwise a tuple
        (segment_line, utt2spk_line) where segment_line is
        "<utt-id> <reco> <start-time> <end-time>" (times with 3 decimals)
        and utt2spk_line is "<utt-id> <speaker-id>".

    Raises:
        TypeError: if the channel field is not "1".
    """
    line = line.strip()
    if not line or line.startswith(";;"):
        return None

    parts = line.split()

    # Actually a file, but we are assuming a 1-1 mapping to recording and
    # force the channel to be 1.
    reco = parts[0]

    # The channel ID is expected to be 1.
    if parts[1] != "1":
        raise TypeError("Invalid line {0}".format(line))

    spk = parts[2]
    start_time = float(parts[3])
    end_time = float(parts[4])

    start_cs = _centiseconds(start_time)
    end_cs = _centiseconds(end_time)

    if prepend_reco_to_spk:
        spk = reco + '-' + spk
        utt = "{spk}-{0:06d}-{1:06d}".format(start_cs, end_cs, spk=spk)
    else:
        utt = "{spk}-{reco}-{0:06d}-{1:06d}".format(
            start_cs, end_cs, reco=reco, spk=spk)

    segment_line = "{0} {1} {st:.3f} {end:.3f}".format(
        utt, reco, st=start_time, end=end_time)
    utt2spk_line = "{0} {1}".format(utt, spk)

    return (segment_line, utt2spk_line)


def normalize_csr_transcript(text, noise_word, spoken_noise_word):
    """Normalize broadcast news transcript for audio.

    Args:
        text: raw transcript text.
        noise_word: token substituted for bracketed noise markings.
        spoken_noise_word: token substituted for the speaker-made noise
            markings listed in the pattern below.

    Returns:
        The upper-cased transcript with the markup removed, as words
        separated by single spaces.
    """
    text = text.upper()

    # Remove long event markings
    text = re.sub(r"\[[^]/]+/\]|\[/[^]/]+\]", "", text)
    # Remove comments
    text = re.sub(r"\{\{[^}]*\}\}", "", text)
    # Replace alternative words with a single one (second alternative)
    text = re.sub(r"\{[^}/]+/([^}/]+)[^}]*\}", r"\1", text)
    # Remove partial word completions
    text = re.sub(r"\([^)]+\)-|-\([^)]+\)", "-", text)
    # Remove accent marks and diacritics
    text = re.sub(r"\\[3-8]", "", text)
    # Remove unclear speech markings
    text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text)
    # Remove overlapped speech markings
    text = re.sub(r"#", "", text)
    # Remove invented word markings
    text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
    # Replace speaker-made noises with <SPOKEN_NOISE>
    # NOTE(review): "[SIGN]" may be a typo for "[SIGH]" -- confirm against
    # the transcription conventions before changing it.
    text = re.sub(r"\[INHALING\]|\[COUGH\]|\[THROAT_CLEARING\]|\[SIGN\]",
                  spoken_noise_word, text)
    # Replace noise with <NOISE>
    text = re.sub(r"\[[^]]+\]", noise_word, text)
    text = re.sub(r"\+([^+]+)\+", r"\1", text)
    # Remove periods after letter.
    text = re.sub(r"([A-Z])\.( |$)", r"\1 ", text)
    # Replace \. with .
    # The dot must be escaped as well; an unescaped '.' would turn a
    # backslash followed by *any* character into a period.
    text = re.sub(r"\\\.", r".", text)

    words = []
    for word in text.split():
        # The noise tokens are passed through untouched so that the
        # character filter below cannot strip their delimiters.
        if word == spoken_noise_word or word == noise_word:
            words.append(word)
            continue
        # Remove mispronunciation brackets
        word = re.sub(r"^@(\w+)$", r"\1", word)
        # Remove everything other than the standard ASCII symbols
        word = re.sub("[^A-Za-z0-9.' _-]", "", word)
        words.append(word)
    return " ".join(words)


def remove_punctuations(text):
    """Remove punctuations and some other processing for text sentence.

    Args:
        text: a sentence of text.

    Returns:
        The text with markup entities, quotation marks, parentheses and
        non-ASCII symbols removed, runs of periods collapsed, isolated
        '.' and '-' dropped, and repeated spaces squeezed to one.  Leading
        and trailing spaces are not stripped.
    """
    # Remove HTML new lines that are not end of sentences
    # NOTE(review): as written this substitution replaces a space with a
    # space, i.e. it is a no-op; the pattern may have been mangled --
    # confirm what it was meant to match.
    text1 = re.sub(" ", " ", text)
    # Remove some markers like double dash that are normally used to
    # separate name titles in newspapers.
    text1 = re.sub(r"(&[^;]+;|--)", " ", text1)
    # Remove quotation marks
    text1 = re.sub(r"''|``|\(|\)", " ", text1)
    # Remove everything other than the standard ASCII symbols
    text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1)
    # Replace multiple .'s with single and then remove isolated '.'
    # The trailing space is kept in the replacement: dropping it would
    # glue the following word onto the period and hide isolated periods
    # from the next substitution.
    text1 = re.sub(r"\.[.]+ ", ". ", text1)
    text1 = re.sub(r" \. ", " ", text1)
    # Remove isolated '-'
    text1 = re.sub(r" - ", " ", text1)
    # Replace multiple spaces with single.
    text1 = re.sub(r"[ ]+", " ", text1)
    return text1