Blame view
egs/wsj/s5/steps/cleanup/internal/get_ctm_edits.py
16.8 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 |
#!/usr/bin/env python3 # Copyright 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) # Apache 2.0 from __future__ import print_function import sys, operator, argparse # Modify the CTM to include for each token the information from Levenshtein # alignment of 'hypothesis' and 'reference' # (i.e. the output of 'align-text'. # The information added to each token in the CTM is the reference word and one # of the following edit-types: # 'cor' = correct [note: as a special case we count as correct cases where # the hypothesis word is the OOV symbol and the reference # word is OOV w.r.t. the supplied vocabulary.] # 'sub' = substitution # 'del' = deletion # 'ins' = insertion # 'sil' = (silence in ctm; does not consume a reference word) # note: the script modify_ctm_edits.py will add the new # note: the following extra edit-type may be added by modify_ctm_edits.py: # 'fix' ... this is like 'cor', but it means the reference has been modified # to fix non-scoreable errors [typically errors that don't change the # meaning], so we don't trust the word or value it as much as a 'cor'. # # Note: Additional lines are added to the CTM to account for deletions. # Input CTM: # (note: the <eps> is for silence in the input CTM that comes from # optional-silence in the graph. However, the input edits don't have anything # for these silences. # We assume (and check) that the channel will always be '1', because the # input CTMs are expected to be 'per utterance', not including real # recording-ids. # Input ctm format: # <file-id> <channel> <start-time> <duration> <hyp-word> [<confidence>] # note, the confidence defaults to 1 if not provided (these # scripts don't actually use the confidence field). ## TimBrown_2008P-0007226-0007620 1 0.000 0.100 when ## TimBrown_2008P-0007226-0007620 1 0.100 0.090 i ## TimBrown_2008P-0007226-0007620 1 0.190 0.300 some ## TimBrown_2008P-0007226-0007620 1 0.490 0.110 when ## TimBrown_2008P-0007226-0007620 1 0.600 0.060 i ## TimBrown_2008P-0007226-0007620 1 0.660 0.190 say ## TimBrown_2008P-0007226-0007620 1 0.850 0.450 go ## TimBrown_2008P-0007226-0007620 1 1.300 0.310 [COUGH] ## TimBrown_2008P-0007226-0007620 1 1.610 0.130 you ## TimBrown_2008P-0007226-0007620 1 1.740 0.180 got ## TimBrown_2008P-0007226-0007620 1 1.920 0.370 thirty ## TimBrown_2008P-0007226-0007620 1 2.290 0.830 seconds ## TimBrown_2008P-0007226-0007620 1 3.120 0.330 <eps> ## TimBrown_2008P-0007226-0007620 1 3.450 0.040 [BREATH] ## TimBrown_2008P-0007226-0007620 1 3.490 0.110 to ## TimBrown_2008P-0007226-0007620 1 3.600 0.320 [NOISE] # Input Levenshtein edits : (the output of 'align-text' post-processed by 'wer_per_utt_details.pl') # AJJacobs_2007P-0001605-0003029 i i ; thought thought ; i'd i'd ; tell tell ; you you ; a a ; little little ; about about ; [UH] [UH] ; what what ; i i ; like like ; to to ; write write ; and and ; [UH] [UH] ; i i ; like like ; to to ; [UH] [UH] ; immerse immerse ; myself myself ; [SMACK] [SMACK] ; in in ; my my ; topics topics ; [UM] [UM] ; i i ; just just ; like like ; to to ; [UH] [UH] ; dive dive ; [SMACK] [SMACK] ; right right ; in in ; and and ; become become ; [UH] [UH] ; sort sort ; of of ; a a ; human human ; guinea guinea ; pig pig ; [BREATH] [BREATH] ; and and ; [UH] [UH] # AJJacobs_2007P-0003133-0004110 i i ; see see ; my my ; life life ; as as ; a a ; series series ; of of ; experiments experiments ; [BREATH] [BREATH] ; so so ; [UH] [UH] ; i i ; [NOISE] [NOISE] ; work work ; for for ; esquire esquire ; magazine magazine ; <eps> and ; a a ; couple couple ; of of ; years years ; ago ago ; [BREATH] [BREATH] ; i i ; wrote wrote ; an an ; article article ; called called ; [NOISE] [NOISE] ; my my ; outsourced outsourced ; life life # Output format: # <file-id> <channel> <start-time> <duration> <hyp-word> <confidence> <ref-word> <edit-type> # AJJacobs_2007P-0001605-0003029 1 0 0.09 <eps> 1.0 <eps> sil # AJJacobs_2007P-0001605-0003029 1 0.09 0.15 i 1.0 i cor # AJJacobs_2007P-0001605-0003029 1 0.24 0.25 thought 1.0 thought cor # AJJacobs_2007P-0001605-0003029 1 0.49 0.14 i'd 1.0 i'd cor # AJJacobs_2007P-0001605-0003029 1 0.63 0.22 tell 1.0 tell cor # AJJacobs_2007P-0001605-0003029 1 0.85 0.11 you 1.0 you cor # AJJacobs_2007P-0001605-0003029 1 0.96 0.05 a 1.0 a cor # AJJacobs_2007P-0001605-0003029 1 1.01 0.24 little 1.0 little cor # AJJacobs_2007P-0001605-0003029 1 1.25 0.5 about 1.0 about cor # AJJacobs_2007P-0001605-0003029 1 1.75 0.48 [UH] 1.0 [UH] cor # AJJacobs_2007P-0001605-0003029 1 2.23 0.34 <eps> 1.0 <eps> sil # AJJacobs_2007P-0001605-0003029 1 2.57 0.21 what 1.0 what cor # AJJacobs_2007P-0001605-0003029 1 2.78 0.1 i 1.0 i cor # AJJacobs_2007P-0001605-0003029 1 2.88 0.22 like 1.0 like cor # AJJacobs_2007P-0001605-0003029 1 3.1 0.13 to 1.0 to cor # AJJacobs_2007P-0001605-0003029 1 3.23 0.37 write 1.0 write cor # AJJacobs_2007P-0001605-0003029 1 3.6 0.03 <eps> 1.0 <eps> sil # AJJacobs_2007P-0001605-0003029 1 3.63 0.36 and 1.0 and cor parser = argparse.ArgumentParser( description = "Append to the CTM the Levenshtein alignment of 'hypothesis' and 'reference'; " "creates augmented CTM with extra fields (see script for details)") parser.add_argument("--oov", type = int, default = -1, help = "The integer representation of the OOV symbol; substitutions " "by the OOV symbol for out-of-vocabulary reference words are treated " "as correct, if you also supply the --symbol-table option.") parser.add_argument("--symbol-table", type = str, help = "The words.txt your system used; if supplied, it is used to " "determine OOV words (and such words will count as correct if " "substituted by the OOV symbol). See also the --oov option") # Required arguments parser.add_argument("edits_in", metavar = "<edits-in>", help = "Filename of output of 'align-text', which this program reads. " "Use /dev/stdin for standard input.") parser.add_argument("ctm_in", metavar = "<ctm-in>", help = "Filename of input hypothesis in ctm format") parser.add_argument("ctm_edits_out", metavar = "<ctm-edits-out>", help = "Filename of output (CTM appended with word-edit information)") args = parser.parse_args() def OpenFiles(): global ctm_edits_out, edits_in, ctm_in, symbol_table, oov_word try: ctm_edits_out = open(args.ctm_edits_out, 'w', encoding='utf-8') except: sys.exit("get_ctm_edits.py: error opening ctm-edits file {0} for output".format( args.ctm_edits_out)) try: edits_in = open(args.edits_in, encoding='utf-8') except: sys.exit("get_ctm_edits.py: error opening edits file {0} for input".format( args.edits_in)) try: ctm_in = open(args.ctm_in, encoding='utf-8') except: sys.exit("get_ctm_edits.py: error opening ctm file {0} for input".format( args.ctm_in)) symbol_table = set() oov_word = None if args.symbol_table != None: if args.oov == -1: print("get_ctm_edits.py: error: if you set the the --symbol-table option " "you must also set the --oov option", file = sys.stderr) try: f = open(args.symbol_table, 'r', encoding='utf-8') for line in f.readlines(): [ word, integer ] = line.split() if int(integer) == args.oov: oov_word = word symbol_table.add(word) except: sys.exit("get_ctm_edits.py: error opening symbol-table file {0} for " "input (or bad file), exception is: {1}".format(args.symbol_table)) f.close() if oov_word == None: sys.exit("get_ctm_edits.py: OOV word not found: check the values of " "--symbol-table={0} and --oov={1}".format(args.symbol_table, args.oov)) # This function takes two lists # edits_array = [ [ hyp_word1, ref_word1], [ hyp_word2, ref_word2 ], ... ] # ctm_array = [ [ start1, duration1, hyp_word1, confidence1 ], ... ] # # and pads them with new list elements so that the entries 'match up'. What we # are aiming for is that for each i, ctm_array[i][2] == edits_array[i][0]. The # reasons why this is not automatically true are: # # (1) There may be deletions in the hypothesis sequence, which would lead to # pairs like [ '<eps>', ref_word ]. # (2) The ctm may have been written 'with silence', which will lead to # ctm entries like [ 1, 7.8, 0.9, '<eps>' ] where the '<eps>' refers # to the optional-silence from the lexicon. # # We introduce suitable entries in to edits_array and ctm_array as necessary # to make them 'match up'. This function returns the pair (new_edits_array, # new_ctm_array). def PadArrays(edits_array, ctm_array): new_edits_array = [] new_ctm_array = [] edits_len = len(edits_array) ctm_len = len(ctm_array) edits_pos = 0 ctm_pos = 0 # current_time is the end of the last ctm segment we processesed. current_time = ctm_array[0][0] if ctm_len > 0 else 0.0 while edits_pos < edits_len or ctm_pos < ctm_len: if edits_pos < edits_len and ctm_pos < ctm_len and \ edits_array[edits_pos][0] == ctm_array[ctm_pos][2] and \ edits_array[edits_pos][0] != '<eps>': # This is the normal case, where there are 2 entries where # they hyp-words match up new_edits_array.append(edits_array[edits_pos]) edits_pos += 1 new_ctm_array.append(ctm_array[ctm_pos]) current_time = ctm_array[ctm_pos][0] + ctm_array[ctm_pos][1] ctm_pos += 1 elif edits_pos < edits_len and edits_array[edits_pos][0] == '<eps>': # There was a deletion. Pad with an empty ctm segment with '<eps>' as # the word. new_edits_array.append(edits_array[edits_pos]) edits_pos += 1 duration = 0.0 confidence = 1.0 new_ctm_array.append([ current_time, duration, '<eps>', confidence]) elif ctm_pos < ctm_len and ctm_array[ctm_pos][2] == '<eps>': # There was silence in the ctm, and either we're reached the end of the # edits sequence, or the hyp word was not '<eps>': new_edits_array.append(['<eps>', '<eps>']) new_ctm_array.append(ctm_array[ctm_pos]) current_time = ctm_array[ctm_pos][0] + ctm_array[ctm_pos][1] ctm_pos += 1 else: raise Exception("Could not align edits_array = {0} and ctm_array = {1}; " "edits-position = {2}, ctm-position = {3}, " "pending-edit={4}, pending-ctm-entry={5}".format( edits_array, ctm_array, edits_pos, ctm_pos, edits_array[edits_pos] if edits_pos < edits_len else None, ctm_array[ctm_pos] if ctm_pos < ctm_len else None)) assert len(new_edits_array) == len(new_ctm_array) return (new_edits_array, new_ctm_array) # This function returns the appropriate edit-type to output in the ctm-edits # file. The ref_word and hyp_word and duration are the values we'll print in # the ctm-edits file. def GetEditType(hyp_word, ref_word, duration): global oov_word if hyp_word == ref_word and hyp_word !='<eps>': return 'cor' elif hyp_word != '<eps>' and ref_word == '<eps>': return 'ins' elif hyp_word == '<eps>' and ref_word != '<eps>' and duration == 0.0: return 'del' elif hyp_word == oov_word and \ len(symbol_table) != 0 and not ref_word in symbol_table: return 'cor' # this special case is treated as correct. elif hyp_word == '<eps>' == ref_word and duration > 0.0: # silence in hypothesis; we don't match this up with any reference word. return 'sil' else: # The following assertion is because, based on how PadArrays # works, we shouldn't hit this case. assert hyp_word != '<eps>' and ref_word != '<eps>' return 'sub' # this prints a number with a certain number of digits after # the point, while removing trailing zeros. def FloatToString(f): num_digits = 6 # we want to print 6 digits after the zero g = f while abs(g) > 1.0: g *= 0.1 num_digits += 1 format_str = '%.{0}g'.format(num_digits) return format_str % f def OutputCtm(utterance_id, edits_array, ctm_array): global ctm_edits_out # note: this function expects the padded entries created by PadARrays. assert len(edits_array) == len(ctm_array) channel = '1' # this is hardcoded at both input and output, since this CTM # doesn't really represent recordings, only utterances. for i in range(len(edits_array)): ( hyp_word, ref_word ) = edits_array[i] ( start_time, duration, hyp_word2, confidence ) = ctm_array[i] if not hyp_word == hyp_word2: print("Error producing output CTM for edit = {0} and ctm = {1}".format( edits_array[i], ctm_array[i]), file = sys.stderr) sys.exit(1) assert hyp_word == hyp_word2 edit_type = GetEditType(hyp_word, ref_word, duration) print(utterance_id, channel, FloatToString(start_time), FloatToString(duration), hyp_word, confidence, ref_word, edit_type, file = ctm_edits_out) def ProcessOneUtterance(utterance_id, edits_line, ctm_lines): try: # Remove the utterance-id from the beginning of the edits line edits_fields = edits_line[len(utterance_id) + 1:] # e.g. if edits_fields is now 'i i ; see be ; my my ', edits_array will become # [ ['i', 'i'], ['see', 'be'], ['my', 'my'] ] fields_split = edits_fields.split() first_fields, second_fields = fields_split[0::3], fields_split[1::3] if ( len(first_fields) != len(second_fields) or (len(fields_split) >= 3 and set(fields_split[2::3]) != {';'}) ): sys.exit("get_ctm_edits.py: could not make sense of edits line: " + edits_line) edits_array = list(zip(first_fields, second_fields)) # ctm_array will now become something like [ ['1', '1.010', '0.240', 'little ' ], ... ] ctm_array = [ x.split() for x in ctm_lines ] ctm_array = [] for line in ctm_lines: try: # Strip off the utterance-id and split the remaining fields # which should be: channel==1, start, dur, word, [confidence] a = line[len(utterance_id) + 1:].split() if len(a) == 4: a.append(1.0) # confidence defaults to 1.0. [ channel, start, dur, word, confidence ] = a if channel != '1': raise Exception("Channel should be 1, got: " + channel) ctm_array.append([ float(start), float(dur), word, float(confidence) ]) except Exception as e: sys.exit("get_ctm_edits.py: error procesing ctm line {0} " "... exception is: {1} {2}".format(line, type(e), str(e))) # ctm_array will now be something like [ [ 1.010, 0.240, 'little ', 1.0 ], ... ] # The following call pads the edits and ctm arrays with appropriate # entries so that they have the same length and the elements 'match up'. (edits_array, ctm_array) = PadArrays(edits_array, ctm_array) except Exception as e: sys.exit("get_ctm_edits.py: error processing utterance {0}, error was: {1}".format( utterance_id, str(e))) OutputCtm(utterance_id, edits_array, ctm_array) def ProcessData(): num_utterances_processed = 0 pending_ctm_line = ctm_in.readline() while True: this_edits_line = edits_in.readline() if this_edits_line == '': if pending_ctm_line != '': sys.exit("get_ctm_edits.py: edits_in input {0} ended before " "ctm input was ended. We processed {1} " "utterances.".format(args.edits_in, num_utterances_processed)) break a = this_edits_line.split() if len(a) == 0: sys.exit("get_ctm_edits.py: edits_input {0} had an empty line".format( args.edits_in)) utterance_id = a[0] utterance_id_len = len(utterance_id) this_utterance_ctm_lines = [] while len(pending_ctm_line.strip()) > 0 and pending_ctm_line.split()[0] == utterance_id: this_utterance_ctm_lines.append(pending_ctm_line) pending_ctm_line = ctm_in.readline() ProcessOneUtterance(utterance_id, this_edits_line, this_utterance_ctm_lines) num_utterances_processed += 1 print("get_ctm_edits.py: processed {0} utterances".format( num_utterances_processed), file=sys.stderr) OpenFiles() ProcessData() |