Blame view
egs/wsj/s5/steps/diagnostic/analyze_phone_length_stats.py
12.8 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 |
#!/usr/bin/env python # Copyright 2016 Johns Hopkins University (author: Daniel Povey) # Apache 2.0. from __future__ import print_function import argparse import sys, os from collections import defaultdict parser = argparse.ArgumentParser(description="This script reads stats created in analyze_alignments.sh " "to print information about phone lengths in alignments. It's principally " "useful in order to see whether there is a reasonable amount of silence " "at the beginning and ends of segments. The normal output of this script " "is written to the standard output and is human readable (on crashes, " "we'll print an error to stderr.") parser.add_argument("--frequency-cutoff-percentage", type = float, default = 0.5, help="Cutoff, expressed as a percentage " "(between 0 and 100), of frequency at which we print stats " "for a phone.") parser.add_argument("lang", help="Language directory, e.g. data/lang.") args = parser.parse_args() # set up phone_int2text to map from phone to printed form. phone_int2text = {} try: f = open(args.lang + "/phones.txt", "r"); for line in f.readlines(): [ word, number] = line.split() phone_int2text[int(number)] = word f.close() except: sys.exit("analyze_phone_length_stats.py: error opening or reading {0}/phones.txt".format( args.lang)) # this is a special case... for begin- and end-of-sentence stats, # we group all nonsilence phones together. phone_int2text[0] = 'nonsilence' # populate the set 'nonsilence', which will contain the integer phone-ids of # nonsilence phones (and disambig phones, which won't matter). nonsilence = set(phone_int2text.keys()) nonsilence.remove(0) try: # open lang/phones/silence.csl-- while there are many ways of obtaining the # silence/nonsilence phones, we read this because it's present in graph # directories as well as lang directories. 
filename = "{0}/phones/silence.csl".format(args.lang) f = open(filename, "r") line = f.readline() f.close() for silence_phone in line.split(":"): nonsilence.remove(int(silence_phone)) except Exception as e: sys.exit("analyze_phone_length_stats.py: error processing {0}/phones/silence.csl: {1}".format( args.lang, str(e))) # phone_length is a dict of dicts of dicts; # phone_lengths[boundary_type] for boundary_type in [ 'begin', 'end', 'all' ] is # a dict indexed by phone, containing dicts from length to a count of occurrences. # Phones are ints and lengths are integers representing numbers of frames. # So: count == phone_lengths[boundary_type][phone][length]. # note: for the 'begin' and 'end' boundary-types, we group all nonsilence phones # into phone-id zero. phone_lengths = dict() for boundary_type in [ 'begin', 'end', 'all' ]: phone_lengths[boundary_type] = dict() for p in phone_int2text.keys(): phone_lengths[boundary_type][p] = defaultdict(int) # total_phones is a dict from boundary_type to total count [of phone occurrences] total_phones = defaultdict(int) # total_frames is a dict from boundary_type to total number of frames. total_frames = defaultdict(int) # total_frames is a dict from num-frames to count of num-utterances with that # num-frames. 
# Read the stats from the standard input.  Each line must have exactly four
# whitespace-separated fields:
#   <count> <boundary-type> <phone-int> <length-in-frames>
for line in sys.stdin:
    a = line.split()
    if len(a) != 4:
        sys.exit("analyze_phone_length_stats.py: reading stdin, could not interpret line: " + line)
    count, boundary_type, phone, length = a
    try:
        # convert each field once instead of at every use.
        num_occurrences = int(count)
        num_frames = int(length)
        phone_id = int(phone)
        total_phones[boundary_type] += num_occurrences
        total_frames[boundary_type] += num_occurrences * num_frames
        # a phone (or boundary type) that was not set up from the lang
        # directory raises KeyError here.
        phone_lengths[boundary_type][phone_id][num_frames] += num_occurrences
        if phone_id in nonsilence:
            # nonsilence phones are additionally pooled under phone-id zero.
            nonsilence_phone = 0
            phone_lengths[boundary_type][nonsilence_phone][num_frames] += num_occurrences
    except (KeyError, ValueError) as e:
        sys.exit("analyze_phone_length_stats.py: unexpected phone {0} "
                 "seen (lang directory mismatch?): {1}".format(phone, str(e)))

# total_phones only gains a key when a line was read.  (phone_lengths always
# has the three boundary-type keys, so testing its length can never detect
# empty input.)
if not total_phones:
    sys.exit("analyze_phone_length_stats.py: read no input")

# work out the optional-silence phone
try:
    with open(args.lang + "/phones/optional_silence.int", "r") as f:
        optional_silence_phone = int(f.readline())
    optional_silence_phone_text = phone_int2text[optional_silence_phone]
    if optional_silence_phone in nonsilence:
        print("analyze_phone_length_stats.py: was expecting the optional-silence phone to "
              "be a member of the silence phones, it is not.  This script won't work correctly.")
except Exception:
    # The file was missing or unusable: fall back to the silence phone that
    # accounts for the most frames in the 'all' stats (phone 1 if none do).
    largest_count = 0
    optional_silence_phone = 1
    for p in phone_int2text.keys():
        if p > 0 and p not in nonsilence:
            this_count = sum(l * c for l, c in phone_lengths['all'][p].items())
            if this_count > largest_count:
                largest_count = this_count
                optional_silence_phone = p
    optional_silence_phone_text = phone_int2text[optional_silence_phone]
    print("analyze_phone_length_stats.py: could not get optional-silence phone from "
          "{0}/phones/optional_silence.int, guessing that it's {1} from the stats. ".format(
              args.lang, optional_silence_phone_text))


# If length_to_count is a map from length-in-frames to count,
# return the length-in-frames that equals the (fraction * 100)'th
# percentile of the distribution.
def GetPercentile(length_to_count, fraction):
    """Return the length at the (fraction * 100)'th percentile.

    length_to_count maps length-in-frames to a count of occurrences.
    The lengths are visited in increasing order and the first length at
    which the running count reaches int(fraction * total count) is returned.
    Returns 0 if the total count is zero.
    """
    total_count = sum(length_to_count.values())
    if total_count == 0:
        return 0
    count_cutoff = int(fraction * total_count)
    cur_count_total = 0
    for length, count in sorted(length_to_count.items()):
        assert count >= 0
        cur_count_total += count
        if cur_count_total >= count_cutoff:
            return length
    # we shouldn't reach here (it would need fraction > 1).  Raise explicitly:
    # 'assert false' is a NameError in Python.
    raise AssertionError("GetPercentile: running count never reached the cutoff")


def GetMean(length_to_count):
    """Return the count-weighted mean length as a float.

    length_to_count maps length-in-frames to a count of occurrences.
    Returns 0.0 if the total count is zero.
    """
    total_count = sum(length_to_count.values())
    if total_count == 0:
        return 0.0
    weighted_frames = sum(float(l * c) for l, c in length_to_count.items())
    return weighted_frames / total_count


# Work out how often the optional-silence phone is seen at the beginning and
# end of utterances, and print a warning if that is less than 80% of the time.
for boundary_type in 'begin', 'end':
    phone_to_lengths = phone_lengths[boundary_type]
    num_utterances = total_phones[boundary_type]
    # this is validation of the input, so don't rely on 'assert' (which is
    # stripped under python -O and would leave a division by zero below).
    if num_utterances <= 0:
        sys.exit("analyze_phone_length_stats.py: no stats were seen for "
                 "utterance {0}".format(boundary_type))

    opt_sil_lengths = phone_to_lengths[optional_silence_phone]
    frequency_percentage = sum(opt_sil_lengths.values()) * 100.0 / num_utterances
    # The reason for this warning is that the tradition in speech recognition is
    # to supply a little silence at the beginning and end of utterances... up to
    # maybe half a second.  If your database is not like this, you should know;
    # you may want to mess with the segmentation to add more silence.
    if frequency_percentage < 80.0:
        print("analyze_phone_length_stats.py: WARNING: optional-silence {0} is seen only {1}% "
              "of the time at utterance {2}.  This may not be optimal.".format(
                  optional_silence_phone_text, frequency_percentage, boundary_type))


# this will control a sentence that we print..
# Start of each printed sentence, keyed by boundary type.
boundary_to_text = {
    'begin': 'At utterance begin',
    'end': 'At utterance end',
    'all': 'Overall',
}

# the next block prints lines like (to give some examples):
# At utterance begin, SIL accounts for 98.4% of phone occurrences, with duration (median, mean, 95-percentile) is (57,59.9,113) frames.
# ...
# At utterance end, nonsilence accounts for 4.2% of phone occurrences, with duration (median, mean, 95-percentile) is (13,13.3,22) frames.
# ...
# Overall, R_I accounts for 3.2% of phone occurrences, with duration (median, mean, 95-percentile) is (6,6.9,12) frames.

for boundary_type in 'begin', 'end', 'all':
    phone_to_lengths = phone_lengths[boundary_type]
    tot_num_phones = total_phones[boundary_type]
    text = boundary_to_text[boundary_type]  # e.g. 'At utterance begin'.
    # number of occurrences of each phone; computed once and used both for
    # the sort order and for the percentage.
    phone_counts = dict((phone, sum(lengths.values()))
                        for phone, lengths in phone_to_lengths.items())
    # sort the phones in decreasing order of count.
    for phone in sorted(phone_to_lengths, key = lambda p : -phone_counts[p]):
        lengths = phone_to_lengths[phone]
        frequency_percentage = phone_counts[phone] * 100.0 / tot_num_phones
        # phones rarer than the --frequency-cutoff-percentage are not printed.
        if frequency_percentage < args.frequency_cutoff_percentage:
            continue

        duration_median = GetPercentile(lengths, 0.5)
        duration_percentile_95 = GetPercentile(lengths, 0.95)
        duration_mean = GetMean(lengths)

        try:
            phone_text = phone_int2text[phone]
        except KeyError:
            sys.exit("analyze_phone_length_stats.py: phone {0} is not covered on phones.txt "
                     "(lang/alignment mismatch?)".format(phone))
        print("{text}, {phone_text} accounts for {percent}% of phone occurrences, with "
              "duration (median, mean, 95-percentile) is ({median},{mean},{percentile95}) frames.".format(
                  text = text, phone_text = phone_text,
                  percent = "%.1f" % frequency_percentage,
                  median = duration_median, mean = "%.1f" % duration_mean,
                  percentile95 = duration_percentile_95))


## Print stats on frequency and average length of word-internal optional-silences.
## For optional-silence only, subtract the begin and end-utterance stats from the 'all'
## stats, to get the stats excluding initial and final phones.
# Utterance-internal totals: whatever is left of the 'all' stats once the
# utterance-initial and utterance-final phones have been taken out.
total_frames['internal'] = total_frames['all'] - total_frames['begin'] - total_frames['end']
total_phones['internal'] = total_phones['all'] - total_phones['begin'] - total_phones['end']

# internal_opt_sil_phone_lengths is a dict from length to count, for the
# optional-silence phone only.  It is built by subtracting the begin and end
# counts from the overall counts; lengths whose count comes out as zero are
# left out.
opt_sil_all_counts = phone_lengths['all'][optional_silence_phone]
opt_sil_begin_counts = phone_lengths['begin'][optional_silence_phone]
opt_sil_end_counts = phone_lengths['end'][optional_silence_phone]
internal_opt_sil_phone_lengths = dict()
for length, overall_count in opt_sil_all_counts.items():
    internal_count = overall_count - (opt_sil_begin_counts[length] +
                                      opt_sil_end_counts[length])
    if internal_count != 0:
        internal_opt_sil_phone_lengths[length] = internal_count

if total_phones['internal'] != 0:
    total_internal_optsil_frames = sum(
        float(l * c) for l, c in internal_opt_sil_phone_lengths.items())
    total_optsil_frames = sum(
        float(l * c) for l, c in opt_sil_all_counts.items())

    # percentages of frames, each formatted to one decimal place.
    opt_sil_total_frame_percent = total_optsil_frames * 100.0 / total_frames['all']
    opt_sil_internal_frame_percent = (total_internal_optsil_frames * 100.0 /
                                      total_frames['internal'])
    internal_frame_percent = total_frames['internal'] * 100.0 / total_frames['all']
    overall_percent_text = "%.1f" % opt_sil_total_frame_percent
    internal_percent_text = "%.1f" % opt_sil_internal_frame_percent
    coverage_percent_text = "%.1f" % internal_frame_percent

    print("The optional-silence phone {0} occupies {1}% of frames overall ".format(
        optional_silence_phone_text, overall_percent_text))

    # 360000 frames per hour, i.e. 100 frames per second.
    hours_total = total_frames['all'] / 360000.0
    hours_nonsil = (total_frames['all'] - total_optsil_frames) / 360000.0

    print("Limiting the stats to the {0}% of frames not covered by an utterance-[begin/end] phone, "
          "optional-silence {1} occupies {2}% of frames.".format(
              coverage_percent_text,
              optional_silence_phone_text,
              internal_percent_text))
    print("Assuming 100 frames per second, the alignments represent {0} hours of data, "
          "or {1} hours if {2} frames are excluded.".format(
              "%.1f" % hours_total, "%.1f" % hours_nonsil,
              optional_silence_phone_text))

    # share of utterance-internal phone occurrences that are optional-silence,
    # plus the duration statistics of those occurrences.
    num_internal_opt_sil = sum(internal_opt_sil_phone_lengths.values())
    opt_sil_internal_phone_percent = num_internal_opt_sil * 100.0 / total_phones['internal']
    duration_median = GetPercentile(internal_opt_sil_phone_lengths, 0.5)
    duration_mean = GetMean(internal_opt_sil_phone_lengths)
    duration_percentile_95 = GetPercentile(internal_opt_sil_phone_lengths, 0.95)
    print("Utterance-internal optional-silences {0} comprise {1}% of utterance-internal phones, with duration "
          "(median, mean, 95-percentile) = ({2},{3},{4})".format(
              optional_silence_phone_text,
              "%.1f" % opt_sil_internal_phone_percent,
              duration_median,
              "%0.1f" % duration_mean,
              duration_percentile_95))