analyze_phone_length_stats.py 12.8 KB
#!/usr/bin/env python


# Copyright 2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

from __future__ import print_function
import argparse
import sys, os
from collections import defaultdict


# Command-line interface: one optional cutoff controlling which phones get a
# line of stats printed, plus the positional lang directory.
parser = argparse.ArgumentParser(
    description=(
        "This script reads stats created in analyze_alignments.sh "
        "to print information about phone lengths in alignments.  It's principally "
        "useful in order to see whether there is a reasonable amount of silence "
        "at the beginning and ends of segments.  The normal output of this script "
        "is written to the standard output and is human readable (on crashes, "
        "we'll print an error to stderr."))
parser.add_argument(
    "--frequency-cutoff-percentage",
    type=float,
    default=0.5,
    help=("Cutoff, expressed as a percentage "
          "(between 0 and 100), of frequency at which we print stats "
          "for a phone."))
parser.add_argument("lang", help="Language directory, e.g. data/lang.")

# Parsed options; the rest of the script reads args.lang and
# args.frequency_cutoff_percentage.
args = parser.parse_args()


# Set up phone_int2text to map from integer phone-id to printed form, reading
# <lang>/phones.txt; each line must split into exactly two fields, the phone
# symbol followed by its integer id.
phone_int2text = {}
try:
    # 'with' guarantees the file is closed even if a line fails to parse.
    with open(args.lang + "/phones.txt", "r") as f:
        for line in f:
            word, number = line.split()
            phone_int2text[int(number)] = word
except Exception as e:
    # Catch Exception rather than using a bare 'except:' so that
    # KeyboardInterrupt/SystemExit are not swallowed, and report the cause.
    sys.exit("analyze_phone_length_stats.py: error opening or reading {0}/phones.txt: {1}".format(
            args.lang, str(e)))
# this is a special case... for begin- and end-of-sentence stats,
# we group all nonsilence phones together.
phone_int2text[0] = 'nonsilence'


# Populate the set 'nonsilence', which will contain the integer phone-ids of
# nonsilence phones (and disambig phones, which won't matter).  We start from
# every known phone-id except the special id 0, then remove the silence phones.
nonsilence = set(phone_int2text.keys())
nonsilence.remove(0)
try:
    # open lang/phones/silence.csl-- while there are many ways of obtaining the
    # silence/nonsilence phones, we read this because it's present in graph
    # directories as well as lang directories.
    filename = "{0}/phones/silence.csl".format(args.lang)
    # 'with' closes the file even when reading it fails.
    with open(filename, "r") as f:
        line = f.readline()
    # The first line is a colon-separated list of integer phone-ids.  A
    # non-integer field (ValueError) or an id that is not in phones.txt
    # (KeyError from set.remove) is reported below.
    for silence_phone in line.split(":"):
        nonsilence.remove(int(silence_phone))
except Exception as e:
    sys.exit("analyze_phone_length_stats.py: error processing {0}/phones/silence.csl: {1}".format(
            args.lang, str(e)))


# phone_lengths is a three-level table of counts:
#   count == phone_lengths[boundary_type][phone][length]
# where boundary_type is one of 'begin', 'end', 'all', phone is an integer
# phone-id and length is an integer number of frames.  Every phone-id known
# from phone_int2text (including the special id 0, under which nonsilence
# phones are grouped) gets its own length -> count map that starts out empty.
phone_lengths = {
    boundary_type: {p: defaultdict(int) for p in phone_int2text}
    for boundary_type in ('begin', 'end', 'all')
}

# Per-boundary-type totals, both keyed by boundary_type:
# total_phones holds the total count of phone occurrences,
# total_frames holds the total number of frames.
total_phones = defaultdict(int)
total_frames = defaultdict(int)

# Accumulate the stats from the standard input.  Each line must consist of
# exactly four whitespace-separated fields:
#   <count> <boundary-type> <phone-id> <length-in-frames>
# iter(readline, '') keeps reading line by line until EOF (an empty string).
for line in iter(sys.stdin.readline, ''):
    a = line.split()
    if len(a) != 4:
        sys.exit("analyze_phone_length_stats.py: reading stdin, could not interpret line: " + line)
    count, boundary_type, phone, length = a
    try:
        # Convert each field once; a non-integer field, an unknown
        # boundary-type or a phone-id missing from phones.txt all end up in
        # the handler below.
        n = int(count)
        num_frames = int(length)
        phone_id = int(phone)
        total_phones[boundary_type] += n
        total_frames[boundary_type] += n * num_frames
        phone_lengths[boundary_type][phone_id][num_frames] += n
        if phone_id in nonsilence:
            # Nonsilence phones are additionally pooled under phone-id 0.
            phone_lengths[boundary_type][0][num_frames] += n
    except Exception as e:
        sys.exit("analyze_phone_length_stats.py: unexpected phone {0} "
                 "seen (lang directory mismatch?): {1}".format(phone, str(e)))

# phone_lengths always has its three boundary-type keys, so it cannot be used
# to detect empty input; total_phones only gains a key once a line has been
# processed.
if not total_phones:
    sys.exit("analyze_phone_length_stats.py: read no input")

# Work out the optional-silence phone: read its integer id from
# <lang>/phones/optional_silence.int if possible, otherwise guess it from the
# stats.
try:
    # 'with' closes the file even if the first line is not an integer.
    with open(args.lang + "/phones/optional_silence.int", "r") as f:
        optional_silence_phone = int(f.readline())
    optional_silence_phone_text = phone_int2text[optional_silence_phone]
except Exception:
    # Best-effort fallback (file missing or unreadable, or the id is not in
    # phones.txt): pick the silence phone that covers the most frames in the
    # 'all' stats.  If no silence phone has any frames we stay with phone 1.
    largest_count = 0
    optional_silence_phone = 1
    for p in phone_int2text:
        if p > 0 and p not in nonsilence:
            this_count = sum(l * c for l, c in phone_lengths['all'][p].items())
            if this_count > largest_count:
                largest_count = this_count
                optional_silence_phone = p
    optional_silence_phone_text = phone_int2text[optional_silence_phone]
    print("analyze_phone_length_stats.py: could not get optional-silence phone from "
          "{0}/phones/optional_silence.int, guessing that it's {1} from the stats. ".format(
            args.lang, optional_silence_phone_text))
else:
    # Kept outside the 'try' so that a failure while printing is not mistaken
    # for a failure to read the file.
    if optional_silence_phone in nonsilence:
        print("analyze_phone_length_stats.py: was expecting the optional-silence phone to "
              "be a member of the silence phones, it is not.  This script won't work correctly.")



def GetPercentile(length_to_count, fraction):
    """Return the length at the (fraction * 100)'th percentile of a distribution.

    length_to_count is a map from length-in-frames to a count of occurrences;
    fraction is the percentile expressed as a fraction (e.g. 0.5 for the
    median).  The lengths are visited in increasing order and the first one
    at which the running count reaches int(fraction * total count) is
    returned.  Returns 0 if the total count is zero.

    Raises AssertionError if a count is negative, or if the cutoff is never
    reached (which can only happen if 'fraction' is large enough, e.g. > 1,
    for the cutoff to exceed the total count).
    """
    total_count = sum(length_to_count.values())
    if total_count == 0:
        return 0
    count_cutoff = int(fraction * total_count)
    cur_count_total = 0
    for length, count in sorted(length_to_count.items()):
        assert count >= 0
        cur_count_total += count
        if cur_count_total >= count_cutoff:
            return length
    # We shouldn't reach here.  Raise explicitly: the previous 'assert false'
    # referenced an undefined name and so produced a NameError instead.
    raise AssertionError("GetPercentile: cutoff {0} exceeds total count {1}".format(
        count_cutoff, total_count))

def GetMean(length_to_count):
    """Return the count-weighted mean length of a distribution, as a float.

    length_to_count is a map from length-in-frames to a count of occurrences.
    Returns 0.0 if the total count is zero.
    """
    num_occurrences = sum(length_to_count.values())
    if num_occurrences != 0:
        frame_total = sum(float(length * count)
                          for length, count in length_to_count.items())
        return frame_total / num_occurrences
    return 0.0


# Check how often the optional-silence phone is the phone seen at utterance
# begin and at utterance end, and print a warning for each boundary where that
# happens less than 80% of the time.  The total phone count for a boundary
# type is treated as the number of utterances.
for boundary_type in ('begin', 'end'):
    num_utterances = total_phones[boundary_type]
    assert num_utterances > 0
    opt_sil_count = sum(phone_lengths[boundary_type][optional_silence_phone].values())
    frequency_percentage = opt_sil_count * 100.0 / num_utterances
    if frequency_percentage >= 80.0:
        continue
    # The reason for this warning is that the tradition in speech recognition
    # is to supply a little silence at the beginning and end of utterances...
    # up to maybe half a second.  If your database is not like this, you
    # should know; you may want to mess with the segmentation to add more
    # silence.
    print("analyze_phone_length_stats.py: WARNING: optional-silence {0} is seen only {1}% "
          "of the time at utterance {2}.  This may not be optimal.".format(
            optional_silence_phone_text, frequency_percentage, boundary_type))



# Maps each boundary type to the phrase that starts its printed sentence.
boundary_to_text = {
    'begin': 'At utterance begin',
    'end': 'At utterance end',
    'all': 'Overall',
}

# the next block prints lines like (to give some examples):
# At utterance begin, SIL accounts for 98.4% of phone occurrences, with duration (median, mean, 95-percentile) is (57,59.9,113) frames.
# ...
# At utterance end, nonsilence accounts for 4.2% of phone occurrences, with duration (median, mean, 95-percentile) is (13,13.3,22) frames.
# ...
# Overall, R_I accounts for 3.2% of phone occurrences, with duration (median, mean, 95-percentile) is (6,6.9,12) frames.

for boundary_type in ('begin', 'end', 'all'):
    phone_to_lengths = phone_lengths[boundary_type]
    tot_num_phones = total_phones[boundary_type]
    # sort the phones in decreasing order of count.
    for phone, lengths in sorted(phone_to_lengths.items(),
                                 key=lambda x: -sum(x[1].values())):
        frequency_percentage = sum(lengths.values()) * 100.0 / tot_num_phones
        # Phones rarer than the --frequency-cutoff-percentage are not printed.
        if frequency_percentage < args.frequency_cutoff_percentage:
            continue

        duration_median = GetPercentile(lengths, 0.5)
        duration_percentile_95 = GetPercentile(lengths, 0.95)
        duration_mean = GetMean(lengths)

        text = boundary_to_text[boundary_type]  # e.g. 'At utterance begin'.
        try:
            phone_text = phone_int2text[phone]
        except KeyError:
            # Only a missing key means the phone is unknown; a bare 'except:'
            # here would also have trapped KeyboardInterrupt and the like.
            sys.exit("analyze_phone_length_stats.py: phone {0} is not covered on phones.txt "
                     "(lang/alignment mismatch?)".format(phone))
        print("{text}, {phone_text} accounts for {percent}% of phone occurrences, with "
              "duration (median, mean, 95-percentile) is ({median},{mean},{percentile95}) frames.".format(
                text = text, phone_text = phone_text,
                percent = "%.1f" % frequency_percentage,
                median = duration_median, mean = "%.1f" % duration_mean,
                percentile95 = duration_percentile_95))


## Print stats on frequency and average length of utterance-internal
## optional-silences.  The 'internal' figures are obtained by subtracting the
## 'begin' and 'end' stats from the 'all' stats, i.e. they exclude the phones
## counted at utterance begin and end.
for stat in (total_frames, total_phones):
    stat['internal'] = stat['all'] - stat['begin'] - stat['end']

all_opt_sil = phone_lengths['all'][optional_silence_phone]
begin_opt_sil = phone_lengths['begin'][optional_silence_phone]
end_opt_sil = phone_lengths['end'][optional_silence_phone]

# internal_opt_sil_phone_lengths is a dict from length to count for the
# optional-silence phone, holding only the lengths whose internal count
# (overall count minus begin and end counts) is nonzero.
internal_opt_sil_phone_lengths = {}
for length, overall_count in all_opt_sil.items():
    internal_count = overall_count - (begin_opt_sil[length] + end_opt_sil[length])
    if internal_count != 0:
        internal_opt_sil_phone_lengths[length] = internal_count

if total_phones['internal'] != 0.0:
    frames_all = total_frames['all']
    frames_internal = total_frames['internal']

    # Frame totals for the optional-silence phone: internal-only and overall.
    total_internal_optsil_frames = sum(
        float(l * c) for l, c in internal_opt_sil_phone_lengths.items())
    total_optsil_frames = sum(float(l * c) for l, c in all_opt_sil.items())

    opt_sil_internal_frame_percent = total_internal_optsil_frames * 100.0 / frames_internal
    opt_sil_total_frame_percent = total_optsil_frames * 100.0 / frames_all
    internal_frame_percent = frames_internal * 100.0 / frames_all

    # 360000 frames per hour, i.e. 100 frames per second.
    hours_total = frames_all / 360000.0
    hours_nonsil = (frames_all - total_optsil_frames) / 360000.0

    print("The optional-silence phone {0} occupies {1}% of frames overall ".format(
            optional_silence_phone_text, "%.1f" % opt_sil_total_frame_percent))
    print("Limiting the stats to the {0}% of frames not covered by an utterance-[begin/end] phone, "
          "optional-silence {1} occupies {2}% of frames.".format("%.1f" % internal_frame_percent,
                                                                 optional_silence_phone_text,
                                                                 "%.1f" % opt_sil_internal_frame_percent))
    print("Assuming 100 frames per second, the alignments represent {0} hours of data, "
          "or {1} hours if {2} frames are excluded.".format(
            "%.1f" % hours_total, "%.1f" % hours_nonsil, optional_silence_phone_text))

    # Share of utterance-internal phone occurrences that are optional-silence,
    # plus the duration statistics of those occurrences.
    internal_opt_sil_count = sum(internal_opt_sil_phone_lengths.values())
    opt_sil_internal_phone_percent = internal_opt_sil_count * 100.0 / total_phones['internal']
    duration_median = GetPercentile(internal_opt_sil_phone_lengths, 0.5)
    duration_mean = GetMean(internal_opt_sil_phone_lengths)
    duration_percentile_95 = GetPercentile(internal_opt_sil_phone_lengths, 0.95)
    print("Utterance-internal optional-silences {0} comprise {1}% of utterance-internal phones, with duration "
          "(median, mean, 95-percentile) = ({2},{3},{4})".format(
                optional_silence_phone_text, "%.1f" % opt_sil_internal_phone_percent,
                duration_median, "%0.1f" % duration_mean, duration_percentile_95))