analyze_phone_length_stats.py 12.8 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268


#!/usr/bin/env python


# Copyright 2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

from __future__ import print_function
import argparse
import sys, os
from collections import defaultdict


# Command-line interface: one optional cutoff plus the positional lang
# directory.  The parsed result is kept in the module-level 'args'.
parser = argparse.ArgumentParser(
    description="This script reads stats created in analyze_alignments.sh "
    "to print information about phone lengths in alignments.  It's principally "
    "useful in order to see whether there is a reasonable amount of silence "
    "at the beginning and ends of segments.  The normal output of this script "
    "is written to the standard output and is human readable (on crashes, "
    "we'll print an error to stderr.")

# Phones whose frequency falls below this percentage are left out of the
# per-phone report printed further down.
parser.add_argument(
    "--frequency-cutoff-percentage",
    type=float,
    default=0.5,
    help="Cutoff, expressed as a percentage "
    "(between 0 and 100), of frequency at which we print stats "
    "for a phone.")

parser.add_argument("lang", help="Language directory, e.g. data/lang.")

args = parser.parse_args()


# set up phone_int2text to map from phone to printed form.  It is read from
# <lang>/phones.txt, each line of which must consist of exactly two
# whitespace-separated fields: the printed form, then the integer phone-id.
phone_int2text = {}
try:
    # 'with' closes the file even if one of the lines fails to parse.
    with open(args.lang + "/phones.txt", "r") as f:
        for line in f:
            # a line without exactly two fields, or with a non-integer id,
            # raises ValueError and is reported below.
            word, number = line.split()
            phone_int2text[int(number)] = word
except Exception as e:
    # 'Exception' rather than a bare 'except', so that KeyboardInterrupt and
    # SystemExit are not reported as a problem with phones.txt.
    sys.exit("analyze_phone_length_stats.py: error opening or reading {0}/phones.txt: {1}".format(
            args.lang, str(e)))
# this is a special case... for begin- and end-of-sentence stats,
# we group all nonsilence phones together.
phone_int2text[0] = 'nonsilence'


# populate the set 'nonsilence', which will contain the integer phone-ids of
# nonsilence phones (and disambig phones, which won't matter).
# We start from every phone-id known to phone_int2text, drop the special
# id zero, and then drop each phone listed in silence.csl.
nonsilence = set(phone_int2text.keys())
nonsilence.remove(0)
try:
    # open lang/phones/silence.csl-- while there are many ways of obtaining the
    # silence/nonsilence phones, we read this because it's present in graph
    # directories as well as lang directories.
    filename = "{0}/phones/silence.csl".format(args.lang)
    # 'with' closes the file even if reading it fails.
    with open(filename, "r") as f:
        line = f.readline()
    # the line is a colon-separated list of integer phone-ids; int() tolerates
    # the trailing newline on the last field.
    for silence_phone in line.split(":"):
        # remove() raises KeyError for a phone-id that phones.txt did not
        # list, which is reported below.
        nonsilence.remove(int(silence_phone))
except Exception as e:
    sys.exit("analyze_phone_length_stats.py: error processing {0}/phones/silence.csl: {1}".format(
            args.lang, str(e)))


# phone_lengths is a three-level nested dict, indexed as
#   count == phone_lengths[boundary_type][phone][length]
# where boundary_type is one of 'begin', 'end', 'all'; phone is an integer
# phone-id; and length is an integer number of frames.  The innermost level
# is a defaultdict(int), so unseen lengths read as a count of zero.
# note: phone-id zero is the entry under which the stats-reading loop
# additionally accumulates all nonsilence phones.
phone_lengths = {
    boundary_type: {p: defaultdict(int) for p in phone_int2text}
    for boundary_type in ('begin', 'end', 'all')
}

# total_phones maps boundary_type to the total count [of phone occurrences].
total_phones = defaultdict(int)
# total_frames maps boundary_type to the total number of frames.
total_frames = defaultdict(int)

# Accumulate the stats read from the standard input.  Each line must consist
# of exactly four whitespace-separated fields:
#   <count> <boundary-type> <phone-id> <length-in-frames>
for line in sys.stdin:
    a = line.split()
    if len(a) != 4:
        sys.exit("analyze_phone_length_stats.py: reading stdin, could not interpret line: " + line)
    count, boundary_type, phone, length = a
    try:
        # convert each numeric field once; a non-integer field raises
        # ValueError, an unknown boundary-type or phone-id raises KeyError.
        num_occurrences = int(count)
        phone_id = int(phone)
        num_frames = int(length)
        total_phones[boundary_type] += num_occurrences
        total_frames[boundary_type] += num_occurrences * num_frames
        phone_lengths[boundary_type][phone_id][num_frames] += num_occurrences
        if phone_id in nonsilence:
            # nonsilence phones are additionally pooled under phone-id zero.
            nonsilence_phone = 0
            phone_lengths[boundary_type][nonsilence_phone][num_frames] += num_occurrences
    except Exception as e:
        sys.exit("analyze_phone_length_stats.py: unexpected phone {0} "
                 "seen (lang directory mismatch?): {1}".format(phone, str(e)))

# total_phones only gains an entry once a line has been read, so it is empty
# exactly when there was no input.  (phone_lengths cannot be used for this
# check: it always holds the three boundary-type keys.)
if len(total_phones) == 0:
    sys.exit("analyze_phone_length_stats.py: read no input")

# work out the optional-silence phone: read it from
# <lang>/phones/optional_silence.int if possible, otherwise guess it from
# the stats.
try:
    # 'with' closes the file even when its first line is not an integer.
    with open(args.lang + "/phones/optional_silence.int", "r") as f:
        optional_silence_phone = int(f.readline())
    optional_silence_phone_text = phone_int2text[optional_silence_phone]
except (IOError, OSError, ValueError, KeyError):
    # The file is missing or unreadable (IOError/OSError), does not start with
    # an integer (ValueError), or names a phone that is not in phones.txt
    # (KeyError).  Guess instead: among the silence phones (id > 0 and not in
    # 'nonsilence'), take the one that accounts for the most frames in the
    # 'all' stats, defaulting to phone 1 if none of them has any frames.
    largest_count = 0
    optional_silence_phone = 1
    for p in phone_int2text.keys():
        if p > 0 and p not in nonsilence:
            this_count = sum([ l * c for l,c in phone_lengths['all'][p].items() ])
            if this_count > largest_count:
                largest_count = this_count
                optional_silence_phone = p
    optional_silence_phone_text = phone_int2text[optional_silence_phone]
    print("analyze_phone_length_stats.py: could not get optional-silence phone from "
          "{0}/phones/optional_silence.int, guessing that it's {1} from the stats. ".format(
            args.lang, optional_silence_phone_text))
else:
    # the phone was read successfully; sanity-check it against the silence set.
    if optional_silence_phone in nonsilence:
        print("analyze_phone_length_stats.py: was expecting the optional-silence phone to "
              "be a member of the silence phones, it is not.  This script won't work correctly.")


# If length_to_count is a map from length-in-frames to count,
# return the length-in-frames that equals the (fraction * 100)'th
# percentile of the distribution.
#
# The lengths are visited in increasing order and the first length at which
# the cumulative count reaches int(fraction * total count) is returned.
# Returns 0 if the counts sum to zero (e.g. for an empty map).
# Raises AssertionError if a negative count is encountered, or if the cutoff
# is never reached (which happens when fraction is greater than 1).
def GetPercentile(length_to_count, fraction):
    num_occurrences = sum(length_to_count.values())
    if num_occurrences == 0:
        return 0
    count_cutoff = int(fraction * num_occurrences)
    cur_count_total = 0
    for length, count in sorted(length_to_count.items()):
        assert count >= 0
        cur_count_total += count
        if cur_count_total >= count_cutoff:
            return length
    # we shouldn't reach here for a fraction between 0 and 1.  Raise explicitly
    # so that the failure is an AssertionError even under 'python -O'.
    raise AssertionError("GetPercentile: cutoff {0} never reached (fraction = {1})".format(
            count_cutoff, fraction))

# If length_to_count is a map from length-in-frames to count, return the
# mean length in frames as a float, i.e. sum(length * count) / sum(count).
# Returns 0.0 if the counts sum to zero (e.g. for an empty map).
def GetMean(length_to_count):
    num_occurrences = sum(length_to_count.values())
    if num_occurrences != 0:
        frame_total = sum(float(length * count)
                          for length, count in length_to_count.items())
        return frame_total / num_occurrences
    return 0.0


# Check how often the optional-silence phone is seen at the beginning and end
# of utterances.
# This block will print warnings if silence is seen less than 80% of the time at utterance
# beginning and end.
for boundary_type in 'begin', 'end':
    phone_to_lengths = phone_lengths[boundary_type]
    num_utterances = total_phones[boundary_type]
    if num_utterances <= 0:
        # an explicit check rather than an 'assert': asserts are removed under
        # 'python -O', which would leave a ZeroDivisionError below.
        sys.exit("analyze_phone_length_stats.py: no stats with boundary type '{0}' "
                 "were read from the standard input.".format(boundary_type))
    opt_sil_lengths = phone_to_lengths[optional_silence_phone]
    frequency_percentage = sum(opt_sil_lengths.values()) * 100.0 / num_utterances
    # The reason for this warning is that the tradition in speech recognition is
    # to supply a little silence at the beginning and end of utterances... up to
    # maybe half a second.  If your database is not like this, you should know;
    # you may want to mess with the segmentation to add more silence.
    if frequency_percentage < 80.0:
        print("analyze_phone_length_stats.py: WARNING: optional-silence {0} is seen only {1}% "
              "of the time at utterance {2}.  This may not be optimal.".format(
                optional_silence_phone_text, frequency_percentage, boundary_type))


# this will control a sentence that we print..
boundary_to_text = {
    'begin': 'At utterance begin',
    'end': 'At utterance end',
    'all': 'Overall',
}

# the next block prints lines like (to give some examples):
# At utterance begin, SIL accounts for 98.4% of phone occurrences, with duration (median, mean, 95-percentile) is (57,59.9,113) frames.
# ...
# At utterance end, nonsilence accounts for 4.2% of phone occurrences, with duration (median, mean, 95-percentile) is (13,13.3,22) frames.
# ...
# Overall, R_I accounts for 3.2% of phone occurrences, with duration (median, mean, 95-percentile) is (6,6.9,12) frames.

for boundary_type in 'begin', 'end', 'all':
    phone_to_lengths = phone_lengths[boundary_type]
    tot_num_phones = total_phones[boundary_type]
    if tot_num_phones == 0:
        # without this check the percentage below would divide by zero.
        sys.exit("analyze_phone_length_stats.py: no stats with boundary type '{0}' "
                 "were read from the standard input.".format(boundary_type))
    text = boundary_to_text[boundary_type]  # e.g. 'At utterance begin'.
    # sort the phones in decreasing order of count.
    for phone, lengths in sorted(phone_to_lengths.items(), key=lambda x: -sum(x[1].values())):
        frequency_percentage = sum(lengths.values()) * 100.0 / tot_num_phones
        # phones rarer than the --frequency-cutoff-percentage are not printed.
        if frequency_percentage < args.frequency_cutoff_percentage:
            continue

        duration_median = GetPercentile(lengths, 0.5)
        duration_percentile_95 = GetPercentile(lengths, 0.95)
        duration_mean = GetMean(lengths)

        try:
            phone_text = phone_int2text[phone]
        except KeyError:
            # only a missing key means "phone not in phones.txt"; anything
            # else (e.g. KeyboardInterrupt) should propagate unchanged.
            sys.exit("analyze_phone_length_stats.py: phone {0} is not covered on phones.txt "
                     "(lang/alignment mismatch?)".format(phone))
        print("{text}, {phone_text} accounts for {percent}% of phone occurrences, with "
              "duration (median, mean, 95-percentile) is ({median},{mean},{percentile95}) frames.".format(
                text = text, phone_text = phone_text,
                percent = "%.1f" % frequency_percentage,
                median = duration_median, mean = "%.1f" % duration_mean,
                percentile95 = duration_percentile_95))


## Print stats on frequency and average length of utterance-internal optional-silences.
## The 'internal' totals are what is left of the 'all' totals once the
## utterance-initial ('begin') and utterance-final ('end') phones are removed.
for stats in (total_frames, total_phones):
    stats['internal'] = stats['all'] - stats['begin'] - stats['end']

# Per-length counts of the optional-silence phone for each boundary type.
opt_sil_all = phone_lengths['all'][optional_silence_phone]
opt_sil_begin = phone_lengths['begin'][optional_silence_phone]
opt_sil_end = phone_lengths['end'][optional_silence_phone]

# internal_opt_sil_phone_lengths is a dict from length to count, holding the
# overall count minus the begin and end counts for that length; lengths whose
# remaining count is zero are left out.
internal_opt_sil_phone_lengths = {}
for length, overall_count in opt_sil_all.items():
    internal_count = overall_count - (opt_sil_begin[length] + opt_sil_end[length])
    if internal_count != 0:
        internal_opt_sil_phone_lengths[length] = internal_count

if total_phones['internal'] != 0.0:
    # frame totals: sum over lengths of (length * count).
    total_internal_optsil_frames = sum(
        float(l * c) for l, c in internal_opt_sil_phone_lengths.items())
    total_optsil_frames = sum(float(l * c) for l, c in opt_sil_all.items())

    all_frames = total_frames['all']
    internal_frames = total_frames['internal']
    opt_sil_internal_frame_percent = total_internal_optsil_frames * 100.0 / internal_frames
    opt_sil_total_frame_percent = total_optsil_frames * 100.0 / all_frames
    internal_frame_percent = internal_frames * 100.0 / all_frames

    print("The optional-silence phone {0} occupies {1}% of frames overall ".format(
            optional_silence_phone_text, "%.1f" % opt_sil_total_frame_percent))

    # 360000 frames per hour == 100 frames per second * 3600 seconds.
    hours_total = all_frames / 360000.0
    hours_nonsil = (all_frames - total_optsil_frames) / 360000.0
    print("Limiting the stats to the {0}% of frames not covered by an utterance-[begin/end] phone, "
          "optional-silence {1} occupies {2}% of frames.".format(
            "%.1f" % internal_frame_percent,
            optional_silence_phone_text,
            "%.1f" % opt_sil_internal_frame_percent))
    print("Assuming 100 frames per second, the alignments represent {0} hours of data, "
          "or {1} hours if {2} frames are excluded.".format(
            "%.1f" % hours_total, "%.1f" % hours_nonsil, optional_silence_phone_text))

    num_internal_opt_sil = sum(internal_opt_sil_phone_lengths.values())
    opt_sil_internal_phone_percent = num_internal_opt_sil * 100.0 / total_phones['internal']
    duration_median = GetPercentile(internal_opt_sil_phone_lengths, 0.5)
    duration_mean = GetMean(internal_opt_sil_phone_lengths)
    duration_percentile_95 = GetPercentile(internal_opt_sil_phone_lengths, 0.95)
    print("Utterance-internal optional-silences {0} comprise {1}% of utterance-internal phones, with duration "
          "(median, mean, 95-percentile) = ({2},{3},{4})".format(
                optional_silence_phone_text, "%.1f" % opt_sil_internal_phone_percent,
                duration_median, "%0.1f" % duration_mean, duration_percentile_95))