prepare_sad_graph.py 6.35 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164


#!/usr/bin/env python

# Copyright 2016  Vimal Manohar
# Apache 2.0

"""Prepares a graph with a simple HMM topology for segmentation
with minimum and maximum speech duration constraints and minimum silence
duration constraint. The graph is written to 'output_graph', which
can be a file or "-" for stdout.
"""

from __future__ import print_function
import argparse
import logging
import math
import os
import sys
import traceback

sys.path.insert(0, 'steps')
import libs.common as common_lib


# Module-level logging: one stream handler at INFO whose records carry the
# timestamp, source file/line, function name and level.
formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")

handler = logging.StreamHandler()
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.INFO)


def get_args():
    """Parse the command line and derive the per-class state counts.

    Returns:
        argparse.Namespace: the parsed options plus three derived integer
        attributes, each a duration divided by ``frame_shift`` and rounded
        to the nearest whole number of frames: ``min_states_silence``,
        ``min_states_speech`` and ``max_states_speech``.

    Option values that cannot produce a usable graph (non-positive frame
    shift, probabilities whose logarithm is undefined, durations shorter
    than one frame, maximum speech shorter than minimum speech) are
    rejected through ``parser.error``, which prints the usage message and
    exits by raising SystemExit.
    """
    parser = argparse.ArgumentParser(
        description="""This script prepares a graph with a simple HMM topology
        for segmentation with minimum and maximum speech duration constraints
        and minimum silence duration constraint. The graph is written to the
        'output_graph', which can be a file or "-" for stdout.""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--transition-scale", type=float, default=1.0,
                        help="""Scale on transition probabilities relative to
                        LM weights""")
    parser.add_argument("--loopscale", type=float, default=0.1,
                        help="""Scale on self-loop log-probabilities relative
                        to LM weights""")

    parser.add_argument("--min-silence-duration", type=float, default=0.03,
                        help="""Minimum duration for silence""")
    parser.add_argument("--min-speech-duration", type=float, default=0.3,
                        help="""Minimum duration for speech""")
    parser.add_argument("--max-speech-duration", type=float, default=10.0,
                        help="""Maximum duration for speech""")
    parser.add_argument("--frame-shift", type=float, default=0.03,
                        help="""Frame shift in seconds""")

    parser.add_argument("--edge-silence-probability", type=float,
                        default=0.5,
                        help="Probability of silence at the edges.")
    parser.add_argument("--transition-probability", type=float, default=0.1,
                        help="Transition probability for silence to speech "
                        "or vice-versa")

    parser.add_argument("output_graph", type=str,
                        help="Output graph")
    args = parser.parse_args()

    # The frame shift is a divisor below; zero or negative values would
    # either raise ZeroDivisionError or yield negative state counts.
    if not args.frame_shift > 0.0:
        parser.error("--frame-shift must be positive; got {0}".format(
            args.frame_shift))

    # Both log(p) and log(1 - p) are taken when the graph is written, so the
    # end points 0 and 1 are excluded.
    if not 0.0 < args.edge_silence_probability < 1.0:
        parser.error("--edge-silence-probability must lie strictly between "
                     "0 and 1; got {0}".format(args.edge_silence_probability))

    # Only log(p) is taken for the transition probability, so 1 is allowed.
    if not 0.0 < args.transition_probability <= 1.0:
        parser.error("--transition-probability must be in (0, 1]; "
                     "got {0}".format(args.transition_probability))

    def duration_to_states(duration):
        # Round to the nearest whole number of frames.
        return int(duration / args.frame_shift + 0.5)

    args.min_states_silence = duration_to_states(args.min_silence_duration)
    args.min_states_speech = duration_to_states(args.min_speech_duration)
    args.max_states_speech = duration_to_states(args.max_speech_duration)

    # The graph writer numbers states assuming each class owns at least one
    # state and that the speech chain is at least as long as its minimum.
    if args.min_states_silence < 1:
        parser.error("--min-silence-duration must cover at least one frame "
                     "of --frame-shift")
    if args.min_states_speech < 1:
        parser.error("--min-speech-duration must cover at least one frame "
                     "of --frame-shift")
    if args.max_states_speech < args.min_states_speech:
        parser.error("--max-speech-duration must not be shorter than "
                     "--min-speech-duration")

    return args


def print_states(args, file_handle):
    """Write the segmentation graph to ``file_handle`` as text lines.

    Arc lines have five fields: source state, destination state, two labels
    (always the same word, "silence" or "speech") and a cost. They are
    followed by two-field "state cost" lines for every state from 1 up to the
    last speech state (these look like OpenFst-style final-state entries --
    confirm against the consumer of this file).

    State 0 is the start. Silence states are numbered from 1, followed
    directly by the speech states.

    Args:
        args: object providing ``edge_silence_probability``,
            ``transition_probability``, ``min_states_silence``,
            ``min_states_speech`` and ``max_states_speech``.
        file_handle: writable text stream receiving the lines.
    """
    def emit(template, *positional, **named):
        # One newline-terminated line per call.
        print(template.format(*positional, **named), file=file_handle)

    free = 0.0  # cost on arcs that carry no penalty

    # Start -> first silence state.
    edge_silence_cost = -math.log(args.edge_silence_probability)
    emit("0 1 silence silence {0}", edge_silence_cost)

    silence_start_state = 1
    silence_last_state = silence_start_state + args.min_states_silence - 1

    # Forced chain through the silence states; the chain length enforces the
    # minimum silence duration.
    for src in range(silence_start_state, silence_last_state):
        emit("{state} {next_state} silence silence {cost}",
             state=src, next_state=src + 1, cost=free)

    # Silence may continue indefinitely once the minimum has been served.
    emit("{state} {state} silence silence {cost}",
         state=silence_last_state, cost=free)

    speech_start_state = silence_last_state + 1

    # Start -> first speech state.
    edge_speech_cost = -math.log(1.0 - args.edge_silence_probability)
    emit("0 {state} speech speech {cost}",
         state=speech_start_state, cost=edge_speech_cost)

    # Last silence state -> first speech state.
    switch_cost = -math.log(args.transition_probability)
    emit("{sil_state} {speech_state} speech speech {cost}",
         sil_state=silence_last_state,
         speech_state=speech_start_state,
         cost=switch_cost)

    # Forced chain covering the minimum speech duration (no exit to silence).
    min_speech_reached = speech_start_state + args.min_states_speech - 1
    for src in range(speech_start_state, min_speech_reached):
        emit("{state} {next_state} speech speech {cost}",
             state=src, next_state=src + 1, cost=free)

    # Between the minimum and the maximum, each speech state may either
    # advance or leave for the first silence state.
    speech_last_state = speech_start_state + args.max_states_speech - 1
    for src in range(min_speech_reached, speech_last_state):
        emit("{state} {next_state} speech speech {cost}",
             state=src, next_state=src + 1, cost=free)
        emit("{state} {sil_state} silence silence {cost}",
             state=src, sil_state=silence_start_state, cost=switch_cost)

    # The last speech state has no forward arc: silence is the only way out.
    emit("{state} {sil_state} silence silence {cost}",
         state=speech_last_state, sil_state=silence_start_state, cost=free)

    # Two-field lines: silence states first, then speech states.
    for final_state in range(1, speech_start_state):
        emit("{state} {cost}", state=final_state, cost=edge_silence_cost)

    for final_state in range(speech_start_state, speech_last_state + 1):
        emit("{state} {cost}", state=final_state, cost=edge_speech_cost)


def main():
    """Parse the command line and write the graph to the requested output.

    The destination named by ``output_graph`` is opened through
    ``common_lib.smart_open`` in write mode. Any exception raised while
    parsing or writing propagates to the caller unchanged.
    """
    args = get_args()
    with common_lib.smart_open(args.output_graph, 'w') as graph_file:
        print_states(args, graph_file)


# Run the tool only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()