Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/steps/segmentation/internal/prepare_sad_graph.py 6.35 KB
  #!/usr/bin/env python
  
  # Copyright 2016  Vimal Manohar
  # Apache 2.0
  
  """Prepares a graph with a simple HMM topology for segmentation
  with minimum and maximum speech duration constraints and minimum silence
  duration constraint. The graph is written to the 'output_graph', which
  can be file or "-" for stdout.
  """
  
  from __future__ import print_function
  import argparse
  import logging
  import math
  import os
  import sys
  import traceback
  
  sys.path.insert(0, 'steps')
  import libs.common as common_lib
  
  
  logger = logging.getLogger(__name__)
  logger.setLevel(logging.INFO)
  handler = logging.StreamHandler()
  handler.setLevel(logging.INFO)
  formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - "
                                "%(funcName)s - %(levelname)s ] %(message)s")
  handler.setFormatter(formatter)
  logger.addHandler(handler)
  
  
  def get_args():
      parser = argparse.ArgumentParser(
          description="""This script prepares a graph with a simple HMM topology
          for segmentation with minimum and maximum speech duration constraints
          and minimum silence duration constraint. The graph is written to the
          'output_graph', which can be file or "-" for stdout.  for segmentation
          with minimum and maximum speech duration constraints and minimum silence
          duration constraint.""",
          formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  
      parser.add_argument("--transition-scale", type=float, default=1.0,
                          help="""Scale on transition probabilities relative to
                          LM weights""")
      parser.add_argument("--loopscale", type=float, default=0.1,
                          help="""Scale on self-loop log-probabilities relative
                          to LM weights""")
  
      parser.add_argument("--min-silence-duration", type=float, default=0.03,
                          help="""Minimum duration for silence""")
      parser.add_argument("--min-speech-duration", type=float, default=0.3,
                          help="""Minimum duration for speech""")
      parser.add_argument("--max-speech-duration", type=float, default=10.0,
                          help="""Maximum duration for speech""")
      parser.add_argument("--frame-shift", type=float, default=0.03,
                          help="""Frame shift in seconds""")
  
      parser.add_argument("--edge-silence-probability", type=float,
                          default=0.5,
                          help="Probability of silence at the edges.")
      parser.add_argument("--transition-probability", type=float, default=0.1,
                          help="Transition probability for silence to speech "
                          "or vice-versa")
  
      parser.add_argument("output_graph", type=str,
                          help="Output graph")
      args = parser.parse_args()
  
      args.min_states_silence = int(args.min_silence_duration / args.frame_shift
                                    + 0.5)
      args.min_states_speech = int(args.min_speech_duration / args.frame_shift
                                   + 0.5)
      args.max_states_speech = int(args.max_speech_duration / args.frame_shift
                                   + 0.5)
  
      return args
  
  
  def print_states(args, file_handle):
      # Initial transition to silence
      print ("0 1 silence silence {0}".format(-math.log(args.edge_silence_probability)),
             file=file_handle)
      silence_start_state = 1
  
      # Silence min duration transitions
      # 1->2, 2->3 and so on until
      # (1 + min_states_silence - 2) -> (1 + min_states_silence - 1)  ...
      for state in range(silence_start_state,
                         silence_start_state + args.min_states_silence - 1):
          print ("{state} {next_state} silence silence {cost}".format(
                      state=state, next_state=state + 1, cost=0.0),
                 file=file_handle)
      silence_last_state = silence_start_state + args.min_states_silence - 1
  
      # Silence self-loop
      print ("{state} {state} silence silence {cost}".format(
                  state=silence_last_state, cost=0.0),
             file=file_handle)
  
      speech_start_state = silence_last_state + 1
      # Initial transition to speech
      print ("0 {state} speech speech {cost}".format(
                  state=speech_start_state,
                  cost=-math.log(1.0 - args.edge_silence_probability)),
             file=file_handle)
  
      # Silence to speech transition
      print ("{sil_state} {speech_state} speech speech {cost}".format(
                  sil_state=silence_last_state,
                  speech_state=speech_start_state,
                  cost=-math.log(args.transition_probability)),
             file=file_handle)
  
      # Speech min duration
      for state in range(speech_start_state,
                         speech_start_state + args.min_states_speech - 1):
          print ("{state} {next_state} speech speech {cost}".format(
                      state=state, next_state=state + 1, cost=0.0),
                 file=file_handle)
  
      # Speech max duration
      for state in range(speech_start_state + args.min_states_speech - 1,
                         speech_start_state + args.max_states_speech - 1):
          print ("{state} {next_state} speech speech {cost}".format(
                      state=state, next_state=state + 1, cost=0.0),
                 file=file_handle)
  
          print ("{state} {sil_state} silence silence {cost}".format(
                      state=state, sil_state=silence_start_state,
                      cost=-math.log(args.transition_probability)),
                 file=file_handle)
      speech_last_state = speech_start_state + args.max_states_speech - 1
  
      # Transition to silence after max duration of speech
      print ("{state} {sil_state} silence silence {cost}".format(
                  state=speech_last_state, sil_state=silence_start_state,
                  cost=0.0),
             file=file_handle)
  
      for state in range(1, speech_start_state):
          print ("{state} {cost}".format(
                      state=state, cost=-math.log(args.edge_silence_probability)),
                 file=file_handle)
  
      for state in range(speech_start_state, speech_last_state + 1):
          print ("{state} {cost}".format(
                      state=state,
                      cost=-math.log(1.0 - args.edge_silence_probability)),
                 file=file_handle)
  
  
  def main():
      try:
          args = get_args()
          with common_lib.smart_open(args.output_graph, 'w') as f:
              print_states(args, f)
      except Exception:
          raise
  
  
  if __name__ == '__main__':
      main()