Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/steps/segmentation/evaluate_segmentation.pl 7.9 KB
  #!/usr/bin/env perl
  
  # Copyright 2014  Johns Hopkins University (Author: Sanjeev Khudanpur), Vimal Manohar 
  # Apache 2.0
  
  ################################################################################
  #
  # This script was written to check the goodness of automatic segmentation tools.
  # It assumes input in the form of two Kaldi segments files, i.e. files each of
  # whose lines contains four whitespace-separated (space or tab) values:
  #
  #    UtteranceID  FileID  StartTime EndTime
  #
  # It reports, per file and in total, the duration of actual (reference) speech,
  # of hypothesized speech, of their overlap, and of false alarms.
  #
  ################################################################################
  
  # Parse the command line.  Exactly two arguments are required: the reference
  # segments file followed by the hypothesized (proposed) segments file.
  if (@ARGV == 2) {
      ($ReferenceSegmentation, $HypothesizedSegmentation) = @ARGV;
      printf STDERR ("Comparing reference segmentation\n\t%s\nwith proposed segmentation\n\t%s\n",
                     $ReferenceSegmentation,
                     $HypothesizedSegmentation);
  } else {
      # Use print rather than printf here: $0 is interpolated into these
      # strings and must not be interpreted as part of a printf format.
      print STDERR "This program compares the reference segmentation with the proposed segmentation\n";
      print STDERR "Usage: $0 reference_segments_filename proposed_segments_filename\n";
      print STDERR "e.g. $0 data/dev10h/segments data/dev10h.seg/segments\n";
      # A wrong invocation is a failure, so signal it through the exit status.
      exit (1);
  }
  
  ################################################################################
  # First read the reference segmentation, and
  # store the start- and end-times of all segments in each file.
  ################################################################################
  
  # Read the reference segments sorted by FileID, then numerically by start
  # time and end time, so that all segments of one file are contiguous and in
  # start-time order.  For every FileID this records:
  #   %firstSeg / %lastSeg         index range of its segments in @refSeg*
  #   %actualSpeech                summed duration of its reference segments
  #   %minStartTime / %maxEndTime  earliest start and latest end seen
  # and zeroes the per-file counters that the scoring pass accumulates into.
  #
  # The list form of open runs sort directly, without a shell, so characters
  # in the file name cannot be interpreted as shell syntax.
  open (SEGMENTS, "-|", "sort", "-k2,2", "-k3n,3", "-k4n,4", "--", $ReferenceSegmentation)
      || die "Unable to open $ReferenceSegmentation";
  $numLines = 0;
  while ($line=<SEGMENTS>) {
      chomp $line;
      @field = split("[ \t]+", $line);
      unless (@field == 4) {
          # A malformed line is fatal; report it before exiting instead of
          # exiting silently.
          print STDERR "Unparseable line in file $ReferenceSegmentation\n\t$line\n";
          exit (1);
      }
      my ($uttID, $fileID, $start, $end) = @field;
      unless (exists $firstSeg{$fileID}) {
          $firstSeg{$fileID} = $numLines;
          $actualSpeech{$fileID} = 0.0;
          $hypothesizedSpeech{$fileID} = 0.0;
          $foundSpeech{$fileID} = 0.0;
          $falseAlarm{$fileID} = 0.0;
          # Seed the extremes from the first segment of the file.  Seeding the
          # minimum with 0.0 would keep it at 0.0 for any non-negative start.
          $minStartTime{$fileID} = $start;
          $maxEndTime{$fileID} = $end;
      }
      $refSegName[$numLines] = $uttID;
      $refSegStart[$numLines] = $start;
      $refSegEnd[$numLines] = $end;
      $actualSpeech{$fileID} += ($end-$start);
      $minStartTime{$fileID} = $start if ($minStartTime{$fileID}>$start);
      $maxEndTime{$fileID} = $end if ($maxEndTime{$fileID}<$end);
      $lastSeg{$fileID} = $numLines;
      ++$numLines;
  }
  # Closing a pipe reports the exit status of sort, which is how a missing or
  # unreadable input file is detected.
  close(SEGMENTS)
      || die "Failed to sort $ReferenceSegmentation (exit status " . ($? >> 8) . ")";
  print STDERR "Read $numLines segments from $ReferenceSegmentation\n";
  
  ################################################################################
  # Process hypothesized segments sequentially, and gather speech/nonspeech stats
  ################################################################################
  
  # Kaldi segments files are sorted by UtteranceID; re-sort by FileID (then
  # UtteranceID) so that all segments of a file are read together.  Each
  # hypothesized segment is scored on its own against the reference segments
  # of its file, splitting its duration into found speech and false alarm.
  #
  # The list form of open runs sort directly, without a shell, so characters
  # in the file name cannot be interpreted as shell syntax.
  open (SEGMENTS, "-|", "sort", "-k2,2", "-k1,1", "--", $HypothesizedSegmentation)
      || die "Unable to open $HypothesizedSegmentation";
  $numLines = 0;
  $totalHypSpeech = 0.0;
  $totalFoundSpeech = 0.0;
  $totalFalseAlarm = 0.0;
  $numShortSegs = 0;
  $numLongSegs = 0;
  while ($line=<SEGMENTS>) {
      chomp $line;
      @field = split("[ \t]+", $line);
      unless (@field == 4) {
          # A malformed line is fatal; report it before exiting instead of
          # exiting silently.
          print STDERR "Unparseable line in file $HypothesizedSegmentation\n\t$line\n";
          exit (1);
      }
      my (undef, $fileID, $segStart, $segEnd) = @field;
      unless (exists $firstSeg{$fileID}) {
          # This FileID does not exist in the reference segmentation, which is
          # treated as fatal; say so before exiting.
          print STDERR "Unexpected fileID in hypothesized segments: $fileID\n";
          exit (1);
      }

      # This FileID exists in the reference segmentation,
      # so gather statistics for it.
      my $segDuration = $segEnd - $segStart;
      $hypothesizedSpeech{$fileID} += $segDuration;
      $totalHypSpeech += $segDuration;

      if (($segStart>=$maxEndTime{$fileID}) || ($segEnd<=$minStartTime{$fileID})) {
          # This entire segment is a false alarm
          $falseAlarm{$fileID} += $segDuration;
          $totalFalseAlarm += $segDuration;
      } else {
          # This segment may overlap one or more reference segments.  Both
          # scans are bounded to this file's own index range so they can
          # never wander into another file's segments.
          my ($first, $last) = ($firstSeg{$fileID}, $lastSeg{$fileID});
          my $p = $first;
          while (($p<=$last) && ($refSegEnd[$p]<=$segStart)) {
              ++$p;
          }
          # The overlap, if any, begins at the reference segment p
          my $q = $last;
          while (($q>=$first) && ($refSegStart[$q]>=$segEnd)) {
              --$q;
          }
          # The overlap, if any, ends at the reference segment q
          if ($q<$p) {
              # This segment sits entirely in the nonspeech region
              # between the two reference speech segments q and p
              $falseAlarm{$fileID} += $segDuration;
              $totalFalseAlarm += $segDuration;
          } else {
              if ($segDuration<0.20) {
                  # For diagnosing Pascal's VAD segmentation
                  print STDOUT "Found short speech region $line\n";
                  ++$numShortSegs;
              } elsif ($segDuration>60.0) {
                  ++$numLongSegs;
                  # For diagnosing Pascal's VAD segmentation
                  print STDOUT "Found long speech region $line\n";
              }
              # There is some overlap with segments p through q.  $cursor
              # marks how much of the hypothesized segment is accounted for.
              my $cursor = $segStart;
              for my $s ($p .. $q) {
                  if ($cursor<$refSegStart[$s]) {
                      # There is a false alarm portion before s
                      $falseAlarm{$fileID} += ($refSegStart[$s]-$cursor);
                      $totalFalseAlarm += ($refSegStart[$s]-$cursor);
                      $cursor = $refSegStart[$s];
                  }
                  my $overlapEnd = ($refSegEnd[$s]<$segEnd) ? $refSegEnd[$s] : $segEnd;
                  # Only count time not already accounted for; this keeps the
                  # contribution non-negative when reference segments overlap
                  # one another.
                  if ($overlapEnd>$cursor) {
                      $foundSpeech{$fileID} += ($overlapEnd-$cursor);
                      $totalFoundSpeech += ($overlapEnd-$cursor);
                      $cursor = $overlapEnd;
                  }
              }
              if ($segEnd>$cursor) {
                  # There is a trailing false alarm portion after q
                  $falseAlarm{$fileID} += ($segEnd-$cursor);
                  $totalFalseAlarm += ($segEnd-$cursor);
              }
          }
      }
      ++$numLines;
  }
  # Closing a pipe reports the exit status of sort, which is how a missing or
  # unreadable input file is detected.
  close(SEGMENTS)
      || die "Failed to sort $HypothesizedSegmentation (exit status " . ($? >> 8) . ")";
  print STDERR "Read $numLines segments from $HypothesizedSegmentation\n";
  
  ################################################################################
  # Now that all hypothesized segments have been processed, compute needed stats
  ################################################################################
  
  # Accumulate corpus-wide totals and print one summary line per FileID.
  $totalActualSpeech = 0.0;
  $totalNonSpeechEst = 0.0; # This is just a crude estimate of total nonspeech.
  foreach $fileID (sort keys %actualSpeech) {
      # Nonspeech is estimated as everything up to the last reference end time
      # that is not reference speech.
      my $nonSpeechEst = $maxEndTime{$fileID} - $actualSpeech{$fileID};
      $totalActualSpeech += $actualSpeech{$fileID};
      $totalNonSpeechEst += $nonSpeechEst;
      #######################################################################
      # Print file-wise statistics to STDOUT; can pipe to /dev/null if needed
      #######################################################################
      # A literal percent sign in a printf format is written %%.  The 0.01
      # added to each denominator guards against division by zero.
      printf STDOUT ("%s: %.2f min actual speech, %.2f min hypothesized: %.2f min overlap (%d%%), %.2f min false alarm (~%d%%)\n",
                     $fileID,
                     ($actualSpeech{$fileID}/60.0),
                     ($hypothesizedSpeech{$fileID}/60.0),
                     ($foundSpeech{$fileID}/60.0),
                     ($foundSpeech{$fileID}*100/($actualSpeech{$fileID}+0.01)),
                     ($falseAlarm{$fileID}/60.0),
                     ($falseAlarm{$fileID}*100/($nonSpeechEst+0.01)));
  }
  
  ################################################################################
  # Finally, we have everything needed to report the segmentation statistics.
  ################################################################################
  
  # Print the corpus-wide summary to STDERR, framed by two rule lines.
  print STDERR ("------------------------------------------------------------------------\n");
  # A literal percent sign in a printf format is written %%.  The small
  # constant added to each denominator guards against division by zero.
  printf STDERR ("TOTAL: %.2f hrs actual speech, %.2f hrs hypothesized: %.2f hrs overlap (%d%%), %.2f hrs false alarm (~%d%%)\n",
                 ($totalActualSpeech/3600.0),
                 ($totalHypSpeech/3600.0),
                 ($totalFoundSpeech/3600.0),
                 ($totalFoundSpeech*100/($totalActualSpeech+0.000001)),
                 ($totalFalseAlarm/3600.0),
                 ($totalFalseAlarm*100/($totalNonSpeechEst+0.000001)));
  # Pass the counters as arguments instead of interpolating them into the
  # format string.
  printf STDERR ("\t%d segments < 0.2 sec and %d segments > 60.0 sec\n",
                 $numShortSegs,
                 $numLongSegs);
  print STDERR ("------------------------------------------------------------------------\n");