Yannick Estève / ONTRAC-Kaldi

Blame view

tools/sctk-2.4.10/bin/chfilt.pl 38 KB
  #!/usr/bin/env perl
  
  ### This changes the function of a runtime warning to dump a stack trace
  #use Carp ();  local $SIG{__WARN__} = \&Carp::cluck;
  ### This dies on a warning after the stack dump
  #use Carp ();  local $SIG{__WARN__} = sub { Carp::cluck(); die; };
  #use Carp ();  local $SIG{__DIE__} = sub { Carp::cluck(); die; };
  
  use strict;
  
  # File:  chfilt
  # History:  
  #    version 1.0  Released 950921
  #    version 1.1  Released 951006
  #        - For Spanish, it now converts a set of predefined words to 
  #          interjections.
  #        - Corrected the order of deleting "--" from the transcripts
  #        - Normalized improperly formatted words in (( )) markers
  #    version 1.2  Released 951025
  #        - The "add_spaces_for_japanese_punctuation" function was splitting
  #          words that contained vowel elongation characters
  #        - Japapese quotation characters are now removed
  #    version 1.3 Released 960418
  #        - changed the exit codes to be 0 for successful completion, or
  #          1 otherwise,  (thanks Barb).
  #        - added English support	 
  #        - modified the main loop to first read in the entire file, then normalize
  #          it, by deleting empty lines and concatenating continued lines together.
  #        - added the -i option
  #        - added the translation flag option "-t"
  #        - added the option -I to not make alternations for interjection words.
  #        - added the equivalence class option "-q"
  #        - added the hyphenated word alternation flag "-a"
  #    version 1.4 Released 970711
  #        - added a the pem output format
  #        - added the -u option
  #        ????? is there an unresolved issue for Japanese ????
  #    version 1.5 Release 971010
  #        - Spanish: installed hesitation filter, removing interjections.
  #        - German:  installed hesitation filter
  #        - Arabic:  updated hesitation filter for *romanized script*
  #        - removed the -i, -u and -I flags
  #        - removed all code that made altenations with NULL
  #        - added the '-d' optionally deletable flag
  #    version 1.6 Released 
  #        - For Arabic scoring, translated the arabic script for an apostrophy to a hyphen
  #    version 1.7 Released April 7, 1998
  #        - fixed a problem with Out-of-Language markers within an unintelligible markers
  #        - fixed a problem with unintelligible markers within an OOL markers
  #        - added a filter to remove empty optionally deletable flags.
  #  Version 1.10
  #       JGF updated to new version or perl 
  #  Version 1.11 
  #       JGF added support for Miss. State transcript
  #       JGF modified the PEM output to pass through the duration times as strings
  #  Version 1.12
  #       JGF commented out part of the MState code that converted 'OH' and 'AW' to hesitations
  #       JGF Correctly handle possessive ancronyms in MState transcripts
  #  Version 1.13
  #       JGF Added processing for spoken letters.  Previously they did not exist.
  #       JGF Added code to remove '^' for proper names
  #       JGF Corrected the line header extraction code to handle empty transcripts
  #       JGF Added -i and -k options
  #       JGF Added code to remove 'aside' sgml tags
  #       JGF Added -p to pad segment times with silence and -s to smooth segmentations
  #       JGF Added -E to exclude overlapping speech          
  #  Version 1.14
  #       JGF Added -c to read contractions
  #       JGF disabled all old arabic character-specific processing
  #       JGF Uses Getopt::Long now tp process the command line arguments
  #       JGF Modified the output for stdout to be "-- -".  The new Getopt
  #           is interpreting '-' as an ambiguous arguement.
  #  Version 1.15
  #       JGF added Arabic processing steps for 2004 arabic transcripts.  See the Eval
  #           plan appendix for Arabic.
  #       JGF add use strict pragma
  #  Version 1.16
  #       JGF added code to check for unannotated, non-posessive contractions IFF contractions
  #           are to be processed.
  #  Version 1.17
  #       JGF Added -C
  #  Version 1.18
  #       JGF Added two new hesitations
  
  my $Version="1.18";
  
  my $usage = "Usage: chfilt <OPTIONS> infile outfile|-- -
  ".
  "Version: $Version
  ".
  "
  ".
  "Desc:  chfilt converts a Callhome transcription file, 'infile', to STM
  ".
  "       format, written to 'outfile'.  If '-- -' is used as output, the output goes to stdout.
  ".
  "
  ".
  "       OPTIONS :==
  ".
  "       -l <language>  Identifies the language on the transcript
  ".
  "                      Current chfilt can handle:  mandarin
  ".
  "                                                  spanish
  ".
  "                                                  japanese
  ".
  "                                                  english
  ".
  "                                                  arabic
  ".
  "                                                  german
  ".
  "       -m             Read Miss. State Transcripts
  ".
  "       -b b_time      Output segments After 'b_time'.
  ".
  "       -e e_time      Output segments ending before 'e_time'.
  ".
  "       -d             Output text with optionally deletable tags.
  ".
  "       -q Word1=Word2
  ".
  "                      Add alternations to the output such that Word1 and Word2
  ".
  "                      are equivalent
  ".
  "       -t Word1=Word2
  ".
  "                      Translate Word1 to Word2
  ".
  "       -a             Convert all hyphenated word to alternations whereby the 
  ".
  "                      two alternations are the word with hyphens deleted, or
  ".
  "                      the word(s) with hyphens converted to spaces
  ".
  "       -i             Add inter-segment gaps to the stm records.  This option
  ".
  "                      requires -b and -e to be used
  ".
  "       -k             Keep lines with no lexemes in the transcript rather than
  ".
  "                      ignoring them
  ".
  "       -o fmt         Set the output format to: 
  ".
  "                        stm  -> Output STM format (default)
  ".
  "                        pem  -> Output PEM format
  ".
  "       -s time        Smooth the segmentation (i.e., merge segments) that hava gap
  ".
  "                      intersegment gaps less than 'time'
  ".
  "       -p time        Pad segment boundaries with 'time' seconds of silence if
  ".
  "                      the time is available
  ".
  "       -c             Don't expand the contractions if they are annotated
  ".
  "       -C (numbers|letters)
  ".
  "                      If 'numbers', change channels A and B to 1 and 2 respectively 
  ".
  "                      If 'letters', change channels 1 and 2 to A and B respectively 
  ".
  "       -E             Exclude overlapping segments in the output STMs
  ";;
  
  ##########3  Globals 
  my $Language;
  my $OptDel;
  my ($NewBeginTime, $NewEndTime);
  my $Format; 
  my $KeepEmptyLines;
  my @Translate;
  my @Equivs;
  my $AltHyphens;
  my $ReadMSstate;
  my $ExcludeOverlap;
  my $PadSilence;
  my $SmoothSegments;
  my $ExpandContractions;
  my $AddInterSegGap;
  my $HES;
  my $InFile;
  my $OutFile;
  my $id;
  my %TranslateAArray;
  my %EquivsAArray;
  my @Trans;
  my $ChannelsTo = "asis";
  
  use Getopt::Long;
  $Getopt::Long::ignorecase = 0;
  my $ret = GetOptions ("s:f", "p:f", "l=s", "b:f", "e:f", "o=s", "t:s", "q:s", "a", "d", "m", "i", "k", "E", "c", "C=s");
  die "
  $usage
  Error: Failed to parse argements" if (! $ret);
  if (defined($main::opt_o)) {
      $Format = $main::opt_o;
      if($Format !~ /^(stm|pem)$/){
          print "
  $usage
  Error: Unrecognized format type --> $Format
  
  ";
          exit 1;
      }
  } else {
      $Format = "stm";
  }
  if (!defined($main::opt_l)){
      print "
  $usage
  Error: Language option is required
  
  ";
      exit 1;
  } else {
      $Language = $main::opt_l;
      if ($Language !~ /japanese/ &&
  	$Language !~ /mandarin/ &&
  	$Language !~ /english/ &&
  	$Language !~ /spanish/ &&
          $Language !~ /arabic/ &&
          $Language !~ /german/) {
  	print "
  $usage
  Error: Language not recognized
  
  ";
  	exit 1;
      }
  }
  
  if (defined($main::opt_b)) { $NewBeginTime = $main::opt_b;            } else { $NewBeginTime = 0.0}
  if (defined($main::opt_e)) { $NewEndTime   = $main::opt_e;            } else { $NewEndTime   = 999999;  }
  if (defined($main::opt_k)) { $KeepEmptyLines= $main::opt_k; } else { $KeepEmptyLines = 0; }
  if (defined($main::opt_t)) { @Translate    = split(/:/,$main::opt_t); } else { @Translate = (); }
  if (defined($main::opt_q)) { @Equivs       = split(/:/,$main::opt_q); } else { @Equivs = ();    }
  if (defined($main::opt_a)) { $AltHyphens   = 1;  $main::opt_a = 1; } else{ $AltHyphens = 0;$main::opt_a=0;}
  if (defined($main::opt_m)) { $ReadMSstate  = 1;  $main::opt_m = 1; } else{ $ReadMSstate = 0;$main::opt_m=0;}
  if (defined($main::opt_E)) { $ExcludeOverlap = $main::opt_E } else{ $ExcludeOverlap = 0}
  if (defined($main::opt_p)) { $PadSilence   = $main::opt_p } else{ $PadSilence = -1.0}
  if (defined($main::opt_s)) { $SmoothSegments = $main::opt_s } else{ $SmoothSegments = -1.0}
  if (defined($main::opt_c)) { $ExpandContractions = !(defined($main::opt_c))} else{ $ExpandContractions = 1}
  if (defined($main::opt_C)) { 
      die "Error: Unknown channel conversion '$main::opt_C'" if ($main::opt_C !~ /^(numbers|letters)/);
      $ChannelsTo = $main::opt_C;
  }
  $AddInterSegGap = defined($main::opt_i);
  die "$usage
  Error: -i requires -b and -e to be used"
      if (defined($main::opt_i) && (! defined($main::opt_b) || !defined($main::opt_e)));	
  
  if (defined($main::opt_d)) {
      $OptDel       = 1;  $main::opt_d = 1;
      $HES = " (%HESITATION) ";
  } else {
      $OptDel = 0; $main::opt_d = 0;
      $HES = "%HESITATION";
  }
  
  #### The main functions arguements:
  if ($#ARGV > 1) { print "
  $usage
  Too many arguements 
  
  "; exit 1; } 
  if ($#ARGV == 0) { print "
  $usage
  Output Not Specified
  
  "; exit 1; } 
  if ($#ARGV == -1) { print "
  $usage
  Input and Output Not Specified
  
  ";
  		    exit 1; } 
  
  $InFile=$ARGV[0];
  $OutFile=$ARGV[1];
  
  if (! -r $InFile){
      print "
  $usage
  Input file $InFile does not exist
  
  ";
      exit 1;
  }
  
  if ($OutFile eq "-"){
      $OutFile = "";
  } else {
      $OutFile = " > ".$OutFile;
  }
  
  ($id=$InFile) =~ s:^.*/::;
  $id =~ s/\.[^\.]*$//;
  
  # Check the translation classes.  If they exist, make an associative
  # array to house them
  if ($#Translate >= 0){
      foreach my $tlate (@Translate){
  	$tlate =~ tr/a-z/A-Z/;
  	if ($Language eq "german" || $Language eq "spanish") {
  	    $tlate =~ tr/\340-\377/\300-\337/;     #Upper-Case accented letters
  	}
  	my @tl = split(/=/,$tlate);
  	$TranslateAArray{$tl[0]} = $tl[1];
      }
  }
  
  # Check the equivalence classes.  If they exist, make an associative
  # array to house them
  if ($#Equivs >= 0){
      foreach my $equiv (@Equivs){
  	$equiv =~ tr/a-z/A-Z/;
  	if ($Language eq "german" || $Language eq "spanish") {
  	    $equiv =~ tr/\340-\377/\300-\337/;     #Upper-Case accented letters
  	}
  	my $alt;
   	($alt = $equiv) =~ s/^/{ /;
  	 $alt           =~ s/$/ }/;
  	 $alt           =~ s:=: / :g;
  	foreach my $eq (split(/=/,$equiv)){
  	    $EquivsAArray{$eq} = $alt;
  	}
      }
  }
  
  open(FILE,"< $InFile") || die("cannot open input Callhome file $InFile"); 
  #binmode FILE;
  
  my $hubscr_path = "../hubscr/hubscr.pl";
  die "ERROR: 'hubscr.pl' cannot be found, please edit 'chfilt.pl line 267 and set hubscr.pl path."if(! -e $hubscr_path);
  
  #open(OUTPUT,"| sort -t \\   +0  -1  +1  -2 +3nb -4 $OutFile") ||
  open(OUTPUT,"| perl $hubscr_path sortSTM $OutFile") || 
      die("cannot open output file $OutFile"); 
  
  ## make a place holder for the LUR reports
  print OUTPUT ';; CATEGORY "0" "" ""'."
  ";
  print OUTPUT ';; LABEL "O" "Overall" "All Segments in the test set"'."
  ";
  
  # read in the file, deleting blank lines:
  while (<FILE>){
      s/\[\[[^]]*\]\]//g;            #remove comments              '[[words]]'
  
      if ($_ !~ /^\s*$/ && $_ !~ /^#/){
  	if ($ReadMSstate){
  	    s/^\S+\s+//;
  	}
  	push(@Trans,$_);
      }
  }
  close (FILE);
  
  # normalize the transcript so that each turn is on a line by itself
  for (my $n=0; $n<=$#Trans; $n++){
      if ($Trans[$n] =~ /^\s*\d+\.\d*\s+\d+\.\d*\s+[AB]\d*:/){
  	my $orig = $Trans[$n];
  	if ($ExpandContractions){
  	    ### Scan the transcript looking for un-annotated non-possessive contractions
  	    my $ct = $Trans[$n];	    
  	    ### delete all tokens annotated as contractions
  	    $ct =~ s/(<contraction\s+e_form=\"([^\"]+)\">([^ \t
  \r\f\)]+))//gi;
  	    foreach my $ctt(split(/[\s\,\.\?]+/,$ct)){
  		print STDERR "Warning: Possible non-annotated contraction \"$ctt\"
  " if ($ctt =~ /\w'\w+/ && $ctt !~ /\w's$/);
  	    }
  	}
  	while ($Trans[$n] =~ /(<contraction\s+e_form=\"([^\"]+)\">([^ \t
  \r\f\)]+))/i){
  	    my ($tag, $rep, $token) = (quotemeta($1), $2, $3);
  	    my $exp;
  	    ($exp = $rep) =~ s/\[[^\[=]+=>/ /g;
  	    $exp =~ s/\]//g;
  	    $exp =~ s/^\s+//g;
  
  #	    print "Found $tag $rep $token '$exp' ------------ in $Trans[$n]";
  	    if ($ExpandContractions){
  		$Trans[$n] =~ s/$tag/$exp/;
  	    } else {
  		$Trans[$n] =~ s/$tag/$token/;
  	    }	    
  	    die "Error: unable to expand the contraction
  " if ($orig eq $Trans[$n]);
  	    $orig = $Trans[$n];
  	}
  	die "Error: Contraction not handled $Trans[$n]" if ($Trans[$n] =~ /<contrac/);
      } else {
  #	print "no match $Trans[$n]";
  	if ($n == 0){
  	    die("Illegal format, first line does not have a begin time, end time and channel");
  	}
  	chop($Trans[$n - 1]);
  	$Trans[$n - 1] = $Trans[$n - 1]." ".$Trans[$n];
  	splice(@Trans,$n,1);
  	$n--;
      }
  }   
  
  #print @Trans;
  
  my @STMS = ();
  
  ### Normalize the text 
  foreach $_ (@Trans){
      #match the lines header
      chomp;
      die "Unable to strip the line header of '$_'" unless s/^(.*[AB][0-9]*):\s*//;
      my $line_header = $1;             #Store the header in a variable
  
      tr/\011/ /;                    #change all tabs to spaces
      s/ /  /g;                      #multiple spaces go to one
      s/^/ /;                        #add spaces at begining of line
      s/$/ /;                        #put a space at the end of each line
      s/\s\(\(/ (( /g;                 #add spaces around unintelligble markers
      s/\)\)\s/ )) /g;                 #add spaces around unintelligble markers
      if ($OptDel == 0){
          s:<foreign\s+language=[^>]+>([^><]+)</foreign>:$1:gi;
      } else {
          s/\(\(\s*<foreign\s+language=\"[^"]+\">([^><]+)<\/foreign>\s*\)\)/\(\($1\)\)/gi; 
          s/<foreign\s+language=\"[^"]+\">([^><]+)<\/foreign>/\(\($1\)\)/gi;
     } 
  #print "$_ here
  ";
      if ($Language ne "arabic") {
         tr/a-z/A-Z/;                #Upper-Case all letters
         if ($Language eq "german" || $Language eq "spanish") {
  	   tr/\340-\377/\300-\337/     #Upper-Case accented letters
         }
      }
      s/\{[^{}]*\}//g;               #remove non-speech vocal noises         '{words}'
  
      ### LANGUAGE Specific Normalizations
      if ($Language eq "arabic") { 
  	my $hes;
  	if ($OptDel == 0){
  	    $hes="%\330\252\330\261\330\257\331\221\330\257";
  	} else {
  	    $hes=" (%\330\252\330\261\330\257\331\221\330\257) ";
  	}
          s/[% ]\330\243\331\207 /$hes /g;           ## %>h
          s/[% ]\330\245\331\212\331\207 /$hes /g;   ## %<yh
          s/[% ]\330\243\331\205 /$hes /g;           ## >m
          s/[% ]\330\243\331\210\331\210 /$hes /g;   ## >ww
          s/[% ]\331\207\330\247\331\212 /$hes /g;   ## hAy
          s/[% ]\331\205\331\207\331\205 /$hes /g;   ## mhm
       
          s/%*(\330\243\331\207\331\207) /$1 /g;         ## Back Channel
          s/%\330\243\330\254\331\206\330\250\331\212 / /g;     ##  Foreign lexeme
          s/%\330\252\330\257\330\247\330\256\331\204.*%\330\252\330\257\330\247\330\256\331\204\\ / /g;   ## End cross channel
  
          ### Remove non speech markers
          s/%\330\265\331\205\330\252\\* / /g;    # silence
          s/%\330\245\331\206\331\202\330\267\330\247\330\271\\* / /g; # pause symbol
          s/%\330\266\330\254\331\221\330\251\\* / /g;                 # noise symbol
          s/%\330\252\331\206\331\201\331\221\330\263\\* / /g;         # breath symbol
          s/%\330\263\330\271\330\247\331\204\\* / /g;                 # cough
          s/%\330\266\330\255\331\203\\* / /g;                         # laugh
          s/%\330\243\330\265\331\210\330\247\330\252\\* / /g;         # people talk
  
  #	## The LDC Transcripts have a special case encoded in them for
  #	## foreign words with an attached definite article.  The next few commands
  #	## move the definite article into the foreign tag.
  #	s/\s(\S+)(<\S+)\s+(\S+)/ $2 $1$3/g;
  #	## translate the arabic signle apostrophies to hyphens
  #	s/\254\s/- /g;
  #	s/\254$/- /g;
  #        s/\s\254/ -/g;
  #	## Arabic Semi colon
  #	s/\s\273/ /g;
      } elsif ($Language eq "english") { 
  	;
      } elsif ($Language eq "japanese") {
  	$_ = &add_spaces_for_japanese_punctuation($_);
  	s/\241\242 /, /g;          # Change the Japanese coma
  	s/\241\243 /. /g;          # Change the Japanese periods
  	s/\241\244 /, /g;          # Change the Japanese periods
  	s/\241\250 /; /g;          # Change the Japanese semi-colon
  	s/\241\251 /? /g;          # Change the Japanese question marks
  	s/\241\252 /! /g;          # Change the Japanese exclaimation marks
  	s/\241\327 / /g;           # Change the Japanese quotation mark
  	s/ \241\326/ /g;           # Change the Japanese quotation mark
  	s/\241\343([\000-\177])/<$1/g;      # Change the Japanese '<'
  	s/([\000-\177])\241\344/$1>/g;      # Change the Japanese '>'	 
  	# Convert Japanese col and dia flags to alternations
  	s/@([^@\[\]]+)\[\[([^,\[\]]+)\s*,[^\[\]]+\]\]/ { $1 \/ $2 } /g;
      } elsif ($Language eq "mandarin") {
  	while (s/(<[^_>]*)_/$1 /) { # Removes underlines from OOL items
  	    ;
  	}
  	s/@([^@]+)@/$1/g;           # Removes obviously the @'s !!!
  	s/\*([^\*]+)\*/$1/g;           # Removes obviously the *'s !!!
      }
  
      ### Call Home filtering (General)
  
      s/\-\-//g;                     #whatever they are??
      s/[.,?!;]//g;                   #Remove normal punctuation
      s/"//g;                        #Remove quotation marks
      s/\#/ /g;                      #remove simultaneous speech markers     '#'
      s/\[\[[^]]*\]\]//g;            #remove comments              '[[words]]'
      s/\[[^]]*\]//g;                #remove other sounds          '[words]'
      s/\/\///g;                     #remove speech to someone else markers  '//'
      s/<\/*aside\/*>//g;            #remove aside sgml tags
  
      if ($ReadMSstate){
          s/\^(\S)/$1/g;              #remove proper name markers? '^'
          s/\*(\S)/$1/g;              #remove ??? name markers? '*'
          s/\@(\S)/$1/g;              #remove ??? name markers? '@'
          s:<as/>[^<>]+</as>::ig;   #remove asides
          while ($_ =~ /~/){          # Deal with acronyms
              s/~(\S) /$1 /;
              s/~(\S\'[sS]) /$1 /;       # Deal with a possessive acronym
              s/~(\S)(\S)/$1 ~$2/;
          } 
      }
  
      s/\(\(\s*\)\)//g;              #remove unintelligable speech with no words
  
      ## Changes from new LDC transcription practices (Jan 2003)_
      s/~([A-Z-a-z])/$1./g;          #Change the spoken letters to the Hub4 form
      s/\^//g;                        #Remove proper names
  
  
      if ($OptDel == 1){
          s/\(\s+/(/g;                 #Normalize the variablity in the LDC trans
          s/\s+\)/)/g;                 #Normalize the variablity in the LDC trans
          my $vb = 0;
          my $iteration = 1;
          print "Start $_
  " if ($vb);
          my $loop_end = "xxxxxxxxxxxxxxxxx";
  
          my $plainWord = "[^ \t
  \r\f\(\)\{\}]+";
          my $optDelWord = "\\([^ \t
  \r\f\(\)\{\}]+\\)";
  #        my $wordSet = "($plainWord|$optDelWord)(\s+($plainWord|$optDelWord))*";
  
          while ($_ =~ /\(\(.*\)\)/) {
              # if the unintell is out side of a OOL marker, just delete them
              s/\(\((\s*<[^>]+>\s*)\)\)/$1/;
              print "      Step 1 $_
  " if ($vb);
  
              #  handle unintelligle markers within OOL markers
              s/\(\((\s*<[^>]+>\s*)([^\)]*)\)\)/$1 \(\($2\)\)/;
              print "      Step 2 $_
  " if ($vb);
  
  	    #Change 1st unintelligable word, if its already optionally deletable
              s/\(\(\s*($optDelWord)\s+(($plainWord|$optDelWord)(\s+($plainWord|$optDelWord))*\s*\)\))/ $1 \(\($2/g;
              print "      Step 3 $_
  " if ($vb);
  
  	    #Change 1st unintelligable word, if its an alternate
              s:\(\(\s*({[^{}]+})\s+(($plainWord|$optDelWord)(\s+($plainWord|$optDelWord))*\s*\)\)): \($1\) \(\($2:g;
              print "      Step 4 $_
  " if ($vb);
  
  	    #Change 1st unintelligable word, if its a plain word
              s/\(\(\s*($plainWord)\s+(($plainWord|$optDelWord)(\s+($plainWord|$optDelWord))*\s*\)\))/ \($1\) \(\($2/g;
              print "      Step 5 $_
  " if ($vb);
  
              ## Change a single optionally deletable word
              s/\(\(\s*($optDelWord)\s*\)\)/ $1 /g;
              print "      Step 8 $_
  " if ($vb);
  
              #alternate single unintelligable with alternate
              s/\(\(\s*({[^{}]+})\s*\)\)/ \($1\) /g;
              print "      Step 9 $_
  " if ($vb);
  
              #alternate single unintelligable words
              s/\(\(\s*($plainWord)\s*\)\)/ \($1\) /g;
              print "      Step A $_
  " if ($vb);
  
  	    # remove empty unintellegible markers
  	    s/\(\(\s*\)\)//g;
              $iteration ++;
  	
  	    if ($_ eq $loop_end){
  	         print STDERR "Warning: unable to fully alternate an unitelligible marker
  ".
  	                      "         $_
  ";
  	         last;
  	    }
              if ($iteration > 100){
  	         die "Unable to alternate an unitelligible marker '$_'
  ";
              }
              # Remember what the last line was so that We don't get it an infinite loop
              $loop_end = $_;
              print "   End $_
  " if ($vb);
          }
      } else {
  #        print "Delete (( $_
  ";
          s/\s\(\(/ /g;                 #Strip the unintelligible markers
          s/\)\)\s/ /g;                 #Strip the unintelligible markers
  #        print "          $_
  ";
      }
  
      if (! $OptDel){
          s/<\s*\S+\s+([^<>]*)>/$1/g;  #remove out-of-language speech markers
          s/<[^<>]*>//g;                #remove out-of-language speech markers
      } else {
          while ($_ =~ /(<\s*\S+\s+)([^<>]*)>/) {
              my ($otxt, $head) = ($2, $1);
              my $txt = $otxt;
              $txt =~ s/^\s+//;;
              $txt =~ s/\s+$//;;
              my $new= "";
              foreach my $w(split(/\s+/,$txt)){
                  if ($w !~ /^\(.+\)$/){
                      if ($new eq ""){
                          $new = "(".$w.")";
                      } else {
                          $new .= " (".$w.")";
                      }
                  } else {
                      $new .= " $w";
                  }
              }
              if ( ($_ =~ s/${head}[^>]*>/$new/) != 1){
                  die("OptDel substitution of non-language marker failed
  ".
                      "    text: $_    head: '$head'
      otxt: '$otxt'
      new: '$new'".
                      "
      merge: '${head}${otxt}>'");
              }
          }
          ### delete noise tags.  These are new as of 1/20/2004
          s/<[^ \t
  \r\f<>]+>//g;
     }
  
      ### Language Specific Filters
  
      # SPANISH
      if ($Language eq "spanish") {
          y/\240-\277//d;	             #remove spanish punctuation
          s/\*\*//g;		     #words marked with "**" (as neologisms or unknown
                                       # 'words')
          s/\+//g;		     #words marked with "++" (mispronunciations given
                                       # in correct orthography)
  
          # map the list of hesitiation's
          s:%::g; # remove all hesitation markers (maybe illegal ones!)
          # insert all hesitation markers (maybe missed ones!)
          s/ AAA / $HES /g;
          s/ EEE / $HES /g;
          s/ III / $HES /g;
          s/ MMM / $HES /g;
          s/ EMM / $HES /g;
          s/ AMM / $HES /g;
          s/ IMM / $HES /g;
          s/ EH / $HES /g;
          s/ MMH / $HES /g;
          s/ OH / $HES /g;
          s/ UH / $HES /g;
          s/ EY / $HES /g;
          s/ UY / $HES /g;
          s/ PSS / $HES /g;
          s/ HM / $HES /g;
          s/ UF / $HES /g;
          s/ SHH / $HES /g;
          s/ PFF / $HES /g;
          s/ OY / $HES /g;
          s/ HA / $HES /g;
          s/ AH / $HES /g;
       }
  
       # GERMAN
       if ($Language eq "german") {
  
          s/\&//g;	 		 #remove proper name marker
          s/\+//g;		     #words marked with "++" (mispronunciations given
                                       # in correct orthography)
           s/\*\*//g;		     #words marked with "**" (as neologisms
                                       # or unknown 'words')
          
          s:%::g; # remove all hesitation markers (maybe illegal ones!)
          # insert all hesitation markers (maybe missed ones!)
  
          s/ ÄH / $HES /g;
          s/ MM / $HES /g;
          s/ ÄHM / $HES /g;
          s/ HM / $HES /g;
          s/ HA / $HES /g;
          s/ EI / $HES /g;
          s/ UH / $HES /g;
          s/ HÄ / $HES /g;
          s/ HO / $HES /g;
          s/ UFF / $HES /g;
          s/ OI / $HES /g;
          s/ HUH / $HES /g;
          s/ BAH / $HES /g;
          s/ UI / $HES /g;
       }
  
  
       # ARABIC
       if ($Language eq "arabic") {
  
          s/\&//g;	 		 #remove proper name the marker
          s/\s\+(\S+)\+\s/ $1 /;           #Remove the plus markers
  #        s:%\S+\s::g; # remove all hesitation markers (maybe illegal ones!)
          s/\*\*//g;		     #words marked with "**" (as neologisms
                                       # or unknown 'words')
          s/$/ /;                      ## Add a space at the end of the line
          s/ /  /;                     ## Add double spaces
  
          # insert all hesitation markers (maybe missed ones!)
  #        s/ ah / $HES /g;
  #        s/ E / $HES /g;
  #        s/ M / $HES /g;
  #        s/ mhm / $HES /g;
  #        s/ ha / $HES /g;
  #        s/ uh / $HES /g;
  #        s/ yA / $HES /g;
  #        s/ aha / $HES /g;
  #        s/ yaa / $HES /g;
  #        s/ hm / $HES /g;
  #        s/ hum / $HES /g;
  #        s/ Ah / $HES /g;
  #        s/ ayyO  / $HES /g;
  #        s/ Eyy / $HES /g;
  #        s/ yuu / $HES /g;
  #        s/ ih / $HES /g;
  #        s/ yO / $HES /g;
  #        s/ O / $HES /g;
  #        s/ wAw / $HES /g;
  #        s/ hi / $HES /g;
  #        s/ Hay / $HES /g;
  #        s/ yOO / $HES /g;
  #        s/ OhO / $HES /g;
  #        s/ hE / $HES /g;
  #        ### Script equivalences
  #        s/ \302\347 / $HES /g;
  #        s/ \305\352\347 / $HES /g;
  #        s/ \305\352\352\347 / $HES /g;
  #        s/ \315\351 / $HES /g;
  #        s/ \345 / $HES /g;
  #        s/ \307\350 / $HES /g;
  #        s/ \307\350\347\350 / $HES /g;
  #        s/ \307\347 / $HES /g;
  #        s/ \307\347\307 / $HES /g;
  #        s/ \307\352\350     / $HES /g;
  #        s/ \347\351 / $HES /g;
  #        s/ \347\307 / $HES /g;
  #        s/ \347\351 / $HES /g;
  #        s/ \347\345 / $HES /g;
  #        s/ \347\345 / $HES /g;
  #        s/ \305\347 / $HES /g;
  #        s/ \345\345 / $HES /g;
  #        s/ \307\347 / $HES /g;
  #        s/ \350\307\350 / $HES /g;
  #        s/ \352\307\347 / $HES /g;
  #        s/ \352\350 / $HES /g;
  #        s/ \352\350\350 / $HES /g;
  #        s/ \352\307 / $HES /g;
  #        s/ \352\347 / $HES /g;
       }
  
       # MANDARIN
       if ($Language eq "mandarin") {
           s/\+//g;	 		 #remove non-standard representation marker
           s/\&//g;	 		 #remove some the marker
           s/\243[\254\255] / /g;      # Remove Japanese punctuation
  
           s/\s%(\S)/ $1/g;          # Delete the all the percent signs at word head
           s/\s\(\%/ (/g;          # Delete the all the percent signs at word head of unclear
           s/%\s/ /g;                  # Delete the all the percent signs at word tail
  
           # NOTE: Cannot remove all hesitation markers because some "%" are used
           #       in the text (I think)
           #       So we will just filter the known hesitation markers.
           s/\sÚÀ\s/ $HES /g;
           s/\sßÀ\s/ $HES /g;
           s/\sàÅ\s/ $HES /g;
           s/\sßí\s/ $HES /g;
           s/\sºÇ\s/ $HES /g;
       }	
  	
       # JAPANESE
       if ($Language eq "japanese") {
           s/\+//g;		    #remove non-standard representation marker
           s/\=/\-/g;                 #change frag= to frag-
           s/\241[\241-\256] / /g;      # Remove Japanese punctuation
       }
  
       if ($Language eq "english") {
           s/\&//g;	 		 #remove proper name the marker
           s/\*\*//g;		     #words marked with "**" (as neologisms
                                       # or unknown 'words')
           s/\*//g;		     #words marked with "*" idiosyncratic words
                                       # or unknown 'words')
           s/\+//g;		     #words marked with "++" (mispronun-
                                       # ciations given in correct orthography)
           s/@//g;                     # Spoken acronyms
           ### Mapp the hesitations
           s/[\s\(][%]*(UH|UM|EH|MM|AH|HM|HUH|HA|ER|OOF|HEE|ACH|EEE|EW)[\s\)]/ $HES /g;
           if ($ReadMSstate){
  #### Commented out because this should be changed in SPINE
  ####         s/[\s\(][%]*(OH|AW)[\s\)]/ $HES /g;
               s/[\s\(][%]*(MHM)[\s\)]/ $1 /g;
           }
       }
  
       # This has to be done AFTER ALL THE PROCEEDING STEPS to alternate any fragment words
       if ($OptDel){
           s/(\S+)-(\S+)[-=]\s/ $1 \($2-\) /g;  #put in alternates for fragments    
           s/(\S*)[-=]\s/ \($1-\) /g;  #put in alternates for fragments    
           s/\s[-=](\S*)/ \(-$1\) /g;  #put in alternates for fragments    
       } else {
           s/(\S*)[-=]\s/ $1- /g;  #put in alternates for fragments    
           s/\s[-=](\S*)/ -$1 /g;  #put in alternates for fragments    
       }
       s/ +/ /g;
       if ($#Translate >= 0){	
  	$_ = &Map_AArray($_,\%TranslateAArray);
       }	
       if ($#Equivs >= 0){	
  	$_ = &Map_AArray($_,\%EquivsAArray);
       }
       if ($AltHyphens) {
          $_ = &AlternateHyphenWords($_);
       }
  
       ### delete any emtpy optionally deletable words
       if ($OptDel){         s/\(\)//g;  }
  
       my($bt, $et, $spkr) = split(/\s+/,$line_header);
       my $chan;
       ($chan = $spkr) =~ s/[0-9]$//;
       if (&in_range($bt, $et) && ($KeepEmptyLines || $_ !~ /^\s*$/)){
           if ($Format eq "stm"){
  	     push @STMS, sprintf "%s %s %s %s %6.2f <O> %s
  ",$id,makeChannel($chan),$id."_".$spkr,$bt,$et,$_; 
           } elsif ($Format eq "pem"){
  	     printf OUTPUT "%s %s %s %s %s
  ",$id,makeChannel($chan),"unknown_speaker",$bt,$et; 
           }
       }
  }
  
  sub makeChannel{
      my ($chan) = @_;
      if ($ChannelsTo eq "numbers") {
  	$chan =~ s/a/1/i;
  	$chan =~ s/b/2/i;
      } elsif ($ChannelsTo eq "letters") {
  	$chan =~ s/1/A/i;
  	$chan =~ s/2/B/i;
      } elsif ($ChannelsTo eq "asis") {
  	;
      } else {
  	die "Error: unknown Channel conversion $ChannelsTo";
      }
      $chan;
  }
  
  sub MIN{
      my(@a) = @_;
      my($min) = 99e9;
  
      while ($#a >= 0){
          # print "  $a[0]
  ";
          $min = $a[0] if ($min > $a[0]);
          splice(@a,0,1);
      }
      $min;
  }
  
  if ($Format eq "stm"){
      my (%lastStmEnd, %lastStmEndId, %lastStmEndIndex, $nt);
      ### Pad the STMs segments with silence if you got it
      if ($PadSilence >= 0){
          for (my $i=0; $i<@STMS; $i++){
  #            print "STM[$i] = $STMS[$i]";
              my ($id, $chan, $spkr, $bt, $et, $lur, $text) = split(/\s+/,$STMS[$i],7);
              if (! defined($lastStmEnd{$chan})){
  #                print "Set \$lastStmEnd{$chan} = $NewBeginTime;
  ";
                  $lastStmEnd{$chan} = $NewBeginTime;
                  $lastStmEndId{$chan} = $id;
  		$lastStmEndIndex{$chan} = $i;
              }
              if ($lastStmEnd{$chan} < $bt){
                  my $shift = MIN(($bt - $lastStmEnd{$chan}) / 2.0, $PadSilence);
                  if ($lastStmEndIndex{$chan} < $i){
                      $nt = sprintf("%.2f", $lastStmEnd{$chan} + $shift);
  #		    print "fix $nt $STMS[$lastStmEndIndex{$chan}] ";
                      die "Can't fix silence pad time of previous line" unless ($STMS[$lastStmEndIndex{$chan}] =~ s/^(\S+\s+\S+\s+\S+\s+\S+\s+)\S+/$1$nt/);
  #		    print "->  $STMS[$lastStmEndIndex{$chan}]
  ";
                  } 
                  $nt = sprintf("%.2f",$bt - $shift);
                  die "Can't fix silence pad time" unless ($STMS[$i] =~ s/^(\S+\s+\S+\s+\S+\s+)\S+/$1$nt/);
              } 
              $lastStmEnd{$chan} = $et;
         	    $lastStmEndIndex{$chan} = $i;
  #            print "     \$lastStmEnd{$chan} = $et
  ";
          }
          foreach my $chan(sort(keys %lastStmEnd)){
              my ($id, $ccchan, $spkr, $bt, $et, $lur, $text) = split(/\s+/,$STMS[$lastStmEndIndex{$chan}],7);
  #	    print "End $STMS[$lastStmEndIndex{$chan}]";
              if ($lastStmEnd{$chan} < $NewEndTime){
                  my $shift = MIN(($NewEndTime - $lastStmEnd{$chan}) / 2.0, $PadSilence);
                  $nt = $lastStmEnd{$chan} + $shift;
                  die "Can't fix silence pad time" unless ($STMS[$lastStmEndIndex{$chan}] =~ s/^(\S+\s+\S+\s+\S+\s+\S+\s+)\S+/$1$nt/);
  #		print "    end now $STMS[$lastStmEndIndex{$chan}]
  ";
              }
          }
      }
      undef %lastStmEnd;
      undef %lastStmEndId;
  
      ## Smooth the segments 
      if ($SmoothSegments >= 0){
          for (my $i=0; $i<@STMS; $i++){
              my ($id, $chan, $spkr, $bt, $et, $lur, $text) = split(/\s+/,$STMS[$i],7);
              if (! defined($lastStmEnd{$chan})){
                  $lastStmEnd{$chan} = $NewBeginTime;
                  $lastStmEndId{$chan} = $id;
  		$lastStmEndIndex{$chan} = $i;
              }
  	    my $change = 0;
              if ($lastStmEnd{$chan} > $bt - $SmoothSegments && $lastStmEndIndex{$chan} < $i){
  		my ($lid, $lchan, $lspkr, $lbt, $let, $llur, $ltext) = split(/\s+/,$STMS[$lastStmEndIndex{$chan}],7);
  		if ($id eq $lid && $chan eq $lchan && $spkr eq $lspkr && $lur eq $llur){
  		    chomp $ltext;
  		    chomp $text;
  		    $STMS[$lastStmEndIndex{$chan}] = 
  			"$id $chan $spkr $lbt $et $lur $ltext $text
  ";
  		    splice (@STMS, $i, 1);
  		    $i --;
  		    $change = 1;
  		}
              } 
  	    $lastStmEnd{$chan} = $et;
  	    if (! $change){
  		$lastStmEndIndex{$chan} = $i;
  	    }
          }
      }
      undef %lastStmEnd;
      undef %lastStmEndId;
  
      ## Smooth the segments 
      if ($ExcludeOverlap){
          for (my $i=0; $i<@STMS; $i++){
  #            print "STM[$i] = $STMS[$i]";
              my ($id, $chan, $spkr, $bt, $et, $lur, $text) = split(/\s+/,$STMS[$i],7);
              if (! defined($lastStmEnd{$chan})){
                  $lastStmEnd{$chan} = $NewBeginTime;
                  $lastStmEndId{$chan} = $id;
  		$lastStmEndIndex{$chan} = $i;
              }
  	    my $change = 0;
              if ($lastStmEnd{$chan} > $bt && $lastStmEndIndex{$chan} < $i){
  		my ($lid, $lchan, $lspkr, $lbt, $let, $llur, $ltext) = split(/\s+/,$STMS[$lastStmEndIndex{$chan}],7);
  		if ($id eq $lid && $chan eq $lchan){
  		    chomp $ltext;
  		    $STMS[$lastStmEndIndex{$chan}] = 
  			"$id $chan excluded_region $lbt $et $lur ignore_time_segment_in_scoring
  ";
  		    splice (@STMS, $i, 1);
  		    $i --;
  		    $change = 1;
  		}
              } 
  	    $lastStmEnd{$chan} = $et;
  	    if (! $change){
  		$lastStmEndIndex{$chan} = $i;
  	    }
          }
      }
      undef %lastStmEnd;
      undef %lastStmEndId;
  
       ## Add the intersegment gaps
       if ($AddInterSegGap){
           for (my $i=0; $i<@STMS; $i++){
               my ($id, $chan, $spkr, $bt, $et, $lur, $text) = split(/\s+/,$STMS[$i],7);
               if (! defined($lastStmEnd{$chan})){
                   $lastStmEnd{$chan} = $NewBeginTime;
                   $lastStmEndId{$chan} = $id
               }
               if ($lastStmEnd{$chan} < $bt - 0.01){
  #            if (0){
                   splice (@STMS, $i, 0, sprintf("%s %s %s %s %6.2f <O> %s
  ",$id,$chan,
                                         "${id}_${chan}_".
                                         "inter_segment_gap",
                                         $lastStmEnd{$chan},$bt,""));
                   $i++;
               }
               $lastStmEnd{$chan} = $et;
           }
           foreach my $chan(sort(keys %lastStmEnd)){
               if ($lastStmEnd{$chan} < $NewEndTime - 0.01){
                   push(@STMS, sprintf("%s %s %s %s %6.2f <O> %s
  ",$lastStmEndId{$chan},$chan,
                     "${id}_${chan}_inter_segment_gap",
                      $lastStmEnd{$chan},$NewEndTime,"")); 
               }
           }
      }
  }
  print OUTPUT @STMS;
  close OUTPUT;
  exit 0;
  
  ###############################################################################
  ####                                                                       ####
  ####                        END OF MAIN PROGRAM                            ####
  ####                                                                       ####
  ###############################################################################
  
   
  ####
  ####  Return true if the passed in begin and end times are within the 
  ####  specified range of output times.  If not return false.
  ####  Inputs:   - Argument 0: The begin time
  ####            - Argument 1: The end time
  ####  Outputs:  - Return true if:
  ####              * the global variables $NewBeginTime or $NewEndTime
  ####              * are set, and the begin and end times are with their
  ####              * respective ranges
  ####              OR
  ####              * neither global variables are defined.
  sub in_range{
      my ($bt,$et) = @_;
      my $do_prnt = 0;
      if (defined($NewBeginTime) && defined($NewEndTime)) {
  	if (($bt >= $NewBeginTime || &eqdelta($bt,$NewBeginTime,0.5)) &&
  	    ($et <= $NewEndTime || &eqdelta($et,$NewEndTime,0.5))) {  
  	    $do_prnt = 1;
  	}
      } elsif (defined($NewBeginTime)) {
  	if ($bt >= $NewBeginTime ||
  	    &eqdelta($bt,$NewBeginTime,0.5)) {
  	    $do_prnt = 1;
  	} 
      } elsif (defined($NewEndTime)) {
  	if ($et <= $NewEndTime || &eqdelta($et,$NewEndTime,0.5)) {
  	    $do_prnt = 1;
  	}
      } else {
  	$do_prnt = 1;
      }
      $do_prnt;
  }
  
  
  ####
  ####  Compare two real numbers, they're equal if the absolute value of the
  ####  difference is less than delta
  ####  Inputs:  - Argument 0: real number 1.
  ####           - Argument 1: real number 2.
  ####           - Argument 2: The delta.
  ####  Outputs: - True if they are equal, False if they are not
  sub eqdelta {
      my($v1,$v2,$del) = @_;
      if ($v1 < $v2) {
          $v2 - $v1 < $del;
      } else {
          $v1 - $v2 < $del;
      }
  }
  
  sub add_spaces_for_japanese_punctuation {
      $_ = $_[0];
      chomp;
      my $first = 0;
      my $c1 = "";
      my $lc = "";
      my $out = "";
      
      while (length($_) > 0){
  	s/^(.)//;
  	$c1 = $1;
  	if ($c1 =~ /^[\177-\377]/){
  	    if (!$first) { $first = 1; } else { $first = 0; }
  	}
  	$out = $out.$c1;
  	if (!$first && ($lc.$c1) =~ /^\241[\241-\270\272\273\275-\377]$/) {
  	    $out = $out." ";
  	}
  	$lc = $c1;
      }
      $out."
  ";
  }
  
  sub Map_AArray{
      $_ = $_[0];
      my $AArray = $_[1];
      my $out = "";
      my $word = "";
      foreach $word (split){
  	if ($AArray->{$word} !~ /^$/){
  	    $out .= " ".$AArray->{$word};
  	} else {
  	    $out .= " ".$word;
  	}
      }
      $out .= "
  ";
      $out;
  }
  
  sub AlternateHyphenWords{
      $_ = $_[0];
      my $out = "";
      my $word = "";
      my $x;
      my $y;
      my $z;
      foreach $word (split){
  	if ($word =~ /[A-Za-z][_-][A-Za-z]/){
  	    ($x=$word) =~ s/[_-]//g;
  	    ($y=$word) =~ s/[_-]/ /g;
  	    ($z=$word) =~ s/[_-]/-/g;
  	    $word = "{ ".$y." / ".$x." / ".$z." }"; 
  	}
  	$out .= " ".$word;
      }
      $out .= "
  ";
      $out;
  }