Yannick Estève / ONTRAC-Kaldi

Blame view

tools/sctk-2.4.10/bin/tanweenFilt.pl 3.09 KB
  #!/usr/bin/env perl
  
  use strict;
  
  my $Version="1.1";
  
  #####
  #  Version 1.0  Released October, 2004
  #        - Initial Release
  #  Version 1.1  Released October 12, 2004
  #        - Added -a option
  #        - Fixed problem with word-final tanween characters that were optionally deletable
   
  my $Usage="Usage: tanweenFilt.pl [ -i fmt ] Infile|- OutFile|-
  ".
  "Version: $Version
  ".
  "Desc: tanweenFilt removes the tanween Arabic characters from the end of a word.
  ".
  "      In the Buckwalter normalization scheme, the word-final letters
  ".
  "      'F', 'N', and 'K' are removed".
  "Options:
  ".
  "      -a       Remove all tanween characters, not just the final characters.
  ".
  "      -i fmt   Set the input file formant to 'fmt'.  The possible choices are:
  ".
  "                  txt -> plain text, the default
  ".
  "                  ctm -> CTM format, ignores all but the 5th column, and if
  ".
  "                         a division occurs and a confidence score is present,
  ".
  "                         the confidence score is copied to all parts.
  ".
  "                  stm -> STM format, change only the text field of the stm record
  ".
  "
  ";
  
  use Getopt::Long;
  my ($InFmt, $AllTanween) = (undef, 0);
  my $result = GetOptions ("i:s" => \$InFmt, "a" => \$AllTanween);
  die "Aborting:
  $Usage
  :" if (!$result);
  
  if (defined($InFmt)) {
      die("$Usage
  
  Error: Undefined input format '$InFmt'") 
  	if ($InFmt !~ /^(txt|ctm|stm)$/);
  } else {
      $InFmt = "txt";
  } 
  
  #### The main functions arguements:
  die "$Usage
  Too many arguements" if ($#ARGV > 1);
  die "$Usage
  Output Not Specified" if ($#ARGV == 0);
  die "$Usage
  Input and Output Not Specified" if ($#ARGV == -1);
  
  my $InFile=$ARGV[0];
  my $OutFile=$ARGV[1];
  die("$Usage
  Error: Input file $InFile does not exist
  ")
      if ($InFile ne "-" && ! -r $InFile);
  
  open(IN, "$InFile") || die "Unable to open trans-file $InFile";
  open(OUT, ">$OutFile") || die "Unable to open new-trans-file $OutFile";
  
  while (<IN>){
      chomp;
      if ($InFmt eq "txt"){
  	print OUT normalize($_)."
  ";
      } elsif ($InFmt eq "ctm"){
  	if ($_ =~ /^(\;\;|\#)/){
  	    print OUT $_."
  ";
  	    next;
  	}	     
  	s/^(\s+)//;
  	my $prefix = (defined($1) ? $1 : "");
  	my @ctm = split(/(\s+)/,$_);
  	$ctm[8] = normalize($ctm[8]);
  	print OUT $prefix.join("", @ctm)."
  ";
      } elsif ($InFmt eq "stm"){
  	if ($_ =~ /^(\;\;|\#)/){
  	    print OUT $_."
  ";
  	    next;
  	}	     
  	s/^(\s+)//;
  	my $prefix = (defined($1) ? $1 : "");
  	my @stm = split(/(\s+)/,$_, 7);
  	if ($stm[10] =~ /^<[^<>]*>$/){
  	    $stm[12] = normalize($stm[12]);
  	} else {
  	    $stm[10] .= join("",splice(@stm,11,2));
  	    $stm[10] = normalize($stm[10]);
  	}
  	print OUT $prefix.join("", @stm)."
  ";
      } else {
  	die "Error: unknown input format '$InFmt'
  $Usage
  ";
      }    
  }
  
  close IN; close OUT;
  exit 0;
  
  sub normalize{
      my ($text) = @_;
      $text = " ".$text;    
      if ($AllTanween){
  	$text =~ s/(\331\215|\331\214|\331\213)//g;
      } else {
  	$text =~ s/(\331\215|\331\214|\331\213)$//g;
  	$text =~ s/(\331\215|\331\214|\331\213)\)$/\)/g;
  	$text =~ s/(\331\215|\331\214|\331\213)\) /\) /g;
  	$text =~ s/(\331\215|\331\214|\331\213) / /g;
      }
      $text =~ s/^ //;
      $text;
  }