Kaldi / Kaldi first steps

Blame view

LIA_kaldiUtils/lia_map_words.pl 6.29 KB
  #!/usr/bin/perl
  
  use strict;
  use warnings;
  
  
  # from switchboard s5 example scripts
  
  
  # piece of code to chose the field impacted to this treatement.
  # By default we consider from the secpnd filed to end-of-line.
  
  #if ($ARGV[0] eq "-f") {
  #  shift @ARGV; 
  #  $field_spec = shift @ARGV; 
  #  if ($field_spec =~ m/^\d+$/) {
  #    $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
  #  }
  #  if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10)
  #    if ($1 ne "") {
  #      $field_begin = $1 - 1;    # Change to zero-based indexing.
  #    }
  #    if ($2 ne "") {
  #      $field_end = $2 - 1;      # Change to zero-based indexing.
  #    }
  #  }
  #  if (!defined $field_begin && !defined $field_end) {
  #    die "Bad argument to -f option: $field_spec"; 
  #  }
  #}
  
  
  
  
  sub numtolitteral {
  	my @num = (); # split input string into an array of digits
  	my $sol = undef;
  	my $len = undef;
   
  	my $cent = undef;
  	my $mille = undef;
  	my $million = undef;
   
  	my $str = shift;
  	$str =~ s/^0*//;
  	@num = reverse split(//, $str);
  	$len = scalar @num;
   
  	return "zéro" if not $len;
   
  	$cent = join("", reverse splice @num, 0, 3);
  	$mille = join("", reverse splice @num, 0, 3);
  	$million = join("", reverse splice @num, 0, 3);
   
  	if ($len <= 3) {
  	$sol = dtoa("zéro", $cent, 1);
  	}
  	elsif ($len >= 4 and $len <= 6)  {
  	$sol = "";
  	$sol = dtoa("", $mille, 0) unless $len == 4 and $mille == 1;
       $sol = join(" ", $sol, "mille", dtoa("", $cent, 1));
    }
    else {
      $sol = join(" ", dtoa("", $million, 1), "million");
      $sol = $sol . "s" if $million > 1;
       my $tmp = "";
       $tmp = dtoa("", $mille, 0) unless $mille == 1;
       $sol = join(" ", $sol, $tmp, "mille") unless $mille == 0;
       $sol = join(" ", $sol, dtoa("", $cent, 1));
     }
     $sol =~ s/\s+/ /g;
     return $sol;
   }
  
  
  
  
  # --------------------- #
  # ---- sub dtoa() ----- #
  # --------------------- #
  sub dtoa() {
    my $zero = shift;
    my @tab = split(//, shift);
    my $final = shift;
    my $sol = ""; 
  
    my @digit = qw(zéro un deux trois quatre cinq six sept huit neuf);
    my @digit1x = qw/dix onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf/;
    my @digitx0 = ("", "dix", "vingt", "trente", "quarante", "cinquante", "soixante", "soixante-dix", "quatre-vingt", "quatre-vingt-dix");
  
    # remove heading zeros...
    shift @tab while $tab[0] == 0 and $#tab > 0;
    my $len = scalar @tab;
  
    if ($len == 1) { # [0-9]
      $sol = $digit[$tab[0]];
      $sol = $zero if $tab[0] == 0;
    }
    else {
      $sol = ""; 
      if ($len == 3) {
        my $c = shift @tab;
        $sol = $digit[$c] unless $c == 1;
        $sol = join(" ", $sol, "cent");
        $sol = $sol . "s" if $c > 1 and $tab[0] == 0 and $tab[1] == 0 and $final;
      }   
  
      # convert 2 digits @tab to string
      my $d2 = ""; 
      if ($tab[0] == 0) {
        $d2 = $digit[$tab[1]] if $tab[1];
      }   
      elsif ($tab[0] == 1) {
        $d2 = $digit1x[$tab[1]];
      }   
      else {
        if ($tab[1] == 0) {
          $d2 = $digitx0[$tab[0]];
        }   
        else {
          if ($tab[0] == 7 or $tab[0] == 9) {
            $d2 = join("-", $digitx0[$tab[0] - 1], $digit1x[$tab[1]]);
            $d2 = join("-", $digitx0[$tab[0]-1], "et", $digit1x[$tab[1]]) if $tab[0] == 7 and $tab[1] == 1;
          }   
          else {
            $d2 = $digitx0[$tab[0]];
            $d2 = $d2 . "-et" if $tab[1] == 1 and $tab[0] != 8;
            $d2 = join("-", $d2, $digit[$tab[1]]);
          }   
        }   
      }   
  
      $sol = join(" ", $sol, $d2);
    }
  
    return $sol;
  }
  
  
  
  
  
  
  
  
  
  
  
  while (<STDIN>) {
  	chomp;
  	my @line = split(/ +/, $_);
  	for (my $n = 1; $n <= $#line; $n++) {
  		### $a = $A[$n];
  		#### if ( (!defined $field_begin || $n >= $field_begin) && (!defined $field_end || $n <= $field_end)) {
  		
  		#________________________________#
  		# e.g. [LAUGHTER-STORY] -> STORY;	
  		$line[$n] =~ s:(|\-)^\[LAUGHTER-(.+)\](|\-)$:$1$2$3:; 
  
  		#________________________________#
  		# hesitation %HESITATION -> euh
  		$line[$n] =~ s/\(%HESITATION\)/euh/;
  
  		#____________________________________________#
  		# $1 and $3 relate to preserving trailing "-"
  		# e.g. [IT'N/ISN'T] -> IT'N ... note,
  		$line[$n] =~ s:^\[(.+)/.+\](|\-)$:$1$2:; 
  
  		#________________________________________________________________________#
  	      	# 1st part may include partial-word stuff, which we process further below,
  		# e.g. [LEM[GUINI]-/LINGUINI]
  		# the (|\_) at the end is to accept and preserve trailing -'s.
  		# e.g. -[AN]Y , note \047 is quote;
  		$line[$n] =~ s:^(|\-)\[[^][]+\](.+)$:-$2:;
  
  		#______________________________________________________________#
  	   	# let the leading - be optional on input, as sometimes omitted.
  		# e.g. AB[SOLUTE]- -> AB-;
  		$line[$n] =~ s:^(.+)\[[^][]+\](|\-)$:$1-:;
  
  		#______________________________________________________________#
  		# let the trailing - be optional on input, as sometimes omitted.
  		# e.g. EX[SPECIALLY]-/ESPECIALLY] -> EX-
  		$line[$n] =~ s:([^][]+)\[.+\]$:$1:; 
  		
  		#__________________________________#
  		# which is a  mistake in the input.
  		# e.g. {YUPPIEDOM} -> YUPPIEDOM
  		$line[$n] =~ s:^\{(.+)\}$:$1:;                
  		# e.g. AMMU[N]IT- -> AMMU-IT-
  		$line[$n] =~ s:[A-Z]\[([^][])+\][A-Z]:$1-$3:;
  		# e.g. THEM_1 -> THEM
  		$line[$n] =~ s:_\d$::;                   
  
  		#________________________________________________#
  		# remove tildes, points, double quotes, commas, underscores, double-points, non-diacritic circumflex,
  		# stars, slashes, backslashes, interrogation points, exclamation points, semicolons
  		$line[$n] =~s/[\~\.\"\,\_\:\^\*\\\/\?\!\;\#\¿\¡]+//g;
   
  		# &blah -> (%hesitation)
  		$line[$n] =~s/\B\&\S+/(%hesitation)/g;
   
  		# Drop lines with '&' remaining
  		$line[$n] =~s/^.*\B\&\B.*$/{}/;
   
  		# %% -> (%hesitation)
  		$line[$n] =~s/\%\%/(%hesitation)/g;
   
  		$line[$n] =~s/([^(])%/$1 pourcent/g;
   
  		$line[$n] =~s/([0-9])-([0-9])/$1 $2/g;
   
  		$line[$n] =~s/'/' /g;
  		$line[$n] =~s/aujourd' hui/aujourd'hui/g;
  		$line[$n] =~s/Aujourd' hui/Aujourd'hui/g;
  		$line[$n] =~s/\b([0-9]+)\b/numtolitteral($1)/ge;
  		$line[$n] =~s/\(c'est\) à/à/g;
  		$line[$n] =~s/\(c'est-\)à/à/g;
  		$line[$n] =~s/\(c'est-à-\)dire/dire/g;
  		$line[$n] =~s/deux\(011\)/deux/g;
  		$line[$n] =~s/c\('est\)/c'/g;
  		$line[$n] =~s/(\s)\([^\s]+\)(\s)/$1$2/g;
  		$line[$n] =~s/ +/ /g;
  		#________________________________________________#
    
      	}
  	### $A[$n] = $a;
    	#### }
  	
  	my $endLine =  join(" ", @line) . "
  ";
  	$endLine =~ s/ +/ /g;
  	$endLine =~ s/\\//g;
  	print $endLine;
  
  }
  1;