lia_map_words.pl 6.29 KB
#!/usr/bin/perl

use strict;
use warnings;


# from switchboard s5 example scripts


# piece of code to chose the field impacted to this treatement.
# By default we consider from the secpnd filed to end-of-line.

#if ($ARGV[0] eq "-f") {
#  shift @ARGV; 
#  $field_spec = shift @ARGV; 
#  if ($field_spec =~ m/^\d+$/) {
#    $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
#  }
#  if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10)
#    if ($1 ne "") {
#      $field_begin = $1 - 1;    # Change to zero-based indexing.
#    }
#    if ($2 ne "") {
#      $field_end = $2 - 1;      # Change to zero-based indexing.
#    }
#  }
#  if (!defined $field_begin && !defined $field_end) {
#    die "Bad argument to -f option: $field_spec"; 
#  }
#}




sub numtolitteral {
	my @num = (); # split input string into an array of digits
	my $sol = undef;
	my $len = undef;
 
	my $cent = undef;
	my $mille = undef;
	my $million = undef;
 
	my $str = shift;
	$str =~ s/^0*//;
	@num = reverse split(//, $str);
	$len = scalar @num;
 
	return "zéro" if not $len;
 
	$cent = join("", reverse splice @num, 0, 3);
	$mille = join("", reverse splice @num, 0, 3);
	$million = join("", reverse splice @num, 0, 3);
 
	if ($len <= 3) {
	$sol = dtoa("zéro", $cent, 1);
	}
	elsif ($len >= 4 and $len <= 6)  {
	$sol = "";
	$sol = dtoa("", $mille, 0) unless $len == 4 and $mille == 1;
     $sol = join(" ", $sol, "mille", dtoa("", $cent, 1));
  }
  else {
    $sol = join(" ", dtoa("", $million, 1), "million");
    $sol = $sol . "s" if $million > 1;
     my $tmp = "";
     $tmp = dtoa("", $mille, 0) unless $mille == 1;
     $sol = join(" ", $sol, $tmp, "mille") unless $mille == 0;
     $sol = join(" ", $sol, dtoa("", $cent, 1));
   }
   $sol =~ s/\s+/ /g;
   return $sol;
 }




# --------------------- #
# ---- sub dtoa() ----- #
# --------------------- #
sub dtoa() {
  my $zero = shift;
  my @tab = split(//, shift);
  my $final = shift;
  my $sol = ""; 

  my @digit = qw(zéro un deux trois quatre cinq six sept huit neuf);
  my @digit1x = qw/dix onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf/;
  my @digitx0 = ("", "dix", "vingt", "trente", "quarante", "cinquante", "soixante", "soixante-dix", "quatre-vingt", "quatre-vingt-dix");

  # remove heading zeros...
  shift @tab while $tab[0] == 0 and $#tab > 0;
  my $len = scalar @tab;

  if ($len == 1) { # [0-9]
    $sol = $digit[$tab[0]];
    $sol = $zero if $tab[0] == 0;
  }
  else {
    $sol = ""; 
    if ($len == 3) {
      my $c = shift @tab;
      $sol = $digit[$c] unless $c == 1;
      $sol = join(" ", $sol, "cent");
      $sol = $sol . "s" if $c > 1 and $tab[0] == 0 and $tab[1] == 0 and $final;
    }   

    # convert 2 digits @tab to string
    my $d2 = ""; 
    if ($tab[0] == 0) {
      $d2 = $digit[$tab[1]] if $tab[1];
    }   
    elsif ($tab[0] == 1) {
      $d2 = $digit1x[$tab[1]];
    }   
    else {
      if ($tab[1] == 0) {
        $d2 = $digitx0[$tab[0]];
      }   
      else {
        if ($tab[0] == 7 or $tab[0] == 9) {
          $d2 = join("-", $digitx0[$tab[0] - 1], $digit1x[$tab[1]]);
          $d2 = join("-", $digitx0[$tab[0]-1], "et", $digit1x[$tab[1]]) if $tab[0] == 7 and $tab[1] == 1;
        }   
        else {
          $d2 = $digitx0[$tab[0]];
          $d2 = $d2 . "-et" if $tab[1] == 1 and $tab[0] != 8;
          $d2 = join("-", $d2, $digit[$tab[1]]);
        }   
      }   
    }   

    $sol = join(" ", $sol, $d2);
  }

  return $sol;
}











while (<STDIN>) {
	chomp;
	my @line = split(/ +/, $_);
	for (my $n = 1; $n <= $#line; $n++) {
		### $a = $A[$n];
		#### if ( (!defined $field_begin || $n >= $field_begin) && (!defined $field_end || $n <= $field_end)) {
		
		#________________________________#
		# e.g. [LAUGHTER-STORY] -> STORY;	
		$line[$n] =~ s:(|\-)^\[LAUGHTER-(.+)\](|\-)$:$1$2$3:; 

		#________________________________#
		# hesitation %HESITATION -> euh
		$line[$n] =~ s/\(%HESITATION\)/euh/;

		#____________________________________________#
		# $1 and $3 relate to preserving trailing "-"
		# e.g. [IT'N/ISN'T] -> IT'N ... note,
		$line[$n] =~ s:^\[(.+)/.+\](|\-)$:$1$2:; 

		#________________________________________________________________________#
	      	# 1st part may include partial-word stuff, which we process further below,
		# e.g. [LEM[GUINI]-/LINGUINI]
		# the (|\_) at the end is to accept and preserve trailing -'s.
		# e.g. -[AN]Y , note \047 is quote;
		$line[$n] =~ s:^(|\-)\[[^][]+\](.+)$:-$2:;

		#______________________________________________________________#
	   	# let the leading - be optional on input, as sometimes omitted.
		# e.g. AB[SOLUTE]- -> AB-;
		$line[$n] =~ s:^(.+)\[[^][]+\](|\-)$:$1-:;

		#______________________________________________________________#
		# let the trailing - be optional on input, as sometimes omitted.
		# e.g. EX[SPECIALLY]-/ESPECIALLY] -> EX-
		$line[$n] =~ s:([^][]+)\[.+\]$:$1:; 
		
		#__________________________________#
		# which is a  mistake in the input.
		# e.g. {YUPPIEDOM} -> YUPPIEDOM
		$line[$n] =~ s:^\{(.+)\}$:$1:;                
		# e.g. AMMU[N]IT- -> AMMU-IT-
		$line[$n] =~ s:[A-Z]\[([^][])+\][A-Z]:$1-$3:;
		# e.g. THEM_1 -> THEM
		$line[$n] =~ s:_\d$::;                   

		#________________________________________________#
		# remove tildes, points, double quotes, commas, underscores, double-points, non-diacritic circumflex,
		# stars, slashes, backslashes, interrogation points, exclamation points, semicolons
		$line[$n] =~s/[\~\.\"\,\_\:\^\*\\\/\?\!\;\#\¿\¡]+//g;
 
		# &blah -> (%hesitation)
		$line[$n] =~s/\B\&\S+/(%hesitation)/g;
 
		# Drop lines with '&' remaining
		$line[$n] =~s/^.*\B\&\B.*$/{}/;
 
		# %% -> (%hesitation)
		$line[$n] =~s/\%\%/(%hesitation)/g;
 
		$line[$n] =~s/([^(])%/$1 pourcent/g;
 
		$line[$n] =~s/([0-9])-([0-9])/$1 $2/g;
 
		$line[$n] =~s/'/' /g;
		$line[$n] =~s/aujourd' hui/aujourd'hui/g;
		$line[$n] =~s/Aujourd' hui/Aujourd'hui/g;
		$line[$n] =~s/\b([0-9]+)\b/numtolitteral($1)/ge;
		$line[$n] =~s/\(c'est\) à/à/g;
		$line[$n] =~s/\(c'est-\)à/à/g;
		$line[$n] =~s/\(c'est-à-\)dire/dire/g;
		$line[$n] =~s/deux\(011\)/deux/g;
		$line[$n] =~s/c\('est\)/c'/g;
		$line[$n] =~s/(\s)\([^\s]+\)(\s)/$1$2/g;
		$line[$n] =~s/ +/ /g;
		#________________________________________________#
  
    	}
	### $A[$n] = $a;
  	#### }
	
	my $endLine =  join(" ", @line) . "\n";
	$endLine =~ s/ +/ /g;
	$endLine =~ s/\\//g;
	print $endLine;

}
1;