normalize_fr.pl 3.68 KB
#!/usr/bin/perl -w

use FileHandle;

use strict;
use utf8;
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

sub numtolitteral {
  my @num = (); # split input string into an array of digits
  my $sol = undef;
  my $len = undef;

  my $cent = undef;
  my $mille = undef;
  my $million = undef;
  
  my $str = shift;
  $str =~ s/^0*//;
  @num = reverse split(//, $str);
  $len = scalar @num;

  return "zéro" if not $len;

  $cent = join("", reverse splice @num, 0, 3);
  $mille = join("", reverse splice @num, 0, 3);
  $million = join("", reverse splice @num, 0, 3);

  if ($len <= 3) {
    $sol = dtoa("zéro", $cent, 1);
  }
  elsif ($len >= 4 and $len <= 6)  {
    $sol = "";
    $sol = dtoa("", $mille, 0) unless $len == 4 and $mille == 1;
    $sol = join(" ", $sol, "mille", dtoa("", $cent, 1));
  }
  else {
    $sol = join(" ", dtoa("", $million, 1), "million");
    $sol = $sol . "s" if $million > 1;
    my $tmp = "";
    $tmp = dtoa("", $mille, 0) unless $mille == 1;
    $sol = join(" ", $sol, $tmp, "mille") unless $mille == 0;
    $sol = join(" ", $sol, dtoa("", $cent, 1));
  }
  $sol =~ s/\s+/ /g;
  return $sol;
}

# --------------------- #
# ---- sub dtoa() ----- #
# --------------------- #
sub dtoa() {
  my $zero = shift;
  my @tab = split(//, shift);
  my $final = shift;
  my $sol = "";

  my @digit = qw(zéro un deux trois quatre cinq six sept huit neuf);
  my @digit1x = qw/dix onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf/;
  my @digitx0 = ("", "dix", "vingt", "trente", "quarante", "cinquante", "soixante", "soixante-dix", "quatre-vingt", "quatre-vingt-dix");

  # remove heading zeros...
  shift @tab while $tab[0] == 0 and $#tab > 0;
  my $len = scalar @tab;

  if ($len == 1) { # [0-9]
    $sol = $digit[$tab[0]];
    $sol = $zero if $tab[0] == 0;
  }
  else {
    $sol = "";
    if ($len == 3) {
      my $c = shift @tab;
      $sol = $digit[$c] unless $c == 1;
      $sol = join(" ", $sol, "cent");
      $sol = $sol . "s" if $c > 1 and $tab[0] == 0 and $tab[1] == 0 and $final;
    }

    # convert 2 digits @tab to string
    my $d2 = "";
    if ($tab[0] == 0) {
      $d2 = $digit[$tab[1]] if $tab[1];
    }
    elsif ($tab[0] == 1) {
      $d2 = $digit1x[$tab[1]];
    }
    else {
      if ($tab[1] == 0) {
	$d2 = $digitx0[$tab[0]];
      }
      else {
	if ($tab[0] == 7 or $tab[0] == 9) {
	  $d2 = join("-", $digitx0[$tab[0] - 1], $digit1x[$tab[1]]);
	  $d2 = join("-", $digitx0[$tab[0]-1], "et", $digit1x[$tab[1]]) if $tab[0] == 7 and $tab[1] == 1;
	}
	else {
	  $d2 = $digitx0[$tab[0]];
	  $d2 = $d2 . "-et" if $tab[1] == 1 and $tab[0] != 8;
	  $d2 = join("-", $d2, $digit[$tab[1]]);
	}
      }
    }

    $sol = join(" ", $sol, $d2);
  }

  return $sol;
}


while(<>){
  s/\(([^)]*)\)/"<<<X".unpack("H*", $1)."X>>>"/ge;

  s/<pers=[^>]*>/ /g;
  s/<\/pers>/ /g;

  # remove tildes, points, double quotes, commas, underscores, double-points, non-diacritic circumflex,
  # stars, slashes, backslashes, interrogation points, exclamation points, semicolons
  s/[\~\.\"\,\_\:\^\*\\\/\?\!\;\#\¿\¡]+//g;

  # &blah -> (%hesitation)
  s/\B\&\S+/(%hesitation)/g;

  # Drop lines with '&' remaining
  s/^.*\B\&\B.*$/{}/;

  # %% -> (%hesitation)
  s/\%\%/(%hesitation)/g;

  s/([^(])%/$1 pourcent/g;

  s/([0-9])-([0-9])/$1 $2/g;

  s/'/' /g;
  s/aujourd' hui/aujourd'hui/g;
  s/Aujourd' hui/Aujourd'hui/g;
  s/\b([0-9]+)\b/numtolitteral($1)/ge;

  s/-/ /g;
  s/(<<<X([0-9a-f]+)X>>>)/"(".pack("H*", $2).")"/ge;
  s/<<<XX>>>/()/g;

  s/\(c'est\) à/à/g;
  s/\(c'est-\)à/à/g;
  s/\(c'est-à-\)dire/dire/g;
#  s/ \(endroit\)/ /g;
#  s/ \(est\)/ /g;
#  s/ \(où\)/ /g;
#  s/ \(que\)/  /g;
  s/deux\(011\)/deux/g;
  s/c\('est\)/c'/g;
  s/(\s)\([^\s]+\)(\s)/$1$2/g;

  print;
  STDOUT->flush;
}