Jean-François Rey / otmedia

Blame view

tools/scripts/Number2txt.pl 8.84 KB
  #!/usr/bin/perl -w
  
  #------------------------------------------
  # Author : Emmanuel FERREIRA
  # Contact: emmanuel.ferreira0194@gmail.com
  # Date : 14/02/11
  # Brief : spell out the numbers found in a raw text, i.e. replace
  #	  each number written with digits by its full-text form
  #------------------------------------------
  
  use strict;
  use warnings;
  use Getopt::Long;
  use Pod::Usage;
  
  #-------------------------------------------
  # MAIN
  # N.B : in this current file mu stands for 
  #       measuring unit
  #-------------------------------------------
  
  # Command line option variables (filled in by GetOptions below)
  my $help = 0;
  my $muRulesFile;
  my $number2txtRulesFile;
  my $processNbClustering; # when set, digit groups are glued together: 65.000 => 65000
  my $oovMu;
  
  # Lookup tables, all keyed by a measuring unit symbol as written in the text
  my %singularMuSpellingMap; # unit symbol => singular spelling
  my %pluralMuSpellingMap;   # unit symbol => plural spelling
  my %femMu;                 # unit symbols flagged "f" in the muRules file
  my %oovMuMap;              # token not found in the unit rules => number of occurrences
  # number => text, filled from the "exep" lines of the rule file
  my %num2txtMap;
  # rule pattern => text appended after the converted number ("rule" lines)
  my %rulesMap;
  # digit separator => text replacing it ("chars_sep" lines)
  my %sepMap;
  # alternation of all the digit separators, built while reading the rule file
  my $sepRegex ="";
  # Configuration defaults (can be overridden with --sep, --usep and --numbSep)
  my $rulesSep = "\t";
  my $unitSep = " ";
  my $numberSep = "";
  # marker written in front of a feminine unit; the main loop later turns it
  # into a final "e" on the preceding word, or removes it
  my $spotfem ="<e>";
  
  # Parse the command line options.
  # The usage is printed and the script stops when:
  #   - the command line cannot be parsed (GetOptions returns false),
  #   - the help is explicitly requested,
  #   - the mandatory rule file is missing.
  # --numberSep is accepted as an alias of --numbSep because it is the
  # spelling given in the documentation at the end of this file.
  GetOptions('help|?' => \$help, 
  	   'clustNb' => \$processNbClustering,
  	   'muRules=s' => \$muRulesFile,
  	   'oovMu=s' => \$oovMu,
  	   'rule=s' => \$number2txtRulesFile,
  	   'sep=s' =>\$rulesSep,
  	   'usep=s' =>\$unitSep,
  	   'numbSep|numberSep=s' =>\$numberSep)
  	or pod2usage(2);
  
  pod2usage(1) if($help);
  pod2usage({-msg => "BAD USAGE - mandatory option : rule (--rule <path_to_rule_file>),  in order to process the number convertion\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(!$number2txtRulesFile);
  
  #Load rules in programming structures
  
  # MU RULES FILE => %singularMuSpellingMap, %pluralMuSpellingMap and %femMu
  # Every line that is neither blank nor a comment (starting with #) holds,
  # separated by $rulesSep:
  #   unit spellings (separated by $unitSep), singular form, plural form
  #   and optionally a gender flag ("f" marks a feminine unit).
  # The script dies on any other number of columns.
  if($muRulesFile){
  	# three-argument open: the file name is never interpreted as an open mode
  	open(my $file, '<', $muRulesFile) or die("Cannot open: $muRulesFile ($!)");
  	while(my $line = <$file>){
  		chomp($line);
  		#skip empty, blank and comment lines
  		next if(!$line || $line =~ /^\s*$/ || $line =~ /^#/);
  		my @splitLine = split(/$rulesSep/, $line);
  		#check column number: 3 columns, or 4 when the gender is given
  		if($#splitLine != 2 && $#splitLine != 3){
  			die("Bad format in $muRulesFile : $line");
  		}
  		my $isFeminine = (defined($splitLine[3]) && $splitLine[3] eq "f");
  		#every variant spelling of the unit shares the same forms
  		for my $unit (split($unitSep, $splitLine[0])){
  			$singularMuSpellingMap{$unit}=$splitLine[1];
  			$pluralMuSpellingMap{$unit}=$splitLine[2];
  			$femMu{$unit}=1 if($isFeminine);
  		}
  	}
  	close($file);
  }
  
  initializeMaps($number2txtRulesFile);
  
  # Convert every non-empty line read on STDIN and print it on STDOUT.
  while(my $line = <STDIN>){
  	chomp($line);
  	# Empty lines are dropped. The line is compared with the empty string
  	# (and not tested for truth) so that a line holding only "0" is
  	# converted instead of being silently discarded.
  	next if($line eq "");
  	if($processNbClustering){
  		#glue the digit groups together: 65.000 or 65 000 => 65000
  		$line =~ s/(\b\d{1,3})\s*(([\.\s](\d{3}))+)(\s*(\D|$))/clust($1,$2).$5/eg;
  	}
  	if($muRulesFile){
  		#rewrite the token following a number according to the unit rules
  		$line =~ s/(\b\d+([\.,]\d+)?)\s?([^\s\d\.,]+)/goodFormat($1, $3, '')/eg;
  	}
  	#text convertion of all the valid digit separator
  	$line =~ s/(\d+)($sepRegex)(\d+)/sep2txt($1,$2,$3)/ge;
  	#find and format 0 sequels (each leading 0 becomes a separate "0 ")
  	$line =~ s/(\b)(0+)([1-9]\d*)?(\b)/preformatZeroSequel($1,$2,$3)/ge;
  	#number convertion
  	$line =~ s/(\b)(\d+)(\b)/numberConvertion($1,$2,$3)/ge;
  	#replace the spot fem if needed: the word before it gets a final "e"
  	#unless it already ends with one
  	$line =~ s/([^e]) $spotfem/$1e/g;
  	#remove them otherwise
  	$line =~ s/$spotfem//g;
  	#remove the mutiple space
  	$line =~ s/ +/ /g;
  	#display
  	print "$line\n";
  }
  
  #build the report (if needed)
  #if($oovMu){
  #	my $file;
  #	open($file, ">$oovMu") or die("Cannot open : $oovMu");
  #	while( my($k, $v) = each(%oovMuMap)){
  #		print $file "$k	$v\n";
  #	}
  #	close($file);
  #}
  
  #-------------------------------------------
  # SUBROUTINES
  #-------------------------------------------
  
  #
  # \brief glue a leading digit group and the groups following it
  #	 ex: ("16", ".000") => "16000" and ("15", " 000") => "15000"
  # \param 1 leading digits, kept as they are
  # \param 2 following groups; every whitespace and dot is removed from it
  # \return the concatenation of both parts
  #
  sub clust{
  	my ($head, $groups) = @_;
  	$groups =~ s/[\s.]//g;
  	return $head.$groups;
  }
  
  #
  # \brief format a number sequence applying the muRules
  # \param 1 number written before the unit
  # \param 2 token following the number (candidate measuring unit)
  # \param 3 text following the unit; any false value is treated as ''
  # \return the number, the unit as rewritten by writeMu, then the
  #	  following text
  #
  sub goodFormat{
  	my ($nb_prec, $current, $nb_succ) = @_;
  	$nb_succ ||= '';
  	my $unitTxt = writeMu($current, $nb_prec, $nb_succ);
  	return join('', $nb_prec, $unitTxt, $nb_succ);
  }
  
  #
  # \brief apply the muRules to the token following a number
  # \param 1 token following the number (candidate measuring unit)
  # \param 2 number written before the token
  # \param 3 text following the token (not used here)
  # \return for a unit known by the rules: its singular or plural spelling
  #	  surrounded by spaces, preceded by the feminine marker when the
  #	  number ends with 1 (but not 11) and the unit is feminine;
  #	  for any other token: the token surrounded by spaces, or the
  #	  token alone when there is no number before it
  #
  sub writeMu{
  	my ($current, $nb_prec, $nb_succ) = @_;
  	my $modif=" ";
  	#if the key exist
  	if($singularMuSpellingMap{$current}){
  		#keep only the trailing number, with "." as decimal mark, so that
  		#it can be compared numerically
  		$nb_prec =~ s/,/./g;
  		$nb_prec =~ s/([^\d\.]*)([\d\.]+$)/$2/g;
  		if($nb_prec =~ /1$/ && $nb_prec !~ /11$/ && exists $femMu{$current}){
  			$modif=" $spotfem ";
  		}
  		if($nb_prec < -1 || $nb_prec > 1){
  			return $modif.$pluralMuSpellingMap{$current}." ";	
  		}
  		#The marker only goes between the number and the unit. Writing it
  		#after the unit too would add an "e" to a feminine unit that does
  		#not end with one.
  		return $modif.$singularMuSpellingMap{$current}." ";
  	}
  	#count occurence oov
  	$oovMuMap{$current}++;
  	#No number before the token: give it back untouched. The former test
  	#(!$nb_prec && $nb_prec != 0) could never be true.
  	if(!defined($nb_prec) || $nb_prec eq ""){
  		return $current;
  	}
  	return $modif.$current.$modif;
  }
  
  #
  # \brief initialize the different maps use in this script
  #	 rulesMap : map containing the rules ("rule" lines)
  #	 sepMap : map containing the number valid separator ("chars_sep" lines)
  #	 num2txtMap : map containing the number to text convertion ("exep" lines)
  #	 sepRegex : alternation matching any of the valid separators
  # \param 1 path of the rule file; its columns are separated by $rulesSep
  #	  (tag, pattern or value, text)
  # \note dies when the rule file cannot be opened
  #
  sub initializeMaps
  {
  	my ($ruleFile) = @_;
  	#three-argument open with a lexical handle: the file name is never
  	#interpreted as an open mode and the handle stays private to this sub
  	open(my $ruleFh, '<', $ruleFile) or die("Cannot open the rule file : $ruleFile ($!)");
  	while(my $line = <$ruleFh>)
  	{
  		chomp($line);
  		my @columns = split($rulesSep, $line);
  		#blank lines and lines without a second column hold nothing to store
  		next if(@columns < 2);
  		# for all the rule containing in the rules file (tag containing rule)
  		if($columns[0] =~ /rule/){
  			$rulesMap{$columns[1]} = $columns[2];
  		}
  		# for all the separator (tag containing chars_sep)
  		elsif($columns[0] =~ /chars_sep/){
  			$sepMap{$columns[1]} = $columns[2];
  			#The regex used to find the separators is built at the same
  			#time. The separator is escaped because sepMap is looked up
  			#with the literal text: an unescaped "." would match any char.
  			$sepRegex .= "|" if($sepRegex ne "");
  			$sepRegex .= quotemeta($columns[1]);
  		}
  		#for all the number value
  		elsif($columns[0] =~ /exep/){
  			$num2txtMap{$columns[1]} = $columns[2];
  		}
  	}
  	close($ruleFh);
  }
  
  #
  # \brief from valid digit separator to text
  # \param 1 digits before the separator
  # \param 2 separator found between the digits
  # \param 3 digits after the separator
  # \return both digit groups joined by the text stored in sepMap for this
  #	  separator (surrounded by spaces), or by the separator itself when
  #	  sepMap holds no true value for it
  #
  sub sep2txt
  {
  	my ($leftNum, $sep, $rigthNum) = @_;
  	my $spoken = $sepMap{$sep};
  	my $middle = $spoken ? " $spoken " : $sep;
  	return $leftNum.$middle.$rigthNum;
  }
  
  #
  # \brief preformat a 0 sequel: every 0 of the sequel is written "0 "
  #	 ex: ("", "00", "7") => "0 0 7"
  # \param 1 text before the sequel (any false value is treated as "")
  # \param 2 the run of 0
  # \param 3 digits after the sequel (any false value is treated as "")
  #
  sub preformatZeroSequel
  {
  	my ($left, $zeroSequel, $rigth) = @_;
  	my $prefix = $left ? $left : "";
  	my $suffix = $rigth ? $rigth : "";
  	return $prefix.("0 " x length($zeroSequel)).$suffix;
  }
  
  #
  # \brief number convertion
  # \param 1 text captured before the number
  # \param 2 the number to convert
  # \param 3 text captured after the number
  # \return the number converted by processNumberTransf, between both texts
  #
  sub numberConvertion
  {
  	my ($boundLeft, $number, $boundRigth) = @_;
  	return $boundLeft.processNumberTransf($number).$boundRigth;
  }
  
  #
  # \brief process a number convertion
  # \param 1 the number to convert (digits only)
  # \return "" for an undefined or empty number;
  #	  the text stored in num2txtMap when the number is a key of it;
  #	  the number itself when it is only made of 0;
  #	  otherwise the result of the first matching rule of rulesMap
  #	  (the captured part is converted recursively and the rule text
  #	  is appended), or of splitNumber when no rule matches
  #
  sub processNumberTransf{
  	my ($number) = @_;
  	#nothing to convert (e.g. a rule without capture group): stop here
  	#instead of recursing on an empty value
  	return "" if(!defined($number) || $number eq "");
  	#remove all the useless 0
  	$number =~ s/^(0+)([1-9])/$2/g;
  	#if the number correspond to a map key exec the change
  	if($num2txtMap{$number}){
  		return $num2txtMap{$number};
  	}
  	#a 0 sequel is left as it is
  	if($number =~ /^0+$/){
  		return $number;
  	}
  	#Try to find a matching rule.
  	#The rules are walked through a list of keys and not with each():
  	#each() keeps one iterator per hash, so leaving the loop with return
  	#(or recursing inside it) made the next search start in the middle
  	#of the rules. The keys are sorted so that the same rule always wins
  	#when several rules match.
  	for my $ruleMatch (sort keys %rulesMap){
  		# test if the current rule match
  		next if($number !~ /^\b$ruleMatch\b$/);
  		my $transf = $rulesMap{$ruleMatch};
  		my $numberTxt = $number;
  		$numberTxt =~ s/$ruleMatch/processNumberTransf($1)/ge;
  		if($transf){
  			$numberTxt .= " $transf";
  		}
  		return $numberTxt;
  	}
  	return splitNumber($number);
  }
  
  #
  # \brief process to a number splitting
  #	ex: 1253 ==> 1000 + 253
  # \param 1 the number to split (digits only)
  # \return both parts converted by processNumberTransf and separated by
  #	  $numberSep, or the number unchanged when it cannot be split
  #	  into smaller parts
  #
  sub splitNumber
  {
  	my ($number) = @_;
  	# find where split: number of leading digits kept in the first part
  	my $headLength = foundSplitNumberLength($number) - 1;
  	my $subStr2 = substr($number, $headLength);
  	#the first part is padded with 0 up to the length of the number
  	my $subStr1 = substr($number, 0, $headLength)."0" x length($subStr2);
  
  	#When one of the parts is the number itself, no rule and no map entry
  	#covers it and converting the part would recurse for ever: the digits
  	#are left as they are. Past this point the second part always holds a
  	#digit other than 0.
  	if($subStr1 eq $number || $subStr2 eq $number){
  		return $number;
  	}
  
  	my $subStrConverted1 = processNumberTransf($subStr1);
  	my $subStrConverted2 = processNumberTransf($subStr2);
  	return " $subStrConverted1 $numberSep $subStrConverted2 ";
  }
  
  #
  # \brief find the length of a split
  # \param 1 the number to split (digits only)
  # \return the split position; the caller keeps (position - 1) leading
  #	  digits in the first part
  #	  ex: 2(3) => 2, 1(02) => 2, 11(054) => 3, 100(004) => 4
  #
  sub foundSplitNumberLength
  {
  	my ($number) = @_;
  	my $digitCount = length($number);
  	my $headDigits = $digitCount % 3;
  
  	#length multiple of 3
  	if($headDigits == 0){
  		return $digitCount > 3 ? 4 : 2; #100(004) or 1(02)
  	}
  	#1 or 2 digits: 2(3)
  	return $headDigits if($digitCount < 3);
  	#11(054)
  	return $headDigits + 1;
  }
  __END__
  
  =head1 NAME
  
  Number2txt.pl - convert the numbers of a raw text into full text
  
  =head1 SYNOPSIS
  
  Number2txt.pl [options] < raw_text > converted_text
  
  Options:
  	-help|?		brief help message
  
  	-rule		(i) file containing rules to convert number to text (mandatory)
  			Format (columns separated by the -sep separator) :
  			#tag	#regex	#correct
  			Ex :
  			exep	1	un
  			rule	(\d)00	cents
  			chars_sep	,	virgule
  
  	-muRules	(i) file containing measuring unit rules
  			Format (columns separated by the -sep separator) : 
  			#symbol	#singular	#plural	#gender (f = feminine)
  			Ex:
  			h	heure	heures	f
  
  	-clustNb	perform a clustering task on number
  			Ex: 65.000 => 65000
  
  	-oovMu		(o) output of the measuring unit oov report (default : no report produced; the report code is currently commented out in the script)
  
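  	-sep		(i) change the column separator of the rule files (default \t)
  	
  	-usep		(i) change the separator between the variant spellings of a unit in the muRules file (default : one space)
  	
  	-numbSep	(i) introduce a char sequence between two text number units (ex: with "et" : 12 => dix et deux)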