Number2txt.pl 8.84 KB
#!/usr/bin/perl -w

#------------------------------------------
# Author : Emmanuel FERREIRA
# Contact: emmanuel.ferreira0194@gmail.com
# Date : 14/02/11
# Brief : in order to convert a raw text containing numbers
#	  in fulltext (without numbers)
#------------------------------------------

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;

#-------------------------------------------
# MAIN
# N.B : in this current file mu stands for 
#       measuring unit
#-------------------------------------------

# options variables
my $help = 0;
my $muRulesFile;
my $number2txtRulesFile;
my $processNbClustering; #65.000 => 65000
my $oovMu;

# useful variables
#unit spelling => singular / plural text (filled from the mu rules file)
my %singularMuSpellingMap;
my %pluralMuSpellingMap;
#unit spellings flagged as feminine in the mu rules file
my %femMu;
#occurrence count of the units that have no entry in the mu rules
my %oovMuMap;
#number to text map
my %num2txtMap;
#rules map
my %rulesMap;
#valid digit separator map
my %sepMap;
#regex in order to catch valid digit separator
my $sepRegex ="";
#config
#column separator of the rule files
my $rulesSep = "\t";
#separator between the spellings of a same unit (first column of the mu rules)
my $unitSep = " ";
#text inserted between the two parts of a split number
my $numberSep = "";
#temporary marker written before a feminine unit, resolved once the numbers are converted
my $spotfem ="<e>";

# Parse options and print usage if there is a syntax
# error, or if usage was explicitly requested.
# GetOptions returns false on a syntax error : its result has to be tested,
# otherwise the script goes on with a partially parsed command line.
# "numberSep" is accepted as an alias of "numbSep" because it is the name
# advertised by the usage text.
GetOptions('help|?' => \$help, 
	   'clustNb' => \$processNbClustering,
	   'muRules=s' => \$muRulesFile,
	   'oovMu=s' => \$oovMu,
	   'rule=s' => \$number2txtRulesFile,
	   'sep=s' =>\$rulesSep,
	   'usep=s' =>\$unitSep,
	   'numbSep|numberSep=s' =>\$numberSep)
	or pod2usage({-exitval => 2, -verbose => 0, -output => \*STDERR});

pod2usage(1) if($help);
pod2usage({-msg => "BAD USAGE - mandatory option : rule (--rule <path_to_rule_file>),  in order to process the number convertion\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(!$number2txtRulesFile);

#Load rules in programming structures

#MU RULES FILE => SINGULAR_MU_MAP & PLURAL_MU_MAP
# Each useful line must hold exactly 4 columns separated by $rulesSep :
#   unit spelling(s), singular text, plural text, gender ("f" flags a feminine unit).
# Several spellings of the same unit can share the first column, separated by $unitSep.
if($muRulesFile){
	# 3-arg open : the path is never interpreted as an open mode or as a pipe
	open(my $file, '<', $muRulesFile) or die("Cannot open: $muRulesFile ($!)");
	while(my $line = <$file>){
		chomp($line);
		#skip the empty, blank and comment lines
		next if(!$line || $line =~ /^\s*$/ || $line =~ /^#/);
		my @splitLine = split(/$rulesSep/, $line);
		#check column number (must be 4)
		if(@splitLine != 4){
			die("Bad format in $muRulesFile : $line");
		}
		#take into account variant spelling of the unit :
		#for each spelling store the singular and plural form
		foreach my $unit (split($unitSep, $splitLine[0])){
			$singularMuSpellingMap{$unit}=$splitLine[1];
			$pluralMuSpellingMap{$unit}=$splitLine[2];
			#store if it is a female form
			if($splitLine[3] eq "f"){
				$femMu{$unit}=1;
			}
		}
	}
	close($file);
}

initializeMaps($number2txtRulesFile);

#perform the clean task preprocess : every non empty line of STDIN is converted
#and printed on STDOUT (empty lines are dropped)
while(my $line = <STDIN>){
	chomp($line);
	#compare with "" instead of testing the truth of the line :
	#a line holding only "0" is false for perl but still has to be converted
	next if($line eq "");
	if($processNbClustering){
		#16.000 / 15 000 => 16000 / 15000
		$line =~ s/(\b\d{1,3})\s*(([\.\s](\d{3}))+)(\s*(\D|$))/clust($1,$2).$5/eg;
	}
	if($muRulesFile){
		#expand the measuring unit glued to (or following) a number
		$line =~ s/(\b\d+([\.,]\d+)?)\s?([^\s\d\.,]+)/goodFormat($1, $3, '')/eg;
	}
	#text convertion of all the valid digit separator
	$line =~ s/(\d+)($sepRegex)(\d+)/sep2txt($1,$2,$3)/ge;
	#find and format 0 sequels
	$line =~ s/(\b)(0+)([1-9]\d*)?(\b)/preformatZeroSequel($1,$2,$3)/ge;
	#number convertion
	$line =~ s/(\b)(\d+)(\b)/numberConvertion($1,$2,$3)/ge;
	#replace the spot fem if needed
	$line =~ s/([^e]) $spotfem/$1e/g;
	#remove them otherwise
	$line =~ s/$spotfem//g;
	#remove the mutiple space
	$line =~ s/ +/ /g;
	#display
	print "$line\n";
}

#build the report (if needed)
#if($oovMu){
#	my $file;
#	open($file, ">$oovMu") or die("Cannot open : $oovMu");
#	while( my($k, $v) = each(%oovMuMap)){
#		print $file "$k	$v\n";
#	}
#	close($file);
#}

#-------------------------------------------
# SUBROUTINES
#-------------------------------------------

#
# \brief merge the digit groups of a clustered number
#	 ex: 16.000 => 16000 and 15 000 => 15000
# \param head : leading digits of the number
# \param tail : following groups, still carrying their "." or space separators
# \return the number written without any group separator
#
sub clust{
	my ($head, $tail) = @_;
	#drop the dots and the whitespace in a single pass
	$tail =~ s/[\s.]+//g;
	return $head.$tail;
}

#
# \brief format a number sequence applying the muRules
# \param nb_prec : number written before the unit
# \param current : unit token found after the number
# \param nb_succ : text following the unit (optional)
# \return nb_prec, the unit text produced by writeMu, then nb_succ
#
sub goodFormat{
	my ($nb_prec, $current, $nb_succ) = @_;
	#only a missing successor is defaulted : a plain truth test would also erase a "0"
	$nb_succ = '' if(!defined $nb_succ);
	return  $nb_prec.writeMu($current, $nb_prec, $nb_succ).$nb_succ;
}

#
# \brief apply the muRules
# \param current : unit token to expand
# \param nb_prec : number written before the unit (drives singular / plural)
# \param nb_succ : text following the unit (currently unused)
# \return the unit text surrounded by spaces ; a unit that has no rule is
#	  counted in oovMuMap and returned unexpanded
#
sub writeMu{
	my ($current, $nb_prec, $nb_succ) = @_;
	my $modif=" ";
	#if the key exist
	if($singularMuSpellingMap{$current}){
		#normalise the quantity : decimal comma => dot, then keep only the trailing digits/dots
		$nb_prec =~ s/,/./g;
		$nb_prec =~ s/([^\d\.]*)([\d\.]+$)/$2/g;
		#a quantity ending with 1 (but not 11) in front of a feminine unit :
		#leave the marker that the main loop turns into a final "e" on the number text
		if($nb_prec =~ /1$/ && $nb_prec !~ /11$/ && exists $femMu{$current}){
			$modif=" $spotfem ";
		}
		if($nb_prec < -1 || $nb_prec > 1){
			return $modif.$pluralMuSpellingMap{$current}." ";
		}
		#the marker is only written BEFORE the unit : a second marker after it
		#would make the main loop append an "e" to a unit that does not end with "e"
		return $modif.$singularMuSpellingMap{$current}." ";
	}
	#count occurence oov
	$oovMuMap{$current}++;
	#no number in front of the token : give it back untouched
	#(the former test "!\$nb_prec && \$nb_prec != 0" could never be true)
	if(!defined $nb_prec || $nb_prec eq ""){
		return $current;
	}
	return " ".$current." ";
}

#
# \brief initialize the different maps use in this script
#	 rulesMap : map containing the rules (regex => text appended after the match)
#	 sepMap : map containing the number valid separator (separator => text)
#	 num2txtMap : map containing the usefull number to text convertion
#	 sepRegex is built at the same time : alternation of all the separators
# \param ruleFile : path of the rule file, one entry per line, columns
#	 separated by rulesSep : tag (rule / chars_sep / exep), key, text
# Dies when the file cannot be opened.
#
sub initializeMaps
{
	my ($ruleFile) = @_;
	#open the rule file (lexical handle, 3-arg open : the path is never read as a mode)
	open(my $ruleFh, '<', $ruleFile) or die("Cannot open the rule file : $ruleFile ($!)");
	#reading (a named variable is used so that the caller's $_ is left untouched)
	while(my $line = <$ruleFh>)
	{
		chomp($line);
		#skip the blank and comment lines
		next if($line =~ /^\s*$/ || $line =~ /^#/);
		my ($tag, $key, $value) = split($rulesSep, $line);
		#an entry without key cannot be used
		next if(!defined $tag || !defined $key || $key eq "");
		# for all the rule containing in the rules file (line starting by rule)
		if($tag =~ /rule/){
			$rulesMap{$key} = $value;
		}
		# for all the separator (line starting by chars_sep)
		elsif($tag =~ /chars_sep/){
			$sepMap{$key} = $value;
			#build at the same time the regex used to find the separators ;
			#the separator is escaped because it is looked up literally in
			#sepMap : left raw, "." would match any char and "|" would break
			#the alternation
			$sepRegex .= "|" if($sepRegex ne "");
			$sepRegex .= quotemeta($key);
		}
		#for all the number value
		elsif($tag =~ /exep/){
			$num2txtMap{$key} = $value;
		}
	}
	close($ruleFh);
}

#
# \brief from valid digit separator to text
# \param leftNum : digits found before the separator
# \param sep : separator as written in the text
# \param rigthNum : digits found after the separator
# \return both numbers joined by the sepMap text of the separator (surrounded
#	  by spaces), or by the separator itself when sepMap has no text for it
#
sub sep2txt
{
	my ($leftNum, $sep, $rigthNum) = @_;
	my $spoken = $sepMap{$sep};
	my $link = $spoken ? " $spoken " : $sep;
	return $leftNum.$link.$rigthNum;
}

#
# \brief preformat a 0 sequel : each 0 becomes "0 " so that it is converted alone
#	 ex: 007 => 0 0 7
# \param left : text kept before the zeros (optional)
# \param zeroSequel : the run of zeros
# \param rigth : digits following the zeros (optional)
# \return left, one "0 " per zero, then rigth
#
sub preformatZeroSequel
{
	my ($prefix, $zeros, $suffix) = @_;
	#a false prefix / suffix counts as an empty string
	$prefix ||= "";
	$suffix ||= "";
	return $prefix.("0 " x length($zeros)).$suffix;
}

#
# \brief number convertion
# \param boundLeft : text captured before the number
# \param number : digits to convert
# \param boundRigth : text captured after the number
# \return the text of the number (see processNumberTransf) between both bounds
#
sub numberConvertion
{
	my ($before, $digits, $after) = @_;
	my $spelled = processNumberTransf($digits);
	return $before.$spelled.$after;
}

#
# \brief process a number convertion
#	 1. the leading zeros are removed
#	 2. a number known by num2txtMap is replaced by its text
#	 3. a 0 sequel is returned as it is
#	 4. otherwise the first matching rule of rulesMap is applied : the part
#	    captured by the rule is converted recursively and the text of the
#	    rule (if any) is appended
#	 5. when no rule matches, the number is split (see splitNumber)
# \param number : digits to convert
# \return the text of the number
#
sub processNumberTransf{
	my ($number) = @_;
	#remove all the useless 0
	$number =~ s/^(0+)([1-9])/$2/g;
	#if the number correspond to a map key exec the change
	return $num2txtMap{$number} if($num2txtMap{$number});
	#a 0 sequel is kept untouched
	return $number if($number =~ /^0+$/);

	# try to find a matching rules.
	# The rules are walked through a list of keys and not with each() :
	# each() relies on one iterator per hash, and leaving the loop with a
	# return (or recursing inside it) let that iterator half way, so that the
	# following calls silently skipped a part of the rules. The keys are
	# sorted to always test the rules in the same order.
	foreach my $ruleMatch (sort keys %rulesMap){
		# test if the current rule match
		next if($number !~ /^\b$ruleMatch\b$/);
		my $transf = $rulesMap{$ruleMatch};
		my $numberTxt = $number;
		$numberTxt =~ s/$ruleMatch/processNumberTransf($1)/ge;
		if($transf){
			$numberTxt .= " $transf";
		}
		return $numberTxt;
	}
	return splitNumber($number);
}

#
# \brief process to a number splitting
#	ex: 1253 ==> 1000 + 253
#	Both parts are converted with processNumberTransf ; a tail made of
#	zeros only is not converted.
# \param number : digits that neither num2txtMap nor a rule could convert
# \return " <head text> <numberSep> <tail text> ", or the digits untouched
#	  when the number cannot be reduced any further
#
sub splitNumber
{
	my ($number) = @_;
	# Recursion guard : processNumberTransf calls back splitNumber when it
	# knows nothing about a number. If this happens for a number that is
	# already being split (single digit or round number missing from the rule
	# file), the split makes no progress and the recursion would never end :
	# give the digits back instead.
	our %splitNumberPending;
	return $number if($splitNumberPending{$number});
	local $splitNumberPending{$number} = 1;

	# find where split
	my $headLength = foundSplitNumberLength($number) - 1;
	my $subStr2 = substr($number, $headLength);
	# head digits followed by one 0 per tail digit
	my $subStr1 = substr($number, 0, $headLength)."0" x length($subStr2);

	# (the debugging prints of both parts have been removed : they were
	# written on STDOUT, in the middle of the converted text)
	# a single digit has no head : nothing to convert on that side
	my $subStrConverted1 = ($headLength > 0) ? processNumberTransf($subStr1) : "";
	my $subStrConverted2 = "";
	if($subStr2 !~ /^0+$/){
		$subStrConverted2 = processNumberTransf($subStr2);
	}

	return " $subStrConverted1 $numberSep $subStrConverted2 ";
}

#
# \brief find the length of a split
#	 The value returned is the number of leading digits kept as the head
#	 of the split, plus one (splitNumber removes that one again).
#	 ex: 2(3) => 2, 1(02) => 2, 11(054) => 3, 100(004) => 4
# \param number : digits to split
# \return head length + 1, except for a single digit where 1 is returned
#
sub foundSplitNumberLength
{
	my ($number) = @_;
	my $digitCount = length($number);
	my $headDigits = $digitCount % 3;

	# the length is a multiple of 3 : 100(004) above 3 digits, 1(02) otherwise
	if($headDigits == 0){
		return ($digitCount > 3) ? 4 : 2;
	}
	# 1 or 2 digits : 2(3)
	return $headDigits if($digitCount < 3);
	# incomplete leading group : 11(054)
	return $headDigits + 1;
}
__END__

=head1 NAME

Number2txt.pl - convert the numbers of a raw text into full text

=head1 SYNOPSIS

Number2txt.pl [options] < raw_text > converted_text

Options:
	-help|?		brief help message

	-rule		(i) file containing rules to convert number to text (mandatory)
			Format :
			#tag	#regex	#correct
			Ex :
			exep	1	un
			rule	(\d)00	cents
			chars_sep	,	virgule

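	-muRules	(i) file containing measuring unit rules (exactly 4 columns per line)
			Format : 
			#symbol(s)	#singular	#plural	#gender
			(several spellings of a unit can be listed in the first column,
			 separated by the -usep value ; gender "f" marks a feminine unit,
			 any other value a non feminine one)
			Ex:
			h	heure	heures	f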

	-clustNb	perform a clustering task on number
			Ex: 65.000 => 65000

	-oovMu		(o) output of the measuring unit oov report (default : no report produced ; the code writing the report is currently commented out, so this option has no effect)

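	-sep		(i) change the column separator of the rule files (default \t)

	-usep		(i) change the separator between the spellings of a unit in the first column of the muRules file (default : one space)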
	
	-numbSep	(i) introduce a char sequence between two text number units (ex: with "et" : 12 => dix et deux)