ApplyCorrectionRules.pl 1.79 KB
#!/usr/bin/perl -w

use strict;
use Getopt::Long;
use Pod::Usage;

#-------------------------------------------
# MAIN
#-------------------------------------------

# Command-line option variables.
my $help = 0;
# Set by -tagg; not read anywhere in this script.
my $tagg = 0;
# Set by -correction. The option is declared without a value type, so it is
# a plain flag: the rule file itself is taken from the first remaining
# argument. Not read anywhere in this script.
my $correctionFile;

# Shared state used by the rest of the script.
# Field separator between the searched text and its correction in the rule file.
my $sep="#";
# Regex source matching the trigger words collected from the rule file.
my $preRegex="";
# Trigger word => array ref of [searched text, correction] pairs.
my %tokenRegexHash;

GetOptions('help|?' => \$help,
	   'tagg' => \$tagg,
	   'correction' => \$correctionFile);

pod2usage(1) if($help);
# Test for a missing or empty argument explicitly, so that a rule file
# literally named "0" is not rejected as "no argument".
pod2usage({-msg => "BAD USAGE - you must specify a correction file\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(!defined $ARGV[0] || $ARGV[0] eq '');

# Load the correction rules from the file named by the first argument.
#
# Each line has the form "<searched text>$sep<correction>". The first
# space-separated word of the searched text is the rule's trigger word:
# it becomes one alternative of $preRegex and the key under which the
# [searched text, correction] pair is stored in %tokenRegexHash.
my $file;
# Three-argument open: the file name comes from the command line and must
# never be interpreted as an open mode or as a command to pipe from.
open($file, '<', $ARGV[0]) or die ("Cannot open : $ARGV[0] ($!)\n");
my @triggerWords;
while(my $line = <$file>){
	chomp($line);
	my ($seeked, $correction) = split($sep, $line);

	# Blank lines (and lines starting with the separator) carry no searched
	# text; registering them would add an empty alternative to $preRegex.
	next unless defined $seeked && length $seeked;

	# "word$sep" or a line without separator means "replace by nothing".
	$correction = '' unless defined $correction;

	# The trigger is the first word of a possibly multi-word searched text.
	my ($word) = split(/ /, $seeked);
	next unless defined $word && length $word;

	# The text matched through $preRegex is used verbatim as a key of
	# %tokenRegexHash, so the trigger word has to be matched literally.
	# Each word is listed only once, however many rules share it.
	push(@triggerWords, quotemeta($word)) unless exists $tokenRegexHash{$word};
	push(@{$tokenRegexHash{$word}}, [$seeked, $correction]);
}
close($file);

# The matching code adds the spaces that delimit a token itself, so the
# alternatives are grouped and carry no padding of their own: with padded
# alternatives, the first and the last one only matched next to a double
# space. Without any rule, use a pattern that can never match.
$preRegex = @triggerWords ? '(?:' . join('|', @triggerWords) . ')' : '(?!)';

# Apply the correction rules to one line of text and return the result.
#
# Parameters:
#   $sentence  - the line to correct
#   $pre_regex - regex source matching the trigger words
#   $rules_for - hash ref mapping a trigger word to an array ref of
#                [searched text, correction] pairs; the searched text is
#                used as a regular expression, the correction as plain text
#
# Returns the corrected line with runs of spaces squeezed to one space and
# the leading and trailing space removed.
#
# NOTE: a token is only recognised with a space on both sides, so a token
# at the very start of the line or directly before the newline is never
# corrected.
sub apply_correction_rules {
	my ($sentence, $pre_regex, $rules_for) = @_;

	# Collect the distinct trigger words present in the line, in order of
	# first appearance so that rules are applied in a reproducible order.
	# The space added after the pattern is only looked ahead, so it stays
	# available as the leading space of the next match.
	my (%seen, @triggers);
	for my $match ($sentence =~ m/ ${pre_regex}(?= )/g) {
		next unless defined $match;
		(my $token = $match) =~ tr/ //d;
		push(@triggers, $token) unless $seen{$token}++;
	}

	for my $token (@triggers) {
		# The pre-filter can report text that has no entry in the rule
		# table (e.g. an empty pattern matching a double space);
		# dereferencing the missing entry would abort the whole run.
		my $rules = $rules_for->{$token};
		next unless $rules;

		for my $rule (@{$rules}) {
			my ($seeked, $correction) = @{$rule};
			next unless defined $seeked && length $seeked;
			$correction = '' unless defined $correction;

			# Look ahead for the trailing space instead of consuming it,
			# otherwise the second of two adjacent occurrences is missed.
			$sentence =~ s/ (?:$seeked)(?= )/ $correction/g;
		}
	}

	$sentence =~ s/ +/ /g;
	$sentence =~ s/^ //g;
	$sentence =~ s/ $//g;
	return $sentence;
}

# Correct standard input line by line; underscores are treated as spaces.
while(my $line = <STDIN>){
	$line =~ tr/_/ /;
	print apply_correction_rules($line, $preRegex, \%tokenRegexHash);
}

__END__

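=head1 NAME

ApplyCorrectionRules.pl - apply the correction rules of a rule file to the text read from standard input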

=head1 SYNOPSIS

cat corpus | ApplyCorrectionRules.pl [options] <correction.tab> 

Options :

	-help|? 	display this help