ApplyCorrectionRules.pl
1.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Pod::Usage;
#-------------------------------------------
# MAIN
#-------------------------------------------
# Entry point: parse the command line, load the correction rules, then copy
# STDIN to STDOUT with the rules applied to every line.
sub main {
    # options variables
    my $help = 0;
    my $tagg = 0;    # parsed for command-line compatibility; not used below
    my $correctionFile;

    # 'correction:s' takes an optional value, so "-correction FILE", a bare
    # "-correction" flag and the positional "<correction.tab>" form all work.
    GetOptions('help|?'       => \$help,
               'tagg'         => \$tagg,
               'correction:s' => \$correctionFile);
    pod2usage(1) if ($help);

    # Fall back to the first positional argument when the option gave no file.
    if (!defined $correctionFile || $correctionFile eq '') {
        $correctionFile = $ARGV[0];
    }
    if (!defined $correctionFile || $correctionFile eq '') {
        pod2usage({-msg     => "BAD USAGE - you must specify a correction file\n",
                   -exitval => 1,
                   -verbose => 0,
                   -output  => \*STDERR});
    }

    my $rulesFor = loadCorrectionRules($correctionFile);
    while (my $line = <STDIN>) {
        print applyCorrectionRules($line, $rulesFor);
    }
    return;
}

# Read the rule file: one rule per line, written "seeked#correction".
#
# Parameters:
#   $path - path of the rule file.
# Returns:
#   a hash reference keyed by the first space-separated word of "seeked".
#   Each value is { rank => N, rules => [ [ $pattern, $correction ], ... ] }
#   where rank is the order in which that word first appears in the file and
#   $pattern is the compiled search pattern of one rule.
# Dies when the file cannot be opened or a rule is not a valid pattern.
sub loadCorrectionRules {
    my ($path) = @_;

    # useful variables
    my $sep = "#";
    my %rulesFor;
    my $nextRank = 0;

    open(my $file, '<', $path) or die("Cannot open : $path ($!)\n");
    while (my $line = <$file>) {
        chomp($line);
        my ($seeked, $correction) = split(/\Q$sep\E/, $line);

        # Blank lines carry no rule.
        next if (!defined $seeked || $seeked eq '');

        # "word#" and "word" both mean: replace the word with nothing.
        $correction = '' if (!defined $correction);

        # Rules are indexed by the first word of the text they look for.
        my ($word) = split(/ /, $seeked);
        $word = '' if (!defined $word);

        # The left-hand side is interpolated as a regular expression (as it
        # always was).  The trailing space is a lookahead so that one space
        # can close a match and open the next one (" a a ").
        my $pattern = eval { qr/ $seeked(?= )/ };
        if (!defined $pattern) {
            die("Invalid rule in $path line $.: $@");
        }

        if (!exists $rulesFor{$word}) {
            $rulesFor{$word} = { rank => $nextRank++, rules => [] };
        }
        push(@{ $rulesFor{$word}{rules} }, [ $pattern, $correction ]);
    }
    close($file);

    return \%rulesFor;
}

# Apply the loaded rules to one input line and return the corrected line.
#
# Parameters:
#   $sentence - one line of text, with or without its trailing newline.
#   $rulesFor - the structure returned by loadCorrectionRules.
# Underscores are turned into spaces first.  A rule only matches text that is
# delimited by spaces; the line is padded with one space on each side so the
# first and last words qualify too.  Runs of spaces are collapsed and the
# padding is removed before returning.
sub applyCorrectionRules {
    my ($sentence, $rulesFor) = @_;

    # Set the newline aside so it cannot hide the last word of the line.
    my $lineEnd = ($sentence =~ s/\n\z//) ? "\n" : "";

    $sentence =~ tr/_/ /;
    $sentence = " $sentence ";

    # Only rules whose first word occurs in the line can match.  They are
    # applied in rule-file order so the output does not depend on hash order.
    my %seen;
    my @entries = sort { $a->{rank} <=> $b->{rank} }
                  map  { $rulesFor->{$_} }
                  grep { exists $rulesFor->{$_} && !$seen{$_}++ }
                  split(/ /, $sentence);

    for my $entry (@entries) {
        for my $rule (@{ $entry->{rules} }) {
            my ($pattern, $correction) = @{$rule};
            $sentence =~ s/$pattern/ $correction/g;
        }
    }

    $sentence =~ s/ +/ /g;
    $sentence =~ s/^ //;
    $sentence =~ s/ $//;

    return $sentence . $lineEnd;
}

# Run only when executed as a script; loading the file with require/do (for
# example from a test) just defines the subroutines above.
main() unless caller;

# True value so that the file can also be loaded with require.
1;
__END__
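=head1 NAME

ApplyCorrectionRules.pl - apply the correction rules of a rule file to a corpus read from standard input

=head1 SYNOPSIS

cat corpus | ApplyCorrectionRules.pl [options] <correction.tab>

Options :

-help|? display this help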