ApplyCorrectionRules.pl 1.79 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89


#!/usr/bin/perl -w

use strict;
use Getopt::Long;
use Pod::Usage;

#-------------------------------------------
# MAIN
#-------------------------------------------

# Command-line option variables.
my $help = 0;
my $tagg = 0;		# set by --tagg; not read anywhere in the visible part of the script
my $correctionFile;	# receives 1 when --correction is given (the option is declared
			# without a value); not read anywhere in the visible part of the script

# State shared with the rule-loading and rule-application code below.
my $sep="#";		# field separator of the correction file
my $preRegex="";	# alternation of the first words of all rules
my %tokenRegexHash;	# first word of a rule => list of [seeked, correction] pairs

# GetOptions returns false on an unknown or malformed option; stop with the
# usage text instead of silently carrying on with a half-parsed command line.
GetOptions('help|?' => \$help,
	   'tagg' => \$tagg,
	   'correction' => \$correctionFile)
	or pod2usage({-exitval => 2, -verbose => 0, -output => \*STDERR});

pod2usage(1) if($help);

# The correction file is the first remaining argument. Test for presence
# rather than truth so that a file literally named "0" is accepted.
pod2usage({-msg => "BAD USAGE - you must specify a correction file\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(!defined($ARGV[0]) || $ARGV[0] eq "");

# Load the correction rules from the file named by the first argument.
# Each line has the form "<seeked><sep><correction>"; fields after the second
# one are ignored. Rules are stored in %tokenRegexHash under the first
# space-delimited word of <seeked>, and $preRegex collects those words as the
# alternation " w1 | w2 | ...".
my $file;
# Three-argument open: the name is always treated as a plain file to read,
# even if it starts with '>' or ends with '|'.
open($file, '<', $ARGV[0]) or die ("Cannot open : $ARGV[0] ($!)\n");
while(my $line = <$file>){
	chomp($line);
	my ($seeked, $correction) = split(/\Q$sep\E/, $line);

	# Blank lines and lines that start with the separator carry no pattern.
	next if(!defined($seeked) || $seeked eq "");

	# A rule without a correction field replaces its pattern with nothing.
	$correction = "" if(!defined($correction));

	# Index the rule by the first word of the (possibly multi-word) pattern.
	my ($word) = split(/ /, $seeked);
	next if(!defined($word));	# pattern made of spaces only

	if(!exists $tokenRegexHash{$word}){
		# First rule for this word: add it to the alternation once, escaped
		# so that punctuation in the word is matched literally and the
		# matched text is always a key of %tokenRegexHash.
		my $alternative = " " . quotemeta($word) . " ";
		if($preRegex){
			$preRegex .= "|" . $alternative;
		} else{
			$preRegex = $alternative;
		}
	}
	push(@{$tokenRegexHash{$word}}, [$seeked, $correction]);
}
close($file);

# Apply the loaded correction rules to every line read from standard input
# and print the corrected line.
while(my $sentence = <STDIN>){
	# Underscores are turned into spaces before any rule is tried.
	$sentence =~ tr/_/ /;

	# Pre-filter: find which rule words occur in the line between spaces.
	# $preRegex is already an alternation of " word " items, so it is used
	# as-is (adding more spaces around it would only affect its first and
	# last alternatives). The look-ahead wrapper makes the scan overlapping,
	# so the space that ends one word can also start the next one.
	my @match = ($preRegex ne "") ? ($sentence =~ m/(?=($preRegex))/g) : ();

	# Distinct rule words, in order of first appearance in the line.
	my %seen;
	my @entities;
	foreach my $found (@match){
		next if(!defined($found));
		(my $entity = $found) =~ s/ //g;
		next if($seen{$entity}++);
		# Skip matched text that has no rule list instead of dying on an
		# undefined array reference.
		next if(!exists $tokenRegexHash{$entity});
		push(@entities, $entity);
	}

	foreach my $entity (@entities){
		foreach my $rule (@{$tokenRegexHash{$entity}}){
			my ($seeked, $correction) = @{$rule};
			next if(!defined($seeked) || $seeked eq "");
			$correction = "" if(!defined($correction));
			# The trailing space is only looked at, not consumed, so a token
			# repeated back to back is corrected at every occurrence.
			$sentence =~ s/ ${seeked}(?= )/ $correction/g;
		}
	}

	# Normalise spacing: collapse runs of spaces, trim both ends of the line.
	$sentence =~ s/ +/ /g;
	$sentence =~ s/^ //g;
	$sentence =~ s/ $//g;
	print $sentence;
}

__END__

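=head1 NAME

ApplyCorrectionRules.pl - apply "seeked#correction" replacement rules to text read from standard input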

=head1 SYNOPSIS

cat corpus | ApplyCorrectionRules.pl [options] <correction.tab> 

Options :

	-help|? 	display this help