Blame view

tools/scripts/ApplyCorrectionRules.pl 1.79 KB
e6be5137b   Jean-François Rey   reinitialized pro...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
  #!/usr/bin/perl -w
  
  use strict;
  use Getopt::Long;
  use Pod::Usage;
  
  #-------------------------------------------
  # MAIN
  #-------------------------------------------
  
  # Command-line option state, filled in by GetOptions below.
  my $help = 0;           # -help / -? : print the usage text and exit
  my $tagg = 0;           # -tagg : parsed, but not read by the code visible in this file
  my $correctionFile;     # -correction : a boolean flag despite its name; the rule file
                          # itself is taken from the first positional argument
  
  # Useful variables shared by the rule-loading and rule-applying code.
  my $sep="#";            # field separator between a rule's key and its correction
  my $preRegex="";        # alternation of the first words of all rules (pre-filter)
  my %tokenRegexHash;     # first word => array ref of [seeked, correction] pairs
  
  GetOptions('help|?' => \$help,
  	   'tagg' => \$tagg,
  	   'correction' => \$correctionFile);
   
  pod2usage(1) if($help);
  # Test for presence rather than truth so that a rule file literally named "0"
  # is still accepted.
  pod2usage({-msg => "BAD USAGE - you must specify a correction file\n", -exitval => 1, -verbose => 0, -output => \*STDERR})
  	if(!defined $ARGV[0] || $ARGV[0] eq "");
  
  # Load the correction rules from the file named by the first positional
  # argument.  Each line is split on $sep: the first field is the text to look
  # for ("seeked"), the second field is its replacement ("correction").
  #
  # Fills two structures:
  #   $preRegex       - " w1 | w2 | ..." : one alternative per distinct first word
  #   %tokenRegexHash - first word => array ref of [seeked, correction] pairs
  my $file;
  # Three-argument open: the file name can never be read as a mode or a pipe.
  open($file, "<", $ARGV[0]) or die("Cannot open : $ARGV[0] ($!)\n");
  while(my $line = <$file>){
  	chomp($line);
  	my ($seeked, $correction) = split($sep, $line);
  
  	# Blank lines and lines with an empty key would otherwise register an
  	# empty word that matches any run of two spaces.
  	next if(!defined $seeked || $seeked eq "");
  
  	# split drops trailing empty fields, so "word#" (and a line without any
  	# separator) yields no correction: treat it as "replace with nothing".
  	$correction = "" if(!defined $correction);
  
  	# Rules are indexed by the first space-separated word of their key.
  	my ($word) = split(/ /, $seeked);
  	next if(!defined $word || $word eq "");
  
  	if(exists $tokenRegexHash{$word}){
  		push(@{$tokenRegexHash{$word}}, [$seeked, $correction]);
  	} else{
  		$tokenRegexHash{$word} = [[$seeked, $correction]];
  
  		# Add each first word to the pre-filter only once, and escape it so
  		# that the text it matches is always exactly the hash key.
  		my $alternative = " " . quotemeta($word) . " ";
  		if($preRegex ne ""){
  			$preRegex .= "|" . $alternative;
  		} else{
  			$preRegex = $alternative;
  		}
  	}
  }
  close($file);
  
  # Apply the loaded correction rules to every line read from STDIN and print
  # the resulting line to STDOUT.
  #
  # NOTE(review): a token is only recognised when it has a space on both sides,
  # so a token at the very start of a line or right before the newline is never
  # corrected - confirm whether the input is expected to be space-padded.
  while(my $line = <STDIN>){
  	# Underscores are treated as token separators, exactly like spaces.
  	$line =~ tr/_/ /;
  	my $sentence = $line;
  
  	# An empty pattern must never reach the match operator: Perl would reuse
  	# the last successfully matched pattern instead of matching nothing.
  	if($preRegex ne ""){
  		# Pre-filter: collect the distinct rule keys present in this line.
  		# The capture group keeps the whole alternation together, so every
  		# alternative needs exactly one space on each side.
  		my %found;
  		while($line =~ m/($preRegex)/g){
  			my $token = $1;
  			# Step back one character so that the space closing this match
  			# can also open the next one (adjacent tokens share a space).
  			pos($line) = pos($line) - 1;
  			$token =~ s/ //g;
  			$found{$token}++;
  		}
  
  		# Sorted so that rules are applied in the same order on every run.
  		for my $token (sort keys %found){
  			# A hit without a rule list would be a fatal dereference of undef.
  			next unless(exists $tokenRegexHash{$token});
  			for my $rule (@{$tokenRegexHash{$token}}){
  				my ($seeked, $correction) = @{$rule};
  				next if(!defined $seeked || $seeked eq "");
  				$correction = "" if(!defined $correction);
  				# The trailing space is only looked at, not consumed, so
  				# back-to-back occurrences are all replaced.
  				$sentence =~ s/ ${seeked}(?= )/ $correction/g;
  			}
  		}
  	}
  
  	# Normalise spacing: collapse runs of spaces, then trim both ends (the
  	# final newline, if any, is kept).
  	$sentence =~ s/ +/ /g;
  	$sentence =~ s/^ //;
  	$sentence =~ s/ $//;
  	print $sentence;
  }
  
  __END__
  
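  =head1 NAME
  
  ApplyCorrectionRules.pl - apply the replacement rules of a correction file to text read from standard input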
  
  =head1 SYNOPSIS
  
  cat corpus | ApplyCorrectionRules.pl [options] <correction.tab> 
  
  Options :
  
  	-help|? 	display this help