FindNormRules.pl 4.18 KB
#!/usr/bin/perl -w

#------------------------------------------
# Author : Emmanuel FERREIRA
# Contact: emmanuel.ferreira0194@gmail.com
# Date : 08/02/12
# Brief : 
#	Analyze a list of coverage reports and process some text transformation
# 	in order to check usual words spelling mistakes
# 	create both a NOEMALIZATION_RULES_FILE containing transformation table in order
# 	to correct guessed mistakes found by this process and a OOV_LST_FILE 
# 	containing the guessed real OOV words
#------------------------------------------

use strict;
use Getopt::Long;
use Pod::Usage;
use Switch;

#-------------------------------------------
# MAIN
#-------------------------------------------

# options variables
my $help = 0;
my $vocab;
my $penalty = 5;
my $oovReport;

# usefull variables
my %vocabMap;
my $sep="\t";

# Parse options and print usage if there is a syntax
# error, or if usage as explicitly requested
GetOptions('help|?' => \$help,
	   'penalty=s' => \$penalty);

pod2usage(1) if($help);
pod2usage({-msg =>"\nERROR : you must specify an oov report  and a lexicon\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if($#ARGV != 1);

$oovReport = $ARGV[0];
$vocab = $ARGV[1];

#*******************************
# MAIN
#*******************************

#--------------------------
# Load the lexicon in a map
#------------------------- 
if($vocab){
	my $file;

	open($file, $vocab) or die("Cannot open : $vocab");
	while(<$file>){
		chomp($_);
		my $tmp = doTransformation($_);
		if(!$vocabMap{$tmp}){
			$vocabMap{$tmp}=();
		}
		push(@{$vocabMap{$tmp}}, $_);
	}
	close($file);
}

#-----------------------
# Load the OOV report and display the result formatted as follows : 
# *  normalization rules : r <OCC> <WORD>#<CORREC>#<SCORE> <ID> <EXP>
# *  oov : r <OCC> <WORD>#<CORREC>#<SCORE> <ID> <EXP>
#-----------------------
my $file;	
open($file, "$oovReport");
while(<$file>)
{
		chomp($_);
		my @splittedLine = split(/$sep/, $_);
		my $word = $splittedLine[1];
		my $id = $splittedLine[0];
		my $occ = $splittedLine[2];
		my $exp = $splittedLine[3];
		my $transf = doTransformation($splittedLine[1]);
		if($vocabMap{$transf}){
			my @bestWord = findBestTransf($word, @{$vocabMap{$transf}});
			# on n'afficage pas de score inf a 0 
			my $score = $bestWord[1];
			if($score < 0){
				$score = 0;
			}
			print "r$sep${occ}$sep${word}#$bestWord[0]#$bestWord[1]$sep${id}$sep${exp}\n";
		} 
		else{	
			print "o$sep${occ}$sep${word}$sep${id}$sep${exp}\n";
		}
}
close($file);

#-------------------------------------------
# SUBROUTINES
#-------------------------------------------

sub doTransformation
{
	my ($word) = @_;
	$word = lc($word);
	$word =~ s/\d//g;
	$word =~ s/'//g;
	return $word;
}

sub findBestTransf
{
	my ($word, @vocabTab) = @_;
	
	my $Best;
	my $BestScore;
	my $i;
	my $score;

	for( $i = 0; $i <= $#vocabTab; $i++){
		if(!$Best){
			$Best = $vocabTab[$i];
			$BestScore = calculateScore($word, $Best);
		}
		else{
			$score = calculateScore($word, $vocabTab[$i]);
			if($score < $BestScore){
				$Best = $vocabTab[$i];
				$BestScore = $score;
			}
		}
	}

	return ($Best, $BestScore);
}

sub calculateScore
{
	my ($word, $cmpword) = @_;
	my $score = 0;
	if(length($word) > length($cmpword)){
	     $score += $penalty + levenshtein($word, $cmpword);
	}
	if ($cmpword =~ /^[A-Z]/ && $cmpword !~ /^[A-Z]+$/ && $word =~ /^[A-Z]/){
	    $score -= 1;
	}
#	if($cmpword =~ /^[A-Z]+$/ && $word !~ /^[A-Z]+$/){
#	    $score += $penalty;
#	}		
	$score +=  levenshtein($word, $cmpword);
	return $score;
}

#---------------------------------------------------
# LEVENSHTEIN DISTANCE ALGORITHM
#	Initial version : Jorge Mas Trullenque
#---------------------------------------------------
sub levenshtein($$){
  my @A=split //, lc shift;
  my @B=split //, lc shift;
  my @W=(0..@B);
  my ($i, $j, $cur, $next);
  for $i (0..$#A){
	$cur=$i+1;
	for $j (0..$#B){
		$next=min(
			$W[$j+1]+1,
			$cur+1,
			($A[$i] ne $B[$j])+$W[$j]
		);
		$W[$j]=$cur;
		$cur=$next;
	}
	$W[@B]=$next;
  }
  return $next;
}

sub min($$$){
  if ($_[0] < $_[2]){ pop @_; } else { shift @_; }
  return $_[0] < $_[1]? $_[0]:$_[1];
}
__END__

=head1 NAME

FindNormRules.pl - from oov produce invoc correction proposition

=head1 SYNOPSIS

FindNormRules.pl [options] <report_oov> <lexicon>