Jean-François Rey / otmedia

Blame view

tools/scripts/FindNormRules.pl 4.18 KB
  #!/usr/bin/perl -w
  
  #------------------------------------------
  # Author : Emmanuel FERREIRA
  # Contact: emmanuel.ferreira0194@gmail.com
  # Date : 08/02/12
  # Brief : 
  #	Analyze a list of coverage reports and process some text transformation
  # 	in order to check usual words spelling mistakes
  # 	create both a NOEMALIZATION_RULES_FILE containing transformation table in order
  # 	to correct guessed mistakes found by this process and a OOV_LST_FILE 
  # 	containing the guessed real OOV words
  #------------------------------------------
  
  use strict;
  use Getopt::Long;
  use Pod::Usage;
  use Switch;
  
  #-------------------------------------------
  # MAIN
  #-------------------------------------------
  
  # options variables
  my $help = 0;
  my $vocab;
  my $penalty = 5;
  my $oovReport;
  
  # usefull variables
  my %vocabMap;
  my $sep="\t";
  
  # Parse options and print usage if there is a syntax
  # error, or if usage as explicitly requested
  GetOptions('help|?' => \$help,
  	   'penalty=s' => \$penalty);
  
  pod2usage(1) if($help);
  pod2usage({-msg =>"
  ERROR : you must specify an oov report  and a lexicon
  ", -exitval => 1, -verbose => 0, -output => \*STDERR}) if($#ARGV != 1);
  
  $oovReport = $ARGV[0];
  $vocab = $ARGV[1];
  
  #*******************************
  # MAIN
  #*******************************
  
  #--------------------------
  # Load the lexicon in a map
  #------------------------- 
  if($vocab){
  	my $file;
  
  	open($file, $vocab) or die("Cannot open : $vocab");
  	while(<$file>){
  		chomp($_);
  		my $tmp = doTransformation($_);
  		if(!$vocabMap{$tmp}){
  			$vocabMap{$tmp}=();
  		}
  		push(@{$vocabMap{$tmp}}, $_);
  	}
  	close($file);
  }
  
  #-----------------------
  # Load the OOV report and display the result formatted as follows : 
  # *  normalization rules : r <OCC> <WORD>#<CORREC>#<SCORE> <ID> <EXP>
  # *  oov : r <OCC> <WORD>#<CORREC>#<SCORE> <ID> <EXP>
  #-----------------------
  my $file;	
  open($file, "$oovReport");
  while(<$file>)
  {
  		chomp($_);
  		my @splittedLine = split(/$sep/, $_);
  		my $word = $splittedLine[1];
  		my $id = $splittedLine[0];
  		my $occ = $splittedLine[2];
  		my $exp = $splittedLine[3];
  		my $transf = doTransformation($splittedLine[1]);
  		if($vocabMap{$transf}){
  			my @bestWord = findBestTransf($word, @{$vocabMap{$transf}});
  			# on n'afficage pas de score inf a 0 
  			my $score = $bestWord[1];
  			if($score < 0){
  				$score = 0;
  			}
  			print "r$sep${occ}$sep${word}#$bestWord[0]#$bestWord[1]$sep${id}$sep${exp}
  ";
  		} 
  		else{	
  			print "o$sep${occ}$sep${word}$sep${id}$sep${exp}
  ";
  		}
  }
  close($file);
  
  #-------------------------------------------
  # SUBROUTINES
  #-------------------------------------------
  
  sub doTransformation
  {
  	my ($word) = @_;
  	$word = lc($word);
  	$word =~ s/\d//g;
  	$word =~ s/'//g;
  	return $word;
  }
  
  sub findBestTransf
  {
  	my ($word, @vocabTab) = @_;
  	
  	my $Best;
  	my $BestScore;
  	my $i;
  	my $score;
  
  	for( $i = 0; $i <= $#vocabTab; $i++){
  		if(!$Best){
  			$Best = $vocabTab[$i];
  			$BestScore = calculateScore($word, $Best);
  		}
  		else{
  			$score = calculateScore($word, $vocabTab[$i]);
  			if($score < $BestScore){
  				$Best = $vocabTab[$i];
  				$BestScore = $score;
  			}
  		}
  	}
  
  	return ($Best, $BestScore);
  }
  
  sub calculateScore
  {
  	my ($word, $cmpword) = @_;
  	my $score = 0;
  	if(length($word) > length($cmpword)){
  	     $score += $penalty + levenshtein($word, $cmpword);
  	}
  	if ($cmpword =~ /^[A-Z]/ && $cmpword !~ /^[A-Z]+$/ && $word =~ /^[A-Z]/){
  	    $score -= 1;
  	}
  #	if($cmpword =~ /^[A-Z]+$/ && $word !~ /^[A-Z]+$/){
  #	    $score += $penalty;
  #	}		
  	$score +=  levenshtein($word, $cmpword);
  	return $score;
  }
  
  #---------------------------------------------------
  # LEVENSHTEIN DISTANCE ALGORITHM
  #	Initial version : Jorge Mas Trullenque
  #---------------------------------------------------
  sub levenshtein($$){
    my @A=split //, lc shift;
    my @B=split //, lc shift;
    my @W=(0..@B);
    my ($i, $j, $cur, $next);
    for $i (0..$#A){
  	$cur=$i+1;
  	for $j (0..$#B){
  		$next=min(
  			$W[$j+1]+1,
  			$cur+1,
  			($A[$i] ne $B[$j])+$W[$j]
  		);
  		$W[$j]=$cur;
  		$cur=$next;
  	}
  	$W[@B]=$next;
    }
    return $next;
  }
  
  sub min($$$){
    if ($_[0] < $_[2]){ pop @_; } else { shift @_; }
    return $_[0] < $_[1]? $_[0]:$_[1];
  }
  __END__
  
  =head1 NAME
  
  FindNormRules.pl - from oov produce invoc correction proposition
  
  =head1 SYNOPSIS
  
  FindNormRules.pl [options] <report_oov> <lexicon>