Jean-François Rey / otmedia

Blame view

tools/scripts/CoverageReportMaker.pl 5.09 KB
  #!/usr/bin/perl -w
  
  #------------------------------------------
  # Author : Emmanuel FERREIRA
  # Contact: emmanuel.ferreira0194@gmail.com
  # Date : 14/02/11
  # Brief : in order to postprocess a clean corpus 
  #------------------------------------------
  
  use strict;
  use warnings;
  use Getopt::Long;
  use Pod::Usage;
  use File::Path;
  
  #-------------------------------------------
  # MAIN
  #-------------------------------------------
  
  # options variables
  my $help = 0;
  my $lexicon;
  my $verbose;
  my $filename = "report";
  
  # usefull variables
  my %lexiconMap;
  my %oovMap;
  my $lexMatch = 0;
  my $oovMatch = 0;
  my $zeroton = 0;
  my $dirname = "./tmp_report";
  my $write;
  my $sep="\t";
  # Parse options and print usage if there is a syntax
  # error, or if usage was explicitly requested
  GetOptions('help|?' => \$help, 
             'verbose' => \$verbose,
  	   'lex=s' => \$lexicon,
  	   'out=s' => \$dirname,
  	   'name=s' => \$filename,
  	   'writeAll' => \$write);
  
  pod2usage(1) if($help);
  pod2usage({-msg => "BAD USAGE - you must specify a lexicon in argument
  ", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(!$ARGV[0]);
  
  $lexicon = $ARGV[0];
  
  #********************************************
  # MAIN
  #********************************************
  print "CREATE DIR...
  " if($verbose);
  
  mkdir $dirname, 0755;
  
  print "LOAD LEXICON...
  " if($verbose);
  
  # --------------------------------
  # Load the lexicon into a programming structure
  # --------------------------------
  my $file;
  open($file, $lexicon) or die("Cannot open : $lexicon");
  while(<$file>){
  	chomp($_);
  	#save the key
  	$lexiconMap{$_}=0;
  	$zeroton++;
  }
  close($file);
  
  # -------------------------------------
  # postprocess : remove <s> and </s> tags
  # and register all the words
  # -------------------------------------
  print "WORDS ANALYZE...
  " if($verbose);
  while(<STDIN>){
  	my $line = $_;
  	my $sentence = $line;
  	$line =~ s/<s>|<\/s>//g;
  	wordRegister("<s>", $sentence);
  	wordRegister("</s>", $sentence);
  	$line =~ s/ ([\w_']+) /wordRegister($1,$sentence)/ge;
  }
  
  # ------------------------------------
  # produce two reports :
  # * in-vocabulary words with counts 
  # * out-of-vocabulary words counts
  # ------------------------------------
  print "PRODUCE REPORTS...
  " if($verbose);
  print "\t* LEXICON MATCH : $lexMatch
  " if($verbose);
  print "\t* ZEROTON MATCH : $zeroton
  " if($verbose);
  print "\t* OOV : $oovMatch
  " if($verbose);
  # save in-vocabulary words sorted by occurences and alphabetic order
  # NB: this file is formatted as follow :  <WORD> <NB_OCC>
  my @tmpTab = ();
  while(my ($word, $occ) = each %lexiconMap){
  	my @tmp = ($word, $occ);
  	push(@tmpTab, \@tmp);
  }
  @tmpTab = sort { ${$b}[1] <=>  ${$a}[1] || ${$a}[0] cmp ${$b}[0] } @tmpTab;
  open($file, ">$dirname/$filename.invoc");
  for(my $i = 0; $i <= $#tmpTab; $i++){
  	my $word = ${$tmpTab[$i]}[0];
  	my $occ = ${$tmpTab[$i]}[1];
  	print $file "${word}$sep${occ}
  ";
  }
  close($file);
  # save out-of-vocabulary words by occurences and alphabetic order
  # NB: this file is formatted as follow : <ID> <WORD> <NB_OCC> <EXAMPLE_OF_CONTEXT>
  open($file, ">$dirname/$filename.oov");
  my @oovs = values %oovMap;
  @oovs = sort { ${$b}[0] <=> ${$a}[0] || ${$a}[3] cmp ${$b}[3] } @oovs;
  for(my $i = 0; $i <= $#oovs; $i++){
  	my @tabOOV = @{$oovs[$i]};
  	my $word = $tabOOV[3];
  	if($word !~ /<\/?s>/){
  		my $id = $tabOOV[1];
  		my $occ = $tabOOV[0];
  		my $exp = $tabOOV[2];
  		chomp $exp;
  		$exp =~ s/ ($word) / <$1> /g;
  		print $file "${id}$sep${word}$sep${occ}$sep\[${exp}\]
  ";
  	}
  }
  close($file);
  
  #********************************************
  # SUB-ROUTINES
  #********************************************
  
  sub wordRegister
  {
  	my ($word, $sentence) = @_;
  	if(defined $lexiconMap{$word}){
  		if($lexiconMap{$word} == 0){
  			$zeroton--;
  			$lexMatch++;
  		}
  		$lexiconMap{$word}++;
  	}
  	else{
  		if($oovMap{$word}){
  			${$oovMap{$word}}[0]++;
  		}
  		else{
  			$oovMatch++;
  			my @tab = (1, $oovMatch, $sentence, $word);
  			$oovMap{$word}=\@tab;
  			if($write){
  				createOOVFile($oovMatch, $word);
  			}
  		}
  		if($write){
  			writeContext(${$oovMap{$word}}[1], $sentence);
  		}
  	}
  	return " $word ";
  }
  
  sub createOOVFile
  {
  	my ($idOOV, $oov) = @_;
  	my $file;
  	open($file, ">$dirname/report_$idOOV");
  	print $file "OOV=$oov
  ";
  	close($file);	
  }
  
  sub writeContext
  {
  	my ($idOOV, $sentence) = @_;
  	my $file;
  	open($file, ">>$dirname/report_$idOOV");
  	print $file "$sentence
  ";
  	close($file);
  }
  
  #-------------------------------------------
  # SUBROUTINES
  #-------------------------------------------
  
  __END__
  
  =head1 NAME
  
  CoverageReportMaker.pl - produce a corpus coverage reports with a lexicon
  
  =head1 SYNOPSIS
  
  cat <corpus_cleaned> | CoverageReportMaker.pl [options] <lexicon>
  
  NOTICE: produce two reports : 
  - <basename_report>.oov  : containing the oov list sorted by occurrences number and alphabetic order
  - <basename_report>.invoc : containing the in-vocabulary list sort by occurrences and alphabetic order (also display Zeroton)
  
  Options:
  	-help|?		brief help message
  
  	-verbose	verbose
  
   	-out		output directory (default ./tmp_report)
  	
  	-name		reports basename used (default report)
  
   	-writeAll       build a file in the output directory for each oov word
                          listing its sentences (file name : report_<oov_id>)