CoverageReportMaker.pl 5.09 KB
#!/usr/bin/perl -w

#------------------------------------------
# Author : Emmanuel FERREIRA
# Contact: emmanuel.ferreira0194@gmail.com
# Date : 14/02/11
# Brief : in order to postprocess a clean corpus 
#------------------------------------------

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use File::Path;

#-------------------------------------------
# MAIN
#-------------------------------------------

# options variables
my $help = 0;
my $lexicon;
my $verbose;
my $filename = "report";

# usefull variables
my %lexiconMap;
my %oovMap;
my $lexMatch = 0;
my $oovMatch = 0;
my $zeroton = 0;
my $dirname = "./tmp_report";
my $write;
my $sep="\t";
# Parse options and print usage if there is a syntax
# error, or if usage was explicitly requested
GetOptions('help|?' => \$help, 
           'verbose' => \$verbose,
	   'lex=s' => \$lexicon,
	   'out=s' => \$dirname,
	   'name=s' => \$filename,
	   'writeAll' => \$write);

pod2usage(1) if($help);
pod2usage({-msg => "BAD USAGE - you must specify a lexicon in argument\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if(!$ARGV[0]);

$lexicon = $ARGV[0];

#********************************************
# MAIN
#********************************************
print "CREATE DIR...\n" if($verbose);

mkdir $dirname, 0755;

print "LOAD LEXICON...\n" if($verbose);

# --------------------------------
# Load the lexicon into a programming structure
# --------------------------------
my $file;
open($file, $lexicon) or die("Cannot open : $lexicon");
while(<$file>){
	chomp($_);
	#save the key
	$lexiconMap{$_}=0;
	$zeroton++;
}
close($file);

# -------------------------------------
# postprocess : remove <s> and </s> tags
# and register all the words
# -------------------------------------
print "WORDS ANALYZE...\n" if($verbose);
while(<STDIN>){
	my $line = $_;
	my $sentence = $line;
	$line =~ s/<s>|<\/s>//g;
	wordRegister("<s>", $sentence);
	wordRegister("</s>", $sentence);
	$line =~ s/ ([\w_']+) /wordRegister($1,$sentence)/ge;
}

# ------------------------------------
# produce two reports :
# * in-vocabulary words with counts 
# * out-of-vocabulary words counts
# ------------------------------------
print "PRODUCE REPORTS...\n" if($verbose);
print "\t* LEXICON MATCH : $lexMatch\n" if($verbose);
print "\t* ZEROTON MATCH : $zeroton\n" if($verbose);
print "\t* OOV : $oovMatch\n" if($verbose);
# save in-vocabulary words sorted by occurences and alphabetic order
# NB: this file is formatted as follow :  <WORD> <NB_OCC>
my @tmpTab = ();
while(my ($word, $occ) = each %lexiconMap){
	my @tmp = ($word, $occ);
	push(@tmpTab, \@tmp);
}
@tmpTab = sort { ${$b}[1] <=>  ${$a}[1] || ${$a}[0] cmp ${$b}[0] } @tmpTab;
open($file, ">$dirname/$filename.invoc");
for(my $i = 0; $i <= $#tmpTab; $i++){
	my $word = ${$tmpTab[$i]}[0];
	my $occ = ${$tmpTab[$i]}[1];
	print $file "${word}$sep${occ}\n";
}
close($file);
# save out-of-vocabulary words by occurences and alphabetic order
# NB: this file is formatted as follow : <ID> <WORD> <NB_OCC> <EXAMPLE_OF_CONTEXT>
open($file, ">$dirname/$filename.oov");
my @oovs = values %oovMap;
@oovs = sort { ${$b}[0] <=> ${$a}[0] || ${$a}[3] cmp ${$b}[3] } @oovs;
for(my $i = 0; $i <= $#oovs; $i++){
	my @tabOOV = @{$oovs[$i]};
	my $word = $tabOOV[3];
	if($word !~ /<\/?s>/){
		my $id = $tabOOV[1];
		my $occ = $tabOOV[0];
		my $exp = $tabOOV[2];
		chomp $exp;
		$exp =~ s/ ($word) / <$1> /g;
		print $file "${id}$sep${word}$sep${occ}$sep\[${exp}\]\n";
	}
}
close($file);

#********************************************
# SUB-ROUTINES
#********************************************

sub wordRegister
{
	my ($word, $sentence) = @_;
	if(defined $lexiconMap{$word}){
		if($lexiconMap{$word} == 0){
			$zeroton--;
			$lexMatch++;
		}
		$lexiconMap{$word}++;
	}
	else{
		if($oovMap{$word}){
			${$oovMap{$word}}[0]++;
		}
		else{
			$oovMatch++;
			my @tab = (1, $oovMatch, $sentence, $word);
			$oovMap{$word}=\@tab;
			if($write){
				createOOVFile($oovMatch, $word);
			}
		}
		if($write){
			writeContext(${$oovMap{$word}}[1], $sentence);
		}
	}
	return " $word ";
}

sub createOOVFile
{
	my ($idOOV, $oov) = @_;
	my $file;
	open($file, ">$dirname/report_$idOOV");
	print $file "OOV=$oov\n";
	close($file);	
}

sub writeContext
{
	my ($idOOV, $sentence) = @_;
	my $file;
	open($file, ">>$dirname/report_$idOOV");
	print $file "$sentence\n";
	close($file);
}

#-------------------------------------------
# SUBROUTINES
#-------------------------------------------

__END__

=head1 NAME

CoverageReportMaker.pl - produce a corpus coverage reports with a lexicon

=head1 SYNOPSIS

cat <corpus_cleaned> | CoverageReportMaker.pl [options] <lexicon>

NOTICE: produce two reports : 
- <basename_report>.oov  : containing the oov list sorted by occurrences number and alphabetic order
- <basename_report>.invoc : containing the in-vocabulary list sort by occurrences and alphabetic order (also display Zeroton)

Options:
	-help|?		brief help message

	-verbose	verbose

 	-out		output directory (default ./tmp_report)
	
	-name		reports basename used (default report)

 	-writeAll       build a file in the output directory for each oov word
                        listing its sentences (file name : report_<oov_id>)