MergeLexicon.pl 3.64 KB
#!/usr/bin/perl -w

use strict;
use Getopt::Long;
use Pod::Usage;

#-------------------------------------------
# MAIN
#-------------------------------------------

# options variables
my $help = 0;
my $level = 2;
my $lexSep = "\t";
my $report = "";
my $pause=1;
my $desactlc = 0;
# usefull variables
my %MergeLexicon;
my %HomoPhon;
my $homoPhonFile ="";
my $polyPhonFile = "";
my @polyPhonTab = ();

GetOptions('help|?' => \$help,
	   'dlc' => \$desactlc,
	   'report=s' => \$report,
	   'level=s' => \$level);
 
pod2usage(1) if($help);
pod2usage({-msg => "BAD USAGE - you must specify a level value greater than 0\n", -exitval => 1, -verbose => 0, -output => \*STDERR}) if($report && $level <= 0);

if($report){
	$homoPhonFile= $report.".homophon.lst";
	$polyPhonFile= $report.".polyphon.lst";
}

for(my $i = 0; $i<=$#ARGV; $i++){
	my $file;
	open($file, "$ARGV[$i]") or die ("Cannot open : $ARGV[$i]\n");
	while(<$file>){
		chomp($_);
		#split 0: word 1: phon spelling
		my @splittedLine = split($lexSep, $_);
		if($#splittedLine != 1){
			die("BAD FORMAT : $_\n");
		}
		my $word = $splittedLine[0];
		my $phonSpelling = $splittedLine[1];
		my $lcword;
		if(!$desactlc){
			$lcword = lc($word);
		} else{
			$lcword = $word;
		}
		if(exists $MergeLexicon{$lcword}){
			${$MergeLexicon{$lcword}}{$phonSpelling}++;
		} else{
			$MergeLexicon{$lcword} = { $phonSpelling => 1};
		}
		# no count duplicate value with pause
		if($phonSpelling !~ /pause/){
			if(exists $HomoPhon{$phonSpelling}){
				${$HomoPhon{$phonSpelling}}{$word}++;
			} else{
				$HomoPhon{$phonSpelling} = { $word => 1 };
			}
		}
	}
	close($file);	
}

my @lcwords = keys %MergeLexicon;
@lcwords = sort @lcwords;
for(my $idx=0; $idx<=$#lcwords; $idx++){
	my $lcword = $lcwords[$idx];
	my $refPhonHash = $MergeLexicon{$lcword};
	my @diffPhonTab = keys %{$refPhonHash};
	@diffPhonTab = sort @diffPhonTab;
	if($#diffPhonTab >= $level - 1){
		my $nbDiff;
		if($pause){
			$nbDiff = (($#diffPhonTab + 1)/2);
		} else{
			$nbDiff = ($#diffPhonTab +1);
		}
		push(@polyPhonTab, "$lcword\t$nbDiff");
	}
	for (my $i=0; $i <= $#diffPhonTab; $i++){
	 	print "$lcword\t$diffPhonTab[$i]\n";
	}
}


if($polyPhonFile){
	my $file;
	if($#polyPhonTab >= 0){
		open($file, ">$polyPhonFile") or die("Cannot open: $polyPhonFile\n");
		@polyPhonTab = sort sort_polyphon_homophon @polyPhonTab;
		for(my $i = 0; $i <= $#polyPhonTab; $i++){
			print $file "$polyPhonTab[$i]\n";
		}
		close($file);
	}
}

if($homoPhonFile){
	my @homoPhonTab=();
	while(my ($phon, $wordHash) = each %HomoPhon){
		my @diffWordTab = keys %{$wordHash};
		if($#diffWordTab >= $level -1){
			my $nbDiff = $#diffWordTab + 1;
			push(@homoPhonTab, "$phon\t$nbDiff");
		}
	}
	if($#homoPhonTab >= 0){
		my $file;
		open($file, ">$homoPhonFile") or die("Cannot open : $homoPhonFile\n");
		@homoPhonTab = sort sort_polyphon_homophon @homoPhonTab;
		for(my $i = 0; $i <= $#homoPhonTab; $i++){
			my @splittedLine = split(/\t/, $homoPhonTab[$i]);
			my $phon = $splittedLine[0];
			my $nbDiff = $splittedLine[1];
			print $file "### $nbDiff FOR $phon :\n";
			my @diffWordTab = keys %{$HomoPhon{$phon}};
			@diffWordTab = sort @diffWordTab;
			for(my $i=0; $i <= $#diffWordTab; $i++){
				print $file "$diffWordTab[$i]\n";
			}
		}
		close($file);
	}
}

sub sort_polyphon_homophon{
	my @splita = split(/\t/, $a);
	my @splitb = split(/\t/, $b);
	return $splitb[1] <=> $splita[1];
}

__END__

=head1 NAME

MergeLexicon.pl - merge phonetized lexicon (upper case to lower case)

=head1 SYNOPSIS

MergeLexicon.pl [options] <lexicon.phon> ...

Options :

	-help|? 	display this help

	-report		produce report

	-dlc		desactivate lower case transformation

	-level 	 	choose a level of report consideration (default 2)