Jean-François Rey / otmedia

Blame view

tools/scripts/MergeLexicon.pl 3.64 KB
  #!/usr/bin/perl -w
  
  use strict;
  use Getopt::Long;
  use Pod::Usage;
  
  #-------------------------------------------
  # MAIN
  #-------------------------------------------
  
  # options variables
  my $help = 0;
  my $level = 2;
  my $lexSep = "\t";
  my $report = "";
  my $pause=1;
  my $desactlc = 0;
  # usefull variables
  my %MergeLexicon;
  my %HomoPhon;
  my $homoPhonFile ="";
  my $polyPhonFile = "";
  my @polyPhonTab = ();
  
  GetOptions('help|?' => \$help,
  	   'dlc' => \$desactlc,
  	   'report=s' => \$report,
  	   'level=s' => \$level);
   
  pod2usage(1) if($help);
  pod2usage({-msg => "BAD USAGE - you must specify a level value greater than 0
  ", -exitval => 1, -verbose => 0, -output => \*STDERR}) if($report && $level <= 0);
  
  if($report){
  	$homoPhonFile= $report.".homophon.lst";
  	$polyPhonFile= $report.".polyphon.lst";
  }
  
  for(my $i = 0; $i<=$#ARGV; $i++){
  	my $file;
  	open($file, "$ARGV[$i]") or die ("Cannot open : $ARGV[$i]
  ");
  	while(<$file>){
  		chomp($_);
  		#split 0: word 1: phon spelling
  		my @splittedLine = split($lexSep, $_);
  		if($#splittedLine != 1){
  			die("BAD FORMAT : $_
  ");
  		}
  		my $word = $splittedLine[0];
  		my $phonSpelling = $splittedLine[1];
  		my $lcword;
  		if(!$desactlc){
  			$lcword = lc($word);
  		} else{
  			$lcword = $word;
  		}
  		if(exists $MergeLexicon{$lcword}){
  			${$MergeLexicon{$lcword}}{$phonSpelling}++;
  		} else{
  			$MergeLexicon{$lcword} = { $phonSpelling => 1};
  		}
  		# no count duplicate value with pause
  		if($phonSpelling !~ /pause/){
  			if(exists $HomoPhon{$phonSpelling}){
  				${$HomoPhon{$phonSpelling}}{$word}++;
  			} else{
  				$HomoPhon{$phonSpelling} = { $word => 1 };
  			}
  		}
  	}
  	close($file);	
  }
  
  my @lcwords = keys %MergeLexicon;
  @lcwords = sort @lcwords;
  for(my $idx=0; $idx<=$#lcwords; $idx++){
  	my $lcword = $lcwords[$idx];
  	my $refPhonHash = $MergeLexicon{$lcword};
  	my @diffPhonTab = keys %{$refPhonHash};
  	@diffPhonTab = sort @diffPhonTab;
  	if($#diffPhonTab >= $level - 1){
  		my $nbDiff;
  		if($pause){
  			$nbDiff = (($#diffPhonTab + 1)/2);
  		} else{
  			$nbDiff = ($#diffPhonTab +1);
  		}
  		push(@polyPhonTab, "$lcword\t$nbDiff");
  	}
  	for (my $i=0; $i <= $#diffPhonTab; $i++){
  	 	print "$lcword\t$diffPhonTab[$i]
  ";
  	}
  }
  
  
  if($polyPhonFile){
  	my $file;
  	if($#polyPhonTab >= 0){
  		open($file, ">$polyPhonFile") or die("Cannot open: $polyPhonFile
  ");
  		@polyPhonTab = sort sort_polyphon_homophon @polyPhonTab;
  		for(my $i = 0; $i <= $#polyPhonTab; $i++){
  			print $file "$polyPhonTab[$i]
  ";
  		}
  		close($file);
  	}
  }
  
  if($homoPhonFile){
  	my @homoPhonTab=();
  	while(my ($phon, $wordHash) = each %HomoPhon){
  		my @diffWordTab = keys %{$wordHash};
  		if($#diffWordTab >= $level -1){
  			my $nbDiff = $#diffWordTab + 1;
  			push(@homoPhonTab, "$phon\t$nbDiff");
  		}
  	}
  	if($#homoPhonTab >= 0){
  		my $file;
  		open($file, ">$homoPhonFile") or die("Cannot open : $homoPhonFile
  ");
  		@homoPhonTab = sort sort_polyphon_homophon @homoPhonTab;
  		for(my $i = 0; $i <= $#homoPhonTab; $i++){
  			my @splittedLine = split(/\t/, $homoPhonTab[$i]);
  			my $phon = $splittedLine[0];
  			my $nbDiff = $splittedLine[1];
  			print $file "### $nbDiff FOR $phon :
  ";
  			my @diffWordTab = keys %{$HomoPhon{$phon}};
  			@diffWordTab = sort @diffWordTab;
  			for(my $i=0; $i <= $#diffWordTab; $i++){
  				print $file "$diffWordTab[$i]
  ";
  			}
  		}
  		close($file);
  	}
  }
  
  sub sort_polyphon_homophon{
  	my @splita = split(/\t/, $a);
  	my @splitb = split(/\t/, $b);
  	return $splitb[1] <=> $splita[1];
  }
  
  __END__
  
  =head1 NAME
  
  MergeLexicon.pl - merge phonetized lexicon (upper case to lower case)
  
  =head1 SYNOPSIS
  
  MergeLexicon.pl [options] <lexicon.phon> ...
  
  Options :
  
  	-help|? 	display this help
  
  	-report		produce report
  
  	-dlc		desactivate lower case transformation
  
  	-level 	 	choose a level of report consideration (default 2)