Jean-François Rey / otmedia

Blame view

tools/scripts/BdlexUC.pl 2.84 KB
  #!/usr/bin/perl
  use strict;
  #use warnings;
  
  # Lookup tables filled by initializeMapsAndRegex() from the rule file:
  # first column => second column, and the reverse mapping.
  my %chars2bdlexMap;
  my %bdlex2charsMap;
  # "|"-separated alternations of every first-column value and of every
  # second-column value, in rule-file order.
  my $charsRegex = "";
  my $bdlexRegex = "";
  # Pattern text "([<first characters of the codes>])([0-9])",
  # applied by convertToBdlex() before the rules themselves.
  my $universalRegTo  = "";
  # Pattern text "([<same characters>])(0)",
  # applied by convertFromBdlex() after the rules themselves.
  my $universalRegFrom  = "";
  
  # Entry point: exactly two arguments are required, <rules_file> <direction>.
  if(@ARGV != 2)
  {
  	die "BAD USAGE : <rules_file> <direction> 
  "."direction:\t-t : to
  \t\t-f : from
  \t\t-tw : to, respecting words boundaries
  ";
  }
  
  # The rules must be loaded before any text is converted.
  initializeMapsAndRegex($ARGV[0]);
  
  # Mode passed to readTxt(): 1 for "-t", 2 for "-tw", 0 for anything else
  # (so "-f" and any unrecognised value both select the "from" direction).
  readTxt(
  	  $ARGV[1] eq "-t"  ? 1
  	: $ARGV[1] eq "-tw" ? 2
  	:                     0
  );
  
  # readTxt($direction)
  #
  # Copies STDIN to STDOUT line by line, passing every line through one
  # converter chosen by $direction:
  #   1             -> convertToBdlex
  #   2             -> convertWord
  #   anything else -> convertFromBdlex
  sub readTxt
  {
  	my ($direction) = @_;
  
  	# The direction is fixed for the whole run, so select the converter once.
  	my $convert;
  	if($direction == 1)
  	{
  		$convert = \&convertToBdlex;
  	}
  	elsif($direction == 2)
  	{
  		$convert = \&convertWord;
  	}
  	else
  	{
  		$convert = \&convertFromBdlex;
  	}
  
  	while(defined(my $line = <STDIN>))
  	{
  		my $converted = $convert->($line);
  		print $converted;
  	}
  }
  
  # initializeMapsAndRegex($ruleFile)
  #
  # Reads the conversion rules and fills the file-level tables and patterns
  # used by the convert* subs.
  #
  # Rule file format: one rule per line, exactly two tab-separated columns
  # (<chars> TAB <code>). Lines with any other number of columns are ignored.
  #
  # Fills:
  #   %chars2bdlexMap   chars => code
  #   %bdlex2charsMap   code  => chars
  #   $charsRegex       "|"-joined alternation of the (regex-quoted) chars
  #   $bdlexRegex       "|"-joined alternation of the (regex-quoted) codes
  #   $universalRegTo   "([<first characters of the codes>])([0-9])"
  #   $universalRegFrom "([<first characters of the codes>])(0)"
  #
  # Dies if the rule file cannot be opened.
  sub initializeMapsAndRegex
  {
  	my ($ruleFile) = @_;
  	open(my $ruleHandle, '<', $ruleFile)
  		or die "Cannot open rules file '$ruleFile': $!\n";
  
  	my @charsAlternatives;
  	my @bdlexAlternatives;
  	my @firstChars;
  	my %seenFirstChar;
  	while(my $line = <$ruleHandle>)
  	{
  		# Strip only the line terminator (LF or CRLF); a final line without
  		# a newline must keep its last character.
  		$line =~ s/\r?\n\z//;
  		my @columns = split(/\t/, $line);
  		next if $#columns != 1;
  		my ($chars, $code) = @columns;
  
  		$chars2bdlexMap{$chars} = $code;
  		$bdlex2charsMap{$code} = $chars;
  
  		# Remember each distinct first character of the codes, in order of
  		# first appearance, for the universal escape patterns.
  		my $firstChar = substr($code, 0, 1);
  		push(@firstChars, $firstChar) unless $seenFirstChar{$firstChar}++;
  
  		# Rule columns are literal text, so quote them before they become
  		# part of a pattern. An empty <chars> column can never be found in
  		# the input and would add an always-matching empty alternative.
  		push(@charsAlternatives, quotemeta($chars)) if $chars ne "";
  		push(@bdlexAlternatives, quotemeta($code));
  	}
  	close($ruleHandle);
  
  	# Append to whatever the alternations already hold, "|"-separated.
  	$charsRegex = join("|", grep { $_ ne "" } $charsRegex, @charsAlternatives);
  	$bdlexRegex = join("|", grep { $_ ne "" } $bdlexRegex, @bdlexAlternatives);
  
  	my $firstCharClass = join("", map { quotemeta($_) } @firstChars);
  	if($firstCharClass eq "")
  	{
  		# No rule was loaded: "[]" would swallow the rest of the pattern,
  		# so use a pattern that never matches instead.
  		$universalRegTo = "(?!)";
  		$universalRegFrom = "(?!)";
  	}
  	else
  	{
  		$universalRegTo = "([".$firstCharClass."])([0-9])";
  		$universalRegFrom = "([".$firstCharClass."])(0)";
  	}
  }
  
  # convertToBdlex($text)
  #
  # Returns $text converted with the chars => code rules.
  #
  # Two steps:
  #   1. Escape: wherever $universalRegTo matches (a code-initial character
  #      followed by a digit), insert a "0" between the two so the text
  #      cannot be confused with a code produced in step 2.
  #   2. Replace every match of $charsRegex with its entry in
  #      %chars2bdlexMap, in a single left-to-right pass.
  sub convertToBdlex
  {
  	my ($convertedString) = @_;
  
  	# "${1}0" is needed because "$10" would mean capture group 10. Inserting
  	# the "0" directly avoids a temporary marker that could also occur in
  	# the input text.
  	$convertedString =~ s/$universalRegTo/${1}0$2/g;
  
  	# A single pass converts each occurrence exactly once: the matched text
  	# is only used as a hash key (never re-interpreted as a pattern) and
  	# text produced by a replacement is never scanned again.
  	if($charsRegex ne "")
  	{
  		$convertedString =~ s/($charsRegex)/exists $chars2bdlexMap{$1} ? $chars2bdlexMap{$1} : $1/ge;
  	}
  	return $convertedString;
  }
  
  # convertWord($text)
  #
  # Returns $text with every match of $charsRegex that is delimited by word
  # boundaries (\b) on both sides replaced by its entry in %chars2bdlexMap.
  # No universal escape step is applied in this mode.
  sub convertWord
  {
  	my ($convertedString) = @_;
  
  	# A single pass converts each occurrence exactly once: the matched text
  	# is only used as a hash key (never re-interpreted as a pattern) and
  	# text produced by a replacement is never scanned again.
  	if($charsRegex ne "")
  	{
  		$convertedString =~ s/\b($charsRegex)\b/exists $chars2bdlexMap{$1} ? $chars2bdlexMap{$1} : $1/ge;
  	}
  	return $convertedString;
  }
  # convertFromBdlex($text)
  #
  # Returns $text converted with the code => chars rules.
  #
  # Two steps:
  #   1. Replace every match of $bdlexRegex with its entry in
  #      %bdlex2charsMap, in a single left-to-right pass.
  #   2. Unescape: wherever $universalRegFrom matches (a code-initial
  #      character followed by "0"), drop the "0". This runs after the
  #      replacement so escaped text is not mistaken for a code.
  sub convertFromBdlex
  {
  	my ($convertedString) = @_;
  
  	# A single pass converts each occurrence exactly once: the matched text
  	# is only used as a hash key (never re-interpreted as a pattern) and
  	# text produced by a replacement is never scanned again.
  	if($bdlexRegex ne "")
  	{
  		$convertedString =~ s/($bdlexRegex)/exists $bdlex2charsMap{$1} ? $bdlex2charsMap{$1} : $1/ge;
  	}
  
  	$convertedString =~ s/$universalRegFrom/$1/g;
  	return $convertedString;
  }