BdlexUC.pl 2.84 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131


#!/usr/bin/perl
use strict;
#use warnings;

# File-level state shared by every sub below; filled in by initializeMapsAndRegex().
my %chars2bdlexMap;    # first rules-file column  -> second column (the code)
my %bdlex2charsMap;    # second rules-file column -> first column
my ($charsRegex, $bdlexRegex)           = ("", "");    # alternations over each column
my ($universalRegTo, $universalRegFrom) = ("", "");    # escape / unescape patterns

# Exactly two command-line arguments are required: the rules file and the
# direction flag. Anything else prints the usage text and aborts.
unless(@ARGV == 2)
{
	die "BAD USAGE : <rules_file> <direction> \n"."direction:\t-t : to\n\t\t-f : from\n\t\t-tw : to, respecting words boundaries\n";
}

initializeMapsAndRegex($ARGV[0]);

# Numeric mode expected by readTxt(): 1 for "-t", 2 for "-tw".
# Every other flag (including "-f") selects mode 0, the "from" direction.
my %modeOfFlag = ("-t" => 1, "-tw" => 2);
readTxt(exists $modeOfFlag{$ARGV[1]} ? $modeOfFlag{$ARGV[1]} : 0);

# Filter standard input to standard output, converting one line at a time.
#
# Argument:
#   $_[0] - numeric mode: 1 converts with convertToBdlex(), 2 with
#           convertWord(), any other value with convertFromBdlex().
sub readTxt
{
	my ($direction) = @_;

	# The mode never changes while reading, so pick the converter once.
	my $converter = $direction == 1 ? \&convertToBdlex
	              : $direction == 2 ? \&convertWord
	              :                   \&convertFromBdlex;

	while(defined(my $line = <STDIN>))
	{
		print $converter->($line);
	}
}

# Load the conversion rules and (re)build the lookup tables and regex sources.
#
# Argument:
#   $_[0] - path of the rules file. A usable line holds exactly two
#           tab-separated, non-empty columns: the character sequence and
#           its code. Every other line is ignored.
#
# Side effects on the file-level variables (previous contents are discarded):
#   %chars2bdlexMap, %bdlex2charsMap  - first column -> code, and the reverse.
#   $charsRegex, $bdlexRegex          - "|"-joined alternations of the first /
#                                       second column, in file order, with every
#                                       entry escaped so it matches literally.
#   $universalRegTo, $universalRegFrom - "([<first chars of the codes>])([0-9])"
#                                       and "([...])(0)"; a never-matching
#                                       pattern when no rule was loaded.
#
# Dies when the rules file cannot be opened.
sub initializeMapsAndRegex
{
	my ($ruleFile) = @_;

	# Three-argument open on a lexical handle: the file name can no longer be
	# parsed as a mode, and a missing file is reported instead of silently
	# producing empty tables.
	open(my $ruleHandle, '<', $ruleFile)
		or die "Cannot open rules file '$ruleFile': $!\n";

	%chars2bdlexMap = ();
	%bdlex2charsMap = ();

	my @charsPatterns;
	my @bdlexPatterns;
	my @firstCharPatterns;    # escaped first character of each code, listed once
	my %seenFirstChar;

	while(defined(my $line = <$ruleHandle>))
	{
		# Remove only a genuine line terminator (LF or CRLF). An unconditional
		# chop would eat the last character of a final line without newline.
		$line =~ s/\r?\n\z//;

		my @columns = split(/\t/, $line);
		next unless @columns == 2;
		# An empty key would add an empty alternative that matches everywhere.
		next if $columns[0] eq "";

		my ($chars, $code) = @columns;
		$chars2bdlexMap{$chars} = $code;
		$bdlex2charsMap{$code}  = $chars;

		# Remember the first character of the code for the universal rule.
		my $firstChar = substr($code, 0, 1);
		unless($seenFirstChar{$firstChar})
		{
			$seenFirstChar{$firstChar} = 1;
			push(@firstCharPatterns, quotemeta($firstChar));
		}

		# The rules are literal text, so escape them before they become
		# regex source.
		push(@charsPatterns, quotemeta($chars));
		push(@bdlexPatterns, quotemeta($code));
	}
	close($ruleHandle);

	if(@charsPatterns)
	{
		my $universalClass = join("", @firstCharPatterns);
		$charsRegex       = join("|", @charsPatterns);
		$bdlexRegex       = join("|", @bdlexPatterns);
		$universalRegTo   = "([".$universalClass."])([0-9])";
		$universalRegFrom = "([".$universalClass."])(0)";
	}
	else
	{
		# No rule at all: use patterns that can never match, so the converters
		# leave their input untouched. "(?!)" always fails; an empty pattern
		# would instead make Perl reuse the last successful regex, and an empty
		# character class "[]" would be malformed.
		$charsRegex = $bdlexRegex = $universalRegTo = $universalRegFrom = "(?!)";
	}
}

# Convert one string from plain characters to their codes.
#
# Argument:
#   $_[0] - the text to convert (the caller's value is not modified).
# Returns:
#   the converted copy.
#
# Reads the file-level $universalRegTo, $charsRegex and %chars2bdlexMap.
sub convertToBdlex
{
	my ($convertedString) = @_;

	# Text matched by the universal pattern is transformed first to avoid
	# conflicts: a "0" is inserted between its two captured parts. This is
	# done in one substitution; going through a temporary ".(0)." marker would
	# also rewrite input that already happens to contain such a sequence.
	# An empty pattern is skipped because Perl would silently reuse the last
	# successfully matched regex in its place.
	if($universalRegTo ne "")
	{
		$convertedString =~ s/$universalRegTo/${1}0$2/g;
	}

	# Single left-to-right pass: every match is replaced exactly where it was
	# found and the replacement text is never scanned again, so the output of
	# one rule cannot be re-converted by another. Matched text without a table
	# entry is kept unchanged.
	if($charsRegex ne "")
	{
		$convertedString =~ s/($charsRegex)/exists $chars2bdlexMap{$1} ? $chars2bdlexMap{$1} : $1/ge;
	}

	return $convertedString;
}

# Convert one string from plain characters to their codes, replacing only
# occurrences delimited by word boundaries (\b) on both sides.
#
# Argument:
#   $_[0] - the text to convert (the caller's value is not modified).
# Returns:
#   the converted copy.
#
# Reads the file-level $charsRegex and %chars2bdlexMap.
sub convertWord
{
	my ($convertedString) = @_;

	# Single left-to-right pass: each match is replaced in place and the
	# replacement text is never scanned again, so an earlier result cannot be
	# hit by a later rule. Matched text without a table entry is kept
	# unchanged. An empty pattern is skipped: it carries no rule to apply.
	if($charsRegex ne "")
	{
		$convertedString =~ s/\b($charsRegex)\b/exists $chars2bdlexMap{$1} ? $chars2bdlexMap{$1} : $1/ge;
	}

	return $convertedString;
}
# Convert one string from codes back to plain characters.
#
# Argument:
#   $_[0] - the text to convert (the caller's value is not modified).
# Returns:
#   the converted copy.
#
# Reads the file-level $bdlexRegex, %bdlex2charsMap and $universalRegFrom.
sub convertFromBdlex
{
	my ($convertedString) = @_;

	# Single left-to-right pass: each code is replaced in place and the
	# replacement text is never scanned again, so an earlier result cannot be
	# hit by a later rule. Matched text without a table entry is kept
	# unchanged. An empty pattern is skipped because Perl would silently reuse
	# the last successfully matched regex in its place.
	if($bdlexRegex ne "")
	{
		$convertedString =~ s/($bdlexRegex)/exists $bdlex2charsMap{$1} ? $bdlex2charsMap{$1} : $1/ge;
	}

	# The universal pattern is applied afterwards to avoid conflicts: only its
	# first captured part is kept, dropping the "0" that follows it.
	if($universalRegFrom ne "")
	{
		$convertedString =~ s/$universalRegFrom/$1/g;
	}

	return $convertedString;
}