hamzaNorm.pl
2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env perl
# hamzaNorm.pl - normalize word-initial hamza/alif variants.
#
# Reads a plain-text, CTM or STM file (or STDIN when the input name is "-"),
# passes the text portion of every record through normalize(), and writes the
# result to the output file (or STDOUT when the output name is "-").
#
# All data is handled as raw bytes: no encoding layer is put on either handle,
# because normalize() matches octal byte escapes.
use strict;
use warnings;
use Getopt::Long;

my $Version="1.0";
#####
# Version 1.0 Released August 25, 2004
# - Initial Release
my $Usage="Usage: hamzaNorm.pl [ -i fmt ] Infile|- OutFile|-\n".
"Version: $Version\n".
"Desc: hamzaNorm normalizes the hamza in the initial position of the word to be a\n".
" consistent form. In the Buckwalter normalization scheme, the initial letters\n".
" 'A', '<', '>' and '|' are all translated to 'A'\n".
"Options:\n".
" -i fmt Set the input file format to 'fmt'. The possible choices are:\n".
" txt -> plain text, the default\n".
" ctm -> CTM format, ignores all but the 5th column, and if\n".
" a division occurs and a confidence score is present,\n".
" the confidence score is copied to all parts.\n".
" stm -> STM format, change only the text field of the stm record\n".
"\n";

#### Option parsing: -i selects the input format, defaulting to plain text.
my $InFmt;
my $result = GetOptions ("i:s" => \$InFmt);
die "Aborting:\n$Usage\n:" if (!$result);
if (defined($InFmt)) {
    die("$Usage\n\nError: Undefined input format '$InFmt'")
        if ($InFmt !~ /^(?:txt|ctm|stm)$/);
} else {
    $InFmt = "txt";
}

#### The main function's arguments: exactly an input and an output name.
die "$Usage\nToo many arguements" if ($#ARGV > 1);
die "$Usage\nOutput Not Specified" if ($#ARGV == 0);
die "$Usage\nInput and Output Not Specified" if ($#ARGV == -1);
my ($InFile, $OutFile) = @ARGV;
die("$Usage\nError: Input file $InFile does not exist\n")
    if ($InFile ne "-" && ! -r $InFile);

# Three-argument open keeps characters such as '|', '<' or '>' in a file name
# from being interpreted as an open mode; "-" is therefore mapped to the
# standard streams explicitly instead of relying on two-argument open magic.
my ($in_fh, $out_fh);
if ($InFile eq "-") {
    $in_fh = \*STDIN;
} else {
    open($in_fh, '<', $InFile)
        or die "Unable to open trans-file $InFile: $!";
}
if ($OutFile eq "-") {
    $out_fh = \*STDOUT;
} else {
    open($out_fh, '>', $OutFile)
        or die "Unable to open new-trans-file $OutFile: $!";
}

while (my $line = <$in_fh>) {
    chomp $line;

    # Plain text: the whole line is text.
    if ($InFmt eq "txt") {
        print {$out_fh} normalize($line)."\n";
        next;
    }
    die "Error: unknown input format '$InFmt'\n$Usage\n"
        if ($InFmt ne "ctm" && $InFmt ne "stm");

    # CTM and STM comment lines (";;" or "#") are copied through untouched.
    if ($line =~ /^(?:;;|#)/) {
        print {$out_fh} $line."\n";
        next;
    }

    # Peel off leading whitespace so field 0 is the first real column.
    # $1 is only read when the substitution actually matched.
    my $prefix = ($line =~ s/^(\s+)//) ? $1 : "";

    # Splitting on a captured separator keeps the original whitespace in the
    # odd-numbered elements, so join("") reproduces the line exactly.
    if ($InFmt eq "ctm") {
        my @ctm = split(/(\s+)/, $line);
        # Element 8 is the 5th column; shorter lines pass through as-is.
        $ctm[8] = normalize($ctm[8]) if (defined($ctm[8]));
        print {$out_fh} $prefix.join("", @ctm)."\n";
    } else {
        # At most 7 fields: element 10 is the 6th column, element 12 the rest.
        my @stm = split(/(\s+)/, $line, 7);
        if (defined($stm[10])) {
            if ($stm[10] =~ /^<[^<>]*>$/) {
                # 6th column is a <...> label, so the text is what follows it.
                $stm[12] = normalize($stm[12]) if (defined($stm[12]));
            } else {
                # No label: the text starts at the 6th column, so glue the
                # separator and remainder back on before normalizing.
                $stm[10] .= join("", splice(@stm, 11, 2));
                $stm[10] = normalize($stm[10]);
            }
        }
        print {$out_fh} $prefix.join("", @stm)."\n";
    }
}

close($in_fh);
# Buffered write errors only surface at close, so check it on the output.
close($out_fh) or die "Error closing new-trans-file $OutFile: $!";
exit 0;
# normalize($text)
#
# Returns a copy of $text in which each of the byte sequences \330\242,
# \330\243, \330\245 or \330\247 (UTF-8 for U+0622, U+0623, U+0625 and
# U+0627) is replaced by \330\247 whenever it starts the string or directly
# follows a space character. Occurrences anywhere else are left alone.
sub normalize {
    # A sentinel space in front lets one " X" pattern also cover the very
    # first word; it is removed again before returning.
    my $padded = " ".$_[0];
    $padded =~ s/ (\330\242|\330\243|\330\245|\330\247)/ \330\247/g;
    $padded =~ s/^ //;
    return $padded;
}