normalize_transcript_BW.pl
3.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env perl
# Copyright 2014 QCRI (author: Ahmed Ali)
# Apache 2.0
use warnings;
use strict;
use Encode;
use utf8;
# Command-line driver.
#
# Usage: <script> <inFile> <onlyArabicFile>
#
# Reads <inFile> as UTF-8 line by line, replaces every run of characters that
# are not Arabic letters or ASCII digits with a single space, transliterates
# what is left with convertUTF8ToBuckwalter, and writes one line per input
# line to <onlyArabicFile> (also UTF-8).
if (@ARGV != 2) {
    print "usage: $0 <inFile> <onlyArabicFile>\n";
    exit(1);
}

my $inFile = shift(@ARGV);
my $ouFile = shift(@ARGV);

# Three-argument open with lexical handles: the file name is never parsed for
# mode characters.  The check uses low-precedence `or`; with `||` the test
# binds to the (always true) file-name string and a failed open would go
# undetected.
open(my $in_fh, '<', $inFile)
    or die "unable to open the input file $inFile\n";
binmode $in_fh, ":encoding(utf8)";

open(my $out_fh, '>', $ouFile)
    or die "unable to open the output mlf file $ouFile\n";
binmode $out_fh, ":encoding(utf8)";

while (my $line = <$in_fh>) {
    # The trailing newline is outside the allowed set too, so it is folded
    # into the final space; a fresh newline is appended when printing.
    $line =~ s/[^اأإآبتثجحخدذرزسشصضطظعغفقكلمنهويىئءؤة0-9]+/ /g; ## Removes non Arabic or numbers
    my $BW = convertUTF8ToBuckwalter($line);
    print {$out_fh} "$BW" . "\n";
}

close $in_fh;

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the close of the output file is checked.
close $out_fh
    or die "unable to close the output file $ouFile: $!\n";
# this function is copied from MADATools.pm: MADA Tools
# convertUTF8ToBuckwalter($line)
#
# Returns a copy of $line in which each Arabic code point listed in the table
# below is replaced by its Buckwalter transliteration character.  Every other
# character (ASCII, digits, whitespace, unlisted code points) is returned
# unchanged.  $line is expected to be a decoded character string; the caller's
# variable is not modified.
sub convertUTF8ToBuckwalter {
    my ($line) = @_;

    # The table lives inside the sub on purpose: the script's main loop calls
    # this sub before any file-scope assignment placed after it would run.
    my %buckwalter_for = (
        ## Letters
        "\x{0621}" => "'",     # HAMZA
        "\x{0622}" => '|',     # ALEF WITH MADDA ABOVE
        "\x{0623}" => '>',     # ALEF WITH HAMZA ABOVE
        "\x{0624}" => '&',     # WAW WITH HAMZA ABOVE
        "\x{0625}" => '<',     # ALEF WITH HAMZA BELOW
        "\x{0626}" => '}',     # YEH WITH HAMZA ABOVE
        "\x{0627}" => 'A',     # ALEF
        "\x{0628}" => 'b',     # BEH
        "\x{0629}" => 'p',     # TEH MARBUTA
        "\x{062A}" => 't',     # TEH
        "\x{062B}" => 'v',     # THEH
        "\x{062C}" => 'j',     # JEEM
        "\x{062D}" => 'H',     # HAH
        "\x{062E}" => 'x',     # KHAH
        "\x{062F}" => 'd',     # DAL
        "\x{0630}" => '*',     # THAL
        "\x{0631}" => 'r',     # REH
        "\x{0632}" => 'z',     # ZAIN
        "\x{0633}" => 's',     # SEEN
        "\x{0634}" => '$',     # SHEEN
        "\x{0635}" => 'S',     # SAD
        "\x{0636}" => 'D',     # DAD
        "\x{0637}" => 'T',     # TAH
        "\x{0638}" => 'Z',     # ZAH
        "\x{0639}" => 'E',     # AIN
        "\x{063A}" => 'g',     # GHAIN
        "\x{0640}" => '_',     # TATWEEL
        "\x{0641}" => 'f',     # FEH
        "\x{0642}" => 'q',     # QAF
        "\x{0643}" => 'k',     # KAF
        "\x{0644}" => 'l',     # LAM
        "\x{0645}" => 'm',     # MEEM
        "\x{0646}" => 'n',     # NOON
        "\x{0647}" => 'h',     # HEH
        "\x{0648}" => 'w',     # WAW
        "\x{0649}" => 'Y',     # ALEF MAKSURA
        "\x{064A}" => 'y',     # YEH
        ## Diacritics
        "\x{064B}" => 'F',     # FATHATAN
        "\x{064C}" => 'N',     # DAMMATAN
        "\x{064D}" => 'K',     # KASRATAN
        "\x{064E}" => 'a',     # FATHA
        "\x{064F}" => 'u',     # DAMMA
        "\x{0650}" => 'i',     # KASRA
        "\x{0651}" => '~',     # SHADDA
        "\x{0652}" => 'o',     # SUKUN
        "\x{0670}" => '`',     # SUPERSCRIPT ALEF
        "\x{0671}" => '{',     # ALEF WASLA
        ## Additional letters
        "\x{067E}" => 'P',     # PEH
        "\x{0686}" => 'J',     # TCHEH
        "\x{06A4}" => 'V',     # VEH
        "\x{06AF}" => 'G',     # GAF
    );

    # One pass over the string is enough: every replacement is plain ASCII,
    # so no output character can ever be picked up by another table entry.
    # Punctuation is deliberately not mapped here.
    my $source_chars = join '', keys %buckwalter_for;
    $line =~ s/([$source_chars])/$buckwalter_for{$1}/g;

    return $line;
}