Blame view
egs/gale_arabic/s5c/local/normalize_transcript_BW.pl
3.39 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
#!/usr/bin/env perl

# Copyright 2014 QCRI (author: Ahmed Ali)
# Apache 2.0

# Usage: <script> <inFile> <onlyArabicFile>
#
# Reads <inFile> as UTF-8.  For every input line, each run of characters that
# is neither one of the Arabic letters listed in _normalizeLine() nor an ASCII
# digit is replaced by a single space; the remaining Arabic characters are then
# transliterated to ASCII by convertUTF8ToBuckwalter() and the result is
# written to <onlyArabicFile> (also UTF-8 encoded).
#
# The file can also be loaded with require/do: in that case main() is not run
# and only the subroutines are defined.

use warnings;
use strict;
use Encode;
use utf8;    # the character class in _normalizeLine() contains literal Arabic letters

# Arabic code point => ASCII replacement used by convertUTF8ToBuckwalter().
# Must be initialised before main() is invoked at the bottom of the file.
my %BUCKWALTER_OF = (
    "\x{0621}" => q{'},    ## HAMZA
    "\x{0622}" => q{|},    ## ALEF WITH MADDA ABOVE
    "\x{0623}" => q{>},    ## ALEF WITH HAMZA ABOVE
    "\x{0624}" => q{&},    ## WAW WITH HAMZA ABOVE
    "\x{0625}" => q{<},    ## ALEF WITH HAMZA BELOW
    "\x{0626}" => '}',     ## YEH WITH HAMZA ABOVE
    "\x{0627}" => 'A',     ## ALEF
    "\x{0628}" => 'b',     ## BEH
    "\x{0629}" => 'p',     ## TEH MARBUTA
    "\x{062A}" => 't',     ## TEH
    "\x{062B}" => 'v',     ## THEH
    "\x{062C}" => 'j',     ## JEEM
    "\x{062D}" => 'H',     ## HAH
    "\x{062E}" => 'x',     ## KHAH
    "\x{062F}" => 'd',     ## DAL
    "\x{0630}" => '*',     ## THAL
    "\x{0631}" => 'r',     ## REH
    "\x{0632}" => 'z',     ## ZAIN
    "\x{0633}" => 's',     ## SEEN
    "\x{0634}" => '$',     ## SHEEN
    "\x{0635}" => 'S',     ## SAD
    "\x{0636}" => 'D',     ## DAD
    "\x{0637}" => 'T',     ## TAH
    "\x{0638}" => 'Z',     ## ZAH
    "\x{0639}" => 'E',     ## AIN
    "\x{063A}" => 'g',     ## GHAIN
    "\x{0640}" => '_',     ## TATWEEL
    "\x{0641}" => 'f',     ## FEH
    "\x{0642}" => 'q',     ## QAF
    "\x{0643}" => 'k',     ## KAF
    "\x{0644}" => 'l',     ## LAM
    "\x{0645}" => 'm',     ## MEEM
    "\x{0646}" => 'n',     ## NOON
    "\x{0647}" => 'h',     ## HEH
    "\x{0648}" => 'w',     ## WAW
    "\x{0649}" => 'Y',     ## ALEF MAKSURA
    "\x{064A}" => 'y',     ## YEH

    ## Diacritics
    "\x{064B}" => 'F',     ## FATHATAN
    "\x{064C}" => 'N',     ## DAMMATAN
    "\x{064D}" => 'K',     ## KASRATAN
    "\x{064E}" => 'a',     ## FATHA
    "\x{064F}" => 'u',     ## DAMMA
    "\x{0650}" => 'i',     ## KASRA
    "\x{0651}" => '~',     ## SHADDA
    "\x{0652}" => 'o',     ## SUKUN
    "\x{0670}" => '`',     ## SUPERSCRIPT ALEF
    "\x{0671}" => '{',     ## ALEF WASLA
    "\x{067E}" => 'P',     ## PEH
    "\x{0686}" => 'J',     ## TCHEH
    "\x{06A4}" => 'V',     ## VEH
    "\x{06AF}" => 'G',     ## GAF
);

# convertUTF8ToBuckwalter($line)
#
# Returns a copy of $line (a decoded character string) in which every
# character that has an entry in %BUCKWALTER_OF is replaced by its ASCII
# counterpart; all other characters are returned unchanged.
#
# This function is adapted from MADATools.pm (MADA Tools).  Every source
# character is an Arabic code point and every target is ASCII, so a single
# table-driven pass gives the same result as one substitution per character.
#
# Punctuation is not mapped here; it should really be handled by the utf8
# cleaner or some other method.
sub convertUTF8ToBuckwalter {
    my ($line) = @_;

    # The range only narrows the candidates; characters inside it that have
    # no table entry are put back as they were.
    $line =~ s/([\x{0621}-\x{06AF}])/exists $BUCKWALTER_OF{$1} ? $BUCKWALTER_OF{$1} : $1/ge;

    return $line;
}

# _normalizeLine($line)
#
# Replaces each run of characters other than the listed Arabic letters and
# the digits 0-9 with one space (this includes the line terminator), then
# returns the transliterated result.
sub _normalizeLine {
    my ($line) = @_;

    $line =~ s/[^اأإآبتثجحخدذرزسشصضطظعغفقكلمنهويىئءؤة0-9]+/ /g;    ## Removes non Arabic or numbers

    return convertUTF8ToBuckwalter($line);
}

# main($inFile, $ouFile)
#
# Script entry point.  Prints the usage text and exits with status 1 unless
# exactly two arguments are given.  Dies if either file cannot be opened or
# if the output file cannot be closed cleanly.
sub main {
    my @args = @_;

    if (@args != 2) {
        print "usage: $0 <inFile> <onlyArabicFile> ";
        exit(1);
    }
    my ($inFile, $ouFile) = @args;

    # Low-precedence "or" so that die is tied to open() itself.  With "||"
    # the test applies to the (always true) file name and a failed open
    # goes unnoticed.
    open my $inHandle, '<:encoding(utf8)', $inFile
        or die "unable to open the input file $inFile ($!)";
    open my $outHandle, '>:encoding(utf8)', $ouFile
        or die "unable to open the output mlf file $ouFile ($!)";

    while (my $line = <$inHandle>) {
        # NOTE(review): the separator written after each record is a single
        # space, exactly as in the source under review.  If one record per
        # output line is expected this presumably should be "\n" - confirm.
        print {$outHandle} _normalizeLine($line), " ";
    }

    close $inHandle;

    # Buffered write errors only surface when the handle is closed.
    close $outHandle
        or die "unable to close the output file $ouFile ($!)";

    return;
}

# Run only when executed as a script, not when loaded with require/do.
main(@ARGV) unless caller;

1;