Blame view
tools/sctk-2.4.10/bin/tanweenFilt.pl
3.09 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
#!/usr/bin/env perl
#
# tanweenFilt.pl -- strip Arabic tanween diacritics from transcript files.
#
# Reads a plain-text, CTM or STM transcript line by line and writes it back
# out with the UTF-8 encoded tanween marks (byte pairs \331\213, \331\214 and
# \331\215, i.e. U+064B..U+064D) removed.  By default only word-final marks
# are removed; with -a every occurrence is removed.
#
# The data is handled as raw bytes: no encoding layer is put on the file
# handles, and the patterns in normalize() match the two-byte sequences
# directly.  Do not add "use utf8" or an :encoding layer without also
# changing those patterns.

use strict;
use warnings;
use Getopt::Long;

my $Version = "1.1";

#####
#  Version 1.0  Released October, 2004
#        - Initial Release
#  Version 1.1  Released October 12, 2004
#        - Added -a option
#        - Fixed problem with word-final tanween characters that were
#          optionally deletable

my $Usage =
    "Usage: tanweenFilt.pl [ -i fmt ] Infile|- OutFile|-\n".
    "Version: $Version\n".
    "Desc: tanweenFilt removes the tanween Arabic characters from the end of a word.\n".
    "      In the Buckwalter normalization scheme, the word-final letters\n".
    "      'F', 'N', and 'K' are removed\n".
    "Options:\n".
    "      -a       Remove all tanween characters, not just the final characters.\n".
    "      -i fmt   Set the input file formant to 'fmt'.  The possible choices are:\n".
    "                  txt -> plain text, the default\n".
    "                  ctm -> CTM format, ignores all but the 5th column, and if\n".
    "                         a division occurs and a confidence score is present,\n".
    "                         the confidence score is copied to all parts.\n".
    "                  stm -> STM format, change only the text field of the stm record\n".
    "\n";

#### Command-line options.
# $AllTanween is read by normalize(); $InFmt selects the per-line filter.
my ($InFmt, $AllTanween) = (undef, 0);
my $result = GetOptions("i:s" => \$InFmt, "a" => \$AllTanween);
die "Aborting:\n$Usage\n:" if (!$result);

if (defined($InFmt)) {
    die("$Usage\nError: Undefined input format '$InFmt'")
        if ($InFmt !~ /\A(?:txt|ctm|stm)\z/);
}
else {
    $InFmt = "txt";
}

#### Positional arguments: exactly one input and one output ("-" = stdio).
die "$Usage\nToo many arguements"            if (@ARGV > 2);
die "$Usage\nOutput Not Specified"           if (@ARGV == 1);
die "$Usage\nInput and Output Not Specified" if (@ARGV == 0);
my ($InFile, $OutFile) = @ARGV;

my $in  = open_input($InFile);
my $out = open_output($OutFile);

#### Main loop: filter one line at a time according to the input format.
while (my $line = <$in>) {
    chomp($line);

    if ($InFmt eq "txt") {
        # Plain text: every line, including ';;' / '#' lines, is filtered.
        print {$out} normalize($line)."\n";
    }
    elsif ($InFmt eq "ctm" || $InFmt eq "stm") {
        if ($line =~ /^(?:\;\;|\#)/) {
            # Comment lines are copied through untouched.
            print {$out} $line."\n";
            next;
        }
        my $filtered = ($InFmt eq "ctm") ? filter_ctm_line($line)
                                         : filter_stm_line($line);
        print {$out} $filtered."\n";
    }
    else {
        die "Error: unknown input format '$InFmt'\n$Usage\n";
    }
}

close($in);
# Buffered write errors only surface when the handle is closed.
close($out) || die "Unable to close new-trans-file $OutFile: $!";
exit 0;

# open_input($path)
#   Returns a read handle for $path, or STDIN when $path is "-".
#   Dies with the usage text when the file is not readable.
#   Uses three-argument open so the file name can never be interpreted as a
#   mode or a pipe.
sub open_input {
    my ($path) = @_;

    return \*STDIN if ($path eq "-");

    die("$Usage\nError: Input file $path does not exist\n") if (! -r $path);
    open(my $fh, "<", $path) || die "Unable to open trans-file $path: $!";
    return $fh;
}

# open_output($path)
#   Returns a write handle for $path (truncating it), or STDOUT when $path
#   is "-".  Dies when the file cannot be opened.
sub open_output {
    my ($path) = @_;

    return \*STDOUT if ($path eq "-");

    open(my $fh, ">", $path) || die "Unable to open new-trans-file $path: $!";
    return $fh;
}

# filter_ctm_line($line)
#   Returns the CTM record with only its 5th whitespace-separated column
#   passed through normalize().  The split keeps the separators, so fields
#   sit at even indices (5th column == index 8) and the original spacing,
#   including any leading whitespace, is reproduced exactly.
#   Records with fewer than five columns are returned unchanged.
sub filter_ctm_line {
    my ($line) = @_;

    # Only trust $1 when the substitution actually matched.
    my $prefix = ($line =~ s/^(\s+)//) ? $1 : "";

    my @ctm = split(/(\s+)/, $line);
    $ctm[8] = normalize($ctm[8]) if (defined($ctm[8]));

    return $prefix.join("", @ctm);
}

# filter_stm_line($line)
#   Returns the STM record with only its text field passed through
#   normalize().  The record is split into at most 7 fields (separators
#   kept, so fields sit at even indices).  Index 10 is either a "<...>"
#   label, in which case the text is the 7th field (index 12), or it is
#   already the start of the text, in which case the text is everything
#   from index 10 onwards.
#   Records too short to have a 6th field are returned unchanged.
sub filter_stm_line {
    my ($line) = @_;

    # Only trust $1 when the substitution actually matched.
    my $prefix = ($line =~ s/^(\s+)//) ? $1 : "";

    my @stm = split(/(\s+)/, $line, 7);
    if (defined($stm[10])) {
        if ($stm[10] =~ /^<[^<>]*>$/) {
            $stm[12] = normalize($stm[12]) if (defined($stm[12]));
        }
        else {
            # No label: re-join the tail so the whole text is filtered as
            # one string, then drop the pieces that were merged into it.
            $stm[10] = normalize(join("", @stm[10 .. $#stm]));
            $#stm = 10;
        }
    }

    return $prefix.join("", @stm);
}

# normalize($text)
#   Returns $text with tanween marks removed.  An undefined $text yields "".
#   With -a ($AllTanween true) every mark is removed.  Otherwise a mark is
#   removed only when it is word-final: directly before a space or the end
#   of the string, optionally with a closing ")" in between (the ")" closes
#   an optionally deletable word and is kept).
#
#   The four word-final substitutions are applied one after another on
#   purpose: a later one can match text exposed by an earlier one, so they
#   must not be merged into a single pattern.
sub normalize {
    my ($text) = @_;
    $text = "" if (!defined($text));

    # UTF-8 byte sequences for kasratan, dammatan and fathatan.
    my $tanween = qr/(?:\331\215|\331\214|\331\213)/;

    if ($AllTanween) {
        $text =~ s/${tanween}//g;
    }
    else {
        $text =~ s/${tanween}$//g;
        $text =~ s/${tanween}\)$/)/g;
        $text =~ s/${tanween}\) /) /g;
        $text =~ s/${tanween} / /g;
    }

    return $text;
}