Blame view
tools/sctk-2.4.10/src/hamzaNorm/hamzaNorm.pl
2.64 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
#!/usr/bin/env perl use strict; my $Version="1.0"; ##### # Version 1.0 Released August 25, 2004 # - Initial Release my $Usage="Usage: hamzaNorm.pl [ -i fmt ] Infile|- OutFile|- ". "Version: $Version ". "Desc: hamzaNorm normalizes the hamza in the initial position of the word to be a ". " consistent form. In the Buckwalter normalization scheme, the initial letters ". " 'A', '<', '>' and '|' are all translated to 'A'". "Options: ". " -i fmt Set the input file formant to 'fmt'. The possible choices are: ". " txt -> plain text, the default ". " ctm -> CTM format, ignores all but the 5th column, and if ". " a division occurs and a confidence score is present, ". " the confidence score is copied to all parts. ". " stm -> STM format, change only the text field of the stm record ". " "; use Getopt::Long; my ($InFmt) = undef; my $result = GetOptions ("i:s" => \$InFmt); die "Aborting: $Usage :" if (!$result); if (defined($InFmt)) { die("$Usage Error: Undefined input format '$InFmt'") if ($InFmt !~ /^(txt|ctm|stm)$/); } else { $InFmt = "txt"; } #### The main functions arguements: die "$Usage Too many arguements" if ($#ARGV > 1); die "$Usage Output Not Specified" if ($#ARGV == 0); die "$Usage Input and Output Not Specified" if ($#ARGV == -1); my $InFile=$ARGV[0]; my $OutFile=$ARGV[1]; die("$Usage Error: Input file $InFile does not exist ") if ($InFile ne "-" && ! -r $InFile); open(IN, "$InFile") || die "Unable to open trans-file $InFile"; open(OUT, ">$OutFile") || die "Unable to open new-trans-file $OutFile"; while (<IN>){ chomp; if ($InFmt eq "txt"){ print OUT normalize($_)." "; } elsif ($InFmt eq "ctm"){ if ($_ =~ /^(\;\;|\#)/){ print OUT $_." "; next; } s/^(\s+)//; my $prefix = (defined($1) ? $1 : ""); my @ctm = split(/(\s+)/,$_); $ctm[8] = normalize($ctm[8]); print OUT $prefix.join("", @ctm)." "; } elsif ($InFmt eq "stm"){ if ($_ =~ /^(\;\;|\#)/){ print OUT $_." "; next; } s/^(\s+)//; my $prefix = (defined($1) ? $1 : ""); my @stm = split(/(\s+)/,$_, 7); if ($stm[10] =~ /^<[^<>]*>$/){ $stm[12] = normalize($stm[12]); } else { $stm[10] .= join("",splice(@stm,11,2)); $stm[10] = normalize($stm[10]); } print OUT $prefix.join("", @stm)." "; } else { die "Error: unknown input format '$InFmt' $Usage "; } } close IN; close OUT; exit 0; sub normalize{ my ($text) = @_; $text = " ".$text; $text =~ s/ (\330\242|\330\243|\330\245|\330\247)/ \330\247/g; $text =~ s/^ //; $text; } |