Blame view
tools/sctk-2.4.10/bin/def_art.pl
5.66 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 |
#!/usr/bin/perl -w $Version="1.3"; ##### # Version 1.0 Released September 9, 1997 # - Initial Release # Version 1.1 Released October 14, 1997 # - Added the ability to perform the operation on Arabic script # This requires the LDC Arabic dictionary # Version 1.2 # - Fixed a coding bug, there was an incorrect 'if ($Format = "Script");' # Version 1.3 # - Fixes to avoid undefined hash lookup errors $Usage="Usage: def_art.sh [ -i fmt ] [ -s LDC_LEX ] -t Infile|- OutFile|- ". "Version: $Version ". "Desc: Detaches the arabic definite article. This script was donated by BBN". "Options: ". " -i fmt Set the input file formant to 'fmt'. The possible choices are: ". " txt -> plain text, the default ". " ctm -> CTM format, ignores all but the 5th column, and if ". " a division occurs and a confidence score is present, ". " the confidence score is copied to all parts. ". " stm -> STM format, change only the text field of the stm record ". " -s LDC_LEX ". " Perform the operation on arabic script transcripts. This requires ". " the LDC Arabic Lexicon. The script uses the lexicons morphological ". " tags to decided when to seperate the definate article. ". " -t If -s is used, Translate any token that match entries a romanized ". " orthography to it's script form ". " "; $Format="Romanized"; $Lexicon=""; $Translate = 0; %DefArt_LUT = (); %Script_LUT = (); use Getopt::Std; getopts("i:s:t"); if (defined($opt_i)) { die("$Usage Error: Undefined input format '$opt_i'") if ($opt_i !~ /^(txt|ctm|stm)$/); $InFmt = $opt_i; } if (defined($opt_s)){ $Format="Script"; $Lexicon=$opt_s; die("$Usage Error: Unable to read lexicon file '$Lexicon'") if (! -r $Lexicon); } if (defined($opt_t)){ die("$Usage Error: -t requires -s as well") if (!defined($opt_s)); $Translate = 1; $opt_t = 1; } #### The main functions arguements: die "$Usage Too many arguements" if ($#ARGV > 1); die "$Usage Output Not Specified" if ($#ARGV == 0); die "$Usage Input and Output Not Specified" if ($#ARGV == -1); $InFile=$ARGV[0]; $OutFile=$ARGV[1]; die("$Usage Error: Input file $InFile does not exist ") if ($InFile ne "-" && ! -r $InFile); #### &Load_Lexicon($Lexicon) if ($Format eq "Script"); open(IN, "$InFile") || die "Unable to open trans-file $InFile"; open(OUT, ">$OutFile") || die "Unable to open new-trans-file $OutFile"; while (<IN>){ chop; if ($InFmt eq "txt"){ print OUT &String_LU($_)." "; } elsif ($InFmt eq "ctm"){ s/^\s+//; local(@ctm); local(@new_words); local($i); local($newt); local($conf); @ctm = split(/\s+/,$_); if ($#ctm <= 4) { $conf = "" ; } else { $conf = $ctm[5]; } $newt = &String_LU($ctm[4]); @new_words = split(/\s+/,$newt); for ($i=0; $i<=$#new_words; $i++){ printf OUT ("%s %s %.2f %.3f %s %s ",$ctm[0],$ctm[1], $ctm[2] + ($i * ($ctm[3] / ($#new_words+1))), $ctm[3] / ($#new_words + 1), $new_words[$i], $conf); } } elsif ($InFmt eq "stm"){ s/^\s+//; local($file, $chan, $spk, $bt, $et, $lab_txt) = split(/\s+/,$_." ENDOFLN",6); local($head); $lab_txt =~ s/\s*ENDOFLN$//; if ($lab_txt =~ /^$/){ $lab = ""; $txt = ""; $head = "$file $chan $spk $bt $et"; } elsif ($lab_txt =~ /^(<[^<>]*>)(.*)$/){ $lab = $1; $txt = $2; $head = "$file $chan $spk $bt $et $lab"; } else { $lab = ""; $txt = $lab_txt; $head = "$file $chan $spk $bt $et"; } print OUT $head." ".&String_LU($txt)." "; } } close IN; close OUT; exit 0; ######### SUBROUTINES ################### sub String_LU { local($v) = @_; unless (/^\s*$/) { if ($Format eq "Romanized"){ # replace "il+" using the solar/lunar rules which state # that "l+" in "il+" is replaced by the first letter after # the "+" sign if that letter is one of the following: # t,g,d,r,z,s,$,S,D,T,Z,n,j,k and keep em seperate. $v =~ s/il\+([&]{0,1})([tgdrzs\$SDTZnjk])/i$2 $1$2/g; $v =~ s/(il)\+([&]{0,1})([a-zA-Z])/$1 $2$3/g; $v =~ s/\s+\*\*i([ltgdrzs\$SDTZnjk])\s+/ i$1 /g; $v =~ s/\s+([^\*\s]+)\*\*/ $1/g; $v =~ s/\s+(il)\+(\*?)/ $1 $2/g; } else { # Tokenize and lookup local ($new) = ""; foreach $word(split(/\s+/,$v)){ if ($Translate){ if ($Script_LUT{$word} !~ /^$/){ $word = $Script_LUT{$word}; } } if (defined($DefArt_LUT{$word})){ if ($DefArt_LUT{$word} ne /^$/){ $new .= " $DefArt_LUT{$word}"; } else { $new .= " $word"; } } else { warn "Word $word is not in lexicon "; $new .= " $word"; } } ($v = $new) =~ s/^ //; } } $v; } sub Load_Lexicon{ local ($orth_rom, $orth_scr, $pron, $stress, $morph, $tr_wf, $dt_wf, $et_wf); local ($det, $base); open(LEX,$Lexicon) || die("Error: Unable to open lexicon file $Lexicon"); while (<LEX>){ ($orth_rom, $orth_scr, $pron, $stress, $morph, $tr_wf, $dt_wf, $et_wf) = split; if ($Translate) { $Script_LUT{$orth_rom} = $orth_scr; } if ($morph =~ /\+article/ && $orth_rom =~ /il\+/){ # extract the base form ($base = $morph) =~ s/:.*$//; if (($orth_rom =~ /^&*il\+[^+]*$/) || ($orth_rom =~ /^&*bi\+il\+[^+]*$/) || ($orth_rom =~ /^&*fa\+il\+[^+]*$/)) { ($det = $orth_scr) =~ s/(ä)/$1 /; } elsif (($orth_rom =~ /^li\+il\+[^+]*$/) || ($orth_rom =~ /^li\+bi\+il\+[^+]*$/)) { ($det = $orth_scr) =~ s/(ää)/$1 /; } else { print "HELP $base $_"; } $DefArt_LUT{$orth_scr} = $det; } else { $DefArt_LUT{$orth_scr} = $orth_scr; } } close(LEX); } |