def_art.pl 5.66 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199


#!/usr/bin/perl -w

$Version="1.3";

#####
#  Version 1.0  Released September 9, 1997
#        - Initial Release
#  Version 1.1  Released October 14, 1997
#        - Added the ability to perform the operation on Arabic script
#          This requires the LDC Arabic dictionary
#  Version 1.2 
#        - Fixed a coding bug, there was an incorrect 'if ($Format = "Script");'
#  Version 1.3 
#        - Fixes to avoid undefined hash lookup errors

$Usage="Usage: def_art.sh [ -i fmt ] [ -s LDC_LEX ] -t Infile|- OutFile|-\n".
"Version: $Version\n".
"Desc: Detaches the arabic definite article.  This script was donated by BBN".
"Options:\n".
"      -i fmt   Set the input file formant to 'fmt'.  The possible choices are:\n".
"                  txt -> plain text, the default\n".
"                  ctm -> CTM format, ignores all but the 5th column, and if\n".
"                         a division occurs and a confidence score is present,\n".
"                         the confidence score is copied to all parts.\n".
"                  stm -> STM format, change only the text field of the stm record\n".
"      -s LDC_LEX\n".
"               Perform the operation on arabic script transcripts.  This requires\n".
"               the LDC Arabic Lexicon.  The script uses the lexicons morphological\n".
"               tags to decided when to seperate the definate article.\n".
"      -t       If -s is used, Translate any token that match entries a romanized\n".
"               orthography to it's script form\n".
"\n";

$Format="Romanized";
$Lexicon="";
$Translate = 0;
%DefArt_LUT = ();
%Script_LUT = ();

use Getopt::Std;

getopts("i:s:t");

if (defined($opt_i)) {
    die("$Usage\n\nError: Undefined input format '$opt_i'") 
	if ($opt_i !~ /^(txt|ctm|stm)$/);
    $InFmt = $opt_i;
}
if (defined($opt_s)){
    $Format="Script";
    $Lexicon=$opt_s;
    die("$Usage\n\nError: Unable to read lexicon file '$Lexicon'") 
	if (! -r $Lexicon);
}
if (defined($opt_t)){
    die("$Usage\n\nError: -t requires -s as well")
	if (!defined($opt_s));
    $Translate = 1;
    $opt_t = 1;
}

#### The main functions arguements:
die "$Usage\nToo many arguements" if ($#ARGV > 1);
die "$Usage\nOutput Not Specified" if ($#ARGV == 0);
die "$Usage\nInput and Output Not Specified" if ($#ARGV == -1);

$InFile=$ARGV[0];
$OutFile=$ARGV[1];
die("$Usage\nError: Input file $InFile does not exist\n")
    if ($InFile ne "-" && ! -r $InFile);

#### 
&Load_Lexicon($Lexicon) if ($Format eq "Script");

open(IN, "$InFile") || die "Unable to open trans-file $InFile";
open(OUT, ">$OutFile") || die "Unable to open new-trans-file $OutFile";

while (<IN>){
    chop;
    if ($InFmt eq "txt"){
	print OUT &String_LU($_)."\n";
    } elsif ($InFmt eq "ctm"){
	s/^\s+//;
	local(@ctm);
	local(@new_words);
	local($i);	    
	local($newt);
	local($conf);

	@ctm = split(/\s+/,$_);
	if ($#ctm <= 4) { $conf = "" ; } else { $conf = $ctm[5]; }

	$newt = &String_LU($ctm[4]);
	@new_words = split(/\s+/,$newt);
	for ($i=0; $i<=$#new_words; $i++){
	    printf OUT ("%s %s %.2f %.3f %s %s\n",$ctm[0],$ctm[1],
			$ctm[2] + ($i * ($ctm[3] / ($#new_words+1))),
			$ctm[3] / ($#new_words + 1), $new_words[$i], $conf);
	}
    } elsif ($InFmt eq "stm"){
	s/^\s+//;
	local($file, $chan, $spk, $bt, $et, $lab_txt) = split(/\s+/,$_." ENDOFLN",6);
	local($head);
	$lab_txt =~ s/\s*ENDOFLN$//;
	if ($lab_txt =~ /^$/){
	    $lab = "";
	    $txt = "";
	    $head = "$file $chan $spk $bt $et";
	} elsif ($lab_txt =~ /^(<[^<>]*>)(.*)$/){
	    $lab = $1;
	    $txt = $2;
	    $head = "$file $chan $spk $bt $et $lab";
	} else {
	    $lab = "";
	    $txt = $lab_txt;
	    $head = "$file $chan $spk $bt $et";
	}
	print OUT $head." ".&String_LU($txt)."\n";
    }
}

close IN; close OUT;
exit 0;

#########  SUBROUTINES  ###################

sub String_LU {
    local($v) = @_;

    unless (/^\s*$/) {
	if ($Format eq "Romanized"){
	    # replace "il+" using the solar/lunar rules which state
	    # that "l+" in "il+" is replaced by the first letter after
	    # the "+" sign if that letter is one of the following:
	    # t,g,d,r,z,s,$,S,D,T,Z,n,j,k and keep em seperate.
	    $v =~ s/il\+([&]{0,1})([tgdrzs\$SDTZnjk])/i$2 $1$2/g;
	    $v =~ s/(il)\+([&]{0,1})([a-zA-Z])/$1 $2$3/g;
	    $v =~ s/\s+\*\*i([ltgdrzs\$SDTZnjk])\s+/ i$1 /g;
	    $v =~ s/\s+([^\*\s]+)\*\*/ $1/g;
	    $v =~ s/\s+(il)\+(\*?)/ $1 $2/g;
	} else {
	    # Tokenize and lookup
	    local ($new) = "";
	    foreach $word(split(/\s+/,$v)){
		if ($Translate){
		    if ($Script_LUT{$word} !~ /^$/){
			$word = $Script_LUT{$word};
		    }
		}
		if (defined($DefArt_LUT{$word})){
		    if ($DefArt_LUT{$word} ne /^$/){	
			$new .= " $DefArt_LUT{$word}";
		    } else {
			$new .= " $word";
		    }
		} else {
		    warn "Word $word is not in lexicon\n";
		    $new .= " $word";
		}
	    }
	    ($v = $new) =~ s/^ //;
	}
    }
    $v;
}

sub Load_Lexicon{
    local ($orth_rom, $orth_scr, $pron, $stress, $morph, $tr_wf, $dt_wf, $et_wf);
    local ($det, $base);

    open(LEX,$Lexicon) || die("Error: Unable to open lexicon file $Lexicon");
    while (<LEX>){
	($orth_rom, $orth_scr, $pron, $stress, $morph, $tr_wf, $dt_wf, $et_wf) =
	    split;
	if ($Translate) {
	    $Script_LUT{$orth_rom} = $orth_scr;
	}

	if ($morph =~ /\+article/ && $orth_rom =~ /il\+/){
	    # extract the base form 
	    ($base = $morph) =~ s/:.*$//;
	    if (($orth_rom =~     /^&*il\+[^+]*$/) ||
		($orth_rom =~ /^&*bi\+il\+[^+]*$/) ||
		($orth_rom =~ /^&*fa\+il\+[^+]*$/)) {
		($det = $orth_scr) =~ s/(ä)/$1 /;
	    } elsif (($orth_rom =~     /^li\+il\+[^+]*$/) ||
		     ($orth_rom =~ /^li\+bi\+il\+[^+]*$/)) {
		($det = $orth_scr) =~ s/(ää)/$1 /;
	    } else {
		print "HELP  $base  $_";
	    }
	    $DefArt_LUT{$orth_scr} = $det;
	} else {
	    $DefArt_LUT{$orth_scr} = $orth_scr;
	}
    }
    close(LEX);
}