Blame view

tools/sctk-2.4.10/bin/tanweenFilt.pl 3.09 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
  #!/usr/bin/env perl
  
  use strict;
  
  my $Version="1.1";
  
  #####
  #  Version 1.0  Released October, 2004
  #        - Initial Release
  #  Version 1.1  Released October 12, 2004
  #        - Added -a option
  #        - Fixed problem with word-final tanween characters that were optionally deletable
   
  my $Usage="Usage: tanweenFilt.pl [ -i fmt ] Infile|- OutFile|-
  ".
  "Version: $Version
  ".
  "Desc: tanweenFilt removes the tanween Arabic characters from the end of a word.
  ".
  "      In the Buckwalter normalization scheme, the word-final letters
  ".
  "      'F', 'N', and 'K' are removed".
  "Options:
  ".
  "      -a       Remove all tanween characters, not just the final characters.
  ".
  "      -i fmt   Set the input file formant to 'fmt'.  The possible choices are:
  ".
  "                  txt -> plain text, the default
  ".
  "                  ctm -> CTM format, ignores all but the 5th column, and if
  ".
  "                         a division occurs and a confidence score is present,
  ".
  "                         the confidence score is copied to all parts.
  ".
  "                  stm -> STM format, change only the text field of the stm record
  ".
  "
  ";
  
  use Getopt::Long;
  my ($InFmt, $AllTanween) = (undef, 0);
  my $result = GetOptions ("i:s" => \$InFmt, "a" => \$AllTanween);
  die "Aborting:
  $Usage
  :" if (!$result);
  
  if (defined($InFmt)) {
      die("$Usage
  
  Error: Undefined input format '$InFmt'") 
  	if ($InFmt !~ /^(txt|ctm|stm)$/);
  } else {
      $InFmt = "txt";
  } 
  
  #### The main functions arguements:
  die "$Usage
  Too many arguements" if ($#ARGV > 1);
  die "$Usage
  Output Not Specified" if ($#ARGV == 0);
  die "$Usage
  Input and Output Not Specified" if ($#ARGV == -1);
  
  my $InFile=$ARGV[0];
  my $OutFile=$ARGV[1];
  die("$Usage
  Error: Input file $InFile does not exist
  ")
      if ($InFile ne "-" && ! -r $InFile);
  
  open(IN, "$InFile") || die "Unable to open trans-file $InFile";
  open(OUT, ">$OutFile") || die "Unable to open new-trans-file $OutFile";
  
  while (<IN>){
      chomp;
      if ($InFmt eq "txt"){
  	print OUT normalize($_)."
  ";
      } elsif ($InFmt eq "ctm"){
  	if ($_ =~ /^(\;\;|\#)/){
  	    print OUT $_."
  ";
  	    next;
  	}	     
  	s/^(\s+)//;
  	my $prefix = (defined($1) ? $1 : "");
  	my @ctm = split(/(\s+)/,$_);
  	$ctm[8] = normalize($ctm[8]);
  	print OUT $prefix.join("", @ctm)."
  ";
      } elsif ($InFmt eq "stm"){
  	if ($_ =~ /^(\;\;|\#)/){
  	    print OUT $_."
  ";
  	    next;
  	}	     
  	s/^(\s+)//;
  	my $prefix = (defined($1) ? $1 : "");
  	my @stm = split(/(\s+)/,$_, 7);
  	if ($stm[10] =~ /^<[^<>]*>$/){
  	    $stm[12] = normalize($stm[12]);
  	} else {
  	    $stm[10] .= join("",splice(@stm,11,2));
  	    $stm[10] = normalize($stm[10]);
  	}
  	print OUT $prefix.join("", @stm)."
  ";
      } else {
  	die "Error: unknown input format '$InFmt'
  $Usage
  ";
      }    
  }
  
  close IN; close OUT;
  exit 0;
  
  sub normalize{
      my ($text) = @_;
      $text = " ".$text;    
      if ($AllTanween){
  	$text =~ s/(\331\215|\331\214|\331\213)//g;
      } else {
  	$text =~ s/(\331\215|\331\214|\331\213)$//g;
  	$text =~ s/(\331\215|\331\214|\331\213)\)$/\)/g;
  	$text =~ s/(\331\215|\331\214|\331\213)\) /\) /g;
  	$text =~ s/(\331\215|\331\214|\331\213) / /g;
      }
      $text =~ s/^ //;
      $text;
  }