Blame view

tools/sctk-2.4.10/src/hamzaNorm/hamzaNorm.pl 2.64 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
  #!/usr/bin/env perl
  
  use strict;
  
  my $Version="1.0";

  #####
  #  Version 1.0  Released August 25, 2004
  #        - Initial Release

  # Help text printed (through die) whenever the command line is rejected.
  # Each list entry is exactly one output line; the line break is appended
  # explicitly as "\n" so the indentation of this source file can never leak
  # into the message.  The final empty entry produces the trailing blank line.
  my @UsageLines = (
      "Usage: hamzaNorm.pl [ -i fmt ] Infile|- OutFile|-",
      "Version: $Version",
      "Desc: hamzaNorm normalizes the hamza in the initial position of the word to be a",
      "      consistent form.  In the Buckwalter normalization scheme, the initial letters",
      "      'A', '<', '>' and '|' are all translated to 'A'",
      "Options:",
      "      -i fmt   Set the input file format to 'fmt'.  The possible choices are:",
      "                  txt -> plain text, the default",
      "                  ctm -> CTM format, ignores all but the 5th column, and if",
      "                         a division occurs and a confidence score is present,",
      "                         the confidence score is copied to all parts.",
      "                  stm -> STM format, change only the text field of the stm record",
      "",
  );
  my $Usage = join("", map { "$_\n" } @UsageLines);
  
  use Getopt::Long;
  # Parse the command-line switches.  GetOptions removes the options it
  # recognizes from @ARGV and returns false when the command line is malformed.
  my ($InFmt) = undef;
  my $result = GetOptions ("i:s" => \$InFmt);
  die "Aborting:\n$Usage\n:" if (!$result);

  # -i is optional: without it the input is treated as plain text.  When it is
  # given, the value must be exactly one of the three supported format names
  # (\A...\z so that a value with a trailing newline is rejected too).
  if (!defined($InFmt)) {
      $InFmt = "txt";
  } elsif ($InFmt !~ /\A(?:txt|ctm|stm)\z/) {
      die("$Usage\n\nError: Undefined input format '$InFmt'");
  }
  
  #### The main function's arguments:
  # Exactly two positional arguments must be present at this point: the input
  # file followed by the output file.
  die "$Usage\nToo many arguments" if (@ARGV > 2);
  die "$Usage\nOutput Not Specified" if (@ARGV == 1);
  die "$Usage\nInput and Output Not Specified" if (@ARGV == 0);

  my $InFile=$ARGV[0];
  my $OutFile=$ARGV[1];
  # An input name of "-" is not a file on disk, so it is exempt from the
  # readability test; any other name must be readable before work starts.
  die("$Usage\nError: Input file $InFile does not exist\n")
      if ($InFile ne "-" && ! -r $InFile);
  
  # Open the two streams.  A file name of "-" selects STDIN (input) or STDOUT
  # (output).  Named files go through three-argument open() so that characters
  # such as '<', '>' or '|' in a name are never interpreted as an open mode.
  my ($InFH, $OutFH);
  if ($InFile eq "-") {
      $InFH = \*STDIN;
  } else {
      open($InFH, "<", $InFile)
          || die "Unable to open trans-file $InFile: $!";
  }
  if ($OutFile eq "-") {
      $OutFH = \*STDOUT;
  } else {
      open($OutFH, ">", $OutFile)
          || die "Unable to open new-trans-file $OutFile: $!";
  }

  # Copy the input to the output one record at a time, normalizing only the
  # part of each record that holds transcript text for the selected format.
  while (defined(my $line = <$InFH>)) {
      chomp($line);

      if ($InFmt eq "txt") {
          # Plain text: the whole line is transcript text.
          print {$OutFH} normalize($line)."\n";
      } elsif ($InFmt eq "ctm" || $InFmt eq "stm") {
          # Comment lines (starting with ";;" or "#") pass through untouched.
          if ($line =~ /^(?:;;|\#)/) {
              print {$OutFH} $line."\n";
              next;
          }

          # Peel off leading whitespace so it can be restored verbatim.  $1 is
          # consulted only when the substitution really matched; after a failed
          # match it would still hold the capture of an earlier match.
          my $prefix = ($line =~ s/^(\s+)//) ? $1 : "";

          # Splitting on a captured separator keeps the whitespace runs in the
          # list: fields sit at even indexes, separators at odd indexes, and
          # join("") rebuilds the record with its original spacing.
          my @fields;
          if ($InFmt eq "ctm") {
              @fields = split(/(\s+)/, $line);
              # The 5th column (index 8) is the word; shorter records are
              # copied through unchanged.
              $fields[8] = normalize($fields[8]) if ($#fields >= 8);
          } else {
              # At most 7 fields, so the 7th (index 12) is the rest of the line.
              @fields = split(/(\s+)/, $line, 7);
              if ($#fields >= 10) {
                  if ($fields[10] =~ /^<[^<>]*>$/) {
                      # 6th field is a <...> label: the text is the 7th field.
                      $fields[12] = normalize($fields[12]) if ($#fields >= 12);
                  } else {
                      # No label: the text starts at the 6th field, so fold the
                      # following separator and remainder back into it first.
                      $fields[10] .= join("", splice(@fields, 11, 2));
                      $fields[10] = normalize($fields[10]);
                  }
              }
          }
          print {$OutFH} $prefix.join("", @fields)."\n";
      } else {
          die "Error: unknown input format '$InFmt'\n$Usage\n";
      }
  }

  close($InFH);
  # Buffered write errors (for example a full disk) only surface on close.
  close($OutFH) || die "Error closing new-trans-file $OutFile: $!";
  exit 0;
  
  # normalize($text)
  #
  # Returns a copy of $text in which every word-initial alef carrying a madda
  # or hamza is replaced by a bare alef.  A word is considered to start at the
  # beginning of the string or right after any whitespace character (space,
  # tab, ...); the same letters elsewhere in a word are left alone.
  #
  # The text is matched as UTF-8 encoded bytes:
  #     \330\242  U+0622 ALEF WITH MADDA ABOVE
  #     \330\243  U+0623 ALEF WITH HAMZA ABOVE
  #     \330\245  U+0625 ALEF WITH HAMZA BELOW
  #     \330\247  U+0627 ALEF (the replacement)
  #
  # An undefined argument is treated as the empty string.
  sub normalize{
      my ($text) = @_;
      $text = "" if (!defined($text));
      # (?<!\S) holds at the start of the string and after whitespace, so no
      # sentinel space has to be added and stripped again.
      $text =~ s/(?<!\S)\330[\242\243\245]/\330\247/g;
      return $text;
  }