Blame view

egs/callhome_egyptian/s5/local/callhome_make_trans.pl 3.04 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
  #!/usr/bin/env perl
  
  use File::Basename;
  
  use strict;
  use warnings;

  # Usage: callhome_make_trans.pl <tmpdir>
  #
  # Reads <tmpdir>/callhome_train_transcripts.flist (one transcript path per
  # line) and writes three files into <tmpdir>:
  #   reco2file_and_channel  "<call>-A <call> A" / "<call>-B <call> B" lines,
  #                          piped through the external `sort` command
  #   text.1                 "<utt_id> <normalized words>" per transcript line
  #   spk2gendertmp          "<speaker_id> m" per transcript line (may repeat)
  #
  # Transcript lines are expected to look like:
  #   814.71 815.39 A: la ana baqu-

  # Applies the transcript mark-up clean-up rules to one utterance's text and
  # returns the cleaned string. The substitutions are order-dependent; do not
  # reorder them.
  sub normalize_words {
      my ($words) = @_;

      $words =~ s|\[\[[^]]*\]\]||g;             # removes [[...]] comments
      $words =~ s|\{laugh\}|\$laughter\$|g;     # protect laughter with a temporary token
      $words =~ s|\{[^}]*\}|\[noise\]|g;        # any other {...} becomes [noise]
      $words =~ s|\[/*([^]]*)\]|\[noise\]|g;    # [...] and [/...] become [noise]
      $words =~ s|\$laughter\$|\[laughter\]|g;  # restore laughter as [laughter]
      $words =~ s|\(\(([^)]*)\)\)|$1|g;         # ((...)) -> keep enclosed text
      $words =~ s|//([^/]*)//|$1|g;             # //...// -> keep enclosed text
      $words =~ s|\*\*([^\*]*)\*\*|$1|g;        # **...** -> keep enclosed text
      $words =~ s|#([^)]*)#|$1|g;               # #...#   -> keep enclosed text
      $words =~ s|<\?([^>]*)>|$1|g;             # <?...>  -> keep enclosed text
      $words =~ s|<\s*\S*\s*([^>]*)>|$1|g;      # <tag ...> -> drop first token, keep the rest
      $words =~ s|~||g;
      $words =~ s|\(\S\)||g;                    # drops a single parenthesised character
                                                # (original note: tEh marbUta "B")
      $words =~ s|&||g;                         # proper noun marker removed
      # A '%' inside a word is kept; a '%' that starts a token marks a
      # hesitation. Hide the in-word ones behind <ext> while the leading ones
      # are rewritten, then restore them.
      $words =~ s|(\S+)%(\S*)|$1<ext>$2|g;
      $words =~ s|%(\S*)|[hes]|g;
      $words =~ s|<ext>|%|g;
      $words =~ s|-*||g;                        # removes hyphens
      $words =~ s|\?||g;

      return $words;
  }

  my ($tmpdir) = @ARGV;
  defined $tmpdir or die "Usage: $0 <tmpdir>\n";

  my $trans = "$tmpdir/callhome_train_transcripts.flist";
  my $reco  = "$tmpdir/reco2file_and_channel";

  open(my $trans_fh, "<", $trans)
      or die "Can't open transcripts file $trans: $!";
  # List-form pipe: runs `sort -o $reco` without going through a shell, so
  # unusual characters in the path cannot be misinterpreted.
  open(my $reco_fh, "|-", "sort", "-o", $reco)
      or die "Can't open reco2file_and_channel file $!";
  open(my $text_fh, ">", "$tmpdir/text.1")
      or die "Can't open text file for writing: $!";
  open(my $gender_fh, ">", "$tmpdir/spk2gendertmp")
      or die "Can't open the speaker to gender map file: $!";

  while (my $file = <$trans_fh>) {
      # Two-argument open used to strip surrounding whitespace (including the
      # trailing newline) from the path; do the same explicitly.
      $file =~ s/^\s+|\s+$//g;
      $file =~ m:([^/]+)\.txt: or die "Bad filename $file\n";
      my $call_id = $1;

      print {$reco_fh} "$call_id-A $call_id A\n"
          or die "Error writing to reco2file_and_channel file: $!";
      print {$reco_fh} "$call_id-B $call_id B\n"
          or die "Error writing to reco2file_and_channel file: $!";

      open(my $in_fh, "<", $file) or die "Opening file $file: $!";

      # Now read each line and extract information
      while (my $line = <$in_fh>) {
          #814.71 815.39 A: la ana baqu-
          chomp $line;
          next if $line eq "";

          # NOTE(review): only the text between the first and second ':' is
          # used; anything after a second ':' is ignored, as before.
          my @stringComponents = split(/:/, $line);

          # Skip lines that have no text field. This must be checked before
          # the text field is touched, otherwise the check cannot fail.
          next if scalar(@stringComponents) < 2;

          my @timeInfo = split(" ", $stringComponents[0]);
          my $words    = $stringComponents[1];
          $words =~ s/^\s+|\s+$//g;

          # Times are converted to centiseconds and zero-padded to 6 digits.
          # NOTE(review): %d truncates rather than rounds; kept as-is because
          # the utterance IDs depend on it.
          my $start = sprintf("%06d", $timeInfo[0] * 100);
          my $end   = sprintf("%06d", $timeInfo[1] * 100);
          length($end) > 6 and die "Time too long $end in $file";

          # Side defaults to A unless the third field mentions B.
          my $side = "A";
          if (defined $timeInfo[2] && index($timeInfo[2], "B") != -1) {
              $side = "B";
          }

          my $utt_id     = "${call_id}-$side-$start-$end";
          my $speaker_id = "${call_id}-$side";

          # All speakers are treated as male because speaker gender info
          # is missing in this file
          my $gender = "m";
          print {$gender_fh} "$speaker_id $gender\n"
              or die "Error writing to speaker2gender file: $!";

          $words = normalize_words($words);
          #$words =~ s|(\S*)B|\1B=1|g;

          print {$text_fh} "$utt_id $words\n"
              or die "Error writing to text file: $!";
      }
      close($in_fh);
  }

  close($trans_fh);
  # Closing a pipe waits for the child; a false return means either an I/O
  # error ($!) or a non-zero exit status from sort ($?).
  close($reco_fh)
      or die "Error closing sort pipe for $reco: " . ($! ? $! : "exit status $?");
  close($text_fh)   or die "Error closing text file: $!";
  close($gender_fh) or die "Error closing speaker to gender map file: $!";