Blame view
egs/callhome_egyptian/s5/local/callhome_make_trans.pl
3.04 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
#!/usr/bin/env perl
#
# Usage: callhome_make_trans.pl <tmpdir>
#
# Reads the list of transcript files in
#   <tmpdir>/callhome_train_transcripts.flist   (one *.txt path per line)
# and writes:
#   <tmpdir>/reco2file_and_channel  "<call>-<side> <call> <side>", piped
#                                   through the external `sort` command
#   <tmpdir>/text.1                 "<utt_id> <cleaned words>" per utterance
#   <tmpdir>/spk2gendertmp          "<speaker_id> m" per utterance (the same
#                                   speaker is written once per utterance)
#
# Each transcript line is expected to look like
#   814.71 815.39 A: la ana baqu-
# i.e. "<start> <end> <speaker>: <words>".  Lines without a colon are skipped.
# Dies on an unusable file name, an unreadable file, an end time that does
# not fit in six digits, or any failed write/close.

use strict;
use warnings;
use File::Basename;

my ($tmpdir) = @ARGV;
defined $tmpdir or die "Usage: $0 <tmpdir>\n";

my $trans = "$tmpdir/callhome_train_transcripts.flist";
my $reco  = "$tmpdir/reco2file_and_channel";

open(my $trans_fh, '<', $trans)
    or die "Can't open transcripts file $trans: $!";
# The command string still goes through the shell so that the redirection
# into $reco works exactly as before.
open(my $reco_fh, '|-', "sort >$reco")
    or die "Can't open reco2file_and_channel file $!";
open(my $text_fh, '>', "$tmpdir/text.1")
    or die "Can't open text file for writing: $!";
open(my $gender_fh, '>', "$tmpdir/spk2gendertmp")
    or die "Can't open the speaker to gender map file: $!";

while (my $file = <$trans_fh>) {
    # Three-argument open does not strip the trailing newline (or other
    # surrounding whitespace) the way two-argument open did, so do it here.
    $file =~ s/^\s+|\s+$//g;

    # The call id is the base name of the transcript without ".txt".
    $file =~ m:([^/]+)\.txt: or die "Bad filename $file";
    my $call_id = $1;

    print {$reco_fh} "${call_id}-A $call_id A\n"
        or die "Error writing to reco2file_and_channel file: $!";
    print {$reco_fh} "${call_id}-B $call_id B\n"
        or die "Error writing to reco2file_and_channel file: $!";

    open(my $in_fh, '<', $file) or die "Opening file $file: $!";

    # Now read each line and extract information.
    while (my $line = <$in_fh>) {
        # 814.71 815.39 A: la ana baqu-
        chomp $line;
        next if $line eq "";

        # NOTE(review): only the text between the first and second colon is
        # used; anything after a second colon is dropped, as before.
        my @fields = split(/:/, $line);
        next if @fields < 2;

        my ($start_sec, $end_sec, $speaker) = split(" ", $fields[0]);
        (my $words = $fields[1]) =~ s/^\s+|\s+$//g;

        # Times become zero-padded centiseconds.  %d truncates rather than
        # rounds; left as is because the utterance ids presumably have to
        # match ids produced elsewhere -- TODO confirm.
        my $start = sprintf("%06d", $start_sec * 100);
        my $end   = sprintf("%06d", $end_sec * 100);
        length($end) > 6 and die "Time too long $end in $file";

        # Any speaker label containing "B" is side B; everything else is A.
        my $side = (defined $speaker && index($speaker, "B") != -1) ? "B" : "A";

        my $utt_id     = "${call_id}-$side-$start-$end";
        my $speaker_id = "${call_id}-$side";

        # All speakers are treated as male because speaker gender info
        # is missing in this file.
        my $gender = "m";
        print {$gender_fh} "$speaker_id $gender\n"
            or die "Error writing to speaker2gender file: $!";

        print {$text_fh} "$utt_id " . normalize_words($words) . "\n"
            or die "Error writing to text file: $!";
    }
    close($in_fh);
}

close($trans_fh);
# Buffered write errors and a failing `sort` only surface at close time.
close($reco_fh)
    or die "Error closing pipe to sort for $reco: $! (exit status $?)";
close($text_fh)   or die "Error closing text file: $!";
close($gender_fh) or die "Error closing speaker to gender map file: $!";

# Strips or rewrites the transcript markup in one utterance and returns the
# cleaned text.  The substitutions are order-dependent; do not reorder them.
sub normalize_words {
    my ($words) = @_;

    $words =~ s|\[\[[^]]*\]\]||g;               # removes comments
    $words =~ s|\{laugh\}|\$laughter\$|g;       # protects laughter (temporary tag)
    $words =~ s|\{[^}]*\}|\[noise\]|g;          # replaces noise
    $words =~ s|\[/*([^]]*)\]|\[noise\]|g;      # replaces end of noise
    $words =~ s|\$laughter\$|\[laughter\]|g;    # restores laughter
    $words =~ s|\(\(([^)]*)\)\)|$1|g;           # unintelligible speech: keep text
    $words =~ s|//([^/]*)//|$1|g;               # aside speech: keep text
    $words =~ s|\*\*([^\*]*)\*\*|$1|g;          # aside speech: keep text
    $words =~ s|#([^)]*)#|$1|g;                 # unintelligible speech: keep text
    $words =~ s|<\?([^>]*)>|$1|g;               # unrecognized language
    $words =~ s|<\s*\S*\s*([^>]*)>|$1|g;        # foreign text: drop the tag word
    $words =~ s|~||g;
    $words =~ s|\(\S\)||g;                      # tEh marbUta "B"
    $words =~ s|&||g;                           # proper noun marker removed
    # A "%" inside a word is kept: hide it behind <ext> while word-initial
    # "%..." hesitations are replaced, then put it back.
    $words =~ s|(\S+)%(\S*)|${1}<ext>${2}|g;
    $words =~ s|%(\S*)|[hes]|g;                 # hesitations
    $words =~ s|<ext>|%|g;
    $words =~ s|-+||g;                          # removes hyphens
    $words =~ s|\?||g;
    #$words =~ s|(\S*)B|\1B=1|g;

    return $words;
}