Blame view
egs/fisher_callhome_spanish/s5/local/callhome_make_trans.pl
3.12 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
#!/usr/bin/env perl
# Copyright 2014 Gaurav Kumar. Apache 2.0
#
# Usage: callhome_make_trans.pl <tmpdir>
#
# Reads  <tmpdir>/callhome_train_transcripts.flist, one transcript path per line.
# Writes <tmpdir>/callhome_reco2file_and_channel  sorted "<call>-<side> <call> <side>" lines
#        <tmpdir>/callhome.text.1                 "<utt-id> <normalized words>" lines
#        <tmpdir>/callhome_spk2gendertmp          "<speaker-id> m" lines, one per utterance
#
# Dies on a missing argument, an unopenable file, a file-list entry that does
# not look like "<call-id>.txt", an end time that needs more than six digits,
# or any write/close failure.

use strict;
use warnings;
use utf8;
use File::Basename;

# Normalize the text part of one transcript line.
#   $words - everything after the first ':' of the line (already decoded).
# Returns the lower-cased text with comments removed, noise/laughter markers
# mapped to [noise]/[laughter], punctuation stripped and whitespace squeezed.
sub normalize_words {
    my ($words) = @_;

    $words =~ s/^\s+|\s+$//g;

    $words =~ s|\[\[[^]]*\]\]||g;                # removes comments
    $words =~ s|\{laugh\}|\$laughter\$|g;        # replaces laughter tmp
    $words =~ s|\[laugh\]|\$laughter\$|g;        # replaces laughter tmp
    $words =~ s|\{[^}]*\}|\[noise\]|g;           # replaces noise
    $words =~ s|\[[^]]*\]|\[noise\]|g;           # replaces noise
    $words =~ s|\[/*([^]]*)\]|\[noise\]|g;       # replaces end of noise
    $words =~ s|\$laughter\$|\[laughter\]|g;     # replaces laughter again
    $words =~ s|\(\(([^)]*)\)\)|$1|g;            # replaces unintelligible speech
    $words =~ s|<\?([^>]*)>|$1|g;                # for unrecognized language
    $words =~ s|background speech|\[noise\]|g;
    $words =~ s|background noise|\[noise\]|g;

    # Hide the brackets of [noise]/[laughter] behind placeholder words so the
    # punctuation strip below does not remove them, then restore them.
    $words =~ s/\[/larrow/g;
    $words =~ s/\]/rarrow/g;
    $words =~ s/[[:punct:]]//g;
    $words =~ s/larrow/\[/g;
    $words =~ s/rarrow/\]/g;

    $words =~ s/[¿¡]//g;
    $words =~ s/\h+/ /g;                         # horizontal whitespace characters

    return lc($words);
}

@ARGV >= 1 or die "Usage: $0 <tmpdir>\n";
my ($tmpdir) = @ARGV;

my $trans = "$tmpdir/callhome_train_transcripts.flist";
my $reco  = "$tmpdir/callhome_reco2file_and_channel";

open(my $flist_fh, '<', $trans)
    or die "Can't open transcripts file $trans: $!";
# List-form pipe: sort(1) is run without a shell, so the path is never
# re-parsed; "-o" replaces the former ">$reco" redirection.
open(my $reco_fh, '|-', 'sort', '-o', $reco)
    or die "Can't open reco2file_and_channel file $!";
open(my $text_fh, '>:encoding(UTF-8)', "$tmpdir/callhome.text.1")
    or die "Can't open text file for writing: $!";
open(my $gender_fh, '>', "$tmpdir/callhome_spk2gendertmp")
    or die "Can't open the speaker to gender map file: $!";

while (my $file = <$flist_fh>) {
    # Three-argument open takes the name literally, so strip the line ending
    # (and stray blanks) that the old two-argument open used to ignore.
    $file =~ s/^\s+|\s+$//g;

    my ($call_id) = $file =~ m:([^/]+)\.txt:
        or die "Bad filename $file";

    print {$reco_fh} "${call_id}-A $call_id A\n", "${call_id}-B $call_id B\n"
        or die "Error writing to reco2file_and_channel file: $!";

    # The transcripts are read as Latin-1; decoding them here makes lc() and
    # the character classes in normalize_words() apply to accented letters.
    open(my $in_fh, '<:encoding(iso-8859-1)', $file)
        or die "Opening file $file: $!";

    # Now read each line and extract information
    while (my $line = <$in_fh>) {
        # e.g. 136.37 138.10 B: Ah, bueno, mamita.
        chomp $line;

        my ($header, $words) = split(/:/, $line, 2);
        # Lines without a ':' carry no utterance.  Test this before touching
        # $words: modifying a missing element would create it and defeat the check.
        next unless defined $words;

        my ($start_sec, $end_sec, $speaker) = split(' ', $header);

        # Times become zero-padded centiseconds.
        # NOTE(review): "%06d" truncates, so e.g. 0.29 * 100 yields 000028.
        # Kept as is because the utterance ids may be parsed by later steps
        # - confirm before switching to rounding.
        my $start = sprintf("%06d", ($start_sec // 0) * 100);
        my $end   = sprintf("%06d", ($end_sec   // 0) * 100);
        length($end) > 6 and die "Time too long $end in $file";

        my $side       = index($speaker // '', 'B') != -1 ? 'B' : 'A';
        my $utt_id     = "${call_id}-$side-$start-$end";
        my $speaker_id = "${call_id}-$side";

        # All speakers are treated as male because speaker gender info
        # is missing in this file
        my $gender = "m";
        print {$gender_fh} "$speaker_id $gender\n"
            or die "Error writing to speaker2gender file: $!";

        print {$text_fh} "$utt_id " . normalize_words($words) . "\n"
            or die "Error writing to text file: $!";
    }
    close($in_fh);
}

close($flist_fh);
# Buffered write errors and a failing sort(1) only show up at close time.
close($reco_fh)
    or die($! ? "Error closing pipe to sort: $!" : "sort exited with status $?");
close($text_fh)   or die "Error closing text file: $!";
close($gender_fh) or die "Error closing speaker to gender map file: $!";