fsp_make_trans.pl
2.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env perl
# Copyright 2014 Gaurav Kumar. Apache 2.0
use utf8;
use File::Basename;
use strict;
use warnings;

# Usage: fsp_make_trans.pl <tmpdir>        (e.g. ../data/local/tmp)
#
# Reads <tmpdir>/train_transcripts.flist, a list of .tdf transcript paths
# (one per line), and writes three files into <tmpdir>:
#   reco2file_and_channel  "<call>-A <call> A" / "<call>-B <call> B", sorted
#   text.1                 "<utt_id> <normalized words>", one line per row
#   spk2gendertmp          "<speaker_id> <f|m>", one line per row (so speaker
#                          ids repeat; this script does no de-duplication)

# Number of leading lines skipped in every .tdf file before rows are parsed.
use constant TDF_HEADER_LINES => 3;

# Rows with fewer tab-separated columns than this are silently ignored.
use constant TDF_MIN_FIELDS => 11;

# Map column 5 of a .tdf row to the one-letter gender code: "f" when the
# column is exactly the string "female", "m" for anything else.
sub gender_code {
    my ($sex) = @_;

    # String comparison is essential: with numeric "==" both sides numify
    # to 0 and every speaker would be labelled "f".
    return $sex eq 'female' ? 'f' : 'm';
}

# Normalize one transcript string (column 7 of a .tdf row) and return it:
# punctuation is removed except for the "<", ">" and "</" tag delimiters,
# a few stray diacritic/symbol characters are dropped, and the result is
# lower-cased.
sub normalize_words {
    my ($words) = @_;

    # Hide the tag delimiters behind alphabetic placeholders so that the
    # punctuation strip below does not delete them, then restore them.
    # "</" has to be hidden first, otherwise its "<" would be taken by the
    # plain "<" rule.
    $words =~ s{</}{lendarrow}g;
    $words =~ s/</larrow/g;
    $words =~ s/>/rarrow/g;
    $words =~ s/[[:punct:]]//g;
    $words =~ s/larrow/</g;
    $words =~ s/rarrow/>/g;
    $words =~ s{lendarrow}{</}g;

    # Explicit lower-casing of accented capitals.
    $words =~ s/Á/á/g;
    $words =~ s/Í/í/g;
    $words =~ s/Ó/ó/g;
    $words =~ s/Ú/ú/g;
    # Mappings that were present but disabled in the original script:
    # $words =~ s/ì/í/g;
    # $words =~ s/è/é/g;
    # $words =~ s/¡/i/g;
    # $words =~ s/J/J/g;
    # $words =~ s/S/S/g;
    # $words =~ s/à/á/g;

    # Drop stray diaeresis, middle-dot and acute-accent characters.
    $words =~ s/¨//g;
    $words =~ s/·//g;
    $words =~ s/´//g;

    # NOTE(review): redundant with lc() below as written; possibly meant to
    # be a different (non-ASCII) letter pair - confirm against upstream.
    $words =~ s/N/n/g;
    # $words =~ s/2//g;

    $words = lc $words;
    # $words =~ s:ü([eiéí]):w\1:g;
    # $words =~ s:ü:u:g;
    # $words =~ s:ñ:N:g;

    return $words;
}

# Convert every transcript listed in <tmpdir>/train_transcripts.flist.
#   $tmpdir - working directory holding the file list; outputs go here too.
# Dies on a missing argument, on any open/write/close failure, on a list
# entry that does not look like "<call_id>.tdf", and on an end time that
# does not fit in six digits.
sub main {
    my ($tmpdir) = @_;
    defined $tmpdir or die "Usage: $0 <tmpdir>\n";

    my $trans = "$tmpdir/train_transcripts.flist";
    my $reco  = "$tmpdir/reco2file_and_channel";

    open(my $flist_fh, '<', $trans)
        or die "Can't open transcripts file $trans: $!";
    # List-form pipe: no shell is involved, so $reco may safely contain
    # spaces or shell metacharacters. "sort -o FILE" replaces "sort >FILE".
    open(my $reco_fh, '|-', 'sort', '-o', $reco)
        or die "Can't open reco2file_and_channel file $!";
    open(my $text_fh, '>:utf8', "$tmpdir/text.1")
        or die "Can't open text file for writing: $!";
    open(my $gender_fh, '>', "$tmpdir/spk2gendertmp")
        or die "Can't open the speaker to gender map file: $!";

    while (my $file = <$flist_fh>) {
        # The old two-argument open() stripped surrounding whitespace
        # (including the newline) from the name; do the same explicitly.
        $file =~ s/^\s+|\s+$//g;

        my ($call_id) = $file =~ m{([^/]+)\.tdf}
            or die "Bad filename $file";

        print {$reco_fh} "$call_id-A $call_id A\n", "$call_id-B $call_id B\n"
            or die "Error writing to $reco: $!";

        open(my $tdf_fh, '<:utf8', $file)
            or die "Opening file $file: $!";

        # Get rid of header sections first.
        scalar readline($tdf_fh) for 1 .. TDF_HEADER_LINES;

        # Now read each row and extract information. Example row
        # (tab-separated):
        # 20051017_215732_274_fsp.sph 1 0.0 0.909856781803 Audrey female native <foreign lang="English"> Audrey </foreign> 0 0 -1
        while (my $row = <$tdf_fh>) {
            chomp $row;
            my @fields = split /\t/, $row;
            next if scalar(@fields) < TDF_MIN_FIELDS;

            # Columns 2 and 3 are scaled by 100 and truncated to integers
            # (presumably seconds -> centiseconds; confirm with the corpus
            # documentation).
            my $start = sprintf('%06d', $fields[2] * 100);
            my $end   = sprintf('%06d', $fields[3] * 100);
            die "Time too long $end in $file" if length($end) > 6;

            # A true value in column 1 selects side B, otherwise side A.
            my $side       = $fields[1] ? 'B' : 'A';
            my $speaker_id = "${call_id}-$side";
            my $utt_id     = "$speaker_id-$start-$end";
            my $gender     = gender_code($fields[5]);
            my $words      = normalize_words($fields[7]);

            # "or" (not "||") so that the check applies to print itself.
            print {$gender_fh} "$speaker_id $gender\n"
                or die "Error writing to speaker2gender file: $!";
            print {$text_fh} "$utt_id $words\n"
                or die "Error writing to text file: $!";
        }
        close($tdf_fh);
    }
    close($flist_fh);

    # Buffered write errors and a failing sort only surface at close time.
    close($reco_fh)
        or die "sort failed for $reco: "
        . ($! ? "$!" : 'exit status ' . ($? >> 8));
    close($text_fh)   or die "Error closing text file: $!";
    close($gender_fh) or die "Error closing speaker2gender file: $!";

    return;
}

# Run only when executed as a script, so the helpers above can be loaded
# with require/do without side effects.
main(@ARGV) unless caller;