make_fisher.pl
3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env perl
use File::Basename;
# Copyright 2013 Daniel Povey
# Apache 2.0.
use strict;
use warnings;

# Builds three data files for the Fisher corpus inside <out-dir>:
#   wav.scp     - one sph2pipe command line per conversation side (A and B)
#   utt2spk     - maps each conversation side to its speaker-id
#   spk2gender  - maps each speaker-id to "m" or "f"
#
# Arguments:
#   <tbl-file>  comma-separated call table; the first line is a header and
#               every following line must have exactly 15 fields
#   <sph-list>  text file with one .sph pathname per line
#   <out-dir>   existing directory that receives the three output files
#
# Dies on unreadable inputs, unwritable outputs, unexpected .sph filenames,
# or malformed table lines.  Exits with status 1 on a usage error.

if (@ARGV != 3) {
    print STDERR "Usage: make_fisher.pl <tbl-file> <sph-list> <out-dir>\n" .
        "e.g.: make_fisher.pl /mnt/data/LDC2004T19/fe_03_p1_tran/doc/fe_03_p1_calldata.tbl " .
        "all_files.txt data/train_fisher\n";
    # Without this exit the script would carry on with undefined arguments.
    exit(1);
}

my ($tbl_file, $sph_list_file, $out_dir) = @ARGV;

# Three-argument open with lexical handles: the mode can never be taken
# from the (user-supplied) filename.
open(my $tbl_fh, "<", $tbl_file)
    or die "cannot open $tbl_file: $!";
open(my $sph_list_fh, "<", $sph_list_file)
    or die "cannot open wav list $sph_list_file: $!";
open(my $gender_fh, ">", "$out_dir/spk2gender")
    or die "Could not open the output file $out_dir/spk2gender: $!";
open(my $utt2spk_fh, ">", "$out_dir/utt2spk")
    or die "Could not open the output file $out_dir/utt2spk: $!";
open(my $wav_fh, ">", "$out_dir/wav.scp")
    or die "Could not open the output file $out_dir/wav.scp: $!";

# Conversations whose audio must be excluded, keyed by 5-digit call id.
my %is_bad_audio = map { $_ => 1 } ("01243", "06716", "00446");

# Read the sph-list; this gives us the full pathnames and lets us
# exclude missing files.
my %wav;    # 5-digit call id -> full pathname of the .sph file
while (my $path = <$sph_list_fh>) {
    # chomp (not chop) so that a final line lacking a newline is not truncated.
    chomp $path;    # e.g. /export/corpora3/LDC/LDC2004S13/fe_03_p1_sph1/audio/000/fe_03_00037.sph
    my $basename = basename($path);    # e.g. fe_03_00037.sph
    $basename =~ m/^fe_\d\d_(\d\d\d\d\d)\.sph$/
        or die "Unexpected filename $path";
    my $utt_id = $1;    # the 5-digit sequence
    next if $is_bad_audio{$utt_id};
    $wav{$utt_id} = $path;
}
close($sph_list_fh);

# Read and discard the first line of the table, which is the header:
# CALL_ID,DATE_TIME,TOPICID,SIG_GRADE,CNV_GRADE,APIN,ASX.DL,APHNUM,APHSET,APHTYP,BPIN,BSX.DL,BPHNUM,BPHSET,BPHTYP
my $header = <$tbl_fh>;

# Note: the numeric PIN "APIN" is supposed to correspond to the speaker
# identity, but it does not always, as some individuals shared the PIN with
# others; as a result, some PINs are recorded as male and female in separate
# calls.  To get around this, we regard each PIN as having two versions, a
# "male" and a "female" one, so PIN 12345 is mapped to two speaker-ids,
# 12345f and 12345m, one or both of which may actually appear.
my %seen_spk;    # speaker-ids already written to spk2gender
my %used;        # call ids that had both audio and a table entry
my $num_bad_files  = 0;
my $num_good_files = 0;

while (my $line = <$tbl_fh>) {
    # The line is deliberately not chomped: the trailing newline stays on
    # the last field, so an empty final column is still counted by split.
    my @conv = split(",", $line);
    @conv == 15 or die "Bad line $line";

    my $utt_id  = $conv[0];
    my $genderA = substr($conv[6], 0, 1);     # "m" or "f"
    my $spkidA  = $conv[5] . $genderA;        # $conv[5] is a numeric PIN (APIN)
    my $genderB = substr($conv[11], 0, 1);    # "m" or "f"
    my $spkidB  = $conv[10] . $genderB;       # $conv[10] is a numeric PIN (BPIN)

    if (!defined $wav{$utt_id}) {
        $num_bad_files++;
        # print STDERR "no wav file for $utt_id\n";
        next;
    }

    # We prepend the speaker-id to the utterance-id; this helps ensure that
    # if we sort by utterance, the resulting list has the utterances from a
    # single speaker as a block.
    my $uttA = "$spkidA-${utt_id}_A";
    my $uttB = "$spkidB-${utt_id}_B";

    # Side A is taken from channel 1 of the file, side B from channel 2.
    print $wav_fh "$uttA sph2pipe -f wav -p -c 1 $wav{$utt_id} |\n";
    print $wav_fh "$uttB sph2pipe -f wav -p -c 2 $wav{$utt_id} |\n";
    print $utt2spk_fh "$uttA $spkidA\n";
    print $utt2spk_fh "$uttB $spkidB\n";

    # Emit each speaker's gender only once.
    for my $spk ([$spkidA, $genderA], [$spkidB, $genderB]) {
        my ($spkid, $gender) = @$spk;
        next if defined $seen_spk{$spkid};
        $seen_spk{$spkid} = 1;
        print $gender_fh "$spkid $gender\n";
    }

    $used{$utt_id} = 1;
    $num_good_files++;
}
close($tbl_fh);

# Buffered write errors (e.g. disk full) only surface at close time.
close($gender_fh)
    or die "Could not close the output file $out_dir/spk2gender: $!";
close($utt2spk_fh)
    or die "Could not close the output file $out_dir/utt2spk: $!";
close($wav_fh)
    or die "Could not close the output file $out_dir/wav.scp: $!";

# Report audio files that never appeared in the table (sorted so the
# diagnostics come out in a reproducible order).
for my $utt_id (sort keys %wav) {
    next if $used{$utt_id};
    print STDERR "wav file $wav{$utt_id} had no corresponding demographic\n";
}

print STDERR "Processed $num_good_files utterances; $num_bad_files had missing wav data.\n";