Blame view
egs/sre08/v1/local/make_fisher.pl
3.28 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
#!/usr/bin/env perl
#
# Copyright 2013   Daniel Povey
# Apache 2.0.
#
# Usage: make_fisher.pl <tbl-file> <sph-list> <out-dir>
#
# Reads the call-data table <tbl-file> (comma-separated, one header line) and
# <sph-list> (one .sph pathname per line), and writes three files into the
# already-existing directory <out-dir>:
#   wav.scp     "<utt> sph2pipe -f wav -p -c <1|2> <sph-path> |"
#   utt2spk     "<utt> <spk>"
#   spk2gender  "<spk> <gender>"
# Every call that has audio yields two utterances, "<spkA>-<call-id>_A" and
# "<spkB>-<call-id>_B".  A summary, plus any audio file that has no entry in
# the table, is reported on STDERR.  Dies on unreadable inputs, unwritable
# outputs, unexpected .sph file names and malformed table lines.

use strict;
use warnings;
use File::Basename;

if (@ARGV != 3) {
  print STDERR "Usage: make_fisher.pl <tbl-file> <sph-list> <out-dir>\n" .
    "e.g.: make_fisher.pl /mnt/data/LDC2004T19/fe_03_p1_tran/doc/fe_03_p1_calldata.tbl " .
    "all_files.txt data/train_fisher\n";
  exit(1);  # do not carry on with undefined arguments
}

my ($tbl_file, $sph_list_file, $out_dir) = @ARGV;

open(my $tbl_fh, "<", $tbl_file) or die "cannot open $tbl_file: $!";
open(my $sph_list_fh, "<", $sph_list_file)
  or die "cannot open wav list $sph_list_file: $!";

open(my $gender_fh, ">", "$out_dir/spk2gender")
  or die "Could not open the output file $out_dir/spk2gender: $!";
open(my $utt2spk_fh, ">", "$out_dir/utt2spk")
  or die "Could not open the output file $out_dir/utt2spk: $!";
open(my $wav_fh, ">", "$out_dir/wav.scp")
  or die "Could not open the output file $out_dir/wav.scp: $!";

# Call ids whose audio is excluded from the output.
my %is_bad_audio = map { $_ => 1 } ("01243", "06716", "00446");

# Read the sph-list, this will give us the full pathnames and will help us
# exclude missing files.
my %wav_of;  # 5-digit call id -> full pathname of the .sph file
while (my $path = <$sph_list_fh>) {
  # chomp, not chop: the last line of the list may lack a trailing newline.
  chomp $path;
  # e.g. $path = /export/corpora3/LDC/LDC2004S13/fe_03_p1_sph1/audio/000/fe_03_00037.sph
  my $basename = basename($path);  # e.g. basename = fe_03_00037.sph
  $basename =~ m/^fe_\d\d_(\d\d\d\d\d)\.sph$/ or die "Unexpected filename $path";
  my $utt_id = $1;  # the 5-digit sequence
  next if $is_bad_audio{$utt_id};
  $wav_of{$utt_id} = $path;
}
close($sph_list_fh);

my $header = <$tbl_fh>;  # read (and ignore) the first line
# which is:
# CALL_ID,DATE_TIME,TOPICID,SIG_GRADE,CNV_GRADE,APIN,ASX.DL,APHNUM,APHSET,APHTYP,BPIN,BSX.DL,BPHNUM,BPHSET,BPHTYP
# Note: the numeric pin "APIN" is supposed to correspond to the speaker identity but it
# does not always, as some individuals shared the pin with others; as a result, some
# PINs are recorded as male and female in separate calls.  To get around this, we just
# regard each PIN as having two versions, a "male" and "female" version, so a PIN 12345
# will be mapped to two PINs, 12345f and 12345m, one or both of which may actually appear.

my $num_bad_files = 0;
my $num_good_files = 0;
my %seen_spk;  # speaker ids already written to spk2gender
my %used;      # call ids for which wav.scp entries were written

while (my $line = <$tbl_fh>) {
  # Limit -1 keeps trailing empty fields, so a final line without a newline
  # still yields all 15 columns.
  my @conv = split(/,/, $line, -1);
  @conv == 15 or die "Bad line $line";
  my $utt_id = $conv[0];
  my $gender_a = substr($conv[6], 0, 1);   # "m" or "f"
  my $spk_a = $conv[5] . $gender_a;        # $conv[5] is a numeric PIN (APIN)
  my $gender_b = substr($conv[11], 0, 1);  # "m" or "f"
  my $spk_b = $conv[10] . $gender_b;       # $conv[10] is BPIN

  if (!defined $wav_of{$utt_id}) {
    $num_bad_files++;
    # print STDERR "no wav file for $utt_id\n";
    next;
  }

  # we prepend the speaker-id to the utterance-id; this helps ensure
  # that if we sort by utterance, the resulting list has the utterances
  # from a single speaker as a block.
  # Side A is extracted with "-c 1" and side B with "-c 2".
  print $wav_fh "$spk_a-${utt_id}_A sph2pipe -f wav -p -c 1 $wav_of{$utt_id} |\n";
  print $wav_fh "$spk_b-${utt_id}_B sph2pipe -f wav -p -c 2 $wav_of{$utt_id} |\n";
  print $utt2spk_fh "$spk_a-${utt_id}_A $spk_a\n";
  print $utt2spk_fh "$spk_b-${utt_id}_B $spk_b\n";

  # Each speaker gets exactly one spk2gender line, at first appearance.
  for my $spk_and_gender ([$spk_a, $gender_a], [$spk_b, $gender_b]) {
    my ($spk, $gender) = @$spk_and_gender;
    next if defined $seen_spk{$spk};
    $seen_spk{$spk} = 1;
    print $gender_fh "$spk $gender\n";
  }

  $used{$utt_id} = 1;
  $num_good_files++;
}
close($tbl_fh);

# Buffered write errors only surface at close, so check every output file.
close($gender_fh) or die "Could not close the output file $out_dir/spk2gender: $!";
close($utt2spk_fh) or die "Could not close the output file $out_dir/utt2spk: $!";
close($wav_fh) or die "Could not close the output file $out_dir/wav.scp: $!";

# Report audio files that never appeared in the table (sorted, so the report
# is the same from run to run).
for my $utt_id (sort keys %wav_of) {
  next if $used{$utt_id};
  print STDERR "wav file $wav_of{$utt_id} had no corresponding demographic\n";
}

print STDERR "Processed $num_good_files utterances; $num_bad_files had missing wav data.\n";