Blame view

egs/sre08/v1/local/make_fisher.pl 3.28 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
  #!/usr/bin/env perl
  
  use File::Basename;
  
  # Copyright 2013   Daniel Povey
  # Apache 2.0.
  
  
  use strict;
  use warnings;

  # Reads the Fisher call-data table and a list of .sph files, and writes
  # wav.scp, utt2spk and spk2gender into <out-dir>.
  #
  # Arguments (all three are required):
  #   <tbl-file>  comma-separated call table: one header line, then rows of
  #               exactly 15 fields each.
  #   <sph-list>  text file with one .sph path per line; the basename must
  #               look like fe_NN_NNNNN.sph.
  #   <out-dir>   directory that receives the three output files (it is not
  #               created here, so it must already exist).
  #
  # Exits with status 1 on wrong usage; dies if an input cannot be read, an
  # output cannot be written, or a line of either input is malformed.
  if (@ARGV != 3) {
    print STDERR "Usage: make_fisher.pl <tbl-file> <sph-list> <out-dir>\n" .
      "e.g.: make_fisher.pl /mnt/data/LDC2004T19/fe_03_p1_tran/doc/fe_03_p1_calldata.tbl " .
      "all_files.txt data/train_fisher\n";
    # Without this the script would carry on with undefined arguments.
    exit(1);
  }

  my ($tbl_file, $sph_list_file, $out_dir) = @ARGV;

  # Three-argument open keeps odd characters in a path from being read as an
  # open mode; $! says why the open failed.
  open(my $tbl_fh, "<", $tbl_file)
    or die "cannot open $tbl_file: $!";
  open(my $sph_list_fh, "<", $sph_list_file)
    or die "cannot open wav list $sph_list_file: $!";

  open(my $gender_fh, ">", "$out_dir/spk2gender")
    or die "Could not open the output file $out_dir/spk2gender: $!";
  open(my $utt2spk_fh, ">", "$out_dir/utt2spk")
    or die "Could not open the output file $out_dir/utt2spk: $!";
  open(my $wav_fh, ">", "$out_dir/wav.scp")
    or die "Could not open the output file $out_dir/wav.scp: $!";

  # Call ids whose audio is excluded from the output.
  my %is_bad_audio = map { $_ => 1 } ("01243", "06716", "00446");

  # Read the sph-list, this will give us the full pathnames and will help us
  # exclude missing files.
  my %wav;  # call id => full path of its .sph file
  while (my $sph_path = <$sph_list_fh>) {
    # chomp, not chop: a final line without a trailing newline must keep its
    # last character.
    chomp $sph_path;  # e.g. /export/corpora3/LDC/LDC2004S13/fe_03_p1_sph1/audio/000/fe_03_00037.sph
    my $basename = basename($sph_path);  # e.g. fe_03_00037.sph
    $basename =~ m/^fe_\d\d_(\d\d\d\d\d)\.sph$/
      or die "Unexpected filename $sph_path";
    my $utt_id = $1;  # the 5-digit sequence
    next if $is_bad_audio{$utt_id};
    $wav{$utt_id} = $sph_path;
  }
  close($sph_list_fh);

  # The first line of the table is a header and is discarded. It reads:
  # CALL_ID,DATE_TIME,TOPICID,SIG_GRADE,CNV_GRADE,APIN,ASX.DL,APHNUM,APHSET,APHTYP,BPIN,BSX.DL,BPHNUM,BPHSET,BPHTYP
  # Note: the numeric pin "APIN" is supposed to correspond to the speaker identity but it
  # does not always, as some individuals shared the pin with others; as a result, some
  # PINs are recorded as male and female in separate calls.  To get around this, we just
  # regard each PIN as having two versions, a "male" and "female" version, so a PIN 12345
  # will be mapped to two PINs, 12345f and 12345m, one or both of which may actually appear.
  my $header = <$tbl_fh>;

  my $num_bad_files = 0;
  my $num_good_files = 0;
  my %seen_spk;  # speaker ids already written to spk2gender
  my %used;      # call ids that had a row in the table

  while (my $line = <$tbl_fh>) {
    my @conv = split(",", $line);
    @conv == 15 or die "Bad line $line";
    my $utt_id = $conv[0];
    my $genderA = substr($conv[6], 0, 1);   # "m" or "f"
    my $spkidA = $conv[5] . $genderA;       # $conv[5] is a numeric PIN (APIN)
    my $genderB = substr($conv[11], 0, 1);  # "m" or "f"
    my $spkidB = $conv[10] . $genderB;      # $conv[10] is BPIN

    if (!defined $wav{$utt_id}) {
      $num_bad_files++;
      # print STDERR "no wav file for $utt_id\n";
      next;
    }

    # we prepend the speaker-id to the utterance-id; this helps ensure
    # that if we sort by utterance, the resulting list has the utterances
    # from a single speaker as a block.
    # Side A is extracted with "-c 1" and side B with "-c 2".
    print $wav_fh "$spkidA-$utt_id", "_A sph2pipe -f wav -p -c 1 $wav{$utt_id} |\n";
    print $wav_fh "$spkidB-$utt_id", "_B sph2pipe -f wav -p -c 2 $wav{$utt_id} |\n";
    print $utt2spk_fh "$spkidA-$utt_id", "_A $spkidA\n";
    print $utt2spk_fh "$spkidB-$utt_id", "_B $spkidB\n";

    # Each speaker id is written to spk2gender only once.
    if (!defined $seen_spk{$spkidA}) {
      $seen_spk{$spkidA} = 1;
      print $gender_fh "$spkidA $genderA\n";
    }
    if (!defined $seen_spk{$spkidB}) {
      $seen_spk{$spkidB} = 1;
      print $gender_fh "$spkidB $genderB\n";
    }
    $used{$utt_id} = 1;
    $num_good_files++;
  }
  close($tbl_fh);

  # Buffered write errors (e.g. a full disk) only surface at close time, so
  # every output handle is checked.
  close($wav_fh) or die "Could not close the output file $out_dir/wav.scp: $!";
  close($utt2spk_fh) or die "Could not close the output file $out_dir/utt2spk: $!";
  close($gender_fh) or die "Could not close the output file $out_dir/spk2gender: $!";

  # Report audio files that never appeared in the table; sorted so the
  # diagnostics come out in a stable order.
  for my $utt_id (sort keys %wav) {
    if (!$used{$utt_id}) {
      print STDERR "wav file $wav{$utt_id} had no corresponding demographic\n";
    }
  }

  print STDERR "Processed $num_good_files utterances; $num_bad_files had missing wav data.\n";