Blame view

egs/lre/v1/local/make_sre_2008_train.pl 3.81 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
  #!/usr/bin/env perl
  #
  # Copyright 2013-2014 Daniel Povey
  #                2014 David Snyder
  # Apache 2.0.
  # Usage: make_sre_2008_train.pl <path to LDC2011S05> <Path to root level output dir>
  
  # Exactly two positional arguments are required: the root of the LDC2011S05
  # distribution and the root of the output directory.  With any other count,
  # print a usage summary to STDERR and exit with status 1.
  if (@ARGV != 2) {
    print STDERR "Usage: $0 <path-to-LDC2011S05> <path-to-output>\n";
    print STDERR "e.g. $0 /export/corpora5/LDC/LDC2011S05 data\n";
    exit(1);
  }
  
  # Positional arguments: root of the corpus and root of the output tree.
  ($db_base, $out_base_dir) = @ARGV;

  # Scratch directory that holds the list of .sph files found in the corpus.
  $tmp_dir = "$out_base_dir/tmp";
  # List-form system() bypasses the shell, so a path containing spaces or shell
  # metacharacters reaches mkdir as a single argument.
  if (system("mkdir", "-p", $tmp_dir) != 0) {
    die "Error making directory $tmp_dir";
  }

  # Write the path of every *.sph file below $db_base to $tmp_dir/sph.list.
  # find is started without a shell (list-form pipe open) and its output is
  # copied by Perl, so neither path is ever parsed by a shell.
  {
    open(my $find_out, "-|", "find", $db_base, "-name", "*.sph")
      or die "Error getting list of sph files: cannot run find: $!";
    open(my $sph_list, ">", "$tmp_dir/sph.list")
      or die "Error getting list of sph files: cannot write $tmp_dir/sph.list: $!";
    while (my $path = <$find_out>) {
      print {$sph_list} $path;
    }
    close($sph_list)
      or die "Error getting list of sph files: cannot write $tmp_dir/sph.list: $!";
    # close() on a pipe returns false when the child exited with a non-zero
    # status, which mirrors the exit-status check of the shell pipeline.
    close($find_out)
      or die "Error getting list of sph files";
  }
  
  # Build %wav from $tmp_dir/sph.list: the key is the file name with its
  # directory and ".sph" suffix removed, the value is the full path as listed.
  # If two listed files share a name, the one listed last wins.
  open(my $wavlist, "<", "$tmp_dir/sph.list") or die "cannot open wav list: $!";

  while (my $sph_path = <$wavlist>) {
    chomp($sph_path);
    # Last "/"-separated component of the path.
    my $file_name = (split("/", $sph_path))[-1];
    my $utt_name = $file_name;
    # Every listed entry is expected to end in ".sph"; anything else is fatal.
    $utt_name =~ s/\.sph$// || die "bad basename $file_name";
    $wav{$utt_name} = $sph_path;
  }

  close($wavlist) or die "error reading wav list: $!";
  
  # Fill %speechtype and %lang from the comma-separated key file(s) below the
  # corpus root.  Per row, field 0 is the segment file name, field 1 the
  # language and field 2 the speech type.
  my @key_files = ("$db_base/docs/NIST_SRE08_header_info.all.train.csv");

  for my $key_file (@key_files) {
    open(SEGKEY, "<", $key_file) or die "Cannot open $key_file";
    # The first line is read and thrown away (presumably the column header).
    my $discarded = <SEGKEY>;
    while (my $row = <SEGKEY>) {
      my @fields = split(",", $row);
      my ($seg_name, $seg_lang, $seg_type) = @fields[0 .. 2];
      $seg_name =~ s/\.sph$//;

      $speechtype{$seg_name} = $seg_type;
      # The same language is stored under both channel numbers; the
      # two-part key is joined with $; by Perl.
      for my $chan (1, 2) {
        $lang{$chan, $seg_name} = $seg_lang;
      }
    }
    close(SEGKEY);
  }
  
  # For every gender/condition pair, read $db_base/data/train/<gender>/<case>.trn
  # and write a data directory $out_base_dir/sre08_train_<case>_<gender> holding
  # spk2gender, utt2spk, wav.scp, utt2lang and spk2utt.  Each .trn line is
  # "<speaker> <file>:<side>[,<file>:<side>...]" with side "A" or "B".
  # Dies on any I/O failure, on an unknown side, or if validation fails.
  my %channel_of_side = ("A" => 1, "B" => 2);

  foreach my $gender ("male", "female") {
    my $g = substr($gender, 0, 1);    # "m" / "f", the value written to spk2gender
    foreach my $case ("10sec", "3conv", "8conv", "short2") {
      my $out_dir = "$out_base_dir/sre08_train_${case}_${gender}";
      if (! -d $out_dir) {
        mkdir($out_dir) or die "Could not create directory $out_dir: $!";
      }
      my $casefile = $db_base."/data/train/".$gender."/".$case.".trn";
      open(my $trn_in, "<", $casefile) or die "cannot open $casefile";
      open(my $gndr_out, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender";
      open(my $spkr_out, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
      open(my $wav_out, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
      open(my $lang_out, ">", "$out_dir/utt2lang") or die "Could not open $out_dir/utt2lang";
      #open(CHAN,">","$out_dir/utt2chan") or die "Could not open $out_dir/utt2chan";
      while (my $line = <$trn_in>) {
        chomp($line);
        my @A = split(" ", $line);
        # A blank line carries no speaker; skip it rather than emit an
        # empty spk2gender entry.
        next unless @A;
        # The suffixed speaker id is fixed before the segment loop so that
        # spk2gender always uses the same id as utt2spk.
        my $spkr = "$A[0]_sre08";
        my @wav_list = split(",", defined $A[1] ? $A[1] : "");
        foreach my $wav_id (@wav_list) {
          my ($basename, $side) = split(":", $wav_id);
          my $raw_basename = $basename;
          $raw_basename =~ s/\.sph$//;
          my $channel = $channel_of_side{defined $side ? $side : ""};
          if (!defined $channel) {
            die "unknown channel $side\n";
          }
          # The speaker id is the utterance-id prefix, so utterances of one
          # speaker sort together.
          my $uttId = $spkr . "-" . $raw_basename . "_" . $side;
          my $wave = $wav{$raw_basename};
          my $utt_lang = $lang{$channel,$raw_basename};
          if ($wave && -e $wave && defined $utt_lang) {
            print {$wav_out} "$uttId sph2pipe -f wav -p -c $channel $wave |\n";
            print {$spkr_out} "$uttId $spkr\n";
            print {$lang_out} "$uttId $utt_lang\n";
            #print CHAN "$uttId"," $channel_type[$channel]{$raw_basename}\n";
          } else {
            print STDERR "No wave file or language missing for utterance $raw_basename\n";
          }
        }
        print {$gndr_out} "$spkr $g\n";
      }
      close($trn_in);
      # Close every output before the helper scripts read the directory;
      # buffered write errors surface here.
      close($gndr_out) || die;
      close($spkr_out) || die;
      close($wav_out) || die;
      close($lang_out) || die;
      if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
        die "Error creating spk2utt file in directory $out_dir";
      }
      # The exit status of fix_data_dir.sh is deliberately not checked; the
      # validation step below decides whether the directory is usable.
      system("utils/fix_data_dir.sh $out_dir");
      if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
        die "Error validating directory $out_dir";
      }
    }
  }