Blame view

egs/lre/v1/local/make_lre07.pl 2.52 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
  #!/usr/bin/env perl
  #
  # Copyright 2014  David Snyder
  # Usage: make_lre07.pl <path-to-LDC2009S04> <output-dir>
  
  
  # Require exactly two command-line arguments; otherwise print the usage
  # text to STDERR and exit with status 1.
  if (@ARGV != 2) {
    # "\n" escapes are used instead of line breaks embedded in the string
    # literals, so the messages do not pick up the source indentation.
    print STDERR "Usage: $0 <path-to-LDC2009S04> <output-dir>\n";
    print STDERR "e.g. $0 /export/corpora5/LDC/LDC2009S04 data/lre07\n";
    exit(1);
  }
  
  # $db_base: directory tree searched for *.sph files.
  # $dir:     output directory; scratch files are kept in $dir/tmp.
  ($db_base, $dir) = @ARGV;

  $tmp_dir = "$dir/tmp";
  # List-form system() passes the path as a single argument without going
  # through a shell, so spaces or metacharacters in it are harmless.
  if (system("mkdir", "-p", $tmp_dir) != 0) {
    die "Error making directory $tmp_dir";
  }

  # Write the paths of all *.sph files under $db_base to $tmp_dir/sph.list.
  # find is run through a list-form pipe (again no shell) and its output is
  # copied to the list file by hand, replacing the shell redirection.
  {
    open(my $find_out, '-|', 'find', $db_base, '-name', '*.sph')
      or die "Error getting list of sph files";
    open(my $sph_list, '>', "$tmp_dir/sph.list")
      or die "Error getting list of sph files";
    while (my $path = <$find_out>) {
      print {$sph_list} $path;
    }
    close($sph_list) or die "Error getting list of sph files";
    # close() on a pipe returns false when the command exited non-zero.
    close($find_out) or die "Error getting list of sph files";
  }
  
  # Read $tmp_dir/sph.list and fill %wav, mapping each file's base name
  # (with the .sph suffix removed) to the full path read from the list.
  # Three-argument open keeps the path from being interpreted as a mode.
  # The WAVLIST handle is left open here; it is closed near the end of the
  # script.
  open(WAVLIST, '<', "$tmp_dir/sph.list") or die "cannot open wav list: $!";

  while (my $sph = <WAVLIST>) {
    chomp $sph;
    # The last "/"-separated component is the file name.
    my @parts = split("/", $sph);
    my $basename = $parts[$#parts];
    my $raw_basename = $basename;
    # Every listed path must end in .sph; anything else is fatal.
    $raw_basename =~ s/\.sph$// || die "bad basename $basename";
    # If two files share a base name, the later one wins.
    $wav{$raw_basename} = $sph;
  }
  # Open all output files in $dir for writing. Three-argument open is used
  # so the path is never parsed for a mode, and each failure message names
  # the actual file ($dir, not the undefined $out_dir) plus the OS error.
  open(WAV, '>', "$dir/wav.scp") or die "Failed opening output file $dir/wav.scp: $!";
  open(UTT2SPK, '>', "$dir/utt2spk") or die "Failed opening output file $dir/utt2spk: $!";
  open(SPK2UTT, '>', "$dir/spk2utt") or die "Failed opening output file $dir/spk2utt: $!";
  open(UTT2LANG, '>', "$dir/utt2lang") or die "Failed opening output file $dir/utt2lang: $!";
  # One list of utterance ids per nominal duration.
  open(DUR3, '>', "$dir/3sec") or die "Failed opening output file $dir/3sec: $!";
  open(DUR10, '>', "$dir/10sec") or die "Failed opening output file $dir/10sec: $!";
  open(DUR30, '>', "$dir/30sec") or die "Failed opening output file $dir/30sec: $!";
  
  # Download the key file with wget and build two lookup tables keyed by
  # the first whitespace-separated field of each line:
  #   %utt2lang: first field -> second field
  #   %utt2dur:  first field -> sixth field
  # NOTE(review): the URL is fetched at run time; confirm it is still served.
  my $key_str = `wget -qO- "http://www.itl.nist.gov/iad/mig/tests/lang/2007/lid07key_v5.txt"`;
  # Fail here rather than later with a misleading per-utterance error.
  if ($? != 0 || !defined($key_str) || $key_str eq "") {
    die "Error downloading the LRE07 key file";
  }
  # Split on the newline character only (written as an escape so that no
  # source indentation ends up inside the separator).
  @key_lines = split("\n", $key_str);
  %utt2lang = ();
  %utt2dur = ();
  foreach my $line (@key_lines) {
    my @words = split(' ', $line);
    # Blank lines have no fields to index.
    next unless @words;
    # Lines whose first field contains "#" are skipped.
    if (index($words[0], "#") == -1) {
      $utt2lang{$words[0]} = $words[1];
      $utt2dur{$words[0]} = $words[5];
    }
  }
  
  # For every utterance id in %wav (in sorted order) write one record to
  # each output file. Records end in a "\n" escape, so nothing from the
  # source layout leaks into the data files.
  foreach my $uttId (sort keys(%wav)) {
    # Without a key entry, neither the language nor the duration is known.
    if (!defined($utt2lang{$uttId}) || !defined($utt2dur{$uttId})) {
      die "No key entry for utterance $uttId";
    }
    # wav.scp: the id followed by an sph2pipe command ending in a pipe.
    print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wav{$uttId} |\n";
    # We don't really have speaker info, so just make it the same as the
    # utterances: an identity map.
    print UTT2SPK "$uttId $uttId\n";
    print SPK2UTT "$uttId $uttId\n";
    print UTT2LANG "$uttId $utt2lang{$uttId}\n";
    # Add the id to the list matching its duration value; only 3, 10 and 30
    # are accepted.
    if ($utt2dur{$uttId} == 3) {
      print DUR3 "$uttId\n";
    } elsif ($utt2dur{$uttId} == 10) {
      print DUR10 "$uttId\n";
    } elsif ($utt2dur{$uttId} == 30) {
      print DUR30 "$uttId\n";
    } else {
      die "Invalid nominal duration in test segment";
    }
  }
  # Close every handle. Buffered write errors only surface at close time,
  # so each failure is fatal and reports which file was affected.
  close(WAV) || die "Failed closing $dir/wav.scp: $!";
  close(UTT2SPK) || die "Failed closing $dir/utt2spk: $!";
  close(SPK2UTT) || die "Failed closing $dir/spk2utt: $!";
  close(UTT2LANG) || die "Failed closing $dir/utt2lang: $!";
  close(DUR3) || die "Failed closing $dir/3sec: $!";
  close(DUR10) || die "Failed closing $dir/10sec: $!";
  close(DUR30) || die "Failed closing $dir/30sec: $!";
  close(WAVLIST) || die "Failed closing the sph list: $!";

  # Remove the scratch directory. This is best-effort cleanup, so a failure
  # is reported but does not abort. List-form system() avoids the shell.
  if (system("rm", "-r", "$dir/tmp") != 0) {
    warn "Warning: could not remove $dir/tmp\n";
  }

  # Run the validation script on the output directory; a non-zero exit
  # status is fatal.
  (system("utils/validate_data_dir.sh", "--no-text", "--no-feats", $dir) == 0) || die "Error validating data dir.";