Blame view
egs/lre07/v1/local/make_lre07.pl
2.51 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
#!/usr/bin/env perl
#
# Copyright 2014  David Snyder
# Usage: make_lre07.pl <path-to-LDC2009S04> <output-dir>
#
# Finds every *.sph file below <path-to-LDC2009S04> and writes the following
# files into <output-dir>:
#   wav.scp   "<utt-id> sph2pipe -f wav -p -c 1 <path> |"
#   utt2spk   "<utt-id> <utt-id>"
#   spk2utt   "<utt-id> <utt-id>"
#   utt2lang  "<utt-id> <lang>"   (second field of the key file)
#   3sec, 10sec, 30sec            utterance ids, grouped by the sixth field
#                                 of the key file (nominal duration)
# The key file is downloaded with wget.  Once everything is written, the
# directory is checked with utils/validate_data_dir.sh.
#
# The utterance id is the file's basename without the ".sph" suffix.

use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

my $KEY_URL = "http://www.openslr.org/resources/23/lre07_key.txt";

# Opens $path for writing and returns the file handle; dies, naming the path
# and the OS error, if the file cannot be opened.
sub open_output {
  my ($path) = @_;
  open(my $fh, '>', $path)
    or die "Failed opening output file $path: $!";
  return $fh;
}

if (@ARGV != 2) {
  print STDERR "Usage: $0 <path-to-LDC2009S04> <output-dir>\n";
  print STDERR "e.g. $0 /export/corpora5/LDC/LDC2009S04 data/lre07\n";
  exit(1);
}

my ($db_base, $dir) = @ARGV;

# List-form system()/open() hand the arguments straight to the program, so
# paths containing spaces or shell metacharacters are passed through intact.
system('mkdir', '-p', $dir) == 0
  or die "Error making directory $dir";

# Map utterance id -> path of the .sph file.  The output of find is read
# through a pipe, so no temporary list file is needed.
my %wav;
open(my $find_fh, '-|', 'find', $db_base, '-name', '*.sph')
  or die "Error getting list of sph files";
while (my $sph = <$find_fh>) {
  chomp $sph;
  my $basename = (split m{/}, $sph)[-1];
  $basename = '' unless defined $basename;
  my $raw_basename = $basename;
  $raw_basename =~ s/\.sph$// or die "bad basename $basename";
  $wav{$raw_basename} = $sph;
}
# For a pipe, close() also reports a non-zero exit status of the child.
close($find_fh) or die "Error getting list of sph files";

# Download the key before any output file is opened, so that a failed
# download does not leave truncated output files behind.
open(my $key_fh, '-|', 'wget', '-qO-', $KEY_URL)
  or die "Error running wget: $!";
my $key_str = do { local $/; <$key_fh> };
close($key_fh) or die "Error downloading key file $KEY_URL";
(defined $key_str && $key_str =~ /\S/)
  or die "Downloaded key file $KEY_URL is empty";

# One key entry per line; lines whose first field contains '#' are skipped.
my %utt2lang;
my %utt2dur;
foreach my $line (split /\n/, $key_str) {
  my @words = split(' ', $line);
  next unless @words;
  next if index($words[0], "#") != -1;
  $utt2lang{$words[0]} = $words[1];
  $utt2dur{$words[0]} = $words[5];
}

my @out_names = qw(wav.scp utt2spk spk2utt utt2lang 3sec 10sec 30sec);
my %out = map { $_ => open_output("$dir/$_") } @out_names;

# Nominal duration (in the key file) -> name of the list it belongs to.
my %list_for_dur = (3 => '3sec', 10 => '10sec', 30 => '30sec');

foreach my $uttId (sort keys %wav) {
  my $lang = $utt2lang{$uttId};
  my $dur = $utt2dur{$uttId};
  defined $lang
    or die "No key entry for test segment $uttId";
  # "$dur + 0" normalises the field numerically before the lookup, which
  # keeps the numeric comparison semantics of ==.
  my $list = (defined $dur && looks_like_number($dur))
    ? $list_for_dur{$dur + 0} : undef;
  defined $list
    or die "Invalid nominal duration in test segment";

  print { $out{'wav.scp'} } "$uttId sph2pipe -f wav -p -c 1 $wav{$uttId} |\n";
  # We don't really have speaker info, so just make it the same as the
  # utterances: an identity map.
  print { $out{'utt2spk'} } "$uttId $uttId\n";
  print { $out{'spk2utt'} } "$uttId $uttId\n";
  print { $out{'utt2lang'} } "$uttId $lang\n";
  print { $out{$list} } "$uttId\n";
}

# Buffered write errors only surface at close, so every close is checked.
foreach my $name (@out_names) {
  close($out{$name}) or die "Error closing $dir/$name: $!";
}

(system('utils/validate_data_dir.sh', '--no-text', '--no-feats', $dir) == 0)
  or die "Error validating data dir.";