Blame view
egs/lre07/v1/local/make_lre07_train.pl
2.13 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
#!/usr/bin/env perl # # Copyright 2014 David Snyder # Apache 2.0. use File::Basename; if (@ARGV != 2) { print STDERR "Usage: $0 <path-to-LDC2009S05> <path-to-output> "; print STDERR "e.g. $0 /export/corpora5/LDC/LDC2009S05 data "; exit(1); } ($db_base, $out_base_dir) = @ARGV; $ldc_code = lc basename($db_base); # We won't use the speaker or gender information. Anyway it's not that useful # as it seems to be 2-wire recordings, and everything is mixed together. foreach $set ('arb', 'ben', 'cfr', 'rus', 'tha', 'urd', 'wuu', 'yuh') { $tmp_dir = "$out_base_dir/tmp"; if (system("mkdir -p $tmp_dir") != 0) { die "Error making directory $tmp_dir"; } if (system("find $db_base/data/lre07_tr/$set -name '*.sph' | grep '$set' > $tmp_dir/sph.list") != 0) { die "Error getting list of sph files"; } $tmp_dir = "$out_base_dir/tmp"; open(WAVLIST, "<", "$tmp_dir/sph.list") or die "cannot open wav list"; %wav = (); while($sph = <WAVLIST>) { chomp($sph); @A = split("/", $sph); $basename = $A[$#A]; $raw_basename = $basename; $raw_basename =~ s/\.sph$// || die "bad basename $basename"; $wav{$raw_basename} = $sph; } close(WAVLIST) || die; $out_dir = $out_base_dir . "/" . $ldc_code . '_' . $set; if (system("mkdir -p $out_dir") != 0) { die "Error making directory $out_dir"; } open(WAV, ">$out_dir" . '/wav.scp') || die "Failed opening output file $out_dir/wav.scp"; open(UTT2LANG, ">$out_dir" . '/utt2lang') || die "Failed opening output file $out_dir/utt2lang"; open(UTT2SPK, ">$out_dir" . '/utt2spk') || die "Failed opening output file $out_dir/utt2spk"; foreach $recording (sort keys(%wav)) { $uttId = $ldc_code . "_" . $recording; print WAV "$uttId"," sph2pipe -f wav -p $wav{$recording} | "; print UTT2SPK "$uttId $uttId "; print UTT2LANG "$uttId $set "; } close(WAV) || die; close(UTT2SPK) || die; close(UTT2LANG) || die; system("rm -r $out_base_dir/tmp"); system("utils/fix_data_dir.sh $out_dir"); (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") == 0) || die "Error validating data dir."; } |