Blame view
egs/voxceleb/v1/local/make_voxceleb1.pl
4.54 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
#!/usr/bin/perl
#
# Copyright 2018  Ewald Enzinger
#           2018  David Snyder
#
# Usage: make_voxceleb1.pl /export/voxceleb1 data/
#
# Setup part of the script: checks the command line, creates the
# voxceleb1_test and voxceleb1_train output directories, lists the speaker
# directories found under <path-to-voxceleb1>/voxceleb1_wav, downloads the
# trial list and the speaker meta data when they are not present yet, opens
# every input/output file and loads the meta data into %id2spkr.

use strict;
use warnings;

if (@ARGV != 2) {
  print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
  print STDERR "e.g. $0 /export/voxceleb1 data/\n";
  exit(1);
}

my ($data_base, $out_dir) = @ARGV;
my $out_test_dir = "$out_dir/voxceleb1_test";
my $out_train_dir = "$out_dir/voxceleb1_train";

# The list form of system() does not go through a shell, so directory names
# containing spaces or shell meta-characters are passed to mkdir unchanged.
foreach my $dir ($out_test_dir, $out_train_dir) {
  if (system("mkdir", "-p", $dir) != 0) {
    die "Error making directory $dir";
  }
}

# Every sub-directory of voxceleb1_wav (except "." and "..") is one speaker.
opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!";
my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh);
closedir $dh;

# Fetch the trial list and the meta data only if they are missing.
my $resource_url = "http://www.openslr.org/resources/49";
foreach my $name ("voxceleb1_test.txt", "vox1_meta.csv") {
  my $dest = "$data_base/$name";
  next if -e $dest;
  if (system("wget", "-O", $dest, "$resource_url/$name") != 0) {
    # A failed download may leave an empty or partial file behind; remove it
    # so that the existence test above triggers a new download next time.
    unlink $dest;
    die "Error downloading $resource_url/$name to $dest";
  }
}

# Bareword handles are kept on purpose: the rest of the script reads from,
# writes to and closes these handles by name.
open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt: $!";
open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv: $!";
open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk: $!";
open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp: $!";
open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk: $!";
open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp: $!";
open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials: $!";

# Map from the first column of the meta data file to its second column
# (later used to replace speaker directory names by speaker labels).
my %id2spkr = ();
while (<META_IN>) {
  chomp;
  my ($vox_id, $spkr_id) = split;
  # Skip blank or single-column lines instead of storing an undefined label.
  next unless defined $spkr_id;
  $id2spkr{$vox_id} = $spkr_id;
}

# Set of speakers that occur in the verification trials; filled while the
# trial list is read.  It is a hash because it is accessed as
# $test_spkrs{...} further down.
my %test_spkrs = ();
# Convert the verification trials.  Each input line is
#   <1|0> <speaker>/<file> <speaker>/<file>
# and becomes "<utt-id1> <utt-id2> <target|nontarget>" in the trials file.
# Every speaker seen here is recorded in %test_spkrs.
while (<TRIAL_IN>) {
  chomp;
  next if /^\s*$/;  # ignore blank lines
  my ($tar_or_non, $path1, $path2) = split;

  # Build the utterance id for the left-hand and right-hand side of the trial.
  my @utt_ids = ();
  foreach my $path ($path1, $path2) {
    my ($spkr_id, $filename) = split('/', $path);
    my $rec_id = substr($filename, 0, 11);
    my $segment = substr($filename, 12, 7);
    push @utt_ids, "$spkr_id-$rec_id-$segment";
    $test_spkrs{$spkr_id} = 1;
  }

  my $target = ($tar_or_non eq "1") ? "target" : "nontarget";
  print TRIAL_OUT "$utt_ids[0] $utt_ids[1] $target\n";
}

# Write one wav.scp and one utt2spk entry per wav file.  Speakers that occur
# in the trials go to the test directory, all others to the train directory.
foreach my $spkr_id (@spkr_dirs) {
  # If we're using a newer version of VoxCeleb1, we need to "deanonymize"
  # the speaker labels.
  my $new_spkr_id = exists $id2spkr{$spkr_id} ? $id2spkr{$spkr_id} : $spkr_id;

  my $spkr_dir = "$data_base/voxceleb1_wav/$spkr_id";
  opendir my $dh, "$spkr_dir/" or die "Cannot open directory: $!";
  my @wav_files = grep { /\.wav$/ } readdir($dh);
  closedir $dh;

  foreach my $wav_file (@wav_files) {
    # File name without the ".wav" extension.
    (my $filename = $wav_file) =~ s/\.wav$//;
    my $rec_id = substr($filename, 0, 11);
    my $segment = substr($filename, 12, 7);
    my $wav = "$spkr_dir/$wav_file";
    my $utt_id = "$new_spkr_id-$rec_id-$segment";
    if (exists $test_spkrs{$new_spkr_id}) {
      print WAV_TEST "$utt_id $wav\n";
      print SPKR_TEST "$utt_id $new_spkr_id\n";
    } else {
      print WAV_TRAIN "$utt_id $wav\n";
      print SPKR_TRAIN "$utt_id $new_spkr_id\n";
    }
  }
}

# Buffered write errors only show up when the handle is closed, so every
# close is checked.
close(SPKR_TEST) or die "Error closing $out_test_dir/utt2spk: $!";
close(WAV_TEST) or die "Error closing $out_test_dir/wav.scp: $!";
close(SPKR_TRAIN) or die "Error closing $out_train_dir/utt2spk: $!";
close(WAV_TRAIN) or die "Error closing $out_train_dir/wav.scp: $!";
close(TRIAL_OUT) or die "Error closing $out_test_dir/trials: $!";
close(TRIAL_IN) or die "Error closing the verification trials file: $!";
close(META_IN) or die "Error closing the meta data file: $!";

# Finish both data directories (test first, then train): create spk2utt,
# run the fix-up script and validate the result.
foreach my $dir ($out_test_dir, $out_train_dir) {
  if (system("utils/utt2spk_to_spk2utt.pl $dir/utt2spk >$dir/spk2utt") != 0) {
    die "Error creating spk2utt file in directory $dir";
  }
  # The exit status of fix_data_dir.sh is not checked here; the validation
  # step right after it is checked.
  system("env LC_COLLATE=C utils/fix_data_dir.sh $dir");
  if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $dir") != 0) {
    die "Error validating directory $dir";
  }
}