Blame view
egs/commonvoice/s5/local/data_prep.pl
2.39 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
#!/usr/bin/perl
#
# Copyright 2017 Ewald Enzinger
# Apache 2.0
#
# Builds a data directory from one Common Voice CSV listing.
#
# Usage: data_prep.pl <path-to-commonvoice-corpus> <dataset> <out-dir>
# e.g.   data_prep.pl /export/data/cv_corpus_v1 cv-valid-train valid-train
#
# Reads <path-to-commonvoice-corpus>/<dataset>.csv and writes, under <out-dir>:
#   text, utt2spk, utt2gender, wav.scp   (one line per CSV row)
#   spk2utt                              (via utils/utt2spk_to_spk2utt.pl)
# It then runs utils/fix_data_dir.sh and utils/validate_data_dir.sh on
# <out-dir>, so it must be started from a directory containing utils/.

use strict;
use warnings;

if (@ARGV != 3) {
  print STDERR "Usage: $0 <path-to-commonvoice-corpus> <dataset> <valid-train|valid-dev|valid-test>\n";
  print STDERR "e.g. $0 /export/data/cv_corpus_v1 cv-valid-train valid-train\n";
  exit(1);
}

my ($db_base, $dataset, $out_dir) = @ARGV;

# A ./data directory is always created, as in the original script; failures
# here surface when the output files are opened below.
mkdir 'data' unless -d 'data';
mkdir $out_dir unless -d $out_dir;

my $csv_path = "$db_base/$dataset.csv";
open(my $csv_fh, '<', $csv_path)
  or die "cannot open dataset CSV file $csv_path: $!";

# One write handle per output file, keyed by the file name inside $out_dir.
my %out_fh;
for my $name (qw(utt2spk utt2gender text wav.scp)) {
  open($out_fh{$name}, '>', "$out_dir/$name")
    or die "Could not open the output file $out_dir/$name: $!";
}

# The first CSV line is a column header, not an utterance.
my $header = <$csv_fh>;

while (my $line = <$csv_fh>) {
  chomp $line;

  # Column order: filepath, text, upvotes, downvotes, age, gender, accent,
  # duration. Only filepath, text and gender are used.
  # NOTE(review): this is a plain comma split, so a quoted field containing a
  # comma would shift the columns -- confirm the corpus never has one.
  my ($filepath, $text, $gender) = (split /,/, $line)[0, 1, 5];

  # Skip blank rows; they would otherwise produce an entry with an empty id.
  next unless defined $filepath && length $filepath;
  $text = '' unless defined $text;

  # Use male as default if not provided (no reason, just adopting the same
  # default as in voxforge).
  $gender = (defined $gender && $gender eq 'female') ? 'f' : 'm';

  # Utterance id: the file path without ".mp3" and with "/" replaced by "-".
  (my $utt_id = $filepath) =~ s/\.mp3//g;
  $utt_id =~ tr/\//-/;

  # No speaker information is provided, so we treat each utterance as coming
  # from a different speaker.
  my $spkr = $utt_id;

  # Strip quotation-style apostrophes from specific transcripts.
  $text =~ s/ said 'eat when/ said eat when/g;
  $text =~ s/'and this is what your son said'/and this is what your son said/g;
  $text =~ s/^'m /i'm /g;
  $text =~ s/'mummy'/mummy/g;
  $text =~ s/'poppy'/poppy/g;
  $text =~ s/'every/every/g;
  $text =~ s/'super fun playground'/super fun playground/g;
  $text =~ s/'under construction'/under construction/g;
  $text =~ tr/a-z/A-Z/;

  print { $out_fh{'text'} }       "$utt_id $text\n";
  print { $out_fh{'utt2gender'} } "$utt_id $gender\n";
  # wav.scp entry is a command pipeline: sox decodes the file to 16 kHz,
  # 16-bit signed WAV on stdout.
  print { $out_fh{'wav.scp'} }    "$utt_id sox $db_base/$filepath -t wav -r 16k -b 16 -e signed - |\n";
  print { $out_fh{'utt2spk'} }    "$utt_id $spkr\n";
}

close($csv_fh);

# Buffered write errors only show up at close, so check every output handle.
for my $name (sort keys %out_fh) {
  close($out_fh{$name})
    or die "Could not close the output file $out_dir/$name: $!";
}

if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}

# The exit status of fix_data_dir.sh is not checked (as in the original);
# the validation step below is what decides success.
system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");

if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-feats $out_dir") != 0) {
  die "Error validating directory $out_dir";
}