#!/bin/bash
# Copyright 2013  Hong Kong University of Science and Technology (Author: Ricky Chan Ho Yin);
#                 Brno University of Technology (Author: Karel Vesely);
#                 Johns Hopkins University (Author: Daniel Povey);
# Apache 2.0

# This script splits a dataset into two parts:
# a training set with (100-P)% of the speakers/utterances, and
# a held-out (cross-validation) set with the remaining P% of speakers/utterances,
# which is later used for neural network training.
#
# There are two options for choosing the held-out (cross-validation) set:
# --cv-spk-percent P , which builds the CV set from a randomly chosen P% of the speakers, or
# --cv-utt-percent P , which builds the CV set from the last P% of utterances in the dataset.
#
# If neither option is given, the script defaults to --cv-utt-percent
# with a cross-validation portion of 10% (i.e. P=10).
#
# The --cv-spk-percent option is useful when you want the subset drawn from
# speakers in random order, especially when the dataset combines multiple
# corpora whose speakers or recording channels may be quite different.

# Begin configuration.
cv_spk_percent= # % of speakers is parsed by option
cv_utt_percent=10 # default 10% of total utterances
seed=777 # use seed for speaker shuffling
# End configuration.

echo "$0 $@"  # Print the command line for logging

uttbase=true; # by default, we choose last 10% utterances for CV
if [ "$1" == "--cv-spk-percent" ]; then
  uttbase=false; spkbase=true;
fi

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 [--cv-spk-percent P|--cv-utt-percent P] <srcdir> <traindir> <crossvaldir>"
  echo " --cv-spk-percent P  Cross-validation portion of the total speakers, recommended value is 10% (i.e. P=10)"
  echo " --cv-utt-percent P  Cross-validation portion of the total utterances, default is 10% (i.e. P=10)"
  echo " "
  exit 1;
fi

srcdir=$1
trndir=$2
cvdir=$3

## use simple last P% utterances for CV
if $uttbase; then
  if [ ! -f $srcdir/utt2spk ]; then
    echo "$0: no such file $srcdir/utt2spk"
    exit 1;
  fi

  #total number of lines
  N=$(cat $srcdir/utt2spk | wc -l)
  #get line number where (100-P)% of the data lies
  P_utt=$((N * cv_utt_percent / 100))
  N_head=$((N - P_utt))
  #move the boundary so it is located on a speaker change
  N_head=$(cat $srcdir/utt2spk | uniq -f1 -c | awk '{ if(n+$1<='$N_head') { n += $1 } else { nextfile } } END{ print n }')
  #the rest of the data will be that big
  N_tail=$((N - N_head))

  #now call subset_data_dir.sh and fix the directories
  subset_data_dir.sh --first $srcdir $N_head $trndir
  subset_data_dir.sh --last $srcdir $N_tail $cvdir

  exit 0;
fi

## use randomly chosen P% of speakers for CV
if [ ! -f $srcdir/spk2utt ]; then
  echo "$0: no such file $srcdir/spk2utt"
  exit 1;
fi

#total, cv, train number of speakers
N=$(cat $srcdir/spk2utt | wc -l)
N_spk_cv=$((N * cv_spk_percent / 100))
N_spk_trn=$((N - N_spk_cv))

mkdir -p $cvdir $trndir

#shuffle the speaker list
awk '{print $1}' $srcdir/spk2utt | shuffle_list.pl --srand $seed > $trndir/_tmpf_randspk

#split the train/cv
head -n $N_spk_cv $trndir/_tmpf_randspk > $cvdir/_tmpf_cvspk
tail -n $N_spk_trn $trndir/_tmpf_randspk > $trndir/_tmpf_trainspk

#now call subset_data_dir.sh
subset_data_dir.sh --spk-list $trndir/_tmpf_trainspk $srcdir $trndir
subset_data_dir.sh --spk-list $cvdir/_tmpf_cvspk $srcdir $cvdir

#clean-up
rm -f $trndir/_tmpf_randspk $trndir/_tmpf_trainspk $cvdir/_tmpf_cvspk
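
# Example usage (a sketch; the data directory names below are hypothetical):
#
#   # hold out a random 10% of speakers for cross-validation
#   subset_data_dir_tr_cv.sh --cv-spk-percent 10 data/train data/train_tr90 data/train_cv10
#
#   # or hold out the last 10% of utterances (the default mode)
#   subset_data_dir_tr_cv.sh data/train data/train_tr90 data/train_cv10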