subset_data_dir_tr_cv.sh
3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/bin/bash
# Copyright 2013 Hong Kong University of Science and Technology (Author: Ricky Chan Ho Yin);
# Brno University of Technology (Author: Karel Vesely);
# Johns Hopkins University (Author: Daniel Povey);
# Apache 2.0
# This script splits a dataset into two parts:
# a training set made of (100-P)% of the speakers/utterances, and
# a held-out (cross-validation) set made of the remaining P% of the speakers/utterances,
# which is later used for neural network training.
#
# There are two options for choosing the held-out (cross-validation) set:
# --cv-spk-percent P , which gives you a CV set of P% randomly chosen speakers, or
# --cv-utt-percent P , which gives you a CV set made of the last P% of utterances in the dataset.
#
# If neither option is given, the script behaves as with --cv-utt-percent,
# and the default cross-validation portion is 10% (i.e. P=10).
#
# The --cv-spk-percent option is useful if you would like the subset to be drawn from speakers in random order,
# especially when the dataset contains several different corpora
# whose speaker types or recording channels may be quite different.
# Begin configuration.
cv_spk_percent=    # P for --cv-spk-percent; empty means the option was not given
cv_utt_percent=10  # P for --cv-utt-percent; defaults to 10% of the utterances
seed=777           # seed handed to the speaker shuffling
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

# Choose the split mode only AFTER the options have been parsed. Looking at
# "$1" beforehand would miss --cv-spk-percent whenever another option precedes
# it on the command line. Here the mode follows from the parsed value itself:
# a non-empty cv_spk_percent selects the speaker-based split (and takes
# precedence over --cv-utt-percent); otherwise the last P% of the utterances
# are held out.
if [ -n "$cv_spk_percent" ]; then
  uttbase=false
  pct=$cv_spk_percent
else
  uttbase=true
  pct=$cv_utt_percent
fi

# The percentage is used in integer shell arithmetic further down, so anything
# that is not a plain integer in 0..100 is rejected here with a clear message.
case "$pct" in
  ''|*[!0-9]*)
    echo "$0: cross-validation percentage must be an integer between 0 and 100, got '$pct'"
    exit 1
    ;;
esac
if [ "$pct" -gt 100 ]; then
  echo "$0: cross-validation percentage must be an integer between 0 and 100, got '$pct'"
  exit 1
fi
# Exactly three positional arguments must remain after option parsing;
# otherwise show the usage text and stop.
if [ "$#" != 3 ]; then
  printf '%s\n' \
    "Usage: $0 [--cv-spk-percent P|--cv-utt-percent P] <srcdir> <traindir> <crossvaldir>" \
    " --cv-spk-percent P Cross Validation portion of the total speakers, recommend value is 10% (i.e. P=10)" \
    " --cv-utt-percent P Cross Validation portion of the total utterances, default is 10% (i.e. P=10)" \
    " "
  exit 1
fi

# Positional arguments: source data dir, training output dir, CV output dir.
srcdir="$1"
trndir="$2"
cvdir="$3"
## Utterance-based split: hold out (roughly) the last P% of the utterances for CV
if $uttbase; then
  if [ ! -f "$srcdir/utt2spk" ]; then
    echo "$0: no such file $srcdir/utt2spk"
    exit 1;
  fi

  # total number of lines in utt2spk
  N=$(wc -l < "$srcdir/utt2spk")

  # nominal line number where the first (100-P)% of the data ends
  P_utt=$((N * cv_utt_percent / 100))
  N_head=$((N - P_utt))

  # Move the boundary back so that it falls on a speaker change.
  # 'uniq -f1 -c' ignores the first field and prefixes every run of otherwise
  # identical consecutive lines with its length (assumes utt2spk lines look
  # like "<utt> <spk>", so a run is one speaker's block of utterances).
  # awk then adds up whole runs while they still fit below the nominal
  # boundary. 'exit' (which still runs END) is used in place of 'nextfile',
  # which not every awk implements, and 'n + 0' guarantees a number is printed
  # even when not a single run fits.
  N_head=$(uniq -f1 -c "$srcdir/utt2spk" |
    awk -v limit="$N_head" \
      '{ if (n + $1 <= limit) { n += $1 } else { exit } } END { print n + 0 }')

  # the rest of the data goes to the CV set
  N_tail=$((N - N_head))

  # Refuse to continue with an empty part; an empty count would otherwise be
  # handed to subset_data_dir.sh and the failure would go unnoticed.
  if [ "$N_head" -le 0 ] || [ "$N_tail" -le 0 ]; then
    echo "$0: cannot split $N utterances of $srcdir on a speaker boundary into non-empty sets (train=$N_head, cv=$N_tail)"
    exit 1;
  fi

  # now call subset_data_dir.sh for both parts and stop on the first failure
  subset_data_dir.sh --first "$srcdir" "$N_head" "$trndir" || exit 1
  subset_data_dir.sh --last "$srcdir" "$N_tail" "$cvdir" || exit 1

  exit 0;
fi
## Speaker-based split: hold out a randomly chosen P% of the speakers for CV
if [ ! -f "$srcdir/spk2utt" ]; then
  echo "$0: no such file $srcdir/spk2utt"
  exit 1;
fi

# total, cv, train number of speakers (one spk2utt line per speaker)
N=$(wc -l < "$srcdir/spk2utt")
N_spk_cv=$((N * cv_spk_percent / 100))
N_spk_trn=$((N - N_spk_cv))

# With few speakers or an extreme P the integer division can leave one side
# empty; stop here instead of passing an empty speaker list on.
if [ "$N_spk_cv" -le 0 ] || [ "$N_spk_trn" -le 0 ]; then
  echo "$0: cannot split $N speakers of $srcdir into non-empty sets (train=$N_spk_trn, cv=$N_spk_cv)"
  exit 1;
fi

mkdir -p "$cvdir" "$trndir" || exit 1

# Temporary speaker lists; the EXIT trap removes them on every exit path,
# including the early exits below.
randspk=$trndir/_tmpf_randspk
trainspk=$trndir/_tmpf_trainspk
cvspk=$cvdir/_tmpf_cvspk
trap 'rm -f "$randspk" "$trainspk" "$cvspk"' EXIT

# shuffle the speaker list (first column of spk2utt) with the configured seed
awk '{print $1}' "$srcdir/spk2utt" | shuffle_list.pl --srand "$seed" > "$randspk" || exit 1

# split the shuffled list: the first N_spk_cv speakers go to CV, the last
# N_spk_trn speakers go to training
head -n "$N_spk_cv" "$randspk" > "$cvspk" || exit 1
tail -n "$N_spk_trn" "$randspk" > "$trainspk" || exit 1

# now call subset_data_dir.sh for both parts and stop on the first failure
subset_data_dir.sh --spk-list "$trainspk" "$srcdir" "$trndir" || exit 1
subset_data_dir.sh --spk-list "$cvspk" "$srcdir" "$cvdir" || exit 1