subset_data_tr_cv.sh
2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/bin/bash
#
# Copyright 2017 Brno University of Technology (Author: Karel Vesely);
# Apache 2.0
# This scripts splits 'data' directory into two parts:
# - training set with 90% of speakers
# - held-out set with 10% of speakers (cv)
# (to be used in frame cross-entropy training of 'nnet1' models),
# The script also accepts a list of held-out set speakers by '--cv-spk-list'
# (with perturbed data, we pass the list of speakers externally).
# The remaining set of speakers is the the training set.
cv_spk_percent=10
cv_spk_list= # To be used with perturbed data,
seed=777
cv_utt_percent= # ignored (compatibility),
. utils/parse_options.sh
if [ $# != 3 ]; then
echo "Usage: $0 [opts] <src-data> <train-data> <cv-data>"
echo " --cv-spk-percent N (default 10)"
echo " --cv-spk-list <file> (a pre-defined list with cv speakers)"
exit 1;
fi
set -euo pipefail
src_data=$1
trn_data=$2
cv_data=$3
[ ! -r $src_data/spk2utt ] && echo "Missing '$src_data/spk2utt'. Error!" && exit 1
tmp=$(mktemp -d /tmp/${USER}_XXXXX)
if [ -z "$cv_spk_list" ]; then
# Select 'cv_spk_percent' speakers randomly,
cat $src_data/spk2utt | awk '{ print $1; }' | utils/shuffle_list.pl --srand $seed >$tmp/speakers
n_spk=$(wc -l <$tmp/speakers)
n_spk_cv=$(perl -e "print int($cv_spk_percent * $n_spk / 100); ")
#
head -n $n_spk_cv $tmp/speakers >$tmp/speakers_cv
tail -n+$((n_spk_cv+1)) $tmp/speakers >$tmp/speakers_trn
else
# Use pre-defined list of speakers,
cp $cv_spk_list $tmp/speakers_cv
join -v2 <(sort $cv_spk_list) <(awk '{ print $1; }' <$src_data/spk2utt | sort) >$tmp/speakers_trn
fi
# Sanity checks,
n_spk=$(wc -l <$src_data/spk2utt)
echo "Speakers, src=$n_spk, trn=$(wc -l <$tmp/speakers_trn), cv=$(wc -l $tmp/speakers_cv)"
overlap=$(join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | wc -l)
[ $overlap != 0 ] && \
echo "WARNING, speaker overlap detected!" && \
join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | head && \
echo '...'
# Create new data dirs,
utils/data/subset_data_dir.sh --spk-list $tmp/speakers_trn $src_data $trn_data
utils/data/subset_data_dir.sh --spk-list $tmp/speakers_cv $src_data $cv_data