subset_data_dir.sh
5.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/bin/bash
# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
# This script operates on a data directory, such as in data/train/.
# See http://kaldi.sourceforge.net/data_prep.html#data_prep_data
# for what these directories contain.
# The script creates a subset of that data, consisting of some specified
# number of utterances. (The selected utterances are distributed evenly
# throughout the file, by the program ./subset_scp.pl).
# The options below are mutually exclusive: at most one may be given.
# If you give the --per-spk option, it will attempt to select the supplied
# number of utterances for each speaker (typically you would supply a much
# smaller number in this case).
# If you give the --speakers option, it selects a subset of n randomly
# selected speakers.
# If you give the --shortest option, it will give you the n shortest utterances.
# If you give the --first option, it will just give you the n first utterances.
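# If you give the --last option, it will just give you the n last utterances.
# If you give the --spk-list option, it takes a speaker-list file in place of
# the number of utterances and keeps only the speakers listed in that file.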
# Mode flags. With all of them off, the default selection of <num-utt>
# utterances via utils/subset_scp.pl is used.
shortest=false
perspk=false
speakers=false
spk_list_specified=false
# Forwarded to utils/subset_scp.pl; stays empty unless --first/--last is given.
first_opt=""

# At most one option is recognised, and only as the very first argument.
case "$1" in
  --per-spk)
    perspk=true
    shift
    ;;
  --shortest)
    shortest=true
    shift
    ;;
  --speakers)
    speakers=true
    shift
    ;;
  --spk-list)
    spk_list_specified=true
    shift
    ;;
  --first | --last)
    # The flag itself is what gets handed on to subset_scp.pl.
    first_opt=$1
    shift
    ;;
esac
# Exactly three positional arguments must remain after the optional leading
# flag has been shifted away; otherwise print the synopsis and exit with 1.
if [ $# -ne 3 ]; then
  printf '%s\n' \
    "Usage: " \
    " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>" \
    " subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>" \
    "By default, randomly selects <num-utt> utterances from the data directory." \
    "With --speakers, randomly selects enough speakers that we have <num-utt> utterances" \
    "With --first, selects the first <num-utt> utterances" \
    "With --last, selects the last <num-utt> utterances" \
    "With --shortest, selects the shortest <num-utt> utterances."
  exit 1
fi
# Map the three positional arguments onto named variables. The destination
# directory is always last; the first two depend on whether --spk-list was used.
destdir=$3
if ! $spk_list_specified; then
  # <srcdir> <num-utt> <destdir>
  srcdir=$1
  numutt=$2
else
  # <speaker-list-file> <srcdir> <destdir>
  spk_list=$1
  srcdir=$2
fi
# Force the C locale for every tool run from here on, so that sorting and
# comparisons are byte-wise and independent of the caller's environment.
export LC_ALL=C

# utt2spk is needed by every selection mode. The path is quoted so that a
# source directory containing whitespace is still tested as a single file name
# (unquoted, `[` would fail with a syntax error and the guard would be skipped).
if [ ! -f "$srcdir/utt2spk" ]; then
  echo "subset_data_dir.sh: no such file $srcdir/utt2spk"
  exit 1
fi
#######################################
# Fill $destdir with the entries of each optional data file in $srcdir that
# belong to the subset already selected in $destdir.
# Globals:
#   srcdir  (read) - source data directory.
#   destdir (read) - destination directory; its utt2spk and spk2utt must
#                    already exist, because they are used as the id lists.
# Outputs:
#   Filtered copies of feats.scp, text, spk2gender, cmvn.scp, wav.scp,
#   segments, reco2file_and_channel and stm in $destdir (each one only when
#   the corresponding source file exists), plus a one-line summary on stdout.
#######################################
do_filtering() {
  local name srcutts destutts

  # Files whose first field is an utterance id.
  for name in feats.scp text; do
    [ -f "$srcdir/$name" ] &&
      utils/filter_scp.pl "$destdir/utt2spk" <"$srcdir/$name" >"$destdir/$name"
  done

  # Files whose first field is a speaker id.
  for name in spk2gender cmvn.scp; do
    [ -f "$srcdir/$name" ] &&
      utils/filter_scp.pl "$destdir/spk2utt" <"$srcdir/$name" >"$destdir/$name"
  done

  if [ -f "$srcdir/segments" ]; then
    utils/filter_scp.pl "$destdir/utt2spk" <"$srcdir/segments" >"$destdir/segments"
    # The second field of each segments line names the recording it is cut from.
    awk '{print $2;}' "$destdir/segments" | sort | uniq >"$destdir/reco"
    # With a segments file, these are filtered by recording id, not by
    # utterance id, so wav.scp is written once with the right key.
    # Filtering stm also drops its comment lines.
    for name in wav.scp reco2file_and_channel stm; do
      [ -f "$srcdir/$name" ] &&
        utils/filter_scp.pl "$destdir/reco" <"$srcdir/$name" >"$destdir/$name"
    done
    rm "$destdir/reco"
  else
    # Without segments, wav.scp is filtered by the utterance list.
    [ -f "$srcdir/wav.scp" ] &&
      utils/filter_scp.pl "$destdir/utt2spk" <"$srcdir/wav.scp" >"$destdir/wav.scp"
  fi

  srcutts=$(wc -l <"$srcdir/utt2spk")
  destutts=$(wc -l <"$destdir/utt2spk")
  echo "$0: reducing #utt from $srcutts to $destutts"
}
# ---------------------------------------------------------------------------
# Build $destdir/utt2spk and $destdir/spk2utt for the selected mode, then let
# do_filtering subset every other file. Each step that writes one of the two
# maps aborts the script with status 1 on failure.
# ---------------------------------------------------------------------------
if $spk_list_specified; then
  # --spk-list: keep exactly the speakers named in $spk_list.
  mkdir -p "$destdir" || exit 1
  utils/filter_scp.pl "$spk_list" "$srcdir/spk2utt" >"$destdir/spk2utt" || exit 1
  utils/spk2utt_to_utt2spk.pl <"$destdir/spk2utt" >"$destdir/utt2spk" || exit 1
elif $speakers; then
  # --speakers: walk the speakers in shuffled order and keep whole speakers
  # for as long as fewer than $numutt utterances have been collected.
  mkdir -p "$destdir" || exit 1
  utils/shuffle_list.pl <"$srcdir/spk2utt" |
    awk -v numutt="$numutt" '{ if (tot < numutt) { print; } tot += (NF-1); }' |
    sort >"$destdir/spk2utt" || exit 1
  utils/spk2utt_to_utt2spk.pl <"$destdir/spk2utt" >"$destdir/utt2spk" || exit 1
elif $perspk; then
  # --per-spk: up to $numutt utterances per speaker, spread over the speaker's
  # list. A count <= 0 would make the stride loop below spin forever, so it is
  # rejected before anything is written.
  if ! awk -v n="$numutt" 'BEGIN { exit !(n + 0 > 0) }'; then
    echo "$0: --per-spk needs a positive <num-utt>, got '$numutt'"
    exit 1
  fi
  mkdir -p "$destdir" || exit 1
  # skip ends up as the largest stride with n*skip <= #utts (but at least 1);
  # every skip-th utterance is then taken, at most n of them.
  awk -v n="$numutt" '{
      printf("%s ", $1);
      skip = 1;
      while (n * (skip + 1) <= NF - 1) { skip++; }
      for (x = 2; x <= NF && x <= n * skip; x += skip) { printf("%s ", $x); }
      printf("\n");
    }' <"$srcdir/spk2utt" >"$destdir/spk2utt" || exit 1
  utils/spk2utt_to_utt2spk.pl <"$destdir/spk2utt" >"$destdir/utt2spk" || exit 1
else
  # Default, --first, --last and --shortest all pick $numutt utterances.
  if $shortest; then
    # --shortest ranks utterances by feature length, so feats.scp is mandatory
    # and is also the list the request is measured against.
    if [ ! -f "$srcdir/feats.scp" ]; then
      echo "$0: you selected --shortest but no feats.scp exist."
      exit 1
    fi
    avail_list=$srcdir/feats.scp
  else
    # The other modes subset utt2spk, which was verified to exist earlier in
    # the script; feats.scp may legitimately be absent at this stage.
    avail_list=$srcdir/utt2spk
  fi
  # Arithmetic expansion strips the padding some wc implementations emit.
  navail=$(($(wc -l <"$avail_list")))
  if [ "$numutt" -gt "$navail" ]; then
    echo "subset_data_dir.sh: cannot subset to more utterances than you originally had."
    exit 1
  fi
  mkdir -p "$destdir" || exit 1
  if $shortest; then
    # path.sh is sourced so that feat-to-len can be found.
    . ./path.sh
    feat-to-len "scp:$srcdir/feats.scp" "ark,t:$destdir/tmp.len" || exit 1
    sort -n -k2 "$destdir/tmp.len" | awk '{print $1}' | head -n "$numutt" >"$destdir/tmp.uttlist"
    utils/filter_scp.pl "$destdir/tmp.uttlist" "$srcdir/utt2spk" >"$destdir/utt2spk"
    filter_status=$?
    # Remove the scratch files whether or not the filter succeeded.
    rm "$destdir/tmp.uttlist" "$destdir/tmp.len"
    [ "$filter_status" -eq 0 ] || exit 1
  else
    # first_opt is passed only when set, so no empty argument reaches the tool.
    utils/subset_scp.pl ${first_opt:+"$first_opt"} "$numutt" "$srcdir/utt2spk" >"$destdir/utt2spk" || exit 1
  fi
  utils/utt2spk_to_spk2utt.pl <"$destdir/utt2spk" >"$destdir/spk2utt" || exit 1
fi

do_filtering # bash function.
exit 0