#!/bin/bash
# reduce_data_dir_by_reclist.sh
# koried, 10/29/2012
# Reduce a data set based on a list of recordings
# Exactly three positional arguments are required.
if [ $# -ne 3 ]; then
  echo "usage: $0 srcdir reclist destdir"
  exit 1
fi

srcdir=$1   # source data directory; must contain utt2spk
reclist=$2  # list of recordings used to select what is kept
destdir=$3  # directory that receives the reduced data set

# The path is quoted: unquoted, a directory name containing whitespace makes
# `[` fail with "too many arguments", so this check would be skipped instead
# of stopping the script.
if [ ! -f "$srcdir/utt2spk" ]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1
fi
# Filter the optional data files found in $srcdir into $destdir, using the
# utt2spk and spk2utt files that must already exist in $destdir as id lists.
# Globals:
#   srcdir  - read; directory holding the original files
#   destdir - read; directory receiving the filtered files
# Outputs:
#   Writes a filtered copy of each optional file that exists in $srcdir and
#   prints "Reduced #utt from N to M" (line counts of the source and
#   destination utt2spk) to stdout.
do_filtering() {
  local srcutts destutts

  # Every path is quoted so that directory names containing whitespace work
  # both as test operands and as redirection targets.

  # Files filtered against utt2spk.
  [ -f "$srcdir/feats.scp" ] &&
    utils/filter_scp.pl "$destdir/utt2spk" <"$srcdir/feats.scp" >"$destdir/feats.scp"
  [ -f "$srcdir/text" ] &&
    utils/filter_scp.pl "$destdir/utt2spk" <"$srcdir/text" >"$destdir/text"

  # Files filtered against spk2utt.
  [ -f "$srcdir/spk2gender" ] &&
    utils/filter_scp.pl "$destdir/spk2utt" <"$srcdir/spk2gender" >"$destdir/spk2gender"
  [ -f "$srcdir/cmvn.scp" ] &&
    utils/filter_scp.pl "$destdir/spk2utt" <"$srcdir/cmvn.scp" >"$destdir/cmvn.scp"

  if [ -f "$srcdir/segments" ]; then
    utils/filter_scp.pl "$destdir/utt2spk" <"$srcdir/segments" >"$destdir/segments"
    # Second field of every kept segments line, de-duplicated: the recordings.
    awk '{print $2;}' "$destdir/segments" | sort | uniq >"$destdir/reco"
    # With a segments file, wav.scp is filtered against the recording list;
    # filtering it against utt2spk would be incorrect here.
    [ -f "$srcdir/wav.scp" ] &&
      utils/filter_scp.pl "$destdir/reco" <"$srcdir/wav.scp" >"$destdir/wav.scp"
    [ -f "$srcdir/reco2file_and_channel" ] &&
      utils/filter_scp.pl "$destdir/reco" <"$srcdir/reco2file_and_channel" >"$destdir/reco2file_and_channel"
    [ -f "$srcdir/stm" ] &&
      utils/filter_scp.pl "$destdir/reco" <"$srcdir/stm" >"$destdir/stm"
    # The recording list is only a temporary helper file.
    rm -f -- "$destdir/reco"
  else
    # Without a segments file, wav.scp is filtered against utt2spk.
    [ -f "$srcdir/wav.scp" ] &&
      utils/filter_scp.pl "$destdir/utt2spk" <"$srcdir/wav.scp" >"$destdir/wav.scp"
  fi

  srcutts=$(wc -l <"$srcdir/utt2spk")
  destutts=$(wc -l <"$destdir/utt2spk")
  echo "Reduced #utt from $srcutts to $destutts"
}
mkdir -p "$destdir" || exit 1

# Filter the utt2spk based on the set of recordings: keep every line of
# $srcdir/utt2spk that begins with one of the whitespace-separated ids found
# in $reclist.
#
# One awk pass replaces a grep per id:
#   * ids are compared as literal strings, so characters such as "." or "*"
#     in an id are not interpreted as regular-expression operators;
#   * lines are written in their $srcdir/utt2spk order, each at most once,
#     even when one listed id is a prefix of another;
#   * the output file is created even when no id matches or the list is empty.
# The in_utt2spk=1 operand is an awk command-line assignment that takes effect
# once the first file has been read, which distinguishes the two inputs even
# if $reclist is empty.
rm -f -- "$destdir/utt2spk"
awk '
  !in_utt2spk { for (k = 1; k <= NF; k++) prefix[$k] = 1; next }
  { for (p in prefix) if (index($0, p) == 1) { print; next } }
' "$reclist" in_utt2spk=1 "$srcdir/utt2spk" >"$destdir/utt2spk"

utils/utt2spk_to_spk2utt.pl <"$destdir/utt2spk" >"$destdir/spk2utt"

do_filtering