copy_data_dir.sh
2.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/bin/bash
# Copyright 2013-2014 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
# Warning: this script is deprecated; please use utils/data/modify_speaker_info.sh instead.
# This script is like utils/copy_data_dir.sh in that it copies a data-dir,
# but it supports the --utts-per-spk-max option. If that option is set to a
# value other than -1 (the default, meaning no maximum), it modifies the
# utt2spk and spk2utt files by splitting each speaker into multiple versions,
# so that each speaker has no more than --utts-per-spk-max utterances.
# begin configuration section
# Maximum number of utterances per speaker in the output directory.
# -1 (the default) means no maximum; override with --utts-per-spk-max <n>.
utts_per_spk_max=-1
# end configuration section

# Consume the leading --option arguments so that only <srcdir> <destdir>
# remain as positional parameters. Abort if the helper cannot be sourced;
# otherwise the script would carry on with the options left unparsed.
. utils/parse_options.sh || exit 1
# Exactly two positional arguments (<srcdir> <destdir>) are expected at this
# point; with any other count, print the usage text and exit with status 1.
if [[ $# -ne 2 ]]; then
  printf '%s\n' \
    "Usage: " \
    " $0 [options] <srcdir> <destdir>" \
    "e.g.:" \
    " $0 --utts-per-spk-max 2 data/train data/train-max2" \
    "Options" \
    " --utts-per-spk-max <n> # number of utterances per speaker maximum," \
    " # default -1 (meaning no maximum). E.g. 2."
  exit 1
fi
# Remind the caller on every run that this script has been superseded.
echo "$0: this script is deprecated, please use utils/data/modify_speaker_info.sh."

# Export the C locale so that child processes (e.g. the sort calls later in
# this script) order their output byte-wise, independent of the user's locale.
export LC_ALL=C

srcdir=$1
destdir=$2

# utt2spk must exist in the source directory. The path is quoted so that a
# directory name containing whitespace cannot make the test itself error out,
# which the 'if' would treat as false and so skip this check.
if [ ! -f "$srcdir/utt2spk" ]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1
fi

# From here on, stop at the first failing command, including a failure in any
# stage of a pipeline.
set -e
set -o pipefail

mkdir -p -- "$destdir"
# Write the speaker-level files (spk2utt, utt2spk, and cmvn.scp / spk2gender
# when present) into $destdir: either re-mapped to split speakers, or copied
# unchanged when --utts-per-spk-max is -1.
if [ "$utts_per_spk_max" != -1 ]; then
  # Reject values below 1 and non-numeric values. With such a value the awk
  # program below can take zero utterances for a chunk, never advance through
  # the line, and therefore loop forever while writing output.
  if ! awk -v max="$utts_per_spk_max" 'BEGIN { exit !(max + 0 >= 1) }'; then
    echo "$0: --utts-per-spk-max must be -1 or a number >= 1, got '$utts_per_spk_max'" >&2
    exit 1
  fi

  # Create an spk2utt file with a reduced number of utterances per speaker.
  # Each input line "<spk> <utt1> <utt2> ..." is cut into consecutive chunks;
  # the k-th chunk is written as "<spk>-<k as 6 zero-padded hex digits>"
  # followed by its utterances. Each chunk takes int(max) utterances, plus one
  # more whenever rand() falls below the fractional part of max.
  awk -v max="$utts_per_spk_max" '{
    n = 2; count = 0;
    while (n <= NF) {
      int_max = int(max) + (rand() < (max - int(max)) ? 1 : 0);
      nmax = n + int_max; count++;
      printf("%s-%06x", $1, count);
      for (; n < nmax && n <= NF; n++) printf(" %s", $n);
      print "";
    }
  }' <"$srcdir/spk2utt" >"$destdir/spk2utt"

  utils/spk2utt_to_utt2spk.pl <"$destdir/spk2utt" >"$destdir/utt2spk"

  if [ -f "$srcdir/cmvn.scp" ]; then
    # below, the first apply_map command outputs a cmvn.scp indexed by utt;
    # the second one outputs a cmvn.scp indexed by new speaker-id.
    utils/apply_map.pl -f 2 "$srcdir/cmvn.scp" <"$srcdir/utt2spk" \
      | utils/apply_map.pl -f 1 "$destdir/utt2spk" \
      | sort | uniq >"$destdir/cmvn.scp"
    echo "$0: mapping cmvn.scp, but you may want to recompute it if it's needed,"
    echo " as it would probably change."
  fi

  if [ -f "$srcdir/spk2gender" ]; then
    # Same two-step mapping as for cmvn.scp: old speaker -> utterance -> new
    # speaker, then collapse the duplicate lines.
    utils/apply_map.pl -f 2 "$srcdir/spk2gender" <"$srcdir/utt2spk" \
      | utils/apply_map.pl -f 1 "$destdir/utt2spk" \
      | sort | uniq >"$destdir/spk2gender"
  fi
else
  # No per-speaker maximum: copy the speaker-level files unchanged.
  cp -- "$srcdir/spk2utt" "$srcdir/utt2spk" "$destdir/"
  if [ -f "$srcdir/spk2gender" ]; then
    cp -- "$srcdir/spk2gender" "$destdir/"
  fi
  if [ -f "$srcdir/cmvn.scp" ]; then
    cp -- "$srcdir/cmvn.scp" "$destdir/"
  fi
fi
# Copy each of the remaining data files that exists in the source directory;
# files that are absent are skipped. Paths are quoted so directory names
# containing whitespace or glob characters are handled correctly.
for f in feats.scp segments wav.scp reco2file_and_channel text stm glm ctm; do
  if [ -f "$srcdir/$f" ]; then
    cp -- "$srcdir/$f" "$destdir/"
  fi
done
echo "$0: copied data from $srcdir to $destdir, with --utts-per-spk-max $utts_per_spk_max"

# Tell the validator not to require files that the source directory lacks.
# The flags are collected in an array so that each one reaches the validator
# as its own argument without relying on unquoted word splitting.
opts=()
if [ ! -f "$srcdir/feats.scp" ]; then opts+=(--no-feats); fi
if [ ! -f "$srcdir/text" ]; then opts+=(--no-text); fi
if [ ! -f "$srcdir/wav.scp" ]; then opts+=(--no-wav); fi

# Validate the newly written directory with the flags gathered above.
utils/validate_data_dir.sh "${opts[@]}" "$destdir"