subset_ali_dir.sh
1.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/bin/bash
# Copyright 2017 Vimal Manohar
# Apache 2.0.
# Default job launcher. The usage text advertises a --cmd option for replacing
# it; that option is presumably applied by parse_options.sh below, which is why
# the default has to be assigned before that file is sourced.
cmd=run.pl

# Source the local environment setup when the working directory provides one.
if [ -f ./path.sh ]; then . ./path.sh; fi

# Stop right away if the option parser is missing or reports a failure, rather
# than carrying on with options that were never parsed.
. ./utils/parse_options.sh || exit 1
# Write the help text to stdout.
print_usage() {
  cat <<EOF
This script creates an alignment directory containing a subset of
utterances contained in <subset-data-dir> from the
original alignment directory containing alignments for utterances in
<full-data-dir>.
The number of split jobs in the output alignment directory is
equal to the number of jobs in the original alignment directory,
unless the subset data directory has too few speakers.
Usage: $0 [options] <full-data-dir> <subset-data-dir> <ali-dir> <subset-ali-dir>
e.g.: $0 data/train_sp data/train exp/tri3_ali_sp exp/tri3_ali
Options:
--cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
EOF
}

# Exactly four positional arguments are required; any other count gets the
# help text and a failure status.
[ $# -eq 4 ] || { print_usage; exit 1; }
# Positional arguments, in the order given by the usage text.
data=$1          # <full-data-dir>
subset_data=$2   # <subset-data-dir>
ali_dir=$3       # <ali-dir>
dir=$4           # <subset-ali-dir>

# The source alignment directory records how many ali.JOB.gz archives it holds.
nj=$(cat "$ali_dir/num_jobs") || exit 1
nj=${nj//[[:space:]]/}
if ! [[ $nj =~ ^[1-9][0-9]*$ ]]; then
  echo "$0: expected a positive integer in $ali_dir/num_jobs, got '$nj'" >&2
  exit 1
fi

# Nothing below reads the resulting split of the full data directory, so a
# failure here is reported but deliberately not treated as fatal.
utils/split_data.sh "$data" "$nj" ||
  echo "$0: warning: utils/split_data.sh failed for $data" >&2

mkdir -p "$dir" || exit 1

# Best-effort copy of the model files: not every alignment directory has all
# of them, so missing ones must not abort the script.
cp "$ali_dir"/{final.mdl,*.mat,*_opts,tree} "$dir"/ || true
cp -r "$ali_dir/phones" "$dir" 2>/dev/null || true

# Stage 1: rewrite every source archive as an uncompressed ark plus an scp
# index so that individual utterances can be looked up by key afterwards.
# $cmd is intentionally unquoted: it may carry launcher options of its own.
# shellcheck disable=SC2086
$cmd JOB=1:$nj "$dir/log/copy_alignments.JOB.log" \
  copy-int-vector "ark:gunzip -c $ali_dir/ali.JOB.gz |" \
  "ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp" || exit 1

# Merge the per-job indexes into a single index over all alignments.
for ((n = 1; n <= nj; n++)); do
  cat "$dir/ali_tmp.$n.scp" || exit 1
done > "$dir/ali_tmp.scp"

# The subset cannot be split into more jobs than it has speakers.
num_spk=$(wc -l < "$subset_data/spk2utt") || exit 1
num_spk=$((num_spk))   # normalise any padding some wc implementations emit
if ((num_spk < 1)); then
  echo "$0: $subset_data/spk2utt lists no speakers" >&2
  exit 1
fi
if ((num_spk < nj)); then
  nj=$num_spk
fi

# The per-job utt2spk files produced here drive the filtering stage, so this
# split has to succeed.
utils/split_data.sh "$subset_data" "$nj" || exit 1

# Stage 2: for each job, select from the merged index the entries listed in
# that job's utt2spk (via filter_scp.pl) and write them out gzip-compressed.
# shellcheck disable=SC2086
$cmd JOB=1:$nj "$dir/log/filter_alignments.JOB.log" \
  copy-int-vector \
  "scp:utils/filter_scp.pl $subset_data/split${nj}/JOB/utt2spk $dir/ali_tmp.scp |" \
  "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1

# Record the (possibly reduced) job count next to the new archives.
echo "$nj" > "$dir/num_jobs"

# Remove the intermediate archives and indexes from stage 1.
rm "$dir"/ali_tmp.*.{ark,scp} "$dir/ali_tmp.scp"
exit 0