validate_targets_dir.sh
2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/bin/bash
# Copyright 2017 Vimal Manohar
# Apache 2.0
# This script validates a 'targets_dir' as created by lats_to_targets.sh.
# See that script for details about the format of the targets.
# Source ./path.sh from the current directory when it exists
# (presumably puts the Kaldi binaries used below on PATH -- confirm).
[ -f ./path.sh ] && . ./path.sh

# Exactly two positional arguments are required; anything else prints the
# usage text and exits with status 1.
if [ $# -ne 2 ]; then
  cat <<EOF
This script validates a 'targets_dir' as created by lats_to_targets.sh.
See that script for details about the format of the targets.
Usage: steps/segmentation/validate_targets_dir.sh <targets-dir> <data-dir>
e.g.: steps/segmentation/validate_targets_dir.sh \
exp/segmentation1a/tri3b_train_split10s_targets \
data/train_split10s
EOF
  exit 1
fi

targets_dir=$1
data=$2

# Scratch directory for the intermediate length files; abort if it cannot be
# created so later steps never write to an empty path.
tmpdir=$(mktemp -d /tmp/kaldi.XXXX) || {
  echo "$0: Could not create a temporary directory" >&2
  exit 1
}

# Remove the scratch directory on every exit path.  Signals are converted
# into a normal exit (which in turn fires the EXIT trap); a signal handler
# that only ran 'rm -rf' would let the script continue after Ctrl-C with its
# scratch directory already deleted.
trap 'rm -rf "$tmpdir"' EXIT
trap 'exit 1' HUP INT PIPE TERM

# Byte-wise collation so that sort/cmp behave the same on every machine.
export LC_ALL=C
#######################################
# Verify that the first column of a file is sorted and free of duplicates.
# Arguments: $1 - path of the file to check (whitespace-separated columns)
# Outputs:   prints a diagnostic to stdout when the check fails
# Returns:   0 when the keys are sorted and unique; otherwise the whole
#            script is terminated with exit status 1
#######################################
check_sorted_and_uniq() {
  local file=$1
  # Compare the keys as they appear in the file against a sorted,
  # de-duplicated copy; any difference means disorder or repeated keys.
  # LC_ALL=C pins byte-wise ordering regardless of the caller's locale.
  if ! awk '{print $1}' "$file" | LC_ALL=C sort -u \
      | cmp -s - <(awk '{print $1}' "$file"); then
    echo "$0: file $1 is not in sorted order or has duplicates"
    exit 1
  fi
  return 0
}
# Both the targets index and the data directory's utt2spk must be present
# before anything else is attempted.  Paths are quoted so directories
# containing whitespace are handled as a single word.
for f in "$targets_dir/targets.scp" "$data/utt2spk"; do
  if [ ! -f "$f" ]; then
    echo "$0: Could not find $f"
    exit 1
  fi
done

# Delegate the generic data-directory checks to the shared validator, with
# the text, wav and speaker-sort checks switched off.
utils/data/validate_data_dir.sh --no-text --no-wav --no-spk-sort \
  "$data" || exit 1

# targets.scp must be sorted on its first column with no repeated keys.
check_sorted_and_uniq "$targets_dir/targets.scp"
# Count utterances in the data directory (nu) and entries in targets.scp (nt).
# Reading through a redirection (rather than 'cat file | wc -l') makes an
# unreadable file fail the assignment, so the '|| exit 1' can actually fire.
nu=$(wc -l < "$data/utt2spk") || exit 1
nt=$(wc -l < "$targets_dir/targets.scp") || exit 1

# Any difference is worth a warning ...
if (( nt != nu )); then
  echo "WARNING: It seems not all of the targets files were successfully created in "
  echo "$targets_dir/targets.scp for $data ($nt != $nu)."
fi

# ... but only fail when fewer than nu - nu/20 (integer division) targets
# exist, i.e. roughly less than 95% of the utterances.
if (( nt < nu - nu / 20 )); then
  echo "Less than 95% the targets were successfully generated. Probably a serious error."
  exit 1
fi
# Spot-check lengths on the first 100 entries of targets.scp only.
# feat-to-len writes one "<utt> <num-frames>" line per entry (the format the
# Python check below parses).
head -n 100 "$targets_dir/targets.scp" | sort -k1,1 | \
  feat-to-len scp:- "ark,t:$tmpdir/len.targets" || exit 1

# Same for the matching feature entries; filter_scp.pl presumably keeps only
# the feats.scp lines whose key appears in len.targets -- confirm.
utils/filter_scp.pl "$tmpdir/len.targets" "$data/feats.scp" | sort -k1,1 | \
  feat-to-len scp:- "ark,t:$tmpdir/len.feats" || exit 1

# Targets may be at a lower frame rate than the features; the factor
# defaults to 1 unless the targets directory provides one.
frame_subsampling_factor=1
if [ -f "$targets_dir/frame_subsampling_factor" ]; then
  frame_subsampling_factor=$(cat "$targets_dir/frame_subsampling_factor") || exit 1
fi

# Put target and feature lengths side by side ("utt len utt len") and compare.
# The factor is handed to Python as an argument instead of being spliced into
# the program text, so the file content can never be executed as code.
utils/filter_scp.pl "$tmpdir/len.feats" "$tmpdir/len.targets" | \
  paste -d ' ' - "$tmpdir/len.feats" | python -c '
import sys

factor = float(sys.argv[1])
for line in sys.stdin:
    parts = line.strip().split()
    # Skip rows where the two pasted files are not aligned on the same key.
    if parts[0] != parts[2]:
        continue
    len_target = int(parts[1])
    len_feats = int(float(parts[3]) / factor)
    diff = abs(len_target - len_feats)
    # A difference of up to 3 frames is tolerated.
    if diff > 3:
        sys.stderr.write("Mismatch in length for utterance {utt} between "
                         "targets and feats: {0} vs {1}; diff={2}\n".format(
                             len_target, len_feats, diff, utt=parts[0]))
        sys.exit(1)
' "$frame_subsampling_factor" || exit 1

echo "$0: Successfully validated data-directory $data"