Blame view
egs/wsj/s5/steps/segmentation/validate_targets_dir.sh
2.63 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
#!/bin/bash

# Copyright 2017  Vimal Manohar
# Apache 2.0

# This script validates a 'targets_dir' as created by lats_to_targets.sh.
# See that script for details about the format of the targets.
#
# Checks performed, in order:
#   1. <targets-dir>/targets.scp and <data-dir>/utt2spk exist.
#   2. <data-dir> passes utils/data/validate_data_dir.sh.
#   3. The first column of targets.scp is sorted and free of duplicates.
#   4. targets.scp has at least ~95% as many lines as utt2spk
#      (a warning is printed whenever the two counts differ at all).
#   5. For the first 100 entries of targets.scp, the length reported by
#      feat-to-len for the targets is within 3 of the feats length divided
#      by the frame subsampling factor (read from
#      <targets-dir>/frame_subsampling_factor; defaults to 1).
#
# Exits with status 0 if all checks pass and 1 otherwise.

[ -f ./path.sh ] && . ./path.sh

if [ $# -ne 2 ]; then
  # Quoted delimiter: the usage text is printed literally, so the line
  # continuations in the example stay visible to the reader.
  cat <<'EOF'
  This script validates a 'targets_dir' as created by lats_to_targets.sh.
  See that script for details about the format of the targets.

  Usage: steps/segmentation/validate_targets_dir.sh <targets-dir> <data-dir>
   e.g.: steps/segmentation/validate_targets_dir.sh \
           exp/segmentation1a/tri3b_train_split10s_targets \
           data/train_split10s
EOF
  exit 1
fi

targets_dir=$1
data=$2

# Stop right away if the scratch directory cannot be created; otherwise
# $tmpdir would be empty and the length archives below would be written
# to the filesystem root.
tmpdir=$(mktemp -d /tmp/kaldi.XXXX) || {
  echo "$0: Failed to create a temporary directory"
  exit 1
}
trap 'rm -rf -- "$tmpdir"' EXIT HUP INT PIPE TERM

# Byte-wise collation so that 'sort' below is locale-independent.
export LC_ALL=C

# Exits the script with status 1 if the first column of the file given as $1
# is not in sorted order or contains duplicates.
# Arguments: $1 - path of the file to check
function check_sorted_and_uniq {
  local file=$1
  # The sorted, de-duplicated key list equals the key list as it appears in
  # the file only when the file is already sorted and has no repeated keys.
  if ! awk '{print $1}' "$file" | sort | uniq | \
      cmp -s - <(awk '{print $1}' "$file"); then
    echo "$0: file $file is not in sorted order or has duplicates"
    exit 1
  fi
}

for f in "$targets_dir/targets.scp" "$data/utt2spk"; do
  if [ ! -f "$f" ]; then
    echo "$0: Could not find $f"
    exit 1
  fi
done

utils/data/validate_data_dir.sh --no-text --no-wav --no-spk-sort \
  "$data" || exit 1

check_sorted_and_uniq "$targets_dir/targets.scp"

# Redirect straight into wc so that a read failure is reflected in the exit
# status of the command substitution.
nu=$(wc -l < "$data/utt2spk") || exit 1
nt=$(wc -l < "$targets_dir/targets.scp") || exit 1

if (( nt != nu )); then
  echo "WARNING: It seems not all of the targets files were successfully created in "
  echo "$targets_dir/targets.scp for $data ($nt != $nu)."
fi

# nu - nu/20 is (with integer division) 95% of the number of utterances.
if (( nt < nu - nu / 20 )); then
  echo "Less than 95% the targets were successfully generated. Probably a serious error."
  exit 1
fi

# Only the first 100 targets are length-checked.
head -n 100 "$targets_dir/targets.scp" | sort -k1,1 | \
  feat-to-len scp:- "ark,t:$tmpdir/len.targets" || exit 1
utils/filter_scp.pl "$tmpdir/len.targets" "$data/feats.scp" | sort -k1,1 | \
  feat-to-len scp:- "ark,t:$tmpdir/len.feats" || exit 1

frame_subsampling_factor=1
if [ -f "$targets_dir/frame_subsampling_factor" ]; then
  frame_subsampling_factor=$(cat "$targets_dir/frame_subsampling_factor") || exit 1
fi

# Each line fed to Python is "<utt> <len-targets> <utt> <len-feats>".
# The subsampling factor is handed over as an argument instead of being
# spliced into the Python source, so unexpected file contents cannot be
# executed as code and are reported with a clear message.
utils/filter_scp.pl "$tmpdir/len.feats" "$tmpdir/len.targets" | \
  paste -d ' ' - "$tmpdir/len.feats" | python -c '
import sys

try:
    factor = float(sys.argv[1])
except ValueError:
    sys.exit("Invalid frame_subsampling_factor: {0}".format(sys.argv[1]))
if not factor > 0:
    sys.exit("frame_subsampling_factor must be positive; got {0}".format(
        sys.argv[1]))

for line in sys.stdin:
    parts = line.strip().split()
    # Skip pairs whose utterance ids do not line up.
    if parts[0] != parts[2]:
        continue
    len_target = int(parts[1])
    len_feats = int(float(parts[3]) / factor)
    diff = abs(len_target - len_feats)
    if diff > 3:
        sys.stderr.write("Mismatch in length for utterance {utt} between "
                         "targets and feats: {0} vs {1}; diff={2}\n".format(
                             len_target, len_feats, diff, utt=parts[0]))
        sys.exit(1)
' "$frame_subsampling_factor" || exit 1

echo "$0: Successfully validated data-directory $data"