Blame view

egs/wsj/s5/steps/segmentation/validate_targets_dir.sh 2.63 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
  #!/bin/bash
  
  # Copyright 2017  Vimal Manohar
  # Apache 2.0
  
  # This script validates a 'targets_dir' as created by lats_to_targets.sh.
  # See that script for details about the format of the targets.
  
  # Source the environment setup file from the current directory when it exists.
  [ -f ./path.sh ] && . ./path.sh
  
  # Exactly two positional arguments are required; with any other count the
  # usage text below is printed and the script exits with status 1.
  if [ $# -ne 2 ]; then
    cat <<EOF
    This script validates a 'targets_dir' as created by lats_to_targets.sh.
    See that script for details about the format of the targets.
  
    Usage: steps/segmentation/validate_targets_dir.sh <targets-dir> <data-dir>
    e.g.: steps/segmentation/validate_targets_dir.sh \
      exp/segmentation1a/tri3b_train_split10s_targets \
      data/train_split10s
  EOF
    exit 1
  fi
  
  # targets_dir: directory that must contain targets.scp (and optionally a
  #              frame_subsampling_factor file).
  # data:        data directory that must contain utt2spk; its feats.scp is read
  #              for the length comparison later in the script.
  targets_dir=$1
  data=$2
  
  # Scratch directory that holds the intermediate length files written later in
  # this script. Abort right away if it cannot be created.
  tmpdir=$(mktemp -d /tmp/kaldi.XXXX) || {
    echo "$0: failed to create a temporary directory" >&2
    exit 1
  }
  # Remove the scratch directory on every exit path.
  trap 'rm -rf -- "$tmpdir"' EXIT
  # A signal handler that only cleaned up would let the script resume after the
  # signal with its scratch directory gone. Calling 'exit' stops the script and
  # triggers the EXIT trap above, which does the cleanup.
  trap 'exit 1' HUP INT PIPE TERM
  
  # Force byte-wise collation for the sort/compare tools used below.
  export LC_ALL=C
  
  # Checks that the first whitespace-separated column of a file is in sorted
  # order and contains no repeated keys.
  # Arguments: $1 - path of the file to check
  # Outputs:   a diagnostic on stdout when the check fails
  # Returns:   0 when the keys are sorted and unique; on failure the function
  #            calls 'exit 1', which terminates the calling shell.
  # The ordering used by 'sort' follows the current locale; this script exports
  # LC_ALL=C before the function is called.
  function check_sorted_and_uniq {
    local file=$1
    # The key column is sorted and unique exactly when de-duplicated sorting
    # leaves it unchanged.
    if ! awk '{print $1}' "$file" | sort -u | \
        cmp -s - <(awk '{print $1}' "$file"); then
      echo "$0: file $file is not in sorted order or has duplicates"
      exit 1
    fi
    return 0
  }
  
  # Both the targets index and the utterance-to-speaker map must exist.
  for f in "$targets_dir/targets.scp" "$data/utt2spk"; do
    if [ ! -f "$f" ]; then
      echo "$0: Could not find $f"
      exit 1
    fi
  done
  
  # Delegate the generic data-directory checks to the standard validator.
  utils/data/validate_data_dir.sh --no-text --no-wav --no-spk-sort \
    "$data" || exit 1
  
  # The helper terminates the script itself when the check fails.
  check_sorted_and_uniq "$targets_dir/targets.scp"
  
  # nu: number of utterances in the data directory.
  # nt: number of utterances that have targets.
  nu=$(wc -l < "$data/utt2spk") || exit 1
  nt=$(wc -l < "$targets_dir/targets.scp") || exit 1
  if [ "$nt" -ne "$nu" ]; then
    echo "WARNING: It seems not all of the targets files were successfully created in "
    echo "$targets_dir/targets.scp for $data ($nt != $nu)."
  fi
  
  # A small shortfall only produces the warning above; missing more than
  # nu/20 (integer division) of the utterances is treated as fatal.
  if [ "$nt" -lt $((nu - nu / 20)) ]; then
    echo "Less than 95% the targets were successfully generated.  Probably a serious error."
    exit 1
  fi
  
  # Only the first 100 entries of targets.scp are sampled for the length check.
  # len.targets and len.feats map utterance-id -> number of rows, as written
  # by feat-to-len, for the targets and for the matching input features.
  head -n 100 "$targets_dir/targets.scp" | sort -k1,1 | \
    feat-to-len scp:- "ark,t:$tmpdir/len.targets" || exit 1
  utils/filter_scp.pl "$tmpdir/len.targets" "$data/feats.scp" | sort -k1,1 | \
    feat-to-len scp:- "ark,t:$tmpdir/len.feats" || exit 1
  
  # The targets may be at a lower frame rate than the features; the factor
  # defaults to 1 when the targets directory does not specify one.
  frame_subsampling_factor=1
  if [ -f "$targets_dir/frame_subsampling_factor" ]; then
    frame_subsampling_factor=$(cat "$targets_dir/frame_subsampling_factor") || exit 1
  fi
  
  # Put each utterance's target length and feature length on one line
  # (utt len_target utt len_feats) and fail if they differ by more than 3 frames
  # after the feature length is divided by the subsampling factor.
  # The factor is passed as an argument rather than spliced into the Python
  # source, so its content can never be interpreted as code.
  utils/filter_scp.pl "$tmpdir/len.feats" "$tmpdir/len.targets" | \
    paste -d ' ' - "$tmpdir/len.feats" | python -c '
  import sys
  factor = float(sys.argv[1])
  for line in sys.stdin:
    parts = line.strip().split()
    # Skip lines where the two pasted files are not aligned on the same utterance.
    if parts[0] != parts[2]:
      continue
    len_target = int(parts[1])
    len_feats = int(float(parts[3]) / factor)
    diff = abs(len_target - len_feats)
    if diff > 3:
      sys.stderr.write("Mismatch in length for utterance {utt} between "
                       "targets and feats: {0} vs {1}; diff={2}\n".format(
                        len_target, len_feats, diff, utt=parts[0]))
      sys.exit(1)' "$frame_subsampling_factor" || exit 1
  
  echo "$0: Successfully validated data-directory $data"