Blame view

Scripts/utils/validate_data_dir.sh 6.75 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
  #!/bin/bash
  
  
  # Optional flags.  Each one is stored as the word "true" or "false" so that
  # later code can execute the variable directly as a command (e.g. `! $no_text`).
  no_feats=false
  no_wav=false
  no_text=false
  
  # Consume every leading option flag, in any order, and stop at the first
  # argument that is not a recognised flag.  "$1" is quoted so that running
  # out of arguments does not make the test itself fail with
  # "unary operator expected".
  while [ $# -gt 0 ]; do
    case "$1" in
      --no-feats) no_feats=true ;;
      --no-text)  no_text=true ;;
      --no-wav)   no_wav=true ;;
      *) break ;;
    esac
    shift
  done
  
  # Exactly one positional argument (the data directory) is expected at this
  # point.  Print the usage text and stop; continuing with a missing or extra
  # argument would only produce confusing errors further down.
  if [ $# -ne 1 ]; then
    echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] data-dir"
    echo "e.g.: $0 data/train"
    exit 1
  fi
  
  # Directory to validate.
  data=$1
  
  # Quoted so that an empty argument or a path containing whitespace is tested
  # as a single operand instead of silently changing the meaning of the test.
  if [ ! -d "$data" ]; then
    echo "$0: no such directory $data"
    exit 1
  fi
  
  # spk2utt and utt2spk are mandatory: each must exist as a regular file and
  # be non-empty.
  for f in spk2utt utt2spk; do
    if [ ! -f "$data/$f" ]; then
      echo "$0: no such file $f"
      exit 1
    fi
    if [ ! -s "$data/$f" ]; then
      echo "$0: empty file $f"
      exit 1
    fi
  done
  
  # Every line of utt2spk must have exactly two whitespace-separated fields.
  # awk stops with status 1 at the first offending line.  The script must then
  # exit with a non-zero status so that callers see the failure.
  if ! awk '{if (NF != 2) exit(1); }' "$data/utt2spk"; then
    echo "$0: $data/utt2spk has wrong format."
    exit 1
  fi
  
  # Scratch directory for the id lists that the checks below compare.
  tmpdir=$(mktemp -d) || { echo "$0: failed to create a temporary directory"; exit 1; }
  # Remove the scratch directory on every exit path.  The listed signals are
  # turned into a normal exit so that the EXIT trap runs and the script does
  # not carry on executing after its scratch directory has been removed.
  trap 'rm -rf -- "$tmpdir"' EXIT
  trap 'exit 1' HUP INT PIPE TERM
  
  # Use the C locale so that sort and comparisons work on plain byte order.
  export LC_ALL=C
  
  # check_sorted <file>
  #
  # Verifies that <file> is byte-identical to the output of `sort` run on it
  # (under whatever locale is in effect for the caller).
  #   $1 - path of the file to check
  # On a mismatch it prints a message to stdout and terminates the whole
  # script with status 1.  When the file is sorted it prints nothing and
  # returns 0.
  function check_sorted {
    if ! sort <"$1" | cmp -s - "$1"; then
      echo "$0: file $1 is not in sorted order"
      exit 1
    fi
  }
  # partial_diff <file1> <file2>
  #
  # Prints an abbreviated comparison of two files to stdout: the first six
  # lines of `diff` output, a "..." separator, the last six lines of `diff`
  # output, and finally the line count of each file.
  #   $1 - first file
  #   $2 - second file
  function partial_diff {
    local n1 n2
    diff "$1" "$2" | head -n 6
    echo "..."
    diff "$1" "$2" | tail -n 6
    n1=$(wc -l <"$1")
    n2=$(wc -l <"$2")
    echo "[Lengths are $1=$n1 versus $2=$n2]"
  }
  
  # utt2spk must be sorted as whole lines ...
  check_sorted "$data/utt2spk"
  
  # ... and must also be unchanged when sorted with the key starting at the
  # second field (the speaker-id).
  if ! sort -k2 "$data/utt2spk" | cmp -s - "$data/utt2spk"; then
    echo "$0: utt2spk is not in sorted order when sorted first on speaker-id "
    echo "(fix this by making speaker-ids prefixes of utt-ids)"
    exit 1
  fi
  
  check_sorted "$data/spk2utt"
  
  # The two maps must agree: utt2spk (re-printed with single spaces) has to be
  # identical to the output of utils/spk2utt_to_utt2spk.pl run on spk2utt.
  if ! cmp -s <(awk '{print $1, $2;}' "$data/utt2spk") \
              <(utils/spk2utt_to_utt2spk.pl "$data/spk2utt"); then
    echo "$0: spk2utt and utt2spk do not seem to match"
    exit 1
  fi
  
  # Reference utterance-id list (first column of utt2spk); the checks on the
  # other files compare their own id lists against it.
  awk '{print $1;}' "$data/utt2spk" > "$tmpdir/utts"
  
  # The text file is mandatory unless --no-text was given.
  if [ ! -f "$data/text" ] && ! $no_text; then
    echo "$0: no such file $data/text (if this is by design, specify --no-text)"
    exit 1
  fi
  
  num_utts=$(wc -l <"$tmpdir/utts")
  # If text is present it is checked even when --no-text was given: it must be
  # sorted and list exactly the same utterance-ids, in the same order, as
  # utt2spk.
  if [ -f "$data/text" ]; then
    check_sorted "$data/text"
    awk '{print $1}' <"$data/text" > "$tmpdir/utts.txt"
    if ! cmp -s "$tmpdir/utts" "$tmpdir/utts.txt"; then
      echo "$0: Error: in $data, utterance lists extracted from utt2spk and text"
      echo "$0: differ, partial diff is:"
      partial_diff "$tmpdir/utts" "$tmpdir/utts.txt"
      exit 1
    fi
  fi
  
  # A segments file without a wav.scp is always an error.
  if [ -f "$data/segments" ] && [ ! -f "$data/wav.scp" ]; then
    echo "$0: in directory $data, segments file exists but no wav.scp"
    exit 1
  fi
  
  
  # wav.scp is mandatory unless --no-wav was given.
  if [ ! -f "$data/wav.scp" ] && ! $no_wav; then
    echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)"
    exit 1
  fi
  
  # Checks on wav.scp and on the files tied to it (segments,
  # reco2file_and_channel).  Skipped entirely when wav.scp is absent.
  if [ -f "$data/wav.scp" ]; then
    check_sorted "$data/wav.scp"
  
    if [ -f "$data/segments" ]; then
  
      check_sorted "$data/segments"
      # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids.
      # Each segments line needs four fields with the fourth (end) greater
      # than the third (start).
      if ! awk '{if (NF != 4 || !($4 > $3)) { print "Bad line in segments file", $0; exit(1); }}' \
           "$data/segments"; then
        echo "$0: badly formatted segments file"
        exit 1
      fi
  
      # The utterance-ids in the first column of segments must be exactly
      # those of utt2spk.  A mismatch is a validation failure.
      segments_len=$(wc -l <"$data/segments")
      if ! cmp -s "$tmpdir/utts" <(awk '{print $1}' "$data/segments"); then
        echo "$0: Utterance list differs between $data/utt2spk and $data/segments "
        echo "$0: Lengths are $num_utts vs $segments_len"
        exit 1
      fi
  
      # The set of recording-ids referenced by segments (second column) must
      # equal the list of keys in wav.scp.
      awk '{print $2}' "$data/segments" | sort | uniq > "$tmpdir/recordings"
      awk '{print $1}' "$data/wav.scp" > "$tmpdir/recordings.wav"
      if ! cmp -s "$tmpdir/recordings" "$tmpdir/recordings.wav"; then
        echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp"
        echo "$0: differ, partial diff is:"
        partial_diff "$tmpdir/recordings" "$tmpdir/recordings.wav"
        exit 1
      fi
      if [ -f "$data/reco2file_and_channel" ]; then
        # this file is needed only for ctm scoring; it's indexed by recording-id.
        check_sorted "$data/reco2file_and_channel"
        # Three fields per line, the third being "A" or "B".
        if ! awk '{if (NF != 3 || ($3 != "A" && $3 != "B")) { print "Bad line ", $0; exit 1; }}' \
             "$data/reco2file_and_channel"; then
          echo "$0: badly formatted reco2file_and_channel file"
          exit 1
        fi
        awk '{print $1}' "$data/reco2file_and_channel" > "$tmpdir/recordings.r2fc"
        if ! cmp -s "$tmpdir/recordings" "$tmpdir/recordings.r2fc"; then
          echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel"
          echo "$0: differ, partial diff is:"
          partial_diff "$tmpdir/recordings" "$tmpdir/recordings.r2fc"
          exit 1
        fi
      fi
    else
      # No segments file -> assume wav.scp indexed by utterance.
      awk '{print $1}' "$data/wav.scp" > "$tmpdir/utts.wav"
      if ! cmp -s "$tmpdir/utts" "$tmpdir/utts.wav"; then
        echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp"
        echo "$0: differ, partial diff is:"
        partial_diff "$tmpdir/utts" "$tmpdir/utts.wav"
        exit 1
      fi
  
      if [ -f "$data/reco2file_and_channel" ]; then
        # this file is needed only for ctm scoring; it's indexed by recording-id.
        # In this branch its keys are compared against the utt2spk id list.
        check_sorted "$data/reco2file_and_channel"
        if ! awk '{if (NF != 3 || ($3 != "A" && $3 != "B")) { print "Bad line ", $0; exit 1; }}' \
             "$data/reco2file_and_channel"; then
          echo "$0: badly formatted reco2file_and_channel file"
          exit 1
        fi
        awk '{print $1}' "$data/reco2file_and_channel" > "$tmpdir/utts.r2fc"
        if ! cmp -s "$tmpdir/utts" "$tmpdir/utts.r2fc"; then
          echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel"
          echo "$0: differ, partial diff is:"
          partial_diff "$tmpdir/utts" "$tmpdir/utts.r2fc"
          exit 1
        fi
      fi
    fi
  fi
  
  # feats.scp is mandatory unless --no-feats was given.
  if [ ! -f "$data/feats.scp" ] && ! $no_feats; then
    echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)"
    exit 1
  fi
  
  # If feats.scp is present it must be sorted and keyed by exactly the
  # utterance-ids of utt2spk, in the same order.
  if [ -f "$data/feats.scp" ]; then
    check_sorted "$data/feats.scp"
    awk '{print $1}' "$data/feats.scp" > "$tmpdir/utts.feats"
    if ! cmp -s "$tmpdir/utts" "$tmpdir/utts.feats"; then
      echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features"
      echo "$0: differ, partial diff is:"
      partial_diff "$tmpdir/utts" "$tmpdir/utts.feats"
      exit 1
    fi
  fi
  
  # cmvn.scp is optional.  When present it must be sorted and its keys must
  # be exactly the speaker-ids in the first column of spk2utt.
  if [ -f "$data/cmvn.scp" ]; then
    check_sorted "$data/cmvn.scp"
    awk '{print $1}' "$data/cmvn.scp" > "$tmpdir/speakers.cmvn"
    awk '{print $1}' "$data/spk2utt" > "$tmpdir/speakers"
    if ! cmp -s "$tmpdir/speakers" "$tmpdir/speakers.cmvn"; then
      echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn"
      echo "$0: differ, partial diff is:"
      partial_diff "$tmpdir/speakers" "$tmpdir/speakers.cmvn"
      exit 1
    fi
  fi
  
  # spk2gender is optional.  When present it must be sorted, every line must
  # be "<speaker-id> m" or "<speaker-id> f", and its speaker-ids must be
  # exactly those in the first column of spk2utt.
  if [ -f "$data/spk2gender" ]; then
    check_sorted "$data/spk2gender"
    if ! awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' "$data/spk2gender"; then
      echo "Mal-formed spk2gender file"
      exit 1
    fi
    awk '{print $1}' "$data/spk2gender" > "$tmpdir/speakers.spk2gender"
    awk '{print $1}' "$data/spk2utt" > "$tmpdir/speakers"
    if ! cmp -s "$tmpdir/speakers" "$tmpdir/speakers.spk2gender"; then
      echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender"
      echo "$0: differ, partial diff is:"
      partial_diff "$tmpdir/speakers" "$tmpdir/speakers.spk2gender"
      exit 1
    fi
  fi
  
  # Reached only if none of the checks above exited.
  echo "Successfully validated data-directory $data"