Blame view

egs/wsj/s5/utils/fix_data_dir.sh 6.29 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
  #!/bin/bash

  # This script makes sure that only the utterances present in all of
  # "feats.scp", "wav.scp" [if present], "segments" [if present], "text"
  # and "utt2spk" are kept in each of them.
  # It puts copies of the data-dir files, taken before they are modified,
  # into data-dir/.backup

  # Remember the complete original command line (options included) so it can
  # be forwarded to image/fix_data_dir.sh below.
  cmd="$@"

  # Whitespace-separated lists of additional file names (relative to the data
  # directory) that are filtered together with the standard utterance-indexed
  # and speaker-indexed files.  Presumably these can be overridden from the
  # command line through utils/parse_options.sh -- confirm against that script.
  utt_extra_files=
  spk_extra_files=

  . utils/parse_options.sh

  if [ $# != 1 ]; then
    echo "Usage: utils/data/fix_data_dir.sh <data-dir>"
    echo "e.g.: utils/data/fix_data_dir.sh data/train"
    echo "This script helps ensure that the various files in a data directory"
    echo "are correctly sorted and filtered, for example removing utterances"
    echo "that have no features (if feats.scp is present)"
    exit 1
  fi

  data=$1

  # Directories that contain images.scp are handed over to image/fix_data_dir.sh.
  if [ -f "$data/images.scp" ]; then
    # $cmd is deliberately unquoted: it holds the original argument list and
    # must be split back into separate words.
    image/fix_data_dir.sh $cmd
    exit $?
  fi

  # Validate the input BEFORE creating anything: "mkdir -p $data/.backup" would
  # otherwise create a missing data directory, and the directory check could
  # never fail.
  if [ ! -d "$data" ]; then
    echo "$0: no such directory $data"
    exit 1
  fi

  if [ ! -f "$data/utt2spk" ]; then
    echo "$0: no such file $data/utt2spk"
    exit 1
  fi

  mkdir -p "$data/.backup" || exit 1

  set -e -o pipefail -u

  # Scratch directory for intermediate lists; removed when the script exits or
  # receives one of the listed signals.
  tmpdir=$(mktemp -d /tmp/kaldi.XXXX)
  trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM

  # Byte-wise collation, so that sort and the sortedness checks below agree
  # regardless of the user's locale.
  export LC_ALL=C
  
  # check_sorted <file>
  #
  # Makes sure <file> is sorted on its first field and contains at most one
  # line per first-field key.  If it is not, a message is printed and the file
  # is replaced by the sorted, de-duplicated version; otherwise it is left
  # untouched.  A scratch file "<file>.tmp" is used and removed/renamed before
  # returning.
  function check_sorted {
    local file=$1
    local sorted=$file.tmp

    # -k1,1 -u: sort on the first field only and keep one line per key.
    sort -k1,1 -u <"$file" >"$sorted"
    if ! cmp -s "$file" "$sorted"; then
      echo "$0: file $1 is not in sorted order or not unique, sorting it"
      mv "$sorted" "$file"
    else
      rm "$sorted"
    fi
  }
  
  # Copy every standard data file that exists into $data/.backup, then make
  # sure it is sorted with unique keys before any filtering is done.
  # Paths are quoted so a data directory containing whitespace works.
  for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \
      reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do
    if [ -f "$data/$x" ]; then
      cp "$data/$x" "$data/.backup/$x"
      check_sorted "$data/$x"
    fi
  done
  
  
  # filter_file <filter> <file_to_filter>
  #
  # Replaces the contents of <file_to_filter> with the output of
  #   utils/filter_scp.pl <filter> <file_to_filter>
  # (presumably: the lines whose first field is listed in <filter> -- see
  # utils/filter_scp.pl).  Prints a message when the number of lines changes.
  #
  # The filtered output is written to "<file_to_filter>.tmp" first and only
  # copied over the target once filter_scp.pl has succeeded, so a failing
  # filter can no longer leave the target truncated.
  #
  # Returns non-zero (target untouched, scratch file removed) if
  # utils/filter_scp.pl fails.
  function filter_file {
    local filter=$1
    local file_to_filter=$2
    local filtered=${file_to_filter}.tmp
    local length1 length2

    if ! utils/filter_scp.pl "$filter" "$file_to_filter" > "$filtered"; then
      echo "$0: failed to filter $file_to_filter based on filter $filter" >&2
      rm -f "$filtered"
      return 1
    fi

    if ! cmp -s "$file_to_filter" "$filtered"; then
      length1=$(wc -l < "$file_to_filter")
      length2=$(wc -l < "$filtered")
      if [ "$length1" -ne "$length2" ]; then
        echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter."
      fi
      # Overwrite in place (instead of renaming the scratch file over the
      # target) so the target keeps its permissions and, if it is a symlink,
      # is still written through, as with the original redirection.
      cat "$filtered" > "$file_to_filter"
    fi
    rm "$filtered"
  }
  
  # filter_recordings
  #
  # Only does something when $data/segments exists.  In that case it makes
  # segments, wav.scp and (if present) reco2file_and_channel and reco2dur agree
  # on the same set of recording-ids: the recording-ids found in the second
  # column of segments, restricted via utils/filter_scp.pl to those in wav.scp.
  # Exits the script if wav.scp is missing or segments yields no recording-ids.
  #
  # Reads the globals $data and $tmpdir; uses $tmpdir for scratch files.
  function filter_recordings {
    # We call this once before the stage when we filter on utterance-id, and once
    # after.
    local reco_file

    # No segments file -> nothing is indexed by recording-id here.
    [ -f "$data/segments" ] || return 0

    # We have a segments file -> we need to filter this and the file wav.scp, and
    # reco2file_and_channel, if it exists, to make sure they have the same list of
    # recording-ids.
    if [ ! -f "$data/wav.scp" ]; then
      echo "$0: $data/segments exists but not $data/wav.scp"
      exit 1
    fi

    awk '{print $2}' < "$data/segments" | sort -u > "$tmpdir/recordings"
    if [ ! -s "$tmpdir/recordings" ]; then
      echo "Empty list of recordings (bad file $data/segments)?"
      exit 1
    fi
    utils/filter_scp.pl "$data/wav.scp" "$tmpdir/recordings" > "$tmpdir/recordings.tmp"
    mv "$tmpdir/recordings.tmp" "$tmpdir/recordings"

    # filter_file is used to filter on the first field, but the recording-id is
    # the second field of segments: swap the first two fields, filter, and swap
    # them back.  (Only the first four fields of each line are kept.)
    cp "$data/segments" "$tmpdir/segments.swap"
    awk '{print $2, $1, $3, $4}' < "$tmpdir/segments.swap" > "$data/segments"
    filter_file "$tmpdir/recordings" "$data/segments"
    cp "$data/segments" "$tmpdir/segments.swap"
    awk '{print $2, $1, $3, $4}' < "$tmpdir/segments.swap" > "$data/segments"
    rm "$tmpdir/segments.swap"

    filter_file "$tmpdir/recordings" "$data/wav.scp"
    for reco_file in reco2file_and_channel reco2dur; do
      if [ -f "$data/$reco_file" ]; then
        filter_file "$tmpdir/recordings" "$data/$reco_file"
      fi
    done
  }
  
  # filter_speakers
  #
  # Regenerates spk2utt from utt2spk, restricts the set of speakers to those
  # that also appear in cmvn.scp and spk2gender (when those files exist), and
  # then filters spk2utt, utt2spk, cmvn.scp, spk2gender and any files named in
  # $spk_extra_files down to that set of speakers.
  #
  # Reads the globals $data, $tmpdir and $spk_extra_files.
  function filter_speakers {
    local name path

    # throughout this program, we regard utt2spk as primary and spk2utt as derived, so...
    utils/utt2spk_to_spk2utt.pl "$data/utt2spk" > "$data/spk2utt"

    awk '{print $1}' < "$data/spk2utt" > "$tmpdir/speakers"

    # Note the argument order: here the speaker LIST is what gets filtered, by
    # each speaker-indexed file in turn.  Files from $spk_extra_files are not
    # used to restrict the speaker list.
    for name in cmvn.scp spk2gender; do
      path=$data/$name
      if [ -f "$path" ]; then
        filter_file "$path" "$tmpdir/speakers"
      fi
    done

    filter_file "$tmpdir/speakers" "$data/spk2utt"
    utils/spk2utt_to_utt2spk.pl "$data/spk2utt" > "$data/utt2spk"

    # Now filter the speaker-indexed files themselves.  $spk_extra_files is
    # left unquoted on purpose: it is a whitespace-separated list of names.
    for name in cmvn.scp spk2gender $spk_extra_files; do
      path=$data/$name
      if [ -f "$path" ]; then
        filter_file "$tmpdir/speakers" "$path"
      fi
    done
  }
  
  # filter_utts
  #
  # Computes the list of utterance-ids that are present in utt2spk and in every
  # one of feats.scp, text, segments, utt2lang, wav.scp (only when there is no
  # segments file) and utt2dur (only entries with exactly two fields and a
  # positive second field) that exists, then filters all utterance-indexed
  # files, including those named in $utt_extra_files, down to that list.
  #
  # Exits the script if utt2spk, spk2utt or utt2uniq are not sorted, or if no
  # utterances remain.
  #
  # Reads the globals $data, $tmpdir and $utt_extra_files.
  function filter_utts {
    local x new_nutts old_nutts
    local maybe_wav=
    local maybe_reco2dur=

    awk '{print $1}' < "$data/utt2spk" > "$tmpdir/utts"

    if ! sort < "$data/utt2spk" | cmp - "$data/utt2spk"; then
      echo "utt2spk is not in sorted order (fix this yourself)"
      exit 1
    fi

    if ! sort -k2 < "$data/utt2spk" | cmp - "$data/utt2spk"; then
      echo "utt2spk is not in sorted order when sorted first on speaker-id "
      echo "(fix this by making speaker-ids prefixes of utt-ids)"
      exit 1
    fi

    if ! sort < "$data/spk2utt" | cmp - "$data/spk2utt"; then
      echo "spk2utt is not in sorted order (fix this yourself)"
      exit 1
    fi

    if [ -f "$data/utt2uniq" ]; then
      if ! sort < "$data/utt2uniq" | cmp - "$data/utt2uniq"; then
        echo "utt2uniq is not in sorted order (fix this yourself)"
        exit 1
      fi
    fi

    if [ ! -f "$data/segments" ]; then
      maybe_wav=wav.scp # wav indexed by utts only if segments does not exist.
      if [ -s "$data/reco2dur" ]; then
        maybe_reco2dur=reco2dur # reco2dur indexed by utts
      fi
    fi

    # Intersect the utterance list with every utterance-indexed file that exists.
    # $maybe_wav is empty or a fixed name, so leaving it unquoted is safe.
    for x in feats.scp text segments utt2lang $maybe_wav; do
      if [ -f "$data/$x" ]; then
        utils/filter_scp.pl "$data/$x" "$tmpdir/utts" > "$tmpdir/utts.tmp"
        mv "$tmpdir/utts.tmp" "$tmpdir/utts"
      fi
    done

    # Only utt2dur entries with exactly two fields and a positive second field
    # count.  The list of valid entries lives in $tmpdir, so nothing is left
    # behind in the data directory if the script stops early.
    if [ -f "$data/utt2dur" ]; then
      awk '{ if (NF == 2 && $2 > 0) { print }}' < "$data/utt2dur" \
        > "$tmpdir/utt2dur.ok" || exit 1
      utils/filter_scp.pl "$tmpdir/utt2dur.ok" "$tmpdir/utts" > "$tmpdir/utts.tmp"
      mv "$tmpdir/utts.tmp" "$tmpdir/utts"
      rm -f "$tmpdir/utt2dur.ok"
    fi

    if [ ! -s "$tmpdir/utts" ]; then
      echo "fix_data_dir.sh: no utterances remained: not proceeding further."
      rm -f "$tmpdir/utts"
      exit 1
    fi

    new_nutts=$(wc -l < "$tmpdir/utts")
    old_nutts=$(wc -l < "$data/utt2spk")
    if [ "$new_nutts" -ne "$old_nutts" ]; then
      echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts"
    else
      echo "fix_data_dir.sh: kept all $old_nutts utterances."
    fi

    # $utt_extra_files is left unquoted on purpose: it is a whitespace-separated
    # list of file names.
    for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do
      if [ -f "$data/$x" ]; then
        # NOTE(review): this replaces any copy of $x already in .backup with
        # the file's current contents; if the file was modified earlier in the
        # run, the backup no longer holds the original.
        cp "$data/$x" "$data/.backup/$x"
        # Filter once into a scratch file (so a failure of filter_scp.pl is
        # visible) and rewrite the data file only if something changed.
        utils/filter_scp.pl "$tmpdir/utts" "$data/.backup/$x" > "$tmpdir/filtered"
        if ! cmp -s "$data/$x" "$tmpdir/filtered"; then
          cat "$tmpdir/filtered" > "$data/$x"
        fi
        rm -f "$tmpdir/filtered"
      fi
    done

  }
  
  # Recording and speaker filtering is run both before and after the
  # utterance-level filtering (presumably so that recordings and speakers whose
  # utterances were all removed by filter_utts are dropped as well).
  filter_recordings
  filter_speakers
  filter_utts
  filter_speakers
  filter_recordings

  # utt2spk is treated as the primary file: regenerate spk2utt from its final
  # contents.  Paths are quoted so a data directory containing whitespace works.
  utils/utt2spk_to_spk2utt.pl "$data/utt2spk" > "$data/spk2utt"

  echo "fix_data_dir.sh: old files are kept in $data/.backup"