Blame view
egs/wsj/s5/utils/fix_data_dir.sh
6.29 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
#!/bin/bash

# This script makes sure that only the utterances present in all of
# "feats.scp", "wav.scp" [if present], "segments" [if present], "text" and
# "utt2spk" are present in any of them.  It also re-sorts (and de-duplicates)
# the standard files of the data directory.
# Copies of the files it touches are placed in <data-dir>/.backup.
#
# Must be run from a directory that contains utils/ (the helper scripts are
# invoked through relative paths).

# Space-joined copy of the original command line.  Kept as a plain variable for
# backward compatibility; the forwarding to image/fix_data_dir.sh below uses the
# array so that arguments containing whitespace are passed through intact.
cmd="$*"
original_args=("$@")

# Space-separated lists of additional per-utterance / per-speaker files (names
# relative to <data-dir>) that should be filtered too.
# NOTE(review): presumably these can be overridden on the command line through
# utils/parse_options.sh, which is not part of this file -- confirm there.
utt_extra_files=
spk_extra_files=

. utils/parse_options.sh

if [ $# != 1 ]; then
  echo "Usage: utils/data/fix_data_dir.sh <data-dir>"
  echo "e.g.: utils/data/fix_data_dir.sh data/train"
  echo "This script helps ensure that the various files in a data directory"
  echo "are correctly sorted and filtered, for example removing utterances"
  echo "that have no features (if feats.scp is present)"
  exit 1
fi

data=$1

# Validate the data directory *before* creating anything inside it; otherwise
# "mkdir -p" would silently create a mistyped directory and this check could
# never fire.
if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1
fi

# Image data directories are handled by a dedicated script; hand over the
# original command line unchanged and propagate its exit status.
if [ -f "$data/images.scp" ]; then
  image/fix_data_dir.sh "${original_args[@]}"
  exit $?
fi

if [ ! -f "$data/utt2spk" ]; then
  echo "$0: no such file $data/utt2spk"
  exit 1
fi

mkdir -p "$data/.backup" || exit 1

set -e -o pipefail -u

tmpdir=$(mktemp -d /tmp/kaldi.XXXX)
trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM

# Byte-wise collation, so that "sort" and "cmp" below agree on what sorted means.
export LC_ALL=C

# check_sorted <file>
# Rewrites <file> in place, sorted on its first field with duplicate keys
# removed, if it is not already in exactly that form.
function check_sorted {
  local file=$1
  sort -k1,1 -u <"$file" >"$file.tmp"
  if ! cmp -s "$file" "$file.tmp"; then
    echo "$0: file $1 is not in sorted order or not unique, sorting it"
    mv "$file.tmp" "$file"
  else
    rm "$file.tmp"
  fi
}

# Back up and normalise every standard file that is present.
for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \
    reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do
  if [ -f "$data/$x" ]; then
    cp "$data/$x" "$data/.backup/$x"
    check_sorted "$data/$x"
  fi
done

# filter_file <filter> <file-to-filter>
# Replaces <file-to-filter> by the output of
# "utils/filter_scp.pl <filter> <file-to-filter>" and reports when the number
# of lines changed.
# NOTE(review): filter_scp.pl is defined outside this file; presumably it keeps
# the lines whose first field is listed in <filter> -- confirm there.
function filter_file {
  local filter=$1
  local file_to_filter=$2
  local length1 length2
  cp "$file_to_filter" "${file_to_filter}.tmp"
  utils/filter_scp.pl "$filter" "${file_to_filter}.tmp" > "$file_to_filter"
  if ! cmp "${file_to_filter}.tmp" "$file_to_filter" >&/dev/null; then
    length1=$(wc -l < "${file_to_filter}.tmp")
    length2=$(wc -l < "$file_to_filter")
    if [ "$length1" -ne "$length2" ]; then
      echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter."
    fi
  fi
  rm "$file_to_filter.tmp"
}

# filter_recordings
# Makes segments, wav.scp, reco2file_and_channel and reco2dur agree on one list
# of recording-ids.  Does nothing when there is no segments file.
# We call this once before the stage when we filter on utterance-id, and once
# after.
function filter_recordings {
  [ -f "$data/segments" ] || return 0

  # We have a segments file -> we need to filter this and the file wav.scp, and
  # reco2file_and_channel, if it exists, to make sure they have the same list
  # of recording-ids.
  if [ ! -f "$data/wav.scp" ]; then
    echo "$0: $data/segments exists but not $data/wav.scp"
    exit 1
  fi

  # The recording-id is the second field of each segments line.
  awk '{print $2}' < "$data/segments" | sort -u > "$tmpdir/recordings"
  if [ ! -s "$tmpdir/recordings" ]; then
    echo "Empty list of recordings (bad file $data/segments)?"
    exit 1
  fi
  utils/filter_scp.pl "$data/wav.scp" "$tmpdir/recordings" > "$tmpdir/recordings.tmp"
  mv "$tmpdir/recordings.tmp" "$tmpdir/recordings"

  # Swap the first two fields so that the recording-id comes first, filter on
  # it, then swap the fields back.  filter_file creates and removes its own
  # segments.tmp, so the copy has to be re-made for the second swap.
  cp "$data/segments" "$data/segments.tmp"
  awk '{print $2, $1, $3, $4}' < "$data/segments.tmp" > "$data/segments"
  filter_file "$tmpdir/recordings" "$data/segments"
  cp "$data/segments" "$data/segments.tmp"
  awk '{print $2, $1, $3, $4}' < "$data/segments.tmp" > "$data/segments"
  rm "$data/segments.tmp"

  filter_file "$tmpdir/recordings" "$data/wav.scp"
  if [ -f "$data/reco2file_and_channel" ]; then
    filter_file "$tmpdir/recordings" "$data/reco2file_and_channel"
  fi
  if [ -f "$data/reco2dur" ]; then
    filter_file "$tmpdir/recordings" "$data/reco2dur"
  fi
}

# filter_speakers
# Regenerates spk2utt from utt2spk, restricts the speaker list using cmvn.scp
# and spk2gender (when present), and then filters spk2utt, utt2spk, cmvn.scp,
# spk2gender and any $spk_extra_files down to that speaker list.
function filter_speakers {
  local s f

  # throughout this program, we regard utt2spk as primary and spk2utt as derived, so...
  utils/utt2spk_to_spk2utt.pl "$data/utt2spk" > "$data/spk2utt"

  awk '{print $1}' < "$data/spk2utt" > "$tmpdir/speakers"

  # Here the per-speaker file is the *filter* and the speaker list is what gets
  # filtered.
  for s in cmvn.scp spk2gender; do
    f=$data/$s
    if [ -f "$f" ]; then
      filter_file "$f" "$tmpdir/speakers"
    fi
  done

  filter_file "$tmpdir/speakers" "$data/spk2utt"
  utils/spk2utt_to_utt2spk.pl "$data/spk2utt" > "$data/utt2spk"

  # $spk_extra_files is a space-separated list: word splitting is intended.
  # shellcheck disable=SC2086
  for s in cmvn.scp spk2gender $spk_extra_files; do
    f=$data/$s
    if [ -f "$f" ]; then
      filter_file "$tmpdir/speakers" "$f"
    fi
  done
}

# filter_utts
# Verifies that utt2spk, spk2utt and utt2uniq are sorted, computes the list of
# utterance-ids present in all per-utterance files, and filters those files
# (plus any $utt_extra_files) down to that list.  Exits with status 1 if a
# sort check fails or if no utterance remains.
function filter_utts {
  local x
  local maybe_wav=
  local maybe_reco2dur=
  local maybe_utt2dur=
  local new_nutts old_nutts

  awk '{print $1}' < "$data/utt2spk" > "$tmpdir/utts"

  if ! sort < "$data/utt2spk" | cmp - "$data/utt2spk"; then
    echo "utt2spk is not in sorted order (fix this yourself)"
    exit 1
  fi

  if ! sort -k2 < "$data/utt2spk" | cmp - "$data/utt2spk"; then
    echo "utt2spk is not in sorted order when sorted first on speaker-id "
    echo "(fix this by making speaker-ids prefixes of utt-ids)"
    exit 1
  fi

  if ! sort < "$data/spk2utt" | cmp - "$data/spk2utt"; then
    echo "spk2utt is not in sorted order (fix this yourself)"
    exit 1
  fi

  if [ -f "$data/utt2uniq" ]; then
    if ! sort < "$data/utt2uniq" | cmp - "$data/utt2uniq"; then
      echo "utt2uniq is not in sorted order (fix this yourself)"
      exit 1
    fi
  fi

  # wav indexed by utts only if segments does not exist.
  if [ ! -f "$data/segments" ]; then
    maybe_wav=wav.scp
  fi
  # reco2dur indexed by utts only if segments does not exist.
  if [ -s "$data/reco2dur" ] && [ ! -f "$data/segments" ]; then
    maybe_reco2dur=reco2dur
  fi

  # Only utterances with a well-formed, strictly positive duration are kept.
  if [ -f "$data/utt2dur" ]; then
    awk '{ if (NF == 2 && $2 > 0) { print }}' < "$data/utt2dur" > "$data/utt2dur.ok" || exit 1
    maybe_utt2dur=utt2dur.ok
  fi

  # Narrow the utterance list using each file that is present.
  # The $maybe_* variables may be empty: word splitting is intended.
  # shellcheck disable=SC2086
  for x in feats.scp text segments utt2lang $maybe_wav $maybe_utt2dur; do
    if [ -f "$data/$x" ]; then
      utils/filter_scp.pl "$data/$x" "$tmpdir/utts" > "$tmpdir/utts.tmp"
      mv "$tmpdir/utts.tmp" "$tmpdir/utts"
    fi
  done
  rm "$data/utt2dur.ok" 2>/dev/null || true

  if [ ! -s "$tmpdir/utts" ]; then
    echo "fix_data_dir.sh: no utterances remained: not proceeding further."
    rm "$tmpdir/utts"
    exit 1
  fi

  if [ -f "$data/utt2spk" ]; then
    new_nutts=$(wc -l < "$tmpdir/utts")
    old_nutts=$(wc -l < "$data/utt2spk")
    if [ "$new_nutts" -ne "$old_nutts" ]; then
      echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts"
    else
      echo "fix_data_dir.sh: kept all $old_nutts utterances."
    fi
  fi

  # Note: the copy below overwrites the backup taken at start-up for every file
  # that appears in both lists; the filtered file is regenerated from that copy.
  # $maybe_* may be empty and $utt_extra_files is a space-separated list: word
  # splitting is intended.
  # shellcheck disable=SC2086
  for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur \
      utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do
    if [ -f "$data/$x" ]; then
      cp "$data/$x" "$data/.backup/$x"
      if ! cmp -s "$data/$x" <( utils/filter_scp.pl "$tmpdir/utts" "$data/$x" ); then
        utils/filter_scp.pl "$tmpdir/utts" "$data/.backup/$x" > "$data/$x"
      fi
    fi
  done
}

# Recordings and speakers are filtered both before and after the utterance
# stage, since removing utterances can leave recordings/speakers without any.
filter_recordings
filter_speakers
filter_utts
filter_speakers
filter_recordings

utils/utt2spk_to_spk2utt.pl "$data/utt2spk" > "$data/spk2utt"

echo "fix_data_dir.sh: old files are kept in $data/.backup"