Blame view
Scripts/utils/fix_data_dir.sh
5.26 KB
ec85f8892 first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
#!/bin/bash

# This script makes sure that only the segments present in
# all of "feats.scp", "wav.scp" [if present], segments [if present],
# text, and utt2spk are present in any of them.
# It puts the original contents of data-dir into
# data-dir/.backup
#
# Usage: fix_data_dir.sh data-dir
#
# Must be run from a directory that contains the utils/ helper scripts
# (utils/filter_scp.pl, utils/utt2spk_to_spk2utt.pl,
# utils/spk2utt_to_utt2spk.pl), since they are invoked by relative path.

if [ $# != 1 ]; then
  echo "Usage: fix_data_dir.sh data-dir"
  exit 1
fi

data=$1

# Validate the data directory BEFORE creating anything inside it; otherwise
# "mkdir -p" would silently create a mistyped directory and the existence
# check below could never fail.
[ ! -d "$data" ] && echo "$0: no such directory $data" && exit 1;
[ ! -f "$data/utt2spk" ] && echo "$0: no such file $data/utt2spk" && exit 1;

mkdir -p "$data/.backup" || exit 1

tmpdir=$(mktemp -d) || exit 1
# Clean up the scratch directory on every exit path.  Signals are turned into
# a real exit (which then fires the EXIT trap); a handler that only removed
# the directory would let the script keep running without its scratch space.
trap 'rm -rf "$tmpdir"' EXIT
trap 'exit 1' HUP INT PIPE TERM

# Byte-wise collation so that sort/cmp agree regardless of the user's locale.
export LC_ALL=C

# check_sorted <file>
# Ensures <file> is sorted on its first field with unique keys; if it is not,
# the file is replaced by its sorted/uniqued version and a notice is printed.
function check_sorted {
  local file=$1
  sort -k1,1 -u <"$file" >"$file.tmp"
  if ! cmp -s "$file" "$file.tmp"; then
    echo "$0: file $1 is not in sorted order or not unique, sorting it"
    mv "$file.tmp" "$file"
  else
    rm "$file.tmp"
  fi
}

# Back up the original of every known file, then normalise its ordering.
for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp reco2file_and_channel spk2gender; do
  if [ -f "$data/$x" ]; then
    cp "$data/$x" "$data/.backup/$x"
    check_sorted "$data/$x"
  fi
done

# filter_file <filter> <file_to_filter>
# Rewrites <file_to_filter> in place, keeping only the lines that
# utils/filter_scp.pl selects using <filter>.  Reports when the number of
# lines changed.  The scratch copy is removed afterwards so that no *.tmp
# files are left behind in the data directory.
function filter_file {
  local filter=$1
  local file_to_filter=$2
  local length1 length2
  cp "$file_to_filter" "${file_to_filter}.tmp"
  utils/filter_scp.pl "$filter" "${file_to_filter}.tmp" > "$file_to_filter"
  if ! cmp "${file_to_filter}.tmp" "$file_to_filter" >&/dev/null; then
    length1=$(wc -l <"${file_to_filter}.tmp")
    length2=$(wc -l <"$file_to_filter")
    if [ "$length1" -ne "$length2" ]; then
      echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter."
    fi
  fi
  rm -f "${file_to_filter}.tmp"
}

# filter_recordings
# We call this once before the stage when we filter on utterance-id, and once
# after.  Uses the globals $data and $tmpdir.
function filter_recordings {
  if [ -f "$data/segments" ]; then
    # We have a segments file -> we need to filter this and the file wav.scp,
    # and reco2file_and_channel, if it exists, to make sure they have the same
    # list of recording-ids.

    if [ ! -f "$data/wav.scp" ]; then
      echo "$0: $data/segments exists but not $data/wav.scp"
      exit 1;
    fi

    # Recording-ids referenced by segments (second field) ...
    awk '{print $2}' < "$data/segments" | sort | uniq > "$tmpdir/recordings"
    if [ ! -s "$tmpdir/recordings" ]; then
      echo "Empty list of recordings (bad file $data/segments)?"
      exit 1;
    fi
    # ... restricted to those that also appear in wav.scp.
    utils/filter_scp.pl "$data/wav.scp" "$tmpdir/recordings" > "$tmpdir/recordings.tmp"
    mv "$tmpdir/recordings.tmp" "$tmpdir/recordings"

    # filter_file filters on the first field, so temporarily swap the
    # utterance-id and recording-id columns, filter, then swap them back.
    # The intermediate copies live in $tmpdir rather than in the data dir.
    cp "$data/segments" "$tmpdir/segments.utt_first"
    awk '{print $2, $1, $3, $4}' <"$tmpdir/segments.utt_first" >"$data/segments"
    filter_file "$tmpdir/recordings" "$data/segments"
    cp "$data/segments" "$tmpdir/segments.reco_first"
    awk '{print $2, $1, $3, $4}' <"$tmpdir/segments.reco_first" >"$data/segments"

    filter_file "$tmpdir/recordings" "$data/wav.scp"
    if [ -f "$data/reco2file_and_channel" ]; then
      filter_file "$tmpdir/recordings" "$data/reco2file_and_channel"
    fi
  fi
}

# filter_speakers
# Makes spk2utt, cmvn.scp and spk2gender agree on the set of speakers.
# Uses the globals $data and $tmpdir.
function filter_speakers {
  # throughout this program, we regard utt2spk as primary and spk2utt as derived, so...
  utils/utt2spk_to_spk2utt.pl "$data/utt2spk" > "$data/spk2utt"

  if [ -f "$data/cmvn.scp" ]; then
    # Keep only the speakers that are present in both spk2utt and cmvn.scp.
    awk '{print $1}' <"$data/spk2utt" >"$tmpdir/speakers"
    utils/filter_scp.pl "$data/cmvn.scp" "$tmpdir/speakers" > "$tmpdir/speakers.tmp"
    mv "$tmpdir/speakers.tmp" "$tmpdir/speakers"
    filter_file "$tmpdir/speakers" "$data/cmvn.scp"
    filter_file "$tmpdir/speakers" "$data/spk2utt"
    # spk2utt may have lost speakers, so regenerate utt2spk from it.
    utils/spk2utt_to_utt2spk.pl "$data/spk2utt" > "$data/utt2spk"
  fi
  if [ -f "$data/spk2gender" ]; then
    # We don't handle the case when the spk2gender does not cover all speakers.
    awk '{print $1}' <"$data/spk2utt" >"$tmpdir/speakers"
    filter_file "$tmpdir/speakers" "$data/spk2gender"
  fi
}

# filter_utts
# Restricts utt2spk, feats.scp, text, segments (and wav.scp when it is indexed
# by utterance) to the utterance-ids common to all of those that exist.
# Uses the globals $data and $tmpdir.
function filter_utts {
  local x nutts nfeats ntext
  local -a utt_files=(feats.scp text segments)

  awk '{print $1}' <"$data/utt2spk" > "$tmpdir/utts"

  # Do a check.
  if ! sort "$data/utt2spk" | cmp - "$data/utt2spk"; then
    echo "utt2spk is not in sorted order (fix this yourself)"
    exit 1;
  fi

  if ! sort -k2 "$data/utt2spk" | cmp - "$data/utt2spk"; then
    echo "utt2spk is not in sorted order when sorted first on speaker-id "
    echo "(fix this by making speaker-ids prefixes of utt-ids)"
    exit 1;
  fi

  if ! sort "$data/spk2utt" | cmp - "$data/spk2utt"; then
    echo "spk2utt is not in sorted order (fix this yourself)"
    exit 1;
  fi

  # wav indexed by utts only if segments does not exist.
  [ ! -f "$data/segments" ] && utt_files+=(wav.scp)

  # Intersect the utterance list with every per-utterance file that exists.
  for x in "${utt_files[@]}"; do
    if [ -f "$data/$x" ]; then
      utils/filter_scp.pl "$data/$x" "$tmpdir/utts" > "$tmpdir/utts.tmp"
      mv "$tmpdir/utts.tmp" "$tmpdir/utts"
    fi
  done
  if [ ! -s "$tmpdir/utts" ]; then
    echo "fix_data_dir.sh: no utterances remained: not proceeding further."
    rm "$tmpdir/utts"
    exit 1;
  fi

  nutts=$(wc -l <"$tmpdir/utts")
  if [ -f "$data/feats.scp" ]; then
    nfeats=$(wc -l <"$data/feats.scp")
  else
    nfeats=0
  fi
  # text is optional here just like feats.scp; do not error out on "cat".
  if [ -f "$data/text" ]; then
    ntext=$(wc -l <"$data/text")
  else
    ntext=0
  fi
  if [ "$nutts" -ne "$nfeats" ] || [ "$nutts" -ne "$ntext" ]; then
    echo "fix_data_dir.sh: kept $nutts utterances, vs. $nfeats features and $ntext transcriptions."
  else
    echo "fix_data_dir.sh: kept all $nutts utterances."
  fi

  # Filter via a scratch copy in $tmpdir.  The copies in $data/.backup were
  # taken from the untouched originals at start-up and must not be replaced
  # by these already sorted/filtered intermediate versions.
  for x in utt2spk "${utt_files[@]}"; do
    if [ -f "$data/$x" ]; then
      cp "$data/$x" "$tmpdir/$x.unfiltered"
      utils/filter_scp.pl "$tmpdir/utts" "$tmpdir/$x.unfiltered" > "$data/$x"
    fi
  done
}

filter_recordings
filter_speakers
filter_utts
filter_recordings

# utt2spk is primary; regenerate spk2utt so it reflects the final utterance set.
utils/utt2spk_to_spk2utt.pl "$data/utt2spk" > "$data/spk2utt"

echo "fix_data_dir.sh: old files are kept in $data/.backup"