Blame view
egs/wsj/s5/utils/subset_data_dir.sh
7.12 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
#!/bin/bash # Copyright 2010-2011 Microsoft Corporation # 2012-2013 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 # This script operates on a data directory, such as in data/train/. # See http://kaldi-asr.org/doc/data_prep.html#data_prep_data # for what these directories contain. # This script creates a subset of that data, consisting of some specified # number of utterances. (The selected utterances are distributed evenly # throughout the file, by the program ./subset_scp.pl). # There are six options, none compatible with any other. # If you give the --per-spk option, it will attempt to select the supplied # number of utterances for each speaker (typically you would supply a much # smaller number in this case). # If you give the --speakers option, it selects a subset of n randomly # selected speakers. # If you give the --shortest option, it will give you the n shortest utterances. # If you give the --first option, it will just give you the n first utterances. # If you give the --last option, it will just give you the n last utterances. # If you give the --spk-list or --utt-list option, it reads the # speakers/utterances to keep from <speaker-list-file>/<utt-list-file>" (note, # in this case there is no <num-utt> positional parameter; see usage message.) shortest=false perspk=false speakers=false first_opt= spk_list= utt_list= expect_args=3 case $1 in --first|--last) first_opt=$1; shift ;; --per-spk) perspk=true; shift ;; --shortest) shortest=true; shift ;; --speakers) speakers=true; shift ;; --spk-list) shift; spk_list=$1; shift; expect_args=2 ;; --utt-list) shift; utt_list=$1; shift; expect_args=2 ;; --*) echo "$0: invalid option '$1'"; exit 1 esac if [ $# != $expect_args ]; then echo "Usage:" echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>" echo " subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>" echo " subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>" echo "By default, randomly selects <num-utt> utterances from the data directory." echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances" echo "With --per-spk, selects <num-utt> utterances per speaker, if available." echo "With --first, selects the first <num-utt> utterances" echo "With --last, selects the last <num-utt> utterances" echo "With --shortest, selects the shortest <num-utt> utterances." echo "With --spk-list, reads the speakers to keep from <speaker-list-file>" echo "With --utt-list, reads the utterances to keep from <utt-list-file>" exit 1; fi srcdir=$1 if [[ $spk_list || $utt_list ]]; then numutt= destdir=$2 else numutt=$2 destdir=$3 fi export LC_ALL=C if [ ! -f $srcdir/utt2spk ]; then echo "$0: no such file $srcdir/utt2spk" exit 1 fi if [[ $numutt && $numutt -gt $(wc -l <$srcdir/utt2spk) ]]; then echo "$0: cannot subset to more utterances than you originally had." exit 1 fi if $shortest && [ ! -f $srcdir/feats.scp ]; then echo "$0: you selected --shortest but no feats.scp exist." exit 1 fi mkdir -p $destdir || exit 1 if [[ $spk_list ]]; then utils/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; elif [[ $utt_list ]]; then utils/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; elif $speakers; then utils/shuffle_list.pl < $srcdir/spk2utt | awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | sort > $destdir/spk2utt utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk elif $perspk; then awk '{ n='$numutt'; printf("%s ",$1); skip=1; while(n*(skip+1) <= NF-1) { skip++; } for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); } printf(" "); }' <$srcdir/spk2utt >$destdir/spk2utt utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk else if $shortest; then # Select $numutt shortest utterances. . ./path.sh feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1; sort -n -k2 $destdir/tmp.len | awk '{print $1}' | head -$numutt >$destdir/tmp.uttlist utils/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk rm $destdir/tmp.uttlist $destdir/tmp.len else # Select $numutt random utterances. utils/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1; fi utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt fi # Perform filtering. utt2spk and spk2utt files already exist by this point. # Filter by utterance. [ -f $srcdir/feats.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp [ -f $srcdir/vad.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp [ -f $srcdir/utt2lang ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang [ -f $srcdir/utt2dur ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur [ -f $srcdir/utt2num_frames ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames [ -f $srcdir/utt2uniq ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp [ -f $srcdir/utt2warp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp [ -f $srcdir/text ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text # Filter by speaker. [ -f $srcdir/spk2warp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp [ -f $srcdir/spk2gender ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender [ -f $srcdir/cmvn.scp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp # Filter by recording-id. if [ -f $srcdir/segments ]; then utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments # Recording-ids are in segments. awk '{print $2}' $destdir/segments | sort | uniq >$destdir/reco # The next line overrides the command above for wav.scp, which would be incorrect. [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp else # No segments; recording-ids are in wav.scp. awk '{print $1}' $destdir/wav.scp | sort | uniq >$destdir/reco fi [ -f $srcdir/reco2file_and_channel ] && utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel [ -f $srcdir/reco2dur ] && utils/filter_scp.pl $destdir/reco <$srcdir/reco2dur >$destdir/reco2dur # Filter the STM file for proper sclite scoring. # Copy over the comments from STM file. [ -f $srcdir/stm ] && (grep "^;;" $srcdir/stm utils/filter_scp.pl $destdir/reco $srcdir/stm) >$destdir/stm rm $destdir/reco # Copy frame_shift if present. [ -f $srcdir/frame_shift ] && cp $srcdir/frame_shift $destdir srcutts=$(wc -l <$srcdir/utt2spk) destutts=$(wc -l <$destdir/utt2spk) echo "$0: reducing #utt from $srcutts to $destutts" exit 0 |