Blame view
Scripts/utils/subset_data_dir.sh
5.66 KB
ec85f8892 first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
#!/bin/bash
# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# This script operates on a data directory, such as in data/train/.
# See http://kaldi.sourceforge.net/data_prep.html#data_prep_data
# for what these directories contain.

# It creates a subset of that data, consisting of some specified
# number of utterances.  (The selected utterances are distributed evenly
# throughout the file, by the program ./subset_scp.pl).

# At most one of the following options may be given, and it must be the
# first argument; the options are mutually exclusive.
#   --per-spk   attempt to select the supplied number of utterances for each
#               speaker (typically you would supply a much smaller number in
#               this case).
#   --speakers  take speakers in random order until the selected speakers
#               together have at least <num-utt> utterances.
#   --shortest  select the n shortest utterances (needs feats.scp).
#   --first     select the first n utterances.
#   --last      select the last n utterances.
#   --spk-list  select all utterances of the speakers listed in the given
#               file; in this form the arguments are
#               <speaker-list-file> <srcdir> <destdir>.
#
# This script calls utils/*.pl by relative path, and --shortest sources
# ./path.sh, so the current directory must contain utils/ (and path.sh).

shortest=false
perspk=false
first_opt=""
speakers=false
spk_list_specified=false

# Only the first argument is inspected for an option.
case "$1" in
  --per-spk)  perspk=true; shift ;;
  --shortest) shortest=true; shift ;;
  --first)    first_opt="--first"; shift ;;
  --speakers) speakers=true; shift ;;
  --last)     first_opt="--last"; shift ;;
  --spk-list) spk_list_specified=true; shift ;;
esac

if [ $# -ne 3 ]; then
  echo "Usage: "
  echo "  subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>"
  echo "  subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>"
  echo "By default, randomly selects <num-utt> utterances from the data directory."
  echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances"
  echo "With --first, selects the first <num-utt> utterances"
  echo "With --last, selects the last <num-utt> utterances"
  echo "With --shortest, selects the shortest <num-utt> utterances."
  exit 1;
fi

if $spk_list_specified; then
  spk_list=$1
  srcdir=$2
  destdir=$3
else
  srcdir=$1
  numutt=$2
  destdir=$3
  # <num-utt> is used in integer comparisons and as an awk variable; a
  # non-numeric value would only produce obscure errors later, and 0 would
  # make the --per-spk awk loop spin forever.
  if ! [[ "$numutt" =~ ^[0-9]+$ ]] || [ "$numutt" -le 0 ]; then
    echo "subset_data_dir.sh: <num-utt> must be a positive integer, got '$numutt'"
    exit 1;
  fi
fi

export LC_ALL=C

if [ ! -f "$srcdir/utt2spk" ]; then
  echo "subset_data_dir.sh: no such file $srcdir/utt2spk"
  exit 1;
fi

# The speaker-based modes read spk2utt from the source directory.
if $spk_list_specified || $speakers || $perspk; then
  if [ ! -f "$srcdir/spk2utt" ]; then
    echo "subset_data_dir.sh: no such file $srcdir/spk2utt"
    exit 1;
  fi
fi

# filter_if_present <id-list> <name>
# If $srcdir/<name> exists, writes to $destdir/<name> the output of
# utils/filter_scp.pl run with <id-list> on that file.  Exits on failure.
function filter_if_present {
  local id_list=$1
  local name=$2
  if [ -f "$srcdir/$name" ]; then
    utils/filter_scp.pl "$id_list" <"$srcdir/$name" >"$destdir/$name" || exit 1;
  fi
}

# do_filtering
# Assumes $destdir/utt2spk and $destdir/spk2utt already exist.  Filters the
# remaining files of the data directory (whichever of them exist in $srcdir)
# down to the selected utterances / speakers, then reports the utterance
# counts.  Reads the globals srcdir and destdir.
function do_filtering {
  local srcutts destutts

  # Files filtered with the utterance list.
  filter_if_present "$destdir/utt2spk" feats.scp
  filter_if_present "$destdir/utt2spk" text
  # Files filtered with the speaker list.
  filter_if_present "$destdir/spk2utt" spk2gender
  filter_if_present "$destdir/spk2utt" cmvn.scp

  if [ -f "$srcdir/segments" ]; then
    utils/filter_scp.pl "$destdir/utt2spk" <"$srcdir/segments" >"$destdir/segments" || exit 1;
    # Second field of the segments file: the recordings that are still
    # referenced.  Kept in a temporary file that is removed below.
    awk '{print $2;}' "$destdir/segments" | sort -u >"$destdir/reco"
    # When there is a segments file, wav.scp is filtered with the recording
    # list rather than with the utterance list.
    filter_if_present "$destdir/reco" wav.scp
    filter_if_present "$destdir/reco" reco2file_and_channel
    # Filter the STM file for proper sclite scoring (this will also remove
    # the comments lines).
    filter_if_present "$destdir/reco" stm
    rm "$destdir/reco"
  else
    filter_if_present "$destdir/utt2spk" wav.scp
  fi

  srcutts=$(wc -l <"$srcdir/utt2spk")
  destutts=$(wc -l <"$destdir/utt2spk")
  echo "$0: reducing #utt from $srcutts to $destutts"
}

if $spk_list_specified; then
  mkdir -p "$destdir" || exit 1;
  utils/filter_scp.pl "$spk_list" "$srcdir/spk2utt" >"$destdir/spk2utt" || exit 1;
  utils/spk2utt_to_utt2spk.pl <"$destdir/spk2utt" >"$destdir/utt2spk" || exit 1;
  do_filtering;  # bash function.
  exit 0;
elif $speakers; then
  mkdir -p "$destdir" || exit 1;
  # Shuffle the speakers, then keep taking speakers until the running total
  # of their utterances (NF-1 per spk2utt line) reaches numutt.
  utils/shuffle_list.pl <"$srcdir/spk2utt" \
    | awk -v numutt="$numutt" '{ if (tot < numutt) { print; } tot += (NF-1); }' \
    | sort >"$destdir/spk2utt"
  utils/spk2utt_to_utt2spk.pl <"$destdir/spk2utt" >"$destdir/utt2spk" || exit 1;
  do_filtering;  # bash function.
  exit 0;
elif $perspk; then
  mkdir -p "$destdir" || exit 1;
  # For each speaker, choose the largest stride "skip" such that n*skip does
  # not exceed that speaker's utterance count, then print every skip'th
  # utterance, bounded by field index n*skip.
  awk -v n="$numutt" '{
      printf("%s ", $1);
      skip = 1;
      while (n * (skip + 1) <= NF - 1) { skip++; }
      for (x = 2; x <= NF && x <= n * skip; x += skip) { printf("%s ", $x); }
      printf("\n");
    }' <"$srcdir/spk2utt" >"$destdir/spk2utt"
  utils/spk2utt_to_utt2spk.pl <"$destdir/spk2utt" >"$destdir/utt2spk" || exit 1;
  do_filtering;  # bash function.
  exit 0;
else
  # Count utterances from utt2spk (known to exist at this point) rather than
  # from feats.scp, which is optional.
  if [ "$numutt" -gt "$(wc -l <"$srcdir/utt2spk")" ]; then
    echo "subset_data_dir.sh: cannot subset to more utterances than you originally had."
    exit 1;
  fi
  mkdir -p "$destdir" || exit 1;

  ## scripting note: $shortest evaluates to true or false
  ## so this becomes the command true or false.
  if $shortest; then
    # select the n shortest utterances.
    . ./path.sh
    if [ ! -f "$srcdir/feats.scp" ]; then
      echo "$0: you selected --shortest but no feats.scp exist."
      exit 1;
    fi
    feat-to-len "scp:$srcdir/feats.scp" "ark,t:$destdir/tmp.len" || exit 1;
    # Sort by length (2nd field) and keep the ids of the first numutt lines.
    sort -n -k2 "$destdir/tmp.len" | awk '{print $1}' | head -n "$numutt" >"$destdir/tmp.uttlist"
    utils/filter_scp.pl "$destdir/tmp.uttlist" "$srcdir/utt2spk" >"$destdir/utt2spk" || exit 1;
    rm "$destdir/tmp.uttlist" "$destdir/tmp.len"
  else
    # $first_opt is deliberately unquoted: when empty it must expand to no
    # argument at all.
    utils/subset_scp.pl $first_opt "$numutt" "$srcdir/utt2spk" >"$destdir/utt2spk" || exit 1;
  fi
  utils/utt2spk_to_spk2utt.pl <"$destdir/utt2spk" >"$destdir/spk2utt" || exit 1;
  do_filtering;
  exit 0;
fi