Blame view
egs/wsj/s5/utils/data/combine_short_segments.sh
6 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
#!/bin/bash

# Copyright 2013 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script copies and modifies a data directory while combining
# segments whose duration is lower than a specified minimum segment
# length.
#
# Note: this does not work for the wav.scp, since there is no natural way to
# concatenate segments; you have to operate on directories that already have
# features extracted.

# begin configuration section
cleanup=true
# end configuration section

. utils/parse_options.sh

if [ $# != 3 ]; then
  echo "Usage: "
  echo " $0 [options] <srcdir> <min-segment-length-in-seconds> <dir>"
  echo "e.g.:"
  echo " $0 data/train 1.55 data/train_comb"
  # options documentation here.
  exit 1;
fi

export LC_ALL=C

srcdir=$1
min_seg_len=$2
dir=$3

if [ "$dir" == "$srcdir" ]; then
  echo "$0: this script requires <srcdir> and <dir> to be different."
  exit 1
fi

for f in $srcdir/utt2spk $srcdir/feats.scp; do
  [ ! -s $f ] && echo "$0: expected file $f to exist and be nonempty" && exit 1
done

# feats.scp entries with more than 2 fields (e.g. piped extraction commands)
# cannot be wrapped with concat-feats further below, so refuse to continue.
if ! awk '{if (NF != 2) exit(1);}' <$srcdir/feats.scp; then
  echo "$0: could not combine short segments because $srcdir/feats.scp has "
  echo " entries with too many fields"
  # Bug fix: previously this only warned and fell through, which would later
  # produce a corrupt feats.scp; make it a hard error as the message implies.
  exit 1
fi

if ! mkdir -p $dir; then
  echo "$0: could not create directory $dir"
  exit 1;
fi

if ! utils/validate_data_dir.sh $srcdir; then
  echo "$0: failed to validate input directory $srcdir. If needed, run utils/fix_data_dir.sh $srcdir"
  exit 1
fi

# Sanity-check that <min-segment-length-in-seconds> is a number in a
# reasonable range before doing any work.
if ! python -c "x=float('$min_seg_len'); assert(x>0.0 and x<100.0);" 2>/dev/null; then
  echo "$0: bad <min-segment-length-in-seconds>: got '$min_seg_len'"
  exit 1
fi

set -e
set -o pipefail

# make sure $srcdir/utt2dur exists.
utils/data/get_utt2dur.sh $srcdir

# Decide which old utterances get combined into which new utterances; this
# writes utt2utts (new-utt -> list of old utts), utt2spk and utt2dur in $dir.
utils/data/internal/choose_utts_to_combine.py --min-duration=$min_seg_len \
  $srcdir/spk2utt $srcdir/utt2dur $dir/utt2utts $dir/utt2spk $dir/utt2dur

utils/utt2spk_to_spk2utt.pl < $dir/utt2spk > $dir/spk2utt

# create the feats.scp.
# if a line of utt2utts is like 'utt2-comb2 utt2 utt3', then
# the utils/apply_map.pl will create a line that looks like
# 'utt2-comb2 foo.ark:4315 foo.ark:431423'
# and the awk command creates suitable command lines like:
# 'utt2-comb2 concat-feats foo.ark:4315 foo.ark:431423 - |'
utils/apply_map.pl -f 2- $srcdir/feats.scp <$dir/utt2utts | \
  awk '{if (NF<=2){print;} else { $1 = $1 " concat-feats --print-args=false"; $NF = $NF " - |"; print; }}' > $dir/feats.scp

# create $dir/text by concatenating the source 'text' entries for the original
# utts.
utils/apply_map.pl -f 2- $srcdir/text <$dir/utt2utts > $dir/text

if [ -f $srcdir/utt2uniq ]; then
  # the utt2uniq file is such that if 2 utts were derived from the same original
  # utt (e.g. by speed perturbing) they map to the same 'uniq' value. This is
  # so that we can properly hold out validation data for neural net training and
  # know that we're not training on perturbed versions of that utterance. We
  # need to obtain the utt2uniq file so that if any 2 'new' utts contain any of
  # the same 'old' utts, their 'uniq' values are the same [but otherwise as far
  # as possible, the 'uniq' values are different.]
  #
  # we'll do this by arranging the old 'uniq' values into groups as necessary to
  # capture this property.

  # The following command creates 'uniq_sets', each line of which contains
  # a set of original 'uniq' values, and effectively we assert that they must
  # be grouped together to the same 'uniq' value.
  # the first awk command prints a group of the original utterance-ids that
  # are combined together into a single new utterance, and the apply_map
  # command converts those into a list of original 'uniq' values.
  awk '{$1 = ""; print;}' < $dir/utt2utts | \
    utils/apply_map.pl $srcdir/utt2uniq > $dir/uniq_sets

  # The next command creates $dir/uniq2merged_uniq, which is a map from the
  # original 'uniq' values to the 'merged' uniq values.
  # for example, if $dir/uniq_sets were to contain
  # a b
  # b c
  # d
  # then we'd obtain a uniq2merged_uniq file that looks like:
  # a a
  # b a
  # c a
  # d d
  # ... because a and b appear together, and b and c appear together,
  # they have to be merged into the same set, and we name that set 'a'
  # (in general, we take the lowest string in lexicographical order).
  cat $dir/uniq_sets | LC_ALL=C python -c '
import sys

uniq2orig_uniq = dict()
equal_pairs = set()  # set of 2-tuples (a,b) which should have equal orig_uniq

while True:
    line = sys.stdin.readline()
    if line == "":
        break
    split_line = line.split()  # list of uniq strings that should map in same set
    # initialize uniq2orig_uniq to the identity mapping
    for uniq in split_line:
        uniq2orig_uniq[uniq] = uniq
    for a in split_line[1:]:
        equal_pairs.add((split_line[0], a))

# Iterate to a fixed point: propagate the lexicographically smallest
# representative through every constrained pair.
changed = True
while changed:
    changed = False
    for a, b in equal_pairs:
        min_orig_uniq = min(uniq2orig_uniq[a], uniq2orig_uniq[b])
        for x in [a, b]:
            if uniq2orig_uniq[x] != min_orig_uniq:
                uniq2orig_uniq[x] = min_orig_uniq
                changed = True

for uniq in sorted(uniq2orig_uniq.keys()):
    # Bug fix: use print as a function so this works under both python2 and
    # python3; the original used a python2-only print statement, which is a
    # syntax error when "python" is python3.
    print("{0} {1}".format(uniq, uniq2orig_uniq[uniq]))
' > $dir/uniq_to_orig_uniq
  rm $dir/uniq_sets

  # In the following command, suppose we have a line like:
  # utt1-comb2 utt1 utt2
  # .. the first awk command retains only the first original utt, to give
  # utt1-comb2 utt1
  # [we can pick one arbitrarily since we know any of them would map to the same
  # orig_uniq value.]
  # the first apply_map.pl command maps the 'utt1' to the 'uniq' value it mapped to
  # in $srcdir, and the second apply_map.pl command maps it to the grouped 'uniq'
  # value obtained by the inline python script above.
  awk '{print $1, $2}' < $dir/utt2utts | utils/apply_map.pl -f 2 $srcdir/utt2uniq | \
    utils/apply_map.pl -f 2 $dir/uniq_to_orig_uniq > $dir/utt2uniq
  rm $dir/uniq_to_orig_uniq
fi

# note: the user will have to recompute the cmvn, as the speakers may have changed.
# Any pre-existing CMVN stats are stale now that the speaker-to-utterance
# mapping may have changed; remove them so the user recomputes them.
if [ -f $dir/cmvn.scp ]; then
  rm $dir/cmvn.scp || true
fi

# Check the output directory is well-formed (no wav.scp is expected here,
# since combined segments have no natural waveform representation).
utils/validate_data_dir.sh --no-wav $dir

# Unless --cleanup false was given, remove the intermediate mapping from
# new utterances to the original utterances they were built from.
if $cleanup; then
  rm $dir/utt2utts
fi