Blame view
egs/wsj/s5/utils/data/subsegment_data_dir.sh
9.62 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 |
#!/bin/bash

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script allows you to specify a 'segments' file with segments
# relative to existing utterances, with lines like
#   utterance_foo-1 utterance_foo 7.5 8.2
#   utterance_foo-2 utterance_foo 8.9 10.1
# and a 'text' file with sub-segmented text like
#   utterance_foo-1 hello there
#   utterance_foo-2 how are you
# and combine this with an existing data-dir that was all relative
# to the original utterance-ids like 'utterance_foo', producing
# a new subsegmented output directory.
#
# It does the right thing for you on the various files that the
# data directory contained (except you have to recreate
# the CMVN stats).
#
# Exit status: 0 on success, non-zero if the arguments or inputs are invalid
# or if any step of building the output directory fails.

# Defaults; these may be overridden on the command line via parse_options.sh.
segment_end_padding=0.0
cmd=run.pl
nj=1

. utils/parse_options.sh

if [ $# != 4 ] && [ $# != 3 ]; then
  echo "Usage: "
  echo " $0 [options] <srcdir> <subsegments-file> [<text-file>] <destdir>"
  echo "This script sub-segments a data directory. <subsegments-file> is to"
  echo "have lines of the form <new-utt> <old-utt> <start-time-within-old-utt> <end-time-within-old-utt>"
  echo "and <text-file> is of the form <new-utt> <word1> <word2> ... <wordN>."
  echo "This script appropriately combines the <subsegments-file> with the original"
  echo "segments file, if necessary, and if not, creates a segments file."
  echo "e.g.:"
  echo " $0 data/train [options] exp/tri3b_resegment/segments exp/tri3b_resegment/text data/train_resegmented"
  echo " Options:"
  echo " --segment-end-padding <padding-time> # e.g. 0.02. Default 0.0. If provided,"
  echo " # we will add this value to the end times of <destdir>/segments"
  echo " # when creating it. This can be useful to account for"
  echo " # end effects in feature generation. The reason this is"
  echo " # not just applied to the input segments file, is that"
  echo " # for purposes of computing the num-frames of the parts of"
  echo " # matrices in feats.scp, the padding should not be done."
  echo " See also: resolve_ctm_overlaps.py"
  exit 1;
fi

export LC_ALL=C

srcdir=$1
subsegments=$2

# With 4 arguments the 3rd is the sub-segmented text file; with 3 it is absent.
add_subsegment_text=false
if [ $# -eq 4 ]; then
  new_text=$3
  dir=$4
  add_subsegment_text=true

  if [ ! -f "$new_text" ]; then
    echo "$0: no such file $new_text"
    exit 1
  fi
else
  dir=$3
fi

for f in "$subsegments" "$srcdir/utt2spk"; do
  if [ ! -f "$f" ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

if ! mkdir -p "$dir"; then
  echo "$0: failed to create directory $dir"
  # Nothing below can work without the output directory, so stop here.
  exit 1
fi

if $add_subsegment_text; then
  # The text file must list exactly the same utterance-ids, in the same order,
  # as the subsegments file.
  if ! cmp <(awk '{print $1}' <"$subsegments") <(awk '{print $1}' <"$new_text"); then
    echo "$0: expected the first fields of the files $subsegments and $new_text to be identical"
    exit 1
  fi
fi

# Validate the subsegments file: each line must have exactly 4 fields and
# an end time strictly greater than its start time.
if ! awk '{if (NF != 4 || !($4 > $3)) { print("Bad line: " $0); exit(1) } }' <"$subsegments"; then
  echo "$0: failed checking subsegments file $subsegments"
  exit 1
fi

set -e
set -o pipefail

# Create a mapping from the new to old utterances.  This file will be deleted later.
awk '{print $1, $2}' <"$subsegments" >"$dir/new2old_utt"

# Create the new utt2spk file [just map from the second field]
utils/apply_map.pl -f 2 "$srcdir/utt2spk" <"$dir/new2old_utt" >"$dir/utt2spk"
# .. and the new spk2utt file.
utils/utt2spk_to_spk2utt.pl <"$dir/utt2spk" >"$dir/spk2utt"

if $add_subsegment_text; then
  # the new text file is just what the user provides.
  cp "$new_text" "$dir/text"
fi

# copy the source wav.scp
cp "$srcdir/wav.scp" "$dir"
if [ -f "$srcdir/reco2file_and_channel" ]; then
  cp "$srcdir/reco2file_and_channel" "$dir"
fi

# copy the source reco2dur
if [ -f "$srcdir/reco2dur" ]; then
  cp "$srcdir/reco2dur" "$dir"
fi

if [ -f "$srcdir/segments" ]; then
  # we have to map the segments file.
  # What's going on below is a little subtle.
  # $srcdir/segments has lines like: <old-utt-id> <recording-id> <start-time> <end-time>
  # and $subsegments has lines like: <new-utt-id> <old-utt-id> <start-time> <end-time>
  # The apply-map command replaces <old-utt-id> [the 2nd field of $subsegments]
  # with <recording-id> <start-time> <end-time>.
  # so after that first command we have lines like
  #  <new-utt-id> <recording-id> <start-time-of-old-utt-within-recording> <end-time-old-utt-within-recording> \
  #    <start-time-of-new-utt-within-old-utt> <end-time-of-new-utt-within-old-utt>
  # which the awk command turns into:
  #  <new-utt-id> <recording-id> <start-time-of-new-utt-within-recording> <end-time-of-new-utt-within-recording>
  utils/apply_map.pl -f 2 "$srcdir/segments" <"$subsegments" | \
    awk -v pad="$segment_end_padding" '{ print $1, $2, $5+$3, $6+$3+pad; }' >"$dir/segments"
else
  # the subsegments file just becomes the segments file.
  awk -v pad="$segment_end_padding" '{$4 += pad; print}' <"$subsegments" >"$dir/segments"
fi

if [ -f "$srcdir/utt2uniq" ]; then
  utils/apply_map.pl -f 2 "$srcdir/utt2uniq" <"$dir/new2old_utt" >"$dir/utt2uniq"
fi

if [ -f "$srcdir/feats.scp" ]; then
  # We want to avoid recomputing the features.  We'll use sub-matrices of the
  # original feature matrices, using the [] notation that is available for
  # matrices in Kaldi.
  if [ ! -s "$srcdir/frame_shift" ]; then
    frame_shift=$(utils/data/get_frame_shift.sh "$srcdir") || exit 1
  else
    frame_shift=$(cat "$srcdir/frame_shift")
  fi
  echo "$0: note: frame shift is $frame_shift [affects feats.scp]"

  # The subsegments format is <new-utt-id> <old-utt-id> <start-time> <end-time>.
  # e.g. 'utt_foo-1 utt_foo 7.21 8.93'
  # The first awk command replaces this with the format:
  # <new-utt-id> <old-utt-id> <first-frame> <last-frame>
  # e.g. 'utt_foo-1 utt_foo 721 892'
  # and the apply_map.pl command replaces 'utt_foo' (the 2nd field) with its corresponding entry
  # from the original feats.scp, so we get a line like:
  # e.g. 'utt_foo-1 foo-bar.ark:514231 721 892'
  # Note: the reason we subtract one from the last time is that it's going to
  # represent the 'last' frame, not the 'end' frame [i.e. not one past the last],
  # in the matlab-like, but zero-indexed [first:last] notion.  For instance, a segment with 1 frame
  # would have start-time 0.00 and end-time 0.01, which would become the frame range
  # [0:0]
  # The second awk command turns this into something like
  # utt_foo-1 foo-bar.ark:514231[721:892]
  # It has to be a bit careful because the format actually allows for more general things
  # like pipes that might contain spaces, so it has to be able to produce output like the
  # following:
  # utt_foo-1 some command|[721:892]
  # The 'end' frame is ensured to not exceed the feature archive size of
  # <old-utt-id>. This is done using the script fix_subsegment_feats.pl.
  # e.g if the number of frames in foo-bar.ark is 891, then the features are
  # truncated to that many frames.
  # utt_foo-1 foo-bar.ark:514231[721:890]
  # Lastly, utils/data/normalize_data_range.pl will only do something nontrivial if
  # the original data-dir already had data-ranges in square brackets.

  # Here, we compute the maximum 'end' frame allowed for each <new-utt-id>.
  # This is equal to the number of frames in the feature archive for <old-utt-id>.
  if [ ! -f "$srcdir/utt2num_frames" ]; then
    echo "$0: WARNING: Could not find $srcdir/utt2num_frames. It might take a long time to run get_utt2num_frames.sh."
    echo "Increase the number of jobs or write this file while extracting features by passing --write-utt2num-frames true to steps/make_mfcc.sh etc."
  fi
  utils/data/get_utt2num_frames.sh --cmd "$cmd" --nj "$nj" "$srcdir"
  awk '{print $1" "$2}' "$subsegments" | \
    utils/apply_map.pl -f 2 "$srcdir/utt2num_frames" > \
    "$dir/utt2max_frames"

  # Each output record must end in a newline: one scp entry per line.
  # On failure exit non-zero explicitly; a bare 'exit' after 'echo' would
  # return echo's status (0) and report success to the caller.
  awk -v s="$frame_shift" '{print $1, $2, int(($3/s)+0.5), int(($4/s)-0.5);}' <"$subsegments" | \
    utils/apply_map.pl -f 2 "$srcdir/feats.scp" | \
    awk '{for (n=1;n<NF-2;n++) printf("%s ", $n); k=NF-2; l=NF-1; printf("%s[%d:%d]\n", $k, $l, $NF)}' | \
    utils/data/fix_subsegment_feats.pl "$dir/utt2max_frames" | \
    utils/data/normalize_data_range.pl >"$dir/feats.scp" || \
    { echo "Failed to create $dir/feats.scp"; exit 1; }

  # Parse the frame ranges from feats.scp, which is in the form of [first-frame:last-frame]
  # and write the number-of-frames = last-frame - first-frame + 1 for the utterance.
  perl -ne 'm/^(\S+) .+\[(\d+):(\d+)\]$/; print "$1 " . ($3-$2+1) . "\n"' <"$dir/feats.scp" > \
    "$dir/utt2num_frames"

  # Here we add frame ranges to the elements of vad.scp, as we did for rows of feats.scp above.
  if [ -f "$srcdir/vad.scp" ]; then
    awk -v s="$frame_shift" '{print $1, $2, int(($3/s)+0.5), int(($4/s)-0.5);}' <"$subsegments" | \
      utils/apply_map.pl -f 2 "$srcdir/vad.scp" | \
      awk '{for (n=1;n<NF-2;n++) printf("%s ", $n); k=NF-2; l=NF-1; printf("%s[%d:%d]\n", $k, $l, $NF)}' | \
      utils/data/fix_subsegment_feats.pl "$dir/utt2max_frames" | \
      utils/data/normalize_data_range.pl >"$dir/vad.scp"
  fi
fi

if [ -f "$dir/cmvn.scp" ]; then
  rm "$dir/cmvn.scp"
  echo "$0: warning: removing $dir/cmvn.scp, you will have to regenerate it from the features."
fi

# remove the utt2dur file in case it's now invalid-- it will be regenerated from the segments file.
# (Best-effort: the file usually does not exist, so errors are deliberately ignored.)
rm "$dir/utt2dur" 2>/dev/null || true

if [ -f "$srcdir/spk2gender" ]; then
  cp "$srcdir/spk2gender" "$dir"
fi

if [ -f "$srcdir/glm" ]; then
  cp "$srcdir/glm" "$dir"
fi

if [ -f "$srcdir/stm" ]; then
  cp "$srcdir/stm" "$dir"
fi

for f in ctm; do
  if [ -f "$srcdir/$f" ]; then
    echo "$0: not copying $srcdir/$f to $dir because sub-segmenting it is "
    echo " ... not implemented yet (and probably it's not needed.)"
  fi
done

# The new->old utterance map was only a temporary helper.
rm "$dir/new2old_utt"

echo "$0: subsegmented data from $srcdir to $dir"