Blame view

egs/wsj/s5/utils/data/subsegment_data_dir.sh 9.62 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
  #!/bin/bash
  
  # Copyright 2013  Johns Hopkins University (author: Daniel Povey)
  # Apache 2.0
  
  
  # This script allows you to specify a 'segments' file with segments
  # relative to existing utterances, with lines like
  #  utterance_foo-1 utterance_foo 7.5 8.2
  #  utterance_foo-2 utterance_foo 8.9 10.1
  # and a 'text' file with sub-segmented text like
  #  utterance_foo-1 hello there
  #  utterance_foo-2 how are you
  # and combine this with an existing data-dir that was all relative
  # to the original utterance-ids like 'utterance_foo', producing
  # a new subsegmented output directory.
  #
  # It does the right thing for you on the various files that the
  # data directory contained (except you have to recreate
  # the CMVN stats).
  
  
  # Option defaults.  utils/parse_options.sh (sourced below) is expected to
  # overwrite these from matching command-line flags -- NOTE(review): confirm
  # the exact flag-to-variable mapping against parse_options.sh.
  segment_end_padding=0.0
  cmd=run.pl
  nj=1

  . utils/parse_options.sh

  # Exactly three or four positional arguments are accepted; anything else
  # prints the usage text and fails.
  case $# in
    3|4) ;;
    *)
      # The example line lists [options] before the positional arguments so
      # that it agrees with the synopsis printed above it.
      printf '%s\n' \
        "Usage: " \
        "  $0 [options] <srcdir> <subsegments-file> [<text-file>] <destdir>" \
        "This script sub-segments a data directory.  <subsegments-file> is to" \
        "have lines of the form <new-utt> <old-utt> <start-time-within-old-utt> <end-time-within-old-utt>" \
        "and <text-file> is of the form <new-utt> <word1> <word2> ... <wordN>." \
        "This script appropriately combines the <subsegments-file> with the original" \
        "segments file, if necessary, and if not, creates a segments file." \
        "e.g.:" \
        " $0 [options] data/train exp/tri3b_resegment/segments exp/tri3b_resegment/text data/train_resegmented" \
        " Options:" \
        "  --segment-end-padding <padding-time>       # e.g. 0.02.  Default 0.0.  If provided," \
        "                                             # we will add this value to the end times of <destdir>/segments" \
        "                                             # when creating it.  This can be useful to account for" \
        "                                             # end effects in feature generation.  The reason this is" \
        "                                             # not just applied to the input segments file, is that" \
        "                                             # for purposes of computing the num-frames of the parts of" \
        "                                             # matrices in feats.scp, the padding should not be done." \
        "  See also: resolve_ctm_overlaps.py"
      exit 1
      ;;
  esac
  
  
  # Force the C locale for everything this script runs.
  export LC_ALL=C

  # The first two positional arguments are always the source data directory
  # and the subsegments file.
  srcdir=$1
  subsegments=$2

  # With four arguments the third is the new text file and the fourth is the
  # destination directory; otherwise the third is the destination directory
  # and no text file is written.
  case $# in
    4)
      add_subsegment_text=true
      new_text=$3
      dir=$4
      if [ ! -f "$new_text" ]; then
        echo "$0: no such file $new_text"
        exit 1
      fi
      ;;
    *)
      add_subsegment_text=false
      dir=$3
      ;;
  esac
  
  # The subsegments file and the source utt2spk are mandatory inputs; stop at
  # the first one that is missing.
  for f in "$subsegments" "$srcdir/utt2spk"; do
    [ -f "$f" ] && continue
    echo "$0: no such file $f"
    exit 1;
  done
  
  # Create the output directory.  Every later step writes into it, so stop
  # here (rather than just reporting) if it cannot be created.
  if ! mkdir -p "$dir"; then
    echo "$0: failed to create directory $dir"
    exit 1
  fi
  
  if $add_subsegment_text; then
    # The first column (new utterance-id) of the text file must be identical,
    # line by line, to the first column of the subsegments file.
    # The redirect targets are quoted so that paths containing whitespace work,
    # consistent with the quoted existence checks earlier in the script.
    if ! cmp <(awk '{print $1}' <"$subsegments") <(awk '{print $1}' <"$new_text"); then
      echo "$0: expected the first fields of the files $subsegments and $new_text to be identical"
      exit 1
    fi
  fi
  
  # Validate the subsegments file: every line must have exactly four fields
  # and its end time (field 4) must be greater than its start time (field 3).
  # awk prints the first offending line and exits non-zero.
  if ! awk '{if (NF != 4 || !($4 > $3)) { print("Bad line: " $0); exit(1) } }' <"$subsegments"; then
    echo "$0: failed checking subsegments file $subsegments"
    exit 1
  fi
  
  # From here on, abort as soon as any command or any stage of a pipeline fails.
  set -e
  set -o pipefail

  # Create a mapping from the new to old utterances.  This file will be deleted later.
  awk '{print $1, $2}' <"$subsegments" >"$dir/new2old_utt"

  # Create the new utt2spk file: map the second field (the old utterance-id)
  # through the source utt2spk.
  utils/apply_map.pl -f 2 "$srcdir/utt2spk" <"$dir/new2old_utt" >"$dir/utt2spk"
  # .. and the new spk2utt file.
  utils/utt2spk_to_spk2utt.pl <"$dir/utt2spk" >"$dir/spk2utt"

  if $add_subsegment_text; then
    # the new text file is just what the user provides.
    cp "$new_text" "$dir/text"
  fi
  
  # Copy the source wav.scp unchanged.  It is not optional: with 'set -e'
  # active, a missing wav.scp aborts the script here.
  cp "$srcdir/wav.scp" "$dir"

  # Copy reco2file_and_channel unchanged, when the source directory has it.
  if [ -f "$srcdir/reco2file_and_channel" ]; then
    cp "$srcdir/reco2file_and_channel" "$dir"
  fi

  # Copy the source reco2dur unchanged, when the source directory has it.
  if [ -f "$srcdir/reco2dur" ]; then
    cp "$srcdir/reco2dur" "$dir"
  fi
  
  # Note on OFMT/CONVFMT below: awk converts non-integer numbers to text with
  # "%.6g" by default, which keeps only six significant digits (1234.567 would
  # be written as 1234.57).  "%.10g" keeps short values looking the same
  # (7.52 stays 7.52) while preserving precision for times far into a
  # long recording.
  if [ -f "$srcdir/segments" ]; then
    # we have to map the segments file.
    # What's going on below is a little subtle.
    # $srcdir/segments has lines like: <old-utt-id> <recording-id> <start-time> <end-time>
    # and $subsegments has lines like: <new-utt-id> <old-utt-id> <start-time> <end-time>
    # The apply-map command replaces <old-utt-id> [the 2nd field of $subsegments]
    # with <recording-id> <start-time> <end-time>.
    # so after that first command we have lines like
    # <new-utt-id> <recording-id> <start-time-of-old-utt-within-recording> <end-time-old-utt-within-recording> \
    #   <start-time-of-new-utt-within-old-utt> <end-time-of-new-utt-within-old-utt>
    # which the awk command turns into:
    # <new-utt-id> <recording-id> <start-time-of-new-utt-within-recording> <end-time-of-new-utt-within-recording>
    # (the end time additionally has the --segment-end-padding value added).
    utils/apply_map.pl -f 2 "$srcdir/segments" <"$subsegments" | \
      awk -v pad="$segment_end_padding" -v OFMT='%.10g' \
        '{ print $1, $2, $5+$3, $6+$3+pad; }' >"$dir/segments"
  else
    # the subsegments file just becomes the segments file (with the padding
    # added to the end time).
    awk -v pad="$segment_end_padding" -v CONVFMT='%.10g' -v OFMT='%.10g' \
      '{$4 += pad; print}' <"$subsegments" >"$dir/segments"
  fi
  
  # If the source has utt2uniq, give each new utterance the entry of the old
  # utterance it came from (second field of new2old_utt mapped through it).
  if [ -f "$srcdir/utt2uniq" ]; then
    utils/apply_map.pl -f 2 "$srcdir/utt2uniq" <"$dir/new2old_utt" >"$dir/utt2uniq"
  fi
  
  if [ -f "$srcdir/feats.scp" ]; then
    # We want to avoid recomputing the features.   We'll use sub-matrices of the
    # original feature matrices, using the [] notation that is available for
    # matrices in Kaldi.
    # The frame shift comes from $srcdir/frame_shift when that file is
    # non-empty, otherwise from utils/data/get_frame_shift.sh.
    if [ ! -s "$srcdir/frame_shift" ]; then
      frame_shift=$(utils/data/get_frame_shift.sh "$srcdir") || exit 1
    else
      frame_shift=$(cat "$srcdir/frame_shift")
    fi
    echo "$0: note: frame shift is $frame_shift [affects feats.scp]"

    # The subsegments format is <new-utt-id> <old-utt-id> <start-time> <end-time>.
    # e.g. 'utt_foo-1 utt_foo 7.21 8.93'
    # The first awk command replaces this with the format:
    # <new-utt-id> <old-utt-id> <first-frame> <last-frame>
    # e.g. 'utt_foo-1 utt_foo 721 892'
    # and the apply_map.pl command replaces 'utt_foo' (the 2nd field) with its corresponding entry
    # from the original feats.scp, so we get a line like:
    # e.g. 'utt_foo-1 foo-bar.ark:514231 721 892'
    # Note: the reason we subtract one from the last time is that it's going to
    # represent the 'last' frame, not the 'end' frame [i.e. not one past the last],
    # in the matlab-like, but zero-indexed [first:last] notion.  For instance, a segment with 1 frame
    # would have start-time 0.00 and end-time 0.01, which would become the frame range
    # [0:0]
    # The second awk command turns this into something like
    # utt_foo-1 foo-bar.ark:514231[721:892]
    # It has to be a bit careful because the format actually allows for more general things
    # like pipes that might contain spaces, so it has to be able to produce output like the
    # following:
    # utt_foo-1 some command|[721:892]
    # The 'end' frame is ensured to not exceed the feature archive size of
    # <old-utt-id>. This is done using the script fix_subsegment_feats.pl.
    # e.g if the number of frames in foo-bar.ark is 891, then the features are
    # truncated to that many frames.
    # utt_foo-1 foo-bar.ark:514231[721:890]
    # Lastly, utils/data/normalize_data_range.pl will only do something nontrivial if
    # the original data-dir already had data-ranges in square brackets.

    # Here, we compute the maximum 'end' frame allowed for each <new-utt-id>.
    # This is equal to the number of frames in the feature archive for <old-utt-id>.
    if [ ! -f "$srcdir/utt2num_frames" ]; then
      echo "$0: WARNING: Could not find $srcdir/utt2num_frames. It might take a long time to run get_utt2num_frames.sh."
      echo "Increase the number of jobs or write this file while extracting features by passing --write-utt2num-frames true to steps/make_mfcc.sh etc."
    fi
    utils/data/get_utt2num_frames.sh --cmd "$cmd" --nj "$nj" "$srcdir"
    awk '{print $1" "$2}' "$subsegments" | \
      utils/apply_map.pl -f 2 "$srcdir/utt2num_frames" > \
      "$dir/utt2max_frames"

    # On failure the handler must exit non-zero explicitly: a bare 'exit' would
    # return the status of the preceding (successful) echo, i.e. 0.
    awk -v s="$frame_shift" '{print $1, $2, int(($3/s)+0.5), int(($4/s)-0.5);}' <"$subsegments" | \
      utils/apply_map.pl -f 2 "$srcdir/feats.scp" | \
      awk '{for (n=1;n<NF-2;n++) printf("%s ", $n); k=NF-2; l=NF-1; printf("%s[%d:%d]\n", $k, $l, $NF)}' | \
      utils/data/fix_subsegment_feats.pl "$dir/utt2max_frames" | \
      utils/data/normalize_data_range.pl >"$dir/feats.scp" || { echo "Failed to create $dir/feats.scp"; exit 1; }

    # Parse the frame ranges from feats.scp, which is in the form of [first-frame:last-frame]
    # and write the number-of-frames = last-frame - first-frame + 1 for the utterance.
    perl -ne 'm/^(\S+) .+\[(\d+):(\d+)\]$/; print "$1 " . ($3-$2+1) . "\n"' \
      <"$dir/feats.scp" >"$dir/utt2num_frames"

    # Here we add frame ranges to the elements of vad.scp, as we did for rows of feats.scp above.
    if [ -f "$srcdir/vad.scp" ]; then
      awk -v s="$frame_shift" '{print $1, $2, int(($3/s)+0.5), int(($4/s)-0.5);}' <"$subsegments" | \
        utils/apply_map.pl -f 2 "$srcdir/vad.scp" | \
        awk '{for (n=1;n<NF-2;n++) printf("%s ", $n); k=NF-2; l=NF-1; printf("%s[%d:%d]\n", $k, $l, $NF)}' | \
        utils/data/fix_subsegment_feats.pl "$dir/utt2max_frames" | \
        utils/data/normalize_data_range.pl >"$dir/vad.scp"
    fi
  fi
  
  
  # This script never writes cmvn.scp into $dir, so one found here must
  # already have been in the destination directory; it does not describe the
  # sub-segmented features, so delete it and tell the user.
  if [ -f "$dir/cmvn.scp" ]; then
    rm "$dir/cmvn.scp"
    echo "$0: warning: removing $dir/cmvn.scp, you will have to regenerate it from the features."
  fi

  # Remove the utt2dur file in case it's now invalid -- it will be regenerated
  # from the segments file.  A missing file is not an error, hence the
  # deliberate '|| true' under 'set -e'.
  rm "$dir/utt2dur" 2>/dev/null || true
  
  # Copy over, unchanged, whichever of these optional files the source
  # directory has.
  for meta in spk2gender glm stm; do
    if [ -f "$srcdir/$meta" ]; then
      cp "$srcdir/$meta" "$dir"
    fi
  done
  
  # Files that are deliberately not carried over to the new directory: if the
  # source has one, only print a notice saying so.
  for f in ctm; do
    if [ -f "$srcdir/$f" ]; then
      echo "$0: not copying $srcdir/$f to $dir because sub-segmenting it is "
      echo " ... not implemented yet (and probably it's not needed.)"
    fi
  done
  
  # Delete the temporary new-utt -> old-utt mapping created earlier.
  rm "$dir/new2old_utt"

  echo "$0: subsegmented data from $srcdir to $dir"