egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh

  #! /bin/bash
  
  # Copyright 2017  Vimal Manohar
  # Apache 2.0
    
  # This script prepares targets for training a neural network for
  # speech activity detection (SAD).
  # See steps/segmentation/lats_to_targets.sh for details about the
  # format of the targets.
  
  # The targets are obtained from a combination of supervision-constrained
  # lattices and lattices obtained by decoding. We also assume that the
  # out-of-segment regions are all silence (target values of [ 1 0 0 ]).
  # The targets from the multiple sources are merged by a weighted average
  # using the weights specified by --merge-weights. Frames where the labels
  # from the multiple sources do not match are removed in the script
  # steps/segmentation/merge_targets_dirs.sh.
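  # For illustration only (the authoritative format is described in
  # steps/segmentation/lats_to_targets.sh): conceptually, the targets for a
  # recording form a matrix with one row per frame and 3 columns giving the
  # (soft) weights of the silence, speech and garbage classes, e.g.
  #   reco1  [ 0.9 0.1 0.0
  #            0.2 0.8 0.0
  #            0.0 0.0 0.0 ]
  # where a row of [ 0 0 0 ] marks a frame with no usable supervision
  # (see the merging comments below).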
  
  # In this script, we use GMMs trained for ASR on in-domain data 
  # to generate the lattices required for creating the targets. To generate
  # supervision-constrained lattices, we use speaker-adapted GMM models. To 
  # generate lattices without supervision, we use speaker-independent GMM models
  # from the LDA+MLLT stage, but apply per-recording cepstral mean subtraction.
  # The phones in the lattices are mapped deterministically to 0, 1 and 2,
  # representing the silence, speech and garbage classes respectively.
  # The mapping is defined by the --garbage-phones-list and
  # --silence-phones-list options. When these are unspecified, the silence
  # phones other than the OOV phone are mapped to the silence class and the
  # OOV phone is mapped to the garbage class.
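  # For example (hypothetical phone names; the actual ones depend on the lang
  # directory), one might prepare the lists as follows:
  #   echo -e "SIL\nNSN" > silence_phones.txt
  #   echo "SPN" > garbage_phones.txt
  #   ... --silence-phones-list silence_phones.txt \
  #       --garbage-phones-list garbage_phones.txt ...
  # so that SIL and NSN map to the silence class, SPN maps to the garbage
  # class, and all remaining phones map to the speech class.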
  
  stage=-1
  train_cmd=run.pl
  decode_cmd=run.pl
  nj=4
  reco_nj=4
  
  lang_test=    # If different from $lang
  graph_dir=    # If not provided, a new one will be created using $lang_test
  
  garbage_phones_list=
  silence_phones_list=
  
  # Uniform segmentation options for decoding whole recordings. All values are in
  # seconds.
  max_segment_duration=10
  overlap_duration=2.5
  max_remaining_duration=5  # If the last remaining piece when splitting uniformly
                            # is smaller than this duration, then it is merged
                            # with the previous piece.
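  # Rough illustration (the exact behaviour is defined in
  # utils/data/get_uniform_subsegments.py, and successive pieces are assumed
  # here to start every max_segment_duration - overlap_duration = 7.5 sec):
  # with the defaults above, a 23-sec recording would be split into the
  # pieces [0, 10], [7.5, 17.5] and [15, 23]; the last piece runs to the end
  # of the recording because the leftover after it would be shorter than
  # max_remaining_duration.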
  
  # Comma-separated list of weights on the labels obtained from alignment,
  # the labels obtained from decoding, and the default labels in the
  # out-of-segment regions, respectively.
  merge_weights=1.0,0.1,0.5
  
  [ -f ./path.sh ] && . ./path.sh 
  
  set -e -u -o pipefail
  . utils/parse_options.sh 
  
  if [ $# -ne 6 ]; then
    cat <<EOF
    This script prepares targets for training a neural network for
    speech activity detection. The targets are obtained from a combination
    of supervision-constrained lattices and lattices obtained by decoding.
    See the comments in the script for more details.
  
    Usage: $0 <lang> <data> <whole-recording-data> <ali-model-dir> <model-dir> <dir>
     e.g.: $0 data/lang data/train data/train_whole exp/tri5 exp/tri4 exp/segmentation_1a
    
    Note: <whole-recording-data> is expected to have feats.scp, and <data> is
    expected to have a segments file. We will get the features for <data> by
    using row ranges of <whole-recording-data>/feats.scp. This script will
    work on a copy of <data> created to have the recording-id as the speaker-id.
  EOF
    exit 1
  fi
  
  lang=$1   # Must match the one used to train the models
  in_data_dir=$2
  in_whole_data_dir=$3
  ali_model_dir=$4  # Model directory used to align $data_dir to get the target
                    # labels for training SAD. This should typically be a
                    # speaker-adapted system.
  model_dir=$5      # Model directory used to decode the whole-recording version
                    # of $data_dir to get the target labels for training SAD.
                    # This should typically be a speaker-independent system,
                    # such as an LDA+MLLT system.
  dir=$6
  
  mkdir -p $dir
  
  if [ -z "$lang_test" ]; then
    lang_test=$lang
  fi
  
  extra_files=
  if [ -z "$graph_dir" ]; then
    extra_files="$extra_files $lang_test/G.fst $lang_test/phones.txt"
  else
    extra_files="$extra_files $graph_dir/HCLG.fst $graph_dir/phones.txt"
  fi
  
  for f in $in_whole_data_dir/feats.scp $in_data_dir/segments \
    $lang/phones.txt $garbage_phones_list $silence_phones_list \
    $ali_model_dir/final.mdl $model_dir/final.mdl $extra_files; do
    if [ ! -f $f ]; then
      echo "$0: Could not find file $f"
      exit 1
    fi
  done
  
  utils/validate_data_dir.sh $in_data_dir || exit 1
  utils/validate_data_dir.sh --no-text $in_whole_data_dir || exit 1
  
  if ! cat $garbage_phones_list $silence_phones_list | \
    steps/segmentation/internal/verify_phones_list.py $lang/phones.txt; then
    echo "$0: Invalid $garbage_phones_list $silence_phones_list"
    exit 1
  fi
  
  data_id=$(basename $in_data_dir)
  whole_data_id=$(basename $in_whole_data_dir)
  
  if [ $stage -le 0 ]; then
    rm -r $dir/$data_id 2>/dev/null || true
    mkdir -p $dir/$data_id
  
    utils/data/modify_speaker_info_to_recording.sh \
      $in_data_dir $dir/$data_id || exit 1
    utils/validate_data_dir.sh --no-feats $dir/$data_id || exit 1
  fi 
  
  # Work with a temporary data directory with recording-id as the speaker labels.
  data_dir=$dir/${data_id}
  
  ###############################################################################
  # Get feats for the manual segments
  ###############################################################################
  if [ $stage -le 1 ]; then
    utils/data/subsegment_data_dir.sh $in_whole_data_dir ${data_dir}/segments ${data_dir}/tmp
    cp $data_dir/tmp/feats.scp $data_dir
  
    steps/compute_cmvn_stats.sh $data_dir || exit 1
  fi
  
  if [ $stage -le 2 ]; then
    utils/copy_data_dir.sh $in_whole_data_dir $dir/$whole_data_id
  
    utils/fix_data_dir.sh $dir/$whole_data_id
  
    # Copy the CMVN stats to the whole directory
    cp $data_dir/cmvn.scp $dir/$whole_data_id
  fi
  
  # Work with a temporary data directory with CMVN stats computed using 
  # only the segments from the original data directory.
  whole_data_dir=$dir/$whole_data_id
  
  ###############################################################################
  # Obtain supervision-constrained lattices
  ###############################################################################
  sup_lats_dir=$dir/$(basename ${ali_model_dir})_sup_lats_${data_id}
  if [ $stage -le 2 ]; then
    steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
      ${data_dir} ${lang} ${ali_model_dir} $sup_lats_dir || exit 1
  fi
  
  ###############################################################################
  # Uniformly segment whole data directory for decoding
  ###############################################################################
  uniform_seg_data_dir=$dir/${whole_data_id}_uniformseg_${max_segment_duration}sec
  uniform_seg_data_id=$(basename $uniform_seg_data_dir)
  
  if [ $stage -le 3 ]; then
    utils/data/get_segments_for_data.sh ${whole_data_dir} > \
      ${whole_data_dir}/segments
  
    mkdir -p $uniform_seg_data_dir
  
    utils/data/get_uniform_subsegments.py \
      --max-segment-duration $max_segment_duration \
      --overlap-duration $overlap_duration \
      --max-remaining-duration $max_remaining_duration \
      ${whole_data_dir}/segments > $uniform_seg_data_dir/sub_segments
  
    utils/data/subsegment_data_dir.sh $whole_data_dir \
      $uniform_seg_data_dir/sub_segments $uniform_seg_data_dir
    cp $whole_data_dir/cmvn.scp $uniform_seg_data_dir/
  fi
  
  model_id=$(basename $model_dir)
  ###############################################################################
  # Create graph dir for decoding
  ###############################################################################
  if [ -z "$graph_dir" ]; then
    graph_dir=$dir/$model_id/graph
    if [ $stage -le 4 ]; then
      if [ ! -f $graph_dir/HCLG.fst ]; then
        rm -r $dir/lang_test 2>/dev/null || true
        cp -r $lang_test/ $dir/lang_test
        utils/mkgraph.sh $dir/lang_test $model_dir $graph_dir || exit 1
      fi
    fi
  fi
  
  ###############################################################################
  # Decode uniformly segmented data directory
  ###############################################################################
  model_id=$(basename $model_dir)
  decode_dir=$dir/${model_id}/decode_${uniform_seg_data_id}
  if [ $stage -le 5 ]; then 
    mkdir -p $decode_dir
    
    cp $model_dir/{final.mdl,final.mat,*_opts,tree} $dir/${model_id}
    cp $model_dir/phones.txt $dir/$model_id
  
    # We use a small beam and max-active since we are only interested in 
    # the speech / silence decisions, not the exact word sequences.
    steps/decode.sh --cmd "$decode_cmd --mem 2G" --nj $nj \
      --max-active 1000 --beam 10.0 \
      --decode-extra-opts "--word-determinize=false" --skip-scoring true \
      $graph_dir $uniform_seg_data_dir $decode_dir
  fi
  
  ali_model_id=$(basename $ali_model_dir)
  ###############################################################################
  # Get frame-level targets from lattices for nnet training.
  # The targets are matrices with 3 columns -- silence, speech and garbage.
  # The target values are obtained by summing the posterior probabilities of
  # lattice arcs (from lattice-arc-post) over silence, speech and garbage
  # phones.
  ###############################################################################
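  # Illustrative example (made-up numbers): if, at some frame, the lattice
  # posteriors sum to 0.70 over silence phones, 0.25 over speech phones and
  # 0.05 over garbage phones, the target row for that frame would be
  # [ 0.70 0.25 0.05 ].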
  if [ $stage -le 6 ]; then
    steps/segmentation/lats_to_targets.sh --cmd "$train_cmd" \
      --silence-phones "$silence_phones_list" \
      --garbage-phones "$garbage_phones_list" \
      --max-phone-duration 0.5 \
      $data_dir $lang $sup_lats_dir \
      $dir/${ali_model_id}_${data_id}_sup_targets
  fi
  
  if [ $stage -le 7 ]; then
    steps/segmentation/lats_to_targets.sh --cmd "$train_cmd" \
      --silence-phones "$silence_phones_list" \
      --garbage-phones "$garbage_phones_list" \
      --max-phone-duration 0.5 \
      $uniform_seg_data_dir $lang $decode_dir \
      $dir/${model_id}_${uniform_seg_data_id}_targets
  fi
  
  ###############################################################################
  # Convert the targets to be w.r.t. the whole data directory and subsample
  # them by a factor of 3.
  # Since the targets from the supervision-constrained lattices only have
  # values for the manual segments, they are converted to whole-recording
  # level by inserting [ 0 0 0 ] for the regions outside the manual segments.
  ###############################################################################
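  # Illustrative example (made-up numbers; the exact resampling is handled by
  # steps/segmentation/resample_targets_dir.sh): if a 100-frame recording has
  # one manual segment covering frames 30-59, the supervision targets for
  # frames 0-29 and 60-99 become [ 0 0 0 ], while the targets inside the
  # segment keep their values. Resampling by a factor of 3 then reduces the
  # 100 frame-level rows to roughly 100/3 = 33 rows, matching a
  # frame-subsampling factor of 3 at the network output.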
  if [ $stage -le 8 ]; then
    steps/segmentation/convert_targets_dir_to_whole_recording.sh --cmd "$train_cmd" --nj $reco_nj \
      $data_dir $whole_data_dir \
      $dir/${ali_model_id}_${data_id}_sup_targets \
      $dir/${ali_model_id}_${whole_data_id}_sup_targets
    
    steps/segmentation/resample_targets_dir.sh --cmd "$train_cmd" --nj $reco_nj 3 \
      $whole_data_dir \
      $dir/${ali_model_id}_${whole_data_id}_sup_targets \
      $dir/${ali_model_id}_${whole_data_id}_sup_targets_sub3
  fi
  
  ###############################################################################
  # Convert the targets from decoding to whole recording. 
  ###############################################################################
  if [ $stage -le 9 ]; then
    steps/segmentation/convert_targets_dir_to_whole_recording.sh --cmd "$train_cmd" --nj $reco_nj \
      $dir/${uniform_seg_data_id} $whole_data_dir \
      $dir/${model_id}_${uniform_seg_data_id}_targets \
      $dir/${model_id}_${whole_data_id}_targets
  
    steps/segmentation/resample_targets_dir.sh --cmd "$train_cmd" --nj $reco_nj 3 \
      $whole_data_dir \
      $dir/${model_id}_${whole_data_id}_targets \
      $dir/${model_id}_${whole_data_id}_targets_sub3
  fi
  
  ###############################################################################
  # "default targets" values for the out-of-manual-segment regions.
  # We assume in this setup that this is silence i.e. [ 1 0 0 ].
  ###############################################################################
  
  if [ $stage -le 10 ]; then
    echo " [ 1 0 0 ]" > $dir/default_targets.vec
    steps/segmentation/get_targets_for_out_of_segments.sh --cmd "$train_cmd" \
      --nj $reco_nj --frame-subsampling-factor 3 \
      --default-targets $dir/default_targets.vec \
      $data_dir $whole_data_dir $dir/out_of_seg_${whole_data_id}_default_targets_sub3
  fi
  
  ###############################################################################
  # Merge targets for the same data from multiple sources (systems).
  # --weights is used to give the targets from alignment a higher weight
  # than the targets from decoding.
  # If --remove-mismatch-frames is true, then frames where the alignment and
  # the decoding disagree (i.e. they put more than 0.5 probability on
  # different classes) are removed by setting their targets to [ 0 0 0 ].
  ###############################################################################
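  # Illustrative example (made-up numbers, assuming a simple weighted average
  # with the default merge_weights=1.0,0.1,0.5): if, at some frame, the
  # alignment targets are [ 0 1 0 ], the decoding targets are [ 0 1 0 ] and
  # the default targets are [ 0 0 0 ] (the frame lies inside a manual
  # segment), the merged row would be
  #   (1.0*[0 1 0] + 0.1*[0 1 0] + 0.5*[0 0 0]) / (1.0 + 0.1 + 0.5)
  #     = [ 0 0.6875 0 ].
  # If instead the alignment gave speech [ 0 1 0 ] but the decoding gave
  # silence [ 1 0 0 ], the frame would be dropped (targets set to [ 0 0 0 ])
  # because --remove-mismatch-frames is true.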
  if [ $stage -le 11 ]; then
    steps/segmentation/merge_targets_dirs.sh --cmd "$train_cmd" --nj $reco_nj \
      --weights $merge_weights --remove-mismatch-frames true \
      $whole_data_dir \
      $dir/${ali_model_id}_${whole_data_id}_sup_targets_sub3 \
      $dir/${model_id}_${whole_data_id}_targets_sub3 \
      $dir/out_of_seg_${whole_data_id}_default_targets_sub3 \
      $dir/${whole_data_id}_combined_targets_sub3
  fi
  
  cp $dir/${whole_data_id}_combined_targets_sub3/targets.scp $dir/
  
  echo "$0: Prepared targets in $dir/targets.scp"