# NOTE(review): the lines below are web "blame view" scrape metadata, not part
# of the script; commented out so this file remains valid shell.
# egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh (12.5 KB, commit 8dcb6dfcb)
#! /bin/bash # Copyright 2017 Vimal Manohar # Apache 2.0 # This script prepares targets for training neural network for # speech activity detction. # See steps/segmentation/lats_to_targets.sh for details about the # format of the targets. # The targets are obtained from a combination # of supervision-constrained lattices and lattices obtained by decoding. # Also, we assume that the out-of-segment regions are all silence (target # values of [ 1 0 0 ]. We merge the targets from the multiple sources # by a weighted average using weights specified by --weights. Also, # the frames where the labels from multiple sources do not match are # removed in the script steps/segmentation/merge_targets_dirs.sh. # In this script, we use GMMs trained for ASR on in-domain data # to generate the lattices required for creating the targets. To generate # supervision-constrained lattices, we use speaker-adapted GMM models. To # generate lattices without supervision, we use speaker-independent GMM models # from the LDA+MLLT stage, but apply per-recording cepstral mean subtraction. # The phones in the lattices are mapped deterministically to # 0, 1, and 2 representing respectively silence, speech and garbage classes. # The mapping is defined by --garbage-phones-list and --silence-phones-list # options. But when these are unspecified, the silence phones other than # oov are mapped to silence class and the oov is mapped to garbage class. stage=-1 train_cmd=run.pl decode_cmd=run.pl nj=4 reco_nj=4 lang_test= # If different from $lang graph_dir= # If not provided, a new one will be created using $lang_test garbage_phones_list= silence_phones_list= # Uniform segmentation options for decoding whole recordings. All values are in # seconds. max_segment_duration=10 overlap_duration=2.5 max_remaining_duration=5 # If the last remaining piece when splitting uniformly # is smaller than this duration, then the last piece # is merged with the previous. 
# List of weights on labels obtained from alignment,
# labels obtained from decoding and default labels in out-of-segment regions
merge_weights=1.0,0.1,0.5

[ -f ./path.sh ] && . ./path.sh

set -e -u -o pipefail

. utils/parse_options.sh

if [ $# -ne 6 ]; then
  cat <<EOF
This script prepares targets for training neural network for
speech activity detection.
The targets are obtained from a combination
of supervision-constrained lattices and lattices obtained by decoding.
See comments in the script for more details.
Usage: $0 <lang> <data> <whole-recording-data> <ali-model-dir> <model-dir> <dir>
 e.g.: $0 data/lang data/train data/train_whole exp/tri5 exp/tri4 exp/segmentation_1a
Note: <whole-recording-data> is expected to have feats.scp and
<data> expected to have segments file.
We will get the features for <data> by using row ranges of
<whole-recording-data>/feats.scp.
This script will work on a copy of <data> created to have the
recording-id as the speaker-id.
EOF
  exit 1
fi

lang=$1            # Must match the one used to train the models
in_data_dir=$2
in_whole_data_dir=$3
ali_model_dir=$4   # Model directory used to align the $data_dir to get target
                   # labels for training SAD. This should typically be a
                   # speaker-adapted system.
model_dir=$5       # Model directory used to decode the whole-recording version
                   # of the $data_dir to get target labels for training SAD.
                   # This should typically be a speaker-independent system like
                   # LDA+MLLT system.
dir=$6

mkdir -p $dir

if [ -z "$lang_test" ]; then
  lang_test=$lang
fi

# $extra_files is deliberately a space-separated list; the unquoted expansion
# in the 'for' loop below relies on word splitting.
extra_files=
if [ -z "$graph_dir" ]; then
  extra_files="$extra_files $lang_test/G.fst $lang_test/phones.txt"
else
  extra_files="$extra_files $graph_dir/HCLG.fst $graph_dir/phones.txt"
fi

for f in $in_whole_data_dir/feats.scp $in_data_dir/segments \
    $lang/phones.txt $garbage_phones_list $silence_phones_list \
    $ali_model_dir/final.mdl $model_dir/final.mdl $extra_files; do
  if [ ! -f $f ]; then
    echo "$0: Could not find file $f"
    exit 1
  fi
done

utils/validate_data_dir.sh $in_data_dir || exit 1
utils/validate_data_dir.sh --no-text $in_whole_data_dir || exit 1

# Verify that all the listed garbage/silence phones exist in $lang/phones.txt.
if ! cat $garbage_phones_list $silence_phones_list | \
  steps/segmentation/internal/verify_phones_list.py $lang/phones.txt; then
  echo "$0: Invalid $garbage_phones_list $silence_phones_list"
  exit 1
fi

data_id=$(basename $in_data_dir)
whole_data_id=$(basename $in_whole_data_dir)

if [ $stage -le 0 ]; then
  rm -r $dir/$data_id 2>/dev/null || true
  mkdir -p $dir/$data_id
  utils/data/modify_speaker_info_to_recording.sh \
    $in_data_dir $dir/$data_id || exit 1
  utils/validate_data_dir.sh --no-feats $dir/$data_id || exit 1
fi

# Work with a temporary data directory with recording-id as the speaker labels.
data_dir=$dir/${data_id}

###############################################################################
# Get feats for the manual segments
###############################################################################
if [ $stage -le 1 ]; then
  utils/data/subsegment_data_dir.sh $in_whole_data_dir \
    ${data_dir}/segments ${data_dir}/tmp
  cp $data_dir/tmp/feats.scp $data_dir
  steps/compute_cmvn_stats.sh $data_dir || exit 1
fi

# NOTE(review): stage number 2 is reused by the align_fmllr_lats block further
# below, so both blocks are run (or skipped) together under --stage; left
# as-is because renumbering would change --stage semantics for existing users.
if [ $stage -le 2 ]; then
  utils/copy_data_dir.sh $in_whole_data_dir $dir/$whole_data_id
  utils/fix_data_dir.sh $dir/$whole_data_id

  # Copy the CMVN stats to the whole directory
  cp $data_dir/cmvn.scp $dir/$whole_data_id
fi

# Work with a temporary data directory with CMVN stats computed using
# only the segments from the original data directory.
whole_data_dir=$dir/$whole_data_id

###############################################################################
# Obtain supervision-constrained lattices
###############################################################################
sup_lats_dir=$dir/$(basename ${ali_model_dir})_sup_lats_${data_id}
# NOTE(review): stage 2 is also the stage number of the copy-whole-data-dir
# block above; both run together under --stage <= 2.
if [ $stage -le 2 ]; then
  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
    ${data_dir} ${lang} ${ali_model_dir} $sup_lats_dir || exit 1
fi

###############################################################################
# Uniformly segment whole data directory for decoding
###############################################################################
uniform_seg_data_dir=$dir/${whole_data_id}_uniformseg_${max_segment_duration}sec
uniform_seg_data_id=$(basename $uniform_seg_data_dir)

if [ $stage -le 3 ]; then
  utils/data/get_segments_for_data.sh ${whole_data_dir} > \
    ${whole_data_dir}/segments

  mkdir -p $uniform_seg_data_dir

  utils/data/get_uniform_subsegments.py \
    --max-segment-duration $max_segment_duration \
    --overlap-duration $overlap_duration \
    --max-remaining-duration $max_remaining_duration \
    ${whole_data_dir}/segments > $uniform_seg_data_dir/sub_segments

  utils/data/subsegment_data_dir.sh $whole_data_dir \
    $uniform_seg_data_dir/sub_segments $uniform_seg_data_dir
  # Reuse the CMVN stats computed from the manual segments only.
  cp $whole_data_dir/cmvn.scp $uniform_seg_data_dir/
fi

model_id=$(basename $model_dir)

###############################################################################
# Create graph dir for decoding
###############################################################################
if [ -z "$graph_dir" ]; then
  graph_dir=$dir/$model_id/graph
  if [ $stage -le 4 ]; then
    if [ ! -f $graph_dir/HCLG.fst ]; then
      rm -r $dir/lang_test 2>/dev/null || true
      cp -r $lang_test/ $dir/lang_test
      utils/mkgraph.sh $dir/lang_test $model_dir $graph_dir || exit 1
    fi
  fi
fi

###############################################################################
# Decode uniformly segmented data directory
###############################################################################
# ($model_id was set above; the original redundantly recomputed it here.)
decode_dir=$dir/${model_id}/decode_${uniform_seg_data_id}

if [ $stage -le 5 ]; then
  mkdir -p $decode_dir
  cp $model_dir/{final.mdl,final.mat,*_opts,tree} $dir/${model_id}
  cp $model_dir/phones.txt $dir/$model_id

  # We use a small beam and max-active since we are only interested in
  # the speech / silence decisions, not the exact word sequences.
  steps/decode.sh --cmd "$decode_cmd --mem 2G" --nj $nj \
    --max-active 1000 --beam 10.0 \
    --decode-extra-opts "--word-determinize=false" --skip-scoring true \
    $graph_dir $uniform_seg_data_dir $decode_dir
fi

ali_model_id=$(basename $ali_model_dir)

###############################################################################
# Get frame-level targets from lattices for nnet training
# Targets are matrices of 3 columns -- silence, speech and garbage
# The target values are obtained by summing up posterior probabilities of
# arcs from lattice-arc-post over silence, speech and garbage phones.
###############################################################################
# Target directories produced here and consumed by the conversion stage below.
sup_targets=$dir/${ali_model_id}_${data_id}_sup_targets
whole_sup_targets=$dir/${ali_model_id}_${whole_data_id}_sup_targets
decode_targets=$dir/${model_id}_${uniform_seg_data_id}_targets

if [ $stage -le 6 ]; then
  # Targets from the supervision-constrained (forced-alignment) lattices.
  steps/segmentation/lats_to_targets.sh --cmd "$train_cmd" \
    --silence-phones "$silence_phones_list" \
    --garbage-phones "$garbage_phones_list" \
    --max-phone-duration 0.5 \
    $data_dir $lang $sup_lats_dir $sup_targets
fi

if [ $stage -le 7 ]; then
  # Targets from the lattices obtained by decoding the uniform segments.
  steps/segmentation/lats_to_targets.sh --cmd "$train_cmd" \
    --silence-phones "$silence_phones_list" \
    --garbage-phones "$garbage_phones_list" \
    --max-phone-duration 0.5 \
    $uniform_seg_data_dir $lang $decode_dir $decode_targets
fi

###############################################################################
# Convert targets to be w.r.t. whole data directory and subsample the
# targets by a factor of 3.
# Since the targets from transcript-constrained lattices have only values
# for the manual segments, these are converted to whole recording-levels
# by inserting [ 0 0 0 ] for the out-of-manual segment regions.
###############################################################################
if [ $stage -le 8 ]; then
  steps/segmentation/convert_targets_dir_to_whole_recording.sh \
    --cmd "$train_cmd" --nj $reco_nj \
    $data_dir $whole_data_dir \
    $sup_targets $whole_sup_targets

  steps/segmentation/resample_targets_dir.sh --cmd "$train_cmd" --nj $reco_nj 3 \
    $whole_data_dir \
    $whole_sup_targets ${whole_sup_targets}_sub3
fi

###############################################################################
# Convert the targets from decoding to whole recording.
###############################################################################
# Whole-recording version of the targets obtained from decoding.
decode_targets_whole=$dir/${model_id}_${whole_data_id}_targets

if [ $stage -le 9 ]; then
  steps/segmentation/convert_targets_dir_to_whole_recording.sh \
    --cmd "$train_cmd" --nj $reco_nj \
    $dir/${uniform_seg_data_id} $whole_data_dir \
    $dir/${model_id}_${uniform_seg_data_id}_targets \
    $decode_targets_whole

  steps/segmentation/resample_targets_dir.sh --cmd "$train_cmd" --nj $reco_nj 3 \
    $whole_data_dir \
    $decode_targets_whole ${decode_targets_whole}_sub3
fi

###############################################################################
# "default targets" values for the out-of-manual-segment regions.
# We assume in this setup that this is silence i.e. [ 1 0 0 ].
###############################################################################
if [ $stage -le 10 ]; then
  # One row of [ silence speech garbage ] posteriors applied to every frame
  # outside the manual segments.
  echo " [ 1 0 0 ]" > $dir/default_targets.vec
  steps/segmentation/get_targets_for_out_of_segments.sh --cmd "$train_cmd" \
    --nj $reco_nj --frame-subsampling-factor 3 \
    --default-targets $dir/default_targets.vec \
    $data_dir $whole_data_dir \
    $dir/out_of_seg_${whole_data_id}_default_targets_sub3
fi

###############################################################################
# Merge targets for the same data from multiple sources (systems)
# --weights is used to weight targets from alignment with a higher weight
# than the targets from decoding.
# If --remove-mismatch-frames is true, then if alignment and decoding
# disagree (more than 0.5 probability on different classes), then those frames
# are removed by setting targets to [ 0 0 0 ].
###############################################################################
if [ $stage -le 11 ]; then
  # Merge the three subsampled target sources -- alignment, decoding and the
  # out-of-segment defaults -- using the weights given by --merge-weights.
  steps/segmentation/merge_targets_dirs.sh --cmd "$train_cmd" --nj $reco_nj \
    --weights $merge_weights --remove-mismatch-frames true \
    $whole_data_dir \
    $dir/${ali_model_id}_${whole_data_id}_sup_targets_sub3 \
    $dir/${model_id}_${whole_data_id}_targets_sub3 \
    $dir/out_of_seg_${whole_data_id}_default_targets_sub3 \
    $dir/${whole_data_id}_combined_targets_sub3
fi

# Expose the final merged targets at the top level of $dir.
cp $dir/${whole_data_id}_combined_targets_sub3/targets.scp $dir/
echo "$0: Prepared targets in $dir/targets.scp"