egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh

  #! /bin/bash
  
  # Copyright 2017  Vimal Manohar
  # Apache 2.0
    
  # This script prepares targets for training a neural network for
  # speech activity detection (SAD).
  # See steps/segmentation/lats_to_targets.sh for details about the
  # format of the targets.
  
  # The targets are obtained from a combination of supervision-constrained
  # lattices and lattices obtained by decoding. We also assume that the
  # out-of-segment regions are all silence (target values of [ 1 0 0 ]).
  # The targets from the multiple sources are merged by a weighted average
  # using the weights specified by --merge-weights. Frames where the labels
  # from the multiple sources do not match are removed in the script
  # steps/segmentation/merge_targets_dirs.sh.
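  # For illustration only (the authoritative format is described in
  # steps/segmentation/lats_to_targets.sh): conceptually, the targets for a
  # recording form a matrix with one row per frame and 3 columns giving the
  # (soft) weights of the silence, speech and garbage classes, e.g.
  #   reco1  [ 0.9 0.1 0.0
  #            0.2 0.8 0.0
  #            0.0 0.0 0.0 ]
  # where a row of [ 0 0 0 ] marks a frame with no usable supervision
  # (see the merging comments below).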
  
  # In this script, we use GMMs trained for ASR on in-domain data 
  # to generate the lattices required for creating the targets. To generate
  # supervision-constrained lattices, we use speaker-adapted GMM models. To 
  # generate lattices without supervision, we use speaker-independent GMM models
  # from the LDA+MLLT stage, but apply per-recording cepstral mean subtraction.
  # The phones in the lattices are mapped deterministically to 0, 1 and 2,
  # representing the silence, speech and garbage classes respectively.
  # The mapping is defined by the --garbage-phones-list and
  # --silence-phones-list options. When these are unspecified, the silence
  # phones other than the OOV phone are mapped to the silence class and the
  # OOV phone is mapped to the garbage class.
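  # For example (hypothetical phone names; the actual ones depend on the lang
  # directory), one might prepare the lists as follows:
  #   echo -e "SIL\nNSN" > silence_phones.txt
  #   echo "SPN" > garbage_phones.txt
  #   ... --silence-phones-list silence_phones.txt \
  #       --garbage-phones-list garbage_phones.txt ...
  # so that SIL and NSN map to the silence class, SPN maps to the garbage
  # class, and all remaining phones map to the speech class.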
  
  stage=-1
  train_cmd=run.pl
  decode_cmd=run.pl
  nj=4
  reco_nj=4
  
  lang_test=    # If different from $lang
  graph_dir=    # If not provided, a new one will be created using $lang_test
  
  garbage_phones_list=
  silence_phones_list=
  
  # Uniform segmentation options for decoding whole recordings. All values are in
  # seconds.
  max_segment_duration=10
  overlap_duration=2.5
  max_remaining_duration=5  # If the last remaining piece when splitting uniformly
                            # is smaller than this duration, then it is merged
                            # with the previous piece.
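  # Rough illustration (the exact behaviour is defined in
  # utils/data/get_uniform_subsegments.py, and successive pieces are assumed
  # here to start every max_segment_duration - overlap_duration = 7.5 sec):
  # with the defaults above, a 23-sec recording would be split into the
  # pieces [0, 10], [7.5, 17.5] and [15, 23]; the last piece runs to the end
  # of the recording because the leftover after it would be shorter than
  # max_remaining_duration.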
  
  # Comma-separated list of weights on the labels obtained from alignment,
  # the labels obtained from decoding, and the default labels in the
  # out-of-segment regions, respectively.
  merge_weights=1.0,0.1,0.5
  
  [ -f ./path.sh ] && . ./path.sh 
  
  set -e -u -o pipefail
  . utils/parse_options.sh 
  
  if [ $# -ne 6 ]; then
    cat <<EOF
    This script prepares targets for training a neural network for
    speech activity detection. The targets are obtained from a combination
    of supervision-constrained lattices and lattices obtained by decoding.
    See the comments in the script for more details.
  
    Usage: $0 <lang> <data> <whole-recording-data> <ali-model-dir> <model-dir> <dir>
     e.g.: $0 data/lang data/train data/train_whole exp/tri5 exp/tri4 exp/segmentation_1a
    
    Note: <whole-recording-data> is expected to have feats.scp, and <data> is
    expected to have a segments file. We will get the features for <data> by
    using row ranges of <whole-recording-data>/feats.scp. This script will
    work on a copy of <data> created to have the recording-id as the speaker-id.
  EOF
    exit 1
  fi
  
  lang=$1   # Must match the one used to train the models
  in_data_dir=$2
  in_whole_data_dir=$3
  ali_model_dir=$4  # Model directory used to align $data_dir to get the target
                    # labels for training SAD. This should typically be a
                    # speaker-adapted system.
  model_dir=$5      # Model directory used to decode the whole-recording version
                    # of $data_dir to get the target labels for training SAD.
                    # This should typically be a speaker-independent system,
                    # such as an LDA+MLLT system.
  dir=$6
  
  mkdir -p $dir
  
  if [ -z "$lang_test" ]; then
    lang_test=$lang
  fi
  
  extra_files=
  if [ -z "$graph_dir" ]; then
    extra_files="$extra_files $lang_test/G.fst $lang_test/phones.txt"
  else
    extra_files="$extra_files $graph_dir/HCLG.fst $graph_dir/phones.txt"
  fi
  
  for f in $in_whole_data_dir/feats.scp $in_data_dir/segments \
    $lang/phones.txt $garbage_phones_list $silence_phones_list \
    $ali_model_dir/final.mdl $model_dir/final.mdl $extra_files; do
    if [ ! -f $f ]; then
      echo "$0: Could not find file $f"
      exit 1
    fi
  done
  
  utils/validate_data_dir.sh $in_data_dir || exit 1
  utils/validate_data_dir.sh --no-text $in_whole_data_dir || exit 1
  
  if ! cat $garbage_phones_list $silence_phones_list | \
    steps/segmentation/internal/verify_phones_list.py $lang/phones.txt; then
    echo "$0: Invalid $garbage_phones_list $silence_phones_list"
    exit 1
  fi
  
  data_id=$(basename $in_data_dir)
  whole_data_id=$(basename $in_whole_data_dir)
  
  if [ $stage -le 0 ]; then
    rm -r $dir/$data_id 2>/dev/null || true
    mkdir -p $dir/$data_id
  
    utils/data/modify_speaker_info_to_recording.sh \
      $in_data_dir $dir/$data_id || exit 1
    utils/validate_data_dir.sh --no-feats $dir/$data_id || exit 1
  fi 
  
  # Work with a temporary data directory with recording-id as the speaker labels.
  data_dir=$dir/${data_id}
  
  ###############################################################################
  # Get feats for the manual segments
  ###############################################################################
  if [ $stage -le 1 ]; then
    utils/data/subsegment_data_dir.sh $in_whole_data_dir ${data_dir}/segments ${data_dir}/tmp
    cp $data_dir/tmp/feats.scp $data_dir
  
    steps/compute_cmvn_stats.sh $data_dir || exit 1
  fi
  
  if [ $stage -le 2 ]; then
    utils/copy_data_dir.sh $in_whole_data_dir $dir/$whole_data_id
  
    utils/fix_data_dir.sh $dir/$whole_data_id
  
    # Copy the CMVN stats to the whole directory
    cp $data_dir/cmvn.scp $dir/$whole_data_id
  fi
  
  # Work with a temporary data directory with CMVN stats computed using 
  # only the segments from the original data directory.
  whole_data_dir=$dir/$whole_data_id
  
  ###############################################################################
  # Obtain supervision-constrained lattices
  ###############################################################################
  sup_lats_dir=$dir/$(basename ${ali_model_dir})_sup_lats_${data_id}
  if [ $stage -le 2 ]; then
    steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
      ${data_dir} ${lang} ${ali_model_dir} $sup_lats_dir || exit 1
  fi
  
  ###############################################################################
  # Uniformly segment whole data directory for decoding
  ###############################################################################
  uniform_seg_data_dir=$dir/${whole_data_id}_uniformseg_${max_segment_duration}sec
  uniform_seg_data_id=$(basename $uniform_seg_data_dir)
  
  if [ $stage -le 3 ]; then
    utils/data/get_segments_for_data.sh ${whole_data_dir} > \
      ${whole_data_dir}/segments
  
    mkdir -p $uniform_seg_data_dir
  
    utils/data/get_uniform_subsegments.py \
      --max-segment-duration $max_segment_duration \
      --overlap-duration $overlap_duration \
      --max-remaining-duration $max_remaining_duration \
      ${whole_data_dir}/segments > $uniform_seg_data_dir/sub_segments
  
    utils/data/subsegment_data_dir.sh $whole_data_dir \
      $uniform_seg_data_dir/sub_segments $uniform_seg_data_dir
    cp $whole_data_dir/cmvn.scp $uniform_seg_data_dir/
  fi
  
  model_id=$(basename $model_dir)
  ###############################################################################
  # Create graph dir for decoding
  ###############################################################################
  if [ -z "$graph_dir" ]; then
    graph_dir=$dir/$model_id/graph
    if [ $stage -le 4 ]; then
      if [ ! -f $graph_dir/HCLG.fst ]; then
        rm -r $dir/lang_test 2>/dev/null || true
        cp -r $lang_test/ $dir/lang_test
        utils/mkgraph.sh $dir/lang_test $model_dir $graph_dir || exit 1
      fi
    fi
  fi
  
  ###############################################################################
  # Decode uniformly segmented data directory
  ###############################################################################
  model_id=$(basename $model_dir)
  decode_dir=$dir/${model_id}/decode_${uniform_seg_data_id}
  if [ $stage -le 5 ]; then 
    mkdir -p $decode_dir
    
    cp $model_dir/{final.mdl,final.mat,*_opts,tree} $dir/${model_id}
    cp $model_dir/phones.txt $dir/$model_id
  
    # We use a small beam and max-active since we are only interested in 
    # the speech / silence decisions, not the exact word sequences.
    steps/decode.sh --cmd "$decode_cmd --mem 2G" --nj $nj \
      --max-active 1000 --beam 10.0 \
      --decode-extra-opts "--word-determinize=false" --skip-scoring true \
      $graph_dir $uniform_seg_data_dir $decode_dir
  fi
  
  ali_model_id=$(basename $ali_model_dir)
  ###############################################################################
  # Get frame-level targets from lattices for nnet training.
  # The targets are matrices with 3 columns -- silence, speech and garbage.
  # The target values are obtained by summing the posterior probabilities of
  # lattice arcs (from lattice-arc-post) over silence, speech and garbage
  # phones.
  ###############################################################################
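  # Illustrative example (made-up numbers): if, at some frame, the lattice
  # posteriors sum to 0.70 over silence phones, 0.25 over speech phones and
  # 0.05 over garbage phones, the target row for that frame would be
  # [ 0.70 0.25 0.05 ].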
  if [ $stage -le 6 ]; then
    steps/segmentation/lats_to_targets.sh --cmd "$train_cmd" \
      --silence-phones "$silence_phones_list" \
      --garbage-phones "$garbage_phones_list" \
      --max-phone-duration 0.5 \
      $data_dir $lang $sup_lats_dir \
      $dir/${ali_model_id}_${data_id}_sup_targets
  fi
  
  if [ $stage -le 7 ]; then
    steps/segmentation/lats_to_targets.sh --cmd "$train_cmd" \
      --silence-phones "$silence_phones_list" \
      --garbage-phones "$garbage_phones_list" \
      --max-phone-duration 0.5 \
      $uniform_seg_data_dir $lang $decode_dir \
      $dir/${model_id}_${uniform_seg_data_id}_targets
  fi
  
  ###############################################################################
  # Convert the targets to be w.r.t. the whole data directory and subsample
  # them by a factor of 3.
  # Since the targets from the supervision-constrained lattices only have
  # values for the manual segments, they are converted to whole-recording
  # level by inserting [ 0 0 0 ] for the regions outside the manual segments.
  ###############################################################################
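  # Illustrative example (made-up numbers; the exact resampling is handled by
  # steps/segmentation/resample_targets_dir.sh): if a 100-frame recording has
  # one manual segment covering frames 30-59, the supervision targets for
  # frames 0-29 and 60-99 become [ 0 0 0 ], while the targets inside the
  # segment keep their values. Resampling by a factor of 3 then reduces the
  # 100 frame-level rows to roughly 100/3 = 33 rows, matching a
  # frame-subsampling factor of 3 at the network output.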
  if [ $stage -le 8 ]; then
    steps/segmentation/convert_targets_dir_to_whole_recording.sh --cmd "$train_cmd" --nj $reco_nj \
      $data_dir $whole_data_dir \
      $dir/${ali_model_id}_${data_id}_sup_targets \
      $dir/${ali_model_id}_${whole_data_id}_sup_targets
    
    steps/segmentation/resample_targets_dir.sh --cmd "$train_cmd" --nj $reco_nj 3 \
      $whole_data_dir \
      $dir/${ali_model_id}_${whole_data_id}_sup_targets \
      $dir/${ali_model_id}_${whole_data_id}_sup_targets_sub3
  fi
  
  ###############################################################################
  # Convert the targets from decoding to whole recording. 
  ###############################################################################
  if [ $stage -le 9 ]; then
    steps/segmentation/convert_targets_dir_to_whole_recording.sh --cmd "$train_cmd" --nj $reco_nj \
      $dir/${uniform_seg_data_id} $whole_data_dir \
      $dir/${model_id}_${uniform_seg_data_id}_targets \
      $dir/${model_id}_${whole_data_id}_targets
  
    steps/segmentation/resample_targets_dir.sh --cmd "$train_cmd" --nj $reco_nj 3 \
      $whole_data_dir \
      $dir/${model_id}_${whole_data_id}_targets \
      $dir/${model_id}_${whole_data_id}_targets_sub3
  fi
  
  ###############################################################################
  # "default targets" values for the out-of-manual-segment regions.
  # We assume in this setup that this is silence i.e. [ 1 0 0 ].
  ###############################################################################
  
  if [ $stage -le 10 ]; then
    echo " [ 1 0 0 ]" > $dir/default_targets.vec
    steps/segmentation/get_targets_for_out_of_segments.sh --cmd "$train_cmd" \
      --nj $reco_nj --frame-subsampling-factor 3 \
      --default-targets $dir/default_targets.vec \
      $data_dir $whole_data_dir $dir/out_of_seg_${whole_data_id}_default_targets_sub3
  fi
  
  ###############################################################################
  # Merge targets for the same data from multiple sources (systems).
  # --weights is used to give the targets from alignment a higher weight
  # than the targets from decoding.
  # If --remove-mismatch-frames is true, then frames where the alignment and
  # the decoding disagree (i.e. they put more than 0.5 probability on
  # different classes) are removed by setting their targets to [ 0 0 0 ].
  ###############################################################################
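  # Illustrative example (made-up numbers, assuming a simple weighted average
  # with the default merge_weights=1.0,0.1,0.5): if, at some frame, the
  # alignment targets are [ 0 1 0 ], the decoding targets are [ 0 1 0 ] and
  # the default targets are [ 0 0 0 ] (the frame lies inside a manual
  # segment), the merged row would be
  #   (1.0*[0 1 0] + 0.1*[0 1 0] + 0.5*[0 0 0]) / (1.0 + 0.1 + 0.5)
  #     = [ 0 0.6875 0 ].
  # If instead the alignment gave speech [ 0 1 0 ] but the decoding gave
  # silence [ 1 0 0 ], the frame would be dropped (targets set to [ 0 0 0 ])
  # because --remove-mismatch-frames is true.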
  if [ $stage -le 11 ]; then
    steps/segmentation/merge_targets_dirs.sh --cmd "$train_cmd" --nj $reco_nj \
      --weights $merge_weights --remove-mismatch-frames true \
      $whole_data_dir \
      $dir/${ali_model_id}_${whole_data_id}_sup_targets_sub3 \
      $dir/${model_id}_${whole_data_id}_targets_sub3 \
      $dir/out_of_seg_${whole_data_id}_default_targets_sub3 \
      $dir/${whole_data_id}_combined_targets_sub3
  fi
  
  cp $dir/${whole_data_id}_combined_targets_sub3/targets.scp $dir/
  
  echo "$0: Prepared targets in $dir/targets.scp"