Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/steps/segmentation/lats_to_targets.sh 4.57 KB
  #!/bin/bash
  
  # Copyright 2017  Vimal Manohar
  # Apache 2.0
  
  # This script converts lattices into targets for training neural network
  # for speech activity detection. The targets is a matrix of size 
  # (num-frames-subsampled x 3)
  # with each row representing probabilities for speech, silence and 
  # garbage classes for the corresponding frame (after subsampling). The 
  # probability values are lattice posteriors for the 3 classes and are
  # obtained by summing up phone arc posteriors for the phones
  # corresponding to each class.
  # The mapping from phones to speech / silence / garbage classes
  # is defined by the options --silence-phones and --garbage-phones.
  # Also "speech" phones longer than --max-phone-duration seconds are 
  # treated as "garbage".
  
  set -o pipefail
  
  silence_phones=
  garbage_phones=
  max_phone_duration=0.5
  acwt=0.1
  
  cmd=run.pl
  
  [ -f ./path.sh ] && . ./path.sh
  . utils/parse_options.sh
  
  if [ $# -ne 4 ]; then
    cat <<EOF
    This script converts lattices into targets for training neural network
    for speech activity detection. The targets is a matrix of size 
    (num-frames-subsampled x 3)
    with each row representing probabilities for speech, silence and 
    garbage classes for the corresponding frame (after subsampling). The 
    probability values are lattice posteriors for the 3 classes and are
    obtained by summing up phone arc posteriors for the phones
    corresponding to each class.
    The mapping from phones to speech / silence / garbage classes
    is defined by the options --silence-phones and --garbage-phones.
    Also "speech" phones longer than --max-phone-duration seconds are 
    treated as "garbage".
  
    Usage: steps/segmentation/lats_to_targets.sh <data-dir> <lang> <lattice-dir> <targets-dir>"
    e.g.: steps/segmentation/lats_to_targets.sh \
    --silence-phones exp/segmentation1a/silence_phones.txt \
    --garbage-phones exp/segmentation1a/garbage_phones.txt \
    --max-phone-duration 0.5 \
    data/train_split10s data/lang \
    exp/segmentation1a/tri3b_train_split10s_lats \
    exp/segmentation1a/tri3b_train_split10s_targets
  
    note: 
    silence_phones.txt and garbage_phones.txt must list phones, one per line.
    garbage_phones.txt can contain phones corresponding to ambiguous items like 
    OOV, laugh and spoken noise that you want to map to "garbage class".
    silence_phones.txt might just contain the phones from 
    data/lang/phones/silence_phones.txt other than the garbage phones. These
    are mapped to the "silence" class.
  EOF
    exit 1
  fi
  
  data=$1
  lang=$2
  lats_dir=$3
  dir=$4
  
  if [ -f $lats_dir/final.mdl ]; then
    srcdir=$lats_dir
  else
    srcdir=$lats_dir/..
  fi
  
  for f in $data/utt2spk $lats_dir/lat.1.gz $srcdir/final.mdl; do 
    if [ ! -f $f ]; then
      echo "$0: Could not find file $f"
      exit 1
    fi
  done
  
  mkdir -p $dir
  
  if [ -z "$garbage_phones" ]; then
    oov_phone=$(steps/segmentation/internal/get_oov_phone.py $lang) || exit 1
    echo $oov_phone | utils/int2sym.pl $lang/phones.txt > $dir/garbage_phones.txt || exit 1
  else 
    cp $garbage_phones $dir/garbage_phones.txt || exit 1
  fi
  
  if [ -z "$silence_phones" ]; then
    cat $lang/silence_phones.txt | \
      utils/filter_scp.pl --exclude $dir/garbage_phones.txt > \
      $dir/silence_phones.txt
  else 
    cp $silence_phones $dir/silence_phones.txt
  fi
  
  nj=$(cat $lats_dir/num_jobs) || exit 1
  
  $cmd JOB=1:$nj $dir/log/get_arc_info.JOB.log \
    lattice-push "ark:gunzip -c $lats_dir/lat.JOB.gz |" ark:- \| \
    lattice-align-phones --replace-output-symbols=true $srcdir/final.mdl ark:- ark:- \| \
    lattice-arc-post --acoustic-scale=$acwt $srcdir/final.mdl ark:- - \| \
    utils/int2sym.pl -f 5 $lang/phones.txt '>' \
    $dir/arc_info_sym.JOB.txt || exit 1
  
  # make $dir an absolute pathname.
  dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}`
  
  frame_subsampling_factor=1
  if [ -f $srcdir/frame_subsampling_factor ]; then
    frame_subsampling_factor=$(cat $srcdir/frames_subsampling_factor)
    echo $frame_subsampling_factor > $dir/frame_subsampling_factor
  fi
  
  frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1
  max_phone_len=$(perl -e "print int($max_phone_duration / $frame_shift)")
  
  $cmd JOB=1:$nj $dir/log/get_targets.JOB.log \
    steps/segmentation/internal/arc_info_to_targets.py \
      --silence-phones=$dir/silence_phones.txt \
      --garbage-phones=$dir/garbage_phones.txt \
      --max-phone-length=$max_phone_len \
      $dir/arc_info_sym.JOB.txt - \| \
    copy-feats ark,t:- \
      ark,scp:$dir/targets.JOB.ark,$dir/targets.JOB.scp || exit 1
  
  for n in $(seq $nj); do
    cat $dir/targets.$n.scp
  done > $dir/targets.scp
  
  steps/segmentation/validate_targets_dir.sh $dir $data || exit 1
  
  echo "$0: Done creating targets in $dir/targets.scp"