Blame view
egs/wsj/s5/steps/segmentation/lats_to_targets.sh
4.57 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
#!/bin/bash

# Copyright 2017  Vimal Manohar
# Apache 2.0

# This script converts lattices into targets for training a neural network
# for speech activity detection. The targets are a matrix of size
# (num-frames-subsampled x 3)
# with each row representing probabilities for speech, silence and
# garbage classes for the corresponding frame (after subsampling). The
# probability values are lattice posteriors for the 3 classes and are
# obtained by summing up phone arc posteriors for the phones
# corresponding to each class.
# The mapping from phones to speech / silence / garbage classes
# is defined by the options --silence-phones and --garbage-phones.
# Also "speech" phones longer than --max-phone-duration seconds are
# treated as "garbage".

set -o pipefail

# Option defaults; each can be overridden on the command line
# (parsed by utils/parse_options.sh below).
silence_phones=          # file listing silence phones, one per line
garbage_phones=          # file listing garbage phones, one per line
max_phone_duration=0.5   # seconds; longer "speech" phones become "garbage"

acwt=0.1                 # acoustic scale passed to lattice-arc-post

cmd=run.pl               # job-dispatch command (may carry its own options)

[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh

if [ $# -ne 4 ]; then
  cat <<EOF
  This script converts lattices into targets for training neural network
  for speech activity detection. The targets is a matrix of size
  (num-frames-subsampled x 3)
  with each row representing probabilities for speech, silence and
  garbage classes for the corresponding frame (after subsampling). The
  probability values are lattice posteriors for the 3 classes and are
  obtained by summing up phone arc posteriors for the phones
  corresponding to each class.
  The mapping from phones to speech / silence / garbage classes
  is defined by the options --silence-phones and --garbage-phones.
  Also "speech" phones longer than --max-phone-duration seconds are
  treated as "garbage".

  Usage: steps/segmentation/lats_to_targets.sh <data-dir> <lang> <lattice-dir> <targets-dir>
    e.g.: steps/segmentation/lats_to_targets.sh \
      --silence-phones exp/segmentation1a/silence_phones.txt \
      --garbage-phones exp/segmentation1a/garbage_phones.txt \
      --max-phone-duration 0.5 \
      data/train_split10s data/lang \
      exp/segmentation1a/tri3b_train_split10s_lats \
      exp/segmentation1a/tri3b_train_split10s_targets

  note: silence_phones.txt and garbage_phones.txt must list phones, one per line.
  garbage_phones.txt can contain phones corresponding to ambiguous items like
  OOV, laugh and spoken noise that you want to map to "garbage class".
  silence_phones.txt might just contain the phones from
  data/lang/phones/silence_phones.txt other than the garbage phones. These
  are mapped to the "silence" class.
EOF
  exit 1
fi

data=$1
lang=$2
lats_dir=$3
dir=$4

# The model is taken from the lattice directory itself when it holds
# final.mdl, otherwise from its parent directory.
if [ -f "$lats_dir/final.mdl" ]; then
  srcdir=$lats_dir
else
  srcdir=$lats_dir/..
fi

for f in "$data/utt2spk" "$lats_dir/lat.1.gz" "$srcdir/final.mdl"; do
  if [ ! -f "$f" ]; then
    echo "$0: Could not find file $f"
    exit 1
  fi
done

mkdir -p "$dir"

# Garbage phones: default to the OOV phone reported by get_oov_phone.py
# (converted from integer id to symbol) unless a list was supplied.
if [ -z "$garbage_phones" ]; then
  oov_phone=$(steps/segmentation/internal/get_oov_phone.py "$lang") || exit 1
  echo $oov_phone | utils/int2sym.pl "$lang/phones.txt" > "$dir/garbage_phones.txt" || exit 1
else
  cp "$garbage_phones" "$dir/garbage_phones.txt" || exit 1
fi

# Silence phones: default to the lang directory's list minus the garbage
# phones unless a list was supplied.
# NOTE(review): the usage text refers to data/lang/phones/silence_phones.txt
# while this reads $lang/silence_phones.txt -- confirm the intended path.
if [ -z "$silence_phones" ]; then
  cat "$lang/silence_phones.txt" | \
    utils/filter_scp.pl --exclude "$dir/garbage_phones.txt" > \
    "$dir/silence_phones.txt"
else
  cp "$silence_phones" "$dir/silence_phones.txt" || exit 1
fi

nj=$(cat "$lats_dir/num_jobs") || exit 1

# One job per lattice archive: push, phone-align, compute arc posteriors,
# then map the phone id in field 5 to its symbol. The escaped \| and '>'
# are passed through to $cmd rather than interpreted by this shell.
$cmd JOB=1:$nj "$dir/log/get_arc_info.JOB.log" \
  lattice-push "ark:gunzip -c $lats_dir/lat.JOB.gz |" ark:- \| \
  lattice-align-phones --replace-output-symbols=true "$srcdir/final.mdl" ark:- ark:- \| \
  lattice-arc-post --acoustic-scale=$acwt "$srcdir/final.mdl" ark:- - \| \
  utils/int2sym.pl -f 5 "$lang/phones.txt" '>' \
  "$dir/arc_info_sym.JOB.txt" || exit 1

# make $dir an absolute pathname.
dir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' "$dir" "${PWD}")

# Propagate the model's frame subsampling factor (default 1) into the
# targets directory. The file read here must be the same one that was
# tested for existence.
frame_subsampling_factor=1
if [ -f "$srcdir/frame_subsampling_factor" ]; then
  frame_subsampling_factor=$(cat "$srcdir/frame_subsampling_factor") || exit 1
  echo "$frame_subsampling_factor" > "$dir/frame_subsampling_factor"
fi

# Convert the maximum phone duration from seconds to a number of frames.
frame_shift=$(utils/data/get_frame_shift.sh "$data") || exit 1
max_phone_len=$(perl -e "print int($max_phone_duration / $frame_shift)")

# One job per arc-info file: turn arc posteriors into per-frame targets
# and store them as ark/scp pairs. The escaped \| is passed through to
# $cmd rather than interpreted by this shell.
$cmd JOB=1:$nj "$dir/log/get_targets.JOB.log" \
  steps/segmentation/internal/arc_info_to_targets.py \
  --silence-phones="$dir/silence_phones.txt" \
  --garbage-phones="$dir/garbage_phones.txt" \
  --max-phone-length=$max_phone_len \
  "$dir/arc_info_sym.JOB.txt" - \| \
  copy-feats ark,t:- \
  "ark,scp:$dir/targets.JOB.ark,$dir/targets.JOB.scp" || exit 1

# Merge the per-job scp files, in job order, into a single targets.scp.
for n in $(seq $nj); do
  cat "$dir/targets.$n.scp"
done > "$dir/targets.scp"

steps/segmentation/validate_targets_dir.sh "$dir" "$data" || exit 1

echo "$0: Done creating targets in $dir/targets.scp"