Blame view

egs/wsj/s5/steps/segmentation/lats_to_targets.sh 4.57 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
  #!/bin/bash
  
  # Copyright 2017  Vimal Manohar
  # Apache 2.0
  
  # This script converts lattices into targets for training a neural network
  # for speech activity detection. The targets form a matrix of size
  # (num-frames-subsampled x 3)
  # with each row representing probabilities for speech, silence and 
  # garbage classes for the corresponding frame (after subsampling). The 
  # probability values are lattice posteriors for the 3 classes and are
  # obtained by summing up phone arc posteriors for the phones
  # corresponding to each class.
  # The mapping from phones to speech / silence / garbage classes
  # is defined by the options --silence-phones and --garbage-phones.
  # Also "speech" phones longer than --max-phone-duration seconds are 
  # treated as "garbage".
  
  # A pipeline fails when any of its stages fails, not only the last one.
  set -o pipefail

  # Defaults for the command-line options. utils/parse_options.sh (sourced
  # below) is expected to override them from --<option-name> arguments.
  cmd=run.pl                # job runner used to launch the per-job commands
  acwt=0.1                  # passed to lattice-arc-post as --acoustic-scale
  max_phone_duration=0.5    # seconds; turned into a frame count further down
  garbage_phones=""         # optional phone-list file for the "garbage" class
  silence_phones=""         # optional phone-list file for the "silence" class

  # Source path.sh only when the current directory has one.
  if [ -f ./path.sh ]; then
    . ./path.sh
  fi
  . utils/parse_options.sh
  
  # Show the help text and exit with status 1 unless exactly four positional
  # arguments (<data-dir> <lang> <lattice-dir> <targets-dir>) remain.
  if [ $# -ne 4 ]; then
    # Each help line is a single-quoted printf argument, so the text is printed
    # verbatim: the trailing backslashes of the example command are kept (an
    # unquoted here-document would drop them and join the lines), and there is
    # no terminator line that has to sit at column 0.
    printf '%s\n' \
      '  This script converts lattices into targets for training neural network' \
      '  for speech activity detection. The targets is a matrix of size' \
      '  (num-frames-subsampled x 3)' \
      '  with each row representing probabilities for speech, silence and' \
      '  garbage classes for the corresponding frame (after subsampling). The' \
      '  probability values are lattice posteriors for the 3 classes and are' \
      '  obtained by summing up phone arc posteriors for the phones' \
      '  corresponding to each class.' \
      '  The mapping from phones to speech / silence / garbage classes' \
      '  is defined by the options --silence-phones and --garbage-phones.' \
      '  Also "speech" phones longer than --max-phone-duration seconds are' \
      '  treated as "garbage".' \
      '' \
      '  Usage: steps/segmentation/lats_to_targets.sh <data-dir> <lang> <lattice-dir> <targets-dir>' \
      '  e.g.: steps/segmentation/lats_to_targets.sh \' \
      '  --silence-phones exp/segmentation1a/silence_phones.txt \' \
      '  --garbage-phones exp/segmentation1a/garbage_phones.txt \' \
      '  --max-phone-duration 0.5 \' \
      '  data/train_split10s data/lang \' \
      '  exp/segmentation1a/tri3b_train_split10s_lats \' \
      '  exp/segmentation1a/tri3b_train_split10s_targets' \
      '' \
      '  note:' \
      '  silence_phones.txt and garbage_phones.txt must list phones, one per line.' \
      '  garbage_phones.txt can contain phones corresponding to ambiguous items like' \
      '  OOV, laugh and spoken noise that you want to map to "garbage class".' \
      '  silence_phones.txt might just contain the phones from' \
      '  data/lang/phones/silence_phones.txt other than the garbage phones. These' \
      '  are mapped to the "silence" class.'
    exit 1
  fi
  
  # Positional arguments: <data-dir> <lang> <lattice-dir> <targets-dir>.
  data=$1
  lang=$2
  lats_dir=$3
  dir=$4

  # Use the lattice directory as the model directory when it holds final.mdl;
  # otherwise fall back to its parent directory.
  srcdir=$lats_dir/..
  if [ -f $lats_dir/final.mdl ]; then
    srcdir=$lats_dir
  fi

  # Stop at the first required input file that is missing.
  for required in $data/utt2spk $lats_dir/lat.1.gz $srcdir/final.mdl; do
    [ -f $required ] && continue
    echo "$0: Could not find file $required"
    exit 1
  done

  mkdir -p $dir
  
  # Garbage phones: copy the list given with --garbage-phones, or default to
  # the phone printed by get_oov_phone.py, mapped to its symbol via phones.txt.
  if [ -z "$garbage_phones" ]; then
    oov_phone=$(steps/segmentation/internal/get_oov_phone.py "$lang") || exit 1
    echo $oov_phone | utils/int2sym.pl "$lang/phones.txt" > "$dir/garbage_phones.txt" || exit 1
  else
    cp "$garbage_phones" "$dir/garbage_phones.txt" || exit 1
  fi

  # Silence phones: copy the list given with --silence-phones, or default to
  # the lang directory's silence phones minus the garbage phones.
  # Failures abort the script here, as in the garbage-phones branch above;
  # otherwise a missing input would silently leave an empty silence list.
  if [ -z "$silence_phones" ]; then
    # NOTE(review): the default is read from $lang/silence_phones.txt; confirm
    # that this is where the lang directory keeps its silence phone list.
    utils/filter_scp.pl --exclude "$dir/garbage_phones.txt" \
      < "$lang/silence_phones.txt" > "$dir/silence_phones.txt" || exit 1
  else
    cp "$silence_phones" "$dir/silence_phones.txt" || exit 1
  fi
  
  # The number of jobs is read from the lattice directory; lattices are
  # addressed per job as lat.JOB.gz.
  if ! nj=$(cat $lats_dir/num_jobs); then
    exit 1
  fi

  # Everything after the log path is handed to $cmd as arguments. The pipe
  # symbols and the output redirection are quoted so that this shell passes
  # them through instead of interpreting them. Field 5 of the lattice-arc-post
  # output is mapped from integer ids to symbols with phones.txt.
  lat_rspecifier="ark:gunzip -c $lats_dir/lat.JOB.gz |"
  arc_info_log=$dir/log/get_arc_info.JOB.log
  arc_info_out=$dir/arc_info_sym.JOB.txt
  if ! $cmd JOB=1:$nj $arc_info_log \
      lattice-push "$lat_rspecifier" ark:- '|' \
      lattice-align-phones --replace-output-symbols=true $srcdir/final.mdl ark:- ark:- '|' \
      lattice-arc-post --acoustic-scale=$acwt $srcdir/final.mdl ark:- - '|' \
      utils/int2sym.pl -f 5 $lang/phones.txt '>' $arc_info_out; then
    exit 1
  fi
  
  # Make $dir an absolute pathname (a relative path is prefixed with $PWD).
  dir=$(perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' "$dir" "$PWD")

  # Sets the global frame_subsampling_factor from
  # <model-dir>/frame_subsampling_factor (1 when that file is absent) and, when
  # the file exists, records the same value in
  # <targets-dir>/frame_subsampling_factor.
  # Arguments: $1 - model directory, $2 - targets directory
  # Returns:   non-zero if the file exists but reading or writing fails
  copy_frame_subsampling_factor() {
    local model_dir=$1 targets_dir=$2
    frame_subsampling_factor=1
    if [ -f "$model_dir/frame_subsampling_factor" ]; then
      # Read the very file whose existence was just tested.
      frame_subsampling_factor=$(cat "$model_dir/frame_subsampling_factor") || return 1
      echo "$frame_subsampling_factor" > "$targets_dir/frame_subsampling_factor" || return 1
    fi
  }

  copy_frame_subsampling_factor "$srcdir" "$dir" || exit 1
  
  if ! frame_shift=$(utils/data/get_frame_shift.sh $data); then
    exit 1
  fi

  # Express --max-phone-duration as a number of frames; Perl's int() truncates.
  # NOTE(review): assumes get_frame_shift.sh reports the frame shift in the
  # same unit (seconds) as --max-phone-duration -- confirm.
  frames_expr="print int($max_phone_duration / $frame_shift)"
  max_phone_len=$(perl -e "$frames_expr")

  # Per job: convert the symbolic arc info into text-format targets and let
  # copy-feats store them as an ark/scp pair. The pipe symbol is quoted so
  # that it reaches $cmd as an argument.
  targets_log=$dir/log/get_targets.JOB.log
  targets_wspecifier=ark,scp:$dir/targets.JOB.ark,$dir/targets.JOB.scp
  if ! $cmd JOB=1:$nj $targets_log \
      steps/segmentation/internal/arc_info_to_targets.py \
        --silence-phones=$dir/silence_phones.txt \
        --garbage-phones=$dir/garbage_phones.txt \
        --max-phone-length=$max_phone_len \
        $dir/arc_info_sym.JOB.txt - '|' \
      copy-feats ark,t:- $targets_wspecifier; then
    exit 1
  fi
  
  # Concatenate the per-job scp files, in job order, into a single targets.scp.
  seq $nj | while read -r job; do
    cat $dir/targets.$job.scp
  done > $dir/targets.scp

  if ! steps/segmentation/validate_targets_dir.sh $dir $data; then
    exit 1
  fi

  echo "$0: Done creating targets in $dir/targets.scp"