decode.sh 5.98 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162


#!/bin/bash

# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti)
#           2019 Vimal Manohar 
# Apache 2.0.

# This script does 2-stage decoding where the first stage is used to get 
# reliable frames for i-vector extraction.

# Abort as soon as any unguarded command returns a non-zero status.
set -e

# general opts
# iter: optional model iteration; when non-empty it is forwarded as --iter to
# the decoding / weight-extraction scripts and appended to directory names.
iter=
# stage: first stage to run.  The gates below are: <=1 hires features,
# <=2 stage-1 (online) i-vectors, <=3 first-pass decoding, <=4 VAD weights,
# <=5 stage-2 i-vectors, <=6 second-pass decoding.
stage=0
# nj: number of parallel jobs passed as --nj to every step below.
nj=30
affix=  # optional affix appended to the decode and i-vector directory names

# ivector opts
max_count=75  # passed as --max-count to both extract_ivectors_online.sh and extract_ivectors.sh
# passed as --sub-speaker-frames to extract_ivectors.sh (second extraction only)
sub_speaker_frames=6000
# scale applied (copy-matrix --scale) to the stage-1 online i-vectors; a value
# of 1 disables the scaling step.
ivector_scale=0.75
# true: take frame weights from $weights_file or from extract_vad_weights.sh;
# false: hand the first-pass decode directory to extract_ivectors.sh instead.
get_weights_from_ctm=true
weights_file=   # use weights from this archive instead of generating them (the original note says it must be compressed; the generated default is a .gz file, i.e. gzip — not "gunzip")
silence_weight=0.00001   # apply this weight to silence frames during i-vector extraction
# directory holding the i-vector extractor ($ivector_dir/extractor) and
# receiving the extracted i-vectors.
ivector_dir=exp/nnet3

# decode opts
# extra options for the second decoding pass only (word-split on purpose).
pass2_decode_opts="--min-active 1000"
# passed as --lattice-beam to the second decoding pass only.
lattice_beam=8
extra_left_context=0 # change for (B)LSTM
extra_right_context=0 # change for BLSTM
frames_per_chunk=50 # change for (B)LSTM
acwt=0.1 # important to change this when using chain models
post_decode_acwt=1.0 # important to change this when using chain models
extra_left_context_initial=0
extra_right_context_final=0

# NOTE(review): score_opts is not referenced anywhere else in this script as
# shown, so it currently has no effect on scoring — confirm whether it should
# be forwarded to the second-pass decode.
score_opts="--min-lmwt 6 --max-lmwt 13"

# cmd.sh is expected to define $train_cmd and $decode_cmd, which are used
# below — TODO confirm against the recipe's cmd.sh.
. ./cmd.sh
# path.sh is optional; it is sourced only when present.
[ -f ./path.sh ] && . ./path.sh
# Kaldi option parser; presumably lets "--<name> <value>" on the command line
# override the defaults assigned above, which is why they must be set first.
. utils/parse_options.sh || exit 1;

# Exactly four positional arguments are required; otherwise print the usage
# text and stop.  The stage list mirrors the "$stage -le N" gates used in this
# script (gates 1 through 6; 0 and 1 both start from feature extraction).
if [ $# -ne 4 ]; then
  echo "Usage: $0 [options] <data-dir> <lang-dir> <graph-dir> <model-dir>"
  echo " Options:"
  echo "    --stage (0|1|2|3|4|5|6)   # start this script from part-way through:"
  echo "                              #   0,1: hires features   2: stage-1 i-vectors"
  echo "                              #   3: first-pass decode  4: VAD weights"
  echo "                              #   5: stage-2 i-vectors  6: second-pass decode"
  echo "e.g.:"
  echo "$0 data/dev data/lang exp/tri5a/graph_pp exp/nnet3/tdnn"
  exit 1;
fi

# Positional arguments.
# A single trailing slash on the data directory is dropped: the script builds
# sibling paths as "${data}_hires", which would otherwise become
# "data/dev/_hires" for an argument such as "data/dev/".
data=${1%/} # data directory
lang=$2 # data/lang
graph=$3 #exp/tri5a/graph_pp
dir=$4 # exp/nnet3/tdnn

# Directory-name affixes.
#   model_affix   : last path component of the model directory
#   ivector_affix : [_<affix>]_chain_<model>[_iter<iter>], used for i-vector dirs
#   affix         : [_<affix>][_iter<iter>], used for decode dirs (overwrites the
#                   user-supplied value, so ivector_affix must be built first)
model_affix=$(basename "$dir")
ivector_affix=${affix:+_$affix}_chain_${model_affix}${iter:+_iter$iter}
affix=${affix:+_${affix}}${iter:+_iter${iter}}

# Stage <= 1: build the "<data>_hires" copy of the data directory with
# high-resolution MFCCs and CMVN stats.  Skipped entirely when a non-empty
# feats.scp is already present there.
if [ $stage -le 1 ] && [ ! -s ${data}_hires/feats.scp ]; then
  hires_data=${data}_hires
  utils/copy_data_dir.sh $data $hires_data
  steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$train_cmd" $hires_data
  steps/compute_cmvn_stats.sh $hires_data
  utils/fix_data_dir.sh $hires_data
fi

# Name of the data set (last path component of the data directory); used in
# every i-vector and decode directory name below.
data_set=$(basename "$data")

# The scale affix is part of the stage-1 i-vector directory name that the
# first decoding pass reads, so it has to be derived even when the extraction
# itself is skipped (e.g. when restarting with --stage 3).  Previously it was
# only assigned inside the stage guard below.
# float comparisons are hard in bash, so bc does the comparison.
if [ "$(bc <<< "$ivector_scale != 1")" -eq 1 ]; then
  ivector_scale_affix=_scale$ivector_scale
else
  ivector_scale_affix=
fi

# Stage <= 2: extract online i-vectors for the first decoding pass and, when
# a scale other than 1 is requested, write a scaled copy next to them.
if [ $stage -le 2 ]; then
  echo "Extracting i-vectors, stage 1"
  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
    --max-count $max_count \
    ${data}_hires $ivector_dir/extractor \
    $ivector_dir/ivectors_${data_set}${ivector_affix}_stage1;

  if [ -n "$ivector_scale_affix" ]; then
    echo "$0: Scaling iVectors, stage 1"
    srcdir=$ivector_dir/ivectors_${data_set}${ivector_affix}_stage1
    outdir=$ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1
    mkdir -p $outdir
    # The pipe is escaped so that it is passed on to $train_cmd rather than
    # being interpreted by this shell.
    $train_cmd $outdir/log/scale_ivectors.log \
      copy-matrix --scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- \| \
      copy-feats --compress=true ark:-  ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp;
    cp $srcdir/ivector_period $outdir/ivector_period
  fi
fi

# Final decode directory; the first pass writes to "${decode_dir}_stage1".
decode_dir=$dir/decode_${data_set}${affix}

# Stage <= 3: first decoding pass (scoring skipped), using the stage-1
# (optionally scaled) online i-vectors.
if [ $stage -le 3 ]; then
  echo "Generating lattices, stage 1"
  first_pass_ivectors=$ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1
  # Expansions are left unquoted on purpose so that word splitting matches a
  # plain command line (e.g. "--iter <n>" becomes two words, or vanishes).
  first_pass_opts=(
    --nj $nj --cmd "$decode_cmd"
    --acwt $acwt --post-decode-acwt $post_decode_acwt
    --extra-left-context $extra_left_context
    --extra-right-context $extra_right_context
    --extra-left-context-initial $extra_left_context_initial
    --extra-right-context-final $extra_right_context_final
    --frames-per-chunk "$frames_per_chunk"
    --online-ivector-dir $first_pass_ivectors
    --skip-scoring true ${iter:+--iter $iter}
  )
  steps/nnet3/decode.sh "${first_pass_opts[@]}" \
    $graph ${data}_hires ${decode_dir}_stage1
fi

# Decide where the second i-vector extraction takes its frame weights from.
# This is resolved outside the stage guard so that restarting with --stage 5
# still passes a valid weights argument; previously the variable was only
# assigned when stage <= 4.
if $get_weights_from_ctm; then
  if [ -n "$weights_file" ]; then
    # user-supplied weights archive
    ivector_extractor_weights=$weights_file
  else
    # weights archive generated from the first-pass decode (see below)
    ivector_extractor_weights=${decode_dir}_stage1/weights${affix}.gz
  fi
else
  # get weights from best path decoding
  ivector_extractor_weights=${decode_dir}_stage1
fi

# Stage <= 4: generate the weights archive unless one was supplied or the
# decode directory itself is used as the weights source.
if [ $stage -le 4 ] && $get_weights_from_ctm; then
  if [ -n "$weights_file" ]; then
    echo "$0: Using provided vad weights file $weights_file"
  else
    echo "$0 : Generating vad weights file"
    local/extract_vad_weights.sh --silence-weight $silence_weight \
      --cmd "$decode_cmd" ${iter:+--iter $iter} \
      ${data}_hires $lang \
      ${decode_dir}_stage1 $ivector_extractor_weights
  fi
fi

# Stage <= 5: second i-vector extraction, weighted by the first-pass result.
# Per the original notes: i-vectors are estimated per speaker with silence
# down-weighted; --sub-speaker-frames is optional and splits each speaker
# into "sub-speakers" of at least that many frames, which can help when
# acoustic conditions drift within a speaker's data.
if [ $stage -le 5 ]; then
  echo "Extracting i-vectors, stage 2 with weights from $ivector_extractor_weights"
  second_pass_ivectors=$ivector_dir/ivectors_${data_set}${ivector_affix}
  # Unquoted on purpose: word splitting must match a plain command line.
  extract_opts=(
    --cmd "$train_cmd" --nj $nj
    --silence-weight $silence_weight
    --sub-speaker-frames $sub_speaker_frames --max-count $max_count
  )
  steps/online/nnet2/extract_ivectors.sh "${extract_opts[@]}" \
    ${data}_hires $lang $ivector_dir/extractor \
    $ivector_extractor_weights $second_pass_ivectors
fi

# Stage <= 6: second decoding pass (with scoring), using the stage-2
# i-vectors.  A failed decode is recorded as a ".error" marker file in the
# decode directory, and the script then stops with status 1.
if [ $stage -le 6 ]; then
  echo "Generating lattices, stage 2 with --acwt $acwt"
  error_marker=${decode_dir}/.error
  rm -f $error_marker
  # Unquoted on purpose: $pass2_decode_opts and ${iter:+...} must be split
  # into separate words exactly as on a plain command line.
  second_pass_opts=(
    --nj $nj --cmd "$decode_cmd" $pass2_decode_opts
    --acwt $acwt --post-decode-acwt $post_decode_acwt
    --extra-left-context $extra_left_context
    --extra-right-context $extra_right_context
    --extra-left-context-initial $extra_left_context_initial
    --extra-right-context-final $extra_right_context_final
    --frames-per-chunk "$frames_per_chunk"
    --skip-scoring false ${iter:+--iter $iter} --lattice-beam $lattice_beam
    --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix}
  )
  steps/nnet3/decode.sh "${second_pass_opts[@]}" \
    $graph ${data}_hires ${decode_dir} || touch $error_marker
  if [ -f $error_marker ]; then
    echo "$0: Error decoding"
    exit 1
  fi
fi
exit 0