#!/bin/bash

# Copyright Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) 2016.  Apache 2.0.
# This script generates the ctm files for dev_aspire, test_aspire and eval_aspire
# for scoring with the ASpIRE scoring server.
# It also provides the WER for dev_aspire data.
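#
# Rough outline of the stages below:
#   stage 2: run speech activity detection (SAD) to segment the whole recordings
#   stage 3: if a reference RTTM exists, convert the segmentation to RTTM and
#            score it with md-eval.pl
#   stage 4: turn the SAD output into a standard data directory with CMVN stats
#   stage 5: extract i-vectors for the segmented data
#   stage 6: generate lattices with steps/nnet3/decode.sh
#   stage 7: rescore the lattices with the const-ARPA LM
#   stage 8: produce the CTM (and, for dev_aspire, the WER) with local/score_aspire.sh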
  
set -e

# general opts
iter=
stage=0
sad_num_jobs=30
decode_num_jobs=30
affix=

# segmentation opts
sad_affix=
sad_opts="--extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3"
sad_graph_opts=
sad_priors_opts=
sad_stage=0

# ivector opts
max_count=75  # parameter for extract_ivectors.sh
sub_speaker_frames=6000

# decode opts
decode_opts="--min-active 1000"
lattice_beam=8
extra_left_context=0 # change for (B)LSTM
extra_right_context=0 # change for BLSTM
frames_per_chunk=50 # change for (B)LSTM
acwt=0.1 # important to change this when using chain models
post_decode_acwt=1.0 # important to change this when using chain models
extra_left_context_initial=0
extra_right_context_final=0
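# Note: the defaults above assume a non-chain, non-recurrent nnet3 model.  In the
# standard Kaldi chain recipes the acoustic scales are changed to
#   acwt=1.0 post_decode_acwt=10.0
# and for (B)LSTM models the extra-context and frames-per-chunk values should
# match the network's training configuration.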
  
score_opts="--min-lmwt 6 --max-lmwt 13"

. ./cmd.sh
[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh || exit 1;
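# parse_options.sh (sourced above) turns command-line flags of the form
# --foo-bar <value> into assignments to the matching shell variables declared
# above (foo_bar=<value>), e.g. "--sad-stage 1 --acwt 1.0".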
  
if [ $# -ne 6 ]; then
  echo "Usage: $0 [options] <data-dir> <sad-nnet-dir> <sad-work-dir> <lang-dir> <graph-dir> <model-dir>"
  echo " Options:"
  echo "    --stage <n>   # start the script from part-way through (stages 2 to 8 are used below)."
  echo "e.g.:"
  echo "$0 dev_aspire <sad-nnet-dir> <sad-work-dir> data/lang exp/tri5a/graph_pp exp/nnet3/tdnn"
  exit 1;
fi
  
data_set=$1 # select from {dev_aspire, test_aspire, eval_aspire}*
sad_nnet_dir=$2
sad_work_dir=$3
lang=$4 # data/lang
graph=$5 # exp/tri5a/graph_pp
dir=$6 # exp/nnet3/tdnn

model_affix=$(basename $dir)
ivector_root_dir=exp/nnet3
affix=${affix:+_${affix}}${iter:+_iter${iter}}
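# The ${affix:+_${affix}} idiom above expands to "_${affix}" when affix is
# non-empty and to nothing otherwise (likewise for iter and, below, sad_affix).
# E.g. with the hypothetical values affix=v7 and iter=300 this yields "_v7_iter300".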
  
  if [[ "$data_set" =~ "test_aspire" ]]; then
    out_file=single_dev_test${affix}_$model_affix.ctm
    act_data_set=test_aspire
  elif [[ "$data_set" =~ "eval_aspire" ]]; then
    out_file=single_eval${affix}_$model_affix.ctm
    act_data_set=eval_aspire
  elif [[ "$data_set" =~  "dev_aspire" ]]; then
    # we will just decode the directory without oracle segments file
    # as we would like to operate in the actual evaluation condition
    out_file=single_dev${affix}_${model_affix}.ctm
    act_data_set=dev_aspire
  else
    echo "$0: Unknown data-set $data_set"
    exit 1
  fi
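# Example of the resulting name (hypothetical values): with data_set=dev_aspire,
# empty affix/iter and dir=exp/nnet3/tdnn, out_file is single_dev_tdnn.ctm.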
  
if [ $stage -le 2 ]; then
  steps/segmentation/detect_speech_activity.sh \
    --nj $sad_num_jobs --stage $sad_stage \
    --affix "$sad_affix" --graph-opts "$sad_graph_opts" \
    --transform-probs-opts "$sad_priors_opts" $sad_opts \
    data/$data_set $sad_nnet_dir mfcc_hires $sad_work_dir \
    $sad_work_dir/${data_set}${sad_affix:+_$sad_affix} || exit 1
fi

segmented_data_set=${data_set}${sad_affix:+_$sad_affix}
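# detect_speech_activity.sh writes its segmentation into a new data directory
# with a "_seg" suffix, i.e. $sad_work_dir/${segmented_data_set}_seg; the stages
# below read segments and utt2spk from there.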
  
if [ $stage -le 3 ]; then
  if [ -f data/$act_data_set/ref.rttm ]; then
    if [ ! -f $sad_work_dir/${segmented_data_set}_seg/reco2file_and_channel ]; then
      awk '{print $2" "$2" "1}' $sad_work_dir/${segmented_data_set}_seg/segments | \
        sort -u > $sad_work_dir/${segmented_data_set}_seg/reco2file_and_channel
    fi
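    # Formats of the Kaldi files involved here (one entry per line):
    #   segments:              <utterance-id> <recording-id> <start-time> <end-time>
    #   reco2file_and_channel: <recording-id> <filename> <channel>
    # The awk command above therefore creates one entry per recording, reusing
    # the recording-id as the filename and fixing the channel to 1.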
  
    steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
      --reco2file-and-channel=${sad_work_dir}/${segmented_data_set}_seg/reco2file_and_channel \
      ${sad_work_dir}/${segmented_data_set}_seg/{utt2spk,segments,sys.rttm} || exit 1

    export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin
    md-eval.pl -c 0.25 -r data/$act_data_set/ref.rttm \
      -s ${sad_work_dir}/${segmented_data_set}_seg/sys.rttm > \
      ${sad_work_dir}/${segmented_data_set}_seg/md_eval.log
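    # md-eval.pl compares the system RTTM (-s) against the reference RTTM (-r)
    # using a 0.25-second collar (-c) around reference segment boundaries; the
    # summary is written to md_eval.log rather than to stdout.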
  fi
fi

if [ $stage -le 4 ]; then
  utils/copy_data_dir.sh $sad_work_dir/${segmented_data_set}_seg \
    data/${segmented_data_set}_seg_hires
  steps/compute_cmvn_stats.sh data/${segmented_data_set}_seg_hires
  utils/fix_data_dir.sh data/${segmented_data_set}_seg_hires
fi
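# fix_data_dir.sh re-sorts and cross-validates the files in
# data/${segmented_data_set}_seg_hires so that the i-vector extraction and
# decoding stages below see a consistent, standard data directory.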
  
if [ $stage -le 5 ]; then
  echo "Extracting i-vectors"
  # this does offline i-vector extraction (all of a speaker's data is available
  # at once, as opposed to the online extraction used during online decoding).
  # the --sub-speaker-frames option is optional; if provided, it will divide each
  # speaker up into "sub-speakers" of at least that many frames... can be useful
  # if acoustic conditions drift over time within the speaker's data.
  steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $decode_num_jobs \
    --sub-speaker-frames $sub_speaker_frames --max-count $max_count \
    data/${segmented_data_set}_seg_hires $lang $ivector_root_dir/extractor \
    $ivector_root_dir/ivectors_${segmented_data_set}_seg
fi
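# The i-vectors written to $ivector_root_dir/ivectors_${segmented_data_set}_seg
# are passed to the decoder below via --online-ivector-dir.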
  
decode_dir=$dir/decode_${segmented_data_set}_seg${affix}_pp
if [ $stage -le 6 ]; then
  echo "Generating lattices"
  rm -f ${decode_dir}_tg/.error
  steps/nnet3/decode.sh --nj $decode_num_jobs --cmd "$decode_cmd" --config conf/decode.config \
    --acwt $acwt --post-decode-acwt $post_decode_acwt $decode_opts \
    --extra-left-context $extra_left_context \
    --extra-right-context $extra_right_context \
    --extra-left-context-initial $extra_left_context_initial \
    --extra-right-context-final $extra_right_context_final \
    --frames-per-chunk "$frames_per_chunk" \
    --skip-scoring true ${iter:+--iter $iter} --lattice-beam $lattice_beam \
    --online-ivector-dir $ivector_root_dir/ivectors_${segmented_data_set}_seg \
    $graph data/${segmented_data_set}_seg_hires ${decode_dir}_tg || \
    { echo "$0: Error decoding" && exit 1; }
fi
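# The rescoring stage below replaces the LM scores in the first-pass ("_tg")
# lattices with those of the const-ARPA LM: ${lang}_pp_test{,_fg} brace-expands
# to "${lang}_pp_test ${lang}_pp_test_fg" (old and new LM directories) and
# ${decode_dir}_{tg,fg} to the input and output decode directories expected by
# steps/lmrescore_const_arpa.sh.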
  
if [ $stage -le 7 ]; then
  echo "Rescoring lattices"
  steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
    --skip-scoring true \
    ${lang}_pp_test{,_fg} data/${segmented_data_set}_seg_hires \
    ${decode_dir}_{tg,fg};
fi

decode_dir=${decode_dir}_fg
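# The final stage converts the rescored lattices into a CTM: score_aspire.sh
# uses MBR decoding (--decode-mbr true) and, with --tune-hyper true, searches
# over the LM weights in $score_opts and the listed word insertion penalties,
# writing the result to $out_file (and, for dev_aspire, reporting the WER, as
# noted in the header comment).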
  
if [ $stage -le 8 ]; then
  local/score_aspire.sh --cmd "$decode_cmd" \
    $score_opts \
    --word-ins-penalties "0.0,0.25,0.5,0.75,1.0" \
    --ctm-beam 6 \
    ${iter:+--iter $iter} \
    --decode-mbr true \
    --tune-hyper true \
    $lang $decode_dir $act_data_set ${segmented_data_set}_seg $out_file
fi