run_recog.sh
5.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/bin/bash
#
# Based mostly on the TED-LIUM and Switchboard recipe
#
# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal)
# Apache 2.0
#
# This is a subset of run.sh to only perform recognition experiments with evaluation data
# Begin configuration section.
# NOTE(review): decode_nj is not referenced by any stage visible in this
# script (the stages below hard-code their own --nj values).
decode_nj=20
# Every stage below is guarded by a "$stage -le N" test, so a larger value
# skips the earlier stages.
stage=0
# Name of the enhancement front end. To plug in a new enhancement method,
# change this variable and adapt stage 4.
enhancement=beamformit
# End configuration section

# Helper files are sourced only after the defaults above have been assigned.
source ./utils/parse_options.sh
source ./cmd.sh
source ./path.sh

# From here on, abort on the first failing command.
set -e

# Root of the CHiME-5 corpus -- please adjust this path to your installation.
chime5_corpus=/export/corpora4/CHiME5
json_dir=$chime5_corpus/transcriptions
audio_dir=$chime5_corpus/audio

# Training set name (used to locate the model directories) and the list of
# test sets handled by this script.
train_set=train_worn_u100k
test_sets="eval_${enhancement}_ref"

# The recipe additionally relies on the phonetisaurus g2p, srilm and
# beamformit; stop right away when the tool check fails.
if ! ./local/check_tools.sh; then
  exit 1
fi
if [ "$stage" -le 4 ]; then
  # Stage 4: beamforming using reference arrays.
  # Root directory that receives the enhanced WAV files.
  enhandir=enhan

  # Run the enhancement once per array (u01 .. u06) of each data set.
  for dset in eval; do
    for array_id in u0{1..6}; do
      enh_out=${enhandir}/${dset}_${enhancement}_${array_id}
      local/run_beamformit.sh --cmd "$train_cmd" \
        ${audio_dir}/${dset} \
        ${enh_out} \
        ${array_id}
    done
  done

  # Build the data directory from the enhanced audio of all arrays.
  # The wildcard sits inside double quotes, so the shell hands the pattern to
  # prepare_data.sh unexpanded.
  for dset in eval; do
    data_out=data/${dset}_${enhancement}_ref
    local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \
      ${json_dir}/${dset} ${data_out}
  done
fi
if [ "$stage" -le 6 ]; then
  # Stage 6: speaker-ID fix (thanks to Dr. Naoyuki Kanda).
  # The array ID is appended to the speaker ID so that information from other
  # arrays is not used, which is needed to meet the regulations.
  #
  # utt2spk before the fix (data/eval_beamformit_ref_nosplit/utt2spk):
  #   P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01
  #   P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01
  # utt2spk after the fix (data/eval_beamformit_ref_nosplit_fix/utt2spk):
  #   P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02
  #   P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02
  for dset in ${test_sets}; do
    nosplit=data/${dset}_nosplit
    nosplit_fix=data/${dset}_nosplit_fix

    utils/copy_data_dir.sh data/${dset} ${nosplit}
    mkdir -p ${nosplit_fix}
    cp ${nosplit}/segments ${nosplit}/text ${nosplit}/wav.scp ${nosplit_fix}/

    # Splitting each line on "_" makes the third field the array ID (e.g.
    # U02); it is appended to the line, i.e. to the speaker ID at its end.
    awk -F_ '{ print $0 "_" $3 }' ${nosplit}/utt2spk > ${nosplit_fix}/utt2spk
    utils/utt2spk_to_spk2utt.pl ${nosplit_fix}/utt2spk > ${nosplit_fix}/spk2utt
  done

  # Split speakers up into 3-minute (180 s) chunks. This doesn't hurt
  # adaptation, and lets us use more jobs for decoding etc.
  for dset in ${test_sets}; do
    utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 \
      data/${dset}_nosplit_fix data/${dset}
  done
fi
if [ "$stage" -le 7 ]; then
  # Stage 7: MFCC features and CMVN statistics for every test set.
  # mfccdir should live on a reasonably large disk, since the MFCC features
  # are stored there.
  mfccdir=mfcc

  for dset in ${test_sets}; do
    mfcc_exp=exp/make_mfcc/${dset}
    steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \
      data/${dset} ${mfcc_exp} $mfccdir
    steps/compute_cmvn_stats.sh data/${dset} ${mfcc_exp} $mfccdir
    utils/fix_data_dir.sh data/${dset}
  done
fi
if [ "$stage" -le 17 ]; then
  # Stage 17: features using conf/mfcc_hires.conf plus online i-vectors for
  # the test sets.
  # nnet3_affix selects the exp/nnet3* directory; it is also read by the
  # decoding stage further down.
  nnet3_affix=_${train_set}_cleaned
  nnet3_dir=exp/nnet3${nnet3_affix}

  # 1) Clone each data directory to a "_hires" copy.
  for dset in ${test_sets}; do
    utils/copy_data_dir.sh data/${dset} data/${dset}_hires
  done

  # 2) Extract features with the hires config and compute CMVN statistics.
  for dset in ${test_sets}; do
    hires_dir=data/${dset}_hires
    steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \
      --cmd "$train_cmd" ${hires_dir} || exit 1
    steps/compute_cmvn_stats.sh ${hires_dir} || exit 1
    utils/fix_data_dir.sh ${hires_dir} || exit 1
  done

  # 3) Extract i-vectors using the extractor stored under the nnet3 directory.
  for dset in $test_sets; do
    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
      data/${dset}_hires ${nnet3_dir}/extractor \
      ${nnet3_dir}/ivectors_${dset}_hires
  done
fi
if [ "$stage" -le 18 ]; then
  # Stage 18: build the decoding graph and decode every test set with the
  # chain TDNN model, one background job per test set.

  # Assigned here (same value as in stage 17) so that the directories below
  # are still correct when the script is started at stage 18 and the stage-17
  # body, which is the only other place that sets it, has been skipped.
  nnet3_affix=_${train_set}_cleaned

  # First the options that are passed through to run_ivector_common.sh
  # (some of which are also used in this script directly).
  lm_suffix=

  # The rest are configs specific to this script. Most of the parameters
  # are just hardcoded at this level, in the commands below.
  affix=1a # affix for the TDNN directory name
  tree_affix=
  tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
  dir=exp/chain${nnet3_affix}/tdnn${affix}_sp

  # training options
  # training chunk-options
  chunk_width=140,100,160
  # we don't need extra left/right context for TDNN systems.
  chunk_left_context=0
  chunk_right_context=0

  utils/mkgraph.sh \
    --self-loop-scale 1.0 "data/lang${lm_suffix}/" \
    "$tree_dir" "$tree_dir/graph${lm_suffix}" || exit 1

  # Decoding uses the first entry of the comma-separated chunk_width list.
  frames_per_chunk=${chunk_width%%,*}

  # Clear a stale failure marker from a previous run; a missing file is fine.
  rm -f -- "$dir/.error" 2>/dev/null || true

  for data in $test_sets; do
    # Each decode runs in a background subshell. A failure cannot abort the
    # parent directly, so it is recorded via the marker file and checked
    # after "wait".
    (
      steps/nnet3/decode.sh \
        --acwt 1.0 --post-decode-acwt 10.0 \
        --extra-left-context "$chunk_left_context" \
        --extra-right-context "$chunk_right_context" \
        --extra-left-context-initial 0 \
        --extra-right-context-final 0 \
        --frames-per-chunk "$frames_per_chunk" \
        --nj 8 --cmd "$decode_cmd" --num-threads 4 \
        --online-ivector-dir "exp/nnet3${nnet3_affix}/ivectors_${data}_hires" \
        "$tree_dir/graph${lm_suffix}" "data/${data}_hires" \
        "${dir}/decode${lm_suffix}_${data}" || exit 1
    ) || touch "$dir/.error" &
  done
  wait

  if [ -f "$dir/.error" ]; then
    echo "$0: there was a problem while decoding"
    exit 1
  fi
fi
if [ "$stage" -le 20 ]; then
  # Stage 20: final scoring, producing the official challenge result.
  # Both the dev and the eval decode directories must be given, so that the
  # search parameters (insertion penalty and language model weight) are tuned
  # on the dev set.
  # NOTE(review): this script only decodes the eval set; the dev decode
  # directory is presumably produced by run.sh -- confirm it exists first.
  decode_root=exp/chain_${train_set}_cleaned/tdnn1a_sp
  local/score_for_submit.sh \
    --dev ${decode_root}/decode_dev_${enhancement}_ref \
    --eval ${decode_root}/decode_eval_${enhancement}_ref
fi