run_ivector_common.sh
#!/bin/bash
set -e -o pipefail
# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually
# be called by more scripts). It contains the common feature-preparation and iVector-related
# stages shared by those scripts. See those scripts for examples of usage.
stage=0
train_set=train_960_cleaned # you might set this to e.g. train_960
gmm=tri6b_cleaned # This specifies a GMM dir built on the same type of features you're training the system on;
# it should contain alignments for 'train_set'.
num_threads_ubm=16
num_processes=4
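# (num_threads_ubm is passed to train_diag_ubm.sh in stage 4; num_processes is
# passed to train_ivector_extractor.sh in stage 5.)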
nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it
# becomes exp/nnet3_cleaned or whatever.
. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
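# Example invocation with the default values shown above (the local/nnet3/ path
# is assumed; in practice this script is invoked by the run_tdnn.sh scripts):
#   local/nnet3/run_ivector_common.sh --stage 0 --train-set train_960_cleaned \
#     --gmm tri6b_cleaned --nnet3-affix "_cleaned"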
gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${train_set}_sp
for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
if [ ! -f $f ]; then
echo "$0: expected file $f to exist"
exit 1
fi
done
if [ $stage -le 1 ]; then
# Although the nnet will be trained on high-resolution data, we still have to
# perturb the normal (low-resolution) data to get the alignments. _sp stands for speed-perturbed.
echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp
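# (perturb_data_dir_speed_3way.sh creates copies of the data at speed factors
# 0.9, 1.0 and 1.1, roughly tripling the amount of training data.)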
echo "$0: making MFCC features for low-resolution speed-perturbed data"
steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 data/${train_set}_sp || exit 1;
steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1;
echo "$0: fixing input data-dir to remove nonexistent features, in case some "
echo ".. speed-perturbed segments were too short."
utils/fix_data_dir.sh data/${train_set}_sp
fi
if [ $stage -le 2 ]; then
if [ -f $ali_dir/ali.1.gz ]; then
echo "$0: alignments in $ali_dir appear to already exist. Please either remove them "
echo " ... or use a later --stage option."
exit 1
fi
echo "$0: aligning with the perturbed low-resolution data"
steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1
fi
if [ $stage -le 3 ]; then
# Create high-resolution MFCC features (with 40 cepstra instead of 13).
# This shows how you can split feature storage across multiple file systems; we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
echo "$0: creating high-resolution MFCC features"
mfccdir=data/${train_set}_sp_hires/data
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/mfcc/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
fi
for datadir in ${train_set}_sp test_clean test_other dev_clean dev_other; do
utils/copy_data_dir.sh data/$datadir data/${datadir}_hires
done
# Do volume perturbation on the training data prior to extracting hires
# features; this helps make the trained nnets more invariant to test-data volume.
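# (By default, perturb_data_dir_volume.sh scales each recording by a random
# factor, roughly in the range 0.125 to 2.)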
utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires
for datadir in ${train_set}_sp test_clean test_other dev_clean dev_other; do
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/${datadir}_hires || exit 1;
steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1;
utils/fix_data_dir.sh data/${datadir}_hires
done
# Now create a data subset. 60k utterances is around 200 hours, roughly 1/5th of the original (unperturbed) training data.
utils/subset_data_dir.sh data/${train_set}_sp_hires 60000 data/${train_set}_sp_hires_60k
fi
if [ $stage -le 4 ]; then
echo "$0: making a subset of data to train the diagonal UBM and the PCA transform."
# We'll use one hundredth of the data, since LibriSpeech is very large.
mkdir -p exp/nnet3${nnet3_affix}/diag_ubm
temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm
num_utts_total=$(wc -l <data/${train_set}_sp_hires/utt2spk)
num_utts=$((num_utts_total/100))
utils/data/subset_data_dir.sh data/${train_set}_sp_hires \
$num_utts ${temp_data_root}/${train_set}_sp_hires_subset
echo "$0: computing a PCA transform from the hires data."
steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
--max-utts 10000 --subsample 2 \
${temp_data_root}/${train_set}_sp_hires_subset \
exp/nnet3${nnet3_affix}/pca_transform
echo "$0: training the diagonal UBM."
# Use 512 Gaussians in the UBM.
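# The resulting diagonal-covariance UBM provides the Gaussian posteriors over
# which iVector statistics are accumulated by the extractor in the next stage.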
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \
--num-frames 700000 \
--num-threads $num_threads_ubm \
${temp_data_root}/${train_set}_sp_hires_subset 512 \
exp/nnet3${nnet3_affix}/pca_transform exp/nnet3${nnet3_affix}/diag_ubm
fi
if [ $stage -le 5 ]; then
# iVector extractors can in general be sensitive to the amount of data, but
# this one has a fairly small dim (defaults to 100), so we don't use all of the
# data; we use just the 60k subset (roughly 200 hours, about one fifth of the original data).
echo "$0: training the iVector extractor"
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 --num-processes $num_processes \
data/${train_set}_sp_hires_60k exp/nnet3${nnet3_affix}/diag_ubm exp/nnet3${nnet3_affix}/extractor || exit 1;
fi
if [ $stage -le 6 ]; then
echo "$0: extracting iVectors for training data"
ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
utils/create_split_dir.pl /export/b{09,10,11,12}/$USER/kaldi-data/ivectors/librispeech-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
fi
# We extract iVectors on the speed-perturbed training data, which is what we
# will train the system on. With --utts-per-spk-max 2, the script pairs the
# utterances into twos and treats each pair as one speaker; this gives more
# diversity in the iVectors. Note that these are extracted 'online'.
# Having a larger number of speakers is helpful for generalization, and for
# handling per-utterance decoding well (the iVector starts at zero).
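# ('online' here means each utterance's iVector is estimated incrementally,
# starting from zero, as frames arrive, matching what happens during online
# decoding; by default extract_ivectors_online.sh writes one iVector every 10 frames.)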
utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
data/${train_set}_sp_hires ${ivectordir}/${train_set}_sp_hires_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
${ivectordir}/${train_set}_sp_hires_max2 exp/nnet3${nnet3_affix}/extractor \
$ivectordir || exit 1;
fi
if [ $stage -le 7 ]; then
echo "$0: extracting iVectors for dev and test data"
for data in test_clean test_other dev_clean dev_other; do
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
data/${data}_hires exp/nnet3${nnet3_affix}/extractor \
exp/nnet3${nnet3_affix}/ivectors_${data}_hires || exit 1;
done
fi
exit 0;