run_ivector_common.sh 5.12 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185


#!/bin/bash

# Strict mode: exit on a failing command (-e), on expansion of an unset
# variable (-u), and when any command in a pipeline fails (pipefail).
set -euo pipefail

# This script is called from local/nnet3/run_tdnn.sh and
# local/chain/run_tdnn.sh (and may eventually be called by more
# scripts).  It contains the common feature preparation and
# iVector-related parts of the script.  See those scripts for examples
# of usage.

# Default settings.  They are assigned before utils/parse_options.sh is
# sourced below -- presumably so that command-line options can override
# them (confirm against utils/parse_options.sh).
#   stage      a step guarded by "[ $stage -le N ]" runs only if stage <= N
#   train_set  training data directory name, i.e. data/${train_set}
#   test_sets  space-separated list of test data directory names under data/
#   gmm        name of the GMM system directory under exp/
stage=0
train_set=train
test_sets="devtest test"
gmm=tri3b

# Suffix appended to "exp/nnet3" in every output path below; empty by default.
nnet3_affix=

# cmd.sh presumably defines $train_cmd, which the stages below pass as --cmd;
# path.sh presumably sets up the tool paths -- confirm in those files.
. ./cmd.sh
. ./path.sh
. utils/parse_options.sh

# gmm_dir must contain final.mdl (checked right below); ali_dir is handed to
# steps/align_fmllr.sh as its last argument in stage 2.
gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${train_set}_sp

# Verify that the required input files exist before any stage runs; print
# the name of the first missing file and exit with status 1 otherwise.
# The expansions are quoted so that a path containing whitespace or glob
# characters is tested as one literal file name.
for f in "data/${train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

# Stage 1: create the speed-perturbed copy of the training data
# (data/${train_set}_sp) and compute low-resolution MFCC features and CMVN
# stats for it.
if [ "$stage" -le 1 ]; then
  # perturb data to get alignments
  # nnet will be trained by high resolution data
  # _sp stands for speed-perturbed
  echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)"
  utils/data/perturb_data_dir_speed_3way.sh \
    "data/${train_set}" \
    "data/${train_set}_sp"

  echo "$0: making mfcc features for low-resolution speed-perturbed data"
  steps/make_mfcc.sh \
    --cmd "$train_cmd" \
    --nj 10 \
    "data/${train_set}_sp"
  steps/compute_cmvn_stats.sh "data/${train_set}_sp"
  utils/fix_data_dir.sh "data/${train_set}_sp"
fi

# Stage 2: run steps/align_fmllr.sh on the speed-perturbed low-resolution
# data with the GMM in $gmm_dir; $ali_dir is passed as the last argument.
if [ "$stage" -le 2 ]; then
  echo "$0: aligning with the perturbed low-resolution data"
  steps/align_fmllr.sh \
    --nj 20 \
    --cmd "$train_cmd" \
    "data/${train_set}_sp" \
    data/lang \
    "$gmm_dir" \
    "$ali_dir"
fi

# Stage 3: make "_hires" copies of the speed-perturbed training data and of
# every test set, volume-perturb the training copy, then compute MFCCs with
# conf/mfcc_hires.conf plus CMVN stats for each copy.
if [ "$stage" -le 3 ]; then
  # Create high-resolution MFCC features (with 40 cepstra instead of 13).
  echo "$0: creating high-resolution MFCC features"

  # ${test_sets} is intentionally left unquoted: it holds a space-separated
  # list that has to be split into one word per data set.
  for datadir in "${train_set}_sp" ${test_sets}; do
    utils/copy_data_dir.sh \
      "data/${datadir}" \
      "data/${datadir}_hires"
  done

  # do volume-perturbation on the training data prior to extracting hires
  # features; this helps make trained nnets more invariant to test data volume.
  utils/data/perturb_data_dir_volume.sh "data/${train_set}_sp_hires"

  for datadir in "${train_set}_sp" ${test_sets}; do
    steps/make_mfcc.sh \
      --nj 10 \
      --mfcc-config conf/mfcc_hires.conf \
      --cmd "$train_cmd" \
      "data/${datadir}_hires"
    steps/compute_cmvn_stats.sh "data/${datadir}_hires"
    utils/fix_data_dir.sh "data/${datadir}_hires"
  done
fi

# Stage 4: take a subset of the hires training data, compute a PCA transform
# from it, and train the diagonal UBM on it.
if [ "$stage" -le 4 ]; then
  echo "$0: computing a subset of data to train the diagonal UBM."
  # We'll use about a quarter of the data.
  temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm
  mkdir -p "$temp_data_root"

  num_utts_total=$(wc -l <"data/${train_set}_sp_hires/utt2spk")
  # Integer division; $(( )) replaces the deprecated $[ ] arithmetic form.
  num_utts=$(( num_utts_total / 4 ))
  utils/data/subset_data_dir.sh \
    "data/${train_set}_sp_hires" \
    "$num_utts" \
    "${temp_data_root}/${train_set}_sp_hires_subset"

  echo "$0: computing a PCA transform from the hires data."
  steps/online/nnet2/get_pca_transform.sh \
    --cmd "$train_cmd" \
    --splice-opts "--left-context=3 --right-context=3" \
    --max-utts 10000 \
    --subsample 2 \
    "${temp_data_root}/${train_set}_sp_hires_subset" \
    "exp/nnet3${nnet3_affix}/pca_transform"

  echo "$0: training the diagonal UBM."
  # Use 512 Gaussians in the UBM.
  steps/online/nnet2/train_diag_ubm.sh \
    --cmd "$train_cmd" \
    --nj 20 \
    --num-frames 700000 \
    --num-threads 8 \
    "${temp_data_root}/${train_set}_sp_hires_subset" \
    512 \
    "exp/nnet3${nnet3_affix}/pca_transform" \
    "exp/nnet3${nnet3_affix}/diag_ubm"
fi

# Stage 5: train the iVector extractor on the hires speed-perturbed training
# data, using the diagonal UBM from exp/nnet3${nnet3_affix}/diag_ubm.
if [ "$stage" -le 5 ]; then
  # Train the iVector extractor.
  # Use all the speed-perturbed data.
  # iVector extractors can be sensitive to the amount of data.
  # The script defaults to an iVector dimension of 100.
  echo "$0: training the iVector extractor"
  steps/online/nnet2/train_ivector_extractor.sh \
    --cmd "$train_cmd" \
    --nj 10 \
    "data/${train_set}_sp_hires" \
    "exp/nnet3${nnet3_affix}/diag_ubm" \
    "exp/nnet3${nnet3_affix}/extractor"
fi

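# Extract iVectors on the speed-perturbed training data.
# NOTE(review): this comment originally also mentioned combining short
# segments and training the system on them; no such step is visible in this
# script -- confirm whether that still applies.
# With --utts-per-spk-max 2, the script pairs the utterances into twos and
# treats each pair as one speaker, which gives more diversity in the iVectors.
# Note that the iVectors are extracted 'online'.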

# Note: the iVectors are extracted from the 'max2' data, yet 'max2' is not
# encoded in the name of the ivectordir.  This is because the iVectors are
# still valid for the non-'max2' data: the utterance list is the same.

# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).

# Stage 6: write a copy of the hires training data with at most two
# utterances per speaker (the "_max2" directory, placed inside $ivectordir)
# and extract online iVectors from that copy into $ivectordir.
if [ "$stage" -le 6 ]; then
  ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
  temp_data_root=${ivectordir}
  utils/data/modify_speaker_info.sh \
    --utts-per-spk-max 2 \
    "data/${train_set}_sp_hires" \
    "${temp_data_root}/${train_set}_sp_hires_max2"

  steps/online/nnet2/extract_ivectors_online.sh \
    --cmd "$train_cmd" \
    --nj 20 \
    "${temp_data_root}/${train_set}_sp_hires_max2" \
    "exp/nnet3${nnet3_affix}/extractor" \
    "$ivectordir"
fi

# Also extract iVectors for test data.
# No need for speed perturbation (sp).

# Stage 7: extract online iVectors for the hires copy of every test set,
# one job per set, into exp/nnet3${nnet3_affix}/ivectors_<set>_hires.
if [ "$stage" -le 7 ]; then
  # $test_sets is intentionally left unquoted: it holds a space-separated
  # list that has to be split into one word per data set.
  for data in $test_sets; do
    steps/online/nnet2/extract_ivectors_online.sh \
      --cmd "$train_cmd" \
      --nj 1 \
      "data/${data}_hires" \
      "exp/nnet3${nnet3_affix}/extractor" \
      "exp/nnet3${nnet3_affix}/ivectors_${data}_hires"
  done
fi

exit 0