#!/bin/bash
#
# Johns Hopkins University (Authors: Gaurav Kumar, Daniel Povey)
# Recipe for CallHome Egyptian Arabic
# Designed to integrate Kaldi with Joshua for end-to-end ASR and SMT
. ./cmd.sh
. ./path.sh
mfccdir=$(pwd)/mfcc
set -e
# Specify the location of the speech files, the transcripts and the lexicon.
# These are passed to other scripts, including the ones for data and lexicon prep.
eca_speech=/export/corpora/LDC/LDC97S45
eca_transcripts=/export/corpora/LDC/LDC97T19
eca_lexicon=/export/corpora/LDC/LDC99L22
sup_speech=/export/corpora/LDC/LDC2002S37
sup_transcripts=/export/corpora/LDC/LDC2002T38
h5_speech=/export/corpora/LDC/LDC2002S22
h5_transcripts=/export/corpora/LDC/LDC2002T39
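# A small optional sanity check (not in the original recipe): fail early with a
# clear message if any of the corpus directories above is missing on this machine.
for d in "$eca_speech" "$eca_transcripts" "$eca_lexicon" \
         "$sup_speech" "$sup_transcripts" "$h5_speech" "$h5_transcripts"; do
  [ -d "$d" ] || { echo "$0: expected corpus directory $d not found"; exit 1; }
done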
split=local/splits
local/callhome_data_prep.sh $eca_speech $eca_transcripts $sup_speech $sup_transcripts $h5_speech $h5_transcripts
local/callhome_prepare_dict.sh $eca_lexicon
# Added c, j, v to the non-silence phones manually
utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
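# For reference: utils/prepare_lang.sh expects data/local/dict to contain at
# least lexicon.txt (or lexiconp.txt), nonsilence_phones.txt,
# silence_phones.txt and optional_silence.txt; utils/validate_dict_dir.pl
# checks the directory in detail.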
# Make sure that you do not use your test and dev sets to train the LM.
# Some form of cross-validation is possible, where you decode each dev/test
# conversation with an LM trained on everything except that conversation.
local/callhome_train_lms.sh $split
local/callhome_create_test_lang.sh
utils/fix_data_dir.sh data/local/data/train_all
steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/train_all exp/make_mfcc/train_all $mfccdir || exit 1;
utils/fix_data_dir.sh data/local/data/train_all
utils/validate_data_dir.sh data/local/data/train_all
cp -r data/local/data/train_all data/train_all
# Creating data partitions for the pipeline
local/create_splits.sh $split
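# Note (an assumption about local/create_splits): the split directories are
# subsets of data/train_all, so their feats.scp entries already point at the
# MFCCs extracted above and only the CMVN stats need computing per subset.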
# Now compute CMVN stats for the dev, test, sup, h5 and train subsets
steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir
steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir
steps/compute_cmvn_stats.sh data/sup exp/make_mfcc/sup $mfccdir
steps/compute_cmvn_stats.sh data/h5 exp/make_mfcc/h5 $mfccdir
steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir
# Again from Dan's recipe: with very large corpora (his had 1.6 million
# utterances) it pays to start monophone training on relatively short
# utterances (easier to align), but not only the very shortest ones (mostly
# uh-huh), e.g. by taking 10k random utterances out of the 100k shortest.
# This recipe trains the monophone system on the full training set; a sketch
# of that subsetting is given below.
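# An optional sketch of that subsetting (not run here; the directory names are
# illustrative), using Kaldi's standard subset tool:
# utils/subset_data_dir.sh --shortest data/train 100000 data/train_100kshort
# utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k
# Then pass data/train_10k instead of data/train to steps/train_mono.sh below.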
steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
data/train data/lang exp/mono0a
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
1000 10000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1;
(utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri1/graph data/dev exp/tri1/decode_dev)&
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
steps/train_deltas.sh --cmd "$train_cmd" \
1400 15000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1;
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1;
)&
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri2 exp/tri2_ali || exit 1;
# Train tri3a, which is LDA+MLLT, on the full training set.
steps/train_lda_mllt.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
1800 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1;
)&
# Next we'll use fMLLR and train with SAT (i.e. on
# fMLLR features)
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri3a exp/tri3a_ali || exit 1;
steps/train_sat.sh --cmd "$train_cmd" \
2200 25000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri4a/graph data/dev exp/tri4a/decode_dev
)&
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train data/lang exp/tri4a exp/tri4a_ali || exit 1;
# Increase the number of leaves and Gaussians
steps/train_sat.sh --cmd "$train_cmd" \
2600 30000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1;
(
utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri5a/graph data/dev exp/tri5a/decode_dev
)&
(
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri5a/graph data/test exp/tri5a/decode_test
# Decode Supplement and H5
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri5a/graph data/sup exp/tri5a/decode_sup
steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
exp/tri5a/graph data/h5 exp/tri5a/decode_h5
)&
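# The ensemble training below reads alignments from exp/tri5a_ali, which no
# earlier step produces; a sketch of the presumably intended alignment step:
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
  data/train data/lang exp/tri5a exp/tri5a_ali || exit 1;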
dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \
--parallel-opts "--num-threads 16" --cmd "queue.pl --mem 1G")
dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \
--parallel-opts "--gpu 1" --cmd "queue.pl --mem 1G")
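# Only the GPU options are used in the call below; to train on CPU instead
# (much slower), substitute "${dnn_cpu_parallel_opts[@]}" for
# "${dnn_gpu_parallel_opts[@]}".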
steps/nnet2/train_pnorm_ensemble.sh \
--mix-up 5000 --initial-learning-rate 0.008 --final-learning-rate 0.0008 \
--num-hidden-layers 4 --pnorm-input-dim 2000 --pnorm-output-dim 200 \
--cmd "$train_cmd" \
"${dnn_gpu_parallel_opts[@]}" \
--ensemble-size 4 --initial-beta 0.1 --final-beta 5 \
data/train data/lang exp/tri5a_ali exp/tri6a_dnn
(
steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \
--scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev
) &
# Decode test sets
(
steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \
--scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_test exp/tri5a/graph data/test exp/tri6a_dnn/decode_test
steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \
--scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_sup exp/tri5a/graph data/sup exp/tri6a_dnn/decode_sup
steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \
--scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_h5 exp/tri5a/graph data/h5 exp/tri6a_dnn/decode_h5
) &
# Wait for all background decode jobs to finish before TDNN training.
wait
# (TDNN + iVectors) training
# Note that the alignments used by run_tdnn.sh come from the pnorm-ensemble model.
# If you choose to skip ensemble training (which is slow), use the best
# fMLLR alignments available (tri4a).
# You can modify this in local/nnet3/run_tdnn.sh.
local/nnet3/run_tdnn.sh
exit 0;