#!/bin/bash
#
# Copyright 2018  Nagendra Goel, Saikiran Valluri  Apache 2.0
# Copyright 2014  Gaurav Kumar.   Apache 2.0
# Recipe for Fisher/Callhome-Spanish

stage=0
train_stage=-20
train_sgmm2=false

# call the next line with the directory where the Spanish Fisher data is
# (the values below are just an example).
sfisher_speech=/export/corpora/LDC/LDC2010S01
sfisher_transcripts=/export/corpora/LDC/LDC2010T04
spanish_lexicon=/export/corpora/LDC/LDC96L16
split=local/splits/split_fisher

callhome_speech=/export/corpora/LDC/LDC96S35
callhome_transcripts=/export/corpora/LDC/LDC96T17
split_callhome=local/splits/split_callhome

mfccdir=`pwd`/mfcc

. ./cmd.sh
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
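
# Any of the variables above can be overridden from the command line via
# parse_options.sh, with underscores written as dashes; for example (a sketch
# with hypothetical paths):
#   ./run.sh --stage 3 --sfisher-speech /path/to/LDC2010S01 \
#     --sfisher-transcripts /path/to/LDC2010T04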

set -e

if [ $stage -le 1 ]; then
  local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts

  local/callhome_data_prep.sh $callhome_speech $callhome_transcripts

  # The lexicon is created using the LDC Spanish lexicon and the words from
  # the Fisher Spanish corpus. Additional (most frequent) words are added from
  # the ES Gigaword corpus to bring the total to 64k words. The frequency-sorted
  # ES wordlist is downloaded if it is not already available.
  local/fsp_prepare_dict.sh $spanish_lexicon

  # c, j and v were added to the non-silence phones manually.
  utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
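
  # For reference, utils/prepare_lang.sh expects data/local/dict to contain at
  # least lexicon.txt, nonsilence_phones.txt, silence_phones.txt and
  # optional_silence.txt; a quick optional check:
  #   utils/validate_dict_dir.pl data/local/dict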

  # Make sure that you do not use your test and dev sets to train the LM.
  # Some form of cross-validation is possible, where you decode your dev set
  # with an LM that is trained on everything except that conversation.
  # When in doubt about what your data partitions should be, use
  # local/fsp_ideal_data_partitions.pl to get the numbers. Depending on your
  # needs, you might have to change the size of the splits within that file.
  # The default partitions are based on the Kaldi + Joshua requirements, which
  # means the dev and test sets are very large.
  local/fsp_train_lms.sh $split
  local/fsp_create_test_lang.sh
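
  # Optional sanity check (a sketch, assuming the OpenFst binaries are on the
  # PATH via path.sh): confirm that the grammar FST was built.
  #   fstinfo data/lang_test/G.fst | head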

  utils/fix_data_dir.sh data/local/data/train_all

  steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/train_all exp/make_mfcc/train_all $mfccdir || exit 1;

  utils/fix_data_dir.sh data/local/data/train_all
  utils/validate_data_dir.sh data/local/data/train_all

  cp -r data/local/data/train_all data/train_all

  # For the CALLHOME corpus
  utils/fix_data_dir.sh data/local/data/callhome_train_all

  steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/local/data/callhome_train_all exp/make_mfcc/callhome_train_all $mfccdir || exit 1;

  utils/fix_data_dir.sh data/local/data/callhome_train_all
  utils/validate_data_dir.sh data/local/data/callhome_train_all

  cp -r data/local/data/callhome_train_all data/callhome_train_all

  # Create data partitions for the pipeline.
  # We need datasets for both the ASR and the SMT systems.
  # We have 257455 utterances left, so the partitions are roughly as follows:
  #   ASR Train : 100k utterances
  #   ASR Tune  : 17455 utterances
  #   ASR Eval  : 20k utterances
  #   MT Train  : 100k utterances
  #   MT Tune   : same as the ASR eval set (use the lattices from there)
  #   MT Eval   : 20k utterances
  # The dev and test sets need to be chosen carefully so that there is no
  # conversation/speaker overlap. This has been set up, and the script
  # local/fsp_ideal_data_partitions.pl provides the numbers needed below.
  # As noted above, the LM has not been trained on the dev and test sets.
  #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test
  #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test
  #utils/subset_data_dir.sh --last data/dev_and_test 120312 data/mt_train_and_test
  #utils/subset_data_dir.sh --first data/asr_dev_and_test 17662 data/dev
  #utils/subset_data_dir.sh --last data/asr_dev_and_test 20152 data/test
  #utils/subset_data_dir.sh --first data/mt_train_and_test 100238 data/mt_train
  #utils/subset_data_dir.sh --last data/mt_train_and_test 20074 data/mt_test
  #rm -r data/dev_and_test
  #rm -r data/asr_dev_and_test
  #rm -r data/mt_train_and_test

  local/create_splits.sh $split
  local/callhome_create_splits.sh $split_callhome
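
  # Optional sanity check (a sketch): verify the split sizes before moving on.
  #   for d in data/train data/dev data/dev2 data/test; do
  #     echo "$d: $(wc -l < $d/utt2spk) utterances"
  #   done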
fi

if [ $stage -le 2 ]; then
  # Now compute CMVN stats for the train, dev and test subsets
  steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir
  steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir
  steps/compute_cmvn_stats.sh data/dev2 exp/make_mfcc/dev2 $mfccdir
  #steps/compute_cmvn_stats.sh data/mt_train exp/make_mfcc/mt_train $mfccdir
  #steps/compute_cmvn_stats.sh data/mt_test exp/make_mfcc/mt_test $mfccdir

  #n=$[`cat data/train_all/segments | wc -l` - 158126]
  #utils/subset_data_dir.sh --last data/train_all $n data/train
  steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir

  steps/compute_cmvn_stats.sh data/callhome_dev exp/make_mfcc/callhome_dev $mfccdir
  steps/compute_cmvn_stats.sh data/callhome_test exp/make_mfcc/callhome_test $mfccdir
  steps/compute_cmvn_stats.sh data/callhome_train exp/make_mfcc/callhome_train $mfccdir
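  # Note: each compute_cmvn_stats.sh call above writes cmvn.scp into the
  # corresponding data directory (the stats themselves go to $mfccdir).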

  # Reduced monophone training data, following Dan's recipe: we want to start
  # the monophone training on relatively short utterances (easier to align),
  # but not only the very shortest ones (mostly "uh-huh"). So take the 90k
  # shortest utterances, then take 10k random utterances from those, capping
  # any repeated transcript at 100 copies.

  utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort
  utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k
  utils/data/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup
  utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k
  utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k
fi

if [ $stage -le 3 ]; then
  steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
    data/train_10k_nodup data/lang exp/mono0a

  steps/align_si.sh --nj 30 --cmd "$train_cmd" \
    data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1;

  steps/train_deltas.sh --cmd "$train_cmd" \
    2500 20000 data/train_30k data/lang exp/mono0a_ali exp/tri1 || exit 1;


  (
    utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph
    steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
      exp/tri1/graph data/dev exp/tri1/decode_dev
  )&

  steps/align_si.sh --nj 30 --cmd "$train_cmd" \
    data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1;

  steps/train_deltas.sh --cmd "$train_cmd" \
    2500 20000 data/train_30k data/lang exp/tri1_ali exp/tri2 || exit 1;

  (
    utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1;
    steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
      exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1;
   )&
fi

if [ $stage -le 4 ]; then
  steps/align_si.sh --nj 30 --cmd "$train_cmd" \
    data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1;

  # Train tri3a, which is LDA+MLLT, on the 100k data.
  steps/train_lda_mllt.sh --cmd "$train_cmd" \
    --splice-opts "--left-context=3 --right-context=3" \
    3000 40000 data/train_100k data/lang exp/tri2_ali exp/tri3a || exit 1;
  (
    utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
    steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
     exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1;
  )&
fi

if [ $stage -le 5 ]; then
  # Next we'll use fMLLR and train with SAT (i.e., on fMLLR features).
  steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
    data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1;

  steps/train_sat.sh  --cmd "$train_cmd" \
    4000 60000 data/train_100k data/lang exp/tri3a_ali  exp/tri4a || exit 1;

  (
    utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
      exp/tri4a/graph data/dev exp/tri4a/decode_dev
  )&


  steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
    data/train data/lang exp/tri4a exp/tri4a_ali || exit 1;

  # Train tri5a (SAT) on the full training set, with more leaves and Gaussians.
  steps/train_sat.sh  --cmd "$train_cmd" \
    5000 120000 data/train data/lang exp/tri4a_ali  exp/tri5a || exit 1;

  (
    utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
      exp/tri5a/graph data/dev exp/tri5a/decode_dev
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
      exp/tri5a/graph data/test exp/tri5a/decode_test

    # Decode CALLHOME
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
      exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
      exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev
    steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \
      exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train
  )&


  steps/align_fmllr.sh \
    --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \
    data/train data/lang exp/tri5a exp/tri5a_ali
fi
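
# At this point the GMM system WERs can be inspected with the usual Kaldi
# idiom (utils/best_wer.sh ships with the standard egs utilities), e.g.:
#   for x in exp/tri*/decode*; do
#     [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh
#   done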

if $train_sgmm2; then

  steps/train_ubm.sh \
    --cmd "$train_cmd" 750 \
    data/train data/lang exp/tri5a_ali exp/ubm5

  steps/train_sgmm2.sh \
    --cmd "$train_cmd" 5000 18000 \
    data/train data/lang exp/tri5a_ali exp/ubm5/final.ubm exp/sgmm5

  utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph

  (
    steps/decode_sgmm2.sh --nj 13 --cmd "$decode_cmd" --num-threads 5 \
      --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 16" \
      --transform-dir exp/tri5a/decode_dev \
      exp/sgmm5/graph data/dev exp/sgmm5/decode_dev
  )&

  steps/align_sgmm2.sh \
    --nj 32 --cmd "$train_cmd" --transform-dir exp/tri5a_ali \
    --use-graphs true --use-gselect true \
    data/train data/lang exp/sgmm5 exp/sgmm5_ali

  steps/make_denlats_sgmm2.sh \
    --nj 32 --sub-split 32 --num-threads 4 \
    --beam 10.0 --lattice-beam 6 --cmd "$decode_cmd" --transform-dir exp/tri5a_ali \
    data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats

  steps/train_mmi_sgmm2.sh \
    --cmd "$train_cmd" --drop-frames true --transform-dir exp/tri5a_ali --boost 0.1 \
    data/train data/lang exp/sgmm5_ali exp/sgmm5_denlats \
    exp/sgmm5_mmi_b0.1

  (
    utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
    steps/decode_fmllr_extra.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 \
      --parallel-opts " -pe smp 4" --config conf/decode.config \
      --scoring-opts "--min-lmwt 8 --max-lmwt 12" \
      exp/tri5a/graph data/dev exp/tri5a/decode_dev
    utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph
    steps/decode_sgmm2.sh --nj 13 --cmd "$decode_cmd" --num-threads 5 \
      --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 16" \
      --transform-dir exp/tri5a/decode_dev \
      exp/sgmm5/graph data/dev exp/sgmm5/decode_dev
    for iter in 1 2 3 4; do
      decode=exp/sgmm5_mmi_b0.1/decode_dev_it$iter
      mkdir -p $decode
      steps/decode_sgmm2_rescore.sh \
        --cmd "$decode_cmd" --iter $iter --transform-dir exp/tri5a/decode_dev \
        data/lang_test data/dev exp/sgmm5/decode_dev $decode
    done
  )&

fi

local/chain/run_tdnn_1g.sh --stage $stage --train-stage $train_stage || exit 1;
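
# Several decoding jobs above were started in the background; uncomment the
# following line if the script should block until they finish before exiting:
# wait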
exit 0;