#!/usr/bin/env bash
# This script demonstrates how to use the grammar-decoding framework to build
# graphs made out of more than one part.  It uses `fstequivalent` to show that
# a graph constructed this way is equivalent to the one you would create if
# you had the LM all as a single piece.  It uses the command-line tools to
# expand to a regular FST (--write-as-grammar=false).  In practice you might
# not want to do that, since the result might be large and writing the entire
# thing might take too much time.  The code itself allows you to construct
# these GrammarFst objects in a lightweight way and decode using them.
# Unfortunately the filenames here are not very well thought through.  I hope
# to rework this when I have time.
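#
# As a quick illustration (a sketch, not something this script runs): given two
# expanded FSTs that are supposed to be equivalent, OpenFst's fstequivalent can
# randomly test that, e.g.:
#   fstequivalent --random=true --npath=100 graph_combined/HCLG.fst graph_single/HCLG.fst
# (the 'graph_combined' and 'graph_single' names here are hypothetical).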
stage=0
run_g2p=false  # Set this to true to run the g2p stuff; it's slow, so by
               # default we fake it by providing what it previously output.
set -e
. ./path.sh
. utils/parse_options.sh
tree_dir=exp/chain/tree_sp
lang_base=data/lang_nosp_basevocab
lang_ext=data/lang_nosp_extvocab
# For the purposes of this script we just need a biphone tree and associated
# transition-model for testing, because we're mainly testing at the graph
# level, i.e. testing equivalence of compiled HCLG graphs (the decoding in the
# last stages is just a sanity check).
# We're doing this with the "no-silprobs" dictionary dir for now, as we
# would need to write some scripts to support silprobs with this.
# For reference, here is how we could create the 'lang' dir for the
# baseline.
#utils/prepare_lang.sh data/local/dict_nosp \
# "<UNK>" data/local/lang_tmp_nosp data/lang_nosp
if [ $stage -le 0 ]; then
  cp -r data/local/dict_nosp data/local/dict_nosp_basevocab
  echo "#nonterm:unk" > data/local/dict_nosp_basevocab/nonterminals.txt
  utils/prepare_lang.sh data/local/dict_nosp_basevocab \
    "<UNK>" data/local/lang_tmp_nosp $lang_base
fi
if [ $stage -le 1 ]; then
  # Note: <UNK> does appear in that arpa file, with a reasonable probability
  # (0.0)... presumably because the vocab that the arpa file was built with was
  # not vast, so there were plenty of OOVs.  It would be possible to adjust its
  # probability with adjust_unk_arpa.pl, but for now we just leave it as-is.
  # <UNK> appears quite a few times in the ARPA file.  In the language model we
  # replace it with #nonterm:unk, which will later expand to our custom graph
  # of new words.
  # We don't want #nonterm:unk on the output side of G.fst, or it would appear
  # in the decoded output, so we remove it using the 'fstrmsymbols' command.
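  # To illustrate what the fstrmsymbols command below does: an arc labeled
  # '#nonterm:unk:#nonterm:unk' becomes '#nonterm:unk:<eps>', i.e. the symbol
  # is removed from the output side only.  The argument "echo $nonterm_unk|" is
  # Kaldi's piped-command syntax for supplying the one-element symbol list.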
  nonterm_unk=$(grep '#nonterm:unk' $lang_base/words.txt | awk '{print $2}')
  gunzip -c data/local/lm/lm_tgsmall.arpa.gz | \
    sed 's/<UNK>/#nonterm:unk/g' | \
    arpa2fst --disambig-symbol=#0 \
      --read-symbol-table=$lang_base/words.txt - | \
    fstrmsymbols --remove-from-output=true "echo $nonterm_unk|" - $lang_base/G.fst
fi
if [ $stage -le 2 ]; then
  # Make the top-level part of the graph.
  utils/mkgraph.sh --self-loop-scale 1.0 $lang_base $tree_dir $tree_dir/extvocab_nosp_top
fi
if [ $stage -le 3 ] && $run_g2p; then
  # You may have to install sequitur manually to get this to work.
  dict=data/local/dict_nosp_basevocab
  steps/dict/train_g2p.sh --silence-phones $dict/silence_phones.txt $dict/lexicon.txt $tree_dir/extvocab_nosp_g2p
fi
if [ $stage -le 4 ]; then
  # Create $tree_dir/extvocab_nosp_lexicon as a dict-dir containing just the
  # newly created vocabulary entries (but with the same phone list as our old
  # setup, not that it matters).
  mkdir -p $tree_dir/extvocab_nosp_lexicon
  # First find a list of words in the test set that are out of vocabulary.
  # Of course this is totally cheating.
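  # How the awk below works: it first reads data/lang/words.txt into 'seen'
  # (field 1 of each line is the word), then for each transcript line it checks
  # fields 2..NF (field 1 is the utterance-id) and prints every word not in
  # 'seen'.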
  awk -v w=data/lang/words.txt 'BEGIN{while(getline <w) seen[$1] = $1} {for(n=2;n<=NF;n++) if(!($n in seen)) oov[$n] = 1}
    END{ for(k in oov) print k;}' < data/dev_clean_2/text > $tree_dir/extvocab_nosp_lexicon/words
  echo "$0: generating g2p entries for $(wc -l <$tree_dir/extvocab_nosp_lexicon/words) words"
  if $run_g2p; then
    steps/dict/apply_g2p.sh $tree_dir/extvocab_nosp_lexicon/words $tree_dir/extvocab_nosp_g2p $tree_dir/extvocab_nosp_lexicon
  else
    cat <<EOF >$tree_dir/extvocab_nosp_lexicon/lexicon.lex
HARDWIGG 0.962436 HH AA1 R D W IH1 G
SUDVESTR 0.162048 S AH1 D V EY1 S T R
SUDVESTR 0.133349 S AH1 D V EH1 S T R
SUDVESTR 0.114376 S AH1 D V EH1 S T ER0
VINOS 0.558345 V IY1 N OW0 Z
VINOS 0.068883 V AY1 N OW0 Z
VINOS 0.068431 V IY1 N OW0 S
DOMA 0.645714 D OW1 M AH0
DOMA 0.118255 D UW1 M AH0
DOMA 0.080682 D OW0 M AH0
GWYNPLAINE'S 0.983053 G W IH1 N P L EY1 N Z
SHIMERDA 0.610922 SH IH0 M EH1 R D AH0
SHIMERDA 0.175678 SH IY0 M EH1 R D AH0
SHIMERDA 0.069785 SH AY1 M ER1 D AH0
MYRDALS 0.479183 M IH1 R D AH0 L Z
MYRDALS 0.135225 M ER1 D AH0 L Z
MYRDALS 0.115478 M IH1 R D L Z
HEUCHERA 0.650042 HH OY1 K IH1 R AH0
HEUCHERA 0.119363 HH OY1 K EH1 R AH0
HEUCHERA 0.077907 HH OY1 K ER0 AH0
IMPARA 0.906222 IH0 M P AA1 R AH0
VERLOC'S 0.564847 V ER0 L AA1 K S
VERLOC'S 0.173540 V ER1 L AH0 K S
VERLOC'S 0.050543 V ER1 L AA1 K S
UNTRUSSING 0.998019 AH0 N T R AH1 S IH0 NG
DARFHULVA 0.317057 D AA2 F UH1 L V AH0
DARFHULVA 0.262882 D AA2 F HH UH1 L V AH0
DARFHULVA 0.064055 D AA2 F HH UW1 L V AH0
FINNACTA 0.594586 F IH1 N AH0 K T AH0
FINNACTA 0.232454 F IH1 N AE1 K T AH0
FINNACTA 0.044733 F IH1 N IH0 K T AH0
YOKUL 0.845279 Y OW1 K AH0 L
YOKUL 0.051082 Y OW2 K AH0 L
YOKUL 0.029435 Y OW0 K AH0 L
CONGAL 0.504228 K AA1 NG G AH0 L
CONGAL 0.151648 K AA2 NG G AH0 L
CONGAL 0.137837 K AH0 N JH AH0 L
DELECTASTI 0.632180 D IH0 L EH0 K T EY1 S T IY0
DELECTASTI 0.203808 D IH0 L EH1 K T EY1 S T IY0
DELECTASTI 0.066722 D IH0 L EH0 K T AE1 S T IY0
YUNDT 0.975077 Y AH1 N T
QUINCI 0.426115 K W IH1 N S IY0
QUINCI 0.369324 K W IH1 N CH IY0
QUINCI 0.064507 K W IY0 N CH IY0
BIRDIKINS 0.856979 B ER1 D IH0 K AH0 N Z
BIRDIKINS 0.045315 B ER1 D AH0 K AH0 N Z
SNEFFELS 0.928413 S N EH1 F AH0 L Z
FJORDUNGR 0.130629 F Y AO1 R D UW0 NG G R
FJORDUNGR 0.125082 F Y AO1 R D AH0 NG G R
FJORDUNGR 0.111035 F Y AO1 R D UH1 NG R
YULKA 0.540253 Y UW1 L K AH0
YULKA 0.295588 Y AH1 L K AH0
YULKA 0.076631 Y UH1 L K AH0
LACQUEY'S 0.987908 L AE1 K IY0 Z
OSSIPON'S 0.651400 AA1 S AH0 P AA2 N Z
OSSIPON'S 0.118444 AA1 S AH0 P AA0 N Z
OSSIPON'S 0.106377 AA1 S AH0 P AH0 N Z
SAKNUSSEMM 0.060270 S AE1 K N AH1 S EH1 M
SAKNUSSEMM 0.044992 S AE1 K N AH0 S EH1 M
SAKNUSSEMM 0.044084 S AA0 K N AH1 S EH1 M
CONGAL'S 0.618287 K AA1 NG G AH0 L Z
CONGAL'S 0.185952 K AA2 NG G AH0 L Z
CONGAL'S 0.115143 K AH0 N G AH0 L Z
TARRINZEAU 0.159153 T AA1 R IY0 N Z OW1
TARRINZEAU 0.136536 T AA1 R AH0 N Z OW1
TARRINZEAU 0.100924 T EH1 R IY0 N Z OW1
SHIMERDAS 0.230819 SH IH0 M EH1 R D AH0 Z
SHIMERDAS 0.216235 SH IH0 M EH1 R D AH0 S
SHIMERDAS 0.073311 SH AY1 M ER1 D AH0 Z
RUGGEDO'S 0.821285 R UW0 JH EY1 D OW0 Z
RUGGEDO'S 0.166825 R AH1 G AH0 D OW0 Z
CORNCAKES 0.934118 K AO1 R N K EY2 K S
VENDHYA 0.616662 V EH0 N D Y AH0
VENDHYA 0.178349 V EH1 N D Y AH0
VENDHYA 0.160768 V AA1 N D Y AH0
GINGLE 0.919815 G IH1 NG G AH0 L
STUPIRTI 0.422653 S T UW0 P IH1 R T IY0
STUPIRTI 0.126925 S T UW1 P IH0 R T IY0
STUPIRTI 0.078422 S T UW1 P AH0 R T IY0
HERBIVORE 0.950887 HH ER1 B IH0 V AO2 R
BRION'S 0.838326 B R AY1 AH0 N Z
BRION'S 0.140310 B R IY0 AH0 N Z
DELAUNAY'S 0.993259 D EH1 L AO0 N EY0 Z
KHOSALA 0.920908 K OW0 S AA1 L AH0
BRANDD 0.827461 B R AE1 N D
BRANDD 0.085646 B R AE2 N D
GARDAR 0.598675 G AA0 R D AA1 R
GARDAR 0.289831 G AA1 R D AA2 R
GARDAR 0.057983 G AA0 R D AA2 R
MACKLEWAIN 0.570209 M AE1 K AH0 L W EY0 N
MACKLEWAIN 0.101477 M AH0 K AH0 L W EY0 N
MACKLEWAIN 0.067905 M AE1 K AH0 L W EY2 N
LIBANO 0.993297 L IY0 B AA1 N OW0
MOLING 0.782578 M OW1 L IH0 NG
MOLING 0.059362 M OW2 L IH0 NG
MOLING 0.056217 M AA1 L IH0 NG
BENNYDECK'S 0.583859 B EH1 N IY0 D EH0 K S
BENNYDECK'S 0.276699 B EH1 N IH0 D EH0 K S
BENNYDECK'S 0.028343 B EH1 N IH0 D IH0 K S
MACKLEWAIN'S 0.615766 M AE1 K AH0 L W EY0 N Z
MACKLEWAIN'S 0.109585 M AH0 K AH0 L W EY0 N Z
MACKLEWAIN'S 0.039423 M AE1 K AH0 L W AH0 N Z
PRESTY 0.616071 P R EH1 S T IY0
PRESTY 0.288701 P R AH0 S T IY0
BREADHOUSE 0.995874 B R EH1 D HH AW2 S
BUZZER'S 0.992495 B AH1 Z ER0 Z
BHUNDA 0.502439 B UW1 N D AH0
BHUNDA 0.267733 B AH0 N D AH0
BHUNDA 0.193772 B UH1 N D AH0
PINKIES 0.998440 P IH1 NG K IY0 Z
TROKE 0.723320 T R OW1 K
TROKE 0.269707 T R OW2 K
OSSIPON 0.728486 AA1 S AH0 P AA2 N
OSSIPON 0.098752 AA1 S AH0 P AH0 N
OSSIPON 0.033957 AA1 S AH0 P AO0 N
RIVERLIKE 0.991731 R IH1 V ER0 L AY2 K
NICLESS 0.478183 N IH1 K L AH0 S
NICLESS 0.159889 N IH0 K L AH0 S
NICLESS 0.120611 N IH1 K L IH0 S
TRAMPE 0.959184 T R AE1 M P
VERLOC 0.610461 V ER0 L AA1 K
VERLOC 0.128479 V ER1 L AH0 K
VERLOC 0.073687 V ER1 L AA0 K
GANNY 0.991703 G AE1 N IY0
AMBROSCH 0.302906 AE0 M B R OW1 SH
AMBROSCH 0.201163 AE0 M B R AO1 SH
AMBROSCH 0.109274 AE1 M B R AO1 SH
FIBI 0.619154 F IH1 B IY0
FIBI 0.163168 F IY1 B IY0
FIBI 0.083443 F AY1 B IY0
IROLG 0.823123 IH0 R OW1 L G
IROLG 0.053196 IH0 R OW1 L JH
IROLG 0.021038 IH0 R OW1 L JH IY1
BALVASTRO 0.251546 B AA0 L V AA1 S T R OW0
BALVASTRO 0.213351 B AE0 L V AE1 S T R OW0
BALVASTRO 0.133005 B AA0 L V AE1 S T R OW0
BOOLOOROO 0.676757 B UW1 L UW1 R UW0
BOOLOOROO 0.173653 B UW1 L UH2 R UW0
BOOLOOROO 0.086501 B UW1 L UH0 R UW0
EOF
  fi
  # extend_lang.sh needs it to have the basename 'lexiconp.txt'.
  mv $tree_dir/extvocab_nosp_lexicon/lexicon.lex $tree_dir/extvocab_nosp_lexicon/lexiconp.txt
  [ -f data/lang_nosp_extvocab/G.fst ] && rm data/lang_nosp_extvocab/G.fst
  utils/lang/extend_lang.sh data/lang_nosp_basevocab $tree_dir/extvocab_nosp_lexicon/lexiconp.txt data/lang_nosp_extvocab
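  # (Roughly speaking, the extend_lang.sh call above writes $lang_ext as a copy
  # of $lang_base whose words.txt and lexicon FSTs additionally cover the new
  # words; it contains no G.fst yet, which is what the next stage creates.)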
fi
if [ $stage -le 5 ]; then
  # Make the G.fst for the extra words.  We just assign equal probabilities to
  # all of them.  The words will all transition from state 1 to 2.
  cat <<EOF > $lang_ext/G.txt
0 1 #nonterm_begin <eps>
2 3 #nonterm_end <eps>
3
EOF
  lexicon=$tree_dir/extvocab_nosp_lexicon/lexiconp.txt
  num_words=$(wc -l <$lexicon)
  cost=$(perl -e "print log($num_words);")
  awk -v cost=$cost '{print 1, 2, $1, $1, cost}' <$lexicon >>$lang_ext/G.txt
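  # For concreteness, with a hypothetical 2-word lexicon the resulting G.txt
  # would look like this (cost = log(2) ~= 0.693; OpenFst text format, fields
  # are: src dst ilabel olabel [cost], and a lone state number marks a final
  # state):
  #   0 1 #nonterm_begin <eps>
  #   2 3 #nonterm_end <eps>
  #   3
  #   1 2 WORDA WORDA 0.693147
  #   1 2 WORDB WORDB 0.693147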
  fstcompile --isymbols=$lang_ext/words.txt --osymbols=$lang_ext/words.txt <$lang_ext/G.txt | \
    fstarcsort --sort_type=ilabel >$lang_ext/G.fst
fi
if [ $stage -le 6 ]; then
  # Make the part of the graph that will be included.
  # Refer to the 'compile-graph' commands in ./simple_demo.sh for how you'd do
  # this in code.
  utils/mkgraph.sh --self-loop-scale 1.0 $lang_ext $tree_dir $tree_dir/extvocab_nosp_part
fi
if [ $stage -le 7 ]; then
  offset=$(grep nonterm_bos $lang_ext/phones.txt | awk '{print $2}')
  nonterm_unk=$(grep nonterm:unk $lang_ext/phones.txt | awk '{print $2}')
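  # For orientation: 'offset' is the integer id of #nonterm_bos in phones.txt,
  # which make-grammar-fst (via --nonterm-phones-offset) uses to interpret the
  # special ilabels that encode nonterminals in the HCLG graphs; $nonterm_unk
  # is the phone id of #nonterm:unk, i.e. the nonterminal that the
  # extvocab_nosp_part graph plugs into.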
  mkdir -p $tree_dir/extvocab_nosp_combined
  [ -d $tree_dir/extvocab_nosp_combined/phones ] && rm -r $tree_dir/extvocab_nosp_combined/phones
  # The decoding script expects words.txt and phones/; copy them (plus
  # phones.txt) from the extvocab_nosp_part graph directory, where they will
  # have suitable values.
  cp -r $tree_dir/extvocab_nosp_part/{words.txt,phones.txt,phones/} $tree_dir/extvocab_nosp_combined
  # The following, due to --write-as-grammar=false, compiles it into a regular
  # FST which can be decoded by our normal decoder.
  make-grammar-fst --write-as-grammar=false --nonterm-phones-offset=$offset $tree_dir/extvocab_nosp_top/HCLG.fst \
    $nonterm_unk $tree_dir/extvocab_nosp_part/HCLG.fst $tree_dir/extvocab_nosp_combined/HCLG.fst
  # The following compiles it and writes it as a GrammarFst.  The size is 176M,
  # vs. 182M for HCLG.fst; in other examples, of course, the difference might
  # be larger.
  make-grammar-fst --write-as-grammar=true --nonterm-phones-offset=$offset $tree_dir/extvocab_nosp_top/HCLG.fst \
    $nonterm_unk $tree_dir/extvocab_nosp_part/HCLG.fst $tree_dir/extvocab_nosp_combined/HCLG.gra
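  # If you want to check the sizes quoted above on your own system, something
  # like this should work:
  #   du -h $tree_dir/extvocab_nosp_combined/HCLG.fst $tree_dir/extvocab_nosp_combined/HCLG.gra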
fi
if [ $stage -le 8 ]; then
  # OK, now we actually decode the test data.  For reference, the command which
  # was used to decode the test data in the current (at the time of writing)
  # chain TDNN system, local/chain/run_tdnn.sh (as figured out by running it
  # from that stage), was:
  # steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --frames-per-chunk 140 --nj 38 \
  #   --cmd "queue.pl --mem 4G --num-threads 4" --online-ivector-dir exp/nnet3/ivectors_dev_clean_2_hires \
  #   exp/chain/tree_sp/graph_tgsmall data/dev_clean_2_hires exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2
  # We just replace the graph with the one in $tree_dir/extvocab_nosp_combined.
  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --frames-per-chunk 140 --nj 38 \
    --cmd "queue.pl --mem 4G --num-threads 4" --online-ivector-dir exp/nnet3/ivectors_dev_clean_2_hires \
    exp/chain/tree_sp/extvocab_nosp_combined data/dev_clean_2_hires exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_nosp_comb
  # s5: grep WER exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_nosp_comb/wer_* | utils/best_wer.sh
  # %WER 11.79 [ 2375 / 20138, 195 ins, 343 del, 1837 sub ] exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_nosp_comb/wer_12_0.0
  # ... versus the baseline below.  Note: the baseline is not 100% comparable,
  # as it used the silence probabilities, which grammar decoding does not (yet)
  # support.
  # s5: grep WER exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2/wer_* | utils/best_wer.sh
  # %WER 12.01 [ 2418 / 20138, 244 ins, 307 del, 1867 sub ] exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2/wer_13_0.0
fi
if [ $stage -le 9 ]; then
  steps/nnet3/decode_grammar.sh --acwt 1.0 --post-decode-acwt 10.0 --frames-per-chunk 140 --nj 38 \
    --cmd "queue.pl --mem 4G --num-threads 4" --online-ivector-dir exp/nnet3/ivectors_dev_clean_2_hires \
    exp/chain/tree_sp/extvocab_nosp_combined data/dev_clean_2_hires exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_nosp_comb_gra
  # The WER when decoding with the grammar FST directly is exactly the same:
  # s5: grep WER exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_nosp_comb_gra/wer_* | utils/best_wer.sh
  # %WER 11.79 [ 2375 / 20138, 195 ins, 343 del, 1837 sub ] exp/chain/tdnn1h_sp/decode_tgsmall_dev_clean_2_ev_nosp_comb_gra/wer_12_0.0
fi