run_learn_lex_bayesian.sh
6.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#! /bin/bash
#
# This script demonstrates a lexicon learning recipe, which aims to imrove
# the pronounciation of abbreviated words in the TED-LIUM lexicon. It assumes
# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon_bayesian.sh
# for explanation of the options.
#
# Copyright 2016 Xiaohui Zhang
# Apache 2.0
. ./cmd.sh
. ./path.sh
oov_symbol="<unk>"
# The user may have an English g2p model ready.
g2p_mdl_dir=
# The dir which contains the reference lexicon (most probably hand-derived)
# we want to expand/improve, and nonsilence_phones.txt,.etc which we need
# for building new dict dirs.
ref_dict=data/local/dict
# acoustic training data we use to get alternative
# pronunciations and collet acoustic evidence.
data=data/train
# the cut-off parameter used to select pronunciation candidates from phone
# decoding. We remove pronunciations with probabilities less than this value
# after normalizing the probs s.t. the max-prob is 1.0 for each word."
min_prob=0.4
# Mean of priors (summing up to 1) assigned to three exclusive pronunciation
# source: reference lexicon, g2p, and phone decoding (used in the Bayesian
# pronunciation selection procedure). We recommend setting a larger prior
# mean for the reference lexicon, e.g. '0.6,0.2,0.2'.
prior_mean="0.7,0.2,0.1"
# Total amount of prior counts we add to all pronunciation candidates of
# each word. By multiplying it with the prior mean of a source, and then dividing
# by the number of candidates (for a word) from this source, we get the
# prior counts we actually add to each candidate.
prior_counts_tot=15
# In the Bayesian pronunciation selection procedure, for each word, we
# choose candidates (from all three sources) with highest posteriors
# until the total prob mass hit this amount.
# It's used in a similar fashion when we apply G2P.
variants_prob_mass=0.6
# In the Bayesian pronunciation selection procedure, for each word,
# after the total prob mass of selected candidates hit variants-prob-mass,
# we continue to pick up reference candidates with highest posteriors
# until the total prob mass hit this amount (must >= variants_prob_mass).
variants_prob_mass_ref=0.95
# Intermediate outputs of the lexicon learning stage will be put into dir
dir=exp/tri3_lex_work
nj=35
decode_nj=30
stage=0
lexlearn_stage=0
. utils/parse_options.sh # accept options
# The reference vocab is the list of words which we already have hand-derived pronunciations.
ref_vocab=data/local/vocab.txt
cat $ref_dict/lexicon.txt | awk '{print $1}' | sort | uniq > $ref_vocab || exit 1;
# Get a G2P generated lexicon for oov words (w.r.t the reference lexicon)
# in acoustic training data.
if [ $stage -le 0 ]; then
if [ -z $g2p_mdl_dir ]; then
g2p_mdl_dir=exp/g2p
steps/dict/train_g2p.sh --cmd "$decode_cmd --mem 4G" $ref_dict/lexicon.txt $g2p_mdl_dir || exit 1;
fi
awk '{for (n=2;n<=NF;n++) vocab[$n]=1;} END{for (w in vocab) printf "%s\n",w;}' \
$data/text | sort -u > $data/train_vocab.txt || exit 1;
awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $ref_vocab \
$data/train_vocab.txt | sort > $data/oov_train.txt || exit 1;
steps/dict/apply_g2p.sh --var-counts 4 $data/oov_train.txt \
$g2p_mdl_dir exp/g2p/oov_lex_train || exit 1;
cat exp/g2p/oov_lex_train/lexicon.lex | awk '{if (NF>=3) print $0}' | cut -f1,3 | \
tr -s '\t' ' ' | sort | uniq > $data/lexicon_oov_g2p.txt || exit 1;
fi
# Learn a lexicon based on the acoustic training data and the reference lexicon.
if [ $stage -le 1 ]; then
steps/dict/learn_lexicon_bayesian.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \
--min-prob $min_prob --variants-prob-mass $variants_prob_mass \
--variants-prob-mass-ref $variants_prob_mass_ref \
--prior-counts-tot $prior_counts_tot --prior-mean $prior_mean \
--stage $lexlearn_stage --nj 60 --oov-symbol $oov_symbol --retrain-src-mdl true \
$ref_dict $ref_vocab $data exp/tri3 data/lang data/local/dict_learned_nosp \
$dir || exit 1;
fi
# Add pronounciation probs to the learned lexicon.
if [ $stage -le 1 ]; then
utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
data/local/dict_learned_nosp $oov_symbol data/local/lang_learned_nosp data/lang_learned_nosp || exit 1;
steps/align_si.sh --nj $nj --cmd "$train_cmd" \
$data data/lang_learned_nosp exp/tri2 exp/tri2_ali_learned_lex_nosp || exit 1;
steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_learned_nosp exp/tri2_ali_learned_lex_nosp || exit 1;
utils/dict_dir_add_pronprobs.sh --max-normalize true \
data/local/dict_learned_nosp exp/tri2_ali_learned_lex_nosp/pron_counts_nowb.txt \
exp/tri2_ali_learned_lex_nosp/sil_counts_nowb.txt \
exp/tri2_ali_learned_lex_nosp/pron_bigram_counts_nowb.txt data/local/dict_learned || exit 1;
utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
data/local/dict_learned $oov_symbol data/local/lang_learned data/lang_learned || exit 1;
fi
# Re-train the acoustic model using the learned lexicon
if [ $stage -le 2 ]; then
steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
$data data/lang_learned exp/tri3 exp/tri3_ali_learned_lex || exit 1;
steps/train_sat.sh --cmd "$train_cmd" \
5000 100000 $data data/lang_learned exp/tri3_ali_learned_lex exp/tri3_learned_lex || exit 1;
fi
# Decode
if [ $stage -le 3 ]; then
cp -rT data/lang_learned data/lang_learned_rescore || exit 1;
! cmp data/lang_nosp/words.txt data/lang_learned/words.txt &&\
echo "$0: The vocab of the learned lexicon and the reference vocab may be incompatible."
cp data/lang_nosp/G.fst data/lang_learned/
cp data/lang_nosp_rescore/G.carpa data/lang_learned_rescore/
utils/mkgraph.sh data/lang_learned exp/tri3_learned_lex exp/tri3_learned_lex/graph || exit 1;
for dset in dev test; do
( steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
exp/tri3_learned_lex/graph data/${dset} exp/tri3_learned_lex/decode_${dset} || exit 1;
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_learned data/lang_learned_rescore \
data/${dset} exp/tri3_learned_lex/decode_${dset} exp/tri3_learned_lex/decode_${dset}_rescore || exit 1;
) &
done
fi
wait