get_phone_post.sh
10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/bin/bash
# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.
# This script obtains phone posteriors from a trained chain model, using either
# the xent output or the forward-backward posteriors from the denominator fst.
# The phone posteriors will be in matrices where the column index can be
# interpreted as phone-index - 1.
# You may want to mess with the compression options. Be careful: with the current
# settings, you might sometimes get exact zeros as the posterior values.
# CAUTION! This script isn't very suitable for dumping features from recurrent
# architectures such as LSTMs, because it doesn't support setting the chunk size
# and left and right context. (Those would have to be passed into nnet3-compute
# or nnet3-chain-compute-post).
# Begin configuration section.
stage=0
nj=1 # Number of jobs to run.
cmd=run.pl
remove_word_position_dependency=false # If true, map word-position-dependent phones to
                                      # base phones when dumping (see stage 1 below).
use_xent_output=false # If true, use the network's cross-entropy ('output-xent') head
                      # instead of forward-backward over the chain denominator FST.
online_ivector_dir=   # If nonempty, directory with online-computed ivectors for the data.
use_gpu=false         # If true, submit jobs with a GPU and run with --use-gpu=yes.
count_smoothing=1.0 # this should be some small number, I don't think it's critical;
# it will mainly affect the probability we assign to phones that
# were never seen in training. note: this is added to the raw
# transition-id occupation counts, so 1.0 means, add a single
# frame's count to each transition-id's counts.
# End configuration section.
set -e -u
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # Put Kaldi binaries on PATH, if path.sh is present.
. parse_options.sh || exit 1;  # Parse the --option flags corresponding to the vars above.
# Wrong number of positional arguments: print the usage message and bail out.
if [ $# -ne 5 ]; then
  cat <<EOF
Usage: $0 <chain-tree-dir> <chain-model-dir> <lang-dir> <data-dir> <phone-post-dir>
 e.g.: $0 --remove-word-position-dependency true --online-ivector-dir exp/nnet3/ivectors_test_eval92_hires \\
 exp/chain/tree_a_sp exp/chain/tdnn1a_sp data/lang data/test_eval92_hires exp/chain/tdnn1a_sp_post_eval92
 ... you'll normally want to set the --nj and --cmd options as well.

Main options (for others, see top of script file)
 --cmd (run.pl|queue.pl|... <queue opts>) # how to run jobs.
 --config <config-file> # config containing options
 --stage <stage> # stage to do partial re-run from.
 --nj <N> # Number of parallel jobs to run, default:1
 --remove-word-position-dependency <bool> # If true, remove word-position-dependency
 # info when dumping posteriors (default: false)
 --use-xent-output <bool> # If true, use the cross-entropy output of the
 # neural network when dumping posteriors
 # (default: false, will use chain denominator FST)
 --online-ivector-dir <dir> # Directory where we dumped online-computed
 # ivectors corresponding to the data in <data>
 --use-gpu <bool> # Set to true to use GPUs (not recommended as the
 # binary is very poorly optimized for GPU use).
EOF
  exit 1;
fi
tree_dir=$1
model_dir=$2
lang=$3
data=$4
dir=$5

# Check all required inputs up front, before doing any work.
# (Fix: the error message used to say "train_sat.sh", copy-pasted from another
# script; use $0 so failures are attributed to this script.)
for f in $tree_dir/tree $tree_dir/final.mdl $tree_dir/ali.1.gz $tree_dir/num_jobs \
    $model_dir/final.mdl $model_dir/frame_subsampling_factor $model_dir/den.fst \
    $data/feats.scp $lang/phones.txt; do
  [ ! -f "$f" ] && echo "$0: no such file $f" && exit 1;
done

# Split the data per-utterance into $nj pieces; reuse an existing split if it is
# newer than feats.scp ('-ot' = older-than).
sdata=$data/split${nj}utt
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh --per-utt $data $nj || exit 1;
use_ivector=false # NOTE(review): this variable appears to be unused below -- confirm before removing.
cmvn_opts=$(cat $model_dir/cmvn_opts)
# Per-job feature pipeline: apply CMVN (with the same options the model was
# trained with) to each per-utterance split.  'JOB' is substituted by $cmd.
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
if [ ! -z "$online_ivector_dir" ];then
# Check the ivectors were extracted compatibly with how the model was trained.
steps/nnet2/check_ivectors_compatible.sh $model_dir $online_ivector_dir || exit 1;
ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
# Restrict the ivector scp to the utterances that each job handles.
ivector_feats="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp |"
# The inner single-quotes keep the piped sub-command together when this option
# string is re-expanded on the command line built by $cmd.
ivector_opts="--online-ivector-period=$ivector_period --online-ivectors='$ivector_feats'"
else
ivector_opts=
fi
# Set job-submission and binary-level GPU options.
# Fix: this used to assign 'gpu_queue_opt' (no trailing 's') in the GPU branch,
# while everywhere else (the else-branch and the $cmd invocations in stage 2)
# uses 'gpu_queue_opts'.  With 'set -u' that made --use-gpu true crash on an
# unset variable; without -u it silently dropped the "--gpu 1" queue option.
if $use_gpu; then
gpu_queue_opts="--gpu 1"
gpu_opt="--use-gpu=yes"
if ! cuda-compiled; then
echo "$0: WARNING: you are running with one thread but you have not compiled"
echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
exit 1
fi
else
gpu_queue_opts=
gpu_opt="--use-gpu=no"
fi
frame_subsampling_factor=$(cat $model_dir/frame_subsampling_factor)
mkdir -p $dir/log
# Copy this into $dir so consumers of the dumped posteriors know the frame rate.
cp $model_dir/frame_subsampling_factor $dir/
if [ $stage -le 0 ]; then
# Recompute $dir/tacc only if it is missing or older ('-ot') than the alignments.
if [ ! -f $dir/tacc ] || [ $dir/tacc -ot $tree_dir/ali.1.gz ]; then
echo "$0: obtaining transition-id counts in $dir/tacc"
# Obtain counts for each transition-id, from the alignments.
this_nj=$(cat $tree_dir/num_jobs)
$cmd JOB=1:$this_nj $dir/log/acc_taccs.JOB.log \
ali-to-post "ark:gunzip -c $tree_dir/ali.JOB.gz|" ark:- \| \
post-to-tacc $tree_dir/final.mdl ark:- $dir/tacc.JOB
# Sum the per-job count vectors into a single text-format vector $dir/tacc,
# then remove the per-job pieces.
input_taccs=$(for n in $(seq $this_nj); do echo $dir/tacc.$n; done)
$cmd $dir/log/sum_taccs.log \
vector-sum --binary=false $input_taccs $dir/tacc
rm $dir/tacc.*
else
echo "$0: skipping creation of $dir/tacc since it already exists."
fi
fi
# Create the phone symbol table (and, optionally, the word-position-dependency
# phone map) that interpret the dumped posteriors.  Either way, $dir/phones.txt
# will be a symbol table for the phones that we are dumping (although the
# matrices we dump won't contain anything for symbol 0 which is <eps>).
# Fix: previously the 'else' branch ran whenever the combined condition
# '[ $stage -le 1 ] && $remove_word_position_dependency' was false -- so a
# re-run with --stage 2 and --remove-word-position-dependency true would
# overwrite the word-position-independent $dir/phones.txt with the
# position-dependent table from $lang.  Nest both branches under the stage
# check so stage > 1 skips this step entirely.
if [ $stage -le 1 ]; then
if $remove_word_position_dependency; then
echo "$0: creating $dir/phone_map.int"
utils/lang/get_word_position_phone_map.pl $lang $dir
else
grep -v '^#' $lang/phones.txt > $dir/phones.txt
fi
fi
if [ $stage -le 1 ]; then
# we want the phones in integer form as it's safer for processing by script.
# $data/fake_phones.txt will just contain e.g. "0 0\n1 1\n....", it's used
# to force show-transitions to print the phones as integers.
awk '{print $2,$2}' <$lang/phones.txt >$dir/fake_phones.txt
# The format of the 'show-transitions' command below is like the following:
#show-transitions tempdir/phone_map.int exp/chain/tree_a_sp/final.mdl
#Transition-state 1: phone = 1 hmm-state = 0 forward-pdf = 0 self-loop-pdf = 51
# Transition-id = 1 p = 0.5 [self-loop]
# Transition-id = 2 p = 0.5 [0 -> 1]
#Transition-state 2: phone = 10 hmm-state = 0 forward-pdf = 0 self-loop-pdf = 51
# Transition-id = 3 p = 0.5 [self-loop]
# Transition-id = 4 p = 0.5 [0 -> 1]
# The following inline script processes that info about the transition model
# into the file $dir/phones_and_pdfs.txt, which has a line for each transition-id
# (starting from number 1), and the format of each line is
# <phone-id> <pdf-id>
# NOTE(review): the first perl branch matches an alternative printout with a
# single "pdf = N" field (presumably older models where forward and self-loop
# pdfs coincide); in that case both pdfs are taken to be N -- confirm.
show-transitions $dir/fake_phones.txt $tree_dir/final.mdl | \
perl -ane ' if(m/Transition-state.* phone = (\d+) pdf = (\d+)/) { $phone = $1; $forward_pdf = $2; $self_loop_pdf = $2; }
if(m/Transition-state.* phone = (\d+) .* forward-pdf = (\d+) self-loop-pdf = (\d+)/) {
$phone = $1; $forward_pdf = $2; $self_loop_pdf = $3; }
if(m/Transition-id/) { if (m/self-loop/) { print "$phone $self_loop_pdf\n"; }
else { print "$phone $forward_pdf\n" } } ' > $dir/phones_and_pdfs.txt
# The following command just separates the 'tacc' file into a similar format
# to $dir/phones_and_pdfs.txt, with one count per line, and a line per transition-id
# starting from number 1. We skip the first two fields which are "[ 0" (the 0 is
# for transition-id=0, since transition-ids are 1-based), and the last field which is "]".
awk '{ for (n=3;n<NF;n++) print $n; }' <$dir/tacc >$dir/transition_counts.txt
# Sanity check: both files must have exactly one line per transition-id.
num_lines1=$(wc -l <$dir/phones_and_pdfs.txt)
num_lines2=$(wc -l <$dir/transition_counts.txt)
if [ $num_lines1 -ne $num_lines2 ]; then
echo "$0: mismatch in num-lines between phones_and_pdfs.txt and transition_counts.txt: $num_lines1 vs $num_lines2"
exit 1
fi
# after 'paste', the format of the data will be
# <phone-id> <pdf-id> <data-count>
# we add the count smoothing at this point.
paste $dir/phones_and_pdfs.txt $dir/transition_counts.txt | \
awk -v s=$count_smoothing '{print $1, $2, (s+$3);}' > $dir/combined_info.txt
if $remove_word_position_dependency; then
# map the phones to word-position-independent phones; you can see $dir/phones.txt
# to interpret the final output.
utils/apply_map.pl -f 1 $dir/phone_map.int <$dir/combined_info.txt > $dir/temp.txt
mv $dir/temp.txt $dir/combined_info.txt
fi
# Build a (num_phones x num_pdfs) matrix in Kaldi text format: the entry for
# (phone, pdf) is that pair's smoothed count divided by the pdf's total count,
# i.e. an estimate of p(phone | pdf).  This is the matrix applied to the
# per-frame pdf posteriors in stage 2, so output column index = phone-index - 1
# (phone 0, <eps>, gets no row here).
# NOTE(review): smoothing guarantees pdf_count[pdf] > 0 for every pdf that
# occurs in combined_info.txt; a pdf index below the maximum that never occurs
# would give 0/0 -- presumably every pdf has some transition-id, so this should
# not arise; confirm.
awk 'BEGIN{num_phones=1;num_pdfs=1;} { phone=$1; pdf=$2; count=$3; pdf_count[pdf] += count; counts[pdf,phone] += count;
if (phone>num_phones) num_phones=phone; if (pdf>=num_pdfs) num_pdfs = pdf + 1; }
END{ print "[ "; for(phone=1;phone<=num_phones;phone++) {
for (pdf=0;pdf<num_pdfs;pdf++) printf("%.3f ", counts[pdf,phone]/pdf_count[pdf]);
print ""; } print "]"; }' <$dir/combined_info.txt >$dir/transform.mat
fi
if [ $stage -le 2 ]; then
# note: --compression-method=3 is kTwoByteAuto: Each element is stored in two
# bytes as a uint16, with the representable range of values chosen
# automatically with the minimum and maximum elements of the matrix as its
# edges.
compress_opts="--compress=true --compression-method=3"
if $use_xent_output; then
# This block uses the 'output-xent' output of the nnet.
# The edits-config deletes the node named 'output' and renames 'output-xent'
# to 'output', so that nnet3-compute (which evaluates 'output') computes the
# cross-entropy head; --apply-exp=true converts its log-posteriors to
# posteriors, which transform-feats then maps from pdfs to phones using the
# matrix built in stage 1.
model="nnet3-copy '--edits-config=echo remove-output-nodes name=output; echo rename-node old-name=output-xent new-name=output|' $model_dir/final.mdl -|"
$cmd $gpu_queue_opts JOB=1:$nj $dir/log/get_phone_post.JOB.log \
nnet3-compute $gpu_opt $ivector_opts \
--frame-subsampling-factor=$frame_subsampling_factor --apply-exp=true \
"$model" "$feats" ark:- \| \
transform-feats $dir/transform.mat ark:- ark:- \| \
copy-feats $compress_opts ark:- ark,scp:$dir/phone_post.JOB.ark,$dir/phone_post.JOB.scp
else
# This block is when we are using the 'chain' output (recommended as the posteriors
# will be much more accurate).
# Here the pdf->phone transform is passed directly to the binary via
# --transform-mat, so no separate transform-feats stage is needed.
$cmd $gpu_queue_opts JOB=1:$nj $dir/log/get_phone_post.JOB.log \
nnet3-chain-compute-post $gpu_opt $ivector_opts --transform-mat=$dir/transform.mat \
--frame-subsampling-factor=$frame_subsampling_factor \
$model_dir/final.mdl $model_dir/den.fst "$feats" ark:- \| \
copy-feats $compress_opts ark:- ark,scp:$dir/phone_post.JOB.ark,$dir/phone_post.JOB.scp
fi
# presumably gives a shared/networked filesystem time to make the job outputs
# visible before we read them below -- TODO confirm this is still needed.
sleep 5
# Make a single .scp file, for convenience.
for n in $(seq $nj); do cat $dir/phone_post.$n.scp; done > $dir/phone_post.scp
fi