best_path_weights.sh
3.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/bin/bash
# Copyright 2014-17 Vimal Manohar
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script gets from the lattice the best path alignments and frame-level
# posteriors of the pdfs in the best path alignment.
# The output directory has the format of an alignment directory.
# It can optionally read alignments from a directory, in which case,
# the script gets frame-level posteriors of the pdf corresponding to those
# alignments.
# The frame-level posteriors are in the form of kaldi vectors and are
# output in weights.scp.
# Abort as soon as any unguarded command fails.
set -e

# begin configuration section.
# These defaults are assigned before utils/parse_options.sh is sourced below.
acwt=0.1      # passed as --acoustic-scale to the lattice tools
stage=-10     # with this default both stage-guarded steps are run
cmd=run.pl    # job launcher, invoked as: $cmd JOB=1:$nj <log-file> <command...>
# end configuration section.

# Pull in the environment setup when the working directory provides one.
if [ -f ./path.sh ]; then
  . ./path.sh
fi

# NOTE(review): presumably this maps --cmd/--stage/--acwt command-line flags
# onto the variables above; confirm against utils/parse_options.sh.
. utils/parse_options.sh || exit 1
# Require exactly three or four positional arguments (the alignment
# directory is optional). Anything else prints the usage text on stdout and
# stops with status 1. The text lists every variable declared in the
# configuration section, not only --cmd.
if [ $# -ne 3 ] && [ $# -ne 4 ]; then
  cat <<EOF
Usage: $0 [options] <data-dir> <decode-dir> [<ali-dir>] <out-dir>
E.g. $0 data/train_unt.seg exp/tri1/decode exp/tri1/best_path
Options:
--cmd (run.pl|queue.pl...) # specify how to run the sub-processes.
--stage <n> # values above 1 skip both the best-path and the posterior step.
--acwt <float> # acoustic scale passed to the lattice tools (default 0.1).
EOF
  exit 1
fi
# Bind the positional arguments. The output directory is always the last
# argument. With four arguments the third one names the directory the
# alignments are read from; with three they are read from the output
# directory itself.
data=$1   # not referenced again in the part of the script shown here
decode_dir=$2
dir=${@: -1} # last argument to the script
ali_dir=$dir
if [ $# -eq 4 ]; then
  ali_dir=$3
fi

mkdir -p "$dir"

# The job count is taken from the decode directory and recorded in the
# output directory. Report a missing file explicitly instead of relying on
# "set -e" to abort on the failing cat.
if [ ! -f "$decode_dir/num_jobs" ]; then
  echo "$0: expecting file $decode_dir/num_jobs to exist" >&2
  exit 1
fi
nj=$(cat "$decode_dir/num_jobs")
echo "$nj" > "$dir/num_jobs"
# Stage 1: run lattice-best-path over the gzipped lattice archives of the
# decode directory and write gzip-compressed output to $dir/ali.JOB.gz.
if [ $stage -le 1 ]; then
  mkdir -p $dir/log

  # Specifiers handed to lattice-best-path. "JOB" is left literal here and is
  # passed on to $cmd together with JOB=1:$nj.
  # NOTE(review): presumably $cmd substitutes JOB with each index 1..$nj.
  best_path_lats="ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz |"
  best_path_alis="ark:| gzip -c > $dir/ali.JOB.gz"

  # The first output of lattice-best-path is discarded (ark:/dev/null); only
  # the second one is kept.
  if ! $cmd JOB=1:$nj $dir/log/best_path.JOB.log \
      lattice-best-path --acoustic-scale=$acwt \
      "$best_path_lats" ark:/dev/null "$best_path_alis"; then
    exit 1
  fi
fi
# Pick the directory that holds final.mdl: the parent of the decode
# directory when final.mdl is found there, otherwise the decode directory
# itself.
src_dir=$decode_dir
if [ -f $(dirname $decode_dir)/final.mdl ]; then
  src_dir=$(dirname $decode_dir)
fi

# cmvn_opts is mandatory; the script stops when it cannot be copied.
cp $src_dir/cmvn_opts $dir/ || exit 1

# These files are copied only when the source directory has them.
for optional in final.mat splice_opts frame_subsampling_factor; do
  if [ -f $src_dir/$optional ]; then
    cp $src_dir/$optional $dir
  fi
done
# Print $1 as an absolute pathname: unchanged when it already starts with
# "/", otherwise prefixed with the current working directory.
make_abs_path() {
  case $1 in
    /*) printf '%s\n' "$1" ;;
    *) printf '%s\n' "$PWD/$1" ;;
  esac
}

# make $dir an absolute pathname.
# Done in the shell so that no perl process is spawned and $dir is passed as
# a single quoted word (an unquoted path containing spaces would otherwise be
# split into several arguments).
fdir=$(make_abs_path "$dir")
# The model and the tree are taken from the source directory chosen earlier.
model=$src_dir/final.mdl
tree=$src_dir/tree

# Stop with a message naming the first required input that is missing.
for required in $model $decode_dir/lat.1.gz $tree; do
  if [ ! -f $required ]; then
    echo "$0: expecting file $required to exist" && exit 1
  fi
done

# Keep a copy of the model and the tree in the output directory.
if ! cp $model $tree $dir; then
  exit 1
fi
# Read the job count of the alignment directory; stop if it cannot be read.
if ! ali_nj=$(cat $ali_dir/num_jobs); then
  exit 1
fi

# The posterior step pairs lat.JOB.gz with ali.JOB.gz under the same JOB
# index, so both directories have to agree on the number of jobs.
if [ $nj -ne $ali_nj ]; then
  echo "$0: $decode_dir and $ali_dir have different number of jobs. Redo alignment with $nj jobs."
  exit 1
fi
# Stage 2: chain lattice-to-post, post-to-pdf-post and get-post-on-ali per
# job and write the result as weights.JOB.ark with a weights.JOB.scp index.
# The output paths use $fdir, the absolute form of the output directory.
if [ $stage -le 1 ]; then
  # Lattices of the decode directory, read through gunzip.
  post_lats="ark,s,cs:gunzip -c $decode_dir/lat.JOB.gz|"

  # Alignments of $ali_dir piped through convert-ali and ali-to-pdf.
  # NOTE(review): convert-ali is given $dir/final.mdl as its first model,
  # and that file is copied from $model earlier in this script, so the
  # conversion looks like an identity even when a separate <ali-dir> is
  # supplied. Confirm whether $ali_dir/final.mdl was intended.
  post_pdf_alis="ark,s,cs:gunzip -c $ali_dir/ali.JOB.gz | convert-ali $dir/final.mdl $model $tree ark,s,cs:- ark:- | ali-to-pdf $model ark,s,cs:- ark:- |"

  # Archive plus index written by get-post-on-ali.
  post_weights="ark,scp:$fdir/weights.JOB.ark,$fdir/weights.JOB.scp"

  # The pipes are escaped (\|) so that they are handed to $cmd as arguments
  # instead of being interpreted by this shell.
  if ! $cmd JOB=1:$nj $dir/log/get_post.JOB.log \
      lattice-to-post --acoustic-scale=$acwt "$post_lats" ark:- \| \
      post-to-pdf-post $model ark,s,cs:- ark:- \| \
      get-post-on-ali ark,s,cs:- "$post_pdf_alis" "$post_weights"; then
    exit 1
  fi
fi
# Concatenate the per-job weight indexes, in job order, into a single
# $dir/weights.scp, then delete the per-job index files.
# A shell arithmetic loop replaces the backtick call to seq, and every path
# is quoted so that an output directory containing spaces is not split.
for ((n = 1; n <= nj; n++)); do
  cat "$dir/weights.$n.scp"
done > "$dir/weights.scp"

# The combined weights.scp does not match this pattern, so only the per-job
# indexes are removed.
rm -- "$dir"/weights.*.scp
exit 0