make_weighted_den_fst.sh
5.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/bin/bash
# Copyright 2017 Vimal Manohar
# 2017 Pegah Ghahremani
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script creates denominator FST (den.fst) and normalization.fst for
# chain training. It additionally copies the transition model and tree from the
# first alignment directory to the chain directory.
# This script accepts multiple alignment sources that share the same phone set;
# the sources can be weighted when estimating the phone LM.
# You can use the --num-repeats option to repeat some source data more than
# once when training the LM for the denominator FST.
# Make a pipeline's exit status reflect a failure in any of its commands.
set -o pipefail

# begin configuration section.
stage=0     # Steps whose number is lower than this value are skipped.
cmd=run.pl  # Launcher used to run the logged steps (see --cmd in the usage).
lm_opts='--num-extra-lm-states=2000'  # Passed verbatim to chain-est-phone-lm.
num_repeats=  # Comma-separated list of positive integer multiplicities, one
              # for each input alignment directory. The alignments from
              # each source will be scaled by the corresponding value when
              # training the LM.
              # If not specified, weight '1' is used for all data sources.
#end configuration section.

# Pick up the environment setup when the working directory provides one.
if [ -f ./path.sh ]; then
  . ./path.sh
fi

# parse_options.sh must be sourced after the defaults above are assigned.
. parse_options.sh || exit 1
# At least one alignment directory plus the output directory must be given;
# otherwise print the usage message and abort.
if [ $# -lt 2 ]; then
  cat <<EOF
Usage: $0 [options] <ali-dir1> [<ali-dir2> ...] <out-dir>
e.g.: $0 exp/tri1_ali exp/tri2_ali exp/chain/tdnn_1a_sp
Options:
  --cmd (run.pl|queue.pl...)  # Specify how to run jobs.
  --lm-opts <opts>            # Options for phone LM generation
  --num-repeats <list>        # Comma-separated list of positive integer
                              # multiplicities, one for each input
                              # alignment directory. The alignments
                              # from each source will be scaled by
                              # the corresponding value when training
                              # the LM. If not specified, weight '1'
                              # is used for all data sources.
EOF
  exit 1
fi
# The last positional argument is the output (working) directory; every
# argument before it is an alignment directory.
dir=${@: -1}

# Quote "$@" so that arguments containing whitespace or glob characters are
# kept as single array elements.
ali_dirs=( "$@" )

# 'Pop' the last element, which is $dir. The subscript is quoted so the shell
# cannot treat 'ali_dirs[...]' as a filename pattern.
unset "ali_dirs[${#ali_dirs[@]}-1]"

num_alignments=${#ali_dirs[@]}  # number of alignment dirs to combine

mkdir -p "$dir/log" || exit 1
# Verify that every alignment directory contains the files used later on, and
# that its phones.txt passes utils/lang/check_phones_compatible.sh against the
# phones.txt of the first alignment directory.
for ali_dir in "${ali_dirs[@]}"; do
  for f in "$ali_dir/ali.1.gz" "$ali_dir/final.mdl" "$ali_dir/tree"; do
    if [ ! -f "$f" ]; then
      echo "$0: Expected file $f to exist"
      exit 1
    fi
  done
  utils/lang/check_phones_compatible.sh "${ali_dirs[0]}/phones.txt" \
    "$ali_dir/phones.txt" || exit 1
done

# The tree is taken from the first alignment directory.
cp "${ali_dirs[0]}/tree" "$dir/" || exit 1
# Build num_repeats_array with one multiplicity per alignment directory.
# $num_repeats itself is left untouched so that error messages can show the
# value the user actually supplied.
if [ -z "$num_repeats" ]; then
  # If 'num_repeats' is not specified, set num_repeats_array to e.g. (1 1 1).
  num_repeats_array=()
  for ((i = 0; i < num_alignments; i++)); do
    num_repeats_array+=(1)
  done
else
  # Deliberately unquoted: commas become spaces and word splitting yields one
  # array element per list entry.
  num_repeats_array=(${num_repeats//,/ })
  if [ "${#num_repeats_array[@]}" -ne "$num_alignments" ]; then
    echo "$0: too many or too few elements in --num-repeats option: '$num_repeats'"
    exit 1
  fi
fi
all_phones=""  # will contain the names of the .gz files containing phones,
               # with some members possibly repeated per the --num-repeats
               # option

# For each alignment directory: validate its multiplicity, convert its
# alignments to phone sequences (stage 1) and append the resulting archive to
# all_phones once per repeat.
for ((n = 0; n < num_alignments; n++)); do
  this_num_repeats=${num_repeats_array[$n]}
  this_alignment_dir=${ali_dirs[$n]}

  # '[' fails (with a diagnostic that is silenced here) for non-integers, so
  # this rejects both negative and non-numeric entries. A value of 0 is
  # accepted and makes the source contribute no archive to all_phones.
  if ! [ "$this_num_repeats" -ge 0 ] 2>/dev/null; then
    # Rebuild the list from the array so the message shows the list entries
    # rather than whatever $num_repeats currently holds.
    num_repeats_given=$(IFS=,; echo "${num_repeats_array[*]}")
    echo "Expected comma-separated list of integers for --num-repeats option, got '$num_repeats_given'"
    exit 1
  fi

  if [ "$stage" -le 1 ]; then
    # num_jobs is only needed to enumerate the ali.*.gz files.
    if [ ! -f "$this_alignment_dir/num_jobs" ]; then
      echo "$0: Expected file $this_alignment_dir/num_jobs to exist"
      exit 1
    fi
    num_jobs=$(cat "$this_alignment_dir/num_jobs") || exit 1

    for j in $(seq "$num_jobs"); do
      gunzip -c "$this_alignment_dir/ali.$j.gz"
    done | \
      ali-to-phones "$this_alignment_dir/final.mdl" ark:- \
        "ark:|gzip -c >$dir/phones.$n.gz" || exit 1
  fi

  # phones.$n.gz is consumed and then removed by stage 2, so it is only
  # required when stage 2 is still going to run. Checking it unconditionally
  # would make it impossible to resume from a later stage.
  if [ "$stage" -le 2 ] && [ ! -s "$dir/phones.$n.gz" ]; then
    echo "$dir/phones.$n.gz is empty or does not exist"
    exit 1
  fi

  # all_phones stays a whitespace-separated string because it is expanded
  # unquoted when passed to gunzip in stage 2.
  for r in $(seq "$this_num_repeats"); do
    all_phones="$all_phones $dir/phones.$n.gz"
  done
done
# Stage 2: feed every phone archive listed in all_phones (repeats included)
# to chain-est-phone-lm, then delete the intermediate archives.
# $cmd, $all_phones and $lm_opts are expanded unquoted on purpose so that each
# may expand to several words; the escaped '|' is passed to $cmd as a literal
# argument instead of creating a pipeline in this shell.
if [ $stage -le 2 ]; then
  if ! $cmd $dir/log/make_phone_lm_fst.log \
         gunzip -c $all_phones \| \
         chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst; then
    exit 1
  fi
  rm $dir/phones.*.gz
fi
# Stage 3: write the transition model of the first alignment directory to
# $dir/0.trans_mdl via copy-transition-model.
if [ $stage -le 3 ]; then
  if ! copy-transition-model ${ali_dirs[0]}/final.mdl $dir/0.trans_mdl; then
    exit 1
  fi
fi

# Stage 4: run chain-make-den-fst on the tree, transition model and phone LM
# to produce den.fst and normalization.fst; output is logged by $cmd.
if [ $stage -le 4 ]; then
  if ! $cmd $dir/log/make_den_fst.log \
         chain-make-den-fst $dir/tree $dir/0.trans_mdl \
         $dir/phone_lm.fst \
         $dir/den.fst $dir/normalization.fst; then
    exit 1
  fi
fi

echo "Successfully created {den,normalization}.fst"
exit 0