align_lats.sh
6.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/bin/bash
# Copyright 2012 Brno University of Technology (Author: Karel Vesely)
# 2013 Johns Hopkins University (Author: Daniel Povey)
# 2015 Vijayaditya Peddinti
# 2016 Vimal Manohar
# 2017 Pegah Ghahremani
# Apache 2.0
# Computes training alignments using nnet3 DNN, with output to lattices.
# Begin configuration section.
# NOTE(review): the defaults below are presumably overridable through
# parse_options.sh (sourced at the end of this section) using "--nj 8" style
# options, as the usage text suggests -- confirm against parse_options.sh.
# nj: number of parallel jobs; also the number of data splits and the number
# of fsts/lat/ali archives written.
nj=4
# cmd: job launcher, invoked as "$cmd JOB=1:$nj <log-file> <command...>".
cmd=run.pl
# stage: the block for stage N (0: graphs, 1: lattices, 2: alignments) runs
# only when stage <= N.
stage=-1
# Begin configuration.
# scale_opts: passed to the graph-compilation program in stage 0.
scale_opts="--transition-scale=1.0 --self-loop-scale=0.1"
# acoustic_scale: passed to nnet3-latgen-faster and to lattice-best-path.
acoustic_scale=0.1
# beam: used for both --beam and --lattice-beam of nnet3-latgen-faster.
beam=20
# iter: the model that is used is $srcdir/${iter}.mdl.
iter=final
# The next five values are forwarded to the nnet3-latgen-faster options of the
# same name (with dashes instead of underscores).
frames_per_chunk=50
extra_left_context=0
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1
# online_ivector_dir: if non-empty, a directory that must contain
# ivector_online.scp and ivector_period.
online_ivector_dir=
# graphs_scp: if non-empty, an scp file of graphs that is filtered per job and
# compiled with compile-train-graphs-fsts, instead of compiling graphs from
# the transcripts with compile-train-graphs.
graphs_scp=
generate_ali_from_lats=false # If true, alignments are also generated from the lattices (stage 2).
# End configuration options.
echo "$0 $@" # Print the command line for logging
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh || exit 1;
# Exactly four positional arguments are expected; with any other count the
# usage summary is printed and the script stops with status 1.
if [ $# -ne 4 ]; then
  printf '%s\n' \
    "Usage: $0 <data-dir> <lang-dir> <src-dir> <align-dir>" \
    "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali" \
    "main options (for others, see top of script file)" \
    " --config <config-file> # config containing options" \
    " --nj <nj> # number of parallel jobs" \
    " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  exit 1
fi
# Positional arguments (their count has already been validated).
data=$1   # data directory; its feats.scp is required further down
lang=$2   # lang directory (oov.int, words.txt, L.fst, phones.txt, ...)
srcdir=$3 # directory holding the model and tree to align with
dir=$4    # output directory for graphs, lattices and logs

# Symbol id handed to "sym2int.pl --map-oov" when transcripts are integerized.
oov=$(cat "$lang/oov.int") || exit 1

# Fail right away if the output directory cannot be created or written to,
# rather than much later inside one of the parallel jobs.
mkdir -p "$dir/log" || exit 1
echo "$nj" > "$dir/num_jobs" || exit 1

sdata=$data/split${nj}
# Re-split the data unless an existing split directory is newer than feats.scp.
if [[ ! -d $sdata || ! $data/feats.scp -ot $sdata ]]; then
  split_data.sh "$data" "$nj" || exit 1
fi
# Validate the inputs before any expensive work starts, then copy the files
# that describe the model into the output directory.
extra_files=()
if [ -n "$online_ivector_dir" ]; then
  # The compatibility check runs before the plain existence checks below.
  steps/nnet2/check_ivectors_compatible.sh "$srcdir" "$online_ivector_dir" || exit 1
  extra_files=(
    "$online_ivector_dir/ivector_online.scp"
    "$online_ivector_dir/ivector_period"
  )
fi

for f in "$srcdir/tree" "$srcdir/${iter}.mdl" "$data/feats.scp" "$lang/L.fst" \
         "${extra_files[@]}"; do
  if [ ! -f "$f" ]; then
    echo "$0: no such file $f"
    exit 1
  fi
done

cp "$srcdir/tree" "$srcdir/${iter}.mdl" "$dir" || exit 1
utils/lang/check_phones_compatible.sh "$lang/phones.txt" "$srcdir/phones.txt" || exit 1
cp "$lang/phones.txt" "$dir" || exit 1
## Set up features.  The rspecifier built here contains the literal token JOB
## in its paths under $sdata.
## NOTE(review): JOB is presumably replaced with the job index by $cmd when it
## launches each of the JOB=1:$nj jobs -- confirm against the job launcher.
echo "$0: feature type is raw"
# cmvn_opts is optional: when the file is absent the options stay empty and
# the copy fails quietly.
cmvn_opts=$(cat $srcdir/cmvn_opts 2>/dev/null)
cp $srcdir/cmvn_opts $dir 2>/dev/null
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk"
feats="$feats scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
# Online i-vector options are only produced when a directory was supplied.
ivector_opts=
if [ -n "$online_ivector_dir" ]; then
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1
  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp"
  ivector_opts="$ivector_opts --online-ivector-period=$ivector_period"
fi
echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir"
# If the source dir has a frame_subsampling_factor file (e.g. for 'chain'
# systems), forward its value to the decoder and copy the file to $dir.
frame_subsampling_opt=
if [ -f $srcdir/frame_subsampling_factor ]; then
  frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor)
  frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor"
  cp $srcdir/frame_subsampling_factor $dir
  # A factor above 1 is taken to mean a chain system, so check argument sanity:
  # both scales in scale_opts and the acoustic scale are expected to be 1.0.
  # The message is advisory only; the script carries on after printing it.
  if [[ $frame_subsampling_factor -gt 1 ]] &&
     [[ $scale_opts != *--self-loop-scale=1.0* ||
        $scale_opts != *--transition-scale=1.0* ||
        $acoustic_scale != '1.0' ]]; then
    echo "$0: ERROR: frame-subsampling-factor is not 1, assuming a chain system."
    echo "... You should pass the following options to this script:"
    echo " --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" \
      "--acoustic_scale 1.0"
  fi
fi
# Decide what the stage-0 graph compilation reads and which program does it:
#   tra  - rspecifier given to the compilation program
#   prog - the compilation program
if [ -z "$graphs_scp" ]; then
  # No graphs supplied: use the transcripts, mapped to integer ids with
  # out-of-vocabulary words replaced by $oov.
  tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"
  prog=compile-train-graphs
else
  # Graphs supplied by the caller: keep only the entries for this job's
  # utterances.
  if [ ! -f $graphs_scp ]; then
    echo "Could not find graphs $graphs_scp" && exit 1
  fi
  tra="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $graphs_scp |"
  prog=compile-train-graphs-fsts
fi
if [ $stage -le 0 ]; then
  # nnet3-latgen-faster cannot add the transition-probs to the graph itself,
  # so they have to be baked into the graphs compiled here.  Graphs written by
  # the other scripts lack transition probs and therefore cannot be reused.
  # Output: one gzipped archive $dir/fsts.JOB.gz per job.
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    $prog --read-disambig-syms=$lang/phones/disambig.int $scale_opts \
      $dir/tree $srcdir/${iter}.mdl $lang/L.fst \
      "$tra" "ark:|gzip -c >$dir/fsts.JOB.gz" \
    || exit 1
fi
if [ $stage -le 1 ]; then
  # Notes on the decoder settings:
  #  * Warning: nnet3-latgen-faster doesn't support a retry-beam, so you may
  #    get more alignment errors (however, it does have a default
  #    min-active=200, which will tend to reduce alignment errors).
  #  * --allow-partial=false makes sure we reach the end of the decoding graph.
  #  * --word-determinize=false makes sure we retain the alternative
  #    pronunciations of words (including alternatives regarding optional
  #    silences).
  #  * --lattice-beam=$beam keeps all the alternatives that were within the
  #    beam, i.e. the lattice is not pruned (lattices from a training
  #    transcription will be small anyway).
  # The option variables are left unquoted on purpose: an empty one must
  # vanish and a multi-word one must split into separate arguments.
  latgen_opts=(
    --acoustic-scale=$acoustic_scale $ivector_opts $frame_subsampling_opt
    --frames-per-chunk=$frames_per_chunk
    --extra-left-context=$extra_left_context
    --extra-right-context=$extra_right_context
    --extra-left-context-initial=$extra_left_context_initial
    --extra-right-context-final=$extra_right_context_final
    --beam=$beam --lattice-beam=$beam
    --allow-partial=false --word-determinize=false
  )
  # Reads the stage-0 graphs and the features; writes $dir/lat.JOB.gz per job.
  $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \
    nnet3-latgen-faster "${latgen_opts[@]}" \
      $srcdir/${iter}.mdl "ark:gunzip -c $dir/fsts.JOB.gz |" \
      "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" \
    || exit 1
fi
# Optional stage 2: runs only when generate_ali_from_lats is exactly "true".
# The flag is compared as a string instead of being executed as a command, so
# an empty or unexpected value disables the stage rather than being run.
if [ "$stage" -le 2 ] && [ "$generate_ali_from_lats" = true ]; then
  # Writes ali.JOB.gz into $dir, next to the lattices it is derived from.
  # The first output of lattice-best-path goes to /dev/null; only the second
  # one is kept.
  # NOTE(review): the discarded output is presumably the word sequence --
  # confirm against the lattice-best-path usage message.
  $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \
    lattice-best-path --acoustic-scale=$acoustic_scale \
      "ark:gunzip -c $dir/lat.JOB.gz |" \
      ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" \
    || exit 1
fi
echo "$0: done generating lattices from training transcripts."