combine_ali_dirs.sh
7.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/bin/bash
# Copyright 2016 Xiaohui Zhang Apache 2.0.
# Copyright 2019 SmartAction (kkm)
# This script combines alignment directories, such as exp/tri4a_ali, and
# validates matching of the utterances and alignments after combining.
# Begin configuration section.
# Defaults for the options described in the usage text below; presumably
# parse_options.sh replaces them with values given on the command line.
cmd=run.pl        # Job runner used to launch the parallel copy jobs.
nj=4              # Number of archives the combined output is split into.
combine_ali=true  # Merge ali.*.gz archives if present.
combine_lat=true  # Merge lat.*.gz archives if present.
tolerance=10      # Max percentage of utterances allowed to lack output.
# End configuration section.

# Print the command line for logging.
echo "$0 $@"

# Source the environment setup only when the current directory has one.
if [[ -f path.sh ]]; then
  . ./path.sh
fi

. parse_options.sh || exit 1

# Exported so that the sort and join calls further down inherit it.
export LC_ALL=C
# At least <data>, <dest-dir> and one source directory are required; otherwise
# print the usage text to stderr and fail.
if [[ $# -lt 3 ]]; then
  cat >&2 <<EOF
Usage: $0 [options] <data> <dest-dir> <src-dir1> <src-dir2> ...
e.g.: $0 --nj 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp/tri3_ali_2
Options:
--nj <nj> # number of jobs to split combined archives [4]
--combine_ali false # merge ali.*.gz if present [true]
--combine_lat false # merge lat.*.gz if present [true]
--tolerance <int,%> # maximum percentage of missing alignments or lattices
# w.r.t. total utterances in <data> before error is
# reported [10]
The script checks that certain important files are present and compatible in all
source directories (phones.txt, tree); others are copied from the first source
(cmvn_opts, final.mdl) without much checking.
Both --combine_ali and --combine_lat are true by default, but the script
proceeds with a warning if directories do not contain either alignments or
alignment lattices. Check for files ali.1.gz and/or lat.1.gz in the <dest-dir>
after the script completes if additional programmatic check is required.
EOF
  exit 1
fi
# Refuse to run when both kinds of output are disabled. The flags hold the
# words 'true' or 'false', so they are compared as strings: a bare
# [[ ! $flag ]] only tests for an empty string and never fires for 'false'.
if [[ $combine_lat != true && $combine_ali != true ]]; then
  echo "$0: at least one of --combine_lat and --combine_ali must be true"
  exit 1
fi
# Positional arguments: <data> <dest-dir> <src-dir>...
# Once the first two are shifted away, "$@" holds only the source
# directories, the first of which serves as the reference.
data=$1 dest=$2
shift 2
first_src=$1

# Working copies of the option flags; the source checks below may switch
# either of them off.
do_lat=$combine_lat
do_ali=$combine_ali
# Check if alignments and/or lattices are present. Since we combine both,
# whichever present, issue a warning only. Also verify that the target is
# different from any source; we cannot combine in-place, and a lot of damage
# could result.

# Physical path of the target, resolved once; empty if $dest does not exist
# yet.
dest_real=$(cd -P -- "$dest" 2>/dev/null && pwd)

for src in "$@"; do
  # Resolve the source separately, so that an unreadable or missing source is
  # reported as such instead of comparing equal to a not-yet-created target
  # (both would otherwise resolve to an empty string).
  if ! src_real=$(cd -P -- "$src" 2>/dev/null && pwd); then
    echo "$0: error: Source directory $src does not exist or is inaccessible."
    exit 1
  fi
  if [[ $src_real == "$dest_real" ]]; then
    echo "$0: error: Source $src is same as target $dest."
    exit 1
  fi
  # The first source lacking an archive type disables that type for the whole
  # run, so each warning is printed at most once.
  if $do_ali && [[ ! -f $src/ali.1.gz ]]; then
    echo "$0: warning: Alignments (ali.*.gz) are not present in $src, not" \
         "combining. Consider '--combine_ali false' to suppress this warning."
    do_ali=false
  fi
  if $do_lat && [[ ! -f $src/lat.1.gz ]]; then
    echo "$0: warning: Alignment lattices (lat.*.gz) are not present in $src,"\
         "not combining. Consider '--combine_lat false' to suppress this warning."
    do_lat=false
  fi
done

# Nothing left to combine.
if ! $do_ali && ! $do_lat; then
  echo "$0: error: Cannot combine directories."
  exit 1
fi
# Verify that required files are present in the first directory.
# [[ ]] does not word-split its operands; with the unquoted [ ] form a path
# containing whitespace makes the test itself fail, which 'if' would treat as
# "file present".
for f in cmvn_opts final.mdl num_jobs phones.txt tree; do
  if [[ ! -f $first_src/$f ]]; then
    echo "$0: error: Required source file $first_src/$f is missing."
    exit 1
  fi
done
# Verify that phones and trees are compatible in all directories, and that
# num_jobs files are present, too. The first source is the reference that
# every other source is compared against, so it is skipped.
for src in "$@"; do
  # The right-hand side is quoted so that it is compared literally rather
  # than interpreted as a glob pattern.
  [[ $src != "$first_src" ]] || continue

  if [[ ! -f $src/num_jobs ]]; then
    echo "$0: error: Required source file $src/num_jobs is missing."
    exit 1
  fi
  # cmp -s is silent and fails both on a difference and on a missing file.
  if ! cmp -s "$first_src/tree" "$src/tree"; then
    echo "$0: error: tree $src/tree is either missing or not the" \
         "same as $first_src/tree."
    exit 1
  fi
  if [[ ! -f $src/phones.txt ]]; then
    echo "$0: error: Required source file $src/phones.txt is missing."
    exit 1
  fi
  utils/lang/check_phones_compatible.sh "$first_src/phones.txt" \
    "$src/phones.txt" || exit 1
done
# All checks passed, ok to prepare directory. Copy model and other files from
# the first source, as they either checked to be compatible, or we do not care
# if they are.
mkdir -p "$dest" || exit 1

# Remove leftovers of a previous run. frame_subsampling_factor is included
# because it is copied only when the first source has one; a stale copy from
# an earlier run would otherwise survive in a reused target directory.
rm -f "$dest"/{cmvn_opts,final.mdl,frame_subsampling_factor,num_jobs,phones.txt,tree}
$do_ali && rm -f "$dest"/ali.*.{gz,scp}
$do_lat && rm -f "$dest"/lat.*.{gz,scp}

cp "$first_src"/{cmvn_opts,final.mdl,phones.txt,tree} "$dest/" || exit 1
# Optional file: failure to copy is deliberately ignored and silenced.
cp "$first_src/frame_subsampling_factor" "$dest/" 2>/dev/null # If present.
# The combined archives are re-split into $nj pieces, so record that number
# rather than any source's num_jobs.
echo "$nj" > "$dest/num_jobs" || exit 1
# Make temporary directory, delete on signal, but not on 'exit 1'.
temp_dir=$(mktemp -d "$dest/temp.XXXXXX") || exit 1

# Remove the temporary directory and everything in it.
cleanup() { rm -rf -- "$temp_dir"; }

# A handler that merely returns lets the script resume after the signal, by
# then without its temporary directory; terminate explicitly instead.
trap 'cleanup; exit 1' HUP INT TERM

echo "$0: note: Temporary directory $temp_dir will not be deleted in case of" \
     "script failure, so you could examine it for troubleshooting."
# This function may be called twice, once to combine alignments and the second
# time to combine lattices. The two invocations are as follows:
#   do_combine ali alignments copy-int-vector "$@"
#   do_combine lat lattices lattice-copy "$@"
# where 'ali'/'lat' is a prefix to archive name, 'alignments'/'lattices' go into
# log messages and logfile names, and 'copy-int-vector'/'lattice-copy' is the
# program used to copy corresponding objects.
#
# Globals read: cmd, data, dest, temp_dir, nj, tolerance.
# Side effects: exports one variable src_arks_<N> per source directory and
#               writes $temp_dir/<ark>.* ; the launched jobs are given
#               $dest/log/*.log and $dest/<ark>.JOB.gz as their outputs.
# Returns:      0 on success. Every failure calls 'exit 1', so callers do not
#               need to check the status.
# NOTE(review): paths are interpolated into the command strings handed to
# $copy_program, so paths with whitespace are presumably still unsupported
# there even though this function quotes its own expansions.
do_combine() {
  local ark=$1 entities=$2 copy_program=$3
  shift 3
  local src arks job nj_src
  local src_id=0
  local n_utt n_ali n_ali_no_utt n_utt_no_ali n_utt_no_ali_pct

  echo "$0: Gathering $entities from each source directory."

  # Assign all source gzipped archive names to an exported variable, one each
  # per source directory, so that we can copy archives in a job per source.
  for src in "$@"; do
    src_id=$((src_id + 1))
    nj_src=$(cat "$src/num_jobs") || exit 1
    # Create and export variable src_arks_${src_id} for the job runner.
    # Each numbered variable will contain the list of archives, e. g.:
    #   src_arks_1="exp/tri3_ali/ali.1.gz exp/tri3_ali/ali.2.gz ..."
    # The list is built by concatenation, so that the directory name is never
    # interpreted as part of a printf format string.
    arks=
    for job in $(seq $nj_src); do
      arks+="$src/$ark.$job.gz "
    done
    printf -v "src_arks_${src_id}" '%s' "$arks"
    export "src_arks_${src_id}"
  done

  # Gather archives in parallel jobs. $cmd and $copy_program stay unquoted on
  # purpose, so that a runner given with options splits into words. The
  # escaped \${src_arks_JOB} is expanded by the job, after the runner has
  # replaced JOB with the job number.
  $cmd JOB=1:$src_id "$dest/log/gather_$entities.JOB.log" \
    $copy_program \
    "ark:gunzip -c \${src_arks_JOB} |" \
    "ark,scp:$temp_dir/$ark.JOB.ark,$temp_dir/$ark.JOB.scp" || exit 1

  # Merge (presumed already sorted) scp's into a single script.
  sort -m "$temp_dir/$ark".*.scp > "$temp_dir/$ark.scp" || exit 1

  echo "$0: Splitting combined $entities into $nj archives on speaker boundary."
  $cmd JOB=1:$nj "$dest/log/chop_combined_$entities.JOB.log" \
    $copy_program \
    "scp:utils/split_scp.pl --utt2spk=$data/utt2spk --one-based -j $nj JOB $temp_dir/$ark.scp |" \
    "ark:| gzip -c > $dest/$ark.JOB.gz" || exit 1

  # Get some interesting stats, and signal an error if error threshold exceeded.
  n_utt=$(wc -l < "$data/utt2spk") || exit 1
  n_ali=$(wc -l < "$temp_dir/$ark.scp") || exit 1
  n_ali_no_utt=$(join -j1 -v2 "$data/utt2spk" "$temp_dir/$ark.scp" | wc -l)
  n_utt_no_ali=$(join -j1 -v1 "$data/utt2spk" "$temp_dir/$ark.scp" | wc -l)
  # Some wc implementations pad their output; reduce to plain integers.
  n_utt=$((n_utt)) n_ali=$((n_ali))
  n_ali_no_utt=$((n_ali_no_utt)) n_utt_no_ali=$((n_utt_no_ali))

  # Without utterances the percentage is undefined (division by zero).
  if (( n_utt == 0 )); then
    echo "$0: error: $data/utt2spk is empty, cannot validate combined $entities."
    exit 1
  fi
  # Percentage rounded half up, in integer arithmetic:
  # floor(100 * a / n + 1/2) == (200 * a + n) / (2 * n).
  n_utt_no_ali_pct=$(( (200 * n_utt_no_ali + n_utt) / (2 * n_utt) ))

  echo "$0: Combined $n_ali $entities for $n_utt utterances." \
       "There were $n_utt_no_ali utterances (${n_utt_no_ali_pct}%) without" \
       "$entities, and $n_ali_no_utt $entities not matching any utterance."

  if (( n_utt_no_ali_pct >= tolerance )); then
    echo "$0: error: Percentage of utterances missing $entities," \
         "${n_utt_no_ali_pct}%, is at or above error tolerance ${tolerance}%."
    exit 1
  fi

  return 0
}
# Run the requested combinations. The returned status is not inspected, since
# do_combine always calls 'exit 1' on failure.
if $do_ali; then
  do_combine ali 'alignments' copy-int-vector "$@"
fi
if $do_lat; then
  do_combine lat 'lattices' lattice-copy "$@"
fi

# Success: the temporary directory is no longer needed.
cleanup

# Name what was produced: alignments, lattices, or both.
if $do_ali && $do_lat; then
  what='alignments and lattices '
elif $do_ali; then
  what='alignments '
elif $do_lat; then
  what='lattices '
else
  what=
fi
echo "$0: Stored combined ${what}in $dest" # No period, interferes with
                                           # copy/paste from tty emulator.
exit 0