run_kws.sh
#!/bin/bash
# Copyright (c) 2018, Johns Hopkins University (Yenda Trmal <jtrmal@gmail.com>)
# License: Apache 2.0
# Begin configuration section.
flen=0.01
stage=0
cmd=run.pl
data=data/dev_clean_2
lang=data/lang
keywords=local/kws/example/keywords.txt
output=data/dev_clean_2/kws/
# End configuration section
. ./utils/parse_options.sh
. ./path.sh
set -e -o pipefail
set -o nounset # Treat unset variables as an error
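# The variables in the configuration section above are turned into command-line
# options by parse_options.sh, so individual stages can be re-run selectively.
# Illustrative invocation (the values are just the defaults above; the path
# local/kws/run_kws.sh is an assumption about where this file lives):
#   local/kws/run_kws.sh --stage 4 --data data/dev_clean_2 \
#     --keywords local/kws/example/keywords.txt --output data/dev_clean_2/kws/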
mkdir -p $output
if [ $stage -le 1 ] ; then
## generate the auxiliary data files
## utt.map
## wav.map
## trials
## frame_length
## keywords.int
## For simplicity, we do not generate the following files
## categories
## We will generate the following files later
## hitlist
## keywords.fsts
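## As a rough sketch of what these files look like (the ids and values below
## are made up for illustration, not taken from the real data):
##   utt.map:       1272-128104-0000 1
##   wav.map:       1272-128104 1
##   trials:        1954.5        (total audio duration in seconds)
##   frame_length:  0.01
##   keywords.int:  KW-0001 5634 10211   (keyword id + word ids from words.txt)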
[ ! -f $data/utt2dur ] &&
utils/data/get_utt2dur.sh $data
duration=$(cat $data/utt2dur | awk '{sum += $2} END{print sum}' )
echo $duration > $output/trials
echo $flen > $output/frame_length
echo "Number of trials: $(cat $output/trials)"
echo "Frame lengths: $(cat $output/frame_length)"
echo "Generating map files"
cat $data/utt2dur | awk 'BEGIN{i=1}; {print $1, i; i+=1;}' > $output/utt.map
cat $data/wav.scp | awk 'BEGIN{i=1}; {print $1, i; i+=1;}' > $output/wav.map
cp $lang/words.txt $output/words.txt
cp $keywords $output/keywords.txt
cat $output/keywords.txt | \
local/kws/keywords_to_indices.pl --map-oov 0 $output/words.txt | \
sort -u > $output/keywords.int
fi
if [ $stage -le 2 ] ; then
## this step generates the file hitlist
## in many cases, when the reference hits are given, the following two steps
## are not needed
## we create the alignments of the data directory
## this is only so that we can obtain the hitlist
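## (the hitlist records where each keyword actually occurs in the reference
## transcripts, as recovered from the forced alignments; the search results in
## stage 5 are scored against it)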
steps/align_fmllr.sh --nj 5 --cmd "$cmd" \
$data data/lang exp/tri3b exp/tri3b_ali_$(basename $data)
local/kws/create_hitlist.sh $data $lang data/local/lang_tmp \
exp/tri3b_ali_$(basename $data) $output
fi
if [ $stage -le 3 ] ; then
## this step generates the file keywords.fsts
## compile the keywords (this is done via tmp work dirs, so that you can
## apply keyword filtering and then just run fsts-union to merge the results)
local/kws/compile_keywords.sh $output $lang $output/tmp.2
cp $output/tmp.2/keywords.fsts $output/keywords.fsts
# for example
# fsts-union scp:<(sort data/$dir/kwset_${set}/tmp*/keywords.scp) \
# ark,t:"|gzip -c >data/$dir/kwset_${set}/keywords.fsts.gz"
##
fi
system=exp/chain/tdnn1h_sp_online/decode_tglarge_dev_clean_2/
if [ $stage -le 4 ]; then
## this is not exactly necessary for a single system and single keyword set
## but if you have multiple keyword sets, then it avoids having to recompute
## the indices unnecessarily every time (see the --indices-dir and --skip-indexing
## parameters of the search script below).
for lmwt in `seq 8 14` ; do
steps/make_index.sh --cmd "$cmd" --lmwt $lmwt --acwt 1.0 \
--frame-subsampling-factor 3 \
$output $lang $system $system/kws_indices_$lmwt
done
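## after this loop there is one index directory per LM weight
## ($system/kws_indices_8 ... $system/kws_indices_14); stage 5 below reuses
## them via --indices-dir and --skip-indexing instead of re-indexing the lattices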
fi
if [ $stage -le 5 ]; then
## find the hits, normalize and score
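## (the hits found in the indices are score-normalized and then scored against
## the hitlist from stage 2; the usual diagnostics are TWV-style metrics such
## as ATWV/MTWV, one set per LM weight)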
local/kws/search.sh --cmd "$cmd" --min-lmwt 8 --max-lmwt 14 \
--indices-dir $system/kws_indices --skip-indexing true \
$lang $data $system
fi
echo "Done"