resegment_data.sh
4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2013. Apache 2.0.
# This script segments speech data based on some kind of decoding of
# whole recordings (e.g. whole conversation sides. See
# egs/swbd/s5b/local/run_resegment.sh for an example of usage.
# You'll probably want to use the script resegment_text.sh
# begin configuration section.
stage=0
cmd=run.pl
cleanup=true
segmentation_opts= # E.g. set this as --segmentation-opts "--silence-proportion 0.2 --max-segment-length 10"
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 5 ]; then
echo "Usage: $0 [options] <in-data-dir> <lang> <decode-dir|ali-dir> <out-data-dir> <temp/log-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --stage (0|1|2) # start scoring script from part-way through."
echo " --segmentation-opts '--opt1 opt1val --opt2 opt2val' # options for segmentation.pl"
echo "e.g.:"
echo "$0 data/train_unseg exp/tri3b/decode_train_unseg data/train_seg exp/tri3b_resegment"
exit 1;
fi
data=$1
lang=$2
alidir=$3 # may actually be decode-dir.
data_out=$4
dir=$5
mkdir -p $data_out || exit 1;
rm $data_out/* 2>/dev/null # Old stuff that's partial can cause problems later if
# we call fix_data_dir.sh; it will cause things to be
# thrown out.
mkdir -p $dir/log || exit 1;
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $alidir/phones.txt $dir || exit 1;
for f in $data/feats.scp $lang/phones.txt $alidir/ali.1.gz $alidir/num_jobs; do
if [ ! -f $f ]; then
echo "$0: no such file $f"
exit 1;
fi
done
if [ -f $alidir/final.mdl ]; then
model=$alidir/final.mdl
else
if [ ! -f $alidir/../final.mdl ]; then
echo "$0: found no model in $alidir/final.mdl or $alidir/../final.mdl"
exit 1;
fi
model=$alidir/../final.mdl
fi
# get lists of sil,noise,nonsil phones
# convert *.ali.gz to *.ali.gz with 0,1,2.
# run perl script..
# output segments?
if ! [ `cat $lang/phones/optional_silence.txt | wc -w` -eq 1 ]; then
echo "Error: this script only works if $lang/phones/optional_silence.txt contains exactly one entry.";
echo "You'd have to modify the script to handle other cases."
exit 1;
fi
silphone=`cat $lang/phones/optional_silence.txt`
# silphone will typically be "sil" or "SIL".
# 3 sets of phones: 0 is silence, 1 is noise, 2 is speech.,
(
echo "$silphone 0"
grep -v -w $silphone $lang/phones/silence.txt | awk '{print $1, 1;}'
cat $lang/phones/nonsilence.txt | awk '{print $1, 2;}'
) > $dir/phone_map.txt
nj=`cat $alidir/num_jobs` || exit 1;
if [ $stage -le 0 ]; then
$cmd JOB=1:$nj $dir/log/resegment.JOB.log \
ali-to-phones --per-frame=true "$model" "ark:gunzip -c $alidir/ali.JOB.gz|" ark,t:- \| \
utils/int2sym.pl -f 2- $lang/phones.txt \| \
utils/apply_map.pl -f 2- $dir/phone_map.txt \| \
utils/segmentation.pl $segmentation_opts \| \
gzip -c '>' $dir/segments.JOB.gz
fi
if [ $stage -le 1 ]; then
if [ -f $data/reco2file_and_channel ]; then
cp $data/reco2file_and_channel $data_out/reco2file_and_channel
fi
if [ -f $data/wav.scp ]; then
cp $data/wav.scp $data_out/wav.scp
else
echo "Expected file $data/wav.scp to exist" # or there is really nothing to copy.
exit 1
fi
for f in glm stm; do
if [ -f $data/$f ]; then
cp $data/$f $data_out/$f
fi
done
for n in `seq $nj`; do gunzip -c $dir/segments.$n.gz; done | \
sort > $data_out/segments || exit 1;
[ ! -s $data_out/segments ] && echo "No data produced" && exit 1;
# We'll make the speaker-ids be the same as the recording-ids (e.g. conversation
# sides). This will normally be OK for telephone data.
cat $data_out/segments | awk '{print $1, $2}' > $data_out/utt2spk || exit 1
utils/utt2spk_to_spk2utt.pl $data_out/utt2spk > $data_out/spk2utt || exit 1
if $cleanup; then
rm $dir/segments.*.gz
fi
fi
cat $data_out/segments | awk '{num_secs += $4 - $3;} END{print "Number of hours of data is " (num_secs/3600);}'