run_cleanup_segmentation.sh
2.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/bin/bash
# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
# 2017 Nagendra Kumar Goel
# 2019 AsusTek Computer Inc. (author: Alex Hung)
# Apache 2.0
# This script demonstrates how to re-segment training data selecting only the
# "good" audio that matches the transcripts.
# The basic idea is to decode with an existing in-domain acoustic model, and a
# biased language model built from the reference, and then work out the
# segmentation from a ctm like file.
# For nnet3 and chain results after cleanup, see the scripts in
# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh
# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets
# [will add these later].
# Strict mode: stop on any failing command, on a failure anywhere inside a
# pipeline, and on expansion of an unset variable.
set -euo pipefail

# Default settings. They are assigned before utils/parse_options.sh is
# sourced; presumably each one can be overridden by a matching command-line
# option -- confirm against utils/parse_options.sh.
stage=0                 # compared with 1, 2 and 3 below to decide which steps run
cleanup_stage=0         # forwarded as --stage to the cleanup script
data=data/train         # original training data directory
cleanup_affix=cleaned   # suffix appended to derived data/experiment names
srcdir=exp/tri5a        # existing model directory used for cleanup and alignment
langdir=data/lang_test
nj=20                   # forwarded as --nj to the cleanup and alignment steps
decode_nj=20            # not referenced in the visible part of this script
decode_num_threads=1    # not referenced in the visible part of this script

# Bring in the command definitions (e.g. $train_cmd), the optional PATH setup
# and the option parser.
source ./cmd.sh
if [[ -f ./path.sh ]]; then
  source ./path.sh
fi
source utils/parse_options.sh

# Names derived from the (possibly overridden) settings above.
cleaned_data="${data}_${cleanup_affix}"      # cleaned / re-segmented data dir
dir="${srcdir}_${cleanup_affix}_work"        # working dir for the cleanup step
cleaned_dir="${srcdir}_${cleanup_affix}"     # model dir trained on cleaned data
# Stage 1: run the actual data cleanup. The cleanup script receives the
# original data dir, the lang dir, the source model dir, a working dir and the
# output (cleaned) data dir, in that order.
# All expansions are quoted so that paths containing whitespace or glob
# characters are passed through as single arguments.
if [ "$stage" -le 1 ]; then
  steps/cleanup/clean_and_segment_data.sh --stage "$cleanup_stage" \
    --nj "$nj" --cmd "$train_cmd" \
    "$data" "$langdir" "$srcdir" "$dir" "$cleaned_data"
fi
# Stage 2: align the cleaned data with the source model; the alignments are
# written to ${srcdir}_ali_${cleanup_affix}.
# All expansions are quoted so that paths containing whitespace or glob
# characters are passed through as single arguments.
if [ "$stage" -le 2 ]; then
  steps/align_fmllr.sh --nj "$nj" --cmd "$train_cmd" \
    "$cleaned_data" "$langdir" "$srcdir" "${srcdir}_ali_${cleanup_affix}"
fi
# Stage 3: train a SAT system on the cleaned data using the alignments from
# ${srcdir}_ali_${cleanup_affix}; the model is written to ${cleaned_dir}.
# 3500 and 100000 are the two leading numeric arguments of train_sat.sh
# (presumably number of tree leaves and total Gaussians -- confirm against
# steps/train_sat.sh).
# All expansions are quoted so that paths containing whitespace or glob
# characters are passed through as single arguments.
if [ "$stage" -le 3 ]; then
  steps/train_sat.sh --cmd "$train_cmd" \
    3500 100000 "$cleaned_data" "$langdir" "${srcdir}_ali_${cleanup_affix}" "${cleaned_dir}"
fi
# Report how the average utterance duration changed after cleanup.

# Generate utt2dur for the cleaned data directory that this run actually
# produced (${cleaned_data} follows the data and cleanup_affix settings)
# rather than a hard-coded data/train_cleaned.
utils/data/get_utt2dur.sh "${cleaned_data}"

# The original data directory is read below as well; create its utt2dur only
# when it is missing so an existing file is left untouched.
if [ ! -f "${data}/utt2dur" ]; then
  utils/data/get_utt2dur.sh "${data}"
fi

# Print the mean of the second column of the given utt2dur file with two
# decimals.
# Arguments: $1 - path to an utt2dur file (second field is the value averaged)
# Outputs:   the average on stdout; a diagnostic on stderr on failure
# Returns:   non-zero when the file is unreadable or has no lines, instead of
#            letting awk abort on a division by zero
avg_dur() {
  awk '{ total += $2 } END { if (NR == 0) exit 1; printf("%.2f", total / NR) }' "$1" || {
    echo "$0: cannot compute average duration from $1 (missing or empty?)" >&2
    return 1
  }
}

ori_avg_dur=$(avg_dur "${data}/utt2dur")
new_avg_dur=$(avg_dur "${cleaned_data}/utt2dur")
echo "average duration was reduced from ${ori_avg_dur}s to ${new_avg_dur}s."
# Example output from one run:
# average duration was reduced from 21.68s to 10.97s.
exit 0;