run-2-segmentation.sh
4.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/bin/bash
# Copyright 2014 Vimal Manohar, Johns Hopkins University (Author: Jan Trmal)
# Apache 2.0
#Begin configuration section
silence_segment_fraction=1.0 # What fraction of segment we should keep
#end configuration section
# This is not necessarily the top-level run.sh as it is in other directories. see README.txt first.
[ ! -f ./lang.conf ] && echo 'Language configuration does not exist! Use the configurations in conf/lang/* as a startup' && exit 1
[ ! -f ./conf/common_vars.sh ] && echo 'the file conf/common_vars.sh does not exist!' && exit 1
. conf/common_vars.sh || exit 1;
. ./lang.conf || exit 1;
[ -f local.conf ] && . ./local.conf
. ./utils/parse_options.sh
set -e #Exit on non-zero return code from any command
set -o pipefail #Exit if any of the commands in the pipeline will
#return non-zero return code
set -u #Fail on an undefined variable
#Later in the script we assume the run-1-main.sh was run (because we are using exp/tri4)
#So let's make it mandatory, instead of doing the work on our own.
[ ! -f data/raw_train_data/.done ] && echo "The source training data directory is not ready. Use the run-1-main.sh script to prepare it!" && exit 1
nj_max=`cat $train_data_list | wc -l`
if [[ "$nj_max" -lt "$train_nj" ]] ; then
echo "The maximum reasonable number of jobs is $nj_max (you have $train_nj)! (The training and decoding process has file-granularity)"
exit 1;
train_nj=$nj_max
fi
train_data_dir=`utils/make_absolute.sh ./data/raw_train_data`
if [ ! -f data/train_seg/.done ]; then
mkdir -p data/train_seg
echo ---------------------------------------------------------------------
echo "Preparing acoustic training lists in data/train on" `date`
echo ---------------------------------------------------------------------
local/prepare_acoustic_training_data.pl --get-whole-transcripts "true" \
--vocab data/local/lexicon.txt --fragmentMarkers \-\*\~ \
$train_data_dir data/train_seg > data/train_seg/skipped_utts.log
mv data/train_seg/text data/train_seg/text_orig
num_silence_segments=$(cat data/train_seg/text_orig | awk '{if (NF == 2 && $2 == "<silence>") {print $0}}' | wc -l)
num_keep_silence_segments=`perl -e "printf '%d', ($num_silence_segments * $silence_segment_fraction)"`
if [ $num_silence_segments -eq $num_keep_silence_segments ]; then
# Keep all segments including silence segments
cat data/train_seg/text_orig | awk '{if (NF == 2 && $2 == "<silence>") {print $1} else {print $0}}' > data/train_seg/text
else
# Keep only a fraction of silence segments
cat data/train_seg/text_orig \
| awk 'BEGIN{i=0} \
{ \
if (NF == 2 && $2 == "<silence>") { \
if (i<'$num_keep_silence_segments') { \
print $1; \
i++; \
} \
} else {print $0}\
}' > data/train_seg/text
fi
#rm data/train_seg/text_orig
utils/fix_data_dir.sh data/train_seg
echo ---------------------------------------------------------------------
echo "Starting plp feature extraction for data/train_seg in plp on" `date`
echo ---------------------------------------------------------------------
if [ ! -f data/train_seg/.plp.done ]; then
if $use_pitch; then
steps/make_plp_pitch.sh --cmd "$train_cmd" --nj $train_nj \
data/train_seg exp/make_plp_pitch/train_seg plp
else
steps/make_plp.sh --cmd "$train_cmd" --nj $train_nj \
data/train_seg exp/make_plp/train_seg plp
fi
utils/fix_data_dir.sh data/train_seg
steps/compute_cmvn_stats.sh data/train_seg exp/make_plp/train_seg plp
utils/fix_data_dir.sh data/train_seg
touch data/train_seg/.plp.done
fi
touch data/train_seg/.done
fi
echo ---------------------------------------------------------------------
echo "Training segmentation model in exp/tri4b_seg"
echo ---------------------------------------------------------------------
local/resegment/train_segmentation.sh \
--boost-sil 1.0 --nj $train_nj --cmd "$decode_cmd" \
exp/tri4 data/train_seg data/lang exp/tri4b_seg || exit 1
echo ---------------------------------------------------------------------
echo "Finished successfully on" `date`
echo ---------------------------------------------------------------------
exit 0