train_g2p.sh
3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/bin/bash
# Copyright 2014 Johns Hopkins University (Author: Yenda Trmal)
# Copyright 2016 Xiaohui Zhang
# Apache 2.0
# Begin configuration section.
# Command prefix used to launch every g2p.py job; it is given the log file
# path as its first argument.
cmd=run.pl
# Resume point: each training/test step is guarded by a "stage <= N" check.
stage=0
# Total number of training passes (models g2p.model.0 .. g2p.model.<iters-1>).
iters=5
# Forwarded to g2p.py through --encoding.
encoding='utf-8'
# When true and silence_phones is set, lexicon entries consisting of a word
# plus a single silence phone are removed before training.
only_words=true
# a list of silence phones, like data/local/dict/silence_phones.txt
silence_phones=
# End configuration section.

echo "$0 $@" # Print the command line for logging

# Source the path setup if the current directory provides one.
if [ -f ./path.sh ]; then
  . ./path.sh
fi

# Presumably maps --<name> <value> command-line options onto the variables
# above (verify against utils/parse_options.sh); abort if it fails.
. utils/parse_options.sh || exit 1

# Strict mode is switched on only after the helper scripts have been sourced:
# -e aborts on an unhandled failure, -u aborts on an unset variable.
set -eu
# Exactly two positional arguments (<lexicon-in> <work-dir>) must remain at
# this point; otherwise print the usage text and exit with status 1.
if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <lexicon-in> <work-dir>"
  echo " where <lexicon-in> is the training lexicon (one pronunciation per "
  echo " word per line, with lines like 'hello h uh l ow') and"
  echo " <work-dir> is directory where the models will be stored"
  echo "e.g.: train_g2p.sh --silence-phones data/local/dict/silence_phones.txt data/local/dict/lexicon.txt exp/g2p/"
  echo ""
  echo "main options (for others, see top of script file)"
  echo " --iters <int> # How many iterations. Relates to N-gram order"
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo " --silence-phones <silphones-list> # e.g. data/local/dict/silence_phones.txt."
  echo " # A list of silence phones, one or more per line"
  echo " # Relates to --only-words option"
  echo " --only-words (true|false) (default: true) # If true, exclude silence words, i.e."
  echo " # words with 1 phone which is a silence."
  exit 1
fi
# Positional arguments: the training lexicon and the output/work directory.
lexicon=$1
wdir=$2

# Logs of every launched job are written under <work-dir>/log.
mkdir -p "$wdir/log"

if [ ! -f "$lexicon" ]; then
  echo "$0: Training lexicon does not exist." >&2
  exit 1
fi

# Optionally remove words that are mapped to a single silence phone from the lexicon.
if [ "$only_words" = true ] && [ -n "$silence_phones" ]; then
  # Without this check awk's getline would fail silently on a missing file
  # and the lexicon would pass through unfiltered.
  if [ ! -f "$silence_phones" ]; then
    echo "$0: Silence phones list '$silence_phones' does not exist." >&2
    exit 1
  fi
  # BEGIN: load every whitespace-separated token of the silence list into
  # sil[]. Main rule: keep a lexicon line unless it has exactly two fields
  # (word + one phone) and that phone is in sil[].
  awk -v s="$silence_phones" \
    'BEGIN{while((getline<s)>0) {for(i=1;i<=NF;i++) sil[$i]=1;}}
     {if (!(NF == 2 && ($2 in sil))) print;}' "$lexicon" > "$wdir/lexicon_onlywords.txt"
  # Later steps use the filtered copy in place of the original lexicon.
  lexicon=$wdir/lexicon_onlywords.txt
fi
# Make sure Sequitur's g2p.py can be found through PATH before any training
# job is launched; print an installation hint and exit with status 1 if not.
if ! command -v g2p.py >/dev/null 2>&1; then
  echo "Sequitur was not found !" >&2
  # KALDI_ROOT is presumably set by path.sh (not visible here). Fall back to a
  # placeholder so that "set -u" cannot abort this hint with an
  # unbound-variable error when it is unset.
  echo "Go to ${KALDI_ROOT:-<KALDI_ROOT>}/tools and execute extras/install_sequitur.sh" >&2
  exit 1
fi
# Train the G2P models, publish the last one as g2p.model.final and test it.
#
# Reads:  iters, stage, cmd, encoding, lexicon, wdir.
# Writes: $wdir/g2p.model.0 .. $wdir/g2p.model.<iters-1>, the symlink
#         $wdir/g2p.model.final, and one log per job under $wdir/log/.
# Exits with status 1 if iters is invalid or the final symlink cannot be made.

# At least the initial pass must run; anything else would leave
# g2p.model.final pointing at a model that is never written.
if ! [ "$iters" -ge 1 ] 2>/dev/null; then
  echo "$0: --iters must be an integer >= 1 (got '$iters')" >&2
  exit 1
fi

echo "Training the G2P model (iter 0)"
if [ "$stage" -le 0 ]; then
  # $cmd is deliberately unquoted: per the usage text it may carry launcher
  # options (e.g. "utils/queue.pl <queue opts>") that must be word-split.
  $cmd "$wdir/log/g2p.0.log" \
    g2p.py -S --encoding "$encoding" --train "$lexicon" --devel 5% --write-model "$wdir/g2p.model.0"
fi

# Every further pass starts from the previous model (--model ... --ramp-up)
# and writes the next one. The stage guards are the same as before.
for ((prev = 0; prev <= iters - 2; prev++)); do
  next=$((prev + 1))
  echo "Training the G2P model (iter $next )"
  if [ "$stage" -le "$prev" ]; then
    $cmd "$wdir/log/g2p.$next.log" \
      g2p.py -S --encoding "$encoding" --model "$wdir/g2p.model.$prev" --ramp-up --train "$lexicon" --devel 5% --write-model "$wdir/g2p.model.$next"
  fi
done

# Chain cd and ln with && so that ln never runs in the wrong directory:
# "set -e" has no effect inside a subshell whose status is negated with "!".
if ! (cd "$wdir" && ln -sf "g2p.model.$((iters - 1))" g2p.model.final); then
  echo "Problem finalizing training... " >&2
  exit 1
fi

# The guard uses iters directly instead of the loop counter, which is never
# assigned when iters is 1 and would trip "set -u".
if [ "$stage" -le "$iters" ]; then
  echo "Running test..."
  $cmd "$wdir/log/test.log" \
    g2p.py --encoding "$encoding" --model "$wdir/g2p.model.final" --test "$lexicon"
fi