train_g2p_phonetisaurus.sh
3.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/bin/bash
# Copyright 2017 Intellisist, Inc. (Author: Navneeth K)
# 2017 Xiaohui Zhang
# 2018 Ruizhe Huang
# Apache License 2.0
# This script trains a g2p model using Phonetisaurus.
# Default values of the script's options. They are assigned before
# utils/parse_options.sh is sourced (presumably that helper overrides them
# from --<name> command-line flags -- confirm against parse_options.sh).
#   silence_phones : file with silence phones; empty by default.
#   only_words     : true/false, evaluated as a command in stage 0.
#   encoding       : character encoding handed to uconv in stage 0.
#   stage          : first stage to run; stage N runs when stage <= N.
silence_phones=
only_words=true
encoding='utf-8'
stage=0

# Log the command line exactly as the script was invoked.
echo "$0 $*"

# path.sh is optional: it is sourced only when it exists in the current directory.
if [ -f ./path.sh ]; then
  . ./path.sh
fi

. utils/parse_options.sh || exit 1

# From this point on, expanding an unset variable is an error (-u) and an
# unhandled failing command terminates the script (-e).
set -eu
# Exactly two positional arguments are required here (<lexicon-in> and
# <work-dir>). With any other count, print the help text to stdout and exit
# with status 1.
if [ "$#" -ne 2 ]; then
  # One printf argument per output line; the empty argument yields the blank line.
  printf '%s\n' \
    "Usage: $0 [options] <lexicon-in> <work-dir>" \
    " where <lexicon-in> is the training lexicon (one pronunciation per " \
    " word per line, with lines like 'hello h uh l ow') and" \
    " <work-dir> is directory where the models will be stored" \
    "e.g.: $0 --silence-phones data/local/dict/silence_phones.txt data/local/dict/lexicon.txt exp/g2p/" \
    "" \
    "main options (for others, see top of script file)" \
    " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." \
    " --silence-phones <silphones-list> # e.g. data/local/dict/silence_phones.txt." \
    " # A list of silence phones, one or more per line" \
    " # Relates to --only-words option" \
    " --only-words (true|false) (default: true) # If true, exclude silence words, i.e." \
    " # words with one or multiple phones which are all silence."
  exit 1
fi
# Positional arguments: the training lexicon and the directory that receives
# every file produced by the stages below.
lexicon=$1
wdir=$2

# Stop with a failure status when the lexicon is not an existing regular file.
# A bare `exit` would inherit the status of the preceding echo (0) and make
# the script report success even though nothing was trained.
if [ ! -f "$lexicon" ]; then
  echo "Cannot find $lexicon"
  exit 1
fi
# uconv is used by stage 0. The lookup is done inside the `if` condition so
# that a failed lookup is handled here: as a plain assignment, a failing
# command substitution would terminate the script under `set -e` before the
# explanatory message could be printed.
if ! isuconv=$(command -v uconv); then
  echo "uconv was not found. You must install the icu4c package."
  exit 1;
fi

# phonetisaurus-apply is looked up to check for a Phonetisaurus installation.
# KALDI_ROOT may be unset; because `set -u` is active a bare $KALDI_ROOT would
# abort with "unbound variable" and hide the hint, so fall back to printing
# the literal variable name.
if ! phonetisaurus=$(command -v phonetisaurus-apply); then
  echo "Phonetisarus was not found !"
  echo "Go to ${KALDI_ROOT:-\$KALDI_ROOT}/tools and execute extras/install_phonetisaurus.sh"
  exit 1
fi

# Create the work directory (and any missing parents); quoted so that a path
# containing whitespace stays a single argument.
mkdir -p "$wdir"
# Reads whitespace-separated lexicon lines on stdin and writes them to stdout
# as "<word><TAB><phone> <phone> ...": the first field, a tab, then the
# remaining fields joined by single spaces.
# Lines with fewer than two fields (blank lines, words without pronunciation)
# are dropped; otherwise a lone word would be written as its own pronunciation.
lexicon_to_tab_separated() {
  awk 'NF >= 2 {printf("%s\t",$1); for (i=2;i<NF;i++){printf("%s ",$i);} printf("%s\n",$NF);}'
}

# Stage 0: write $wdir/lexicon_tab_separated.txt from the input lexicon.
# The text is passed through uconv (from/to $encoding) with the Any-NFC
# transform, and lines that end up empty are discarded.
# NOTE(review): what uconv does with input that is not valid $encoding
# depends on its callback defaults -- confirm before relying on it as a filter.
# When only_words is true and a silence-phones file is given, entries whose
# whole pronunciation equals the first field of a line of that file are
# removed first (only the first field of each silence-phones line is used).
if [ "$stage" -le 0 ]; then
  if $only_words && [ -n "$silence_phones" ]; then
    # Without this check a missing file would only produce an awk warning and
    # the silence filter would be skipped without the run failing.
    if [ ! -f "$silence_phones" ]; then
      echo "Cannot find $silence_phones"
      exit 1
    fi
    # FILENAME == ARGV[1] (rather than NR == FNR) identifies the silence file
    # reliably: with an empty silence file NR == FNR is also true for every
    # lexicon line, which would swallow the entire lexicon.
    awk '
      FILENAME == ARGV[1] { sil[$1] = 1; next }
      {
        pron = $2
        for (i = 3; i <= NF; i++) pron = pron " " $i
        if (!(pron in sil)) print $1 " " pron
      }' "$silence_phones" "$lexicon" | \
      lexicon_to_tab_separated | \
      uconv -f "$encoding" -t "$encoding" -x Any-NFC - | awk 'NF > 0' > "$wdir/lexicon_tab_separated.txt"
  else
    lexicon_to_tab_separated < "$lexicon" | \
      uconv -f "$encoding" -t "$encoding" -x Any-NFC - | awk 'NF > 0' > "$wdir/lexicon_tab_separated.txt"
  fi
fi
# Stage 1: align the lexicon. Reads $wdir/lexicon_tab_separated.txt (first
# column tab separated) and writes $wdir/aligned_lexicon.corpus.
# The paths are quoted so that a work directory containing whitespace or glob
# characters is passed to the tool as a single argument.
if [ "$stage" -le 1 ]; then
  phonetisaurus-align --input="$wdir/lexicon_tab_separated.txt" --ofile="$wdir/aligned_lexicon.corpus" || exit 1;
fi
# Stage 2: convert the aligned lexicon to an ARPA language model of n-gram
# order 7 using make_kn_lm.py, a re-implementation of srilm's ngram-count
# functionality. Reads $wdir/aligned_lexicon.corpus and writes
# $wdir/aligned_lexicon.arpa.
# Paths are quoted against word splitting; the explicit `|| exit 1` mirrors
# the error handling of the alignment stage.
if [ "$stage" -le 2 ]; then
  ./utils/lang/make_kn_lm.py -ngram-order 7 -text "$wdir/aligned_lexicon.corpus" -lm "$wdir/aligned_lexicon.arpa" || exit 1
fi
# Stage 3: convert the ARPA file to an FST. Reads $wdir/aligned_lexicon.arpa
# and writes the final model to $wdir/model.fst.
# Paths are quoted against word splitting; the explicit `|| exit 1` mirrors
# the error handling of the alignment stage.
if [ "$stage" -le 3 ]; then
  phonetisaurus-arpa2wfst --lm="$wdir/aligned_lexicon.arpa" --ofile="$wdir/model.fst" || exit 1
fi