prepare_dict.sh
2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/bin/bash
# Copyright 2012 Vassil Panayotov
# 2017 Ewald Enzinger
# Apache 2.0
# Adapted from egs/voxforge/s5/local/voxforge_prepare_dict.sh (commit acb5439bf97a39398d5eeb926a2a5cfa71b5f72a)
# Load the recipe environment from path.sh; stop right away if that fails.
# (KALDI_ROOT, used further down, is not set in this file -- presumably
# path.sh provides it; confirm.)
if ! . path.sh; then
  exit 1
fi

# Directory layout: general local data and, below it, the dictionary files.
locdata=data/local
locdict=${locdata}/dict

echo "=== Preparing the dictionary ..."
# Check out the CMU dictionary with Subversion unless the cmudict.0.7a file
# is already present in the local checkout. A failed checkout aborts the
# script.
if ! [ -f "$locdict/cmudict/cmudict.0.7a" ]; then
  echo "--- Downloading CMU dictionary ..."
  mkdir -p "$locdict"
  if ! svn co http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict "$locdict/cmudict"; then
    exit 1
  fi
fi
# Produce $locdict/cmudict-plain.txt, a lower-cased copy of cmudict:
#   1. make_baseform.pl (from the cmudict checkout) is run on cmudict.0.7a
#      with /dev/stdout as its output file;
#   2. sed removes a parenthesised numeric variant marker such as "(2)" that
#      directly follows the headword, keeping the headword, the separating
#      whitespace and the rest of the line;
#   3. tr folds every upper-case ASCII letter (words and phones) to lower case.
#
# The pattern uses POSIX character classes and \{1,\} intervals rather than
# the GNU-only \s and \+ escapes. In particular, "\s" inside a bracket
# expression does not mean whitespace: it is the two literal characters "\"
# and "s", so the earlier pattern [^\s(] could never match a headword that
# contains the letter "s".
echo "--- Striping stress and pronunciation variant markers from cmudict ..."
perl "$locdict/cmudict/scripts/make_baseform.pl" \
  "$locdict/cmudict/cmudict.0.7a" /dev/stdout \
  | sed -e 's:^\([^[:space:](]\{1,\}\)([0-9]\{1,\})\([[:space:]]\{1,\}\)\(.*\):\1\2\3:' \
  | tr 'A-Z' 'a-z' > "$locdict/cmudict-plain.txt"
# Split the vocabulary against the plain cmudict:
#   vocab-oov.txt  - lines of vocab-full.txt whose first field has no entry
#                    in cmudict-plain.txt (out-of-vocabulary words);
#   lexicon-iv.txt - lines of cmudict-plain.txt whose first field occurs in
#                    vocab-full.txt (in-vocabulary pronunciations).
# Lines matching '<.?s>' (e.g. the <s> and </s> tokens) are dropped from both.
echo "--- Searching for OOV words ..."

# Without the vocabulary both awk passes would quietly yield empty lists,
# so stop here with a clear message instead.
if [ ! -f "$locdata/vocab-full.txt" ]; then
  echo "Vocabulary file $locdata/vocab-full.txt not found!"
  exit 1
fi

# awk idiom: while NR==FNR we are still reading the first file, whose first
# fields are collected as keys of "words"; the second file is then filtered.
# 'grep -E' replaces the obsolescent 'egrep' wrapper.
awk 'NR==FNR{words[$1]; next;} !($1 in words)' \
  "$locdict/cmudict-plain.txt" "$locdata/vocab-full.txt" \
  | grep -E -v '<.?s>' > "$locdict/vocab-oov.txt"

awk 'NR==FNR{words[$1]; next;} ($1 in words)' \
  "$locdata/vocab-full.txt" "$locdict/cmudict-plain.txt" \
  | grep -E -v '<.?s>' > "$locdict/lexicon-iv.txt"

# Report how many entries ended up in each list.
wc -l "$locdict/vocab-oov.txt"
wc -l "$locdict/lexicon-iv.txt"
# Make sure a Sequitur G2P model exists at conf/g2p_model, downloading the
# pre-trained one when no non-empty model file is present.
#
# "wget -O FILE" creates FILE before any data arrives, so a failed transfer
# leaves an empty file behind. A plain existence test would therefore accept
# a broken download (and skip the download on every later run). Hence:
#   - the file must be non-empty (-s), not merely present;
#   - the wget exit status is checked;
#   - a partial/empty file is removed so the next run retries.
if [ ! -s conf/g2p_model ]; then
  echo "--- Downloading a pre-trained Sequitur G2P model ..."
  mkdir -p conf
  if ! wget http://sourceforge.net/projects/kaldi/files/sequitur-model4 -O conf/g2p_model \
    || [ ! -s conf/g2p_model ]; then
    rm -f conf/g2p_model
    echo "Failed to download the g2p model!"
    exit 1
  fi
fi
# On macOS ("Darwin") the 'greadlink' command is required: abort with a hint
# when it is missing, otherwise alias 'readlink' to it.
# NOTE(review): bash expands aliases in non-interactive shells only when the
# expand_aliases option is set, and aliases are not passed on to child
# processes -- confirm this alias actually has the intended effect.
if [ "$(uname)" = "Darwin" ]; then
  if ! command -v greadlink >/dev/null 2>&1; then
    echo "Mac OS X detected and 'greadlink' not found - please install using macports or homebrew"
    exit 1
  fi
  alias readlink=greadlink
fi
# Make the Sequitur G2P installation under $KALDI_ROOT/tools reachable:
# its bin/ directory goes on PATH and its site-packages directory (made
# absolute by utils/make_absolute.sh) goes on PYTHONPATH. Afterwards verify
# that g2p.py can actually be found.
sequitur=$KALDI_ROOT/tools/sequitur-g2p
export PATH=$PATH:$sequitur/bin
# ${PYTHONPATH:+...} avoids a leading ':' (i.e. an empty path entry) when
# PYTHONPATH was unset or empty. The python* glob is intentionally unquoted.
export PYTHONPATH=${PYTHONPATH:+$PYTHONPATH:}$(utils/make_absolute.sh "$sequitur"/lib/python*/site-packages)
# 'command -v' is a shell builtin, so the lookup does not depend on the
# external 'which' program being installed.
if ! g2p=$(command -v g2p.py); then
  echo "The Sequitur was not found !"
  echo "Go to $KALDI_ROOT/tools and execute extras/install_sequitur.sh"
  exit 1
fi
# Apply the G2P model to the OOV word list, then concatenate the generated
# entries with the in-vocabulary ones and sort them into lexicon.txt.
echo "--- Preparing pronunciations for OOV words ..."
g2p.py --model=conf/g2p_model --apply "$locdict/vocab-oov.txt" \
  > "$locdict/lexicon-oov.txt"
cat "$locdict/lexicon-oov.txt" "$locdict/lexicon-iv.txt" \
  | sort > "$locdict/lexicon.txt"
# Remove any lexiconp.txt in the dictionary directory; it is fine (and
# silent) if there is none.
rm "$locdict/lexiconp.txt" 2>/dev/null || :
echo "--- Prepare phone lists ..."
# SIL is written as the single silence phone and as the optional silence.
printf 'SIL\n' > "$locdict/silence_phones.txt"
printf 'SIL\n' > "$locdict/optional_silence.txt"
# Non-silence phones: skip lexicon lines containing the word "sil", collect
# every distinct token from field 2 onwards, and write them out sorted.
grep -vw sil "$locdict/lexicon.txt" \
  | awk '{ for (i = 2; i <= NF; i++) seen[$i] = 1 }
         END { for (ph in seen) print ph }' \
  | sort > "$locdict/nonsilence_phones.txt"
echo "--- Adding <unk> to the lexicon ..."
# Append the <unk> entry, tab-separated from its SIL pronunciation.
printf '<unk>\tSIL\n' >> "$locdict/lexicon.txt"
# Downstream scripts may look for this file, so create it even if it stays empty.
touch "$locdict/extra_questions.txt"
echo "*** Dictionary preparation finished!"