wsj_prepare_beep_dict.sh
3.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/bin/bash
# Copyright 2013 MERL (author: Felix Weninger)
# Contains some code by Microsoft Corporation, Johns Hopkins University (author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# run this from ../
# Output directory for every dictionary artefact produced by this script.
dir=data/local/dict
mkdir -p "$dir" || exit 1

# Get BEEP dictionary.
BEEP_URL="https://www.dropbox.com/s/skiemso2ohw1ood/beep-1.0.tar.gz"
x=$(basename "$BEEP_URL")
mkdir -p "$dir/beep" || exit 1
# The tarball under its final name doubles as the "already downloaded and
# unpacked" marker, so it must only appear once both steps have succeeded.
# wget -O creates its output file even when the transfer fails; downloading
# straight to the final name would leave a truncated archive behind and every
# later run would skip both the download and the extraction. Work on a
# temporary name and rename it at the very end instead.
if [ ! -e "$dir/beep/$x" ]; then
  beep_tmp="$dir/beep/$x.part"
  wget "$BEEP_URL" -O "$beep_tmp" || { rm -f "$beep_tmp"; exit 1; }
  tar zxvf "$beep_tmp" -C "$dir" || { rm -f "$beep_tmp"; exit 1; }
  mv "$beep_tmp" "$dir/beep/$x" || exit 1
fi
# (1) Get the CMU dictionary.
# Nothing is fetched when an svn working copy already exists in $dir/cmudict.
# Passing "-r 10966" to svn co can be added for strict compatibility.
[ -d "$dir/cmudict/.svn" ] || {
  svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict "$dir/cmudict" \
    || exit 1
}
# Merge the BEEP and CMU pronunciations into $dir/beep_cmu.dict.
# The intent is that BEEP pronunciations take precedence; the merge policy
# itself is implemented in local/merge_dict.pl.
# Abort on failure: every later step reads beep_cmu.dict, and continuing with a
# missing or partial file would yield an empty phone set and lexicon.
perl local/merge_dict.pl "$dir/beep/beep-1.0" "$dir/cmudict/cmudict.0.7a" \
  "$dir/beep_cmu.dict" || exit 1
#(2) Dictionary preparation:
# Start the phone inventory with the silence and noise phones. No
# word-position suffixes are attached to the phone names at this point.
# silence_phones.txt: one silence/noise phone per line.
printf '%s\n' SIL SPN NSN > "$dir/silence_phones.txt"
# optional_silence.txt: the single phone SIL.
printf '%s\n' SIL > "$dir/optional_silence.txt"
# Obtain the list of non-silence phones from the merged dictionary.
# Every field after the first on a non-comment line is treated as a phone; the
# distinct phones are sorted, upper-cased and written one per line.
# NOTE: the grep discards every line containing "sil" in any letter case, so
# entries whose word merely contains those letters are skipped here as well.
grep -vi sil "$dir/beep_cmu.dict" \
  | perl -e '
      my %seen;
      while (my $line = <STDIN>) {
        next if $line =~ /^#/;
        my ($word, @phones) = split " ", $line;
        $seen{$_} = 1 for @phones;
      }
      print join("\n", map { uc } sort keys %seen), "\n";
    ' > "$dir/nonsilence_phones.txt" || exit 1
# nonsilence phones; on each line is a list of phones that correspond
# really to the same base phone.
#cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
# perl -e 'while(<>){
# chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
# $phones_of{$1} .= "$_ "; }
# foreach $list (values %phones_of) {print $list . "\n"; } ' \
# > $dir/nonsilence_phones.txt || exit 1;
# A few extra questions that will be added to those obtained by automatically
# clustering the "real" phones. These ask about stress; there is also one for
# silence.
#
# The phone lists are read directly rather than through "cat file |": in a
# pipeline only the last command's status is tested, so a missing or unreadable
# list used to go unnoticed and produced a question file with blank content.

# Silence question: the first field of every silence phone line, joined on a
# single line.
awk '{printf("%s ", $1);} END{printf "\n";}' "$dir/silence_phones.txt" \
  > "$dir/extra_questions.txt" || exit 1

# Stress questions: phones are grouped by their trailing digits (an empty
# suffix forms its own group) and each group is appended as one line. A phone
# that does not look like <non-digits><optional digits> aborts with an error.
perl -e '
  while (<>) {
    foreach $p (split(" ", $_)) {
      $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_";
      $q{$2} .= "$p ";
    }
  }
  foreach $l (values %q) { print "$l\n"; }
' < "$dir/nonsilence_phones.txt" >> "$dir/extra_questions.txt" || exit 1
#exit
# Strip comment lines (those starting with '#') from the merged dictionary.
# grep exits non-zero both on a read error and when no line survives the
# filter; in either case there is no usable lexicon, so stop here instead of
# reporting success further down.
grep -v "^#" "$dir/beep_cmu.dict" > "$dir/lexicon1_raw_nosil.txt" || exit 1
#grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
# perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
# > $dir/lexicon1_raw_nosil.txt || exit 1;

# Add the silences, noises etc. in front of the dictionary entries.
{
  printf '%s\n' '!SIL SIL' '<SPOKEN_NOISE> SPN' '<UNK> SPN' '<NOISE> NSN'
  cat "$dir/lexicon1_raw_nosil.txt"
} > "$dir/lexicon2_raw.txt" || exit 1

# lexicon.txt is without the _B, _E, _S, _I markers.
# Check the copy so that the success message is only printed when the final
# lexicon has really been written.
cp "$dir/lexicon2_raw.txt" "$dir/lexicon.txt" || exit 1
echo "Dictionary preparation succeeded"