gp_dict_prep.sh
4.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/bin/bash -u
# Copyright 2012 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
set -o errexit
function error_exit () {
printf "$@\n" >&2; exit 1;
}
function read_dirname () {
[ -d "$1" ] || error_exit "Argument '$1' not a directory";
local retval=`cd $1 2>/dev/null && pwd || exit 1`
echo $retval
}
. ./path.sh # Sets the PATH to contain necessary executables
# Begin configuration section.
config_dir=conf # if true, use SRILM to change the LM vocab
map_dir=
# end configuration sections
help_message="Usage: "`basename $0`" [options] GP-dir LC [LC ... ]
where GP-dir is the directory containing the GlobalPhone corpus, and
LC is a 2-letter code for GlobalPhone languages (e.g. RU for Russian).\n
options:
--help # print this message and exit
--config-dir DIR # directory to find config files (default: $config_dir)
--map-dir DIR # directory to find phone mappings (default: '$map_dir')
";
. utils/parse_options.sh
if [ $# -lt 2 ]; then
printf "$help_message\n"; exit 1;
fi
GPDIR=`read_dirname $1`; shift;
LANGUAGES=
while [ $# -gt 0 ]; do
case "$1" in
??) LANGUAGES=$LANGUAGES" $1"; shift ;;
*) echo "Unknown argument: $1, exiting"; error_exit "$help_message" ;;
esac
done
# (1) check if the config files are in place:
pushd $config_dir > /dev/null
[ -f dev_spk.list ] || error_exit "$PROG: Dev-set speaker list not found.";
[ -f eval_spk.list ] || error_exit "$PROG: Eval-set speaker list not found.";
[ -f lang_codes.txt ] || error_exit "$PROG: Mapping for language name to 2-letter code not found.";
popd > /dev/null
[ -f path.sh ] && . ./path.sh # Sets the PATH to contain necessary executables
# (1) Normalize the dictionary
for L in $LANGUAGES; do
printf "Language - ${L}: preparing pronunciation lexicon ... "
mkdir -p data/$L/local/dict
full_name=`awk '/'$L'/ {print $2}' $config_dir/lang_codes.txt`;
pron_lex=$GPDIR/Dictionaries/${L}/${full_name}-GPDict.txt
if [ ! -f "$pron_lex" ]; then
pron_lex=$GPDIR/Dictionaries/${L}/${full_name}GP.dict # Polish & Bulgarian
[ -f "$pron_lex" ] || { echo "Error: no dictionary found for $L"; exit 1; }
fi
if [ ! -z "$map_dir" ]; then # map the phones to a different phoneset
if [ -f "$map_dir/$full_name" ]; then # found the mapping file
local/gp_norm_dict_${L}.pl -i "$pron_lex" -m "$map_dir/$full_name" \
| sort -u > data/$L/local/dict/lexicon_nosil.txt
else
echo "No phone mapping '$map_dir/$full_name': keeping original phoneset";
local/gp_norm_dict_${L}.pl -i "$pron_lex" | sort -u \
> data/$L/local/dict/lexicon_nosil.txt
fi
else
local/gp_norm_dict_${L}.pl -i "$pron_lex" | sort -u \
> data/$L/local/dict/lexicon_nosil.txt
fi
(printf '!SIL\tsil\n<unk>\tspn\n';) \
| cat - data/$L/local/dict/lexicon_nosil.txt \
> data/$L/local/dict/lexicon.txt;
echo "Done"
printf "Language - ${L}: extracting phone lists ... "
# silence phones, one per line.
{ echo sil; echo spn; } > data/$L/local/dict/silence_phones.txt
echo sil > data/$L/local/dict/optional_silence.txt
cut -f2- data/$L/local/dict/lexicon_nosil.txt | tr ' ' '\n' | sort -u \
> data/$L/local/dict/nonsilence_phones.txt
# Ask questions about the entire set of 'silence' and 'non-silence' phones.
# These augment the questions obtained automatically by clustering.
( tr '\n' ' ' < data/$L/local/dict/silence_phones.txt; echo;
tr '\n' ' ' < data/$L/local/dict/nonsilence_phones.txt; echo;
) > data/$L/local/dict/extra_questions.txt
echo "Done"
done
echo "Finished dictionary preparation."