Blame view
egs/gp/s5/local/gp_dict_prep.sh
4.06 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
#!/bin/bash

# Copyright 2012  Arnab Ghoshal

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Prepares the pronunciation dictionary files for one or more GlobalPhone
# languages. For every 2-letter language code LC given on the command line
# the following files are written to data/LC/local/dict:
#   lexicon_nosil.txt      normalized lexicon (output of gp_norm_dict_LC.pl)
#   lexicon.txt            lexicon_nosil.txt preceded by the !SIL and <unk> entries
#   silence_phones.txt     sil and spn, one per line
#   optional_silence.txt   sil
#   nonsilence_phones.txt  phones used in lexicon_nosil.txt, one per line
#   extra_questions.txt    one line with all silence phones, one line with all
#                          non-silence phones

# Options live in 'set' rather than on the shebang line so that they are also
# honoured when the script is started as "bash gp_dict_prep.sh".
set -o errexit -o nounset

# Prints its arguments as one message on stderr and exits with status 1.
# The message is passed as data, never as the printf format string.
function error_exit () {
  printf '%s\n' "$*" >&2
  exit 1
}

# Prints the absolute path of the directory given as $1.
# Exits with status 1 if $1 is not a directory or cannot be entered.
function read_dirname () {
  [ -d "$1" ] || error_exit "Argument '$1' not a directory"
  # Declaration and assignment are separate so that a failing 'cd' is not
  # masked by the exit status of 'local'.
  local retval
  retval=$(cd "$1" 2>/dev/null && pwd) || exit 1
  echo "$retval"
}

. ./path.sh  # Sets the PATH to contain necessary executables

# Name used as prefix in diagnostics and in the usage text.
PROG=$(basename "$0")

# Begin configuration section.
config_dir=conf  # directory with dev_spk.list, eval_spk.list and lang_codes.txt
map_dir=         # directory with per-language phone mapping files (optional)
# end configuration sections

help_message="Usage: $PROG [options] GP-dir LC [LC ... ]
where GP-dir is the directory containing the GlobalPhone corpus,
and LC is a 2-letter code for GlobalPhone languages (e.g. RU for Russian).
options:
  --help                 # print this message and exit
  --config-dir DIR       # directory to find config files (default: $config_dir)
  --map-dir DIR          # directory to find phone mappings (default: '$map_dir')
"

. utils/parse_options.sh

if [ $# -lt 2 ]; then
  printf '%s\n' "$help_message"
  exit 1
fi

GPDIR=$(read_dirname "$1")
shift

# Every remaining argument must be exactly two characters long.
LANGUAGES=
while [ $# -gt 0 ]; do
  case "$1" in
    ??)
      LANGUAGES="$LANGUAGES $1"
      shift
      ;;
    *)
      echo "Unknown argument: $1, exiting"
      error_exit "$help_message"
      ;;
  esac
done

# (1) check if the config files are in place:
[ -d "$config_dir" ] \
  || error_exit "$PROG: Config directory '$config_dir' not found."
[ -f "$config_dir/dev_spk.list" ] \
  || error_exit "$PROG: Dev-set speaker list not found."
[ -f "$config_dir/eval_spk.list" ] \
  || error_exit "$PROG: Eval-set speaker list not found."
[ -f "$config_dir/lang_codes.txt" ] \
  || error_exit "$PROG: Mapping for language name to 2-letter code not found."

[ -f path.sh ] && . ./path.sh  # Sets the PATH to contain necessary executables

# (2) Normalize the dictionary and extract the phone lists
for L in $LANGUAGES; do
  printf 'Language - %s: preparing pronunciation lexicon ... ' "$L"
  dict_dir=data/$L/local/dict
  mkdir -p "$dict_dir"

  # Second field of the line(s) of lang_codes.txt that contain the code.
  # The code is handed to awk as a variable instead of being spliced into
  # the program text.
  full_name=$(awk -v lc="$L" '$0 ~ lc {print $2}' "$config_dir/lang_codes.txt")
  [ -n "$full_name" ] \
    || error_exit "$PROG: no entry for '$L' in $config_dir/lang_codes.txt"

  pron_lex=$GPDIR/Dictionaries/${L}/${full_name}-GPDict.txt
  if [ ! -f "$pron_lex" ]; then
    pron_lex=$GPDIR/Dictionaries/${L}/${full_name}GP.dict  # Polish & Bulgarian
    [ -f "$pron_lex" ] || error_exit "Error: no dictionary found for $L"
  fi

  # Arguments for the language-specific normalization script; the phone
  # mapping is added only when --map-dir was given and the file exists.
  norm_args=(-i "$pron_lex")
  if [ -n "$map_dir" ]; then
    if [ -f "$map_dir/$full_name" ]; then
      norm_args+=(-m "$map_dir/$full_name")
    else
      echo "No phone mapping '$map_dir/$full_name': keeping original phoneset"
    fi
  fi
  "local/gp_norm_dict_${L}.pl" "${norm_args[@]}" | sort -u \
    > "$dict_dir/lexicon_nosil.txt"
  # The pipeline's status is that of 'sort', so a failed normalization would
  # otherwise go unnoticed; an empty lexicon is treated as an error.
  [ -s "$dict_dir/lexicon_nosil.txt" ] \
    || error_exit "$PROG: normalized lexicon for $L is empty"

  # Full lexicon: the two silence/unknown entries, each on its own line,
  # followed by the normalized lexicon.
  {
    printf '!SIL\tsil\n<unk>\tspn\n'
    cat "$dict_dir/lexicon_nosil.txt"
  } > "$dict_dir/lexicon.txt"
  echo "Done"

  printf 'Language - %s: extracting phone lists ... ' "$L"
  # silence phones, one per line.
  { echo sil; echo spn; } > "$dict_dir/silence_phones.txt"
  echo sil > "$dict_dir/optional_silence.txt"
  # Non-silence phones: pronunciation fields (tab-separated columns 2 onwards),
  # split into one phone per line; empty lines caused by repeated spaces are
  # dropped.
  cut -f2- "$dict_dir/lexicon_nosil.txt" | tr ' ' '\n' | sed '/^$/d' | sort -u \
    > "$dict_dir/nonsilence_phones.txt"

  # Ask questions about the entire set of 'silence' and 'non-silence' phones.
  # These augment the questions obtained automatically by clustering.
  # Each phone list is flattened onto a single line.
  {
    tr '\n' ' ' < "$dict_dir/silence_phones.txt"; echo
    tr '\n' ' ' < "$dict_dir/nonsilence_phones.txt"; echo
  } > "$dict_dir/extra_questions.txt"
  echo "Done"
done

echo "Finished dictionary preparation."