Blame view
egs/timit/s5/local/timit_prepare_dict.sh
3.13 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
#!/bin/bash

# Copyright 2013 (Authors: Daniel Povey, Bagher BabaAli)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Prepares the phone "dictionary" and a phone bigram language model.
#
# Call this script from one level above local/, i.e. from the recipe directory
# (e.g. egs/timit/s5).  It takes no arguments, reads data/local/data/train.text
# and puts its output in data/local/.
#
# The parts of the output of this that will be needed are
# [in data/local/dict/ ]
#   lexicon.txt
#   extra_questions.txt
#   nonsilence_phones.txt
#   optional_silence.txt
#   silence_phones.txt
# and [in data/local/nist_lm/ ]
#   lm_phone_bg.arpa.gz
#
# Environment: IRSTLM (optional; defaults to $KALDI_ROOT/tools/irstlm/) and
# KALDI_ROOT (only used for that default and in the installation hint).

# run this from ../
srcdir=data/local/data
dir=data/local/dict
lmdir=data/local/nist_lm
tmpdir=data/local/lm_tmp

# Print an error message prefixed with the script name to stderr and abort.
fail() {
  echo "$0: Error: $*" >&2
  exit 1
}

mkdir -p "$dir" "$lmdir" "$tmpdir" || fail "could not create output directories"

[ -f path.sh ] && . ./path.sh

# Make a pipeline fail if any of its stages fails, so that the explicit
# '|| fail' checks below also cover the first commands of each pipeline.
set -o pipefail

[ -f "$srcdir/train.text" ] || fail "expected $srcdir/train.text to exist"

# (1) Dictionary preparation:

# The "words" of this setup are the phones themselves.  No word-position
# suffixes (_B, _E, _S) are added in this script.

# silence phones, one per line.
echo sil > "$dir/silence_phones.txt" || exit 1
echo sil > "$dir/optional_silence.txt" || exit 1

# Collect the phone inventory: drop the first field of every transcript line
# (the utterance id), put one phone per line, discard empty tokens produced by
# repeated or trailing spaces, and keep the unique phones.
cut -d' ' -f2- "$srcdir/train.text" | tr ' ' '\n' | sed '/^$/d' | sort -u \
  > "$dir/phones.txt" || fail "could not extract phones from $srcdir/train.text"
[ -s "$dir/phones.txt" ] || fail "no phones found in $srcdir/train.text"

# Create the lexicon, which is just an identity mapping
paste "$dir/phones.txt" "$dir/phones.txt" > "$dir/lexicon.txt" || exit 1

# nonsilence phones: every phone that is not listed as a silence phone.
# -x makes grep compare whole lines; without it a phone that merely contains
# the string "sil" would be removed as well.  grep exits non-zero when nothing
# is left, which we treat as an error.
grep -v -x -F -f "$dir/silence_phones.txt" "$dir/phones.txt" \
  > "$dir/nonsilence_phones.txt" \
  || fail "no non-silence phones found in $dir/phones.txt"

# A few extra questions that will be added to those obtained by automatically
# clustering the "real" phones.  The first question lists the silence phones on
# one line; the following ones group the non-silence phones by their trailing
# digits (the stress marker, possibly empty), one group per line.
awk '{printf("%s ", $1);} END{printf "\n";}' "$dir/silence_phones.txt" \
  > "$dir/extra_questions.txt" || exit 1
# Groups are printed in sorted key order so the output is reproducible.
perl -e 'while(<>){ foreach $p (split(" ", $_)) {
  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } }
  foreach $k (sort keys %q) {print "$q{$k}\n";}' "$dir/nonsilence_phones.txt" \
  >> "$dir/extra_questions.txt" || exit 1

# (2) Create the phone bigram LM
if [ -z "$IRSTLM" ]; then
  export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v prune-lm >/dev/null 2>&1; then
  echo "$0: Error: the IRSTLM is not available or compiled" >&2
  echo "$0: Error: We used to install it by default, but." >&2
  echo "$0: Error: this is no longer the case." >&2
  echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
  echo "$0: Error: and run extras/install_irstlm.sh" >&2
  exit 1
fi

# LM training text: the transcripts without the utterance id, with sentence
# boundary markers added around every line.
cut -d' ' -f2- "$srcdir/train.text" | sed -e 's:^:<s> :' -e 's:$: </s>:' \
  > "$srcdir/lm_train.text" || fail "could not create $srcdir/lm_train.text"

build-lm.sh -i "$srcdir/lm_train.text" -n 2 \
  -o "$tmpdir/lm_phone_bg.ilm.gz" || fail "build-lm.sh failed"

# Write the LM in text form to stdout, remove the lines mentioning "unk", and
# compress the result.
compile-lm "$tmpdir/lm_phone_bg.ilm.gz" -t=yes /dev/stdout \
  | grep -v unk | gzip -c > "$lmdir/lm_phone_bg.arpa.gz" \
  || fail "could not create $lmdir/lm_phone_bg.arpa.gz"

echo "Dictionary & language model preparation succeeded"