Blame view
egs/sprakbanken/s5/local/dict_prep.sh
3.55 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
#!/bin/bash

# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
# Copyright 2014 Mirsk Digital ApS  (Author: Andreas Kirkedal)
# Copyright 2014-2016 Andreas Kirkedal5D

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Dictionary preparation.
#
# Reads   data/local/transcript_lm/transcripts.uniq
#         local/dictsrc/complexphones.txt
# Writes  data/local/dictsrc/  (intermediate word and phone lists)
#         data/local/dict/{lexicon,nonsilence_phones,silence_phones,
#                          optional_silence,extra_questions}.txt
#
# All paths are relative, so the script must be started from the directory
# that contains data/ and local/. KALDI_ROOT is taken to be three levels up
# from that directory.

KALDI_ROOT=$(pwd)/../../..

exproot=$(pwd)
lmdir=data/local/transcript_lm
dictsrc=data/local/dictsrc
dictdir=data/local/dict
espeakdir='espeak-1.48.04-source'

mkdir -p "$dictsrc" "$dictdir" || exit 1

if [ ! -f "$lmdir/transcripts.uniq" ]; then
  echo "$0: expected $lmdir/transcripts.uniq to exist" >&2
  exit 1
fi

# Create wordlist from the AM transcripts: one word per line, sorted, unique.
# The character class is quoted so the shell cannot glob-expand it, blanks are
# turned into newlines (not spaces) so every word gets its own line, and empty
# tokens are dropped because they can never form a lexicon entry.
# Runs in the background so eSpeak can be installed in the meantime.
tr -s '[:blank:]' '\n' < "$lmdir/transcripts.uniq" \
  | sed '/^$/d' \
  | sort -u > "$dictsrc/wlist.txt" &
wlist_pid=$!

# Install eSpeak if it is not installed already
if hash espeak 2>/dev/null; then
  echo 'eSpeak installed'
else
  # Every step is checked: if the download, unpack or cd failed silently, the
  # perl/make below would operate on the Makefile in $KALDI_ROOT/tools instead.
  cd "$KALDI_ROOT/tools" || exit 1
  wget "http://sourceforge.net/projects/espeak/files/espeak/espeak-1.48/${espeakdir}.zip" || exit 1
  unzip -q "$espeakdir.zip" || exit 1
  cd "$espeakdir/src" || exit 1

  # Remove dependency to portaudio - we only need the text-to-phoneme system
  perl -pi.back -e 's/^(AUDIO = portaudio)$/\#\1/' -e 's/^\#(AUDIO = portaudio2)$/\#\1/' Makefile

  make || exit 1
  echo 'Installed eSpeak'
  # NOTE(review): nothing here puts the freshly built binary on PATH; the
  # espeak calls below presumably rely on the environment to find it - confirm.
  cd "$exproot" || exit 1
fi

# Wait for the wordlist to be fully created
if ! wait "$wlist_pid"; then
  echo "$0: failed to create $dictsrc/wlist.txt" >&2
  exit 1
fi
if [ ! -s "$dictsrc/wlist.txt" ]; then
  echo "$0: $dictsrc/wlist.txt is empty" >&2
  exit 1
fi

# Run wordlist through espeak to get phonetics
# improvised parallelisation - simple call because 'split' often has different versions
# Left-over chunks from an aborted run would be picked up by the globs below
# and break the line-by-line alignment with wlist.txt, so clear them first.
rm -f "$dictsrc"/Wtemp_*
split -l 10000 "$dictsrc/wlist.txt" "$dictsrc/Wtemp_" || exit 1

espeak_pids=()
for w in "$dictsrc"/Wtemp_*; do
  espeak -q -vda -x < "$w" > "$w.pho" &
  espeak_pids+=("$!")
done

# Wait on each job individually so a failed chunk is not silently ignored.
for pid in "${espeak_pids[@]}"; do
  if ! wait "$pid"; then
    echo "$0: espeak failed on a chunk of $dictsrc/wlist.txt" >&2
    exit 1
  fi
done

cat "$dictsrc"/Wtemp_*.pho > "$dictsrc/plist.txt"
rm -f "$dictsrc"/Wtemp_*

# Filter transcription
# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove
# initial and trailing spaces and collapse 2 or more spaces to one space
#
# The substitutions are deliberately kept in separate perl processes: in Perl
# an empty pattern (s// /g) reuses the last successfully matched regex, so it
# only means "between every character" when no other regex has matched in
# the same process.
perl -pe 's/\([a-z]{2}\)//g' "$dictsrc/plist.txt" \
  | perl -pe 's// /g' \
  | perl -pe 's/ a I / aI /g' \
  | perl -pe 's/ d Z / dZ /g' \
  | perl -pe 's/ \? / /g' \
  | perl -pe 's/ ([\#]) /\+ /g' \
  | perl -pe 's/([\@n3]) \- /\1\- /g' \
  | perl -pe "s/[\_\:\!\'\,\|2]//g" \
  | perl -pe 's/ \- / /g' \
  | tr -s ' ' \
  | perl -pe 's/^ +| +$//g' > "$dictsrc/plist2.txt"

# Some question marks are not caught above
perl -pe 's/ \? / /g' "$dictsrc/plist2.txt" > "$dictsrc/plist3.txt"

# Create lexicon.txt and put it in data/local/dict
paste "$dictsrc/wlist.txt" "$dictsrc/plist3.txt" > "$dictsrc/lexicon1.txt"

# Remove entries without transcription
grep -P '^.+\t.+$' "$dictsrc/lexicon1.txt" > "$dictsrc/lexicon2.txt"

# Copy pre-made phone table
cp local/dictsrc/complexphones.txt "$dictdir/nonsilence_phones.txt" || exit 1

# Add "!SIL SIL" and "<UNK> SPN" to the top of lexicon.txt
{
  printf '!SIL\tSIL\n'
  printf '<UNK>\tSPN\n'
  cat "$dictsrc/lexicon2.txt"
} > "$dictsrc/lex_first"

mv "$dictsrc/lex_first" "$dictdir/lexicon.txt" || exit 1

# silence phones, one per line. Existing files are left untouched.
if [ ! -f "$dictdir/silence_phones.txt" ]; then
  echo SIL > "$dictdir/silence_phones.txt"
fi

if [ ! -f "$dictdir/optional_silence.txt" ]; then
  echo SIL > "$dictdir/optional_silence.txt"
fi

if [ ! -f "$dictdir/extra_questions.txt" ]; then
  touch "$dictdir/extra_questions.txt"
fi

echo "Dictionary preparation succeeded"