egs/sprakbanken/s5/local/sprak_prepare_dict.sh
#!/bin/bash

# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
# Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

KALDI_ROOT=$(pwd)/../../..
exproot=$(pwd)
dir=data/local/dict
espeakdir='espeak-1.48.04-source'
mkdir -p $dir

# Dictionary preparation:

# Normalise transcripts and create a transcript file.
# Removes '.,:;?' and removes '\' before '\Komma' (dictated ',').
# Outputs a normalised transcript without utterance ids and a list of utterance ids.
echo "Normalising"
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl data/train/text1 data/train/onlyids $dir/transcripts.tmp

# Additional normalisation (uppercasing, writing out numbers etc.),
# then recombine the normalised text with the utterance ids.
local/norm_dk/format_text.sh am $dir/transcripts.tmp > $dir/transcripts.am
cp $dir/transcripts.am data/train/onlytext
paste data/train/onlyids data/train/onlytext > data/train/text

# lmsents is output by sprak_data_prep.sh and contains
# sentences that are disjoint from the test and dev sets.
python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl data/local/data/lmsents $dir/lmsents.norm

wait

# Create a wordlist (one word per line) from the AM transcripts.
cat $dir/transcripts.am | tr '[:blank:]' '\n' | sort -u > $dir/wlist.txt &

# Because the training data is read aloud, there are many occurrences of the same
# sentence and a bias towards the domain. Make a version where the sentences are
# unique to reduce this bias.
local/norm_dk/format_text.sh lm $dir/lmsents.norm > $dir/transcripts.txt
sort -u $dir/transcripts.txt > $dir/transcripts.uniq

# Install eSpeak if it is not installed already.
if hash espeak 2>/dev/null; then
  echo 'eSpeak installed'
else
  cd $KALDI_ROOT/tools || exit 1;
  if [ ! -d $espeakdir ]; then
    wget http://sourceforge.net/projects/espeak/files/espeak/espeak-1.48/${espeakdir}.zip
    unzip $espeakdir.zip
  fi
  cd $espeakdir/src
  make || exit 1;
  echo 'Installed eSpeak'
  cd $exproot || exit 1;
fi

# Wait for the wordlist to be fully created.
wait

# Run the wordlist through eSpeak to get phonetic transcriptions.
# Improvised parallelisation - simple call because 'split' often has different versions.
split -l 10000 $dir/wlist.txt $dir/Wtemp_
for w in $dir/Wtemp_*; do
  (cat $w | espeak -q -vda -x > $w.pho) &
done
wait
cat $dir/Wtemp_*.pho > $dir/plist.txt
rm -f $dir/Wtemp_*
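# Optional sanity check (an added suggestion, not required by the recipe): the lexicon
# is built further down by pasting wlist.txt and the filtered eSpeak output line by
# line, which assumes eSpeak emitted exactly one pronunciation line per input word.
if [ "$(wc -l < $dir/wlist.txt)" -ne "$(wc -l < $dir/plist.txt)" ]; then
  echo "WARNING: wlist.txt and plist.txt differ in length; the lexicon may be misaligned"
fi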
# Filter the phonetic transcriptions:
# remove diacritics and language annotation ((da), (en), (fr) etc.), insert space
# between symbols, remove initial and trailing spaces and collapse two or more
# spaces to one space.
# This could also be handled in nonsilence_phones.txt, but this filtering is from earlier work.
cat $dir/plist.txt | tr '^%,=:_|#$12;-?!' ' ' | tr "'" " " | perl -pe 's/\(..\)|\-|\~//g' | perl -pe 's// /g' | perl -pe 's/^ +| +$//g' | tr -s ' ' > $dir/plist2.txt

# Map phones with few occurrences (Y, L, J, z, U, T, "Z" and x) to phones with many
# occurrences (y, l, y, s, w, t, dZ and dZ respectively); the tr step also maps
# B -> b, * -> R and Q -> g, and the perl steps join 'd Z' -> dZ and 'a I' -> aI.
cat $dir/plist2.txt | tr 'BYLJzUT*Q' 'bylyswtRg' | perl -pe 's/d Z/dZ/g' | perl -pe 's/a I/aI/g' | perl -pe 's/ ?x ?| Z ?|Z / dZ /g' > $dir/plist3.txt

# Create lexicon.txt and put it in data/local/dict.
paste $dir/wlist.txt $dir/plist3.txt > $dir/lexicon1.txt

# Remove entries without a transcription.
grep -P "^.+\t.+$" $dir/lexicon1.txt > $dir/lexicon2.txt

# Create nonsilence_phones.txt and put it in data/local/dict.
cat $dir/plist3.txt | tr '[:blank:]' '\n' | sort -u > $dir/nonsilence_phones1.txt
grep -v "^$" $dir/nonsilence_phones1.txt > $dir/nonsilence_phones.txt
#cp $exproot/nonsilence_phones.txt $dir/nonsilence_phones.txt

# Add "!SIL SIL" and "<UNK> SPN" to lexicon.txt.
echo -e '!SIL\tSIL' > $dir/lex_first
echo -e '<UNK>\tSPN' >> $dir/lex_first
cat $dir/lexicon2.txt >> $dir/lex_first
mv $dir/lex_first $dir/lexicon.txt

# Silence phones, one per line.
(echo SIL; echo SPN) > $dir/silence_phones.txt
echo SIL > $dir/optional_silence.txt
touch $dir/extra_questions.txt

## TODO: add cleanup commands

echo "Dictionary preparation succeeded"
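# The files written to $dir above (lexicon.txt, nonsilence_phones.txt, silence_phones.txt,
# optional_silence.txt and extra_questions.txt) are the standard inputs to Kaldi's
# dictionary-to-lang conversion. A typical follow-up in the recipe's run script would be
# something like the following (directory names are the usual defaults and may differ):
#
#   utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang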