egs/sprakbanken/s5/local/sprak_train_irstlm.sh
#!/bin/bash

# Copyright 2013-2014 Mirsk Digital Aps (Author: Andreas Kirkedal)
# Apache 2.0

# This script extends an existing dictionary dir ($srcdict) with espeak
# pronunciations for the words in a new text ($newtext), builds the extended
# lang dir with utils/prepare_lang.sh, trains an n-gram LM on the new text
# with IRSTLM, and compiles the LM into G.fst in a new lang_test_* directory.

. ./path.sh || exit 1;

if [ -z "$IRSTLM" ] ; then
  export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v ngt >/dev/null 2>&1 ; then
  echo "$0: Error: IRSTLM is not available or compiled" >&2
  echo "$0: Error: We used to install it by default, but" >&2
  echo "$0: Error: this is no longer the case." >&2
  echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
  echo "$0: Error: and run extras/install_irstlm.sh" >&2
  exit 1
fi

srcdict=$1
newtext=$2
lm_suffix=$3
N=$4
lmdir=$5

extdict=${srcdict}_$lm_suffix
lang_tmp=data/local/lang_tmp
extlang=data/lang_$lm_suffix

if [ ! -d $lmdir ]; then
  mkdir -p $lmdir
fi

if [ ! -d $extdict ]; then
  echo "Creating $extdict based on $srcdict"
  # Extend $srcdict to include the new data
  mkdir -p $extdict
  for f in extra_questions.txt lexicon.txt nonsilence_phones.txt optional_silence.txt silence_phones.txt; do
    cp $srcdict/$f $extdict/
  done
  mv $extdict/lexicon.txt $extdict/oldlexicon.txt
fi

if [ ! -f $extdict/transcripts.uniq ]; then
  # Create the text data for LMs and RNNs
  cat $srcdict/transcripts.txt $newtext > $extdict/transcripts.txt
  sort -u $extdict/transcripts.txt > $extdict/transcripts.uniq
fi

# Check that espeak is available on the system. espeak is necessary to extend
# the setup because the original transcriptions were created with espeak and
# then filtered.
if ! which espeak >&/dev/null; then
  echo "espeak is not available on your system. You must install espeak before proceeding."
  exit 1;
fi

if [ ! -f $extdict/lexicon.txt ]; then
  # Extend the lexicon with pronunciations from espeak
  echo "Transcribing $newtext using espeak"

  # Create the word list (one word per line, skipping lines that contain
  # only whitespace or punctuation)
  cat $newtext | tr '[:blank:]' '\n' | grep -P -v '^[\s?|\.|\!]*$' | sort -u > $extdict/wlist.txt

  # Piped so only a number is stored in the variable
  nwords=$(wc -l < $extdict/wlist.txt)
  nsplit=$((nwords / 8))

  # Split the word list into 8 chunks and run espeak on them in parallel to
  # get phonetic transcriptions
  split -l $nsplit $extdict/wlist.txt $extdict/Wtemp_
  for w in $extdict/Wtemp_*; do
    (cat $w | espeak -q -vda -x > $w.pho) &
  done
  wait
  cat $extdict/Wtemp_*.pho > $extdict/plist.txt
  rm -f $extdict/Wtemp_*

  # Filter the transcriptions: remove diacritics and language annotations
  # ((da), (en), (fr) etc.), insert a space between symbols, remove initial
  # and trailing spaces and collapse 2 or more spaces to one space
  cat $extdict/plist.txt | \
    perl -pe 's/\([a-z]{2}\)//g' | \
    perl -pe 's/(.)/\1 /g' | \
    perl -pe 's/ a I / aI /g' | \
    perl -pe 's/ d Z / dZ /g' | \
    perl -pe 's/ \? / /g' | \
    perl -pe 's/ ([\#]) /\+ /g' | \
    perl -pe 's/([\@n3]) \- /\1\- /g' | \
    perl -pe "s/[\_\:\!\'\,\|2]//g" | \
    perl -pe 's/ \- / /g' | \
    tr -s ' ' | \
    perl -pe 's/^ +| +$//g' > $extdict/plist2.txt

  # Some question marks are not caught above
  perl -pe 's/ \? / /g' $extdict/plist2.txt > $extdict/plist3.txt

  # Create the new lexicon entries
  paste $extdict/wlist.txt $extdict/plist3.txt > $extdict/lexicon1.txt

  # Remove entries without a transcription
  grep -P "^.+\t.+$" $extdict/lexicon1.txt > $extdict/lexicon2.txt

  echo "Combining lexicons"
  # Combine the old and new lexicons
  cat $extdict/oldlexicon.txt $extdict/lexicon2.txt > $extdict/templex
  sort -u $extdict/templex > $extdict/lexicon.txt
fi

if [ ! -d $extlang ]; then
  # Create the new extended lang dir
  utils/prepare_lang.sh $extdict "<UNK>" $lang_tmp $extlang || exit 1;
fi
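# Added sketch, not part of the original recipe: fail early if
# prepare_lang.sh did not produce the files that arpa2fst and
# validate_lang.pl depend on further down.
for f in words.txt phones.txt L.fst; do
  if [ ! -f $extlang/$f ]; then
    echo "$0: expected $extlang/$f to exist after prepare_lang.sh" >&2
    exit 1
  fi
done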
if [ ! -f $lmdir/extra4.ngt ]; then
  echo "Preparing LM data"
  # Keep only lines with at least 4 fields that are not just punctuation
  grep -P -v '^[\s?|\.|\!]*$' $newtext | \
    awk '{if(NF>=4){ printf("%s\n",$0); }}' > $lmdir/text.filt

  # Envelop the LM training data in context cues
  add-start-end.sh < $lmdir/text.filt > $lmdir/lm_input

  echo "Creating new binary ngram table $lmdir/extra4.ngt"
  ngt -i=$lmdir/lm_input -n=4 -o=$lmdir/extra4.ngt -b=yes
fi

echo "Training ARPA model extra${N}$lm_suffix"
# n=4 was chosen as the upper bound when building the ngram table, so $N must
# not exceed 4
tlm -tr=$lmdir/extra4.ngt -n=$N -lm=wb -o=$lmdir/extra${N}$lm_suffix

# Next, create the corresponding FST and the corresponding lang_test_*
# directory.
test=data/lang_test_${N}${lm_suffix}
mkdir -p $test
cp -r $extlang/* $test

cat $lmdir/extra${N}$lm_suffix | \
  arpa2fst --disambig-symbol=#0 \
    --read-symbol-table=$test/words.txt - $test/G.fst

utils/validate_lang.pl $test || exit 1;

exit 0;
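# Example invocation (added for illustration; the paths and suffix are
# hypothetical, not from the original recipe):
#   local/sprak_train_irstlm.sh data/local/dict data/local/extra_text ext 3 data/local/ext_lm
# This would build the extended dict data/local/dict_ext and lang dir
# data/lang_ext, train a 3-gram Witten-Bell LM (tlm -lm=wb) on the new text,
# and compile it into data/lang_test_3ext/G.fst.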