Yannick Estève / ONTRAC-Kaldi

Blame view

egs/sprakbanken/s5/local/dict_prep.sh 3.55 KB
  #!/bin/bash
  
  # Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
  # Copyright 2014 Mirsk Digital ApS  (Author: Andreas Kirkedal)
  # Copyright 2014-2016 Andreas Kirkedal
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABILITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  # Directory layout. All data paths are relative to the directory the script
  # is started from; the Kaldi root is taken to be three levels above it.
  exproot=$(pwd)
  KALDI_ROOT=${exproot}/../../..

  lmdir='data/local/transcript_lm'    # input: transcripts.uniq is read from here
  dictsrc='data/local/dictsrc'        # intermediate word/phone lists
  dictdir='data/local/dict'           # finished dictionary files
  espeakdir="espeak-1.48.04-source"   # base name of the eSpeak source archive

  mkdir -p "${dictsrc}" "${dictdir}"
  
  
  # Dictionary preparation:

  # Create wordlist from the AM transcripts: every blank (space or tab) is
  # turned into a newline so each word sits on its own line, then the list is
  # sorted and de-duplicated. The job runs in the background; the script
  # collects it with `wait` before the word list is used.
  # The character class is quoted so the shell cannot glob-expand it against
  # single-character file names in the current directory.
  tr '[:blank:]' '\n' < "$lmdir/transcripts.uniq" | sort -u > "$dictsrc/wlist.txt" &
  
  # Install eSpeak if it is not installed already.
  # When no `espeak` command is found, the source archive is downloaded into
  # $KALDI_ROOT/tools and built there. Every step that can fail aborts the
  # script, so a broken download or unpack never leads to running perl/make
  # in the wrong directory.
  if command -v espeak >/dev/null 2>&1; then
      echo 'eSpeak installed'
  else
      cd "$KALDI_ROOT/tools" || exit 1
      wget "http://sourceforge.net/projects/espeak/files/espeak/espeak-1.48/${espeakdir}.zip" || exit 1
      # Barrier for background jobs this script has already started (wget
      # itself runs in the foreground).
      wait
      unzip -q "$espeakdir.zip" || exit 1
      cd "$espeakdir/src" || exit 1
      # Remove dependency to portaudio - we only need the text-to-phoneme system
      # NOTE(review): the second expression replaces '#AUDIO = portaudio2' with
      # itself, i.e. it changes nothing - confirm what was intended.
      perl -pi.back -e 's/^(AUDIO = portaudio)$/\#\1/' -e 's/^\#(AUDIO = portaudio2)$/\#\1/' Makefile
      make || exit 1
      echo 'Installed eSpeak'
      # NOTE(review): only `make` is run and PATH is not modified here -
      # confirm that the later `espeak` call can find the freshly built binary.
      cd "$exproot" || exit 1
  fi
  
  # Wait for the wordlist to be fully created
  wait

  # Run wordlist through espeak to get phonetics
  # improvised parallelisation - simple call because 'split' often has different versions

  # Chunks left behind by an interrupted earlier run would be matched by the
  # globs below and end up in plist.txt, so clear them before splitting.
  rm -f "$dictsrc"/Wtemp_*
  split -l 10000 "$dictsrc/wlist.txt" "$dictsrc/Wtemp_"

  # One background espeak job per 10000-line chunk; remember each PID so the
  # exit status of every job can be checked.
  espeak_pids=()
  for w in "$dictsrc"/Wtemp_*; do
      [ -e "$w" ] || continue   # glob matched nothing (empty word list)
      espeak -q -vda -x < "$w" > "$w.pho" &
      espeak_pids+=("$!")
  done

  # A bare `wait` returns 0 whatever the jobs did; waiting per PID makes a
  # missing or failing espeak stop the script instead of yielding an empty
  # or truncated phone list.
  for pid in "${espeak_pids[@]}"; do
      if ! wait "$pid"; then
          echo "espeak failed on a wordlist chunk" >&2
          exit 1
      fi
  done

  # Glob expansion is sorted, so the chunks are concatenated in split order.
  cat "$dictsrc"/Wtemp_*.pho > "$dictsrc/plist.txt"
  rm -f "$dictsrc"/Wtemp_*
  
  
  # Filter transcription
  # The stages of the pipeline, in order:
  #    1. delete parenthesised two-character tags such as (da), (en), (fr)
  #    2. insert a space at every position, separating all characters
  #    3. re-join "a I" into the single symbol aI
  #    4. re-join "d Z" into the single symbol dZ
  #    5. drop a question mark standing on its own
  #    6. turn a stand-alone # into a + attached to the preceding symbol
  #    7. attach a following - to a preceding @, n or 3
  #    8. delete the characters _ : ! ' , | and 2
  #    9. drop a remaining stand-alone -
  #   10. squeeze runs of spaces into a single space
  #   11. strip leading and trailing spaces
  # NOTE(review): keep every substitution in its own perl process. With an
  # empty pattern perl reuses the last successfully matched regex, so merging
  # stage 2 into one program with stage 1 would change what it matches.
  perl -pe 's/\([[a-z]{2}\)//g' "$dictsrc/plist.txt" \
    | perl -pe 's// /g' \
    | perl -pe 's/ a I / aI /g' \
    | perl -pe 's/ d Z / dZ /g' \
    | perl -pe 's/ \? / /g' \
    | perl -pe 's/ ([\#]) /\+ /g' \
    | perl -pe 's/([\@n3]) \- /\1\- /g' \
    | perl -pe "s/[\_\:\!\'\,\|2]//g" \
    | perl -pe 's/ \- / /g' \
    | tr -s ' ' \
    | perl -pe 's/^ +| +$//g' \
    > "$dictsrc/plist2.txt"

  # Some question marks are not caught above, so the same substitution is
  # applied once more to the filtered output.
  perl -pe 's/ \? / /g' "$dictsrc/plist2.txt" > "$dictsrc/plist3.txt"
  
  # Create lexicon.txt and put it in data/local/dict
  # Words and filtered transcriptions are joined line by line with a tab.
  paste "$dictsrc/wlist.txt" "$dictsrc/plist3.txt" > "$dictsrc/lexicon1.txt"

  # Remove entries without transcription: keep only lines that have text on
  # both sides of a tab.
  grep -P  "^.+\t.+$" "$dictsrc/lexicon1.txt" > "$dictsrc/lexicon2.txt"

  # Copy the pre-made phone table; it is used as the non-silence phone list.
  cp local/dictsrc/complexphones.txt "$dictdir/nonsilence_phones.txt"

  # Put the "!SIL SIL" and "<UNK> SPN" entries in front of the filtered
  # lexicon, then move the result into place as lexicon.txt.
  {
      printf '%b\n' '!SIL\tSIL' '<UNK>\tSPN'
      cat "$dictsrc/lexicon2.txt"
  } > "$dictsrc/lex_first"
  mv "$dictsrc/lex_first" "$dictdir/lexicon.txt"
  
  # silence phones, one per line.
  # Files that already exist are left untouched; a missing silence list is
  # created with the single entry SIL.
  for phone_file in silence_phones.txt optional_silence.txt; do
      [ -f "$dictdir/$phone_file" ] || echo SIL > "$dictdir/$phone_file"
  done

  # An empty extra_questions.txt is created when none is present.
  [ -f "$dictdir/extra_questions.txt" ] || touch "$dictdir/extra_questions.txt"


  echo "Dictionary preparation succeeded"