#!/bin/bash
# prepare_en_transcription.sh

# Usage: <this script> <locdata> <locdict>
#   locdata - directory that must contain vocab-full.txt (read further below)
#   locdict - output directory for the generated dictionary files
# Refuse to run without both arguments: with an empty locdict the output
# redirections further below would point at the filesystem root.
if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
  echo "Usage: $0 <locdata> <locdict>" >&2
  exit 1
fi

locdata=$1
locdict=$2

# Base CMU dictionary and optional file with extra pronunciations.
cmu_dict=common/cmudict.0.7a
cmu_ext=common/cmudict.ext

# Every output file is written below $locdict, so stop if it cannot be created.
mkdir -p -- "$locdict" || exit 1

# Fetch the CMU dictionary once; an existing copy is reused as-is.
if [ ! -f "$cmu_dict" ]; then
  echo "--- Downloading CMU dictionary ..."
  # Make sure the target directory exists so the export has somewhere to write.
  mkdir -p -- "$(dirname -- "$cmu_dict")" || exit 1
  if ! svn export http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict.0.7a \
       "$cmu_dict"; then
    echo "ERROR: could not download the CMU dictionary to $cmu_dict" >&2
    exit 1
  fi
fi

echo; echo "If common/cmudict.ext exists, add extra pronunciation to dictionary" ; echo
# Concatenate the base dictionary with the optional extension file.
# The extension is added only when it exists, so a missing extension is not an
# error, while a failure to read the base dictionary or to write the output is
# reported instead of being hidden behind a blanket 2>/dev/null.
dict_sources=("$cmu_dict")
if [ -f "$cmu_ext" ]; then
  dict_sources+=("$cmu_ext")
fi
if ! cat -- "${dict_sources[@]}" > "$locdict/cmudict_ext.txt"; then
  echo "ERROR: could not create $locdict/cmudict_ext.txt" >&2
  exit 1
fi

# strip_variant_markers: stdin -> stdout filter.
# Removes a pronunciation-variant marker such as "(2)" that directly follows
# the headword at the start of a line, keeping the separating whitespace and
# the rest of the line untouched. Lines without such a marker pass through.
# POSIX character classes are used on purpose: inside a bracket expression
# "\s" means a literal backslash and the letter "s", not whitespace, so the
# headword pattern must be written as [^[:space:](].
strip_variant_markers() {
  sed -E 's/^([^[:space:](]+)\([0-9]+\)([[:space:]]+)(.*)/\1\2\3/'
}

echo "--- Striping stress and pronunciation variant markers from cmudict ..."
# make_baseform.pl is not visible here; presumably it removes the stress
# digits (per the message above) and writes its result to the second argument.
perl local/make_baseform.pl \
  "$locdict/cmudict_ext.txt" /dev/stdout |
  strip_variant_markers > "$locdict/cmudict-plain.txt"

# find_oov_words DICT VOCAB
# Prints every line of VOCAB whose first field does not occur as the first
# field of any line in DICT, then drops lines containing <s> / </s>-like tags.
# The first file is recognised via FILENAME rather than NR==FNR so that an
# empty DICT still yields the whole vocabulary as out-of-vocabulary.
find_oov_words() {
  gawk 'FILENAME == ARGV[1] { words[$1]; next } !($1 in words)' "$1" "$2" |
    grep -E -v '<.?s>'
}

echo "--- Searching for OOV words ..."
find_oov_words "$locdict/cmudict-plain.txt" "$locdata/vocab-full.txt" \
  > "$locdict/vocab-oov.txt"

# build_lexicon VOCAB DICT
# Prints every line of DICT whose first field occurs as the first field of
# some line in VOCAB, then drops lines containing <s> / </s>-like tags.
build_lexicon() {
  gawk 'NR==FNR{words[$1]; next;} ($1 in words)' "$1" "$2" |
    grep -E -v '<.?s>'
}

build_lexicon "$locdata/vocab-full.txt" "$locdict/cmudict-plain.txt" \
  > "$locdict/lexicon.txt"

# Report the number of lines in each generated file (OOV list, then lexicon).
# Paths are quoted so an output directory containing whitespace still works.
wc -l "$locdict/vocab-oov.txt"
wc -l "$locdict/lexicon.txt"