Blame view
egs/vystadial_en/s5/local/prepare_en_transcription.sh
1.08 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
#!/bin/bash locdata=$1 locdict=$2 cmu_dict=common/cmudict.0.7a cmu_ext=common/cmudict.ext mkdir -p $locdict if [ ! -f $cmu_dict ] ; then echo "--- Downloading CMU dictionary ..." svn export http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict.0.7a \ $cmu_dict || exit 1; fi echo; echo "If common/cmudict.ext exists, add extra pronunciation to dictionary" ; echo cat $cmu_dict $cmu_ext > $locdict/cmudict_ext.txt 2> /dev/null # ignoring if no extension echo "--- Striping stress and pronunciation variant markers from cmudict ..." perl local/make_baseform.pl \ $locdict/cmudict_ext.txt /dev/stdout |\ sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $locdict/cmudict-plain.txt echo "--- Searching for OOV words ..." gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \ $locdict/cmudict-plain.txt $locdata/vocab-full.txt |\ egrep -v '<.?s>' > $locdict/vocab-oov.txt gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ $locdata/vocab-full.txt $locdict/cmudict-plain.txt |\ egrep -v '<.?s>' > $locdict/lexicon.txt wc -l $locdict/vocab-oov.txt wc -l $locdict/lexicon.txt |