Blame view
egs/commonvoice/s5/local/prepare_dict.sh
2.74 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
#!/bin/bash

# Copyright 2012 Vassil Panayotov
#           2017 Ewald Enzinger
# Apache 2.0

# Adapted from egs/voxforge/s5/local/voxforge_prepare_dict.sh (commit acb5439bf97a39398d5eeb926a2a5cfa71b5f72a)

# Builds the pronunciation dictionary directory data/local/dict.
#
# Reads:
#   path.sh                      sourced from the current directory
#   data/local/vocab-full.txt    word list, first column is the word
#   KALDI_ROOT                   used to locate tools/sequitur-g2p
#                                (presumably exported by path.sh -- confirm)
# Writes (all under data/local/dict unless noted):
#   cmudict/                     svn checkout of the CMU dictionary (if missing)
#   cmudict-plain.txt            cmudict, variant markers removed, lower-cased
#   vocab-oov.txt                vocabulary words not found in cmudict-plain.txt
#   lexicon-iv.txt               cmudict entries whose word is in the vocabulary
#   lexicon-oov.txt              g2p.py output for vocab-oov.txt
#   lexicon.txt                  sorted union of the two, plus "<unk> SIL"
#   silence_phones.txt, optional_silence.txt, nonsilence_phones.txt,
#   extra_questions.txt
#   conf/g2p_model               pre-trained Sequitur model (if missing)
#
# Exits 1 on a missing input, failed download, or missing Sequitur install.

. path.sh || exit 1

locdata=data/local
locdict=$locdata/dict

echo "=== Preparing the dictionary ..."

if [ ! -f "$locdict/cmudict/cmudict.0.7a" ]; then
  echo "--- Downloading CMU dictionary ..."
  mkdir -p "$locdict"
  svn co http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \
    "$locdict/cmudict" || exit 1
fi

echo "--- Striping stress and pronunciation variant markers from cmudict ..."
# The sed expression rewrites "WORD(N)<ws>PRON" to "WORD<ws>PRON".
# [:space:] is used inside the bracket expression because "\s" is not a
# whitespace class there (it would exclude a backslash and the letter "s").
perl "$locdict/cmudict/scripts/make_baseform.pl" \
  "$locdict/cmudict/cmudict.0.7a" /dev/stdout \
  | sed -e 's:^\([^[:space:](]\+\)([0-9]\+)\([[:space:]]\+\)\(.*\):\1\2\3:' \
  | tr 'A-Z' 'a-z' > "$locdict/cmudict-plain.txt"

# Without pipefail a failing perl/sed stage goes unnoticed; an empty
# dictionary would silently turn every vocabulary word into an OOV.
if [ ! -s "$locdict/cmudict-plain.txt" ]; then
  echo "$0: failed to produce $locdict/cmudict-plain.txt" >&2
  exit 1
fi

if [ ! -f "$locdata/vocab-full.txt" ]; then
  echo "$0: expected vocabulary file $locdata/vocab-full.txt to exist" >&2
  exit 1
fi

echo "--- Searching for OOV words ..."
# First file fills the "words" set; lines of the second file are printed when
# their first field is absent from the set. Sentence markers <s> and </s> are
# filtered out afterwards.
awk 'NR==FNR{words[$1]; next;} !($1 in words)' \
  "$locdict/cmudict-plain.txt" "$locdata/vocab-full.txt" \
  | grep -E -v '<.?s>' > "$locdict/vocab-oov.txt"

# Same idea with the roles swapped: keep dictionary entries whose word is in
# the vocabulary.
awk 'NR==FNR{words[$1]; next;} ($1 in words)' \
  "$locdata/vocab-full.txt" "$locdict/cmudict-plain.txt" \
  | grep -E -v '<.?s>' > "$locdict/lexicon-iv.txt"

wc -l "$locdict/vocab-oov.txt"
wc -l "$locdict/lexicon-iv.txt"

if [ ! -f conf/g2p_model ]; then
  echo "--- Downloading a pre-trained Sequitur G2P model ..."
  mkdir -p conf
  # Download to a temporary name first: "wget -O" creates the target even when
  # the transfer fails, which would make the -f test above succeed on the next
  # run and leave a broken model in place.
  g2p_tmp=conf/g2p_model.download.$$
  if wget http://sourceforge.net/projects/kaldi/files/sequitur-model4 -O "$g2p_tmp" \
      && [ -s "$g2p_tmp" ]; then
    mv "$g2p_tmp" conf/g2p_model || exit 1
  else
    rm -f "$g2p_tmp"
    echo "Failed to download the g2p model!"
    exit 1
  fi
fi

if [[ "$(uname)" == "Darwin" ]]; then
  command -v greadlink >/dev/null 2>&1 || \
    { echo "Mac OS X detected and 'greadlink' not found - please install using macports or homebrew"; exit 1; }
  # NOTE(review): bash does not expand aliases in non-interactive scripts and
  # nothing below calls readlink directly, so this alias looks like a leftover
  # from the voxforge original -- confirm before removing.
  alias readlink=greadlink
fi

sequitur=$KALDI_ROOT/tools/sequitur-g2p
export PATH=$PATH:$sequitur/bin
# The python* glob is left unquoted on purpose so the shell expands it.
export PYTHONPATH=$PYTHONPATH:$(utils/make_absolute.sh "$sequitur"/lib/python*/site-packages)

if ! command -v g2p.py >/dev/null 2>&1; then
  echo "The Sequitur was not found !"
  echo "Go to $KALDI_ROOT/tools and execute extras/install_sequitur.sh"
  exit 1
fi

echo "--- Preparing pronunciations for OOV words ..."
g2p.py --model=conf/g2p_model --apply "$locdict/vocab-oov.txt" > "$locdict/lexicon-oov.txt"

sort "$locdict/lexicon-oov.txt" "$locdict/lexicon-iv.txt" > "$locdict/lexicon.txt"
# Remove any existing lexiconp.txt; a missing file is not an error.
rm -f "$locdict/lexiconp.txt"

echo "--- Prepare phone lists ..."
echo SIL > "$locdict/silence_phones.txt"
echo SIL > "$locdict/optional_silence.txt"
# Collect every distinct symbol from fields 2..NF of the lexicon, skipping
# lines that contain the whole word "sil".
grep -v -w sil "$locdict/lexicon.txt" \
  | awk '{for(n=2;n<=NF;n++) { p[$n]=1; }} END{for(x in p) {print x}}' \
  | sort > "$locdict/nonsilence_phones.txt"

echo "--- Adding <unk> to the lexicon ..."
# Appended after the phone list is built, so SIL stays out of
# nonsilence_phones.txt.
printf '<unk>\tSIL\n' >> "$locdict/lexicon.txt"

# Some downstream scripts expect this file exists, even if empty
touch "$locdict/extra_questions.txt"

echo "*** Dictionary preparation finished!"