Blame view
egs/voxforge/s5/local/voxforge_prepare_lm.sh
1.4 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
#!/bin/bash # Copyright 2012 Vassil Panayotov # Apache 2.0 . ./path.sh || exit 1 echo "=== Building a language model ..." locdata=data/local loctmp=$locdata/tmp echo "--- Preparing a corpus from test and train transcripts ..." # Language model order order=3 . utils/parse_options.sh # Prepare a LM training corpus from the transcripts _not_ in the test set cut -f2- -d' ' < $locdata/test_trans.txt |\ sed -e 's:[ ]\+: :g' | sort -u > $loctmp/test_utt.txt # We are not removing the test utterances in the current version of the recipe # because this messes up with some of the later stages - e.g. too many OOV # words in tri2b_mmi cut -f2- -d' ' < $locdata/train_trans.txt |\ sed -e 's:[ ]\+: :g' |\ sort -u > $loctmp/corpus.txt loc=`which ngram-count`; if [ -z $loc ]; then if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 else sdir=$KALDI_ROOT/tools/srilm/bin/i686 fi if [ -f $sdir/ngram-count ]; then echo Using SRILM tools from $sdir export PATH=$PATH:$sdir else echo You appear to not have SRILM tools installed, either on your path, echo or installed in $sdir. See tools/install_srilm.sh for installation echo instructions. exit 1 fi fi ngram-count -order $order -write-vocab $locdata/vocab-full.txt -wbdiscount \ -text $loctmp/corpus.txt -lm $locdata/lm.arpa echo "*** Finished building the LM model!" |