Blame view
egs/tunisian_msa/s5/local/prepare_lm.sh
1.25 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
#!/bin/bash

# Copyright 2018 John Morgan
# Apache 2.0.

# Builds a trigram language model from a text corpus using SRILM.
#
# Usage: local/prepare_lm.sh [options] <corpus>
#   <corpus>  text file with one training segment per line.
#
# Options (handled by utils/parse_options.sh, which is sourced below;
# presumably given as "--name value" -- confirm against that script):
#   nsegs  maximum number of (randomly chosen) segments used for training.
#   stage  declared for option parsing; not otherwise used in this script.
#
# Outputs:
#   data/local/lm/train.txt        shuffled, truncated copy of <corpus>
#   data/local/lm/trigram.arpa.gz  gzip-compressed ARPA trigram LM
#
# Must be run from the recipe directory so that ./cmd.sh, ./path.sh and
# ./utils resolve. KALDI_ROOT is expected to be set by ./path.sh.
#
# NOTE: "set -o pipefail" is deliberately not enabled: "head" closes the pipe
# early when the corpus has more than nsegs lines, which may terminate perl
# with SIGPIPE and would otherwise abort the script.

. ./cmd.sh
set -e
. ./path.sh
. "$KALDI_ROOT/tools/env.sh"

stage=0
nsegs=1000000 # limit the number of training segments

. ./utils/parse_options.sh

# Without this check an empty $1 slips past the file test below.
if [ $# -lt 1 ]; then
  echo "Usage: $0 [options] <corpus>" >&2
  exit 1
fi

corpus=$1
lm_dir=data/local/lm

if [ ! -f "$corpus" ]; then
  echo "$0: input data $corpus not found." >&2
  exit 1
fi

mkdir -p "$lm_dir"

# Randomly sample at most $nsegs lines of the corpus as LM training text.
perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' < "$corpus" \
  | head -n "$nsegs" > "$lm_dir/train.txt"

# Locate SRILM's ngram-count. "command -v" only probes for the tool; it does
# not execute it.
if ! command -v ngram-count >/dev/null 2>&1; then
  # Fall back to the SRILM build directory inside the Kaldi tree.
  case "$(uname -s)" in
    [Dd]arwin*)
      # For MACOSX...
      sdir=$KALDI_ROOT/tools/srilm/bin/macosx
      ;;
    *)
      case "$(uname -m)" in
        *64*)
          # some kind of 64 bit...
          sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
          ;;
        *)
          sdir=$KALDI_ROOT/tools/srilm/bin/i686
          ;;
      esac
      ;;
  esac

  if [ -f "$sdir/ngram-count" ]; then
    echo "Using SRILM tools from $sdir"
    export PATH="$PATH:$sdir"
  else
    {
      echo "You appear to not have SRILM tools installed, either on your path,"
      echo "or installed in $sdir. See tools/install_srilm.sh for installation"
      echo "instructions."
    } >&2
    exit 1
  fi
fi

# Interpolated trigram LM; out-of-vocabulary words are mapped to <UNK>.
ngram-count -order 3 -interpolate -unk -map-unk "<UNK>" \
  -limit-vocab -text "$lm_dir/train.txt" -lm "$lm_dir/trigram.arpa" || exit 1

# -f overwrites a trigram.arpa.gz left over from a previous run.
gzip -f "$lm_dir/trigram.arpa"