Blame view
egs/gale_arabic/s5c/local/prepare_lm_subword.sh
1.41 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
#!/bin/bash

# Copyright 2012  Vassil Panayotov
#           2017  Ewald Enzinger
#           2019  Dongji Gao
# Apache 2.0

# Builds an n-gram language model with SRILM from the training transcripts
# and prints its perplexity on the test transcripts.
#
# Configuration variables (set below, before utils/parse_options.sh is
# sourced; presumably overridable from the command line as --dir, --text,
# --lexicon, --order -- confirm against utils/parse_options.sh):
#   dir      output directory
#   text     training transcripts; the first space-separated field of every
#            line is dropped (presumably the utterance id)
#   lexicon  lexicon file; the first field of every line becomes the LM vocab
#   order    n-gram order
#
# Files written:
#   $dir/text.txt   training corpus without the first field
#   $dir/dev.txt    data/test/text without the first field
#   $dir/wordlist   vocabulary taken from the lexicon
#   $dir/lm.gz      the resulting language model
#
# Exits 1 if path.sh cannot be sourced, an input file is missing, SRILM
# cannot be located, or building the model fails.

. ./path.sh || exit 1

echo "=== Building a language model ..."

dir=data/local/lm/
text=data/train/text
lexicon=data/local/dict/lexicon.txt
# Language model order
order=6

. utils/parse_options.sh

# Prepare a LM training corpus from the transcripts
mkdir -p "$dir" || exit 1

for f in "$text" "$lexicon"; do
  if [ ! -f "$f" ]; then
    echo "$0: No such file $f"
    exit 1
  fi
done

# Locate SRILM: prefer whatever is already on PATH, otherwise fall back to the
# copy under $KALDI_ROOT/tools (KALDI_ROOT is presumably exported by path.sh --
# confirm).
if ! command -v ngram-count >/dev/null 2>&1; then
  if uname -a | grep -q 64; then # some kind of 64 bit...
    sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
  else
    sdir=$KALDI_ROOT/tools/srilm/bin/i686
  fi
  if [ -f "$sdir/ngram-count" ]; then
    echo "Using SRILM tools from $sdir"
    export PATH=$PATH:$sdir
  else
    echo "You appear to not have SRILM tools installed, either on your path,"
    echo "or installed in $sdir. See tools/install_srilm.sh for installation"
    echo "instructions."
    exit 1
  fi
fi

# Use "$text" here (not a hard-coded path) so that the file validated above is
# the one the model is actually trained on.
cut -d ' ' -f 2- "$text" > "$dir/text.txt" || exit 1
# The dev set is only used for the perplexity report below, so a problem with
# it is not treated as fatal.
cut -d ' ' -f 2- data/test/text > "$dir/dev.txt"
cut -d ' ' -f 1 "$lexicon" > "$dir/wordlist" || exit 1

# Discount flags are listed explicitly for orders 1 through 6.
# Stop here on failure so the success message below is never printed for a
# model that was not built.
ngram-count -text "$dir/text.txt" -order "$order" -vocab "$dir/wordlist" \
  -unk -map-unk "<UNK>" \
  -wbdiscount1 -kndiscount2 -kndiscount3 -kndiscount4 -kndiscount5 -kndiscount6 \
  -interpolate -lm "$dir/lm.gz" || exit 1

# Report perplexity of the new model on the dev text.
ngram -order "$order" -lm "$dir/lm.gz" -ppl "$dir/dev.txt"

echo "*** Finished building the LM model!"