Blame view
egs/sprakbanken/s5/local/train_irstlm.sh
2.12 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
#!/bin/bash

# Copyright 2013 Mirsk Digital ApS (Author: Andreas Kirkedal)
# Apache 2.0

# Trains an n-gram language model with IRSTLM and compiles it into a G.fst
# inside a freshly populated data/lang_test_<lm-suffix> directory.
#
# Usage:
#   local/train_irstlm.sh <lm-text> <ngram-order> <lm-suffix> <src-lang-dir> <lm-dir>
#
#   <lm-text>       ($1) text file the LM is trained on (fed to add-start-end.sh)
#   <ngram-order>   ($2) n-gram order passed to ngt/tlm via -n
#   <lm-suffix>     ($3) suffix of the output directory data/lang_test_<lm-suffix>
#   <src-lang-dir>  ($4) directory whose contents are copied into the output
#                        directory; must provide words.txt for arpa2fst
#   <lm-dir>        ($5) working directory for lm_input and the
#                        train<ngram-order>.ngt / train<ngram-order>.arpa files
#
# Must be run from a directory containing path.sh. IRSTLM is taken from
# $IRSTLM, falling back to $KALDI_ROOT/tools/irstlm/.

# Make a pipeline fail when any stage fails, so the "|| exit 1" checks below
# also catch errors in the first command of a pipe.
set -o pipefail

if [ $# -lt 5 ]; then
  echo "Usage: $0 <lm-text> <ngram-order> <lm-suffix> <src-lang-dir> <lm-dir>" >&2
  exit 1
fi

. ./path.sh || exit 1;

if [ -z "$IRSTLM" ] ; then
  export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH="${PATH}:$IRSTLM/bin"
if ! command -v ngt >/dev/null 2>&1 ; then
  echo "$0: Error: the IRSTLM is not available or compiled" >&2
  echo "$0: Error: We used to install it by default, but." >&2
  echo "$0: Error: this is no longer the case." >&2
  echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
  echo "$0: Error: and run extras/install_irstlm.sh" >&2
  exit 1
fi

echo "Preparing train and test data"

lm_text=$1
ngram=$2
lm_suffix=$3
srcdir=$4
lmdir=$5
# NOTE(review): tmpdir is created below but nothing in this script writes to it
# or removes it; it is kept only so the on-disk side effects stay unchanged.
tmpdir=data/local/lm_tmp

mkdir -p "$lmdir" || exit 1
mkdir -p "$tmpdir" || exit 1

# Envelop LM training data in context cues and keep only lines with at least
# three fields (presumably this drops empty sentences that consist of nothing
# but the start/end markers -- confirm against add-start-end.sh).
# Each kept sentence is written on its own line.
add-start-end.sh < "$lm_text" \
  | awk 'NF >= 3 { print }' > "$lmdir/lm_input" || exit 1

# Next, create the language model, the corresponding FST
# and the corresponding lang_test_* directory.

echo "Preparing language models for test"

# Create Ngram table
ngt -i="$lmdir/lm_input" -n="$ngram" -o="$lmdir/train${ngram}.ngt" -b=yes \
  || exit 1

# Estimate a model of order $ngram from the table and write it to
# train${ngram}.arpa, which arpa2fst consumes below.
tlm -tr="$lmdir/train${ngram}.ngt" -n="$ngram" -lm=wb \
  -o="$lmdir/train${ngram}.arpa" || exit 1

test=data/lang_test_${lm_suffix}
mkdir -p "$test" || exit 1
# The glob is deliberately left outside the quotes so it still expands.
cp -r "$srcdir"/* "$test" || exit 1

# Compile the LM into G.fst using the word symbol table copied from $srcdir.
arpa2fst --disambig-symbol='#0' \
  --read-symbol-table="$test/words.txt" - "$test/G.fst" \
  < "$lmdir/train${ngram}.arpa" || exit 1

utils/validate_lang.pl "$test" || exit 1;

echo "Succeeded in formatting data."
exit 0;

# Cleanup that the original author left disabled:
#rm -rf $tmpdir
#rm -f $ccs