Blame view
egs/wsj/s5/utils/build_const_arpa_lm.sh
1.07 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
#!/bin/bash

# Copyright 2014  Guoguo Chen
# Apache 2.0

# This script reads in an Arpa format language model, and converts it into the
# ConstArpaLm format language model.
#
# Usage: build_const_arpa_lm.sh [options] <arpa-lm-path> <old-lang-dir> <new-lang-dir>
#   <arpa-lm-path>  ARPA language model; it is read through "gunzip -c", so it
#                   is expected to be gzip-compressed.
#   <old-lang-dir>  Existing lang directory whose contents are copied into
#                   <new-lang-dir>.
#   <new-lang-dir>  Output directory (created if missing). words.txt and
#                   oov.int are read from here after the copy, and the result
#                   is written to <new-lang-dir>/G.carpa.
#
# Exit status: 0 on success, 1 on a usage error, a missing input, or a failed
# step.

# begin configuration section
# end configuration section

[ -f path.sh ] && . ./path.sh;

. utils/parse_options.sh

if [ $# -ne 3 ]; then
  {
    echo "Usage: "
    echo "  $0 [options] <arpa-lm-path> <old-lang-dir> <new-lang-dir>"
    echo "e.g.:"
    echo "  $0 data/local/lm/3-gram.full.arpa.gz data/lang/ data/lang_test_tgmed"
    echo "Options"
  } >&2
  exit 1;
fi

# Byte-wise, locale-independent behaviour for the text tools used below.
export LC_ALL=C

arpa_lm=$1
old_lang=$2
new_lang=$3

# Fail early, with a clear message, when an input is missing.
if [ ! -r "$arpa_lm" ]; then
  echo "$0: cannot read language model $arpa_lm" >&2
  exit 1
fi
if [ ! -d "$old_lang" ]; then
  echo "$0: no such lang directory $old_lang" >&2
  exit 1
fi

mkdir -p "$new_lang" || exit 1

# The glob is intentionally left outside the quotes so that it still expands.
cp -r "$old_lang"/* "$new_lang" || exit 1

# Integer id of the unknown-word symbol, taken from oov.int.
unk=
if [ -f "$new_lang/oov.int" ]; then
  unk=$(cat "$new_lang/oov.int")
fi
if [ -z "$unk" ]; then
  echo "$0: can't find oov symbol id in $new_lang/oov.int" >&2
  exit 1
fi

# Second column of the words.txt lines whose first field is <s> / </s>.
# Matching on the first awk field avoids grep's non-POSIX "\s" escape.
bos=$(awk '$1 == "<s>" {print $2}' "$new_lang/words.txt")
eos=$(awk '$1 == "</s>" {print $2}' "$new_lang/words.txt")
if [[ -z $bos || -z $eos ]]; then
  echo "$0: <s> and </s> symbols are not in $new_lang/words.txt" >&2
  exit 1
fi

# The first positional argument is a single string ending in "|"; presumably
# the tool runs it as a command pipeline and reads that pipeline's output
# (TODO confirm against the Kaldi I/O docs). Because the string is re-parsed
# as a command line, the paths embedded in it cannot be shell-quoted here and
# must not contain whitespace.
arpa-to-const-arpa --bos-symbol="$bos" \
  --eos-symbol="$eos" --unk-symbol="$unk" \
  "gunzip -c $arpa_lm | utils/map_arpa_lm.pl $new_lang/words.txt|" \
  "$new_lang/G.carpa" || exit 1;

exit 0;