Blame view
egs/wsj/s5/local/wsj_format_local_lms.sh
2.77 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
#!/bin/bash

# Copyright Johns Hopkins University (Author: Daniel Povey) 2012
#           Guoguo Chen 2014

# Formats the locally built 3-gram and 4-gram language models found under
# data/local/local_lm into test "lang" directories.  For each of the suffixes
# tg, tgpr, tgconst, fg, fgpr, fgconst this script:
#   1. recreates data/lang${lang_suffix}_test_bd_<suffix> as a fresh copy of
#      data/lang${lang_suffix}_bd, and
#   2. writes G.fst (tg, tgpr, fg, fgpr) or G.carpa (tgconst, fgconst) into it.
#
# lang_suffix defaults to empty; it is declared before parse_options.sh is
# sourced so that it can be overridden from the command line (presumably as
# --lang-suffix; confirm against utils/parse_options.sh).

lang_suffix=

echo "$0 $*"  # Print the command line for logging

. ./path.sh
. utils/parse_options.sh || exit 1;

lang="data/lang${lang_suffix}_bd"

# The message now names the directory that is actually tested (it used to
# mention data/local/..., which is not the path being checked).
if [ ! -d "$lang" ]; then
  echo "Expect $lang to exist"
  exit 1
fi

lm_srcdir_3g=data/local/local_lm/3gram-mincount
lm_srcdir_4g=data/local/local_lm/4gram-mincount

for srcdir in "$lm_srcdir_3g" "$lm_srcdir_4g"; do
  if [ ! -d "$srcdir" ]; then
    echo "No such dir $srcdir"
    exit 1
  fi
done

# Check a few files that we have to use.
for f in words.txt oov.int; do
  if [[ ! -f "$lang/$f" ]]; then
    echo "$0: no such file $lang/$f"
    exit 1
  fi
done

# Start every test directory from a clean copy of the base lang directory.
# Both steps are checked: if a stale directory could not be removed, cp -r
# would nest the copy inside it instead of replacing it.
for d in "data/lang${lang_suffix}_test_bd_"{tg,tgpr,tgconst,fg,fgpr,fgconst}; do
  rm -rf -- "$d" || exit 1
  cp -r -- "$lang" "$d" || exit 1
done

# Parameters needed for ConstArpaLm.
unk=$(cat "$lang/oov.int")
# Match the symbol exactly on the first field; a substring grep could return
# several lines if another word happens to contain "<s>" or "</s>".
bos=$(awk '$1 == "<s>" {print $2; exit}' "$lang/words.txt")
eos=$(awk '$1 == "</s>" {print $2; exit}' "$lang/words.txt")
if [[ -z $bos || -z $eos ]]; then
  echo "$0: <s> and </s> symbols are not in $lang/words.txt"
  exit 1
fi

# Compiles a gzipped ARPA LM into an FST.
# Globals:   lang (read)
# Arguments: $1 - gzipped ARPA file, $2 - output G.fst path
# Exits the script with status 1 if the arpa2fst pipeline fails.
arpa_gz_to_fst() {
  local arpa_gz=$1
  local fst_out=$2
  # Be careful: this time we dispense with the grep -v '<s> <s>' so this might
  # not work for LMs generated from all toolkits.
  gunzip -c "$arpa_gz" \
    | arpa2fst --disambig-symbol=#0 \
        --read-symbol-table="$lang/words.txt" - "$fst_out" || exit 1
  # Diagnostic output only: the exit status is deliberately not checked.
  fstisstochastic "$fst_out"
}

# Builds a ConstArpaLm (G.carpa) from a gzipped ARPA LM.
# Globals:   lang, bos, eos, unk (read)
# Arguments: $1 - gzipped ARPA file, $2 - output G.carpa path
# Exits the script with status 1 if the conversion pipeline fails.
arpa_gz_to_carpa() {
  local arpa_gz=$1
  local carpa_out=$2
  gunzip -c "$arpa_gz" \
    | utils/map_arpa_lm.pl "$lang/words.txt" \
    | arpa-to-const-arpa --bos-symbol="$bos" --eos-symbol="$eos" \
        --unk-symbol="$unk" - "$carpa_out" || exit 1
}

test_prefix="data/lang${lang_suffix}_test_bd"

# 3-gram models: pruned, unpruned, and ConstArpaLm for the unpruned one.
arpa_gz_to_fst "$lm_srcdir_3g/lm_pr6.0.gz" "${test_prefix}_tgpr/G.fst"
arpa_gz_to_fst "$lm_srcdir_3g/lm_unpruned.gz" "${test_prefix}_tg/G.fst"
arpa_gz_to_carpa "$lm_srcdir_3g/lm_unpruned.gz" "${test_prefix}_tgconst/G.carpa"

# 4-gram models: unpruned, ConstArpaLm for the unpruned one, and pruned.
arpa_gz_to_fst "$lm_srcdir_4g/lm_unpruned.gz" "${test_prefix}_fg/G.fst"
arpa_gz_to_carpa "$lm_srcdir_4g/lm_unpruned.gz" "${test_prefix}_fgconst/G.carpa"
arpa_gz_to_fst "$lm_srcdir_4g/lm_pr7.0.gz" "${test_prefix}_fgpr/G.fst"

exit 0;