Blame view
egs/ptb/s5/local/rnnlm/train_backoff_lm2.sh
1.85 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
#!/bin/bash

# This script trains a small, pruned n-gram backoff LM to be used for sampling
# purposes during RNNLM training.  It uses the C++ tool that we wrote for this
# purpose (rnnlm-get-sampling-lm), which creates an LM optimized specifically
# for this task.
#
# To be run from the directory egs/ptb/s5, after prepare_rnnlm_data.sh.
# This will all later be refactored.
#
# Reads:   data/text/ptb.txt, data/vocab/words.txt
# Writes:  $dir/sampling.lm

dir=exp/rnnlm_data_prep
words=data/vocab/words.txt
text=data/text/ptb.txt

. ./path.sh

# Strict mode is switched on only after path.sh has been sourced, as before.
# pipefail makes a failing 'tail' in the vocab-size pipeline below abort the
# script instead of being masked by awk's zero exit status.
set -e -o pipefail

# Fail early, with a readable message, if the inputs are missing or empty.
for f in "$words" "$text"; do
  if [ ! -s "$f" ]; then
    echo "$0: expected non-empty file $f (run prepare_rnnlm_data.sh first?)" >&2
    exit 1
  fi
done

mkdir -p "$dir"

# When we start running this with multiple datasets we'll have to
# incorporate the data weights.  This is trivial.

# old version that writes the ARPA file:
#rnnlm-get-sampling-lm --discounting-constant=1.0 \
#  --unigram-factor=200.0 --backoff-factor=2.0 \
#  "cat data/text/ptb.txt | utils/sym2int.pl data/vocab/words.txt | awk '{print 1.0, \$0}' |" \
#  data/vocab/words.txt "| gzip -c > $dir/lm.arpa.gz"

# Vocabulary size = (last field of the last line of words.txt) + 1.
# NOTE(review): this assumes the last line carries the highest integer id.
vocab_size=$(tail -n 1 "$words" | awk '{print $NF + 1}')
if ! [[ "$vocab_size" =~ ^[0-9]+$ ]]; then
  echo "$0: could not determine vocabulary size from $words (got '$vocab_size')" >&2
  exit 1
fi

# The quoted argument ending in '|' is passed to the tool as a single string
# (an input pipe); the awk stage prepends the constant 1.0 to every line of
# integerized text.
rnnlm-get-sampling-lm --discounting-constant=1.0 \
  --unigram-factor=200.0 --backoff-factor=2.0 \
  --vocab-size="$vocab_size" \
  "cat $text | utils/sym2int.pl $words | awk '{print 1.0, \$0}' |" \
  "$dir/sampling.lm"

exit 0

############## BEYOND HERE IS COMMENT
# (Everything below is notes only; it is kept as real comments so it can never
# be executed by accident.)

# baseline speed test with pocolm-based LM:
#   time rnnlm-get-egs --vocab-size=10003 data/vocab/words.txt 'gunzip -c data/pocolm/trigram_100k.arpa.gz|' 'head -n 10000 exp/rnnlm_data_prep/text/1.txt|' ark:/dev/null
# 23 seconds

# and the size of the pocolm lm:
#   gunzip -c data/pocolm/trigram_100k.arpa.gz|head -n 4
#
#   \data\
#   ngram 1=10001
#   ngram 2=69769
#   ngram 3=23799

# the size of our LM:
#   gunzip -c exp/rnnlm_data_prep/lm.arpa.gz|head -n 4
#
#   \data\
#   ngram 1=10002
#   ngram 2=17697
#   ngram 3=14678

#   time rnnlm-get-egs --vocab-size=10003 data/vocab/words.txt 'gunzip -c exp/rnnlm_data_prep/lm.arpa.gz|' 'head -n 10000 exp/rnnlm_data_prep/text/1.txt|' ark:/dev/null
# 9.8 seconds.