egs/ptb/s5/local/rnnlm/train_backoff_lm2.sh
#!/bin/bash

# This script trains a small, pruned n-gram backoff LM to be used for sampling
# purposes during RNNLM training.  It uses the C++ tool we wrote for this,
# which creates an LM optimized specifically for that task.
  
  
dir=exp/rnnlm_data_prep

# To be run from the directory egs/ptb/s5, after prepare_rnnlm_data.sh.
# This will all later be refactored.

. ./path.sh
set -e
  
  
# When we start running this with multiple datasets we'll have to
# incorporate the data weights.  This is trivial; a rough sketch follows.
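
# A hedged sketch, not part of the current recipe: suppose each dataset's
# weight lived in a file like data/text/data_weights.txt, with lines of the
# form "<dataset-name> <weight>" (that file and its format are assumptions
# for illustration only).  Then the constant 1.0 that awk prepends below
# could be replaced by the per-dataset weight:
#
#   for t in data/text/*.txt; do
#     name=$(basename $t .txt)
#     weight=$(awk -v n="$name" '$1 == n {print $2}' data/text/data_weights.txt)
#     cat $t | utils/sym2int.pl data/vocab/words.txt | \
#       awk -v w="$weight" '{print w, $0}'
#   done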
  
  
  
# old version that writes the ARPA file:
#rnnlm-get-sampling-lm --discounting-constant=1.0 \
#  --unigram-factor=200.0 --backoff-factor=2.0 \
#   "cat data/text/ptb.txt | utils/sym2int.pl data/vocab/words.txt | awk '{print 1.0, \$0}' |" \
#   data/vocab/words.txt "| gzip -c > $dir/lm.arpa.gz"
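# (The current version below writes the sampling LM directly to
# $dir/sampling.lm instead of a gzipped ARPA file.)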
  
  
  
# words.txt is a symbol table with lines of the form "<word> <integer-id>",
# sorted by id, so the highest id is on the last line and the vocabulary
# size is that id plus one.
vocab_size=$(tail -n 1 data/vocab/words.txt | awk '{print $NF + 1}')
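# (For reference: the last word-id in PTB's words.txt is presumably 10002,
# matching the hard-coded --vocab-size=10003 in the tests at the bottom of
# this file.)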
  
# The quoted argument is a command pipe (note the trailing '|'): each line of
# ptb.txt is mapped to integer word-ids and prefixed with a corpus weight of 1.0.
rnnlm-get-sampling-lm --discounting-constant=1.0 \
                      --unigram-factor=200.0 --backoff-factor=2.0 \
                      --vocab-size=$vocab_size \
   "cat data/text/ptb.txt | utils/sym2int.pl data/vocab/words.txt | awk '{print 1.0, \$0}' |" \
   $dir/sampling.lm
  
  
exit 0
############## BEYOND HERE IS COMMENT
# (the exit 0 above means nothing past this point ever runs; it is kept for
# reference only.)
  
# baseline speed test with the pocolm-based LM:
time rnnlm-get-egs --vocab-size=10003 data/vocab/words.txt 'gunzip -c data/pocolm/trigram_100k.arpa.gz|' 'head -n 10000 exp/rnnlm_data_prep/text/1.txt|' ark:/dev/null
# 23 seconds
  
  
# and the size of the pocolm LM:
gunzip -c data/pocolm/trigram_100k.arpa.gz | head -n 4
\data\
ngram 1=10001
ngram 2=69769
ngram 3=23799
  
  
  
# the size of our LM (the ARPA file written by the old, commented-out version above):
gunzip -c exp/rnnlm_data_prep/lm.arpa.gz | head -n 4
\data\
ngram 1=10002
ngram 2=17697
ngram 3=14678
  
time rnnlm-get-egs --vocab-size=10003 data/vocab/words.txt 'gunzip -c exp/rnnlm_data_prep/lm.arpa.gz|' 'head -n 10000 exp/rnnlm_data_prep/text/1.txt|' ark:/dev/null
# 9.8 seconds.
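# i.e. our pruned LM has well under half as many n-grams as the pocolm one
# (42,377 vs. 103,569 in total), and rnnlm-get-egs runs more than twice as
# fast with it (9.8 vs. 23 seconds).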