egs/wsj/s5/local/rnnlm/tuning/run_lstm_tdnn_1b.sh

  #!/bin/bash
  
  # Copyright 2012  Johns Hopkins University (author: Daniel Povey)  Tony Robinson
  #           2017  Hainan Xu
  #           2017  Ke Li
  
  # rnnlm/train_rnnlm.sh: best iteration (out of 80) was 79, linking it to final iteration.
  # rnnlm/train_rnnlm.sh: train/dev perplexity was 44.3 / 49.9. 
  # Train objf: -1038.00 -5.35 -5.04 -4.87 -4.76 -4.68 -4.61 -4.56 -4.52 -4.47 -4.44 -4.41 -4.37 -4.35 -4.33 -4.31 -4.29 -4.27 -4.25 -4.24 -4.23 -4.21 -4.19 -4.17 -4.16 -4.15 -4.13 -4.12 -4.11 -4.10 -4.09 -4.07 -4.07 -4.06 -4.05 -4.04 -4.03 -4.02 -4.01 -4.00 -3.99 -3.98 -3.98 -3.97 -3.96 -3.96 -3.95 -3.94 -3.93 -3.93 -3.92 -3.92 -3.91 -3.91 -3.90 -3.90 -3.89 -3.88 -3.88 -3.88 -3.88 -3.88 -3.86 -3.86 -3.85 -3.85 -3.84 -3.83 -3.83 -3.83 -3.82 -3.82 -3.81 -3.81 -3.80 -3.80 -3.79 -3.79 -3.79 -3.79 
  # Dev objf:   -11.73 -5.66 -5.18 -4.96 -4.82 -4.73 -4.66 -4.59 -4.54 -4.51 -4.47 -4.44 -4.40 -4.38 -4.36 -4.34 -4.32 -4.30 -4.28 -4.27 -4.26 -4.21 -4.19 -4.18 -4.16 -4.15 -4.14 -4.13 -4.12 -4.12 -4.11 -4.09 -4.09 -4.08 -4.07 -4.07 -4.06 -4.06 -4.05 -4.04 -4.04 -4.04 -4.03 -4.02 -4.02 -4.01 -4.01 -4.00 -4.00 -4.00 -3.99 -3.99 -3.98 -3.98 -3.98 -3.98 -3.97 -3.97 -3.97 -3.97 -3.96 -3.95 -3.95 -3.94 -3.94 -3.94 -3.94 -3.93 -3.93 -3.93 -3.93 -3.93 -3.93 -3.92 -3.92 -3.92 -3.92 -3.92 -3.91 -3.91 
  
  # WER numbers
  
  # without RNNLM
  # %WER 7.51 [ 618 / 8234, 82 ins, 112 del, 424 sub ] exp/chain/tdnn_lstm1b_sp/decode_looped_tgpr_dev93/wer_10_1.0
  # %WER 5.21 [ 294 / 5643, 55 ins, 34 del, 205 sub ] exp/chain/tdnn_lstm1b_sp/decode_looped_tgpr_eval92/wer_11_0.5
  
  # with RNNLM
  # %WER 5.74 [ 473 / 8234, 81 ins, 76 del, 316 sub ] exp/chain/tdnn_lstm1b_sp/decode_looped_tgpr_dev93_rnnlm/wer_14_1.0
  # %WER 4.27 [ 241 / 5643, 62 ins, 23 del, 156 sub ] exp/chain/tdnn_lstm1b_sp/decode_looped_tgpr_eval92_rnnlm/wer_12_1.0
  
  # Begin configuration section.
  
  dir=exp/rnnlm_lstm_tdnn_1b
  embedding_dim=800
  lstm_rpd=200
  lstm_nrpd=200
  embedding_l2=0.001 # l2 regularization on the embedding layer
  comp_l2=0.001 # component-level l2 regularization
  output_l2=0.001 # l2 regularization on the output layer
  epochs=20
  stage=-10
  train_stage=-10
  
  # variables for rnnlm rescoring
  ac_model_dir=exp/chain/tdnn_lstm1b_sp
  ngram_order=4
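  # (ngram_order bounds the n-gram approximation used in lattice rescoring:
  # histories that agree on their last ngram_order-1 words are merged, which
  # keeps the rescored lattice from growing exponentially.)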
  decode_dir_suffix=rnnlm
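
  # Usage sketch (a hypothetical invocation; any of the variables above can be
  # overridden on the command line, since utils/parse_options.sh is sourced below):
  #   local/rnnlm/tuning/run_lstm_tdnn_1b.sh --stage 2 --epochs 10 --dir exp/rnnlm_lstm_tdnn_1b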
  
  . ./cmd.sh
  . ./utils/parse_options.sh
  [ -z "$cmd" ] && cmd=$train_cmd
  
  
  text=data/local/dict_nosp_larger/cleaned.gz
  wordlist=data/lang_nosp/words.txt
  text_dir=data/rnnlm/text_nosp
  mkdir -p $dir/config
  set -e
  
  for f in $text $wordlist; do
    [ ! -f $f ] && \
      echo "$0: expected file $f to exist; search for local/wsj_extend_dict.sh in run.sh" && exit 1
  done
  
  if [ $stage -le 0 ]; then
    mkdir -p $text_dir
    echo -n >$text_dir/dev.txt
    # hold out one in every 500 lines as dev data.
    gunzip -c $text  | awk -v text_dir=$text_dir '{if(NR%500 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/wsj.txt
  fi
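
  # (After stage 0, $text_dir should contain wsj.txt -- the bulk of the cleaned
  # corpus, used for training -- and dev.txt, holding every 500th line; the
  # train/dev objf and perplexities quoted at the top are computed on these.)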
  
  if [ $stage -le 1 ]; then
    # the training scripts require that <s>, </s> and <brk> be present in a particular
    # order.
    cp $wordlist $dir/config/
    n=$(wc -l <$dir/config/words.txt)
    echo "<brk> $n" >> $dir/config/words.txt
  
    # words that are not present in words.txt but are in the training or dev data
    # will be mapped to <SPOKEN_NOISE> during training.
    echo "<SPOKEN_NOISE>" >$dir/config/oov.txt
  
    cat > $dir/config/data_weights.txt <<EOF
  wsj   1   1.0
  EOF
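    # (The three fields are, roughly: the name of the training text, matching
    # wsj.txt in $text_dir; how many times that source is repeated per epoch;
    # and the weight its examples get in the training objective.)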
  
    rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
                               --unk-word="<SPOKEN_NOISE>" \
                               --data-weights-file=$dir/config/data_weights.txt \
                               $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt
  
    # choose features
    rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
                             --use-constant-feature=true \
                             --top-word-features=50000 \
                             --min-frequency 1.0e-03 \
                             --special-words='<s>,</s>,<brk>,<SPOKEN_NOISE>' \
                             $dir/config/words.txt > $dir/config/features.txt
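
    # (features.txt lists the sparse word features -- e.g. the constant feature,
    # the special symbols above, and the most frequent words -- from which the
    # word embeddings are derived; the exact set depends on the options given.)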
  
    lstm_opts="l2-regularize=$comp_l2"
    tdnn_opts="l2-regularize=$comp_l2"
    output_opts="l2-regularize=$output_l2"
  
    cat >$dir/config/xconfig <<EOF
  input dim=$embedding_dim name=input
  relu-renorm-layer name=tdnn1 dim=$embedding_dim $tdnn_opts input=Append(0, IfDefined(-1)) 
  fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd $lstm_opts
  relu-renorm-layer name=tdnn2 dim=$embedding_dim $tdnn_opts input=Append(0, IfDefined(-2))
  fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd $lstm_opts
  relu-renorm-layer name=tdnn3 dim=$embedding_dim $tdnn_opts input=Append(0, IfDefined(-1))
  output-layer name=output $output_opts include-log-softmax=false dim=$embedding_dim
  EOF
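    # (A rough reading of the xconfig: the input is the word embedding of dim
    # $embedding_dim; three relu-renorm TDNN layers that splice in earlier time
    # steps alternate with two fast-lstmp layers; the output layer has
    # dim=$embedding_dim and no log-softmax because it is later combined with
    # the word embeddings to produce word probabilities.)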
    rnnlm/validate_config_dir.sh $text_dir $dir/config
  fi
  
  if [ $stage -le 2 ]; then
    # the --unigram-factor option is set larger than the default (100)
    # in order to reduce the size of the sampling LM, because rnnlm-get-egs
    # was taking up too much CPU (as much as 10 cores).
    rnnlm/prepare_rnnlm_dir.sh --unigram-factor 200.0 \
                               $text_dir $dir/config $dir
  fi
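
  # (prepare_rnnlm_dir.sh is expected to populate $dir with the word features
  # and the pruned sampling LM that rnnlm-get-egs uses when sampling words
  # during training; --unigram-factor controls how strongly that LM is pruned.)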
  
  if [ $stage -le 3 ]; then
    rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 3 \
                         --embedding_l2 $embedding_l2 \
                         --stage $train_stage --num-epochs $epochs --cmd "$cmd" $dir
  fi
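
  # (The per-iteration train/dev objf quoted at the top of this file is printed
  # by train_rnnlm.sh and logged under $dir/log; the best iteration is linked
  # to the final model.)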
  
  LM=tgpr
  if [ $stage -le 4 ]; then
    for decode_set in dev93 eval92; do
      decode_dir=${ac_model_dir}/decode_looped_${LM}_${decode_set}
  
      # Lattice rescoring
      rnnlm/lmrescore_pruned.sh \
        --cmd "$decode_cmd --mem 4G" \
        --weight 0.8 --max-ngram-order $ngram_order \
        data/lang_test_$LM $dir \
        data/test_${decode_set}_hires ${decode_dir} \
        ${decode_dir}_${decode_dir_suffix} &
    done
    wait
  fi
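
  # A hypothetical way to summarize the rescored results, mirroring the WER
  # lines quoted near the top of this file:
  #   for d in ${ac_model_dir}/decode_looped_${LM}_*_${decode_dir_suffix}; do
  #     grep WER $d/wer_* | utils/best_wer.sh
  #   done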
  
  exit 0