#!/usr/bin/env bash

# scripts/rnnlm/prepare_rnnlm_dir.sh
#
# This script prepares some things needed by rnnlm/train_rnnlm.sh, e.g. it
# initializes the model and ensures we have the split-up integerized data on
# disk.

cmd=run.pl
stage=0
sampling=true   # pass the option --sampling false to disable creation of
                # sampling.lm.
words_per_split=5000000   # aim to train on about 5 million words per job, so
                          # that each job lasts at least a couple of minutes.
                          # If this would give just one job, we instead repeat
                          # the single archive as many times as needed to
                          # reach the target length.
unigram_factor=100.0   # Used when pruning the LM used for sampling.  You can
                       # increase this, e.g. to 200 or 400, if the sampling LM
                       # is too big, causing rnnlm-get-egs to take up too many
                       # CPUs' worth of compute.

. utils/parse_options.sh

if [ $# != 3 ]; then
  echo "Usage: $0 [options] <text-dir> <rnnlm-config-dir> <rnnlm-dir>"
  echo "Sets up the directory <rnnlm-dir> for RNNLM training as done by"
  echo "rnnlm/train_rnnlm.sh, and initializes the model."
  echo " <text-dir> is as validated by rnnlm/validate_text_dir.py."
  echo " <rnnlm-config-dir> is as validated by rnnlm/validate_config_dir.sh."
  exit 1
fi

text_dir=$1
config_dir=$2
dir=$3

set -e
. ./path.sh

if [ $stage -le 0 ]; then
  echo "$0: validating input"

  rnnlm/validate_text_dir.py --spot-check=true $text_dir

  rnnlm/validate_config_dir.sh $text_dir $config_dir

  if ! mkdir -p $dir; then
    echo "$0: could not create RNNLM dir $dir"
    exit 1
  fi
fi

if [ $stage -le 1 ]; then
  if [ $config_dir != $dir/config ]; then
    echo "$0: copying config directory"
    mkdir -p $dir/config
    # copy expected things from $config_dir to $dir/config.
    for f in words.txt data_weights.txt oov.txt xconfig; do
      cp $config_dir/$f $dir/config
    done
    # features.txt is optional; check for it separately.
    if [ -f $config_dir/features.txt ]; then
      cp $config_dir/features.txt $dir/config
    fi
  fi

  rnnlm/get_special_symbol_opts.py < $dir/config/words.txt > $dir/special_symbol_opts.txt
fi

if [ $stage -le 2 ]; then
  if [ ! -f $text_dir/dev.counts ] || [ $text_dir/dev.counts -ot $text_dir/dev.txt ]; then
    echo "$0: preparing unigram counts in $text_dir"
    rnnlm/get_unigram_counts.sh $text_dir
  fi
fi

if [ $stage -le 3 ]; then
  if [ -f $dir/config/features.txt ]; then
    # prepare word-level features in $dir/word_feats.txt.
    # First we need the appropriately weighted unigram counts, but only if
    # features.txt declares a 'unigram' feature.
    if awk '{if ($2 == "unigram") saw_unigram=1;} END{exit(saw_unigram ? 0 : 1)}' \
        $dir/config/features.txt; then
      # we need the unigram probabilities.
      rnnlm/ensure_counts_present.sh $text_dir
      rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
        --unk-word="$(cat $dir/config/oov.txt)" \
        --data-weights-file=$dir/config/data_weights.txt $text_dir \
        >$dir/unigram_probs.txt
      unigram_opt="--unigram-probs=$dir/unigram_probs.txt"
    else
      unigram_opt=
    fi

    rnnlm/get_word_features.py $unigram_opt --treat-as-bos='#0' \
      $dir/config/words.txt $dir/config/features.txt >$dir/word_feats.txt
  else
    [ -f $dir/word_feats.txt ] && rm $dir/word_feats.txt
  fi
fi

if [ $stage -le 4 ]; then
  echo "$0: preparing split-up integerized text data with weights"

  num_repeats=1
  num_splits=$(rnnlm/get_num_splits.sh $words_per_split $text_dir $dir/config/data_weights.txt)
  if [ $num_splits -lt 1 ]; then
    # get_num_splits.sh outputs the negative of the num-repeats if the
    # num-splits is 1 and the num-repeats is >1.
    num_repeats=$((-num_splits))
    num_splits=1
  fi
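
  # Worked example of the encoding above (illustrative numbers): if
  # get_num_splits.sh prints 8, we train on 8 splits, each seen once; if it
  # prints -3, that means one split repeated 3 times, and the lines above
  # decode it to num_splits=1, num_repeats=3.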
  # note: the python script treats the empty unknown word as a special case,
  # so if oov.txt is empty we don't have to take any special action.
  rnnlm/prepare_split_data.py --unk-word="$(cat $dir/config/oov.txt)" \
    --vocab-file=$dir/config/words.txt --data-weights-file=$dir/config/data_weights.txt \
    --num-splits=$num_splits $text_dir $dir/text

  echo $num_repeats >$dir/text/info/num_repeats
fi

if [ $stage -le 5 ]; then
  echo "$0: initializing neural net"
  mkdir -p $dir/config/nnet
  steps/nnet3/xconfig_to_configs.py --xconfig-file=$dir/config/xconfig \
    --config-dir=$dir/config/nnet

  # initialize the neural net.
  nnet3-init $dir/config/nnet/ref.config $dir/0.raw
fi

embedding_dim=$(rnnlm/get_embedding_dim.py $dir/0.raw)

if [ $stage -le 6 ]; then
  echo "$0: initializing embedding matrix"
  if [ -f $dir/config/features.txt ]; then
    # We are using sparse features.  The feature dim is one more than the
    # highest (zero-based) feature index, which is the first field of the
    # last line of features.txt.
    feat_dim=$(tail -n 1 $dir/config/features.txt | awk '{print $1 + 1;}')
    first_element_opt=
    # $'0\tconstant' matches a literal "0<tab>constant" line, i.e. feature 0
    # being the 'constant' feature.
    if grep -q $'0\tconstant' $dir/config/features.txt; then
      first_element_opt="--first-element 1.0"
    fi
    # we'll probably make the stddev configurable soon, or maybe just remove
    # it; at some point stability was an issue.
    rnnlm/initialize_matrix.pl $first_element_opt --stddev 0.001 \
      $feat_dim $embedding_dim >$dir/feat_embedding.0.mat
    [ -f $dir/word_embedding.0.mat ] && rm $dir/word_embedding.0.mat
  else
    # The vocab size is one more than the highest word-id, which is the last
    # field of the final line of words.txt.
    vocab_size=$(tail -n 1 $dir/config/words.txt | awk '{print $NF + 1}')
    rnnlm/initialize_matrix.pl --first-column 1.0 $vocab_size $embedding_dim \
      >$dir/word_embedding.0.mat
    [ -f $dir/feat_embedding.0.mat ] && rm $dir/feat_embedding.0.mat
  fi
fi

mkdir -p $dir/log

if [ $stage -le 7 ]; then
  if $sampling; then
    echo "$0: preparing language model for sampling"
    num_splits=$(cat $dir/text/info/num_splits)
    text_files=$(for n in $(seq $num_splits); do echo -n "$dir/text/$n.txt "; done)
    vocab_size=$(tail -n 1 $dir/config/words.txt | awk '{print $NF + 1}')
    special_symbol_opts=$(cat $dir/special_symbol_opts.txt)
    # this prints some nontrivial log information, so run it via '$cmd' to
    # ensure the output gets saved.
    # ***NOTE*** we will likely later have to pass in options to this program
    # to control the size of the sampling LM.
    # "cat $text_files|" is Kaldi's command-as-input convention: the program
    # reads its text input from the output of that command.
    $cmd $dir/log/prepare_sampling_lm.log \
      rnnlm-get-sampling-lm --unigram-factor=$unigram_factor $special_symbol_opts \
        --vocab-size=$vocab_size "cat $text_files|" $dir/sampling.lm
    echo "$0: done estimating LM for sampling."
  else
    [ -f $dir/sampling.lm ] && rm $dir/sampling.lm
  fi
fi
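
# Example invocation (hypothetical paths, for illustration only), relying on
# the standard Kaldi utils/parse_options.sh convention that a command-line
# option --foo-bar sets the shell variable $foo_bar:
#
#   rnnlm/prepare_rnnlm_dir.sh --words-per-split 2000000 --sampling false \
#     data/rnnlm/text exp/rnnlm/config exp/rnnlm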