#!/usr/bin/env bash
# This script prepares some things needed by rnnlm/train_rnnlm.sh, e.g. it initializes
# the model and ensures we have the split-up integerized data on disk.
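#
# Example invocation (hypothetical paths, for illustration only):
#   rnnlm/prepare_rnnlm_dir.sh data/rnnlm/text exp/rnnlm/config exp/rnnlm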
cmd=run.pl
stage=0
sampling=true   # add the option --sampling false to disable creation
                # of sampling.lm.
words_per_split=5000000   # aim for training on 5 million words per job.  The
                          # aim is to have the jobs last at least a couple of
                          # minutes.  If this leads to having just one job,
                          # we repeat the archive as many times as needed to
                          # get the target length.
unigram_factor=100.0   # Option used when pruning the LM used for sampling.
                       # You can increase this, e.g. to 200 or 400, if the LM
                       # used for sampling is too big, causing rnnlm-get-egs
                       # to take up too many CPUs' worth of compute.
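
# utils/parse_options.sh below lets callers override the defaults above from
# the command line; hyphens in option names map to underscores in variable
# names.  For example (hypothetical values):
#   rnnlm/prepare_rnnlm_dir.sh --sampling false --words-per-split 2000000 \
#     <text-dir> <rnnlm-config-dir> <rnnlm-dir>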
. utils/parse_options.sh
if [ $# != 3 ]; then
echo "Usage: $0 [options] <text-dir> <rnnlm-config-dir> <rnnlm-dir>"
echo "Sets up the directory <rnnlm-dir> for RNNLM training as done by"
echo "rnnlm/train_rnnlm.sh, and initializes the model."
echo " <text-dir> is as validated by rnnlm/validate_text_dir.py"
echo " <rnnlm-config-dir> is as validated by rnnlm/validate_config_dir.sh."
exit 1
fi
text_dir=$1
config_dir=$2
dir=$3
set -e
. ./path.sh
if [ $stage -le 0 ]; then
echo "$0: validating input"
rnnlm/validate_text_dir.py --spot-check=true $text_dir
rnnlm/validate_config_dir.sh $text_dir $config_dir
if ! mkdir -p $dir; then
echo "$0: could not create RNNLM dir $dir"
fi
fi
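
# For reference: <text-dir> is expected to contain one or more training files
# such as foo.txt (one sentence per line, words space-separated) plus a
# held-out dev.txt, e.g. (hypothetical listing):
#   data/rnnlm/text/{switchboard.txt,fisher.txt,dev.txt}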
if [ $stage -le 1 ]; then
  if [ $config_dir != $dir/config ]; then
    echo "$0: copying config directory"
    mkdir -p $dir/config
    # copy expected things from $config_dir to $dir/config.
    for f in words.txt data_weights.txt oov.txt xconfig; do
      cp $config_dir/$f $dir/config
    done
    # features.txt is optional; check for it separately.
    if [ -f $config_dir/features.txt ]; then
      cp $config_dir/features.txt $dir/config
    fi
  fi
  rnnlm/get_special_symbol_opts.py < $dir/config/words.txt > $dir/special_symbol_opts.txt
fi
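
# For reference: special_symbol_opts.txt holds options, derived from words.txt,
# that later binaries consume; typically something like (illustrative values):
#   --bos-symbol=1 --eos-symbol=2 --brk-symbol=3
# i.e. the integer ids of the <s>, </s> and <brk> entries in words.txt.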
if [ $stage -le 2 ]; then
  if [ ! -f $text_dir/dev.counts ] || [ $text_dir/dev.counts -ot $text_dir/dev.txt ]; then
    echo "$0: preparing unigram counts in $text_dir"
    rnnlm/get_unigram_counts.sh $text_dir
  fi
fi
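
# (The -ot test above means "older than": the counts get regenerated whenever
# dev.txt is newer than dev.counts.  Each *.counts file holds per-word unigram
# counts for the corresponding *.txt file.)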
if [ $stage -le 3 ]; then
  if [ -f $dir/config/features.txt ]; then
    # prepare word-level features in $dir/word_feats.txt;
    # first we need the appropriately weighted unigram counts.
    if awk '{if($2 == "unigram"){saw_unigram=1;}} END{exit(saw_unigram ? 0 : 1)}' $dir/config/features.txt; then
      # we need the unigram probabilities
      rnnlm/ensure_counts_present.sh $text_dir
      rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
        --unk-word="$(cat $dir/config/oov.txt)" \
        --data-weights-file=$dir/config/data_weights.txt $text_dir \
        >$dir/unigram_probs.txt
      unigram_opt="--unigram-probs=$dir/unigram_probs.txt"
    else
      unigram_opt=
    fi
    rnnlm/get_word_features.py $unigram_opt --treat-as-bos='#0' \
      $dir/config/words.txt $dir/config/features.txt >$dir/word_feats.txt
  else
    [ -f $dir/word_feats.txt ] && rm $dir/word_feats.txt
  fi
fi
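
# For reference, a sketch of the formats involved (illustrative only): each
# line of features.txt is "<feat-id> <feat-type> [args]", e.g. a line
# "0<tab>constant" declares a constant feature, and the presence of a
# "unigram"-type feature is what triggers the unigram-probability branch
# above; word_feats.txt then lists, for each word id, its nonzero
# "<feat-id> <value>" pairs.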
if [ $stage -le 4 ]; then
echo "$0: preparing split-up integerized text data with weights"
num_repeats=1
num_splits=$(rnnlm/get_num_splits.sh $words_per_split $text_dir $dir/config/data_weights.txt)
if [ $num_splits -lt 1 ]; then
# the script outputs the negative of the num-repeats if the num-splits is 1
# and the num-repeats is >1.
num_repeats=$[-num_splits]
num_splits=1
fi
# note: the python script treats the empty unknown word as a special case,
# so if oov.txt is empty we don't have to take any special action.
rnnlm/prepare_split_data.py --unk-word="$(cat $dir/config/oov.txt)" \
--vocab-file=$dir/config/words.txt --data-weights-file=$dir/config/data_weights.txt \
--num-splits=$num_splits $text_dir $dir/text
echo $num_repeats >$dir/text/info/num_repeats
fi
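
# Worked example (hypothetical numbers): with ~20M weighted training words and
# words_per_split=5000000, get_num_splits.sh would print roughly 4, and
# prepare_split_data.py would write $dir/text/1.txt ... $dir/text/4.txt plus
# bookkeeping under $dir/text/info/ (num_splits is read back in stage 7 below).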
if [ $stage -le 5 ]; then
echo "$0: initializing neural net"
mkdir -p $dir/config/nnet
steps/nnet3/xconfig_to_configs.py --xconfig-file=$dir/config/xconfig \
--config-dir=$dir/config/nnet
# initialize the neural net.
nnet3-init $dir/config/nnet/ref.config $dir/0.raw
fi
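
# For orientation, $dir/config/xconfig might look roughly like this (an
# illustrative sketch, not a template; layer names and dims are made up):
#   input dim=512 name=input
#   fast-lstm-layer name=lstm1 cell-dim=512
#   output-layer name=output include-log-softmax=false dim=512
# In this setup the input and output dims both equal the embedding dimension,
# which get_embedding_dim.py reads back from 0.raw below.
# xconfig_to_configs.py compiles the xconfig into nnet3 config files, of which
# ref.config is the one consumed by nnet3-init above.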
embedding_dim=$(rnnlm/get_embedding_dim.py $dir/0.raw)
if [ $stage -le 6 ]; then
echo "$0: initializing embedding matrix"
if [ -f $dir/config/features.txt ]; then
# We are using sparse features.
feat_dim=$(tail -n 1 $dir/config/features.txt | awk '{print $1 + 1;}')
first_element_opt=
if grep -q '0\tconstant' $dir/config/features.txt; then
first_element_opt="--first-element 1.0"
fi
# we'll probably make the stddev configurable soon, or maybe just remove it.
# At some point stability was an issue.
rnnlm/initialize_matrix.pl $first_element_opt --stddev 0.001 \
$feat_dim $embedding_dim > $dir/feat_embedding.0.mat
[ -f $dir/word_embedding.0.mat ] && rm $dir/word_embedding.0.mat
else
vocab_size=$(tail -n 1 $dir/config/words.txt | awk '{print $NF + 1}')
rnnlm/initialize_matrix.pl --first-column 1.0 $vocab_size $embedding_dim > $dir/word_embedding.0.mat
[ -f $dir/feat_embedding.0.mat ] && rm $dir/feat_embedding.0.mat
fi
fi
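
# Worked example of the sizes involved (hypothetical values): if the last line
# of words.txt is "zygote 9999", then vocab_size=10000 (ids are 0-based) and
# word_embedding.0.mat is a 10000 x $embedding_dim matrix; with sparse
# features, feat_embedding.0.mat (feat_dim x $embedding_dim) is created
# instead.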
mkdir -p $dir/log
if [ $stage -le 7 ]; then
  if $sampling; then
    echo "$0: preparing language model for sampling"
    num_splits=$(cat $dir/text/info/num_splits)
    text_files=$(for n in $(seq $num_splits); do echo -n $dir/text/$n.txt ''; done)
    vocab_size=$(tail -n 1 $dir/config/words.txt | awk '{print $NF + 1}')
    special_symbol_opts=$(cat $dir/special_symbol_opts.txt)
    # this prints some nontrivial log information, so run using '$cmd' to
    # ensure the output gets saved.
    # ***NOTE*** we will likely later have to pass in options to this program
    # to control the size of the sampling LM.
    $cmd $dir/log/prepare_sampling_lm.log \
      rnnlm-get-sampling-lm --unigram-factor=$unigram_factor $special_symbol_opts \
      --vocab-size=$vocab_size "cat $text_files|" $dir/sampling.lm
    echo "$0: done estimating LM for sampling."
  else
    [ -f $dir/sampling.lm ] && rm $dir/sampling.lm
  fi
fi
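
# After this script succeeds, $dir contains everything rnnlm/train_rnnlm.sh
# needs: 0.raw, word_embedding.0.mat or feat_embedding.0.mat, the split data
# under $dir/text, and (if enabled) sampling.lm.  A typical next step would be
# something like (hypothetical invocation):
#   rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 2 $dir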