change_vocab.sh
2.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env bash
# Copyright 2018 Xiaohui Zhang
# This script prepares a new rnnlm-dir commpatible with a new vocab from a provided word-list,
# given an exisiting rnnlm-dir containing a trained rnnlm. Basically, we copy the feature
# embedding, a trained rnnlm and some config files from the old rnnlm-dir. And then we re-
# generate the unigram_probs.txt (a fixed unigram prob is assigned to words out of the orignal vocab),
# word_feats.txt and word embeddings.
cmd=run.pl
oov_unigram_prob=0.0000001
. utils/parse_options.sh
if [ $# != 3 ]; then
echo "Usage: $0 [options] <word-list> <rnnlm-in-dir> <rnnlm-out-dir>"
echo "Prepare a new directory <rnnlm-out-dir> with a given <word-list> and a valid <rnnlm-in-dir>."
echo " <word-list> is a vocabulary file with mapping to integers."
exit 1
fi
set -e
[ -f path.sh ] && . ./path.sh
word_list=$1
rnnlm_in_dir=$2
rnnlm_out_dir=$3
for f in features.txt data_weights.txt oov.txt xconfig; do
if [ ! -f $rnnlm_in_dir/config/$f ]; then
echo "$0: file $rnnlm_in_dir/config/$f is not present."
exit 1
fi
done
for f in unigram_probs.txt feat_embedding.final.mat final.raw; do
if [ ! -f $rnnlm_in_dir/$f ]; then
echo "$0: file $rnnlm_in_dir/$f is not present."
exit 1
fi
done
echo "$0: Copying config directory."
mkdir -p $rnnlm_out_dir/config
for f in features.txt data_weights.txt oov.txt xconfig; do
cp $rnnlm_in_dir/config/$f $rnnlm_out_dir/config
done
for f in feat_embedding.final.mat final.raw; do
cp -L $rnnlm_in_dir/$f $rnnlm_out_dir/
done
echo "$0: Re-generating words.txt, unigram_probs.txt, word_feats.txt and word_embedding.final.mat."
cp $word_list $rnnlm_out_dir/config/words.txt
brk_id=`cat $rnnlm_out_dir/config/words.txt | wc -l`
echo "<brk> $brk_id" >> $rnnlm_out_dir/config/words.txt
# Generate new unigram_probs.txt. For words within the original vocab, we just take the prob
# from the original unigram_probs.txt. For new words added, we assign the prob as $oov_unigram_prob.
awk -v s=$rnnlm_in_dir/unigram_probs.txt -v t=$rnnlm_in_dir/config/words.txt -v oov_prob=$oov_unigram_prob \
'BEGIN { while ((getline<s) > 0) { id2prob[$1] = $2; }
while ((getline<t) > 0) { word2prob[$1] = id2prob[$2]; }
}
{ if ($1 in word2prob) print $2" "word2prob[$1]; else print $2" "oov_prob; }' \
$rnnlm_out_dir/config/words.txt | sort -k1,1 -n > $rnnlm_out_dir/unigram_probs.txt
rnnlm/get_special_symbol_opts.py < $rnnlm_out_dir/config/words.txt > $rnnlm_out_dir/special_symbol_opts.txt
# Re-compute words_feats.txt and word embeddings.
rnnlm/get_word_features.py --unigram-probs=$rnnlm_out_dir/unigram_probs.txt --treat-as-bos='#0' \
$rnnlm_out_dir/config/words.txt $rnnlm_out_dir/config/features.txt > $rnnlm_out_dir/word_feats.txt
rnnlm-get-word-embedding $rnnlm_out_dir/word_feats.txt $rnnlm_out_dir/feat_embedding.final.mat \
$rnnlm_out_dir/word_embedding.final.mat