validate_config_dir.sh
3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env bash
# This script validates the inputs for RNNLM training.
# Require exactly two positional arguments (<text-dir> and <rnnlm-config-dir>);
# otherwise print the usage text to stdout and exit with status 1.
if [ $# -ne 2 ]; then
  echo "Usage: $0 <text-dir> <rnnlm-config-dir>"
  echo "Validates <text-dir> and <rnnlm-config-dir>."
  echo "<text-dir> should be as validated by validate_text_dir.py."
  echo "<rnnlm-config-dir> contains various smallish user-provided"
  echo "files needed for RNNLM training:"
  echo " words.txt (vocabulary file with mapping to integers)"
  echo " features.txt [optional] File as generated by choose_features.py,"
  echo " that determines the feature representation."
  echo " If not present, no feature representation will be "
  echo " used, and each word's embedding is trained separately."
  echo " data_weights.txt File containing data multiplicities and"
  echo " weighting factors for all data sources in <text-dir>,"
  echo " except 'dev'. e.g. with lines like"
  echo " switchboard 1 0.5"
  echo " Weights do not have to sum to one and can be greater"
  echo " than one."
  echo " oov.txt Must either be an empty file, or contain the"
  echo " written representation of the unknown word, usually"
  echo " <unk> (only relevant if <text-dir> contains words not"
  echo " present in words.txt)."
  echo " xconfig File containing xconfig representation of the"
  echo " RNNLM to be created, as could be provided to"
  echo " xconfig_to_configs.py"
  exit 1
fi
# Positional arguments: the text directory and the RNNLM config directory.
text_dir=$1
config_dir=$2

# From here on, abort as soon as any unguarded command fails.
set -e

# Every one of these files must exist as a regular file in the config
# directory.  Expansions are quoted so that directory names containing
# whitespace or glob characters are tested as a single path.
for f in words.txt data_weights.txt oov.txt xconfig; do
  if [ ! -f "$config_dir/$f" ]; then
    echo "$0: file $config_dir/$f is not present."
    exit 1
  fi
done
# Run the text-directory validator.  Its exit status is not tested explicitly,
# so a failure only stops the script when errexit (set -e) is active.
rnnlm/validate_text_dir.py --spot-check=true "$text_dir"

if [ -f "$config_dir/features.txt" ]; then
  # features.txt is optional.
  rnnlm/validate_features.py "$config_dir/features.txt"
fi

# basic check of words.txt: feed the integer 0 through utils/int2sym.pl with
# words.txt as the symbol table and treat a non-zero exit status as a sign
# that the file is broken.  The converted output itself is discarded.
if ! echo 0 | utils/int2sym.pl "$config_dir/words.txt" >/dev/null; then
  echo "$0: detected a problem in $config_dir/words.txt"
  exit 1
fi
rnnlm/ensure_counts_present.sh "$text_dir"

# rnnlm/get_unigram_probs.py validates the data-weights file, so we're
# relying on that check rather than writing a special one.
# Its stdout is discarded; only the exit status is used here.  The contents of
# oov.txt are passed as a single quoted word so they can never be split into
# extra arguments (an empty oov.txt yields an empty --unk-word= value).
if ! rnnlm/get_unigram_probs.py --vocab-file="$config_dir/words.txt" \
     --unk-word="$(cat "$config_dir/oov.txt")" \
     --data-weights-file="$config_dir/data_weights.txt" \
     "$text_dir" >/dev/null; then
  echo "$0: detected problem, most likely with data-weights file $config_dir/data_weights.txt"
  echo " ... see errors above."
  # A detected problem must fail the validation instead of falling through to
  # the final success message.
  exit 1
fi
# TODO: for words.txt: check number of fields per line is 2 and that the
# second is an integer; check that bos, eos and brk are in the
# expected positions.  (None of these checks is implemented in this script.)

if [ -s "$config_dir/oov.txt" ]; then
  # if oov.txt is nonempty, it must consist of exactly one line holding
  # exactly one whitespace-delimited field.
  if ! awk '{if (NF!=1){ exit (1) }} END{if(NR != 1) exit(1)}' <"$config_dir/oov.txt"; then
    echo "$0: $config_dir/oov.txt does not look right."
    exit 1
  fi
  # The word is run through utils/sym2int.pl against words.txt; a non-zero
  # exit status is reported as the word being absent from the vocabulary.
  if ! utils/sym2int.pl "$config_dir/words.txt" <"$config_dir/oov.txt" >/dev/null; then
    echo "$0: the word in $config_dir/oov.txt does not exist in $config_dir/words.txt: '$(cat "$config_dir/oov.txt")'"
    exit 1
  fi
fi
# Reject an xconfig in which any line starts (after optional whitespace) with
# fixed-affine-layer.  grep is deliberately not silenced here, so the matching
# lines are printed ahead of the error message.  [[:space:]] is the POSIX
# spelling of the GNU-only \s escape.
if grep '^[[:space:]]*fixed-affine-layer' "$config_dir/xconfig"; then
  echo "$0: $config_dir/xconfig cannot contain a layer of type fixed-affine-layer."
  exit 1
fi

# All checks passed.
echo "$0: validated config dir $config_dir"
exit 0