Blame view
scripts/rnnlm/validate_config_dir.sh
3.33 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
#!/usr/bin/env bash

# This script validates the inputs for RNNLM training.
#
# Usage: validate_config_dir.sh <text-dir> <rnnlm-config-dir>
#
# Checks performed, in order:
#   1. words.txt, data_weights.txt, oov.txt and xconfig exist in
#      <rnnlm-config-dir>.
#   2. <text-dir> passes rnnlm/validate_text_dir.py.
#   3. features.txt (if present) passes rnnlm/validate_features.py.
#   4. words.txt can be read by utils/int2sym.pl.
#   5. rnnlm/ensure_counts_present.sh succeeds on <text-dir>.
#   6. rnnlm/get_unigram_probs.py accepts the data-weights file.
#   7. oov.txt is either empty or holds exactly one word that is in words.txt.
#   8. xconfig has no line starting with 'fixed-affine-layer'.
#
# Helper programs are invoked through the relative paths rnnlm/ and utils/,
# so the script has to be run from a directory that contains them.
#
# Exit status: 0 when every check passes; non-zero otherwise.

if [ $# != 2 ]; then
  echo "Usage: $0 <text-dir> <rnnlm-config-dir>"
  echo "Validates <text-dir> and <rnnlm-config-dir>."
  echo "<text-dir> should be as validated by validate_data_dir.py."
  echo "<rnnlm-config-dir> contains various smallish user-provided"
  echo "files needed for RNNLM training:"
  echo " words.txt (vocabulary file with mapping to integers)"
  echo " features.txt [optional] File as generated by choose_features.py,"
  echo " that determines the feature represenration."
  echo " If not present, no feature representation will be "
  echo " used, and each word's embedding is trained separately."
  echo " data_weights.txt File containing data multiplicities and"
  echo " weighting factors for all data sources in <text-dir>,"
  echo " except 'dev'. e.g. with lines like"
  echo " switchboard 1 0.5"
  echo " Weights do not have to sum to one and can be greater"
  echo " than one."
  echo " oov.txt Must either be a empty file, or contain the"
  echo " written representation of the unknown word, usually"
  echo " <unk> (only relevant if <text-dir> contains words not"
  echo " present in words.txt)."
  echo " xconfig File containing xconfig representation of the"
  echo " RNNLM to be created, as could be provided to"
  echo " xconfig_to_configs.py"
  exit 1
fi

text_dir=$1
config_dir=$2

# From here on, any unguarded command that fails aborts the script.
# Note that commands used as `if` conditions are exempt from this, so every
# such branch below must exit explicitly.
set -e

# Mandatory files.
for f in words.txt data_weights.txt oov.txt xconfig; do
  if [ ! -f "$config_dir/$f" ]; then
    echo "$0: file $config_dir/$f is not present."
    exit 1
  fi
done

rnnlm/validate_text_dir.py --spot-check=true "$text_dir"

if [ -f "$config_dir/features.txt" ]; then
  # features.txt is optional.
  rnnlm/validate_features.py "$config_dir/features.txt"
fi

# Basic check of words.txt: make sure int2sym.pl can use it to map an integer.
# NOTE: a fuller check of words.txt (two fields per line, integer second
# field, positions of bos/eos/brk) is not implemented in this script.
if ! echo 0 | utils/int2sym.pl "$config_dir/words.txt" >/dev/null; then
  echo "$0: detected a problem in $config_dir/words.txt"
  exit 1
fi

rnnlm/ensure_counts_present.sh "$text_dir"

# The unknown word is the first whitespace-separated field on the first line
# of oov.txt (empty string if the file is empty).  Reading it this way, and
# quoting it below, keeps it a single argument and prevents glob expansion.
unk_word=$(awk 'NR == 1 { print $1; exit }' "$config_dir/oov.txt")

# rnnlm/get_unigram_probs.py validates the data-weights file, so we're
# relying on that check rather than writing a special one.
if ! rnnlm/get_unigram_probs.py --vocab-file="$config_dir/words.txt" \
     --unk-word="$unk_word" \
     --data-weights-file="$config_dir/data_weights.txt" \
     "$text_dir" >/dev/null; then
  echo "$0: detected problem, most likely with data-weights file $config_dir/data_weights.txt"
  echo " ... see errors above."
  # Without this the failure would be reported but the script would still
  # go on to announce success and return 0.
  exit 1
fi

if [ -s "$config_dir/oov.txt" ]; then
  # oov.txt is nonempty: it must consist of exactly one line holding exactly
  # one field...
  if ! awk '{if (NF!=1){ exit (1) }} END{if(NR != 1) exit(1)}' <"$config_dir/oov.txt"; then
    echo "$0: $config_dir/oov.txt does not look right."
    exit 1
  fi
  # ... and that word must be mappable through words.txt.
  if ! utils/sym2int.pl "$config_dir/words.txt" <"$config_dir/oov.txt" >/dev/null; then
    echo "$0: the word in $config_dir/oov.txt does not exist in $config_dir/words.txt: '$(cat "$config_dir/oov.txt")'"
    exit 1
  fi
fi

# grep's output is deliberately not suppressed, so any offending xconfig
# lines are shown ahead of the error message.
if grep '^\s*fixed-affine-layer' "$config_dir/xconfig"; then
  echo "$0: $config_dir/xconfig cannot contain a layer of type fixed-affine-layer."
  exit 1
fi

echo "$0: validated config dir $config_dir"
exit 0