Blame view

scripts/rnnlm/validate_config_dir.sh 3.33 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
  #!/usr/bin/env bash
  
  # This script validates the inputs for RNNLM training.
  
  
  # Argument guard: exactly two positional arguments are required
  # (<text-dir> and <rnnlm-config-dir>).  Otherwise print the usage text
  # on stdout and exit with status 1.
  if [ $# -ne 2 ]; then
    # One printf call emits one line per argument; the text documents the
    # files expected inside <rnnlm-config-dir>.
    printf '%s\n' \
      "Usage: $0 <text-dir> <rnnlm-config-dir>" \
      "Validates <text-dir> and <rnnlm-config-dir>." \
      "<text-dir> should be as validated by validate_text_dir.py." \
      "<rnnlm-config-dir> contains various smallish user-provided" \
      "files needed for RNNLM training:" \
      "  words.txt  (vocabulary file with mapping to integers)" \
      "  features.txt  [optional] File as generated by choose_features.py," \
      "              that determines the feature representation." \
      "              If not present, no feature representation will be " \
      "              used, and each word's embedding is trained separately." \
      "  data_weights.txt  File containing data multiplicities and" \
      "     weighting factors for all data sources in <text-dir>," \
      "     except 'dev'.  e.g. with lines like" \
      "       switchboard  1  0.5" \
      "     Weights do not have to sum to one and can be greater" \
      "     than one." \
      "  oov.txt   Must either be an empty file, or contain the" \
      "     written representation of the unknown word, usually" \
      "     <unk> (only relevant if <text-dir> contains words not" \
      "     present in words.txt)." \
      "  xconfig   File containing xconfig representation of the" \
      "     RNNLM to be created, as could be provided to" \
      "     xconfig_to_configs.py"
    exit 1
  fi
  
  
  # Positional arguments: the text directory and the RNNLM config directory.
  text_dir=$1
  config_dir=$2

  # From this point on, abort as soon as an unguarded command fails.
  set -e

  # These files are mandatory in the config directory.  The path is quoted
  # so that a directory name containing whitespace cannot make `[` fail with
  # a usage error (which would silently skip the check).
  for f in words.txt data_weights.txt oov.txt xconfig; do
    if [ ! -f "$config_dir/$f" ]; then
      echo "$0: file $config_dir/$f is not present."
      exit 1
    fi
  done
  
  # Validate the text directory in spot-check mode.  Because `set -e` is in
  # effect, a non-zero exit from the validator aborts this script.
  rnnlm/validate_text_dir.py --spot-check=true "$text_dir"

  if [ -f "$config_dir/features.txt" ]; then
    # features.txt is optional; it is validated only when present.
    rnnlm/validate_features.py "$config_dir/features.txt"
  fi

  # Basic check of words.txt: feed the id 0 through utils/int2sym.pl and
  # treat a non-zero exit status as a problem with the file.  The mapped
  # output itself is discarded.
  if ! echo 0 | utils/int2sym.pl "$config_dir/words.txt" >/dev/null; then
    echo "$0: detected a problem in $config_dir/words.txt"
    exit 1
  fi


  # Run the counts helper on the text directory (presumably it creates any
  # missing count files -- confirm against rnnlm/ensure_counts_present.sh).
  rnnlm/ensure_counts_present.sh "$text_dir"
  
  
  # rnnlm/get_unigram_probs.py validates the data-weights file, so we're
  # relying on that check rather than writing a special one.  Its stdout is
  # discarded; only the exit status is used.
  # Every argument is quoted: an empty oov.txt still yields the single
  # argument '--unk-word=', and stray whitespace in a path or in oov.txt
  # cannot split an option into several words.
  if ! rnnlm/get_unigram_probs.py "--vocab-file=$config_dir/words.txt" \
                             "--unk-word=$(cat "$config_dir/oov.txt")" \
                             "--data-weights-file=$config_dir/data_weights.txt" \
                             "$text_dir" >/dev/null; then
    echo "$0: detected problem, most likely with data-weights file $config_dir/data_weights.txt"
    echo " ... see errors above."
    # A failed check must fail the validation; without this the script would
    # carry on and could still report success.
    exit 1
  fi
  
  # TODO: for words.txt, check that the number of fields per line is 2 and
  # that the second is an integer; check that bos, eos and brk are in the
  # expected positions.  (No code in this part of the script performs that
  # check; what follows validates oov.txt only.)

  if [ -s "$config_dir/oov.txt" ]; then
    # oov.txt is non-empty: it must consist of exactly one line holding
    # exactly one whitespace-delimited field.
    if ! awk 'NF != 1 { exit 1 } END { if (NR != 1) exit 1 }' <"$config_dir/oov.txt"; then
      echo "$0: $config_dir/oov.txt does not look right."
      exit 1
    fi
    # The word must be mappable through words.txt; a non-zero exit from
    # utils/sym2int.pl is taken to mean the word is not in the vocabulary.
    if ! utils/sym2int.pl "$config_dir/words.txt" <"$config_dir/oov.txt" >/dev/null; then
      echo "$0: the word in $config_dir/oov.txt does not exist in $config_dir/words.txt: '$(cat "$config_dir/oov.txt")'"
      exit 1
    fi
  fi
  
  
  # Reject any xconfig line that starts (after optional whitespace) with
  # fixed-affine-layer.  grep's output is left on stdout so the offending
  # line(s) are shown.  The POSIX class [[:space:]] replaces the GNU-only
  # \s escape so the pattern behaves the same across grep implementations.
  if grep '^[[:space:]]*fixed-affine-layer' "$config_dir/xconfig"; then
    echo "$0: $config_dir/xconfig cannot contain a layer of type fixed-affine-layer."
    exit 1
  fi


  # All checks passed.
  echo "$0: validated config dir $config_dir"
  exit 0