validate_image_dir.sh
2.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env bash

# Validates a directory containing training or test images for
# image-classification tasks with fixed-size images.
#
# Usage: validate_image_dir.sh <image-dir-to-validate>
# e.g.:  validate_image_dir.sh data/cifar10_train
#
# The directory must contain the following nonempty files:
#   images.scp    - one image matrix per line; the first field is the key.
#                   It is read with feat-to-dim / feat-to-len as an "scp:" input.
#   labels.txt    - first field is the key (must match images.scp line by
#                   line); the second field is mapped through classes.txt
#                   with utils/int2sym.pl.
#   classes.txt   - exactly two fields per line, the second field being the
#                   zero-based line index (0, 1, 2, ...).
#   num_channels  - a single number > 0.
#
# Checks performed:
#   * the number of matrix columns is a multiple of num_channels;
#   * the first and last images have the same number of rows and columns;
#   * keys in labels.txt are not repeated;
#   * every label in labels.txt can be mapped through classes.txt.
#
# If ./path.sh exists it is sourced first (presumably to put feat-to-dim and
# feat-to-len on PATH -- confirm against your setup).
#
# Exit status: 0 if the directory validates, 1 otherwise.  All messages are
# written to stdout.

if [ $# -ne 1 ]; then
  echo "Usage: $0 <image-dir-to-validate>"
  echo "e.g.: $0 data/cifar10_train"
  # Without this exit the script would continue with an empty/wrong $dir.
  exit 1
fi

dir=$1

[ -e ./path.sh ] && . ./path.sh

# "$dir" must be quoted: an empty unquoted value turns the test into
# `[ ! -d ]`, which is false and would let the check pass silently.
if [ ! -d "$dir" ]; then
  echo "$0: directory $dir does not exist."
  exit 1
fi

for f in images.scp labels.txt classes.txt num_channels; do
  if [ ! -s "$dir/$f" ]; then
    echo "$0: expected file $dir/$f to exist and be nonempty"
    exit 1
  fi
done

num_channels=$(cat "$dir/num_channels")

if ! [[ $num_channels -gt 0 ]]; then
  echo "$0: expected the file $dir/num_channels to contain a number >0"
  exit 1
fi

paf="--print-args=false"

# Dimensions of the first image.  The tool output is checked explicitly so
# that a failed or missing tool yields a clear message rather than an
# arithmetic syntax error further down.
num_cols=$(head -n 1 "$dir/images.scp" | feat-to-dim "$paf" scp:- -)
if ! [[ $num_cols =~ ^[1-9][0-9]*$ ]]; then
  echo "$0: could not get the number of columns of the first image in $dir/images.scp (got '$num_cols')"
  exit 1
fi

if [[ $((num_cols % num_channels)) != 0 ]]; then
  echo "$0: expected the number of columns in the image matrices ($num_cols) to "
  echo " be a multiple of the number of channels ($num_channels)"
  exit 1
fi

num_rows=$(head -n 1 "$dir/images.scp" \
  | feat-to-len "$paf" scp:- ark,t:- \
  | awk '{print $2}')
if ! [[ $num_rows =~ ^[1-9][0-9]*$ ]]; then
  echo "$0: could not get the number of rows of the first image in $dir/images.scp (got '$num_rows')"
  exit 1
fi

# Each matrix row holds one image column; the row width is height * channels.
height=$((num_cols / num_channels))

echo "$0: images are width=$num_rows by height=$height, with $num_channels channels (colors)."

# Keys (first fields) must be identical, line by line, in both files.
if ! cmp <(awk '{print $1}' "$dir/images.scp") <(awk '{print $1}' "$dir/labels.txt"); then
  echo "$0: expected the first fields of $dir/images.scp and $dir/labels.txt to match up."
  exit 1
fi

# Spot-check consistency: the last image must have the same dimensions as
# the first one.  An empty result compares as 0 and therefore fails.
last_cols=$(tail -n 1 "$dir/images.scp" | feat-to-dim "$paf" scp:- -)
if ! [[ $num_cols -eq $last_cols ]]; then
  echo "$0: the number of columns in the image matrices is not consistent."
  exit 1
fi

last_rows=$(tail -n 1 "$dir/images.scp" \
  | feat-to-len "$paf" scp:- ark,t:- \
  | awk '{print $2}')
if ! [[ $num_rows -eq $last_rows ]]; then
  echo "$0: the number of rows in the image matrices is not consistent."
  exit 1
fi

# Note: we don't require images.scp and labels.txt to be sorted, but they
# must not contain repeated keys.  Only labels.txt is scanned here; the keys
# of images.scp were already verified above to be identical to it.
awk '{if($1 in a) { print "validate_image_dir.sh: key " $1 " is repeated in labels.txt, line is: " $0; exit 1; } a[$1]=1; }' \
  <"$dir/labels.txt" || exit 1

# Every label (field 2 of labels.txt) must be mappable through classes.txt.
if ! utils/int2sym.pl -f 2 "$dir/classes.txt" <"$dir/labels.txt" >/dev/null; then
  echo "$0: classes.txt may have the wrong format or may not cover all labels in $dir/labels.txt"
  exit 1
fi

# classes.txt lines must have exactly two fields, the second being the
# zero-based line index.
if ! awk '{ if(NF != 2 || $2 != NR-1) { print "Bad line of classes.txt: " $0; exit(1); }}' "$dir/classes.txt"; then
  echo "$0: $dir/classes.txt doesn't look right."
  exit 1
fi

echo "$0: validated image-data directory $dir"
exit 0