#!/usr/bin/env bash
# ensure_counts_present.sh

# This script makes sure that a <text-dir>, as validated by validate_text_dir.py,
# has unigram counts present (*.counts).


# Exactly one positional argument (the text directory) is required; with any
# other argument count, print the usage text to stdout and exit with status 1.
if (( $# != 1 )); then
  printf '%s\n' \
    "Usage: $0 <text-dir>" \
    "Makes sure unigram counts (*.counts) are present in <text-dir>," \
    "and if not, sets them up."
  exit 1
fi


dir=$1

# generate_counts <text-file> <counts-file>
#
# Writes one "<word> <count>" line to <counts-file> for every distinct
# whitespace-separated token of <text-file>; each input line additionally
# contributes one "</s>" token.
#
# The body runs in a subshell (note the parentheses) so that 'pipefail' is
# only active here: the function returns non-zero if ANY stage of the pipeline
# fails, including a failure to open <text-file>.
generate_counts() (
  set -o pipefail
  # The input is redirected rather than passed as an operand so that awk never
  # interprets an unusual file name (e.g. one containing '=').
  awk '{for (i = 1; i <= NF; i++) {print $i;} print "</s>"}' < "$1" |
    sort | uniq -c | awk '{print $2,$1}' > "$2"
)

# ensure_counts_present <text-dir>
#
# For every regular file <text-dir>/NAME.txt, (re)generates
# <text-dir>/NAME.counts if it is missing or older than the .txt file.
# Progress and error messages go to stderr.
#
# Returns 0 if every needed counts file was generated (or was already up to
# date), 1 if <text-dir> is empty or generating any counts file failed.
ensure_counts_present() {
  local text_dir=$1
  local f counts_file
  local status=0

  # An empty argument would make the glob below expand to /*.txt.
  if [[ -z "$text_dir" ]]; then
    echo "$0: <text-dir> must not be empty" 1>&2
    return 1
  fi

  # Glob directly instead of parsing 'ls' so that paths containing whitespace
  # or glob characters are handled correctly.
  for f in "$text_dir"/*.txt; do
    # Skips the literal pattern when nothing matches, and anything that is not
    # a regular file (e.g. a directory whose name ends in .txt).
    [[ -f "$f" ]] || continue

    counts_file="${f%.txt}.counts"
    if [[ ! -f "$counts_file" || "$counts_file" -ot "$f" ]]; then
      echo "$0: generating counts file for $f" 1>&2
      if ! generate_counts "$f" "$counts_file"; then
        echo "$0: error generating counts file for $f" 1>&2
        # Remove the partial output: it is newer than the text file and would
        # otherwise be considered up to date on the next run.
        rm -f -- "$counts_file"
        status=1
      fi
    fi
  done

  return "$status"
}

ensure_counts_present "$dir"