Blame view
scripts/rnnlm/get_num_splits.sh
4 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
#!/bin/bash

# Copyright 2017 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

# This script works out how many pieces we want to split the data into for a
# particular training run, based on how many words are in the data directory
# (excluding dev), and the target words-per-split.
#
# Arguments:
#   $1  target number of words per split (a positive integer)
#   $2  data directory containing *.counts files (dev.counts is ignored)
#   $3  weights file; each line must have exactly 3 fields, of which the first
#       two (data-source name, integer multiplicity) are used here
#
# Output:
#   stdout: a single integer and nothing else.  A positive value is the number
#           of splits; a negative value is minus the number of times the data
#           should be repeated (the number of splits is then 1).
#   stderr: all diagnostics.
#   Exit status is non-zero on any error.

# Prints the arguments to stderr, prefixed with the script name, and exits 1.
die() {
  echo "$0: $*" 1>&2
  exit 1
}

# Succeeds iff $1 is an integer as understood by 'test'.  The operands are
# quoted on purpose: unquoted, an empty value collapses to '[ -eq ]', which
# is true.
is_integer() {
  [ "$1" -eq "$1" ] 2>/dev/null
}

if [ $# != 3 ]; then
  (
    echo "Usage: rnnlm/get_num_splits.sh <target-words-per-split> <data-dir> <weights-file>"
    echo "e.g.: rnnlm/get_num_splits.sh 200000 data/text exp/rnnlm/data_weights.txt"
    echo "This works out how many pieces to split a data directory into, and"
    echo "(if just one piece) how many times that piece should be repeated to"
    echo "get the target words-per-split. A number is printed to the standard"
    echo "output. If no repeats are necessary it will be the number of splits,"
    echo "a positive number. If repeats are necessary, then a negative number,"
    echo "interpretable as the negative of the number of times we should repeat"
    echo "the data, is echoed, and the number of splits should be taken to be 1."
    echo "To compute the number of words of training data"
    echo "this script uses <data-dir>/*.counts; they are scaled by the data-multiplicities"
    echo "given as the second field of <weights-file> for each data source."
  ) 1>&2
  exit 1
fi

words_per_split=$1
text=$2
weights_file=$3

is_integer "$words_per_split" || die "first arg must be an integer"
# A zero or negative target would otherwise surface later as a division by
# zero or a confusing "problem getting the number of splits" error.
[ "$words_per_split" -gt 0 ] || \
  die "first arg must be a positive integer, got $words_per_split"

[ -d "$text" ] || die "no such directory $text"

[ -f "$weights_file" ] || die "expected weights file in $weights_file"

# NOTE(review): this runs before 'set -e', so its exit status is not checked
# here; presumably a failure shows up as missing counts below -- confirm.
rnnlm/ensure_counts_present.sh "$text" 1>&2

set -e -o pipefail -u

export LC_ALL=C

# Temporary map from data-source name to multiplicity (fields 1 and 2 of the
# weights file); removed on any exit path.
multiplicities=$(mktemp tmp.XXXX)
trap 'rm -f -- "$multiplicities"' EXIT

if ! awk '{if(NF!=3){ exit(1); } print $1, $2; } END{if(NR==0) exit(1);}' \
     <"$weights_file" >"$multiplicities"; then
  # This must go to stderr (stdout carries only the numeric result) and must
  # be fatal, since the map written above cannot be trusted.
  die "weights file $weights_file has the wrong format."
fi

tot_orig=0
tot_with_multiplicities=0

for f in "$text"/*.counts; do
  # If the glob matched nothing, $f is the literal pattern; skip it so that
  # the "no counts present?" check below reports the problem.
  [ -e "$f" ] || continue
  # The dev set does not count towards the amount of training data.
  [ "$f" != "$text/dev.counts" ] || continue

  this_tot=$(awk '{tot += $2} END{printf("%d", tot)}' <"$f")
  if ! [ "$this_tot" -gt 0 ]; then
    die "there were no counts in counts file $f"
  fi

  # weight by the data multiplicity which is the second field of the weights file.
  source_name=${f##*/}
  source_name=${source_name%.counts}
  if ! multiplicity=$(printf '%s\n' "$source_name" | utils/apply_map.pl "$multiplicities") \
     || ! is_integer "$multiplicity"; then
    die "error getting multiplicity for data-source $f, check weights file $weights_file"
  fi

  tot_orig=$((tot_orig + this_tot))
  tot_with_multiplicities=$((tot_with_multiplicities + this_tot * multiplicity))
done

if ! [ "$tot_orig" -gt 0 ]; then
  die "there was a problem getting counts from directory $text (no counts present?)"
fi

if ! [ "$tot_with_multiplicities" -gt 0 ]; then
  die "there was a problem getting counts from directory $text (check data-weights file $weights_file)"
fi

# adding words_per_split-1 below causes us to round up the number of splits.
num_splits=$(( (tot_with_multiplicities + words_per_split - 1) / words_per_split ))

# Validate num_splits before it is used as a divisor.
if ! [ "$num_splits" -gt 0 ]; then
  die "there was a problem getting the number of splits"
fi

actual_words_per_split=$((tot_with_multiplicities / num_splits))

num_repeats=$((words_per_split / actual_words_per_split))
if ! [ "$num_repeats" -ge 1 ]; then
  die "error computing the number of repeats, got $num_repeats."
fi

if [ "$num_repeats" -gt 1 ] && [ "$num_splits" -gt 1 ]; then
  die "script error: both num-repeats and num-splits are over 1."
fi

printf '%s' "get_num_splits.sh: based on tot-words=$tot_orig (with multiplicities: $tot_with_multiplicities)" 1>&2
echo " and target-words-per-split=$words_per_split, got $num_splits splits, actual words-per-split is $actual_words_per_split" 1>&2
if [ "$num_repeats" -gt 1 ]; then
  echo " ... and num-repeats is $num_repeats" 1>&2
fi

# The only thing written to stdout: the number of splits, or minus the
# number of repeats.
if [ "$num_repeats" -eq 1 ]; then
  echo "$num_splits"
else
  echo "-$num_repeats"
fi