Blame view
egs/wsj/s5/utils/parallel/retry.pl
3.21 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
#!/usr/bin/env perl
use strict;
use warnings;

# Copyright 2018  Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.

use File::Basename;
use Cwd;
use Getopt::Long;

# retry.pl is a wrapper for queue.pl.  It can be used to retry jobs that failed,
# e.g. if your command line was "queue.pl [args]", you can replace that
# with "retry.pl queue.pl [args]" and it will retry jobs that failed.

# Total number of times the wrapped command is attempted (not the number of
# *re*-tries); may be overridden with --num-tries.
my $num_tries = 2;

# Prints the usage message to STDERR and exits with status 1.
sub print_usage {
  print STDERR
    "Usage: retry.pl <some-other-wrapper-script> <rest-of-command>\n" .
    " e.g.: retry.pl [options] queue.pl foo.log do_something\n" .
    "This will retry jobs that failed (only once)\n" .
    "Options:\n" .
    " --num-tries <n> # default: 2\n";
  exit 1;
}

# Parse the single supported option.  It must come first on the command line;
# everything after it is the wrapped command.  We check that @ARGV is non-empty
# and that a value is actually present so that we never touch an undefined
# element, and we insist on a positive integer.
if (@ARGV > 0 && $ARGV[0] eq "--num-tries") {
  shift @ARGV;
  my $value = shift @ARGV;
  if (!defined $value || $value !~ m/^\d+$/ || $value < 1) {
    die "$0: invalid option --num-tries " .
      (defined $value ? $value : "(missing value)");
  }
  $num_tries = $value + 0;
}

# We need at least: the wrapper script, a log file and a command.
if (@ARGV < 3) {
  print_usage();
}

# Works out which element of @ARGV is the log file of the wrapped command and
# returns it.  $ARGV[0] is the wrapper script itself (e.g. queue.pl), so the
# search starts at index 1.  If nothing plausible is found, prints an error to
# STDERR and exits with status 1.
sub get_log_file {
  # First just look for the first command-line arg that ends in ".log".  If that
  # exists, it's almost certainly the log file.
  for my $n (1 .. $#ARGV) {
    if ($ARGV[$n] =~ m/\.log$/) {
      return $ARGV[$n];
    }
  }
  for my $n (1 .. $#ARGV) {
    # If this arg isn't of the form "-some-option", and isn't of the form
    # "JOB=1:10", and the previous arg wasn't of the form "-some-option", and this
    # isn't just a number (note: the 'not-a-number' thing is mostly to exclude
    # things like the 5 in "-pe smp 5" which is an older but still-supported
    # option to queue.pl)... then assume it's a log file.
    if ($ARGV[$n] !~ m/^-/ && $ARGV[$n] !~ m/=/ && $ARGV[$n] !~ m/^\d+$/ &&
        $ARGV[$n-1] !~ m/^-/) {
      return $ARGV[$n];
    }
  }
  print STDERR "$0: failed to parse log-file name from args: " .
    join(" ", @ARGV) . "\n";
  exit(1);
}

my $log_file = get_log_file();
my $return_status;

for (my $n = 1; $n <= $num_tries; $n++) {
  # List-form system(): the wrapped command is run with @ARGV as its argument
  # vector, exactly as it was given to us.
  system(@ARGV);
  $return_status = $?;
  if ($return_status == 0) {
    exit(0);  # The command succeeded.  We return success.
  } elsif ($return_status != 256) {
    # The command did not "die normally".  When queue.pl and similar scripts
    # detect a normal error, they exit(1), which becomes a status of 256
    # in perl's $? variable.
    # See http://perldoc.perl.org/perlvar.html#%24CHILD_ERROR for more info.
    # An example of an abnormal death that would cause us to want to exit
    # immediately, is when the user does ctrl-c or KILLs the script,
    # which gets caught by 'caught_signal' in queue.pl and causes that program
    # to return with exit status 2.
    exit(1);
  }

  # Only prepare for another attempt if there is one left.
  if ($n < $num_tries) {
    if (! -f $log_file) {
      # $log_file doesn't exist as a file.  Maybe it was an array job.
      # This script doesn't yet support array jobs.  We just give up.
      # Later on we might want to figure out which array jobs failed
      # and have to be rerun, but for now we just die.
      print STDERR "$0: job failed and log file $log_file does not exist (array job?).\n";
      exit(1);
    } elsif (rename($log_file, $log_file . ".bak")) {
      # Keep the log of the failed attempt so the rerun doesn't overwrite it.
      print STDERR "$0: job failed; renaming log file to ${log_file}.bak and rerunning\n";
    } else {
      # The rename failed; say so rather than claiming a backup exists.  We
      # still rerun, which will overwrite the old log.
      print STDERR "$0: job failed; could not rename log file $log_file to ${log_file}.bak ($!); rerunning\n";
    }
  }
}

print STDERR "$0: job failed $num_tries times; log is in $log_file\n";
exit(1);