retry.pl
3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env perl
use strict;
use warnings;
# Copyright 2018 Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.
use File::Basename;
use Cwd;
use Getopt::Long;
# retry.pl is a wrapper for queue.pl. It can be used to retry jobs that failed,
# e.g. if your command line was "queue.pl [args]", you can replace that
# with "retry.pl queue.pl [args]" and it will retry jobs that failed.
my $num_tries = 2;
sub print_usage() {
print STDERR
"Usage: retry.pl <some-other-wrapper-script> <rest-of-command>\n" .
" e.g.: retry.pl [options] queue.pl foo.log do_something\n" .
"This will retry jobs that failed (only once)\n" .
"Options:\n" .
" --num-tries <n> # default: 2\n";
exit 1;
}
if ($ARGV[0] eq "--num-tries") {
shift;
$num_tries = $ARGV[0] + 0;
if ($num_tries < 1) {
die "$0: invalid option --num-tries $ARGV[0]";
}
shift;
}
if (@ARGV < 3) {
print_usage();
}
sub get_log_file {
my $n;
# First just look for the first command-line arg that ends in ".log". If that
# exists, it's almost certainly the log file.
for ($n = 1; $n < @ARGV; $n++) {
if ($ARGV[$n] =~ m/\.log$/) {
return $ARGV[$n];
}
}
for ($n = 1; $n < @ARGV; $n++) {
# If this arg isn't of the form "-some-option', and isn't of the form
# "JOB=1:10", and the previous arg wasn't of the form "-some-option", and this
# isn't just a number (note: the 'not-a-number' things is mostly to exclude
# things like the 5 in "-pe smp 5" which is an older but still-supported
# option to queue.pl)... then assume it's a log file.
if ($ARGV[$n] !~ m/^-=/ && $ARGV[$n] !~ m/=/ && $ARGV[$n] !~ m/^\d+$/ &&
$ARGV[$n-1] !~ m/^-/) {
return $ARGV[$n];
}
}
print STDERR "$0: failed to parse log-file name from args:" . join(" ", @ARGV);
exit(1);
}
my $log_file = get_log_file();
my $return_status;
for (my $n = 1; $n <= $num_tries; $n++) {
system(@ARGV);
$return_status = $?;
if ($return_status == 0) {
exit(0); # The command succeeded. We return success.
} elsif ($return_status != 256) {
# The command did not "die normally". When queue.pl and similar scripts
# detect a normal error, they exit(1), which becomes a status of 256
# in perl's $? variable.
# See http://perldoc.perl.org/perlvar.html#%24CHILD_ERROR for more info.
# An example of an abnormal death that would cause us to want to exit
# immediately, is when the user does ctrl-c or KILLs the script,
# which gets caught by 'caught_signal' in queue.pl and causes that program
# to return with exit status 2.
exit(1);
}
if ($n < $num_tries) {
if (! -f $log_file) {
# $log_file doesn't exist as a file. Maybe it was an array job.
# This script doesn't yet support array jobs. We just give up.
# Later on we might want to figure out which array jobs failed
# and have to be rerun, but for now we just die.
print STDERR "$0: job failed and log file $log_file does not exist (array job?).\n";
} else {
rename($log_file, $log_file . ".bak");
print STDERR "$0: job failed; renaming log file to ${log_file}.bak and rerunning\n";
}
}
}
print STDERR "$0: job failed $num_tries times; log is in $log_file\n";
exit(1);