Blame view

egs/wsj/s5/utils/parallel/retry.pl 3.21 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
  #!/usr/bin/env perl
  use strict;
  use warnings;
  
  # Copyright 2018  Johns Hopkins University (Author: Daniel Povey).
  # Apache 2.0.
  
  use File::Basename;
  use Cwd;
  use Getopt::Long;
  
  
  # retry.pl is a wrapper for queue.pl.  It can be used to retry jobs that failed,
  # e.g. if your command line was "queue.pl [args]", you can replace that
  # with "retry.pl queue.pl [args]" and it will retry jobs that failed.
  
  
  my $num_tries = 2;
  
  # Prints the usage message to STDERR and exits with status 1.
  # Takes no arguments and never returns.
  sub print_usage {
    print STDERR
      "Usage: retry.pl [options] <some-other-wrapper-script> <rest-of-command>\n" .
      "  e.g.:  retry.pl [options] queue.pl foo.log do_something\n" .
      # The number of attempts is configurable, so do not claim "only once".
      "This will retry jobs that failed (the command is run at most\n" .
      "--num-tries times in total)\n" .
      "Options:\n" .
      "      --num-tries <n>        # default: 2\n";
    exit 1;
  }
  
  # Parse the optional leading "--num-tries <n>" option.  Only this one option
  # is recognised, and only in first position; everything after it is the
  # wrapped command line and is left untouched in @ARGV.
  if (@ARGV > 0 && $ARGV[0] eq "--num-tries") {
    shift @ARGV;
    my $value = shift @ARGV;
    # Require a positive integer.  Checking 'defined' and the digit pattern
    # first avoids "uninitialized"/"isn't numeric" warnings on bad input and
    # rejects values such as "abc" or "2.5" instead of silently coercing them.
    if (!defined $value || $value !~ m/^\d+$/ || $value < 1) {
      die "$0: invalid option --num-tries " .
          (defined $value ? $value : "(missing value)");
    }
    $num_tries = $value + 0;
  }

  # We need at least: the wrapper script, a log file, and a command to run.
  if (@ARGV < 3) {
    print_usage();
  }
  
  
  # Guesses which command-line argument names the job's log file.
  #
  # Reads the global @ARGV, where $ARGV[0] is the wrapper script (e.g. queue.pl)
  # and the remaining elements are that script's arguments.  Returns the chosen
  # argument as a string.  If no candidate is found, prints an error to STDERR
  # and exits with status 1.
  sub get_log_file {
    # First just look for the first command-line arg that ends in ".log".  If that
    # exists, it's almost certainly the log file.
    for my $n (1 .. $#ARGV) {
      return $ARGV[$n] if $ARGV[$n] =~ m/\.log$/;
    }
    for my $n (1 .. $#ARGV) {
      # If this arg isn't of the form "-some-option", and isn't of the form
      # "JOB=1:10", and the previous arg wasn't of the form "-some-option", and this
      # isn't just a number (note: the 'not-a-number' thing is mostly to exclude
      # things like the 5 in "-pe smp 5" which is an older but still-supported
      # option to queue.pl)... then assume it's a log file.
      # Note: the option test must be m/^-/; testing for a leading "-=" would
      # let ordinary options such as "--mem" be picked as the log file.
      if ($ARGV[$n] !~ m/^-/ && $ARGV[$n] !~ m/=/ && $ARGV[$n] !~ m/^\d+$/ &&
          $ARGV[$n-1] !~ m/^-/) {
        return $ARGV[$n];
      }
    }
    print STDERR "$0: failed to parse log-file name from args: " .
      join(" ", @ARGV) . "\n";
    exit(1);
  }
  
  
  # Main driver: run the wrapped command (everything left in @ARGV) up to
  # $num_tries times.  Exits 0 as soon as one attempt succeeds and exits 1
  # otherwise.
  my $log_file = get_log_file();
  my $return_status;

  for my $try (1 .. $num_tries) {
    system(@ARGV);
    $return_status = $?;
    if ($return_status == 0) {
      exit(0);  # The command succeeded.  We return success.
    } elsif ($return_status != 256) {
      # The command did not "die normally".  When queue.pl and similar scripts
      # detect a normal error, they exit(1), which becomes a status of 256
      # in perl's $? variable.
      # See http://perldoc.perl.org/perlvar.html#%24CHILD_ERROR for more info.
      # An example of an abnormal death that would cause us to want to exit
      # immediately, is when the user does ctrl-c or KILLs the script,
      # which gets caught by 'caught_signal' in queue.pl and causes that program
      # to return with exit status 2.
      exit(1);
    }

    # From here on the attempt failed "normally" ($? == 256).
    if ($try < $num_tries) {
      if (! -f $log_file) {
        # $log_file doesn't exist as a file.  Maybe it was an array job.
        # This script doesn't yet support array jobs.  We just give up.
        # Later on we might want to figure out which array jobs failed
        # and have to be rerun, but for now we just die.
        print STDERR "$0: job failed and log file $log_file does not exist (array job?).\n";
        # Exit with 1 rather than $return_status: exit(256) would be truncated
        # to an exit code of 0 and look like success to the caller.
        exit(1);
      }
      # Keep the failed attempt's log so the rerun does not overwrite it.
      if (rename($log_file, $log_file . ".bak")) {
        print STDERR "$0: job failed; renaming log file to ${log_file}.bak and rerunning\n";
      } else {
        print STDERR "$0: job failed; could not rename log file $log_file to ${log_file}.bak ($!); rerunning anyway\n";
      }
    }
  }

  print STDERR "$0: job failed $num_tries times; log is in $log_file\n";
  exit(1);