Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/utils/parallel/run.pl 9.96 KB
  #!/usr/bin/env perl
  use warnings; #sed replacement for -w perl parameter
  
  # In general, doing
  #  run.pl some.log a b c is like running the command a b c in
  # the bash shell, and putting the standard error and output into some.log.
  # To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
  #  run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
  # and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
  # If any of the jobs fails, this script will fail.
  
  # A typical example is:
  #  run.pl some.log my-prog "--opt=foo bar" foo \|  other-prog baz
  # and run.pl will run something like:
  # ( my-prog '--opt=foo bar' foo |  other-prog baz ) >& some.log
  #
  # Basically it takes the command-line arguments, quotes them
  # as necessary to preserve spaces, and evaluates them with bash.
  # In addition it puts the command line at the top of the log, and
  # the start and end times of the command at the beginning and end.
  # The reason why this is useful is so that we can create a different
  # version of this program that uses a queueing system instead.
  
  # use Data::Dumper;
  
  @ARGV < 2 && die "usage: run.pl log-file command-line arguments...";
  
  
  $max_jobs_run = -1;
  $jobstart = 1;
  $jobend = 1;
  $ignored_opts = ""; # These will be ignored.
  
  # First parse an option like JOB=1:4, and any
  # options that would normally be given to
  # queue.pl, which we will just discard.
  
  for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
    # allow the JOB=1:n option to be interleaved with the
    # options to qsub.
    while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
      # parse any options that would normally go to qsub, but which will be ignored here.
      my $switch = shift @ARGV;
      if ($switch eq "-V") {
        $ignored_opts .= "-V ";
      } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
        # we do support the option --max-jobs-run n, and its GridEngine form -tc n.
        $max_jobs_run = shift @ARGV;
        if (! ($max_jobs_run > 0)) {
          die "run.pl: invalid option --max-jobs-run $max_jobs_run";
        }
      } else {
        my $argument = shift @ARGV;
        if ($argument =~ m/^--/) {
          print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'
  ";
        }
        if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
          $ignored_opts .= "-sync "; # Note: in the
          # corresponding code in queue.pl it says instead, just "$sync = 1;".
        } elsif ($switch eq "-pe") { # e.g. -pe smp 5
          my $argument2 = shift @ARGV;
          $ignored_opts .= "$switch $argument $argument2 ";
        } elsif ($switch eq "--gpu") {
          $using_gpu = $argument;
        } else {
          # Ignore option.
          $ignored_opts .= "$switch $argument ";
        }
      }
    }
    if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
      $jobname = $1;
      $jobstart = $2;
      $jobend = $3;
      if ($jobstart > $jobend) {
        die "run.pl: invalid job range $ARGV[0]";
      }
      if ($jobstart <= 0) {
        die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
      }
      shift;
    } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
      $jobname = $1;
      $jobstart = $2;
      $jobend = $2;
      shift;
    } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
      print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]
  ";
    }
  }
  
  # Users found this message confusing so we are removing it.
  # if ($ignored_opts ne "") {
  #   print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"
  ";
  # }
  
  if ($max_jobs_run == -1) { # If --max-jobs-run option not set,
                             # then work out the number of processors if possible,
                             # and set it based on that.
    $max_jobs_run = 0;
    if ($using_gpu) {
      if (open(P, "nvidia-smi -L |")) {
        $max_jobs_run++ while (<P>);
        close(P);
      }
      if ($max_jobs_run == 0) {
        $max_jobs_run = 1;
        print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}
  ";
      }
    } elsif (open(P, "</proc/cpuinfo")) {  # Linux
      while (<P>) { if (m/^processor/) { $max_jobs_run++; } }
      if ($max_jobs_run == 0) {
        print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo
  ";
        $max_jobs_run = 10;  # reasonable default.
      }
      close(P);
    } elsif (open(P, "sysctl -a |")) {  # BSD/Darwin
      while (<P>) {
        if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4
          $max_jobs_run = $1;
          last;
        }
      }
      close(P);
      if ($max_jobs_run == 0) {
        print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a
  ";
        $max_jobs_run = 10;  # reasonable default.
      }
    } else {
      # allow at most 32 jobs at once, on non-UNIX systems; change this code
      # if you need to change this default.
      $max_jobs_run = 32;
    }
    # The just-computed value of $max_jobs_run is just the number of processors
    # (or our best guess); and if it happens that the number of jobs we need to
    # run is just slightly above $max_jobs_run, it will make sense to increase
    # $max_jobs_run to equal the number of jobs, so we don't have a small number
    # of leftover jobs.
    $num_jobs = $jobend - $jobstart + 1;
    if (!$using_gpu &&
        $num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) {
      $max_jobs_run = $num_jobs;
    }
  }
  
  $logfile = shift @ARGV;
  
  if (defined $jobname && $logfile !~ m/$jobname/ &&
      $jobend > $jobstart) {
    print STDERR "run.pl: you are trying to run a parallel job but "
      . "you are putting the output into just one log file ($logfile)
  ";
    exit(1);
  }
  
  $cmd = "";
  
  foreach $x (@ARGV) {
      if ($x =~ m/^\S+$/) { $cmd .=  $x . " "; }
      elsif ($x =~ m:\":) { $cmd .= "'$x' "; }
      else { $cmd .= "\"$x\" "; }
  }
  
  #$Data::Dumper::Indent=0;
  $ret = 0;
  $numfail = 0;
  %active_pids=();
  
  use POSIX ":sys_wait_h";
  for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
    if (scalar(keys %active_pids) >= $max_jobs_run) {
  
      # Lets wait for a change in any child's status
      # Then we have to work out which child finished
      $r = waitpid(-1, 0);
      $code = $?;
      if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
      if ( defined $active_pids{$r} ) {
          $jid=$active_pids{$r};
          $fail[$jid]=$code;
          if ($code !=0) { $numfail++;}
          delete $active_pids{$r};
          # print STDERR "Finished: $r/$jid " .  Dumper(\%active_pids) . "
  ";
      } else {
          die "run.pl: Cannot find the PID of the child process that just finished.";
      }
  
      # In theory we could do a non-blocking waitpid over all jobs running just
      # to find out if only one or more jobs finished during the previous waitpid()
      # However, we just omit this and will reap the next one in the next pass
      # through the for(;;) cycle
    }
    $childpid = fork();
    if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
    if ($childpid == 0) { # We're in the child... this branch
      # executes the job and returns (possibly with an error status).
      if (defined $jobname) {
        $cmd =~ s/$jobname/$jobid/g;
        $logfile =~ s/$jobname/$jobid/g;
      }
      system("mkdir -p `dirname $logfile` 2>/dev/null");
      open(F, ">$logfile") || die "run.pl: Error opening log file $logfile";
      print F "# " . $cmd . "
  ";
      print F "# Started at " . `date`;
      $starttime = `date +'%s'`;
      print F "#
  ";
      close(F);
  
      # Pipe into bash.. make sure we're not using any other shell.
      open(B, "|bash") || die "run.pl: Error opening shell command";
      print B "( " . $cmd . ") 2>>$logfile >> $logfile";
      close(B);                   # If there was an error, exit status is in $?
      $ret = $?;
  
      $lowbits = $ret & 127;
      $highbits = $ret >> 8;
      if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
      else { $return_str = "code $highbits"; }
  
      $endtime = `date +'%s'`;
      open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)";
      $enddate = `date`;
      chop $enddate;
      print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1
  ";
      print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds
  ";
      close(F);
      exit($ret == 0 ? 0 : 1);
    } else {
      $pid[$jobid] = $childpid;
      $active_pids{$childpid} = $jobid;
      # print STDERR "Queued: " .  Dumper(\%active_pids) . "
  ";
    }
  }
  
  # Now we have submitted all the jobs, lets wait until all the jobs finish
  foreach $child (keys %active_pids) {
      $jobid=$active_pids{$child};
      $r = waitpid($pid[$jobid], 0);
      $code = $?;
      if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
      if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # Completed successfully
  }
  
  # Some sanity checks:
  # The $fail array should not contain undefined codes
  # The number of non-zeros in that array  should be equal to $numfail
  # We cannot do foreach() here, as the JOB ids do not start at zero
  $failed_jids=0;
  for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
    $job_return = $fail[$jobid];
    if (not defined $job_return ) {
      # print Dumper(\@fail);
  
      die "run.pl: Sanity check failed: we have indication that some jobs are running " .
        "even after we waited for all jobs to finish" ;
    }
    if ($job_return != 0 ){ $failed_jids++;}
  }
  if ($failed_jids != $numfail) {
    die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
  }
  if ($numfail > 0) { $ret = 1; }
  
  if ($ret != 0) {
    $njobs = $jobend - $jobstart + 1;
    if ($njobs == 1) {
      if (defined $jobname) {
        $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
                                           # that job.
      }
      print STDERR "run.pl: job failed, log is in $logfile
  ";
      if ($logfile =~ m/JOB/) {
        print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.";
      }
    }
    else {
      $logfile =~ s/$jobname/*/g;
      print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile
  ";
    }
  }
  
  
  exit ($ret);