Blame view

egs/wsj/s5/utils/parallel/limit_num_gpus.sh 1.91 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
  #!/bin/bash
  
  # This script functions as a wrapper of a bash command that uses GPUs.
  #
  # It sets the CUDA_VISIBLE_DEVICES variable to limit the number of GPUs that
  # the wrapped program can use. This is necessary when running a job on the
  # grid if the job would otherwise automatically grab all the resources
  # available on the system, e.g. a TensorFlow program.
  
  # Number of GPUs the wrapped command is allowed to run on (default: 1). The
  # limit is enforced further down by setting CUDA_VISIBLE_DEVICES.
  num_gpus=1
  set -e

  # An optional leading "--num-gpus <n>" overrides the default; everything that
  # follows it is the command to run.
  if [ "$1" == "--num-gpus" ]; then
    # ${2-} is empty when the value is missing; the validation below rejects it.
    num_gpus=${2-}
    # Drop only the arguments that actually exist: a second plain 'shift' with
    # nothing left to shift fails and, under 'set -e', would terminate the
    # script without printing any message.
    if [ $# -ge 2 ]; then
      shift 2
    else
      shift
    fi
  fi

  # Accept plain decimal digits only (an optional leading '+' is tolerated).
  # Empty, negative, hexadecimal or multi-word values are rejected here so the
  # numeric comparisons later in the script never see something they cannot
  # parse.
  if ! [[ "$num_gpus" =~ ^[+]?[0-9]+$ ]]; then
    echo "$0: Must pass a positive integer or 0 after --num-gpus" >&2
    echo "e.g. $0 --num-gpus 2 local/tfrnnlm/run_lstm.sh" >&2
    exit 1
  fi
  # Normalise to canonical decimal ("+2" -> 2, "08" -> 8); the 10# prefix stops
  # a leading zero from being read as octal.
  num_gpus=$((10#${num_gpus#+}))

  # At least the command itself must remain after option parsing.
  if [ $# -eq 0 ]; then
    echo "Usage:  $0 [--num-gpus <num-gpus>] <command> [<arg1>...]" >&2
    echo "Runs <command> with args after setting CUDA_VISIBLE_DEVICES to " >&2
    echo "make sure exactly <num-gpus> GPUs are visible (default: 1)." >&2
    exit 1
  fi
  
  # Choose which GPUs the command may see and export the choice through
  # CUDA_VISIBLE_DEVICES, then run the command given on the command line.
  if [ "$num_gpus" -eq 0 ]; then
    # CPU-only run: an empty CUDA_VISIBLE_DEVICES hides every GPU. nvidia-smi is
    # deliberately not called here, so this branch also works on hosts that do
    # not have it installed.
    echo "$0: Running the job on CPU. Disabling submitting to gpu"
    export CUDA_VISIBLE_DEVICES=""
  else
    if ! command -v nvidia-smi >/dev/null 2>&1; then
      echo "$0: nvidia-smi not found, cannot look for idle GPUs" >&2
      exit 1
    fi

    # 'nvidia-smi -L' prints one line per GPU, so the line count is the number
    # of GPUs on this machine.
    num_total_gpus=$(nvidia-smi -L | wc -l)
    num_gpus_assigned=0
    gpu_list=

    # Go over all GPUs in index order and collect the ones nvidia-smi reports
    # as having no running processes.
    for ((i = 0; i < num_total_gpus; i++)); do
      if nvidia-smi -i "$i" | grep -q "No running processes found"; then
        # Append with a comma separator only when the list is already non-empty.
        gpu_list=${gpu_list:+$gpu_list,}$i
        num_gpus_assigned=$((num_gpus_assigned + 1))
      fi
      # Once we have enough GPUs, stop looking.
      if [ "$num_gpus_assigned" -eq "$num_gpus" ]; then
        break
      fi
    done

    if [ "$num_gpus_assigned" -ne "$num_gpus" ]; then
      echo "Could not find enough idle GPUs" >&2
      exit 1
    fi

    export CUDA_VISIBLE_DEVICES=$gpu_list

    echo "$0: Running the job on GPU(s) $CUDA_VISIBLE_DEVICES"
  fi

  # Run the wrapped command with its arguments exactly as they were passed in.
  "$@"