Yannick Estève / ONTRAC-Kaldi

Blame view

tools/cub-1.8.0/test/test_allocator.cu 16.2 KB
  /******************************************************************************
   * Copyright (c) 2011, Duane Merrill.  All rights reserved.
   * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions are met:
   *     * Redistributions of source code must retain the above copyright
   *       notice, this list of conditions and the following disclaimer.
   *     * Redistributions in binary form must reproduce the above copyright
   *       notice, this list of conditions and the following disclaimer in the
   *       documentation and/or other materials provided with the distribution.
   *     * Neither the name of the NVIDIA CORPORATION nor the
   *       names of its contributors may be used to endorse or promote products
   *       derived from this software without specific prior written permission.
   *
   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
   * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   *
   ******************************************************************************/
  
  /******************************************************************************
   * Test evaluation for caching allocator of device memory
   ******************************************************************************/
  
  // Ensure printing of CUDA runtime errors to console
  #define CUB_STDERR
  
  #include <stdio.h>
  
  #include <cub/util_allocator.cuh>
  #include "test_util.h"
  
  using namespace cub;
  
  
  //---------------------------------------------------------------------
  // Main
  //---------------------------------------------------------------------
  
  /**
   * Main
   */
  int main(int argc, char** argv)
  {
      // Initialize command line
      CommandLineArgs args(argc, argv);
  
      // Print usage
      if (args.CheckCmdLineFlag("help"))
      {
          printf("%s "
              "[--device=<device-id>]"
              "[--bytes=<timing bytes>]"
              "[--i=<timing iterations>]"
              "
  ", argv[0]);
          exit(0);
      }
  
  #if (CUB_PTX_ARCH == 0)
  
      // Initialize device
      CubDebugExit(args.DeviceInit());
  
      // Get number of GPUs and current GPU
      int num_gpus;
      int initial_gpu;
      int timing_iterations           = 10000;
      int timing_bytes                = 1024 * 1024;
  
      if (CubDebug(cudaGetDeviceCount(&num_gpus))) exit(1);
      if (CubDebug(cudaGetDevice(&initial_gpu))) exit(1);
      args.GetCmdLineArgument("i", timing_iterations);
      args.GetCmdLineArgument("bytes", timing_bytes);
  
      // Create default allocator (caches up to 6MB in device allocations per GPU)
      CachingDeviceAllocator allocator;
      allocator.debug = true;
  
      printf("Running single-gpu tests...
  "); fflush(stdout);
  
      //
      // Test0
      //
  
      // Create a new stream
      cudaStream_t other_stream;
      CubDebugExit(cudaStreamCreate(&other_stream));
  
      // Allocate 999 bytes on the current gpu in stream0
      char *d_999B_stream0_a;
      char *d_999B_stream0_b;
      CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
  
      // Run some big kernel in stream 0
      EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();
  
      // Free d_999B_stream0_a
      CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
  
      // Allocate another 999 bytes in stream 0
      CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
  
      // Check that that we have 1 live block on the initial GPU
      AssertEquals(allocator.live_blocks.size(), 1);
  
      // Check that that we have no cached block on the initial GPU
      AssertEquals(allocator.cached_blocks.size(), 0);
  
      // Run some big kernel in stream 0
      EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();
  
      // Free d_999B_stream0_b
      CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
  
      // Allocate 999 bytes on the current gpu in other_stream
      char *d_999B_stream_other_a;
      char *d_999B_stream_other_b;
      allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream);
  
      // Check that that we have 1 live blocks on the initial GPU (that we allocated a new one because d_999B_stream0_b is only available for stream 0 until it becomes idle)
      AssertEquals(allocator.live_blocks.size(), 1);
  
      // Check that that we have one cached block on the initial GPU
      AssertEquals(allocator.cached_blocks.size(), 1);
  
      // Run some big kernel in other_stream
      EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();
  
      // Free d_999B_stream_other
      CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
  
      // Check that we can now use both allocations in stream 0 after synchronizing the device
      CubDebugExit(cudaDeviceSynchronize());
      CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
      CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
  
      // Check that that we have 2 live blocks on the initial GPU
      AssertEquals(allocator.live_blocks.size(), 2);
  
      // Check that that we have no cached block on the initial GPU
      AssertEquals(allocator.cached_blocks.size(), 0);
  
      // Free d_999B_stream0_a and d_999B_stream0_b
      CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
      CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
  
      // Check that we can now use both allocations in other_stream
      CubDebugExit(cudaDeviceSynchronize());
      CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));
      CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_b, 999, other_stream));
  
      // Check that that we have 2 live blocks on the initial GPU
      AssertEquals(allocator.live_blocks.size(), 2);
  
      // Check that that we have no cached block on the initial GPU
      AssertEquals(allocator.cached_blocks.size(), 0);
  
      // Run some big kernel in other_stream
      EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();
  
      // Free d_999B_stream_other_a and d_999B_stream_other_b
      CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
      CubDebugExit(allocator.DeviceFree(d_999B_stream_other_b));
  
      // Check that we can now use both allocations in stream 0 after synchronizing the device and destroying the other stream
      CubDebugExit(cudaDeviceSynchronize());
      CubDebugExit(cudaStreamDestroy(other_stream));
      CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
      CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
  
      // Check that that we have 2 live blocks on the initial GPU
      AssertEquals(allocator.live_blocks.size(), 2);
  
      // Check that that we have no cached block on the initial GPU
      AssertEquals(allocator.cached_blocks.size(), 0);
  
      // Free d_999B_stream0_a and d_999B_stream0_b
      CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
      CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
  
      // Free all cached
      CubDebugExit(allocator.FreeAllCached());
  
      //
      // Test1
      //
  
      // Allocate 5 bytes on the current gpu
      char *d_5B;
      CubDebugExit(allocator.DeviceAllocate((void **) &d_5B, 5));
  
      // Check that that we have zero free bytes cached on the initial GPU
      AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);
  
      // Check that that we have 1 live block on the initial GPU
      AssertEquals(allocator.live_blocks.size(), 1);
  
      //
      // Test2
      //
  
      // Allocate 4096 bytes on the current gpu
      char *d_4096B;
      CubDebugExit(allocator.DeviceAllocate((void **) &d_4096B, 4096));
  
      // Check that that we have 2 live blocks on the initial GPU
      AssertEquals(allocator.live_blocks.size(), 2);
  
      //
      // Test3
      //
  
      // DeviceFree d_5B
      CubDebugExit(allocator.DeviceFree(d_5B));
  
      // Check that that we have min_bin_bytes free bytes cached on the initial gpu
      AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
  
      // Check that that we have 1 live block on the initial GPU
      AssertEquals(allocator.live_blocks.size(), 1);
  
      // Check that that we have 1 cached block on the initial GPU
      AssertEquals(allocator.cached_blocks.size(), 1);
  
      //
      // Test4
      //
  
      // DeviceFree d_4096B
      CubDebugExit(allocator.DeviceFree(d_4096B));
  
      // Check that that we have the 4096 + min_bin free bytes cached on the initial gpu
      AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes + 4096);
  
      // Check that that we have 0 live block on the initial GPU
      AssertEquals(allocator.live_blocks.size(), 0);
  
      // Check that that we have 2 cached block on the initial GPU
      AssertEquals(allocator.cached_blocks.size(), 2);
  
      //
      // Test5
      //
  
      // Allocate 768 bytes on the current gpu
      char *d_768B;
      CubDebugExit(allocator.DeviceAllocate((void **) &d_768B, 768));
  
      // Check that that we have the min_bin free bytes cached on the initial gpu (4096 was reused)
      AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
  
      // Check that that we have 1 live block on the initial GPU
      AssertEquals(allocator.live_blocks.size(), 1);
  
      // Check that that we have 1 cached block on the initial GPU
      AssertEquals(allocator.cached_blocks.size(), 1);
  
      //
      // Test6
      //
  
      // Allocate max_cached_bytes on the current gpu
      char *d_max_cached;
      CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached, allocator.max_cached_bytes));
  
      // DeviceFree d_max_cached
      CubDebugExit(allocator.DeviceFree(d_max_cached));
  
      // Check that that we have the min_bin free bytes cached on the initial gpu (max cached was not returned because we went over)
      AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
  
      // Check that that we have 1 live block on the initial GPU
      AssertEquals(allocator.live_blocks.size(), 1);
  
      // Check that that we still have 1 cached block on the initial GPU
      AssertEquals(allocator.cached_blocks.size(), 1);
  
      //
      // Test7
      //
  
      // Free all cached blocks on all GPUs
      CubDebugExit(allocator.FreeAllCached());
  
      // Check that that we have 0 bytes cached on the initial GPU
      AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);
  
      // Check that that we have 0 cached blocks across all GPUs
      AssertEquals(allocator.cached_blocks.size(), 0);
  
      // Check that that still we have 1 live block across all GPUs
      AssertEquals(allocator.live_blocks.size(), 1);
  
      //
      // Test8
      //
  
      // Allocate max cached bytes + 1 on the current gpu
      char *d_max_cached_plus;
      CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached_plus, allocator.max_cached_bytes + 1));
  
      // DeviceFree max cached bytes
      CubDebugExit(allocator.DeviceFree(d_max_cached_plus));
  
      // DeviceFree d_768B
      CubDebugExit(allocator.DeviceFree(d_768B));
  
      unsigned int power;
      size_t rounded_bytes;
      allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768);
  
      // Check that that we have 4096 free bytes cached on the initial gpu
      AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
  
      // Check that that we have 1 cached blocks across all GPUs
      AssertEquals(allocator.cached_blocks.size(), 1);
  
      // Check that that still we have 0 live block across all GPUs
      AssertEquals(allocator.live_blocks.size(), 0);
  
  #ifndef CUB_CDP
      // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not
  
      if (num_gpus > 1)
      {
          printf("
  Running multi-gpu tests...
  "); fflush(stdout);
  
          //
          // Test9
          //
  
          // Allocate 768 bytes on the next gpu
          int next_gpu = (initial_gpu + 1) % num_gpus;
          char *d_768B_2;
          CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
  
          // DeviceFree d_768B on the next gpu
          CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
  
          // Re-allocate 768 bytes on the next gpu
          CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
  
          // Re-free d_768B on the next gpu
          CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
  
          // Check that that we have 4096 free bytes cached on the initial gpu
          AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
  
          // Check that that we have 4096 free bytes cached on the second gpu
          AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes);
  
          // Check that that we have 2 cached blocks across all GPUs
          AssertEquals(allocator.cached_blocks.size(), 2);
  
          // Check that that still we have 0 live block across all GPUs
          AssertEquals(allocator.live_blocks.size(), 0);
      }
  #endif  // CUB_CDP
  
      //
      // Performance
      //
  
      printf("
  CPU Performance (%d timing iterations, %d bytes):
  ", timing_iterations, timing_bytes);
      fflush(stdout); fflush(stderr);
  
      // CPU performance comparisons vs cached.  Allocate and free a 1MB block 2000 times
      CpuTimer    cpu_timer;
      char        *d_1024MB                       = NULL;
      allocator.debug                             = false;
  
      // Prime the caching allocator and the kernel
      CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
      CubDebugExit(allocator.DeviceFree(d_1024MB));
      cub::EmptyKernel<void><<<1, 32>>>();
  
      // CUDA
      cpu_timer.Start();
      for (int i = 0; i < timing_iterations; ++i)
      {
          CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
          CubDebugExit(cudaFree(d_1024MB));
      }
      cpu_timer.Stop();
      float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();
  
      // CUB
      cpu_timer.Start();
      for (int i = 0; i < timing_iterations; ++i)
      {
          CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
          CubDebugExit(allocator.DeviceFree(d_1024MB));
      }
      cpu_timer.Stop();
      float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();
  
      printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)
  ",
          cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
          cuda_malloc_elapsed_millis / timing_iterations,
          cub_calloc_elapsed_millis / timing_iterations);
  
      // GPU performance comparisons.  Allocate and free a 1MB block 2000 times
      GpuTimer gpu_timer;
  
      printf("
  GPU Performance (%d timing iterations, %d bytes):
  ", timing_iterations, timing_bytes);
      fflush(stdout); fflush(stderr);
  
      // Kernel-only
      gpu_timer.Start();
      for (int i = 0; i < timing_iterations; ++i)
      {
          cub::EmptyKernel<void><<<1, 32>>>();
      }
      gpu_timer.Stop();
      float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();
  
      // CUDA
      gpu_timer.Start();
      for (int i = 0; i < timing_iterations; ++i)
      {
          CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
          cub::EmptyKernel<void><<<1, 32>>>();
          CubDebugExit(cudaFree(d_1024MB));
      }
      gpu_timer.Stop();
      cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
  
      // CUB
      gpu_timer.Start();
      for (int i = 0; i < timing_iterations; ++i)
      {
          CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
          cub::EmptyKernel<void><<<1, 32>>>();
          CubDebugExit(allocator.DeviceFree(d_1024MB));
      }
      gpu_timer.Stop();
      cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
  
      printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)
  ",
          cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
          cuda_malloc_elapsed_millis / timing_iterations,
          cub_calloc_elapsed_millis / timing_iterations);
  
  
  #endif
  
      printf("Success
  ");
  
      return 0;
  }