Yannick Estève / ONTRAC-Kaldi

Blame view

tools/cub-1.8.0/examples/block/example_block_scan.cu 11.1 KB
  /******************************************************************************
   * Copyright (c) 2011, Duane Merrill.  All rights reserved.
   * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions are met:
   *     * Redistributions of source code must retain the above copyright
   *       notice, this list of conditions and the following disclaimer.
   *     * Redistributions in binary form must reproduce the above copyright
   *       notice, this list of conditions and the following disclaimer in the
   *       documentation and/or other materials provided with the distribution.
   *     * Neither the name of the NVIDIA CORPORATION nor the
   *       names of its contributors may be used to endorse or promote products
   *       derived from this software without specific prior written permission.
   *
   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
   * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   *
   ******************************************************************************/
  
  /******************************************************************************
   * Simple demonstration of cub::BlockScan
   *
   * To compile using the command line:
   *   nvcc -arch=sm_XX example_block_scan.cu -I../.. -lcudart -O3
   *
   ******************************************************************************/
  
  // Ensure printing of CUDA runtime errors to console (define before including cub.h)
  #define CUB_STDERR
  
  #include <stdio.h>
  #include <iostream>
  
  #include <cub/block/block_load.cuh>
  #include <cub/block/block_store.cuh>
  #include <cub/block/block_scan.cuh>
  
  #include "../../test/test_util.h"
  
  using namespace cub;
  
  //---------------------------------------------------------------------
  // Globals, constants and typedefs
  //---------------------------------------------------------------------
  
  /// Verbose output (enabled by the "v" command-line flag parsed in main)
  bool g_verbose = false;
  
  /// Timing iterations (overridable with the "i" command-line argument)
  int g_timing_iterations = 100;
  
  /// Default grid size, i.e. the number of thread blocks per kernel launch
  /// (overridable with the "grid-size" command-line argument)
  int g_grid_size = 1;
  
  
  
  //---------------------------------------------------------------------
  // Kernels
  //---------------------------------------------------------------------
  
  /**
   * Block-wide exclusive prefix sum over one tile of integers.
   *
   * Each thread block loads BLOCK_THREADS * ITEMS_PER_THREAD items from d_in,
   * scans them with cub::BlockScan using the requested ALGORITHM, and writes the
   * scanned items to d_out.  Thread 0 additionally writes the cycle count spent
   * inside the scan to *d_elapsed and the block-wide aggregate to
   * d_out[BLOCK_THREADS * ITEMS_PER_THREAD], so d_out must hold one element more
   * than the tile.
   *
   * Must be launched with BLOCK_THREADS threads per block.
   */
  template <
      int                     BLOCK_THREADS,
      int                     ITEMS_PER_THREAD,
      BlockScanAlgorithm      ALGORITHM>
  __global__ void BlockPrefixSumKernel(
      int         *d_in,          // Tile of input
      int         *d_out,         // Tile of output (plus one trailing slot for the aggregate)
      clock_t     *d_elapsed)     // Elapsed cycle count of block scan
  {
      // Collective for reading the tile (warp-transpose variant of BlockLoad)
      typedef BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE> TileLoadT;
  
      // Collective for writing the tile (warp-transpose variant of BlockStore)
      typedef BlockStore<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_WARP_TRANSPOSE> TileStoreT;
  
      // Collective for the prefix sum itself
      typedef BlockScan<int, BLOCK_THREADS, ALGORITHM> TileScanT;
  
      // The three collectives run one after another, so their scratch space
      // can overlap in a single shared-memory union
      __shared__ union SharedScratch
      {
          typename TileLoadT::TempStorage     load;
          typename TileStoreT::TempStorage    store;
          typename TileScanT::TempStorage     scan;
      } scratch;
  
      // Items owned by this thread
      int thread_items[ITEMS_PER_THREAD];
  
      // Read the tile into a blocked arrangement
      TileLoadT(scratch.load).Load(d_in, thread_items);
  
      // The scan is about to reuse the scratch space written by the load
      __syncthreads();
  
      // Cycle counter just before the scan
      clock_t scan_begin = clock();
  
      // In-place exclusive prefix sum; also yields the block-wide total
      int block_aggregate;
      TileScanT(scratch.scan).ExclusiveSum(thread_items, thread_items, block_aggregate);
  
      // Cycle counter just after the scan
      clock_t scan_end = clock();
  
      // The store is about to reuse the scratch space written by the scan
      __syncthreads();
  
      // Write the scanned tile from its blocked arrangement
      TileStoreT(scratch.store).Store(d_out, thread_items);
  
      // Only the first thread publishes the elapsed clocks and the aggregate
      if (threadIdx.x == 0)
      {
          // Absolute difference of the two samples, whichever is larger
          clock_t scan_cycles;
          if (scan_begin > scan_end)
          {
              scan_cycles = scan_begin - scan_end;
          }
          else
          {
              scan_cycles = scan_end - scan_begin;
          }
  
          *d_elapsed = scan_cycles;
          d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = block_aggregate;
      }
  }
  
  
  
  //---------------------------------------------------------------------
  // Host utilities
  //---------------------------------------------------------------------
  
  /**
   * Fill the exclusive prefix sum problem and its host-side solution.
   *
   * Writes the repeating pattern 0, 1, ..., 16 into h_in and, for every
   * position, the sum of all preceding inputs into h_reference.
   *
   * Returns the aggregate, i.e. the sum of all num_items inputs
   * (0 when num_items is not positive).
   */
  int Initialize(
      int *h_in,
      int *h_reference,
      int num_items)
  {
      int running_total = 0;
      int idx = 0;
  
      while (idx < num_items)
      {
          // Input value cycles through 0..16
          h_in[idx] = idx % 17;
  
          // Reference holds the total of everything before this item
          h_reference[idx] = running_total;
          running_total += h_in[idx];
  
          ++idx;
      }
  
      return running_total;
  }
  
  
  /**
   * Test thread block scan
   *
   * Builds a tile of BLOCK_THREADS * ITEMS_PER_THREAD integers on the host,
   * launches BlockPrefixSumKernel once and compares the scanned items and the
   * aggregate against the host reference, then relaunches the kernel
   * g_timing_iterations times and prints the average scan clocks and kernel
   * time.
   *
   * Every CUDA runtime call and the verification launch are passed through
   * CubDebugExit, the same error-check macro this function already uses for
   * the occupancy query, instead of having their status silently discarded.
   */
  template <
      int                 BLOCK_THREADS,
      int                 ITEMS_PER_THREAD,
      BlockScanAlgorithm  ALGORITHM>
  void Test()
  {
      const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
  
      // Allocate host arrays
      int *h_in           = new int[TILE_SIZE];
      int *h_reference    = new int[TILE_SIZE];
  
      // Initialize problem and reference output on host
      int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);
  
      // Initialize device arrays (d_out has one extra slot for the aggregate)
      int *d_in           = NULL;
      int *d_out          = NULL;
      clock_t *d_elapsed  = NULL;
      CubDebugExit(cudaMalloc((void**)&d_in,          sizeof(int) * TILE_SIZE));
      CubDebugExit(cudaMalloc((void**)&d_out,         sizeof(int) * (TILE_SIZE + 1)));
      CubDebugExit(cudaMalloc((void**)&d_elapsed,     sizeof(clock_t)));
  
      // Display input problem data
      if (g_verbose)
      {
          printf("Input data: ");
          for (int i = 0; i < TILE_SIZE; i++)
              printf("%d, ", h_in[i]);
          printf("\n\n");
      }
  
      // Kernel props
      int max_sm_occupancy;
      CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));
  
      // Copy problem to device
      CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice));
  
      printf("BlockScan algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
          (ALGORITHM == BLOCK_SCAN_RAKING) ? "BLOCK_SCAN_RAKING" : (ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE) ? "BLOCK_SCAN_RAKING_MEMOIZE" : "BLOCK_SCAN_WARP_SCANS",
          TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
  
      // Run aggregate/prefix kernel
      BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
          d_in,
          d_out,
          d_elapsed);
  
      // A bad launch configuration is only reported through the last-error
      // state, so check it before trusting the comparison below
      CubDebugExit(cudaPeekAtLastError());
  
      // Check results
      printf("\tOutput items: ");
      int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
      printf("%s\n", compare ? "FAIL" : "PASS");
      AssertEquals(0, compare);
  
      // Check total aggregate
      printf("\tAggregate: ");
      compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
      printf("%s\n", compare ? "FAIL" : "PASS");
      AssertEquals(0, compare);
  
      // Run this several times and average the performance results
      GpuTimer    timer;
      float       elapsed_millis          = 0.0f;
      clock_t     elapsed_clocks          = 0;
  
      for (int i = 0; i < g_timing_iterations; ++i)
      {
          // Copy problem to device (outside the timed region)
          CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice));
  
          timer.Start();
  
          // Run aggregate/prefix kernel
          BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
              d_in,
              d_out,
              d_elapsed);
  
          timer.Stop();
          elapsed_millis += timer.ElapsedMillis();
  
          // Copy clocks from device
          clock_t clocks;
          CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
          elapsed_clocks += clocks;
      }
  
      // Check for kernel errors and STDIO from the kernel, if any
      CubDebugExit(cudaPeekAtLastError());
      CubDebugExit(cudaDeviceSynchronize());
  
      // Display timing results
      float avg_millis            = elapsed_millis / g_timing_iterations;
      float avg_items_per_sec     = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
      float avg_clocks            = float(elapsed_clocks) / g_timing_iterations;
      float avg_clocks_per_item   = avg_clocks / TILE_SIZE;
  
      printf("\tAverage BlockScan::Sum clocks: %.3f\n", avg_clocks);
      printf("\tAverage BlockScan::Sum clocks per item: %.3f\n", avg_clocks_per_item);
      printf("\tAverage kernel millis: %.4f\n", avg_millis);
      printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
  
      // Cleanup (delete[] accepts null, so the host arrays need no guard)
      delete[] h_in;
      delete[] h_reference;
      if (d_in) CubDebugExit(cudaFree(d_in));
      if (d_out) CubDebugExit(cudaFree(d_out));
      if (d_elapsed) CubDebugExit(cudaFree(d_elapsed));
  }
  
  
  /**
   * Runs Test with the given scan algorithm for each tile shape used by this
   * example: 1024 items per tile, from 1024 threads x 1 item per thread down
   * to 32 threads x 32 items per thread.
   */
  template <BlockScanAlgorithm ALGORITHM>
  static void TestTileShapes()
  {
      Test<1024, 1, ALGORITHM>();
      Test<512, 2, ALGORITHM>();
      Test<256, 4, ALGORITHM>();
      Test<128, 8, ALGORITHM>();
      Test<64, 16, ALGORITHM>();
      Test<32, 32, ALGORITHM>();
  }
  
  
  /**
   * Main
   *
   * Parses the "v", "i" and "grid-size" command-line options, prints usage and
   * exits when "help" is given, initializes the device, then runs every tile
   * shape once per BlockScan algorithm.
   *
   * Returns 0 on completion; exits with status 1 when the requested grid size
   * is smaller than one block.
   */
  int main(int argc, char** argv)
  {
      // Initialize command line
      CommandLineArgs args(argc, argv);
      g_verbose = args.CheckCmdLineFlag("v");
      args.GetCmdLineArgument("i", g_timing_iterations);
      args.GetCmdLineArgument("grid-size", g_grid_size);
  
      // Print usage
      if (args.CheckCmdLineFlag("help"))
      {
          printf("%s "
              "[--device=<device-id>] "
              "[--i=<timing iterations (default:%d)>] "
              "[--grid-size=<grid size (default:%d)>] "
              "[--v] "
              "\n", argv[0], g_timing_iterations, g_grid_size);
          exit(0);
      }
  
      // A kernel launch needs at least one block; reject the value here rather
      // than letting every test fail later with an invalid launch configuration
      if (g_grid_size < 1)
      {
          fprintf(stderr, "Invalid --grid-size=%d (must be at least 1)\n", g_grid_size);
          exit(1);
      }
  
      // Initialize device
      CubDebugExit(args.DeviceInit());
  
      // Run tests
      TestTileShapes<BLOCK_SCAN_RAKING>();
  
      printf("-------------\n");
  
      TestTileShapes<BLOCK_SCAN_RAKING_MEMOIZE>();
  
      printf("-------------\n");
  
      TestTileShapes<BLOCK_SCAN_WARP_SCANS>();
  
  
      return 0;
  }