Blame view
tools/cub-1.8.0/test/test_block_load_store.cu
19 KB
8dcb6dfcb first commit
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Test of BlockLoad and BlockStore utilities
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <iterator>
#include <stdio.h>

#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/cache_modified_output_iterator.cuh>
#include <cub/iterator/discard_output_iterator.cuh>
#include <cub/util_allocator.cuh>

#include "test_util.h"

using namespace cub;

//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose = false;
CachingDeviceAllocator  g_allocator(true);


//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------

/**
 * Test load/store kernel.
 */
template <
    int                 BLOCK_THREADS,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  LOAD_ALGORITHM,
    BlockStoreAlgorithm STORE_ALGORITHM,
    typename            InputIteratorT,
    typename            OutputIteratorT>
__launch_bounds__ (BLOCK_THREADS, 1)
__global__ void Kernel(
    InputIteratorT      d_in,
    OutputIteratorT     d_out_unguarded,
    OutputIteratorT     d_out_guarded,
    int                 num_items)
{
    enum
    {
        TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD
    };

    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                           // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                           // ... else the output iterator's value type
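    // (Editor's note: the void case arises when the output is a
    // DiscardOutputIterator, whose value_type is void; OutputT then falls
    // back to the input iterator's value type.)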
    // Threadblock load/store abstraction types
    typedef BlockLoad<InputT, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM>      BlockLoad;
    typedef BlockStore<OutputT, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM>   BlockStore;

    // Shared memory type for this thread block (load and store storage may alias)
    union TempStorage
    {
        typename BlockLoad::TempStorage     load;
        typename BlockStore::TempStorage    store;
    };

    // Allocate temp storage in shared memory
    __shared__ TempStorage temp_storage;

    // Threadblock work bounds
    int block_offset = blockIdx.x * TILE_SIZE;
    int guarded_elements = num_items - block_offset;

    // Tile of items
    OutputT data[ITEMS_PER_THREAD];

    // Load data (unguarded, full tile)
    BlockLoad(temp_storage.load).Load(d_in + block_offset, data);

    __syncthreads();

    // Store data (unguarded, full tile)
    BlockStore(temp_storage.store).Store(d_out_unguarded + block_offset, data);

    __syncthreads();

    // Reset data
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        data[ITEM] = OutputT();

    __syncthreads();

    // Load data (guarded by valid-item count)
    BlockLoad(temp_storage.load).Load(d_in + block_offset, data, guarded_elements);

    __syncthreads();

    // Store data (guarded by valid-item count)
    BlockStore(temp_storage.store).Store(d_out_guarded + block_offset, data, guarded_elements);
}


//---------------------------------------------------------------------
// Host testing subroutines
//---------------------------------------------------------------------

/**
 * Test load/store variants
 */
template <
    typename            T,
    int                 BLOCK_THREADS,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  LOAD_ALGORITHM,
    BlockStoreAlgorithm STORE_ALGORITHM,
    typename            InputIteratorT,
    typename            OutputIteratorT>
void TestKernel(
    T                   *h_in,
    InputIteratorT      d_in,
    OutputIteratorT     d_out_unguarded_itr,
    OutputIteratorT     d_out_guarded_itr,
    T                   *d_out_unguarded_ptr,
    T                   *d_out_guarded_ptr,
    int                 grid_size,
    int                 guarded_elements)
{
    int compare;

    int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;

    // Test with discard output iterator
    typedef typename std::iterator_traits<InputIteratorT>::difference_type OffsetT;
    DiscardOutputIterator<OffsetT> discard_itr;

    Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>
        <<<grid_size, BLOCK_THREADS>>>(
            d_in,
            discard_itr,
            discard_itr,
            guarded_elements);

    // Test with regular output iterator
    Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>
        <<<grid_size, BLOCK_THREADS>>>(
            d_in,
            d_out_unguarded_itr,
            d_out_guarded_itr,
            guarded_elements);

    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());

    // Check guarded results
    compare = CompareDeviceResults(h_in, d_out_guarded_ptr, guarded_elements, g_verbose, g_verbose);
    printf("\tGuarded: %s\n", (compare) ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    // Check unguarded results
    compare = CompareDeviceResults(h_in, d_out_unguarded_ptr, unguarded_elements, g_verbose, g_verbose);
    printf("\tUnguarded: %s\n", (compare) ? "FAIL" : "PASS");
    AssertEquals(0, compare);
}
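// (Editor's note: TestNative and TestIterator below come in pairs dispatched
// on Int2Type<sufficient_resources>: configurations whose shared-memory or
// thread-count needs exceed the target's limits resolve to the empty
// Int2Type<false> overloads, so they compile but never run.)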
/**
 * Test native pointer.  Specialized for sufficient resources
 */
template <
    typename            T,
    int                 BLOCK_THREADS,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  LOAD_ALGORITHM,
    BlockStoreAlgorithm STORE_ALGORITHM>
void TestNative(
    int                 grid_size,
    float               fraction_valid,
    Int2Type<true>      sufficient_resources)
{
    int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
    int guarded_elements = int(fraction_valid * float(unguarded_elements));

    // Allocate host arrays
    T *h_in = (T*) malloc(unguarded_elements * sizeof(T));

    // Allocate device arrays
    T *d_in             = NULL;
    T *d_out_unguarded  = NULL;
    T *d_out_guarded    = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in,            sizeof(T) * unguarded_elements));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded,   sizeof(T) * guarded_elements));
    CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements));
    CubDebugExit(cudaMemset(d_out_guarded,   0, sizeof(T) * guarded_elements));

    // Initialize problem on host and device
    for (int i = 0; i < unguarded_elements; ++i)
    {
        InitValue(INTEGER_SEED, h_in[i], i);
    }
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice));

    printf("TestNative "
        "grid_size(%d) "
        "guarded_elements(%d) "
        "unguarded_elements(%d) "
        "BLOCK_THREADS(%d) "
        "ITEMS_PER_THREAD(%d) "
        "LOAD_ALGORITHM(%d) "
        "STORE_ALGORITHM(%d) "
        "sizeof(T)(%d)\n",
        grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, (int) sizeof(T));

    TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(
        h_in,
        (T const *) d_in,   // Test const
        d_out_unguarded,
        d_out_guarded,
        d_out_unguarded,
        d_out_guarded,
        grid_size,
        guarded_elements);

    // Cleanup
    if (h_in) free(h_in);
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded));
    if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded));
}


/**
 * Test native pointer.  Specialized for insufficient resources
 */
template <
    typename            T,
    int                 BLOCK_THREADS,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  LOAD_ALGORITHM,
    BlockStoreAlgorithm STORE_ALGORITHM>
void TestNative(
    int                 grid_size,
    float               fraction_valid,
    Int2Type<false>     sufficient_resources)
{}
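// (Editor's note: the iterator variant below drives the same kernel through
// CacheModifiedInputIterator/CacheModifiedOutputIterator wrappers, whose
// dereferences apply the given PTX cache modifiers instead of plain pointer
// accesses, checking that BlockLoad/BlockStore also work with non-pointer
// iterator types.)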
/**
 * Test iterator.  Specialized for sufficient resources.
 */
template <
    typename            T,
    int                 BLOCK_THREADS,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  LOAD_ALGORITHM,
    BlockStoreAlgorithm STORE_ALGORITHM,
    CacheLoadModifier   LOAD_MODIFIER,
    CacheStoreModifier  STORE_MODIFIER>
void TestIterator(
    int                 grid_size,
    float               fraction_valid,
    Int2Type<true>      sufficient_resources)
{
    int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
    int guarded_elements = int(fraction_valid * float(unguarded_elements));

    // Allocate host arrays
    T *h_in = (T*) malloc(unguarded_elements * sizeof(T));

    // Allocate device arrays
    T *d_in             = NULL;
    T *d_out_unguarded  = NULL;
    T *d_out_guarded    = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in,            sizeof(T) * unguarded_elements));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded,   sizeof(T) * guarded_elements));
    CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements));
    CubDebugExit(cudaMemset(d_out_guarded,   0, sizeof(T) * guarded_elements));

    // Initialize problem on host and device
    for (int i = 0; i < unguarded_elements; ++i)
    {
        InitValue(INTEGER_SEED, h_in[i], i);
    }
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice));

    printf("TestIterator "
        "grid_size(%d) "
        "guarded_elements(%d) "
        "unguarded_elements(%d) "
        "BLOCK_THREADS(%d) "
        "ITEMS_PER_THREAD(%d) "
        "LOAD_ALGORITHM(%d) "
        "STORE_ALGORITHM(%d) "
        "LOAD_MODIFIER(%d) "
        "STORE_MODIFIER(%d) "
        "sizeof(T)(%d)\n",
        grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_MODIFIER, STORE_MODIFIER, (int) sizeof(T));

    TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(
        h_in,
        CacheModifiedInputIterator<LOAD_MODIFIER, T>(d_in),
        CacheModifiedOutputIterator<STORE_MODIFIER, T>(d_out_unguarded),
        CacheModifiedOutputIterator<STORE_MODIFIER, T>(d_out_guarded),
        d_out_unguarded,
        d_out_guarded,
        grid_size,
        guarded_elements);

    // Cleanup
    if (h_in) free(h_in);
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded));
    if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded));
}


/**
 * Test iterator.  Specialized for insufficient resources.
 */
template <
    typename            T,
    int                 BLOCK_THREADS,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  LOAD_ALGORITHM,
    BlockStoreAlgorithm STORE_ALGORITHM,
    CacheLoadModifier   LOAD_MODIFIER,
    CacheStoreModifier  STORE_MODIFIER>
void TestIterator(
    int                 grid_size,
    float               fraction_valid,
    Int2Type<false>     sufficient_resources)
{}
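// (Editor's note: TestPointerType below skips configurations whose
// temp-storage or thread-count requirements exceed the target's limits:
// 16 KB of shared memory and 512 threads per block on sm_1x targets,
// 48 KB and 1024 threads on newer architectures.)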
/**
 * Evaluate different pointer access types
 */
template <
    typename            T,
    int                 BLOCK_THREADS,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  LOAD_ALGORITHM,
    BlockStoreAlgorithm STORE_ALGORITHM>
void TestPointerType(
    int     grid_size,
    float   fraction_valid)
{
    // Threadblock load/store abstraction types
    typedef BlockLoad<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM>   BlockLoad;
    typedef BlockStore<T, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM> BlockStore;

#if defined(SM100) || defined(SM110) || defined(SM130)
    static const bool sufficient_load_smem  = sizeof(typename BlockLoad::TempStorage)  <= 1024 * 16;
    static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage) <= 1024 * 16;
    static const bool sufficient_threads    = BLOCK_THREADS <= 512;
#else
    static const bool sufficient_load_smem  = sizeof(typename BlockLoad::TempStorage)  <= 1024 * 48;
    static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage) <= 1024 * 48;
    static const bool sufficient_threads    = BLOCK_THREADS <= 1024;
#endif

    static const bool sufficient_resources = sufficient_load_smem && sufficient_store_smem && sufficient_threads;

    TestNative<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(grid_size, fraction_valid, Int2Type<sufficient_resources>());
    TestIterator<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_DEFAULT, STORE_DEFAULT>(grid_size, fraction_valid, Int2Type<sufficient_resources>());
}


/**
 * Evaluate different time-slicing strategies
 * (Editor's note: this routine is not referenced anywhere in this file, so it
 * is never instantiated; it also passes six template arguments to the
 * five-parameter TestPointerType and appears to be vestigial.)
 */
template <
    typename            T,
    int                 BLOCK_THREADS,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  LOAD_ALGORITHM,
    BlockStoreAlgorithm STORE_ALGORITHM>
void TestSlicedStrategy(
    int     grid_size,
    float   fraction_valid)
{
    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, true>(grid_size, fraction_valid);
    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, false>(grid_size, fraction_valid);
}


/**
 * Evaluate different load/store strategies (specialized for block sizes that are not a multiple of 32)
 */
template <
    typename    T,
    int         BLOCK_THREADS,
    int         ITEMS_PER_THREAD>
void TestStrategy(
    int             grid_size,
    float           fraction_valid,
    Int2Type<false> is_warp_multiple)
{
    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT,    BLOCK_STORE_DIRECT>(grid_size, fraction_valid);
    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE, BLOCK_STORE_TRANSPOSE>(grid_size, fraction_valid);
    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_VECTORIZE, BLOCK_STORE_VECTORIZE>(grid_size, fraction_valid);
}


/**
 * Evaluate different load/store strategies (specialized for block sizes that are a multiple of 32)
 */
template <
    typename    T,
    int         BLOCK_THREADS,
    int         ITEMS_PER_THREAD>
void TestStrategy(
    int             grid_size,
    float           fraction_valid,
    Int2Type<true>  is_warp_multiple)
{
    // Also run the strategies that don't require a warp-multiple block size
    TestStrategy<T, BLOCK_THREADS, ITEMS_PER_THREAD>(grid_size, fraction_valid, Int2Type<false>());

    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE,            BLOCK_STORE_WARP_TRANSPOSE>(grid_size, fraction_valid);
    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED>(grid_size, fraction_valid);
}
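// (Editor's note: the WARP_TRANSPOSE strategies require BLOCK_THREADS to be
// a multiple of the warp size (32), which is why they are exercised only by
// the Int2Type<true> specialization of TestStrategy above.)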
/**
 * Evaluate different register blocking
 */
template <
    typename    T,
    int         BLOCK_THREADS>
void TestItemsPerThread(
    int     grid_size,
    float   fraction_valid)
{
    Int2Type<BLOCK_THREADS % 32 == 0> is_warp_multiple;

    TestStrategy<T, BLOCK_THREADS, 1>(grid_size, fraction_valid, is_warp_multiple);
    TestStrategy<T, BLOCK_THREADS, 3>(grid_size, fraction_valid, is_warp_multiple);
    TestStrategy<T, BLOCK_THREADS, 4>(grid_size, fraction_valid, is_warp_multiple);
    TestStrategy<T, BLOCK_THREADS, 11>(grid_size, fraction_valid, is_warp_multiple);
}


/**
 * Evaluate different thread block sizes
 */
template <typename T>
void TestThreads(
    int     grid_size,
    float   fraction_valid)
{
    TestItemsPerThread<T, 15>(grid_size, fraction_valid);
    TestItemsPerThread<T, 32>(grid_size, fraction_valid);
    TestItemsPerThread<T, 72>(grid_size, fraction_valid);
    TestItemsPerThread<T, 96>(grid_size, fraction_valid);
    TestItemsPerThread<T, 128>(grid_size, fraction_valid);
}


/**
 * Main
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>] "
            "[--v] "
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Get ptx version
    int ptx_version;
    CubDebugExit(PtxVersion(ptx_version));

#ifdef QUICK_TEST

    // Compile/run quick tests
    TestNative<int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(1, 0.8f, Int2Type<true>());
    TestIterator<int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE, LOAD_DEFAULT, STORE_DEFAULT>(1, 0.8f, Int2Type<true>());

#else

    // Compile/run thorough tests
    TestThreads<char>(2, 0.8f);
    TestThreads<int>(2, 0.8f);
    TestThreads<long>(2, 0.8f);
    TestThreads<long2>(2, 0.8f);

    if (ptx_version > 120)          // Don't check doubles on PTX120 or below because they're down-converted
        TestThreads<double2>(2, 0.8f);

    TestThreads<TestFoo>(2, 0.8f);
    TestThreads<TestBar>(2, 0.8f);

#endif

    return 0;
}
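Build note (paths and flags assumed, not part of the file): compiled from the cub-1.8.0 root with the test harness on the include path, something like

    nvcc -I. -Itest -arch=sm_35 test/test_block_load_store.cu -o test_block_load_store

should produce the thorough test binary; adding -DQUICK_TEST compiles only the two quick configurations, and passing --v at runtime enables verbose comparison output.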