  /******************************************************************************
   * Copyright (c) 2011, Duane Merrill.  All rights reserved.
   * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
   * 
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions are met:
   *     * Redistributions of source code must retain the above copyright
   *       notice, this list of conditions and the following disclaimer.
   *     * Redistributions in binary form must reproduce the above copyright
   *       notice, this list of conditions and the following disclaimer in the
   *       documentation and/or other materials provided with the distribution.
   *     * Neither the name of the NVIDIA CORPORATION nor the
   *       names of its contributors may be used to endorse or promote products
   *       derived from this software without specific prior written permission.
   * 
   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
   * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   *
   ******************************************************************************/
  
  /**
   * \file
   * The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
   */
  
  #pragma once
  
  #include "specializations/block_reduce_raking.cuh"
  #include "specializations/block_reduce_raking_commutative_only.cuh"
  #include "specializations/block_reduce_warp_reductions.cuh"
  #include "../util_ptx.cuh"
  #include "../util_type.cuh"
  #include "../thread/thread_operators.cuh"
  #include "../util_namespace.cuh"
  
  /// Optional outer namespace(s)
  CUB_NS_PREFIX
  
  /// CUB namespace
  namespace cub {
  
  
  
  /******************************************************************************
   * Algorithmic variants
   ******************************************************************************/
  
  /**
   * BlockReduceAlgorithm enumerates alternative algorithms for parallel
   * reduction across a CUDA thread block.
   */
  enum BlockReduceAlgorithm
  {
  
      /**
       * \par Overview
       * An efficient "raking" reduction algorithm that only supports commutative
       * reduction operators (true for most operations, e.g., addition).
       *
       * \par
     * Execution consists of three phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more
     *    than one input each).  Threads in warps other than the first warp place
     *    their partial reductions into shared memory.
     * -# Upsweep sequential reduction in shared memory.  Threads within the first
     *    warp continue to accumulate by raking across segments of shared partial reductions.
       * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
       *
       * \par
       * \image html block_reduce.png
       * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
       *
       * \par Performance Considerations
     * - This variant performs less communication than BLOCK_REDUCE_RAKING
     *   and is preferable when the reduction operator is commutative.  This variant
     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
     *   higher than that of BLOCK_REDUCE_WARP_REDUCTIONS, making it less desirable
     *   when the GPU is under-occupied.
       */
      BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
  
  
      /**
       * \par Overview
       * An efficient "raking" reduction algorithm that supports commutative
       * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
       * operators. \blocked.
       *
       * \par
     * Execution consists of three phases:
       * -# Upsweep sequential reduction in registers (if threads contribute more
       *    than one input each).  Each thread then places the partial reduction
       *    of its item(s) into shared memory.
       * -# Upsweep sequential reduction in shared memory.  Threads within a
       *    single warp rake across segments of shared partial reductions.
       * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
       *
       * \par
       * \image html block_reduce.png
       * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
       *
       * \par Performance Considerations
     * - This variant performs more communication than BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
     *   and is preferable only when the reduction operator is non-commutative.  This variant
     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
     *   higher than that of BLOCK_REDUCE_WARP_REDUCTIONS, making it less desirable
     *   when the GPU is under-occupied.
       */
      BLOCK_REDUCE_RAKING,
  
  
      /**
       * \par Overview
       * A quick "tiled warp-reductions" reduction algorithm that supports commutative
       * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
       * operators.
       *
       * \par
     * Execution consists of three phases:
     * -# Upsweep sequential reduction in registers (if threads contribute more
     *    than one input each).  Each thread then places the partial reduction
     *    of its item(s) into shared memory.
     * -# A shallow, but work-inefficient, warp-synchronous Kogge-Stone style
     *    reduction within each warp.
     * -# A propagation phase in which each warp's reduction output is
     *    updated with the aggregate from each preceding warp.
       *
       * \par
       * \image html block_scan_warpscans.png
       * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
       *
       * \par Performance Considerations
     * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
     *   or BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, which may result in lower overall
     *   throughput across the GPU.  However, turn-around latency may be lower and
       *   thus useful when the GPU is under-occupied.
       */
      BLOCK_REDUCE_WARP_REDUCTIONS,
  };
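
/*
 * A minimal selection sketch (illustrative only; the kernel and buffer names
 * below are assumptions, not part of this header).  The algorithmic variant
 * is chosen through BlockReduce's third template parameter:
 *
 *     __global__ void RakingSumKernel(const int *d_in, int *d_out)
 *     {
 *         // Explicitly select the commutative-only raking variant
 *         typedef cub::BlockReduce<int, 128,
 *             cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduceT;
 *
 *         __shared__ typename BlockReduceT::TempStorage temp_storage;
 *
 *         int thread_data = d_in[blockIdx.x * blockDim.x + threadIdx.x];
 *
 *         // Block-wide sum (the result is only valid in thread0)
 *         int aggregate = BlockReduceT(temp_storage).Sum(thread_data);
 *
 *         if (threadIdx.x == 0)
 *             d_out[blockIdx.x] = aggregate;
 *     }
 */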
  
  
  /******************************************************************************
   * Block reduce
   ******************************************************************************/
  
  /**
   * \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
   * \ingroup BlockModule
   *
   * \tparam T                Data type being reduced
   * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
   * \tparam ALGORITHM        <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
   * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
   * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
   * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
   *
   * \par Overview
   * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
   *   uses a binary combining operator to compute a single aggregate from a list of input elements.
   * - \rowmajor
   * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
   *   -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>.  An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
   *   -# <b>cub::BLOCK_REDUCE_RAKING</b>.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
   *   -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
   *
   * \par Performance Considerations
   * - \granularity
   * - Very efficient (only one synchronization barrier).
 * - Incurs zero bank conflicts for most types.
 * - Computation is slightly more efficient (i.e., incurs lower instruction overhead) for:
   *   - Summation (<b><em>vs.</em></b> generic reduction)
   *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
   *   - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
   * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
   *
   * \par A Simple Example
   * \blockcollective{BlockReduce}
   * \par
   * The code snippet below illustrates a sum reduction of 512 integer items that
   * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
   * where each thread owns 4 consecutive items.
   * \par
   * \code
   * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
   *
   * __global__ void ExampleKernel(...)
   * {
   *     // Specialize BlockReduce for a 1D block of 128 threads on type int
   *     typedef cub::BlockReduce<int, 128> BlockReduce;
   *
   *     // Allocate shared memory for BlockReduce
   *     __shared__ typename BlockReduce::TempStorage temp_storage;
   *
   *     // Obtain a segment of consecutive items that are blocked across threads
   *     int thread_data[4];
   *     ...
   *
   *     // Compute the block-wide sum for thread0
   *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
 * }
 * \endcode
   *
   */
  template <
      typename                T,
      int                     BLOCK_DIM_X,
      BlockReduceAlgorithm    ALGORITHM       = BLOCK_REDUCE_WARP_REDUCTIONS,
      int                     BLOCK_DIM_Y     = 1,
      int                     BLOCK_DIM_Z     = 1,
      int                     PTX_ARCH        = CUB_PTX_ARCH>
  class BlockReduce
  {
  private:
  
      /******************************************************************************
       * Constants and type definitions
       ******************************************************************************/
  
      /// Constants
      enum
      {
          /// The thread block size in threads
          BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
      };
  
      typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>           WarpReductions;
      typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>    RakingCommutativeOnly;
      typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                   Raking;
  
      /// Internal specialization type
      typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
          WarpReductions,
          typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
              RakingCommutativeOnly,
            Raking>::Type>::Type InternalBlockReduce;      // Specialization chosen by ALGORITHM
  
      /// Shared memory storage layout type for BlockReduce
      typedef typename InternalBlockReduce::TempStorage _TempStorage;
  
  
      /******************************************************************************
       * Utility methods
       ******************************************************************************/
  
    /// Internal storage allocator (a function-scope __shared__ declaration
    /// provides a single static block-wide allocation)
      __device__ __forceinline__ _TempStorage& PrivateStorage()
      {
          __shared__ _TempStorage private_storage;
          return private_storage;
      }
  
  
      /******************************************************************************
       * Thread fields
       ******************************************************************************/
  
      /// Shared storage reference
      _TempStorage &temp_storage;
  
      /// Linear thread-id
      unsigned int linear_tid;
  
  
  public:
  
      /// \smemstorage{BlockReduce}
      struct TempStorage : Uninitialized<_TempStorage> {};
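
    /*
     * A minimal reuse sketch (illustrative; the union and member names are
     * assumptions): because TempStorage wraps uninitialized storage, it can
     * alias other block-wide shared memory, provided a __syncthreads()
     * barrier separates the two uses:
     *
     *     typedef cub::BlockReduce<int, 128> BlockReduceT;
     *
     *     __shared__ union
     *     {
     *         typename BlockReduceT::TempStorage reduce;   // reduction storage
     *         int staging[128];                            // repurposed afterwards
     *     } shared;
     *
     *     int aggregate = BlockReduceT(shared.reduce).Sum(thread_data);
     *     __syncthreads();    // barrier before reusing the allocation
     */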
  
  
      /******************************************************************//**
     * \name Collective constructors
       *********************************************************************/
      //@{
  
      /**
       * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
       */
      __device__ __forceinline__ BlockReduce()
      :
          temp_storage(PrivateStorage()),
          linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
      {}
  
  
      /**
       * \brief Collective constructor using the specified memory allocation as temporary storage.
       */
      __device__ __forceinline__ BlockReduce(
          TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
      :
          temp_storage(temp_storage.Alias()),
          linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
      {}
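
    /*
     * A minimal multi-dimensional sketch (illustrative; the typedef name is
     * an assumption): BLOCK_DIM_Y and BLOCK_DIM_Z are supplied after the
     * ALGORITHM parameter, here giving an 8x16 = 128-thread 2D block:
     *
     *     typedef cub::BlockReduce<float, 8,
     *         cub::BLOCK_REDUCE_WARP_REDUCTIONS, 16> BlockReduce2D;
     *
     *     __shared__ typename BlockReduce2D::TempStorage temp_storage;
     *     float aggregate = BlockReduce2D(temp_storage).Sum(thread_data);
     */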
  
  
      //@}  end member group
      /******************************************************************//**
     * \name Generic reductions
       *********************************************************************/
      //@{
  
  
      /**
       * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes one input element.
       *
       * \par
       * - The return value is undefined in threads other than thread<sub>0</sub>.
       * - \rowmajor
       * - \smemreuse
       *
       * \par Snippet
       * The code snippet below illustrates a max reduction of 128 integer items that
       * are partitioned across 128 threads.
       * \par
       * \code
       * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
       *
       * __global__ void ExampleKernel(...)
       * {
       *     // Specialize BlockReduce for a 1D block of 128 threads on type int
       *     typedef cub::BlockReduce<int, 128> BlockReduce;
       *
       *     // Allocate shared memory for BlockReduce
       *     __shared__ typename BlockReduce::TempStorage temp_storage;
       *
       *     // Each thread obtains an input item
       *     int thread_data;
       *     ...
       *
       *     // Compute the block-wide max for thread0
       *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
     * }
     * \endcode
       *
       * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
       */
      template <typename ReductionOp>
      __device__ __forceinline__ T Reduce(
          T               input,                      ///< [in] Calling thread's input
          ReductionOp     reduction_op)               ///< [in] Binary reduction functor 
      {
          return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
      }
  
  
      /**
       * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
       *
       * \par
       * - The return value is undefined in threads other than thread<sub>0</sub>.
       * - \granularity
       * - \smemreuse
       *
       * \par Snippet
       * The code snippet below illustrates a max reduction of 512 integer items that
       * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
       * where each thread owns 4 consecutive items.
       * \par
       * \code
       * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
       *
       * __global__ void ExampleKernel(...)
       * {
       *     // Specialize BlockReduce for a 1D block of 128 threads on type int
       *     typedef cub::BlockReduce<int, 128> BlockReduce;
       *
       *     // Allocate shared memory for BlockReduce
       *     __shared__ typename BlockReduce::TempStorage temp_storage;
       *
       *     // Obtain a segment of consecutive items that are blocked across threads
       *     int thread_data[4];
       *     ...
       *
       *     // Compute the block-wide max for thread0
       *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
     * }
     * \endcode
       *
       * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
       * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
       */
      template <
          int ITEMS_PER_THREAD,
          typename ReductionOp>
      __device__ __forceinline__ T Reduce(
          T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
          ReductionOp     reduction_op)                   ///< [in] Binary reduction functor 
      {
          // Reduce partials
          T partial = internal::ThreadReduce(inputs, reduction_op);
          return Reduce(partial, reduction_op);
      }
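
    /*
     * A minimal functor sketch (illustrative; IntMin is an assumption, not a
     * CUB type): any binary functor exposing "T operator()(const T&, const T&)"
     * can be passed to Reduce(), e.g. for a block-wide minimum:
     *
     *     struct IntMin
     *     {
     *         __device__ __forceinline__ int operator()(const int &a, const int &b) const
     *         {
     *             return (b < a) ? b : a;
     *         }
     *     };
     *
     *     // Inside a kernel with a 1D block of 128 threads:
     *     //     int block_min = BlockReduce(temp_storage).Reduce(thread_data, IntMin());
     */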
  
  
      /**
       * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
       *
       * \par
       * - The return value is undefined in threads other than thread<sub>0</sub>.
       * - \rowmajor
       * - \smemreuse
       *
       * \par Snippet
       * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
       * are partitioned across 128 threads.
       * \par
       * \code
       * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
       *
       * __global__ void ExampleKernel(int num_valid, ...)
       * {
       *     // Specialize BlockReduce for a 1D block of 128 threads on type int
       *     typedef cub::BlockReduce<int, 128> BlockReduce;
       *
       *     // Allocate shared memory for BlockReduce
       *     __shared__ typename BlockReduce::TempStorage temp_storage;
       *
       *     // Each thread obtains an input item
       *     int thread_data;
       *     if (threadIdx.x < num_valid) thread_data = ...
       *
       *     // Compute the block-wide max for thread0
       *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
     * }
     * \endcode
       *
       * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
       */
      template <typename ReductionOp>
      __device__ __forceinline__ T Reduce(
          T                   input,                  ///< [in] Calling thread's input
          ReductionOp         reduction_op,           ///< [in] Binary reduction functor 
          int                 num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
      {
        // Determine if we can skip bounds checking
          if (num_valid >= BLOCK_THREADS)
          {
              return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
          }
          else
          {
              return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
          }
      }
  
  
      //@}  end member group
      /******************************************************************//**
     * \name Summation reductions
       *********************************************************************/
      //@{
  
  
      /**
       * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
       *
       * \par
       * - The return value is undefined in threads other than thread<sub>0</sub>.
       * - \rowmajor
       * - \smemreuse
       *
       * \par Snippet
       * The code snippet below illustrates a sum reduction of 128 integer items that
       * are partitioned across 128 threads.
       * \par
       * \code
       * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
       *
       * __global__ void ExampleKernel(...)
       * {
       *     // Specialize BlockReduce for a 1D block of 128 threads on type int
       *     typedef cub::BlockReduce<int, 128> BlockReduce;
       *
       *     // Allocate shared memory for BlockReduce
       *     __shared__ typename BlockReduce::TempStorage temp_storage;
       *
       *     // Each thread obtains an input item
       *     int thread_data;
       *     ...
       *
       *     // Compute the block-wide sum for thread0
       *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
     * }
     * \endcode
       *
       */
      __device__ __forceinline__ T Sum(
          T   input)                      ///< [in] Calling thread's input
      {
          return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
      }
  
      /**
       * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
       *
       * \par
       * - The return value is undefined in threads other than thread<sub>0</sub>.
       * - \granularity
       * - \smemreuse
       *
       * \par Snippet
       * The code snippet below illustrates a sum reduction of 512 integer items that
       * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
       * where each thread owns 4 consecutive items.
       * \par
       * \code
       * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
       *
       * __global__ void ExampleKernel(...)
       * {
       *     // Specialize BlockReduce for a 1D block of 128 threads on type int
       *     typedef cub::BlockReduce<int, 128> BlockReduce;
       *
       *     // Allocate shared memory for BlockReduce
       *     __shared__ typename BlockReduce::TempStorage temp_storage;
       *
       *     // Obtain a segment of consecutive items that are blocked across threads
       *     int thread_data[4];
       *     ...
       *
       *     // Compute the block-wide sum for thread0
       *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
     * }
     * \endcode
       *
       * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
       */
      template <int ITEMS_PER_THREAD>
      __device__ __forceinline__ T Sum(
          T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
      {
          // Reduce partials
          T partial = internal::ThreadReduce(inputs, cub::Sum());
          return Sum(partial);
      }
  
  
      /**
       * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
       *
       * \par
       * - The return value is undefined in threads other than thread<sub>0</sub>.
       * - \rowmajor
       * - \smemreuse
       *
       * \par Snippet
       * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
       * are partitioned across 128 threads.
       * \par
       * \code
       * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
       *
       * __global__ void ExampleKernel(int num_valid, ...)
       * {
       *     // Specialize BlockReduce for a 1D block of 128 threads on type int
       *     typedef cub::BlockReduce<int, 128> BlockReduce;
       *
       *     // Allocate shared memory for BlockReduce
       *     __shared__ typename BlockReduce::TempStorage temp_storage;
       *
     *     // Each thread obtains an input item (up to num_valid)
       *     int thread_data;
       *     if (threadIdx.x < num_valid)
       *         thread_data = ...
       *
       *     // Compute the block-wide sum for thread0
       *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
     * }
     * \endcode
       *
       */
      __device__ __forceinline__ T Sum(
          T   input,                  ///< [in] Calling thread's input
          int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
      {
        // Determine if we can skip bounds checking
          if (num_valid >= BLOCK_THREADS)
          {
              return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
          }
          else
          {
              return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
          }
      }
  
  
      //@}  end member group
  };
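
/*
 * A minimal end-to-end sketch (illustrative only; the kernel and buffer names
 * are assumptions) combining the bounds-checked Sum() overload with a
 * per-block output, for inputs whose size need not be a multiple of the
 * block size:
 *
 *     __global__ void PartialTileSum(const int *d_in, int *d_block_sums, int num_items)
 *     {
 *         typedef cub::BlockReduce<int, 128> BlockReduceT;
 *         __shared__ typename BlockReduceT::TempStorage temp_storage;
 *
 *         int tile_base = blockIdx.x * 128;
 *         int num_valid = min(128, num_items - tile_base);
 *
 *         int thread_data = 0;
 *         if (threadIdx.x < num_valid)
 *             thread_data = d_in[tile_base + threadIdx.x];
 *
 *         // Bounds-checked block-wide sum (result valid in thread0 only)
 *         int block_sum = BlockReduceT(temp_storage).Sum(thread_data, num_valid);
 *
 *         if (threadIdx.x == 0)
 *             d_block_sums[blockIdx.x] = block_sum;
 *     }
 */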
  
  /**
   * \example example_block_reduce.cu
   */
  
  }               // CUB namespace
  CUB_NS_POSTFIX  // Optional outer namespace(s)