tools/cub-1.8.0/experimental/defunct/example_coo_spmv.cu
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * An implementation of COO SpMV using prefix scan to implement a
 * reduce-value-by-row strategy
 ******************************************************************************/
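
/******************************************************************************
 * Illustrative sketch (not part of the original example): the
 * reduce-value-by-row idea on a tiny COO matrix, assuming the nonzeros are
 * presorted by row (as this example expects).
 *
 *   A (3x3, 5 nonzeros), x = [1, 2, 3]:
 *     rows     = [0, 0, 1, 2, 2]
 *     columns  = [0, 2, 1, 0, 2]
 *     values   = [4, 1, 3, 2, 5]
 *
 *   Per-nonzero products values[i] * x[columns[i]]:
 *     partials = [4, 3, 6, 2, 15]
 *
 *   An exclusive prefix scan with a "reduce by row" operator carries a running
 *   sum per row: wherever a new row begins (a "head flag"), the scanned value
 *   holds the finished total of the row that just ended and is scattered to y;
 *   the last row's total is written from the running prefix after all tiles.
 *
 *     y = A * x = [4 + 3, 6, 2 + 15] = [7, 6, 17]
 ******************************************************************************/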

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <iterator>
#include <vector>
#include <string>
#include <algorithm>
#include <stdio.h>

#include <cub/cub.cuh>

#include "coo_graph.cuh"
#include "../test/test_util.h"

using namespace cub;
using namespace std;


/******************************************************************************
 * Globals, constants, and typedefs
 ******************************************************************************/

typedef int     VertexId;   // 32-bit signed integers as vertex ids
typedef double  Value;      // double-precision floating point values

bool                    g_verbose           = false;
int                     g_timing_iterations = 1;
CachingDeviceAllocator  g_allocator;


/******************************************************************************
 * Texture referencing
 ******************************************************************************/

/**
 * Templated texture reference type for multiplicand vector
 */
template <typename Value>
struct TexVector
{
    // Texture type to actually use (e.g., because CUDA doesn't load doubles as texture items)
    typedef typename If<(Equals<Value, double>::VALUE), uint2, Value>::Type CastType;

    // Texture reference type
    typedef texture<CastType, cudaTextureType1D, cudaReadModeElementType> TexRef;

    static TexRef ref;

    /**
     * Bind textures
     */
    static void BindTexture(void *d_in, int elements)
    {
        cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<CastType>();
        if (d_in)
        {
            size_t offset;
            size_t bytes = sizeof(CastType) * elements;
            CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes));
        }
    }

    /**
     * Unbind textures
     */
    static void UnbindTexture()
    {
        CubDebugExit(cudaUnbindTexture(ref));
    }

    /**
     * Load
     */
    static __device__ __forceinline__ Value Load(int offset)
    {
        Value output;
        reinterpret_cast<typename TexVector<Value>::CastType &>(output) = tex1Dfetch(TexVector<Value>::ref, offset);
        return output;
    }
};

// Texture reference definitions
template <typename Value>
typename TexVector<Value>::TexRef TexVector<Value>::ref = 0;
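
/******************************************************************************
 * Illustrative sketch (not part of the original example): textures cannot
 * fetch 64-bit doubles directly, which is why TexVector casts them to uint2
 * and reinterprets the bits on load.  An equivalent load could reassemble the
 * double from its two 32-bit halves with __hiloint2double (assuming the usual
 * layout where .x holds the low word and .y the high word):
 *
 *   static __device__ __forceinline__ double LoadDouble(int offset)
 *   {
 *       uint2 v = tex1Dfetch(TexVector<double>::ref, offset);
 *       return __hiloint2double(v.y, v.x);   // (hi, lo) -> double
 *   }
 ******************************************************************************/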

/******************************************************************************
 * Utility types
 ******************************************************************************/

/**
 * A partial dot-product sum paired with a corresponding row-id
 */
template <typename VertexId, typename Value>
struct PartialProduct
{
    VertexId    row;            /// Row-id
    Value       partial;        /// PartialProduct sum
};


/**
 * A partial dot-product sum paired with a corresponding row-id (specialized for double-int pairings)
 */
template <>
struct PartialProduct<int, double>
{
    long long   row;            /// Row-id
    double      partial;        /// PartialProduct sum
};


/**
 * Reduce-value-by-row scan operator
 */
struct ReduceByKeyOp
{
    template <typename PartialProduct>
    __device__ __forceinline__ PartialProduct operator()(
        const PartialProduct &first,
        const PartialProduct &second)
    {
        PartialProduct retval;

        retval.partial = (second.row != first.row) ?
            second.partial :
            first.partial + second.partial;

        retval.row = second.row;
        return retval;
    }
};


/**
 * Stateful block-wide prefix operator for BlockScan
 */
template <typename PartialProduct>
struct BlockPrefixCallbackOp
{
    // Running block-wide prefix
    PartialProduct running_prefix;

    /**
     * Returns the block-wide running_prefix in thread-0
     */
    __device__ __forceinline__ PartialProduct operator()(
        const PartialProduct &block_aggregate)      ///< The aggregate sum of the BlockScan inputs
    {
        ReduceByKeyOp scan_op;

        PartialProduct retval = running_prefix;
        running_prefix = scan_op(running_prefix, block_aggregate);
        return retval;
    }
};


/**
 * Operator for detecting discontinuities in a list of row identifiers.
 */
struct NewRowOp
{
    /// Returns true if row_b is the start of a new row
    template <typename VertexId>
    __device__ __forceinline__ bool operator()(
        const VertexId& row_a,
        const VertexId& row_b)
    {
        return (row_a != row_b);
    }
};
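
// Illustrative sketch (not part of the original example): a sequential host
// emulation of what the ReduceByKeyOp scan plus head-flag scatter computes.
// Given row ids (presorted, ascending) and per-nonzero partial products, it
// accumulates a running PartialProduct exactly as the block-wide scan does and
// writes a finished row total whenever the row id changes (and once at the end).
// The function name and signature are hypothetical; it is never called by the
// example itself.
template <typename VertexId, typename Value>
static void ReferenceReduceByRow(
    const VertexId  *rows,          // nonzero row ids, ascending
    const Value     *partials,      // per-nonzero products values[i] * x[columns[i]]
    int             num_nonzeros,
    Value           *result)        // output vector (assumed pre-zeroed)
{
    if (num_nonzeros <= 0) return;

    ReduceByKeyOp scan_op;

    PartialProduct<VertexId, Value> running;
    running.row     = rows[0];
    running.partial = Value(0);

    for (int i = 0; i < num_nonzeros; ++i)
    {
        PartialProduct<VertexId, Value> current;
        current.row     = rows[i];
        current.partial = partials[i];

        // A new row id means the running value holds the previous row's total
        if (current.row != running.row)
            result[running.row] = running.partial;

        // ReduceByKeyOp either accumulates (same row) or resets (new row)
        running = scan_op(running, current);
    }

    // The final row's total is only known after the last nonzero
    result[running.row] = running.partial;
}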

/******************************************************************************
 * Persistent thread block types
 ******************************************************************************/

/**
 * SpMV thread block abstraction for processing a contiguous segment of
 * sparse COO tiles.
 */
template <
    int             BLOCK_THREADS,
    int             ITEMS_PER_THREAD,
    typename        VertexId,
    typename        Value>
struct PersistentBlockSpmv
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // Constants
    enum
    {
        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    // Head flag type
    typedef int HeadFlag;

    // Partial dot product type
    typedef PartialProduct<VertexId, Value> PartialProduct;

    // Parameterized BlockScan type for reduce-value-by-row scan
    typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;

    // Parameterized BlockExchange type for exchanging rows between warp-striped -> blocked arrangements
    typedef BlockExchange<VertexId, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeRows;

    // Parameterized BlockExchange type for exchanging values between warp-striped -> blocked arrangements
    typedef BlockExchange<Value, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeValues;

    // Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
    typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;

    // Shared memory type for this thread block
    struct TempStorage
    {
        union
        {
            typename BlockExchangeRows::TempStorage     exchange_rows;      // Smem needed for BlockExchangeRows
            typename BlockExchangeValues::TempStorage   exchange_values;    // Smem needed for BlockExchangeValues

            struct
            {
                typename BlockScan::TempStorage             scan;           // Smem needed for BlockScan
                typename BlockDiscontinuity::TempStorage    discontinuity;  // Smem needed for BlockDiscontinuity
            };
        };

        VertexId    first_block_row;    ///< The first row-ID seen by this thread block
        VertexId    last_block_row;     ///< The last row-ID seen by this thread block
        Value       first_product;      ///< The first dot-product written by this thread block
    };

    //---------------------------------------------------------------------
    // Thread fields
    //---------------------------------------------------------------------

    TempStorage                             &temp_storage;
    BlockPrefixCallbackOp<PartialProduct>   prefix_op;

    VertexId        *d_rows;
    VertexId        *d_columns;
    Value           *d_values;
    Value           *d_vector;
    Value           *d_result;
    PartialProduct  *d_block_partials;
    int             block_offset;
    int             block_end;

    //---------------------------------------------------------------------
    // Operations
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    __device__ __forceinline__
    PersistentBlockSpmv(
        TempStorage     &temp_storage,
        VertexId        *d_rows,
        VertexId        *d_columns,
        Value           *d_values,
        Value           *d_vector,
        Value           *d_result,
        PartialProduct  *d_block_partials,
        int             block_offset,
        int             block_end)
    :
        temp_storage(temp_storage),
        d_rows(d_rows),
        d_columns(d_columns),
        d_values(d_values),
        d_vector(d_vector),
        d_result(d_result),
        d_block_partials(d_block_partials),
        block_offset(block_offset),
        block_end(block_end)
    {
        // Initialize scalar shared memory values
        if (threadIdx.x == 0)
        {
            VertexId first_block_row            = d_rows[block_offset];
            VertexId last_block_row             = d_rows[block_end - 1];

            temp_storage.first_block_row        = first_block_row;
            temp_storage.last_block_row         = last_block_row;
            temp_storage.first_product          = Value(0);

            // Initialize prefix_op to identity
            prefix_op.running_prefix.row        = first_block_row;
            prefix_op.running_prefix.partial    = Value(0);
        }

        __syncthreads();
    }

    /**
     * Processes a COO input tile of edges, outputting dot products for each row
     */
    template <bool FULL_TILE>
    __device__ __forceinline__
    void ProcessTile(
        int block_offset,
        int guarded_items = 0)
    {
        VertexId        columns[ITEMS_PER_THREAD];
        VertexId        rows[ITEMS_PER_THREAD];
        Value           values[ITEMS_PER_THREAD];
        PartialProduct  partial_sums[ITEMS_PER_THREAD];
        HeadFlag        head_flags[ITEMS_PER_THREAD];

        // Load a thread block-striped tile of A (sparse row-ids, column-ids, and values)
        if (FULL_TILE)
        {
            // Unguarded loads
            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns);
            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values);
            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows);
        }
        else
        {
            // This is a partial tile (e.g., the last tile of input).  Extend the coordinates of the last
            // vertex for out-of-bound items, but make them zero-valued
            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns, guarded_items, VertexId(0));
            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values, guarded_items, Value(0));
            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows, guarded_items, temp_storage.last_block_row);
        }

        // Load the referenced values from x and compute the dot product partial sums
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
#if CUB_PTX_ARCH >= 350
            values[ITEM] *= ThreadLoad<LOAD_LDG>(d_vector + columns[ITEM]);
#else
            values[ITEM] *= TexVector<Value>::Load(columns[ITEM]);
#endif
        }

        // Transpose from warp-striped to blocked arrangement
        BlockExchangeValues(temp_storage.exchange_values).WarpStripedToBlocked(values);

        __syncthreads();

        // Transpose from warp-striped to blocked arrangement
        BlockExchangeRows(temp_storage.exchange_rows).WarpStripedToBlocked(rows);

        // Barrier for smem reuse and coherence
        __syncthreads();

        // Flag row heads by looking for discontinuities
        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
            head_flags,                     // (Out) Head flags
            rows,                           // Original row ids
            NewRowOp(),                     // Functor for detecting start of new rows
            prefix_op.running_prefix.row);  // Last row ID from previous tile to compare with first row ID in this tile

        // Assemble partial product structures
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            partial_sums[ITEM].partial  = values[ITEM];
            partial_sums[ITEM].row      = rows[ITEM];
        }

        // Reduce value-by-row across partial_sums using exclusive prefix scan
        PartialProduct block_aggregate;
        BlockScan(temp_storage.scan).ExclusiveScan(
            partial_sums,                   // Scan input
            partial_sums,                   // Scan output
            ReduceByKeyOp(),                // Scan operator
            block_aggregate,                // Block-wide total (unused)
            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total

        // Barrier for smem reuse and coherence
        __syncthreads();

        // Scatter an accumulated dot product if it is the head of a valid row
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            if (head_flags[ITEM])
            {
                d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;

                // Save off the first partial product that this thread block will scatter
                if (partial_sums[ITEM].row == temp_storage.first_block_row)
                {
                    temp_storage.first_product = partial_sums[ITEM].partial;
                }
            }
        }
    }

    /**
     * Iterate over input tiles belonging to this thread block
     */
    __device__ __forceinline__
    void ProcessTiles()
    {
        // Process full tiles
        while (block_offset <= block_end - TILE_ITEMS)
        {
            ProcessTile<true>(block_offset);
            block_offset += TILE_ITEMS;
        }

        // Process the last, partially-full tile (if present)
        int guarded_items = block_end - block_offset;
        if (guarded_items)
        {
            ProcessTile<false>(block_offset, guarded_items);
        }

        if (threadIdx.x == 0)
        {
            if (gridDim.x == 1)
            {
                // Scatter the final aggregate (this kernel contains only 1 thread block)
                d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
            }
            else
            {
                // Write the first and last partial products from this thread block so
                // that they can be subsequently "fixed up" in the next kernel.
                PartialProduct first_product;
                first_product.row       = temp_storage.first_block_row;
                first_product.partial   = temp_storage.first_product;

                d_block_partials[blockIdx.x * 2]        = first_product;
                d_block_partials[(blockIdx.x * 2) + 1]  = prefix_op.running_prefix;
            }
        }
    }
};
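
/******************************************************************************
 * Illustrative sketch (not part of the original example): when gridDim.x > 1,
 * each thread block b in CooKernel leaves two entries in d_block_partials:
 *
 *   d_block_partials[2 * b]      = { first_block_row, first_product }
 *                                    // the (possibly incomplete) product this block
 *                                    // scattered for the first row it touched
 *   d_block_partials[2 * b + 1]  = prefix_op.running_prefix
 *                                    // the block's running total for the last row it touched
 *
 * A row that straddles a block boundary is therefore represented in the
 * partials of both blocks, and CooFinalizeKernel re-scans all 2 * gridDim.x
 * entries to write the correct totals for those boundary rows.
 ******************************************************************************/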

/**
 * Threadblock abstraction for "fixing up" an array of interblock SpMV partial products.
 */
template <
    int             BLOCK_THREADS,
    int             ITEMS_PER_THREAD,
    typename        VertexId,
    typename        Value>
struct FinalizeSpmvBlock
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // Constants
    enum
    {
        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    // Head flag type
    typedef int HeadFlag;

    // Partial dot product type
    typedef PartialProduct<VertexId, Value> PartialProduct;

    // Parameterized BlockScan type for reduce-value-by-row scan
    typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;

    // Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
    typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;

    // Shared memory type for this thread block
    struct TempStorage
    {
        typename BlockScan::TempStorage             scan;           // Smem needed for reduce-value-by-row scan
        typename BlockDiscontinuity::TempStorage    discontinuity;  // Smem needed for head-flagging

        VertexId last_block_row;
    };

    //---------------------------------------------------------------------
    // Thread fields
    //---------------------------------------------------------------------

    TempStorage                             &temp_storage;
    BlockPrefixCallbackOp<PartialProduct>   prefix_op;

    Value           *d_result;
    PartialProduct  *d_block_partials;
    int             num_partials;

    //---------------------------------------------------------------------
    // Operations
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    __device__ __forceinline__
    FinalizeSpmvBlock(
        TempStorage     &temp_storage,
        Value           *d_result,
        PartialProduct  *d_block_partials,
        int             num_partials)
    :
        temp_storage(temp_storage),
        d_result(d_result),
        d_block_partials(d_block_partials),
        num_partials(num_partials)
    {
        // Initialize scalar shared memory values
        if (threadIdx.x == 0)
        {
            VertexId first_block_row            = d_block_partials[0].row;
            VertexId last_block_row             = d_block_partials[num_partials - 1].row;
            temp_storage.last_block_row         = last_block_row;

            // Initialize prefix_op to identity
            prefix_op.running_prefix.row        = first_block_row;
            prefix_op.running_prefix.partial    = Value(0);
        }

        __syncthreads();
    }

    /**
     * Processes a COO input tile of edges, outputting dot products for each row
     */
    template <bool FULL_TILE>
    __device__ __forceinline__
    void ProcessTile(
        int block_offset,
        int guarded_items = 0)
    {
        VertexId        rows[ITEMS_PER_THREAD];
        PartialProduct  partial_sums[ITEMS_PER_THREAD];
        HeadFlag        head_flags[ITEMS_PER_THREAD];

        // Load a tile of block partials from previous kernel
        if (FULL_TILE)
        {
            // Full tile
#if CUB_PTX_ARCH >= 350
            LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums);
#else
            LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums);
#endif
        }
        else
        {
            // Partial tile (extend zero-valued coordinates of the last partial-product for out-of-bounds items)
            PartialProduct default_sum;
            default_sum.row     = temp_storage.last_block_row;
            default_sum.partial = Value(0);

#if CUB_PTX_ARCH >= 350
            LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
#else
            LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
#endif
        }

        // Copy out row IDs for row-head flagging
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            rows[ITEM] = partial_sums[ITEM].row;
        }

        // Flag row heads by looking for discontinuities
        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
            head_flags,                     // (Out) Head flags
            rows,                           // Original row ids
            NewRowOp(),                     // Functor for detecting start of new rows
            prefix_op.running_prefix.row);  // Last row ID from previous tile to compare with first row ID in this tile

        // Reduce value-by-row across partial_sums using exclusive prefix scan
        PartialProduct block_aggregate;
        BlockScan(temp_storage.scan).ExclusiveScan(
            partial_sums,                   // Scan input
            partial_sums,                   // Scan output
            ReduceByKeyOp(),                // Scan operator
            block_aggregate,                // Block-wide total (unused)
            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total

        // Scatter an accumulated dot product if it is the head of a valid row
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            if (head_flags[ITEM])
            {
                d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
            }
        }
    }

    /**
     * Iterate over input tiles belonging to this thread block
     */
    __device__ __forceinline__
    void ProcessTiles()
    {
        // Process full tiles
        int block_offset = 0;
        while (block_offset <= num_partials - TILE_ITEMS)
        {
            ProcessTile<true>(block_offset);
            block_offset += TILE_ITEMS;
        }

        // Process final partial tile (if present)
        int guarded_items = num_partials - block_offset;
        if (guarded_items)
        {
            ProcessTile<false>(block_offset, guarded_items);
        }

        // Scatter the final aggregate (this kernel contains only 1 thread block)
        if (threadIdx.x == 0)
        {
            d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
        }
    }
};


/******************************************************************************
 * Kernel entrypoints
 ******************************************************************************/

/**
 * SpMV kernel whose thread blocks each process a contiguous segment of sparse COO tiles.
 */
template <
    int             BLOCK_THREADS,
    int             ITEMS_PER_THREAD,
    typename        VertexId,
    typename        Value>
__launch_bounds__ (BLOCK_THREADS)
__global__ void CooKernel(
    GridEvenShare<int>              even_share,
    PartialProduct<VertexId, Value> *d_block_partials,
    VertexId                        *d_rows,
    VertexId                        *d_columns,
    Value                           *d_values,
    Value                           *d_vector,
    Value                           *d_result)
{
    // Specialize SpMV thread block abstraction type
    typedef PersistentBlockSpmv<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> PersistentBlockSpmv;

    // Shared memory allocation
    __shared__ typename PersistentBlockSpmv::TempStorage temp_storage;

    // Initialize thread block even-share to tell us where to start and stop our tile-processing
    even_share.BlockInit();

    // Construct persistent thread block
    PersistentBlockSpmv persistent_block(
        temp_storage,
        d_rows,
        d_columns,
        d_values,
        d_vector,
        d_result,
        d_block_partials,
        even_share.block_offset,
        even_share.block_end);

    // Process input tiles
    persistent_block.ProcessTiles();
}


/**
 * Kernel for "fixing up" an array of interblock SpMV partial products.
 */
template <
    int             BLOCK_THREADS,
    int             ITEMS_PER_THREAD,
    typename        VertexId,
    typename        Value>
__launch_bounds__ (BLOCK_THREADS, 1)
__global__ void CooFinalizeKernel(
    PartialProduct<VertexId, Value> *d_block_partials,
    int                             num_partials,
    Value                           *d_result)
{
    // Specialize "fix-up" thread block abstraction type
    typedef FinalizeSpmvBlock<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> FinalizeSpmvBlock;

    // Shared memory allocation
    __shared__ typename FinalizeSpmvBlock::TempStorage temp_storage;

    // Construct persistent thread block
    FinalizeSpmvBlock persistent_block(temp_storage, d_result, d_block_partials, num_partials);

    // Process input tiles
    persistent_block.ProcessTiles();
}
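
/******************************************************************************
 * Illustrative sketch (not part of the original example): a fix-up trace for
 * the tiny matrix sketched near the top of this file (rows = [0, 0, 1, 2, 2],
 * per-nonzero partials = [4, 3, 6, 2, 15]), assuming two thread blocks split
 * the nonzeros as 0..3 and 4:
 *
 *   Block 0 scatters y[0] = 7 and y[1] = 6 but only accumulates 2 of row 2;
 *   block 1 holds the remaining 15 of row 2; neither writes y[2] on its own.
 *
 *   d_block_partials = [ {row 0, 7}, {row 2, 2},     // block 0: first, last
 *                        {row 2, 0}, {row 2, 15} ]   // block 1: first, last
 *
 *   CooFinalizeKernel re-scans these four partials with the same
 *   reduce-value-by-row scan and writes y[2] = 2 + 15 = 17.
 ******************************************************************************/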

//---------------------------------------------------------------------
// Host subroutines
//---------------------------------------------------------------------

/**
 * Simple test of device
 */
template <
    int                         COO_BLOCK_THREADS,
    int                         COO_ITEMS_PER_THREAD,
    int                         COO_SUBSCRIPTION_FACTOR,
    int                         FINALIZE_BLOCK_THREADS,
    int                         FINALIZE_ITEMS_PER_THREAD,
    typename                    VertexId,
    typename                    Value>
void TestDevice(
    CooGraph<VertexId, Value>&  coo_graph,
    Value*                      h_vector,
    Value*                      h_reference)
{
    typedef PartialProduct<VertexId, Value> PartialProduct;

    const int COO_TILE_SIZE = COO_BLOCK_THREADS * COO_ITEMS_PER_THREAD;

    // SOA device storage
    VertexId        *d_rows;            // SOA graph row coordinates
    VertexId        *d_columns;         // SOA graph col coordinates
    Value           *d_values;          // SOA graph values
    Value           *d_vector;          // Vector multiplicand
    Value           *d_result;          // Output row
    PartialProduct  *d_block_partials;  // Temporary storage for communicating dot product partials between thread blocks

    // Create SOA version of coo_graph on host
    int         num_edges   = coo_graph.coo_tuples.size();
    VertexId    *h_rows     = new VertexId[num_edges];
    VertexId    *h_columns  = new VertexId[num_edges];
    Value       *h_values   = new Value[num_edges];

    for (int i = 0; i < num_edges; i++)
    {
        h_rows[i]       = coo_graph.coo_tuples[i].row;
        h_columns[i]    = coo_graph.coo_tuples[i].col;
        h_values[i]     = coo_graph.coo_tuples[i].val;
    }

    // Get CUDA properties
    Device device_props;
    CubDebugExit(device_props.Init());

    // Determine launch configuration from kernel properties
    int coo_sm_occupancy;
    CubDebugExit(device_props.MaxSmOccupancy(
        coo_sm_occupancy,
        CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, VertexId, Value>,
        COO_BLOCK_THREADS));
    int max_coo_grid_size = device_props.sm_count * coo_sm_occupancy * COO_SUBSCRIPTION_FACTOR;

    // Construct an even-share work distribution
    GridEvenShare<int> even_share(num_edges, max_coo_grid_size, COO_TILE_SIZE);
    int coo_grid_size  = even_share.grid_size;
    int num_partials   = coo_grid_size * 2;

    // Allocate COO device arrays
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_rows,            sizeof(VertexId) * num_edges));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_columns,         sizeof(VertexId) * num_edges));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values,          sizeof(Value) * num_edges));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_vector,          sizeof(Value) * coo_graph.col_dim));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_result,          sizeof(Value) * coo_graph.row_dim));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_block_partials,  sizeof(PartialProduct) * num_partials));

    // Copy host arrays to device
    CubDebugExit(cudaMemcpy(d_rows,     h_rows,     sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(d_columns,  h_columns,  sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(d_values,   h_values,   sizeof(Value) * num_edges, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(d_vector,   h_vector,   sizeof(Value) * coo_graph.col_dim, cudaMemcpyHostToDevice));

    // Bind textures
    TexVector<Value>::BindTexture(d_vector, coo_graph.col_dim);

    // Print debug info
    printf("CooKernel<%d, %d><<<%d, %d>>>(...), Max SM occupancy: %d\n",
        COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_sm_occupancy);
    if (coo_grid_size > 1)
    {
        printf("CooFinalizeKernel<<<1, %d>>>(...)\n", FINALIZE_BLOCK_THREADS);
    }
    fflush(stdout);

    CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));

    // Run kernel (always run one iteration without timing)
    GpuTimer gpu_timer;
    float elapsed_millis = 0.0;
    for (int i = 0; i <= g_timing_iterations; i++)
    {
        gpu_timer.Start();

        // Initialize output
        CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));

        // Run the COO kernel
        CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
            even_share,
            d_block_partials,
            d_rows,
            d_columns,
            d_values,
            d_vector,
            d_result);

        if (coo_grid_size > 1)
        {
            // Run the COO finalize kernel
            CooFinalizeKernel<FINALIZE_BLOCK_THREADS, FINALIZE_ITEMS_PER_THREAD><<<1, FINALIZE_BLOCK_THREADS>>>(
                d_block_partials,
                num_partials,
                d_result);
        }

        gpu_timer.Stop();
        if (i > 0) elapsed_millis += gpu_timer.ElapsedMillis();
    }

    // Force any kernel stdio to screen
    CubDebugExit(cudaThreadSynchronize());
    fflush(stdout);

    // Display timing
    if (g_timing_iterations > 0)
    {
        float avg_elapsed = elapsed_millis / g_timing_iterations;
        int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim);
        printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n",
            g_timing_iterations,
            avg_elapsed,
            total_bytes / avg_elapsed / 1000.0 / 1000.0,
            num_edges * 2 / avg_elapsed / 1000.0 / 1000.0);
    }

    // Check results
    int compare = CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, true, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    // Cleanup
    TexVector<Value>::UnbindTexture();
    CubDebugExit(g_allocator.DeviceFree(d_block_partials));
    CubDebugExit(g_allocator.DeviceFree(d_rows));
    CubDebugExit(g_allocator.DeviceFree(d_columns));
    CubDebugExit(g_allocator.DeviceFree(d_values));
    CubDebugExit(g_allocator.DeviceFree(d_vector));
    CubDebugExit(g_allocator.DeviceFree(d_result));

    delete[] h_rows;
    delete[] h_columns;
    delete[] h_values;
}


/**
 * Compute reference answer on CPU
 */
template <typename VertexId, typename Value>
void ComputeReference(
    CooGraph<VertexId, Value>&  coo_graph,
    Value*                      h_vector,
    Value*                      h_reference)
{
    for (VertexId i = 0; i < coo_graph.row_dim; i++)
    {
        h_reference[i] = 0.0;
    }
    for (VertexId i = 0; i < coo_graph.coo_tuples.size(); i++)
    {
        h_reference[coo_graph.coo_tuples[i].row] +=
            coo_graph.coo_tuples[i].val * h_vector[coo_graph.coo_tuples[i].col];
    }
}


/**
 * Assign arbitrary values to vector items
 */
template <typename Value>
void AssignVectorValues(Value *vector, int col_dim)
{
    for (int i = 0; i < col_dim; i++)
    {
        vector[i] = 1.0;
    }
}


/**
 * Main
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("i", g_timing_iterations);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s\n [--device=<device-id>] [--v] [--iterations=<test iterations>] [--grid-size=<grid-size>]\n"
            "\t--type=wheel --spokes=<spokes>\n"
            "\t--type=grid2d --width=<width> [--no-self-loops]\n"
            "\t--type=grid3d --width=<width> [--no-self-loops]\n"
            "\t--type=market --file=<file>\n"
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Get graph type
    string type;
    args.GetCmdLineArgument("type", type);

    // Generate graph structure
    CpuTimer timer;
    timer.Start();
    CooGraph<VertexId, Value> coo_graph;
    if (type == string("grid2d"))
    {
        VertexId width;
        args.GetCmdLineArgument("width", width);
        bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
        printf("Generating %s grid2d width(%d)... ", (self_loops) ? "5-pt" : "4-pt", width); fflush(stdout);
        if (coo_graph.InitGrid2d(width, self_loops)) exit(1);
    }
    else if (type == string("grid3d"))
    {
        VertexId width;
        args.GetCmdLineArgument("width", width);
        bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
        printf("Generating %s grid3d width(%d)... ", (self_loops) ? "7-pt" : "6-pt", width); fflush(stdout);
        if (coo_graph.InitGrid3d(width, self_loops)) exit(1);
    }
    else if (type == string("wheel"))
    {
        VertexId spokes;
        args.GetCmdLineArgument("spokes", spokes);
        printf("Generating wheel spokes(%d)... ", spokes); fflush(stdout);
        if (coo_graph.InitWheel(spokes)) exit(1);
    }
    else if (type == string("market"))
    {
        string filename;
        args.GetCmdLineArgument("file", filename);
        printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout);
        if (coo_graph.InitMarket(filename)) exit(1);
    }
    else
    {
        printf("Unsupported graph type\n");
        exit(1);
    }
    timer.Stop();
    printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n",
        timer.ElapsedMillis() / 1000.0,
        coo_graph.coo_tuples.size(),
        coo_graph.row_dim,
        coo_graph.col_dim);
    fflush(stdout);

    if (g_verbose)
    {
        cout << coo_graph << "\n";
    }

    // Create vector
    Value *h_vector = new Value[coo_graph.col_dim];
    AssignVectorValues(h_vector, coo_graph.col_dim);
    if (g_verbose)
    {
        printf("Vector[%d]: ", coo_graph.col_dim);
        DisplayResults(h_vector, coo_graph.col_dim);
        printf("\n\n");
    }

    // Compute reference answer
    Value *h_reference = new Value[coo_graph.row_dim];
    ComputeReference(coo_graph, h_vector, h_reference);
    if (g_verbose)
    {
        printf("Results[%d]: ", coo_graph.row_dim);
        DisplayResults(h_reference, coo_graph.row_dim);
        printf("\n\n");
    }

    // Parameterization for SM35
    enum
    {
        COO_BLOCK_THREADS           = 64,
        COO_ITEMS_PER_THREAD        = 10,
        COO_SUBSCRIPTION_FACTOR     = 4,
        FINALIZE_BLOCK_THREADS      = 256,
        FINALIZE_ITEMS_PER_THREAD   = 4,
    };

    // Run GPU version
    TestDevice<
        COO_BLOCK_THREADS,
        COO_ITEMS_PER_THREAD,
        COO_SUBSCRIPTION_FACTOR,
        FINALIZE_BLOCK_THREADS,
        FINALIZE_ITEMS_PER_THREAD>(coo_graph, h_vector, h_reference);

    // Cleanup
    delete[] h_vector;
    delete[] h_reference;

    return 0;
}
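
/******************************************************************************
 * Example invocations (a sketch based on the usage string above; the
 * executable name and the matrix-market input are placeholders):
 *
 *   example_coo_spmv --type=grid2d --width=1000
 *   example_coo_spmv --type=grid3d --width=100 --no-self-loops
 *   example_coo_spmv --type=wheel --spokes=10000
 *   example_coo_spmv --type=market --file=<file> --i=10 --v
 ******************************************************************************/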