// cudadecoder/cuda-decoder-common.h
//
// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
// Hugo Braun, Justin Luitjens, Ryan Leary
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_CUDA_DECODER_CUDA_DECODER_UTILS_H_
#define KALDI_CUDA_DECODER_CUDA_DECODER_UTILS_H_
#include "cudamatrix/cu-device.h"
#include "util/stl-utils.h"

// A decoder channel is linked to one utterance. Frames
// from the same utterance must be sent to the same channel.
//
// A decoder lane is where the computation actually happens:
// a decoder lane is given a frame and its associated channel,
// and it performs the actual computation.
//
// An analogy would be lane -> a core, channel -> a software thread

// Some config parameters can be computed from other parameters
// (e.g. we can set main_q_capacity using max-active).
// The following factors relate the parameters we know
// to the parameters we want to set
#define KALDI_CUDA_DECODER_MAX_ACTIVE_MAIN_Q_CAPACITY_FACTOR 4
#define KALDI_CUDA_DECODER_AUX_Q_MAIN_Q_CAPACITIES_FACTOR 3
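// Example (illustrative only; the actual derivation is done in the decoder
// config/initialization code): with max-active = 10000, the capacities would
// typically be
//   main_q_capacity = max_active
//                     * KALDI_CUDA_DECODER_MAX_ACTIVE_MAIN_Q_CAPACITY_FACTOR
//                   = 10000 * 4 = 40000
//   aux_q_capacity  = main_q_capacity
//                     * KALDI_CUDA_DECODER_AUX_Q_MAIN_Q_CAPACITIES_FACTOR
//                   = 40000 * 3 = 120000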

// If we're at risk of filling the token queue,
// the beam is reduced to keep only the best candidates in the
// remaining space.
// We then progressively restore the beam to its default value:
// beam_next_frame = min(default_beam, RECOVER_RATE * beam_previous_frame)
#define KALDI_CUDA_DECODER_ADAPTIVE_BEAM_RECOVER_RATE 1.2f
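// Worked example (assuming default_beam = 15.0f): if the beam was lowered to
// 10.0f on the previous frame, the following frames use
//   min(15.0f, 1.2f * 10.0f) = 12.0f
//   min(15.0f, 1.2f * 12.0f) = 14.4f
//   min(15.0f, 1.2f * 14.4f) = 15.0f  (back to the default beam)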

// Defines for the CUDA decoder kernels.
// It shouldn't be necessary to change the DIMX of the kernels

// Below that number of arcs, we launch the single-CTA persistent kernel
// for the non-emitting iterations
#define KALDI_CUDA_DECODER_NONEM_LT_MAX_NARCS 4096

// We know we will have at least X elements in the hashmap.
// We allocate space for X*KALDI_CUDA_DECODER_HASHMAP_CAPACITY_FACTOR elements
// to avoid having too many collisions
#define KALDI_CUDA_DECODER_HASHMAP_CAPACITY_FACTOR 1

// Maximum total size of the kernel arguments:
// 4 KB for compute capability >= 2.0
#define KALDI_CUDA_DECODER_MAX_KERNEL_ARGUMENTS_BYTE_SIZE (4096)

// When applying max-active, we need to compute a top-k.
// To perform that (soft) top-k, we compute a histogram.
// Here we define the number of bins in that histogram.
// It has to be less than the number of 1D threads
#define KALDI_CUDA_DECODER_HISTO_NBINS 255
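// Illustrative sketch of how the histogram is used (simplified and assumed;
// the actual code is in cuda-decoder-kernels.cu, and min_histo_cost /
// histo_bin_width live in LaneCounters):
//   histo_bin_width = (max_histo_cost - min_histo_cost)
//                     / KALDI_CUDA_DECODER_HISTO_NBINS;
//   bin = min((int32)((cost - min_histo_cost) / histo_bin_width),
//             KALDI_CUDA_DECODER_HISTO_NBINS - 1);
// With 255 bins, a single block of KALDI_CUDA_DECODER_1D_BLOCK (256) threads
// can own one bin per thread, hence the constraint above.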

// Number of "heavy duty" process non emitting kernels
// If more non emitting iterations are required, those will be done
// in the one-CTA persistent kernel
#define KALDI_CUDA_DECODER_N_NON_EMITTING_MAIN_ITERATIONS 2

// Adaptive beam parameters
// We will decrease the beam when we detect that we are generating too many
// tokens.
// For the first segment of the aux_q, we don't do anything (we keep the
// default beam). That static segment contains
// x = (aux_q capacity)/KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT
// elements.
// Past that point we decrease the beam step by step, down to 0,
// lowering it by one step every m elements, with:
// y = (aux_q capacity) - x
// m = y / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS
// For more information, please refer to the definition of GetAdaptiveBeam in
// cuda-decoder-kernels.cu
#define KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT 4
#define KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS 8
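// Worked example (hypothetical aux_q capacity of 40000):
//   x = 40000 / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT = 10000
//   y = 40000 - x = 30000
//   m = y / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS = 3750
// The first 10000 tokens generated in the aux_q keep the default beam; after
// that, the beam is lowered by one step for every 3750 additional tokens.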
// When applying max_active we don't keep exactly max_active_ tokens,
// but a bit more, and we may call ApplyMaxActiveAndReduceBeam multiple times
// per frame (the first calls act as a pre-filter, the last one runs at the
// very end of the frame).
// Because keeping a bit more than max_active_ is expected, we use a tolerance
// so that we avoid triggering ApplyMaxActiveAndReduceBeam at the end of the
// frame for just a few tokens above the limit

#define KALDI_CUDA_DECODER_DIV_ROUND_UP(a, b) ((a + b - 1) / b)

#define KALDI_CUDA_DECODER_ASSERT(val, recoverable)                     \
  {                                                                     \
    if ((val) != true) {                                                \
      throw CudaDecoderException("KALDI_CUDA_DECODER_ASSERT", __FILE__, \
                                 __LINE__, recoverable);                \
    }                                                                   \
  }
// Macro for checking CUDA errors following a kernel launch or API call
#ifdef NDEBUG
#define KALDI_DECODER_CUDA_CHECK_ERROR()
#else
#define KALDI_DECODER_CUDA_CHECK_ERROR()                                  \
  {                                                                       \
    cudaError_t e = cudaGetLastError();                                   \
    if (e != cudaSuccess) {                                               \
      throw CudaDecoderException(cudaGetErrorName(e), __FILE__, __LINE__, \
                                 false);                                  \
    }                                                                     \
  }
#endif

#define KALDI_DECODER_CUDA_API_CHECK_ERROR(e)                             \
  {                                                                       \
    if (e != cudaSuccess) {                                               \
      throw CudaDecoderException(cudaGetErrorName(e), __FILE__, __LINE__, \
                                 false);                                  \
    }                                                                     \
  }
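// Example usage of the two macros above (illustrative only):
//   my_kernel<<<grid, block, 0, stream>>>(args);
//   KALDI_DECODER_CUDA_CHECK_ERROR();  // catches launch errors (debug builds)
//   KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMemcpyAsync(
//       h_dst, d_src, nbytes, cudaMemcpyDeviceToHost, stream));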

#define KALDI_CUDA_DECODER_1D_KERNEL_LOOP(i, n)                \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

#define KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(offset, th_idx, n) \
  for (int offset = blockIdx.x * blockDim.x, th_idx = threadIdx.x;        \
       offset < (n); offset += blockDim.x * gridDim.x)

#define KALDI_CUDA_DECODER_IS_LAST_1D_THREAD() (threadIdx.x == (blockDim.x - 1))

#define KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.y; i < (n); i += gridDim.y)

#define KALDI_CUDA_DECODER_1D_BLOCK 256
#define KALDI_CUDA_DECODER_LARGEST_1D_BLOCK 1024
#define KALDI_CUDA_DECODER_ONE_THREAD_BLOCK 1
#define KALDI_CUDA_DECODER_MAX_CTA_COUNT 4096u
#define KALDI_CUDA_DECODER_MAX_CTA_PER_LANE 512u
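// Illustrative sketch of how the macros above are typically combined in a
// kernel (simplified; the real kernels live in cuda-decoder-kernels.cu):
//   __global__ void zero_lanes_kernel(int32 *data, int32 ncols, int32 nlanes) {
//     KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {  // lane = blockIdx.y
//       KALDI_CUDA_DECODER_1D_KERNEL_LOOP(i, ncols) {        // grid-stride loop
//         data[ilane * ncols + i] = 0;
//       }
//     }
//   }
// launched with KaldiCudaDecoderNumBlocks(ncols, nlanes) blocks of
// KALDI_CUDA_DECODER_1D_BLOCK threads (see below).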
namespace kaldi {
namespace cuda_decoder {

// Returns the number of CTAs to launch for (N, M) elements to compute.
// M is usually the batch size (number of lanes)
inline dim3 KaldiCudaDecoderNumBlocks(int N, int M) {
  dim3 grid;
  grid.x = KALDI_CUDA_DECODER_DIV_ROUND_UP(N, KALDI_CUDA_DECODER_1D_BLOCK);
  unsigned int max_CTA_per_lane =
      std::max(KALDI_CUDA_DECODER_MAX_CTA_COUNT / M, 1u);
  grid.x = std::min(grid.x, max_CTA_per_lane);
  grid.y = M;
  return grid;
}
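// Example: KaldiCudaDecoderNumBlocks(100000, 8) gives
//   grid.x = min(ceil(100000/256), 4096/8) = min(391, 512) = 391
//   grid.y = 8
// i.e. 391 CTAs of KALDI_CUDA_DECODER_1D_BLOCK threads per lane, for 8 lanes.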

// Use a fixed number of blocks for nlanes:
// the maximum number of CTAs possible for each lane,
// according to KALDI_CUDA_DECODER_MAX_CTA_COUNT
// and KALDI_CUDA_DECODER_MAX_CTA_PER_LANE
inline dim3 KaldiCudaDecoderNumBlocks(int nlanes) {
  dim3 grid;
  unsigned int n_CTA_per_lane =
      std::max(KALDI_CUDA_DECODER_MAX_CTA_COUNT / nlanes, 1u);
  grid.x = std::min(KALDI_CUDA_DECODER_MAX_CTA_PER_LANE, n_CTA_per_lane);
  grid.y = nlanes;
  return grid;
}

typedef int32 StateId;
typedef float CostType;
// IntegerCostType is the type used in the lookup table d_state_best_cost
// and in d_cutoff.
// We use a 1:1 conversion between CostType <--> IntegerCostType
// (cf. floatToOrderedIntHost below). IntegerCostType is used because it
// allows native atomic operations (CostType does not)
typedef int32 IntegerCostType;
typedef int32 LaneId;
typedef int32 ChannelId;

// On the device we compute everything by batch.
// Data is stored as 2D matrices (BatchSize, 1D_Size).
// For example, for the token queue, (BatchSize, max_tokens_per_frame_).
// DeviceMatrix owns the data but is not used to access it.
// DeviceMatrix is inherited by DeviceLaneMatrix and DeviceChannelMatrix.
// Those two classes do the same thing, except that they belong either to a
// channel or to a lane.
// That inheritance exists to clarify the code and help debugging.
//
// To actually access the data, we request a view through GetView().
// That view can be used in both host and CUDA code to access the data.
// It does not own the data.
template <typename T>
// if necessary, make a version that always uses ncols_ as the next power of 2
class DeviceMatrix {
  T *data_;
  void Allocate() {
    KALDI_ASSERT(nrows_ > 0);
    KALDI_ASSERT(ncols_ > 0);
    KALDI_ASSERT(!data_);
    data_ = static_cast<T *>(CuDevice::Instantiate().Malloc(
        (size_t)nrows_ * ncols_ * sizeof(*data_)));
    KALDI_ASSERT(data_);
  }
  void Free() {
    KALDI_ASSERT(data_);
    CuDevice::Instantiate().Free(data_);
  }

 protected:
  int32 ncols_;
  int32 nrows_;

 public:
  DeviceMatrix() : data_(NULL), ncols_(0), nrows_(0) {}

  virtual ~DeviceMatrix() {
    if (data_) Free();
  }

  void Resize(int32 nrows, int32 ncols) {
    if (data_) Free();
    KALDI_ASSERT(nrows > 0);
    KALDI_ASSERT(ncols > 0);
    nrows_ = nrows;
    ncols_ = ncols;
    Allocate();
  }

  T *MutableData() {
    KALDI_ASSERT(data_);
    return data_;
  }
  // abstract getInterface...
};

template <typename T>
// if necessary, make a version that always uses ncols_ as the next power of 2
class HostMatrix {
  T *data_;
  void Allocate() {
    KALDI_ASSERT(nrows_ > 0);
    KALDI_ASSERT(ncols_ > 0);
    KALDI_ASSERT(!data_);
    cudaMallocHost((void **)&data_, (size_t)nrows_ * ncols_ * sizeof(*data_));
    KALDI_ASSERT(data_);
  }
  void Free() {
    KALDI_ASSERT(data_);
    cudaFreeHost(data_);
  }

 protected:
  int32 ncols_;
  int32 nrows_;

 public:
  HostMatrix() : data_(NULL), ncols_(0), nrows_(0) {}

  virtual ~HostMatrix() {
    if (data_) Free();
  }

  void Resize(int32 nrows, int32 ncols) {
    if (data_) Free();
    KALDI_ASSERT(nrows > 0);
    KALDI_ASSERT(ncols > 0);
    nrows_ = nrows;
    ncols_ = ncols;
    Allocate();
  }

  T *MutableData() {
    KALDI_ASSERT(data_);
    return data_;
  }
  // abstract getInterface...
};

// Views of DeviceMatrix.
// Those views are created by either DeviceChannelMatrix or
// DeviceLaneMatrix.
// We can access the data (the matrix) associated with that
// Device[Channel|Lane]Matrix without owning that data,
// which means that we can pass those views by copy
// without triggering a cudaFree, for instance.
// Device[Channel|Lane]Matrix owns the data; [Channel|Lane]MatrixView just
// gives access to it, from both host and device code.
template <typename T>
struct LaneMatrixView {
  T *data_;
  int32 ncols_;
  __host__ __device__ __inline__ T *lane(const int32 ilane) {
    return &data_[ilane * ncols_];
  }
};

template <typename T>
struct ChannelMatrixView {
  T *data_;
  int32 ncols_;
  __host__ __device__ __inline__ T *channel(const int32 ichannel) {
    return &data_[ichannel * ncols_];
  }
};

// Specializing DeviceMatrix into lane and channel variants.
// Helps with code clarity/debugging
template <typename T>
class DeviceLaneMatrix : public DeviceMatrix<T> {
 public:
  LaneMatrixView<T> GetView() { return {this->MutableData(), this->ncols_}; }

  T *lane(const int32 ilane) {
    return &this->MutableData()[ilane * this->ncols_];
  }
};

template <typename T>
class HostLaneMatrix : public HostMatrix<T> {
 public:
  LaneMatrixView<T> GetView() { return {this->MutableData(), this->ncols_}; }

  T *lane(const int32 ilane) {
    return &this->MutableData()[ilane * this->ncols_];
  }
};

template <typename T>
class DeviceChannelMatrix : public DeviceMatrix<T> {
 public:
  ChannelMatrixView<T> GetView() { return {this->MutableData(), this->ncols_}; }
  T *channel(const int32 ichannel) {
    return &this->MutableData()[ichannel * this->ncols_];
  }
};
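// Illustrative usage sketch (hypothetical names, not actual decoder code):
// the Device*Matrix object owns the allocation; the view is a small
// non-owning struct passed by value to kernels.
//   DeviceLaneMatrix<int32> d_example_q;
//   d_example_q.Resize(nlanes, capacity);               // (BatchSize, 1D_Size)
//   LaneMatrixView<int32> view = d_example_q.GetView();
//   example_kernel<<<KaldiCudaDecoderNumBlocks(capacity, nlanes),
//                    KALDI_CUDA_DECODER_1D_BLOCK>>>(view, capacity);
//   // inside example_kernel: int32 *q = view.lane(ilane);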

// LaneCounters/ChannelCounters
// The counters are all the scalar values associated with a lane/channel,
// for instance the main queue size, or the min_cost of all tokens in that
// queue.
// LaneCounters are used during computation
struct LaneCounters {
  // Channel that this lane will compute for the current frame
  ChannelId channel_to_compute;
  // Pointer to the loglikelihoods array for this channel and current frame
  BaseFloat *loglikelihoods;
  // Contains both main_q_end and narcs.
  // End index of the main queue:
  // only tokens at index i with i < main_q_end are valid tokens.
  // Each valid token in the subqueue main_q[main_q_local_offset, main_q_end[
  // has a number of outgoing arcs (out-degree);
  // main_q_narcs is the sum of those numbers.
  // We sometimes need to update both end and narcs at the same time using a
  // single atomic, which is why they're packed together
  int2 main_q_narcs_and_end;
  // Contains the requested queue length, which can
  // be larger than the actual queue length in the case of overflow
  int32 main_q_requested;
  int32 aux_q_requested;
  int32 aux_q_end;
  int32 post_expand_aux_q_end;  // used for double buffering
  // Some tokens in the same frame share the same token.next_state
  // main_q_n_extra_prev_tokens is the count of those tokens
  int32 main_q_n_extra_prev_tokens;
  // Number of tokens created during the emitting stage
  int32 main_q_n_emitting_tokens;
  // Depending on the value of the parameter "max_tokens_per_frame",
  // we can end up with an overflow when generating the tokens for a frame.
  // We try to prevent this from happening using an adaptive beam.
  // If an overflow happens, the kernels no longer insert any data into
  // the queues and set the overflow flag to true.
  // Even if that flag is set, we can continue the execution (the quality
  // of the output may be lowered).
  // We use that flag to display a warning to the user
  int32 q_overflow;
  // ExpandArcs reads the tokens in the index range [main_q_local_offset, end[
  int32 main_q_local_offset;
  // We transfer the tokens back to the host at the end of each frame,
  // which means that tokens at a frame n > 0 have an offset compared to those
  // in frame n-1.
  // main_q_global_offset is the overall offset of the current main_q
  // since frame 0.
  // It is used to set the prev_token index.
  int32 main_q_global_offset;
  // Same thing, but for main_q_n_extra_prev_tokens (those are also
  // transferred back to the host)
  int32 main_q_extra_prev_tokens_global_offset;
  // Minimum token cost for that frame
  IntegerCostType min_int_cost;
  // Current beam. Can be different from default_beam,
  // because of the AdaptiveBeam process, or because of
  // ApplyMaxActiveAndReduceBeam
  IntegerCostType int_beam;
  // Adaptive beam. The validity index indicates up to which aux_q index this
  // adaptive beam is valid; after that index, we need to lower the adaptive
  // beam further
  int2 adaptive_int_beam_with_validity_index;
  // min_cost + beam
  IntegerCostType int_cutoff;
  // The histogram for max_active will be computed between min_histo_cost
  // and max_histo_cost. Set for each frame after the emitting stage
  CostType min_histo_cost;
  CostType max_histo_cost;
  CostType histo_bin_width;
  bool compute_max_active;
  // Offsets used by concatenate_lanes_data_kernel
  int32 main_q_end_lane_offset;
  int32 main_q_n_emitting_tokens_lane_offset;
  int32 main_q_n_extra_prev_tokens_lane_offset;

  // --- Only valid after calling GetBestCost
  // Minimum cost and its arg (token index). Can be different from min_int_cost
  // above, because we may include final costs
  int2 min_int_cost_and_arg;
  // Number of final tokens with cost < best + lattice_beam
  int32 n_within_lattice_beam;
  int32 has_reached_final;  // if there's at least one final token in the queue
  int32 prev_arg_min_int_cost;
};

// Channel counters
// Their job is to save the state of a channel while that channel is idle.
// The channel counters are loaded into the lane counters during context
// switches
struct ChannelCounters {
  // All the following values are just saved values from LaneCounters
  // from the latest context-switch
  int2 prev_main_q_narcs_and_end;
  int32 prev_main_q_n_extra_prev_tokens;
  int32 prev_main_q_global_offset;
  int32 prev_main_q_extra_prev_tokens_global_offset;
  CostType prev_beam;

  // Only valid after calling GetBestCost.
  // Different from min_int_cost: here we include the "final" cost
  int2 min_int_cost_and_arg_with_final;
  int2 min_int_cost_and_arg_without_final;
};

class CudaDecoderException : public std::exception {
 public:
  CudaDecoderException(const char *str_, const char *file_, int line_,
                       const bool recoverable_)
      : str(str_),
        file(file_),
        line(line_),
        buffer(std::string(file) + ":" + std::to_string(line) + " :" +
               std::string(str)),
        recoverable(recoverable_) {}
  const char *what() const throw() { return buffer.c_str(); }

  const char *str;
  const char *file;
  const int line;
  const std::string buffer;
  const bool recoverable;
};

// InfoToken contains data that needs to be saved for the backtrack
// in GetBestPath/GetRawLattice
// We don't need the token.cost or token.next_state.
struct __align__(8) InfoToken {
  int32 prev_token;
  int32 arc_idx;
  bool IsUniqueTokenForStateAndFrame() {
    // This is a trick used to save space and PCI-E bandwidth (cf.
    // preprocess_in_place kernel).
    // This token is associated with a next_state s, created during the
    // processing of frame f.
    // If we have multiple tokens associated with the state s in frame f,
    // then arc_idx < 0 and -arc_idx is the count of such tokens. We will
    // then have to look at another list to read the actual arc_idx and
    // prev_token values.
    // If the current token is the only one, prev_token and arc_idx are valid
    // and can be used directly
    return (arc_idx >= 0);
  }

  // Called if this token is linked to other tokens in the same frame (cf.
  // comments for IsUniqueTokenForStateAndFrame).
  // Returns the {offset, size} pair necessary to list those tokens in the
  // extra_prev_tokens list:
  // they are stored starting at index "offset", and there are "size" of them
  std::pair<int32, int32> GetSameFSTStateTokensList() {
    KALDI_ASSERT(!IsUniqueTokenForStateAndFrame());

    return {prev_token, -arc_idx};
  }
};
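// Illustrative backtrack sketch (simplified host-side code, not the actual
// GetBestPath/GetRawLattice implementation; h_all_tokens and
// h_extra_prev_tokens are hypothetical names for the host-side copies):
//   InfoToken tok = h_all_tokens[token_idx];
//   if (tok.IsUniqueTokenForStateAndFrame()) {
//     // prev_token and arc_idx can be used directly
//     int32 prev = tok.prev_token, arc = tok.arc_idx;
//   } else {
//     // Several tokens share this FST state for this frame:
//     // read them from the extra_prev_tokens list
//     std::pair<int32, int32> off_size = tok.GetSameFSTStateTokensList();
//     for (int32 i = 0; i < off_size.second; ++i) {
//       InfoToken extra = h_extra_prev_tokens[off_size.first + i];
//       // extra.prev_token and extra.arc_idx are valid here
//     }
//   }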

// Device function used to set, in an InfoToken, the {offset, size} pair
// related to InfoToken::GetSameFSTStateTokensList
__device__ __inline__ void SetSameFSTStateTokensList(int32 offset, int32 size,
                                                     InfoToken *info_token) {
  // We always have size > 0
  *info_token = {offset, -size};
}

// Used to store the index in the GPU hashmap of an FST state.
// The hashmap is only generated with the final main queue (post max_active_)
// of each frame.
// Also stores whether or not the owner of that object is the
// representative of this FST state.
typedef int32 FSTStateHashIndex;

// 1:1 conversion float <---> sortable int.
// We convert floats to sortable ints in order
// to use native atomic operations.
// These are the host versions, used when we transfer an int from the device
// and we want to convert it back to a float
// (it was created on the device by floatToOrderedInt; we use
// orderedIntToFloatHost on the host to convert it back to a float)
__inline__ int32 floatToOrderedIntHost(float floatVal) {
  int32 intVal;
  // Should be optimized away by compiler
  memcpy(&intVal, &floatVal, sizeof(float));
  return (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF;
}

__inline__ float orderedIntToFloatHost(int32 intVal) {
  intVal = (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF;
  float floatVal;
  // Should be optimized away by compiler
  memcpy(&floatVal, &intVal, sizeof(float));
  return floatVal;
}
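// Example of the ordering property (host side):
//   floatToOrderedIntHost(-1.0f) < floatToOrderedIntHost(0.5f)
//                                < floatToOrderedIntHost(2.0f)
// so an integer atomicMin performed on the device directly yields the minimum
// cost, and
//   orderedIntToFloatHost(floatToOrderedIntHost(x)) == x
// recovers the original float value on the host.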

// Hashmap value. Used when computing the hashmap in PostProcessingMainQueue
struct __align__(16) HashmapValueT {
  // Map key : fst state
  int32 key;
  // Number of tokens associated with that state
  int32 count;
  // Minimum cost for that state + argmin, packed into a single 64-bit value
  unsigned long long min_and_argmin_int_cost_u64;
};

enum OVERFLOW_TYPE {
  OVERFLOW_NONE = 0,
  OVERFLOW_MAIN_Q = 1,
  OVERFLOW_AUX_Q = 2
};

enum QUEUE_ID { MAIN_Q = 0, AUX_Q = 1 };

}  // end namespace cuda_decoder
}  // end namespace kaldi

#endif  // KALDI_CUDA_DECODER_CUDA_DECODER_UTILS_H_