// cudadecoder/cuda-decoder-common.h
//
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
// Hugo Braun, Justin Luitjens, Ryan Leary
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_CUDA_DECODER_CUDA_DECODER_UTILS_H_
#define KALDI_CUDA_DECODER_CUDA_DECODER_UTILS_H_
#include "cudamatrix/cu-device.h"
#include "util/stl-utils.h"
// A decoder channel is linked to one utterance. Frames
// from the same utterance must be sent to the same channel.
//
// A decoder lane is where the computation actually happens:
// a decoder lane is given a frame and its associated channel,
// and performs the actual computation for that frame.
//
// An analogy would be lane -> a core, channel -> a software thread
// Some config parameters can be computed from other parameters
// (e.g. we can set main_q_capacity using max-active).
// The following factors relate the parameters we know
// to the parameters we want to set.
#define KALDI_CUDA_DECODER_MAX_ACTIVE_MAIN_Q_CAPACITY_FACTOR 4
#define KALDI_CUDA_DECODER_AUX_Q_MAIN_Q_CAPACITIES_FACTOR 3
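// Illustrative sketch (not part of the original header): how the factors above
// could be used to derive the queue capacities. The variable names are
// hypothetical; the actual computation lives in the decoder configuration
// code.
#if 0
int32 max_active = 10000;  // e.g. from the --max-active option
int32 main_q_capacity =
    max_active * KALDI_CUDA_DECODER_MAX_ACTIVE_MAIN_Q_CAPACITY_FACTOR;
int32 aux_q_capacity =
    main_q_capacity * KALDI_CUDA_DECODER_AUX_Q_MAIN_Q_CAPACITIES_FACTOR;
#endif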
// If we're at risk of filling the token queue,
// the beam is reduced to keep only the best candidates in the
// remaining space.
// We then slowly bring the beam back to its default value:
// beam_next_frame = min(default_beam, RECOVER_RATE * beam_previous_frame)
#define KALDI_CUDA_DECODER_ADAPTIVE_BEAM_RECOVER_RATE 1.2f
// Defines for the cuda decoder kernels.
// It shouldn't be necessary to change the DIMX of the kernels.
// Below the following number of arcs, we launch the one-CTA persistent kernel
// for the non-emitting stage.
#define KALDI_CUDA_DECODER_NONEM_LT_MAX_NARCS 4096
// We know we will have at least X elements in the hashmap
// We allocate space for X*KALDI_CUDA_DECODER_HASHMAP_CAPACITY_FACTOR elements
// to avoid having too many collisions
#define KALDI_CUDA_DECODER_HASHMAP_CAPACITY_FACTOR 1
// Max size of the total kernel arguments
// 4kb for compute capability >= 2.0
#define KALDI_CUDA_DECODER_MAX_KERNEL_ARGUMENTS_BYTE_SIZE (4096)
// When applying max-active, we need to compute a top-k.
// To perform that (soft) top-k, we compute a histogram.
// Here we define the number of bins in that histogram.
// It has to be less than the number of 1D threads.
#define KALDI_CUDA_DECODER_HISTO_NBINS 255
// Number of "heavy duty" process non emitting kernels
// If more non emitting iterations are required, those will be done
// in the one-CTA persistent kernel
#define KALDI_CUDA_DECODER_N_NON_EMITTING_MAIN_ITERATIONS 2
// Adaptive beam parameters
// We will decrease the beam when we detect that we are generating too many
// tokens.
// For the first segment of the aux_q, we don't do anything (we keep the
// original beam). That first segment contains
// (aux_q capacity) / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT elements.
// After that, we decrease the beam step by step, down to 0.
// We decrease the beam by one step every m elements, with:
// x = (aux_q capacity) / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT
//     (static segment)
// y = (aux_q capacity) - x
// m = y / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS
// For more information, please refer to the definition of GetAdaptiveBeam in
// cuda-decoder-kernels.cu (see also the sketch after the defines below).
#define KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT 4
#define KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS 8
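// Illustrative sketch (not part of the original header) of the schedule
// described above. The real implementation is GetAdaptiveBeam in
// cuda-decoder-kernels.cu; this hypothetical function only mirrors the x/y/m
// description and ignores corner cases.
#if 0
__device__ __inline__ CostType GetAdaptiveBeamSketch(CostType default_beam,
                                                     int32 aux_q_end,
                                                     int32 aux_q_capacity) {
  // Static segment: keep the default beam.
  int32 x = aux_q_capacity / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_STATIC_SEGMENT;
  if (aux_q_end < x) return default_beam;
  // Past the static segment, lower the beam by one step every m elements.
  int32 y = aux_q_capacity - x;
  int32 m = y / KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS;
  int32 steps_done = (aux_q_end - x) / m;
  CostType beam = default_beam *
                  (1.0f - static_cast<float>(steps_done) /
                              KALDI_CUDA_DECODER_ADAPTIVE_BEAM_NSTEPS);
  return (beam > 0.0f) ? beam : 0.0f;
}
#endif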
// When applying max_active we don't keep exactly max_active_ tokens,
// but a bit more. We can also call ApplyMaxActiveAndReduceBeam multiple times
// in the same frame (the first times as a pre-filter, the last time at the
// very end of the frame).
// Because keeping a bit more than max_active_ is expected, we add a tolerance
// so that we avoid triggering ApplyMaxActiveAndReduceBeam at the end of the
// frame for just a few tokens above the limit.
#define KALDI_CUDA_DECODER_ASSERT(val, recoverable) \
{ \
if ((val) != true) { \
throw CudaDecoderException("KALDI_CUDA_DECODER_ASSERT", __FILE__, \
__LINE__, recoverable) \
} \
}
// Macro for checking cuda errors following a cuda launch or api call
#ifdef NDEBUG
#define KALDI_DECODER_CUDA_CHECK_ERROR()
#else
#define KALDI_DECODER_CUDA_CHECK_ERROR() \
{ \
cudaError_t e = cudaGetLastError(); \
if (e != cudaSuccess) { \
throw CudaDecoderException(cudaGetErrorName(e), __FILE__, __LINE__, \
false); \
} \
}
#endif
#define KALDI_DECODER_CUDA_API_CHECK_ERROR(e) \
{ \
if (e != cudaSuccess) { \
throw CudaDecoderException(cudaGetErrorName(e), __FILE__, __LINE__, \
false); \
} \
}
#define KALDI_CUDA_DECODER_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
#define KALDI_CUDA_DECODER_1D_BLOCK_OFFSET_KERNEL_LOOP(offset, th_idx, n) \
for (int offset = blockIdx.x * blockDim.x, th_idx = threadIdx.x; \
offset < (n); offset += blockDim.x * gridDim.x)
#define KALDI_CUDA_DECODER_IS_LAST_1D_THREAD() (threadIdx.x == (blockDim.x - 1))
#define KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(i, n) \
for (int i = blockIdx.y; i < (n); i += gridDim.y)
#define KALDI_CUDA_DECODER_DIV_ROUND_UP(a, b) ((a + b - 1) / b)
#define KALDI_CUDA_DECODER_1D_BLOCK 256
#define KALDI_CUDA_DECODER_LARGEST_1D_BLOCK 1024
#define KALDI_CUDA_DECODER_ONE_THREAD_BLOCK 1
#define KALDI_CUDA_DECODER_MAX_CTA_COUNT 4096u
#define KALDI_CUDA_DECODER_MAX_CTA_PER_LANE 512u
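// Illustrative sketch (not part of the original header): a hypothetical
// batched kernel using the grid-stride macros above. blockIdx.y selects the
// lane; blockIdx.x/threadIdx.x stride over that lane's elements.
#if 0
__global__ void example_scale_kernel(float *batch_data, int nvalues_per_lane,
                                     int nlanes, float factor) {
  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
    float *lane_data = batch_data + ilane * nvalues_per_lane;
    KALDI_CUDA_DECODER_1D_KERNEL_LOOP(i, nvalues_per_lane) {
      lane_data[i] *= factor;
    }
  }
}
#endif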
namespace kaldi {
namespace cuda_decoder {
// Returns the number of CTAs to launch for computing (N, M) elements.
// M is usually the batch size.
inline dim3 KaldiCudaDecoderNumBlocks(int N, int M) {
dim3 grid;
grid.x = KALDI_CUDA_DECODER_DIV_ROUND_UP(N, KALDI_CUDA_DECODER_1D_BLOCK);
unsigned int max_CTA_per_lane =
std::max(KALDI_CUDA_DECODER_MAX_CTA_COUNT / M, 1u);
grid.x = std::min(grid.x, max_CTA_per_lane);
grid.y = M;
return grid;
}
// Use a fixed number of blocks for nlanes
// Using the max number of CTAs possible for each lane,
// according to KALDI_CUDA_DECODER_MAX_CTA_COUNT
// and KALDI_CUDA_DECODER_MAX_CTA_PER_LANE
inline dim3 KaldiCudaDecoderNumBlocks(int nlanes) {
dim3 grid;
unsigned int n_CTA_per_lane =
std::max(KALDI_CUDA_DECODER_MAX_CTA_COUNT / nlanes, 1u);
if (n_CTA_per_lane == 0) n_CTA_per_lane = 1;
grid.x = std::min(KALDI_CUDA_DECODER_MAX_CTA_PER_LANE, n_CTA_per_lane);
grid.y = nlanes;
return grid;
}
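// Illustrative sketch (not part of the original header): launching a batched
// kernel (such as the hypothetical example_scale_kernel above) on a grid
// computed by KaldiCudaDecoderNumBlocks, with one blockIdx.y value per lane.
#if 0
void ExampleLaunch(float *d_batch_data, int nvalues_per_lane, int nlanes,
                   cudaStream_t st) {
  dim3 grid = KaldiCudaDecoderNumBlocks(nvalues_per_lane, nlanes);
  dim3 block(KALDI_CUDA_DECODER_1D_BLOCK);
  example_scale_kernel<<<grid, block, 0, st>>>(d_batch_data, nvalues_per_lane,
                                               nlanes, 0.5f);
  KALDI_DECODER_CUDA_CHECK_ERROR();
}
#endif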
typedef int32 StateId;
typedef float CostType;
// IntegerCostType is the type used in the lookup table d_state_best_cost
// and the d_cutoff
// We use a 1:1 conversion between CostType <--> IntegerCostType
// IntegerCostType is used because it triggers native atomic operations
// (CostType does not)
typedef int32 IntegerCostType;
typedef int32 LaneId;
typedef int32 ChannelId;
// On the device we compute everything by batch
// Data is stored as 2D matrices (BatchSize, 1D_Size)
// For example, for the token queue, (BatchSize, max_tokens_per_frame_)
// DeviceMatrix owns the data but is not used to access it.
// DeviceMatrix is inherited by DeviceLaneMatrix and DeviceChannelMatrix.
// Those two classes do the same thing, except that they belong either to a
// channel or to a lane. That inheritance is done to clarify the code and help
// debugging.
//
// To actually access the data, we should request a view through GetView.
// That view can be used by both host and device code to access the data. It
// does not own the data.
template <typename T>
// if necessary, make a version that always uses ncols_ as the next power of 2
class DeviceMatrix {
T *data_;
void Allocate() {
KALDI_ASSERT(nrows_ > 0);
KALDI_ASSERT(ncols_ > 0);
KALDI_ASSERT(!data_);
data_ = static_cast<T *>(CuDevice::Instantiate().Malloc(
(size_t)nrows_ * ncols_ * sizeof(*data_)));
KALDI_ASSERT(data_);
}
void Free() {
KALDI_ASSERT(data_);
CuDevice::Instantiate().Free(data_);
// Reset so that a subsequent Resize() can call Allocate() again.
data_ = NULL;
}
protected:
int32 ncols_;
int32 nrows_;
public:
DeviceMatrix() : data_(NULL), ncols_(0), nrows_(0) {}
virtual ~DeviceMatrix() {
if (data_) Free();
}
void Resize(int32 nrows, int32 ncols) {
if (data_) Free();
KALDI_ASSERT(nrows > 0);
KALDI_ASSERT(ncols > 0);
nrows_ = nrows;
ncols_ = ncols;
Allocate();
}
T *MutableData() {
KALDI_ASSERT(data_);
return data_;
}
// abstract getInterface...
};
template <typename T>
// if necessary, make a version that always uses ncols_ as the next power of 2
class HostMatrix {
T *data_;
void Allocate() {
KALDI_ASSERT(nrows_ > 0);
KALDI_ASSERT(ncols_ > 0);
KALDI_ASSERT(!data_);
cudaMallocHost((void **)&data_, (size_t)nrows_ * ncols_ * sizeof(*data_));
KALDI_ASSERT(data_);
}
void Free() {
KALDI_ASSERT(data_);
cudaFreeHost(data_);
// Reset so that a subsequent Resize() can call Allocate() again.
data_ = NULL;
}
protected:
int32 ncols_;
int32 nrows_;
public:
HostMatrix() : data_(NULL), ncols_(0), nrows_(0) {}
virtual ~HostMatrix() {
if (data_) Free();
}
void Resize(int32 nrows, int32 ncols) {
if (data_) Free();
KALDI_ASSERT(nrows > 0);
KALDI_ASSERT(ncols > 0);
nrows_ = nrows;
ncols_ = ncols;
Allocate();
}
T *MutableData() {
KALDI_ASSERT(data_);
return data_;
}
// abstract getInterface...
};
// Views of DeviceMatrix
// Those views are created by either DeviceChannelMatrix or
// DeviceLaneMatrix
// We can access the data (the matrix) associated with that
// Device[Channel|Lane]Matrix without owning that data.
// Which means that we can pass those views by copy
// without triggering a cudaFree, for instance.
// Device[Channel|Lane]Matrix owns the data, [Channel|Lane]MatrixInterface just
// gives access to it
// Generating both host and device interfaces
template <typename T>
struct LaneMatrixView {
T *data_;
int32 ncols_;
__host__ __device__ __inline__ T *lane(const int32 ilane) {
return &data_[ilane * ncols_];
}
};
template <typename T>
struct ChannelMatrixView {
T *data_;
int32 ncols_;
__host__ __device__ __inline__ T *channel(const int32 ichannel) {
return &data_[ichannel * ncols_];
}
};
// Specializing DeviceMatrix into lane and channel variants.
// Helps with code clarity/debugging
template <typename T>
class DeviceLaneMatrix : public DeviceMatrix<T> {
public:
LaneMatrixView<T> GetView() { return {this->MutableData(), this->ncols_}; }
T *lane(const int32 ilane) {
return &this->MutableData()[ilane * this->ncols_];
}
};
template <typename T>
class HostLaneMatrix : public HostMatrix<T> {
public:
LaneMatrixView<T> GetView() { return {this->MutableData(), this->ncols_}; }
T *lane(const int32 ilane) {
return &this->MutableData()[ilane * this->ncols_];
}
};
template <typename T>
class DeviceChannelMatrix : public DeviceMatrix<T> {
public:
ChannelMatrixView<T> GetView() { return {this->MutableData(), this->ncols_}; }
T *channel(const int32 ichannel) {
return &this->MutableData()[ichannel * this->ncols_];
}
};
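// Illustrative sketch (not part of the original header): allocating a per-lane
// device matrix and passing its non-owning view by value to a kernel. The
// kernel and function names are hypothetical.
#if 0
__global__ void example_fill_kernel(LaneMatrixView<int32> view, int32 ncols,
                                    int32 nlanes) {
  KALDI_CUDA_DECODER_BATCH_KERNEL_LOOP(ilane, nlanes) {
    int32 *row = view.lane(ilane);
    KALDI_CUDA_DECODER_1D_KERNEL_LOOP(i, ncols) { row[i] = 0; }
  }
}

void ExampleDeviceLaneMatrixUsage(int32 nlanes, int32 ncols, cudaStream_t st) {
  DeviceLaneMatrix<int32> d_matrix;
  d_matrix.Resize(nlanes, ncols);  // one row per lane, ncols elements per row
  example_fill_kernel<<<KaldiCudaDecoderNumBlocks(ncols, nlanes),
                        KALDI_CUDA_DECODER_1D_BLOCK, 0, st>>>(
      d_matrix.GetView(), ncols, nlanes);
  KALDI_DECODER_CUDA_CHECK_ERROR();
}
#endif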
// LaneCounters/ChannelCounters
// The counters are all the scalar values associated with a lane/channel,
// for instance the main queue size, or the min_cost of all tokens in that
// queue.
// LaneCounters are used during computation.
struct LaneCounters {
// Channel that this lane will compute for the current frame
ChannelId channel_to_compute;
// Pointer to the loglikelihoods array for this channel and current frame
BaseFloat *loglikelihoods;
// Contains both main_q_end and narcs
// End index of the main queue
// only tokens at index i with i < main_q_end
// are valid tokens
// Each valid token in the subqueue main_q[main_q_local_offset, main_q_end[
// has a number of outgoing arcs (out-degree).
// main_q_narcs is the sum of those numbers.
// We sometimes need to update both end and narcs at the same time using a
// single atomic, which is why they're packed together.
int2 main_q_narcs_and_end;
// Contains the requested queue length, which can
// be larger than the actual queue length in the case of overflow
int32 main_q_requested;
int32 aux_q_requested;
int32 aux_q_end;
int32 post_expand_aux_q_end; // used for double buffering
// Some tokens in the same frame share the same token.next_state
// main_q_n_extra_prev_tokens is the count of those tokens
int32 main_q_n_extra_prev_tokens;
// Number of tokens created during the emitting stage
int32 main_q_n_emitting_tokens;
// Depending on the value of the parameter "max_tokens_per_frame",
// we can end up with an overflow when generating the tokens for a frame.
// We try to prevent this from happening using an adaptive beam.
// If an overflow happens anyway, the kernels no longer insert any data into
// the queues and set the overflow flag to true.
// Even if that flag is set, we can continue the execution (the quality
// of the output may be lowered).
// We use that flag to display a warning to the user.
int32 q_overflow;
// ExpandArcs reads the tokens in the index range [main_q_local_offset, end[
int32 main_q_local_offset;
// We transfer the tokens back to the host at the end of each frame.
// Which means that tokens at a frame n > 0 have an offset compared to those
// in frame n-1. main_q_global_offset is the overall offset of the current
// main_q since frame 0.
// It is used to set the prev_token index.
int32 main_q_global_offset;
// Same thing, but for main_q_n_extra_prev_tokens (those are also transferred
// back to the host)
int32 main_q_extra_prev_tokens_global_offset;
// Minimum token cost for that frame
IntegerCostType min_int_cost;
// Current beam. Can be different from default_beam,
// because of the AdaptiveBeam process, or because of
// ApplyMaxActiveAndReduceBeam
IntegerCostType int_beam;
// Adaptive beam. The validity index says up to which index this adaptive
// beam is valid. After that index, we need to lower the adaptive beam.
int2 adaptive_int_beam_with_validity_index;
// min_cost + beam
IntegerCostType int_cutoff;
// The histogram for max_active will be computed between min_histo_cost
// and max_histo_cost. Set for each frame after emitting stage
CostType min_histo_cost;
CostType max_histo_cost;
CostType histo_bin_width;
bool compute_max_active;
// offsets used by concatenate_lanes_data_kernel
int32 main_q_end_lane_offset;
int32 main_q_n_emitting_tokens_lane_offset;
int32 main_q_n_extra_prev_tokens_lane_offset;
// --- Only valid after calling GetBestCost
// Minimum cost and its argmin. Can be different from min_int_cost above,
// because we may include final costs here.
int2 min_int_cost_and_arg;
// Number of final tokens with cost < best + lattice_beam
int32 n_within_lattice_beam;
int32 has_reached_final; // if there's at least one final token in the queue
int32 prev_arg_min_int_cost;
};
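// Illustrative sketch (not part of the original header): how host code might
// inspect the overflow flag described above after copying a lane's counters
// back to the host. The function name is hypothetical.
#if 0
void ExampleCheckOverflow(const LaneCounters &h_lane_counters) {
  if (h_lane_counters.q_overflow != OVERFLOW_NONE) {
    // Decoding can continue, but output quality may be degraded.
    KALDI_WARN << "A token queue overflowed; consider increasing "
               << "max-tokens-per-frame";
  }
}
#endif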
// Channel counters
// Their job is to save the state of a channel while that channel is idle.
// The channel counters are loaded into the lane counters during the context
// switches
struct ChannelCounters {
// All the following values are just saved values from LaneCounters
// from the latest context-switch
int2 prev_main_q_narcs_and_end;
int32 prev_main_q_n_extra_prev_tokens;
int32 prev_main_q_global_offset;
int32 prev_main_q_extra_prev_tokens_global_offset;
CostType prev_beam;
// Only valid after calling GetBestCost.
// Different from min_int_cost: the "with_final" version includes the "final"
// costs.
int2 min_int_cost_and_arg_with_final;
int2 min_int_cost_and_arg_without_final;
};
class CudaDecoderException : public std::exception {
public:
CudaDecoderException(const char *str_, const char *file_, int line_,
const bool recoverable_)
: str(str_),
file(file_),
line(line_),
buffer(std::string(file) + ":" + std::to_string(line) + " :" +
std::string(str)),
recoverable(recoverable_) {}
const char *what() const throw() { return buffer.c_str(); }
const char *str;
const char *file;
const int line;
const std::string buffer;
const bool recoverable;
};
// InfoToken contains data that needs to be saved for the backtrack
// in GetBestPath/GetRawLattice
// We don't need the token.cost or token.next_state.
struct __align__(8) InfoToken {
int32 prev_token;
int32 arc_idx;
bool IsUniqueTokenForStateAndFrame() {
// This is a trick used to save space and PCI-E bandwidth (cf
// preprocess_in_place kernel).
// This token is associated with a next_state s, created during the
// processing of frame f.
// If we have multiple tokens associated with the state s in the frame f,
// then arc_idx < 0 and -arc_idx is the count of such tokens. We will then
// have to look at another list to read the actual arc_idx and prev_token
// values.
// If the current token is the only one, prev_token and arc_idx are valid
// and can be used directly.
return (arc_idx >= 0);
}
// Called if this token is linked to other tokens in the same frame (cf
// comments for IsUniqueTokenForStateAndFrame).
// Returns the {offset, size} pair necessary to list those tokens in the
// extra_prev_tokens list.
// They are stored at offset "offset", and there are "size" of them.
std::pair<int32, int32> GetSameFSTStateTokensList() {
KALDI_ASSERT(!IsUniqueTokenForStateAndFrame());
return {prev_token, -arc_idx};
}
};
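// Illustrative sketch (not part of the original header): the kind of host-side
// backtrack that uses InfoToken. The containers and the selection of a token
// among extra_prev_tokens are hypothetical simplifications; the real logic
// lives in GetBestPath/GetRawLattice.
#if 0
void ExampleBacktrack(const std::vector<InfoToken> &tokens,
                      const std::vector<InfoToken> &extra_prev_tokens,
                      int32 best_token_idx, std::vector<int32> *arc_indexes) {
  int32 token_idx = best_token_idx;
  while (token_idx > 0) {  // assuming index 0 is the start token
    InfoToken tok = tokens[token_idx];
    if (!tok.IsUniqueTokenForStateAndFrame()) {
      // Several tokens share this FST state in this frame. They are stored in
      // extra_prev_tokens[offset, offset + size). The real code picks among
      // them using their costs; we simply take the first one here.
      std::pair<int32, int32> offset_and_size = tok.GetSameFSTStateTokensList();
      tok = extra_prev_tokens[offset_and_size.first];
    }
    arc_indexes->push_back(tok.arc_idx);
    token_idx = tok.prev_token;
  }
}
#endif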
// Device function, used to set in an InfoToken the {offset, size} pair
// related to InfoToken.GetSameFSTStateTokensList
__device__ __inline__ void SetSameFSTStateTokensList(int32 offset, int32 size,
InfoToken *info_token) {
// We always have size > 0
*info_token = {offset, -size};
}
// Used to store the index in the GPU hashmap of that FST state.
// The hashmap is only generated with the final main queue (post max_active_)
// of each frame.
// Also stores the information of whether or not the owner of that object is
// the representative of this FST state.
typedef int32 FSTStateHashIndex;
// 1:1 Conversion float <---> sortable int
// We convert floats to sortable ints in order
// to use native atomic operations.
// These are the host versions, used when we transfer an int from the device
// and we want to convert it to a float
// (it was created on the device by floatToOrderedInt; we'll use
// orderedIntToFloatHost on the host to convert it back to a float).
__inline__ int32 floatToOrderedIntHost(float floatVal) {
int32 intVal;
// Should be optimized away by compiler
memcpy(&intVal, &floatVal, sizeof(float));
return (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF;
}
__inline__ float orderedIntToFloatHost(int32 intVal) {
intVal = (intVal >= 0) ? intVal : intVal ^ 0x7FFFFFFF;
float floatVal;
// Should be optimized away by compiler
memcpy(&floatVal, &intVal, sizeof(float));
return floatVal;
}
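// Illustrative sketch (not part of the original header): the encoding is
// order-preserving and 1:1, which is what lets the device use atomicMin on
// IntegerCostType and the host recover the float afterwards. The function
// name below is hypothetical.
#if 0
void ExampleOrderedIntRoundTrip() {
  float a = 1.5f, b = -0.25f;
  int32 ia = floatToOrderedIntHost(a);
  int32 ib = floatToOrderedIntHost(b);
  // Ordering is preserved: b < a implies ib < ia.
  KALDI_ASSERT(ib < ia);
  // The mapping is 1:1, so the conversion can be undone exactly.
  KALDI_ASSERT(orderedIntToFloatHost(ia) == a);
  KALDI_ASSERT(orderedIntToFloatHost(ib) == b);
}
#endif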
// Hashmap value. Used when computing the hashmap in PostProcessingMainQueue
struct __align__(16) HashmapValueT {
// Map key : fst state
int32 key;
// Number of tokens associated with that state
int32 count;
// minimum cost for that state + argmin
unsigned long long min_and_argmin_int_cost_u64;
};
enum OVERFLOW_TYPE {
OVERFLOW_NONE = 0,
OVERFLOW_MAIN_Q = 1,
OVERFLOW_AUX_Q = 2
};
enum QUEUE_ID { MAIN_Q = 0, AUX_Q = 1 };
} // end namespace cuda_decoder
} // end namespace kaldi
#endif // KALDI_CUDA_DECODER_CUDA_DECODER_UTILS_H_