// cudadecoder/cuda-decoder.h
//
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
// Hugo Braun, Justin Luitjens, Ryan Leary
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_CUDA_DECODER_CUDA_DECODER_H_
#define KALDI_CUDA_DECODER_CUDA_DECODER_H_

#include "cudadecoder/cuda-decodable-itf.h"
#include "cudadecoder/cuda-decoder-common.h"
#include "cudadecoder/cuda-fst.h"
#include "nnet3/decodable-online-looped.h"
#include "thread-pool.h"

#include <cuda_runtime_api.h>

#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <sstream>
#include <thread>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>

namespace kaldi {
namespace cuda_decoder {

struct CudaDecoderConfig {
  BaseFloat default_beam;
  BaseFloat lattice_beam;
  int32 ntokens_pre_allocated;
  int32 main_q_capacity, aux_q_capacity;
  int32 max_active;

  CudaDecoderConfig()
      : default_beam(15.0),
        lattice_beam(10.0),
        ntokens_pre_allocated(2000000),
        main_q_capacity(-1),
        aux_q_capacity(-1),
        max_active(10000) {}

  void Register(OptionsItf *opts) {
    opts->Register("beam", &default_beam,
                   "Decoding beam. Larger -> slower, more accurate. If "
                   "aux-q-capacity is too small, we may decrease the beam "
                   "dynamically to avoid overflow (adaptive beam, see the "
                   "aux-q-capacity parameter).");
    opts->Register("lattice-beam", &lattice_beam,
                   "The width of the lattice beam.");
    opts->Register("max-active", &max_active,
                   "At the end of each frame computation, we keep only its "
                   "best max-active tokens. One token is the instantiation "
                   "of a single arc. Typical values are within the 5k-10k "
                   "range.");
    opts->Register("ntokens-pre-allocated", &ntokens_pre_allocated,
                   "Advanced - Number of tokens pre-allocated in host "
                   "buffers. If this size is exceeded the buffer will "
                   "reallocate, reducing performance.");
    std::ostringstream main_q_capacity_desc;
    main_q_capacity_desc
        << "Advanced - Capacity of the main queue: maximum number of "
           "tokens that can be stored *after* pruning for each frame. "
           "Lower -> less memory usage, higher -> more accurate. "
           "Tokens stored in the main queue were already selected "
           "through a max-active pre-selection, which means that for "
           "each emitting/non-emitting iteration, we can add at most "
           "~max-active tokens to the main queue. Typically only the "
           "emitting iteration creates a large number of tokens. Using "
           "main-q-capacity=k*max-active with k=4..10 should be safe. "
           "If main-q-capacity is too small, we will print a warning "
           "but prevent the overflow. The computation can safely "
           "continue, but the quality of the output may decrease "
           "(-1 = set to "
        << KALDI_CUDA_DECODER_MAX_ACTIVE_MAIN_Q_CAPACITY_FACTOR
        << "*max-active).";
    opts->Register("main-q-capacity", &main_q_capacity,
                   main_q_capacity_desc.str());
    std::ostringstream aux_q_capacity_desc;
    aux_q_capacity_desc
        << "Advanced - Capacity of the auxiliary queue: maximum "
           "number of raw tokens that can be stored *before* pruning "
           "for each frame. Lower -> less memory usage, higher -> more "
           "accurate. During token generation, if we detect that "
           "we are getting close to saturating that capacity, we will "
           "reduce the beam dynamically (adaptive beam) to keep only "
           "the best tokens in the remaining space. If the aux queue "
           "is still too small, we will print an overflow warning, but "
           "prevent the overflow. The computation can safely continue, "
           "but the quality of the output may decrease. We strongly "
           "recommend keeping aux-q-capacity large (>400k) to avoid "
           "triggering the adaptive beam and/or the overflow "
           "(-1 = set to "
        << KALDI_CUDA_DECODER_AUX_Q_MAIN_Q_CAPACITIES_FACTOR
        << "*main-q-capacity).";
    opts->Register("aux-q-capacity", &aux_q_capacity,
                   aux_q_capacity_desc.str());
  }

  void Check() const {
    KALDI_ASSERT(default_beam > 0.0 && ntokens_pre_allocated >= 0 &&
                 lattice_beam >= 0.0f && max_active > 0);
  }

  void ComputeConfig() {
    if (main_q_capacity == -1)
      main_q_capacity =
          max_active * KALDI_CUDA_DECODER_MAX_ACTIVE_MAIN_Q_CAPACITY_FACTOR;
    if (aux_q_capacity == -1)
      aux_q_capacity =
          main_q_capacity * KALDI_CUDA_DECODER_AUX_Q_MAIN_Q_CAPACITIES_FACTOR;
  }
};
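
// A minimal sketch of how this config is typically wired into Kaldi's
// command-line parsing. `usage`, `argc` and `argv` stand for the caller's
// context; the exact call site is up to the binary using the decoder:
//
//   CudaDecoderConfig config;
//   ParseOptions po(usage);
//   config.Register(&po);
//   po.Read(argc, argv);
//   config.Check();          // sanity-check beams, max-active, ...
//   config.ComputeConfig();  // resolve the -1 queue capacities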

// Forward declarations.
// These classes contain CUDA code. We don't want to include their
// definitions in this header.
class DeviceParams;
class KernelParams;

class CudaDecoder {
 public:
  // Creates a new CudaDecoder associated with the FST fst.
  // nlanes and nchannels are defined as follows:
  //
  // A decoder channel is linked to one utterance. When we need to perform
  // decoding on an utterance, we pick an available channel, call
  // InitDecoding on that channel (with that ChannelId in the channels
  // vector in the arguments), then call AdvanceDecoding whenever frames are
  // ready for the decoder for that utterance (also passing the same
  // ChannelId to AdvanceDecoding).
  //
  // A decoder lane is where the computation actually happens: a decoder
  // lane is given a channel, and performs the actual decoding of that
  // channel. If we have 200 lanes, we can compute 200 utterances (channels)
  // at the same time. We need many lanes in parallel to saturate the big
  // GPUs.
  //
  // An analogy would be: lane -> a CPU core, channel -> a software thread.
  // A channel saves the current state of the decoding for a given
  // utterance. It can be kept idle until more frames are ready to be
  // processed.
  //
  // We will use as many lanes as necessary to saturate the GPU, but not
  // more. A lane has a higher memory usage than a channel. If you just want
  // to be able to keep more audio channels open at the same time (when I/O
  // is the bottleneck for instance, typically in the context of online
  // decoding), you should instead use more channels.
  //
  // A channel is typically way smaller in terms of memory usage, and can be
  // used to oversubscribe lanes in the context of online decoding.
  // For instance, we could choose nlanes=200 because it gives us good
  // performance on a given GPU: an end-to-end performance of 3000 XRTF. We
  // are doing online decoding, so we only get audio at realtime speed for a
  // given utterance/channel. We then decide to receive audio from 2500
  // audio channels at the same time (each at realtime speed), and as soon
  // as we have frames ready for nlanes=200 channels, we call
  // AdvanceDecoding on those channels. In that configuration, we have
  // nlanes=200 (for performance) and nchannels=2500 (to have enough audio
  // available at a given time). Using nlanes=2500 in that configuration
  // would first not be possible (out of memory), but would also not be
  // necessary: increasing the number of lanes is only useful if it
  // increases performance. If the GPU is already saturated at nlanes=200,
  // you should not increase that number.
  CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
              int32 nlanes, int32 nchannels);

  // Reads the configuration from config.
  void ReadConfig(const CudaDecoderConfig &config);

  // Special constructor for nlanes = nchannels, here for the non-advanced
  // user. We can consider nchannels to be the batch size: if we want to
  // decode 10 utterances at a time, we can use nchannels = 10.
  CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
              int32 nchannels)
      : CudaDecoder(fst, config, nchannels, nchannels) {}
  ~CudaDecoder();

  // InitDecoding initializes the decoding, and should only be used if you
  // intend to call AdvanceDecoding() on the channels listed in channels.
  void InitDecoding(const std::vector<ChannelId> &channels);

  // Computes the heavy H2H copies of InitDecoding. Usually launched on the
  // threadpool.
  void InitDecodingH2HCopies(ChannelId ichannel);

  // AdvanceDecoding on a given batch. A batch is defined by the channels
  // vector. We can compute N channels at the same time (in the same batch),
  // where N = number of lanes, as defined in the constructor.
  // AdvanceDecoding will compute as many frames as possible while running
  // the full batch; when at least one channel has no more frames ready to
  // be computed, AdvanceDecoding returns. The user then decides what to do,
  // i.e.:
  //
  // 1) either remove the drained channel from the channels list and call
  //    AdvanceDecoding again,
  // 2) or swap the drained channel with another one that has frames ready,
  //    and call AdvanceDecoding again.
  //
  // Solution 2) should be preferred, because we need to run full, big
  // batches to saturate the GPU.
  //
  // If max_num_frames is >= 0, it will decode no more than that many
  // frames.
  void AdvanceDecoding(const std::vector<ChannelId> &channels,
                       std::vector<CudaDecodableInterface *> &decodables,
                       int32 max_num_frames = -1);
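
  // A sketch of the intended batched decoding loop. GetReadyDecodables and
  // RemoveOrSwapDrainedChannels are hypothetical caller-side helpers, shown
  // only to illustrate the contract described above:
  //
  //   std::vector<ChannelId> channels = ...;  // one channel per utterance
  //   decoder.InitDecoding(channels);
  //   while (!channels.empty()) {
  //     std::vector<CudaDecodableInterface *> decodables =
  //         GetReadyDecodables(channels);
  //     decoder.AdvanceDecoding(channels, decodables);
  //     RemoveOrSwapDrainedChannels(&channels);  // prefer swapping, cf. 2)
  //   }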
// If "use_final_probs" is true // AND we reached a final state, it limits itself to final states; // otherwise it gets the most likely token not taking into account // final-probs. void GetRawLattice(const std::vector<ChannelId> &channels, std::vector<Lattice *> &fst_out_vec, bool use_final_probs); // GetBestCost finds the best cost in the last tokens queue // for each channel in channels. If isfinal is true, // we also add the final cost to the token costs before // finding the minimum cost // We list all tokens that have a cost within [best; best+lattice_beam] // in list_lattice_tokens. // We alsos set has_reached_final[ichannel] to true if token associated to a // final state // exists in the last token queue of that channel void GetBestCost( const std::vector<ChannelId> &channels, bool isfinal, std::vector<std::pair<int32, CostType>> *argmins, std::vector<std::vector<std::pair<int, float>>> *list_lattice_tokens, std::vector<bool> *has_reached_final); // (optional) Giving the decoder access to the cpu thread pool // We will use it to compute specific CPU work, such as InitDecodingH2HCopies // For recurrent CPU work, such as ComputeH2HCopies, we will use dedicated CPU // threads // We will launch nworkers of those threads void SetThreadPoolAndStartCPUWorkers(ThreadPool *thread_pool, int32 nworkers); private: // Data allocation. Called in constructor void AllocateDeviceData(); void AllocateHostData(); void AllocateDeviceKernelParams(); // Data initialization. Called in constructor void InitDeviceData(); void InitHostData(); void InitDeviceParams(); // Computes the initial channel // The initial channel is used to initialize a channel // when a new utterance starts (we clone it into the given channel) void ComputeInitialChannel(); // Updates *h_kernel_params using channels void SetChannelsInKernelParams(const std::vector<ChannelId> &channels); void ResetChannelsInKernelParams(); // Context-switch functions // Used to perform the context-switch of load/saving the state of a channels // into a lane. When a channel will be executed on a lane, we load that // channel into that lane (same idea than when we load a software threads into // the registers of a CPU) void LoadChannelsStateToLanes(const std::vector<ChannelId> &channels); void SaveChannelsStateFromLanes(); // We compute the decodes by batch. Each decodable in the batch has a // different number of frames ready // We compute the min number of frames ready (so that the full batch is // executing). If max_num_frames // is > 0, we apply that ceiling to the NumFramesToDecode. int32 NumFramesToDecode(const std::vector<ChannelId> &channels, std::vector<CudaDecodableInterface *> &decodables, int32 max_num_frames); // Expand the arcs, emitting stage. Must be called after // a preprocess_in_place, which happens in PostProcessingMainQueue. // ExpandArcsEmitting is called first when decoding a frame, // using the preprocessing that happened at the end of the previous frame, // in PostProcessingMainQueue void ExpandArcsEmitting(); // ExpandArcs, non-emitting stage. Must be called after PruneAndPreprocess. void ExpandArcsNonEmitting(); // If we have more than max_active_ tokens in the queue (either after an // expand, or at the end of the frame) // we will compute a new beam that will only keep a number of tokens as close // as possible to max_active_ tokens // (that number is >= max_active_) (soft topk) // All ApplyMaxActiveAndReduceBeam is find the right beam for that topk and // set it. 

  // GetRawLattice gets the lattice decoding traceback (using the
  // lattice-beam in the CudaConfig parameters). If "use_final_probs" is
  // true AND we reached a final state, it limits itself to final states;
  // otherwise it gets the most likely token, not taking into account
  // final-probs.
  void GetRawLattice(const std::vector<ChannelId> &channels,
                     std::vector<Lattice *> &fst_out_vec,
                     bool use_final_probs);

  // GetBestCost finds the best cost in the last token queue for each
  // channel in channels. If isfinal is true, we also add the final cost to
  // the token costs before finding the minimum cost.
  // We list all tokens that have a cost within [best; best+lattice_beam] in
  // list_lattice_tokens.
  // We also set has_reached_final[ichannel] to true if a token associated
  // with a final state exists in the last token queue of that channel.
  void GetBestCost(
      const std::vector<ChannelId> &channels, bool isfinal,
      std::vector<std::pair<int32, CostType>> *argmins,
      std::vector<std::vector<std::pair<int, float>>> *list_lattice_tokens,
      std::vector<bool> *has_reached_final);

  // (optional) Gives the decoder access to the cpu thread pool. We will use
  // it to compute specific CPU work, such as InitDecodingH2HCopies.
  // For recurrent CPU work, such as ComputeH2HCopies, we will use dedicated
  // CPU threads. We will launch nworkers of those threads.
  void SetThreadPoolAndStartCPUWorkers(ThreadPool *thread_pool,
                                       int32 nworkers);

 private:
  // Data allocation. Called in constructor.
  void AllocateDeviceData();
  void AllocateHostData();
  void AllocateDeviceKernelParams();

  // Data initialization. Called in constructor.
  void InitDeviceData();
  void InitHostData();
  void InitDeviceParams();

  // Computes the initial channel. The initial channel is used to initialize
  // a channel when a new utterance starts (we clone it into the given
  // channel).
  void ComputeInitialChannel();

  // Updates *h_kernel_params using channels.
  void SetChannelsInKernelParams(const std::vector<ChannelId> &channels);
  void ResetChannelsInKernelParams();

  // Context-switch functions. Used to perform the context switch of
  // loading/saving the state of a channel into/from a lane. When a channel
  // will be executed on a lane, we load that channel into that lane (same
  // idea as when we load a software thread into the registers of a CPU).
  void LoadChannelsStateToLanes(const std::vector<ChannelId> &channels);
  void SaveChannelsStateFromLanes();

  // We compute the decodes by batch. Each decodable in the batch has a
  // different number of frames ready. We compute the min number of frames
  // ready (so that the full batch is executing). If max_num_frames is > 0,
  // we apply that ceiling to NumFramesToDecode.
  int32 NumFramesToDecode(const std::vector<ChannelId> &channels,
                          std::vector<CudaDecodableInterface *> &decodables,
                          int32 max_num_frames);

  // Expands the arcs, emitting stage. Must be called after a
  // preprocess_in_place, which happens in PostProcessingMainQueue.
  // ExpandArcsEmitting is called first when decoding a frame, using the
  // preprocessing that happened at the end of the previous frame, in
  // PostProcessingMainQueue.
  void ExpandArcsEmitting();

  // ExpandArcs, non-emitting stage. Must be called after PruneAndPreprocess.
  void ExpandArcsNonEmitting();

  // If we have more than max_active_ tokens in a queue (either after an
  // expand, or at the end of the frame), we will compute a new beam that
  // keeps only a number of tokens as close as possible to max_active_
  // (that number is >= max_active_): a soft topk.
  // All ApplyMaxActiveAndReduceBeam does is find the right beam for that
  // soft topk and set it. We then need to call either PruneAndPreprocess
  // (explicitly pruning tokens with cost > beam) or PostProcessingMainQueue
  // (ignoring tokens with cost > beam in the next frame).
  void ApplyMaxActiveAndReduceBeam(enum QUEUE_ID queue_id);

  // Called after an ExpandArcs. Prunes the aux_q (the output of
  // ExpandArcs), moves the surviving tokens to the main_q, and does the
  // preprocessing at the same time. We don't need it after the last
  // ExpandArcsNonEmitting.
  void PruneAndPreprocess();

  // Once the non-emitting stage is done, the main_q is final for that
  // frame. We now generate all the data associated with that main_q, such
  // as listing the different tokens sharing the same token.next_state. We
  // also preprocess for the ExpandArcsEmitting of the next frame.
  // Once PostProcessingMainQueue is done, all working data is back to its
  // original state, to make sure we're ready for the next context switch.
  void PostProcessingMainQueue();
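
  // For reference, a simplified sketch of how the stages above compose into
  // the decoding of one frame. The real control flow lives in
  // AdvanceDecoding and interleaves kernel launches with counter readbacks;
  // the loop condition and queue ids below are illustrative only:
  //
  //   ExpandArcsEmitting();            // uses preprocessing from frame i-1
  //   while (/* non-emitting arcs remain, cf. main_q_narcs */) {
  //     ApplyMaxActiveAndReduceBeam(AUX_Q);
  //     PruneAndPreprocess();          // aux_q -> main_q
  //     ExpandArcsNonEmitting();
  //   }
  //   ApplyMaxActiveAndReduceBeam(MAIN_Q);
  //   PostProcessingMainQueue();       // main_q is now final for frame i
  //   CopyMainQueueDataToHost();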

  // Moves the relevant data to host, i.e. the data that will be needed in
  // GetBestPath/GetRawLattice. Happens when PostProcessingMainQueue is done
  // generating that data.
  void CopyMainQueueDataToHost();

  // CheckOverflow. If a kernel sets the flag h_q_overflow, we send a
  // warning to stderr. Overflows are detected and prevented on the device;
  // it only means that we've discarded the tokens that were created after
  // the queue was full. That's why we only send a warning: it is not a
  // fatal error.
  void CheckOverflow();

  // Evaluates the function func for each lane, returning the max of all
  // return values (func returns int32).
  // Used for instance to get the max number of arcs for all lanes.
  // func is called with h_lanes_counters_[ilane] for each lane.
  // h_lanes_counters_ must be ready to be used when calling
  // GetMaxForAllLanes (you might want to call
  // CopyLaneCountersToHost[Async|Sync] first to make sure everything is
  // ready).
  int32 GetMaxForAllLanes(std::function<int32(const LaneCounters &)> func);

  // Copies the lane counters back to host, async or sync.
  // The lane counters contain all the information, such as main_q_end
  // (number of tokens in the main_q) and main_q_narcs (number of arcs),
  // during the computation. That's why we frequently copy them back to
  // host, to know what to do next.
  void CopyLaneCountersToHostAsync();
  void CopyLaneCountersToHostSync();

  // The selected tokens for each frame will be copied back to host. We will
  // store them in host memory, and we will use them to create the final
  // lattice once we've reached the last frame.
  // We will also copy information about those tokens that we've generated
  // on the device, such as which tokens are associated with the same FST
  // state in the same frame, or their extra cost.
  // We cannot issue individual Device2Host copies for each channel, because
  // that would lead to a lot of small copies, reducing performance. Instead
  // we concatenate all channels' data into a single contiguous array, copy
  // that array to host, then unpack it to the individual channel vectors.
  // The first step (pack, then copy to host, async) is done in
  // ConcatenateData. The second step is done in LaunchD2HCopies and
  // LaunchH2HCopies. A sync on cudaStream st has to happen between the two
  // functions to make sure that the copy is done.
  //
  // Each lane contains X elements to be copied, where X = func(ilane).
  // That data is contained in the array (pointer, X), with
  // pointer = src[ilane]. It will be concatenated in d_concat on device,
  // then copied async into h_concat. That copy is launched on stream st.
  // The offset of the data of each lane in the concatenated array is saved
  // in *lanes_offsets_ptr; it will be used for unpacking in
  // MoveConcatenatedCopyToVector.
  //
  // func is called with h_lanes_counters_[ilane] for each lane.
  // h_lanes_counters_ must be ready to be used when calling these functions
  // (you might want to call CopyLaneCountersToHost[Async|Sync] first to
  // make sure everything is ready).

  // Concatenates data on device before calling the D2H copies.
  void ConcatenateData();
  // Starts the D2H copies used to send data back to host at the end of each
  // frame.
  void LaunchD2HCopies();
  // ComputeH2HCopies. At the end of each frame, we copy data back to host.
  // That data was concatenated into a single contiguous array. We then have
  // to unpack it and move it inside host memory. This is done by
  // ComputeH2HCopies.
  void ComputeH2HCopies();
  // Takes care of preparing the data for ComputeH2HCopies, and checks
  // whether we can use the threadpool or have to do the work on the current
  // thread.
  void LaunchH2HCopies();
  // Function called by the CPU worker threads. Calls ComputeH2HCopies when
  // triggered.
  void ComputeH2HCopiesCPUWorker();
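
  // Putting those steps together, the end-of-frame copy pipeline is roughly
  // as follows (a sketch; the synchronization point is illustrative, and
  // the real code overlaps this pipeline with compute):
  //
  //   ConcatenateData();   // pack per-lane data into d_concat (device)
  //   LaunchD2HCopies();   // async d_concat -> h_concat, on copy stream st
  //   // ... wait on an event recorded on st, so h_concat is valid ...
  //   LaunchH2HCopies();   // unpack h_concat into the per-channel vectors,
  //                        // on the dedicated CPU threads when available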

  template <typename T>
  void MoveConcatenatedCopyToVector(const LaneId ilane,
                                    const ChannelId ichannel,
                                    const std::vector<int32> &lanes_offsets,
                                    T *h_concat,
                                    std::vector<std::vector<T>> *vecvec);
  void WaitForH2HCopies();
  void WaitForInitDecodingH2HCopies();

  // Computes a set of static asserts on the static values.
  // In theory we should do them at compile time.
  void CheckStaticAsserts();

  // Can be called in GetRawLattice to do a bunch of deep asserts on the
  // data. Slow, so disabled by default.
  void DebugValidateLattice();

  //
  // Data members
  //

  // The CudaFst data structure contains the FST graph in the CSR format, in
  // both GPU and CPU memory.
  const CudaFst fst_;

  // Counters used by a decoder lane. Contains all the single values
  // generated during computation, such as the current size of the main_q or
  // the number of arcs currently in that queue. We load data from the
  // channel state during context switch (for instance the size of the last
  // token queue for that channel).
  HostLaneMatrix<LaneCounters> h_lanes_counters_;

  // Counters of channels. Contains all the single values saved to remember
  // the state of a channel; not used during computation. Those values are
  // loaded/saved into/from a lane during context switching.
  ChannelCounters *h_channels_counters_;

  // Contain the various counters used by lanes/channels, such as main_q_end
  // and main_q_narcs, in device memory (the equivalent of
  // h_channels_counters_ on device).
  DeviceChannelMatrix<ChannelCounters> d_channels_counters_;
  DeviceLaneMatrix<LaneCounters> d_lanes_counters_;

  // Number of lanes and channels, as defined in the constructor arguments.
  int32 nlanes_, nchannels_;

  // We will now define the data used on the GPU. The data is mainly linked
  // to two token queues:
  // - the main queue
  // - the auxiliary queue
  //
  // The auxiliary queue is used to store the raw output of ExpandArcs. We
  // then prune that aux queue (and apply max-active) and move the surviving
  // tokens to the main queue. Tokens stored in the main_q can then be used
  // to generate new tokens (using ExpandArcs).
  // We also generate more information about what's in the main_q at the end
  // of a frame (in PostProcessingMainQueue).
  //
  // As a reminder, here's the data structure of a token:
  //
  // struct Token { state, cost, prev_token, arc_idx }
  //
  // Please keep in mind that this structure is also used in the context of
  // lattice decoding. We are not storing a list of forward links like in
  // the CPU decoder: a token stays an instantiation of a single arc.
  //
  // For performance reasons, we split the tokens into three parts:
  // { state }, { cost }, { prev_token, arc_idx }
  // Each part has its associated queue. For instance, d_main_q_state[i],
  // d_main_q_cost[i] and d_main_q_info[i] all refer to the same token (at
  // index i). The data structure InfoToken contains
  // { prev_token, arc_idx }. We also store the acoustic costs
  // independently, in d_main_q_acoustic_cost_.
  //
  // The data is either linked to a channel, or to a lane.
  //
  // Channel data (DeviceChannelMatrix):
  //
  // The data linked with a channel contains the data of frame i that we
  // need to remember to compute frame i+1. It is the list of tokens from
  // frame i, with some additional info (i.e. the prefix sum of the emitting
  // arc degrees of those tokens). We are only storing
  // d_main_q_state_and_cost_ as channel data, because that's all we need in
  // a token to compute frame i+1: we don't need token.arc_idx or
  // token.prev_token.
  // The reason why we also store that prefix sum is that we do the emitting
  // preprocessing at the end of frame i, because we need info from the
  // hashmap to do that preprocessing, and the hashmap is always cleared at
  // the end of a frame. So we do the preprocessing at the end of frame i,
  // and then save d_main_q_degrees_prefix_sum_. d_main_q_arc_offsets is
  // also generated during preprocessing.
  //
  // Lane data (DeviceLaneMatrix):
  //
  // The lane data is everything we use during computation, but which we
  // reset at the end of each frame. For instance we use a hashmap at some
  // point during the computation, but at the end of each frame we reset it.
  // That way the hashmap is able to compute whatever channel comes the next
  // time AdvanceDecoding is called. The reasons why we do that are:
  //
  // - We use context switching. Before and after every frame, we can do a
  // context switch, which means that a lane cannot save a channel's state
  // in any way once AdvanceDecoding returns. E.g., during one call of
  // AdvanceDecoding, ilane=2 may compute 5 frames from channel=57 (as
  // defined in the std::vector<ChannelId> channels). In the next call, the
  // same ilane=2 may compute 10 frames from channel=231. Lane data has to
  // be reset to its original state at the end of each AdvanceDecoding call.
  // If somehow some data has to be saved, it needs to be declared as
  // channel data.
  //
  // - The reason why we make the distinction between lane and channel data
  // (in theory everything could be considered channel data) is that a lane
  // uses more memory than a channel. In the context of online decoding, we
  // need to create a lot of channels, and we need them to be as small as
  // possible in memory. Everything that can be reused between channels is
  // stored as lane data.
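
  // To make the split-into-parts layout concrete, here is a sketch of how
  // the logical token at index i of the main_q would be reassembled. These
  // are pseudo-accessors for illustration only (channel/lane indexing
  // omitted); the decoder never materializes a full Token struct:
  //
  //   int2 sc = d_main_q_state_and_cost_[i];     // {state, cost} (channel)
  //   InfoToken it = d_main_q_info_[i];          // {prev_token, arc_idx}
  //   CostType ac = d_main_q_acoustic_cost_[i];  // acoustic part of cost
  //   // Token token = { sc.x /* state */, sc.y /* cost */,
  //   //                 it.prev_token, it.arc_idx };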

  //
  // Channel data members:
  //

  DeviceChannelMatrix<int2> d_main_q_state_and_cost_;

  // Prefix sum of the arc degrees in the main_q. Used by ExpandArcs, set in
  // the preprocess stages (either PruneAndPreprocess or preprocess_in_place
  // in PostProcessingMainQueue).
  DeviceChannelMatrix<int32> d_main_q_degrees_prefix_sum_;

  // d_main_q_arc_offsets[i] = fst_.arc_offsets[d_main_q_state[i]]
  // We pay the price for the random memory accesses of fst_.arc_offsets in
  // the preprocess kernel. We cache the results in d_main_q_arc_offsets,
  // which will be read in a coalesced fashion in expand.
  DeviceChannelMatrix<int32> d_main_q_arc_offsets_;

  //
  // Lane data members:
  //

  // InfoToken usually contains {prev_token, arc_idx}.
  // If more than one token is associated with a fst_state, it contains
  // where to find the list of those tokens in d_main_q_extra_prev_tokens,
  // i.e. {offset, size} in that list. We differentiate the two situations
  // by calling InfoToken.IsUniqueTokenForStateAndFrame().
  DeviceLaneMatrix<InfoToken> d_main_q_info_;

  // Acoustic cost of a given token.
  DeviceLaneMatrix<CostType> d_main_q_acoustic_cost_;

  // At the end of a frame, we use a hashmap to detect the tokens that are
  // associated with the same FST state S. We do it at the very end, to only
  // use the hashmap on post-prune, post-max-active tokens.
  DeviceLaneMatrix<HashmapValueT> d_hashmap_values_;

  // Reminder: in the GPU lattice decoder, a token is always associated with
  // a single arc, which means that multiple tokens in the same frame can be
  // associated with the same FST state.
  //
  // We are NOT listing those duplicates as ForwardLinks in a unique
  // meta-token like in the CPU lattice decoder.
  //
  // When more than one token is associated with a single FST state, we list
  // those tokens in another list: d_main_q_extra_prev_tokens. We also save
  // data useful in such a case, such as the extra_cost of a token compared
  // to the best for that state.
  DeviceLaneMatrix<InfoToken> d_main_q_extra_prev_tokens_;
  DeviceLaneMatrix<float2> d_main_q_extra_and_acoustic_cost_;

  // Histogram. Used to perform the histogram of the token costs in the
  // main_q. Used to perform a soft topk of the main_q (max-active).
  DeviceLaneMatrix<int32> d_histograms_;

  // When filling the hashmap in PostProcessingMainQueue, we create a
  // hashmap value for each FST state present in the main_q (if at least one
  // token is associated with that state).
  // d_main_q_state_hash_idx_[token_idx] is the index of the state
  // token.state in the hashmap. It is stored in a FSTStateHashIndex, which
  // is actually an int32. FSTStateHashIndex should only be accessed through
  // [Get|Set]FSTStateHashIndex, because it uses the sign bit to also
  // remember whether that token is the representative of that state.
  // If only one token is associated with S, its representative is itself.
  DeviceLaneMatrix<FSTStateHashIndex> d_main_q_state_hash_idx_;

  // local_idx of the extra cost list for a state. For a given state S, the
  // first token associated with S will have local_idx=0, the second one
  // local_idx=1, etc. The order of the local_idxs is random.
  DeviceLaneMatrix<int32> d_main_q_n_extra_prev_tokens_local_idx_;

  // Where to write the extra_prev_tokens in the d_main_q_extra_prev_tokens_
  // queue.
  DeviceLaneMatrix<int32> d_main_q_extra_prev_tokens_prefix_sum_;

  // Used when computing the prefix_sums in preprocess_in_place. Stores the
  // local_sums per CTA.
  DeviceLaneMatrix<int2> d_main_q_block_sums_prefix_sum_;

  // Defines the aux_q. Filled by ExpandArcs. The tokens are moved to the
  // main_q by PruneAndPreprocess.
  DeviceLaneMatrix<int2> d_aux_q_state_and_cost_;
  DeviceLaneMatrix<InfoToken> d_aux_q_info_;

  // Dedicated space for the concatenation of extra_cost. We should reuse
  // memory.
  DeviceLaneMatrix<float2> d_extra_and_acoustic_cost_concat_matrix_;
  DeviceLaneMatrix<InfoToken> d_extra_prev_tokens_concat_matrix_;
  DeviceLaneMatrix<CostType> d_acoustic_cost_concat_matrix_;
  DeviceLaneMatrix<InfoToken> d_infotoken_concat_matrix_;

  // We will list in h_list_final_tokens_in_main_q_ all tokens within
  // [min_cost; min_cost+lattice_beam]. It is used when calling GetBestCost.
  // We only use an interface here, because we will actually reuse data from
  // d_aux_q_state_and_cost: we are done using the aux_q when GetBestCost is
  // called, so we can reuse that memory.
  HostLaneMatrix<int2> h_list_final_tokens_in_main_q_;

  // Parameters used by the kernels. DeviceParams contains all the
  // parameters that won't change, e.g. the memory address of the main_q.
  // KernelParams contains information that can change, for instance which
  // channel is executing on which lane.
  DeviceParams *h_device_params_;
  KernelParams *h_kernel_params_;
  std::vector<ChannelId> channel_to_compute_;
  int32 nlanes_used_;  // number of lanes used in h_kernel_params_

  // Initial channel. When starting a new utterance, init_channel_id is used
  // to initialize a channel.
  int32 init_channel_id_;

  // CUDA streams used by the decoder.
  cudaStream_t compute_st_, copy_st_;

  // Parameters extracted from CudaDecoderConfig. Those are defined in
  // CudaDecoderConfig.
  CostType default_beam_;
  CostType lattice_beam_;
  int32 ntokens_pre_allocated_;
  int32 max_active_;  // target value from the parameters
  int32 aux_q_capacity_;
  int32 main_q_capacity_;

  // Hashmap capacity. Multiple of max_tokens_per_frame.
  int32 hashmap_capacity_;

  // Static segment of the adaptive beam. Cf. InitDeviceParams.
  int32 adaptive_beam_static_segment_;

  // The first index of all the following vectors (or vector<vector>) is the
  // ChannelId. E.g., to get the number of frames decoded in channel 2, look
  // into num_frames_decoded_[2].

  // Keeps track of the number of frames decoded in the current file.
  std::vector<int32> num_frames_decoded_;
  // Offsets of each frame in h_all_tokens_info_.
  std::vector<std::vector<int32>> frame_offsets_;
  // Data storage. We store on host what we will need in
  // GetRawLattice/GetBestPath.
  std::vector<std::vector<InfoToken>> h_all_tokens_info_;
  std::vector<std::vector<CostType>> h_all_tokens_acoustic_cost_;
  std::vector<std::vector<InfoToken>> h_all_tokens_extra_prev_tokens_;
  std::vector<std::vector<float2>>
      h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_;
  // At some point we should switch to a shared_lock (to be able to compute
  // partial lattices while still streaming new data for this channel).
  std::vector<std::mutex> channel_lock_;
  bool worker_threads_running_;

  // For each channel, set by PrepareForGetRawLattice: argmin cost, list of
  // the tokens within [best_cost; best_cost+lattice_beam], and whether
  // we've reached a final token.
  std::vector<std::pair<int32, CostType>> h_all_argmin_cost_;
  std::vector<std::vector<std::pair<int, float>>> h_all_final_tokens_list_;
  std::vector<bool> h_all_has_reached_final_;

  // Pinned memory arrays. Used for the DeviceToHost copies.
  float2 *h_extra_and_acoustic_cost_concat_,
      *d_extra_and_acoustic_cost_concat_;
  InfoToken *h_infotoken_concat_, *d_infotoken_concat_;
  CostType *h_acoustic_cost_concat_, *d_acoustic_cost_concat_;
  InfoToken *h_extra_prev_tokens_concat_, *d_extra_prev_tokens_concat_;
  // Second memory space, used for double buffering.
  float2 *h_extra_and_acoustic_cost_concat_tmp_;
  InfoToken *h_infotoken_concat_tmp_;
  CostType *h_acoustic_cost_concat_tmp_;
  InfoToken *h_extra_prev_tokens_concat_tmp_;

  // Offsets used in MoveConcatenatedCopyToVector.
  std::vector<int32> h_main_q_end_lane_offsets_;
  std::vector<int32> h_emitting_main_q_end_lane_offsets_;
  std::vector<int32> h_n_extra_prev_tokens_lane_offsets_;

  // Used when calling GetBestCost.
  std::vector<std::pair<int32, CostType>> argmins_;
  std::vector<bool> has_reached_final_;
  std::vector<std::vector<std::pair<int32, CostType>>>
      list_finals_token_idx_and_cost_;
  bool compute_max_active_;
  cudaEvent_t nnet3_done_evt_;
  cudaEvent_t d2h_copy_acoustic_evt_;
  cudaEvent_t d2h_copy_infotoken_evt_;
  cudaEvent_t d2h_copy_extra_prev_tokens_evt_;
  cudaEvent_t concatenated_data_ready_evt_;
  cudaEvent_t lane_offsets_ready_evt_;

  //
  // GetRawLattice helpers
  //

  // Data used when building the lattice in GetRawLattice, and a few
  // typedefs to make GetRawLattice easier to understand.
  //
  // GetLatticeStateInternalId returns a unique id for each
  // (iframe, fst_state) pair; we need to be able to quickly identify a
  // (iframe, fst_state) pair.
  //
  // A lattice state is defined by the pair (iframe, fst_state).
  // A token is associated with a lattice state (iframe, token.next_state).
  // Multiple tokens in the same frame can be associated with the same
  // lattice state (they all go to the same token.next_state).
  // We need to quickly identify the lattice state of a token. We are able
  // to do that through GetLatticeStateInternalId(token), which returns the
  // internal unique ID of the lattice state of a token.
  //
  // When we build the output lattice, we get a new lattice state in the
  // output lattice: output_lattice_state = fst_out->AddState(). We call
  // this one OutputLatticeState. The conversion between the two is done
  // through the maps [curr|prev]_f_raw_lattice_state_.
  typedef int32 LatticeStateInternalId;
  typedef StateId OutputLatticeState;
  typedef int32 TokenId;
  LatticeStateInternalId GetLatticeStateInternalId(int32 total_ntokens,
                                                   TokenId token_idx,
                                                   InfoToken token);

  // Keeps track of a variety of info about states in the lattice:
  // - token_extra_cost. A path going from the current lattice_state to the
  // end has an extra cost compared to the best path (which has an extra
  // cost of 0). token_extra_cost is the minimum of the extra_costs of all
  // paths going from the current lattice_state to the final frame.
  // - fst_lattice_state is the StateId of the lattice_state in fst_out
  // (i.e. in the output lattice). lattice_state is an internal state used
  // in GetRawLattice.
  // - is_state_closed is true if the token_extra_cost has been read by
  // another token.
  // It means that the token_extra_cost value has been used, and if we
  // modify token_extra_cost again, we may need to recompute the current
  // frame (so that everyone uses the latest token_extra_cost value).
  struct RawLatticeState {
    CostType token_extra_cost;
    OutputLatticeState fst_lattice_state;
    bool is_state_closed;
  };

  // extra_cost_min_delta_ is used in the must_replay_frame situation.
  // Please read the comments associated with must_replay_frame in
  // GetRawLattice to understand what it does.
  CostType extra_cost_min_delta_;
  ThreadPool *thread_pool_;
  std::vector<std::thread> cpu_dedicated_threads_;
  int32 n_threads_used_;
  std::vector<ChannelId> lanes2channels_todo_;
  std::atomic<int> n_acoustic_h2h_copies_todo_;
  std::atomic<int> n_extra_prev_tokens_h2h_copies_todo_;
  std::atomic<int> n_d2h_copies_ready_;
  std::atomic<int> n_infotoken_h2h_copies_todo_;
  int32 n_h2h_task_not_done_;
  int32 n_init_decoding_h2h_task_not_done_;
  std::atomic<int> n_h2h_main_task_todo_;
  std::mutex n_h2h_task_not_done_mutex_;
  std::mutex n_init_decoding_h2h_task_not_done_mutex_;
  std::mutex n_h2h_main_task_todo_mutex_;
  std::condition_variable n_h2h_main_task_todo_cv_;
  std::condition_variable h2h_done_;
  std::condition_variable init_decoding_h2h_done_;
  std::atomic<bool> active_wait_;
  bool h2h_threads_running_;

  // Using the output of GetBestCost, we add the best tokens (as selected in
  // GetBestCost) from the final frame to the output lattice. We also fill
  // the data structures (such as q_curr_frame_todo_ or
  // curr_f_raw_lattice_state_) accordingly.
  void AddFinalTokensToLattice(
      ChannelId ichannel,
      std::vector<std::pair<TokenId, InfoToken>> *q_curr_frame_todo,
      std::unordered_map<LatticeStateInternalId, RawLatticeState>
          *curr_f_raw_lattice_state,
      Lattice *fst_out);

  // Checks if a token should be added to the lattice. If it should,
  // keep_arc will be set to true.
  void ConsiderTokenForLattice(
      ChannelId ichannel, int32 iprev, int32 total_ntokens, TokenId token_idx,
      OutputLatticeState fst_lattice_start, InfoToken *tok_beg,
      float2 *arc_extra_cost_beg, CostType token_extra_cost,
      TokenId list_prev_token_idx, int32 list_arc_idx,
      InfoToken *list_prev_token, CostType *this_arc_prev_token_extra_cost,
      CostType *acoustic_cost, OutputLatticeState *lattice_src_state,
      bool *keep_arc, bool *dbg_found_zero);

  // Adds the arc to the lattice. Also updates what needs to be updated in
  // the GetRawLattice data structures.
  void AddArcToLattice(
      int32 list_arc_idx, TokenId list_prev_token_idx,
      InfoToken list_prev_token, int32 curr_frame_offset,
      CostType acoustic_cost, CostType this_arc_prev_token_extra_cost,
      LatticeStateInternalId src_state_internal_id,
      OutputLatticeState fst_lattice_start,
      OutputLatticeState to_fst_lattice_state,
      std::vector<std::pair<TokenId, InfoToken>> *q_curr_frame_todo,
      std::vector<std::pair<TokenId, InfoToken>> *q_prev_frame_todo,
      std::unordered_map<LatticeStateInternalId, RawLatticeState>
          *curr_f_raw_lattice_state,
      std::unordered_map<LatticeStateInternalId, RawLatticeState>
          *prev_f_raw_lattice_state,
      std::unordered_set<int32> *f_arc_idx_added, Lattice *fst_out,
      bool *must_replay_frame);

  // Reads a token's information.
  void GetTokenRawLatticeData(
      TokenId token_idx, InfoToken token, int32 total_ntokens,
      std::unordered_map<LatticeStateInternalId, RawLatticeState>
          *curr_f_raw_lattice_state,
      CostType *token_extra_cost, OutputLatticeState *to_fst_lattice_state);

  // A token is an instance of an arc. It goes to an FST state
  // (token.next_state). Multiple tokens in the same frame can go to the
  // same FST state. GetSameFSTStateTokenList returns that list.
  void GetSameFSTStateTokenList(ChannelId ichannel, InfoToken token,
                                InfoToken **tok_beg,
                                float2 **arc_extra_cost_beg, int32 *nprevs);

  // Swaps the data structures at the end of a frame. prev becomes curr (we
  // go backward).
  void SwapPrevAndCurrLatticeMap(
      int32 iframe, bool dbg_found_best_path,
      std::vector<std::pair<TokenId, InfoToken>> *q_curr_frame_todo,
      std::vector<std::pair<TokenId, InfoToken>> *q_prev_frame_todo,
      std::unordered_map<LatticeStateInternalId, RawLatticeState>
          *curr_f_raw_lattice_state,
      std::unordered_map<LatticeStateInternalId, RawLatticeState>
          *prev_f_raw_lattice_state,
      std::unordered_set<int32> *f_arc_idx_added);
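
  // Putting the helpers above together, the raw lattice is built by walking
  // the frames backward. A simplified sketch (the real loop in GetRawLattice
  // also handles must_replay_frame and the extra-prev-tokens lists):
  //
  //   AddFinalTokensToLattice(...);      // seed with the best final tokens
  //   for (int32 iframe = nframes - 1; iframe >= 0; --iframe) {
  //     // for each (token_idx, token) in *q_curr_frame_todo:
  //     GetTokenRawLatticeData(...);     // extra cost + output state
  //     GetSameFSTStateTokenList(...);   // tokens sharing token.next_state
  //     ConsiderTokenForLattice(...);    // sets keep_arc
  //     // if (keep_arc) AddArcToLattice(...);
  //     SwapPrevAndCurrLatticeMap(...);  // prev becomes curr, go back
  //   }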

  KALDI_DISALLOW_COPY_AND_ASSIGN(CudaDecoder);
};

}  // end namespace cuda_decoder
}  // end namespace kaldi

#endif  // KALDI_CUDA_DECODER_CUDA_DECODER_H_