// cudadecoder/cuda-fst.h // // Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. // Hugo Braun, Justin Luitjens, Ryan Leary // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef KALDI_CUDA_DECODER_CUDA_FST_H_ #define KALDI_CUDA_DECODER_CUDA_FST_H_ #include "cudadecoder/cuda-decoder-common.h" #include "cudamatrix/cu-device.h" #include "lat/kaldi-lattice.h" #include "nnet3/decodable-online-looped.h" // TransitionModel namespace kaldi { namespace cuda_decoder { typedef fst::StdArc StdArc; typedef StdArc::Weight StdWeight; typedef StdArc::Label Label; // FST in both device and host memory // Converting the OpenFst format to the CSR Compressed Sparse Row (CSR) Matrix // format. // https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format) // Where states = rows and arcs = columns. // This format allows us to store the FST in a compact form, and leads to clean // memory accesses // For instance, when loading the arcs from a given source, we can load all arc // informations (destination, weight, etc.) with coalesced reads // Emitting arcs and non-emitting arcs are stored as separate matrices for // efficiency // We then copy the FST to the device (while keeping its original copy on host) class CudaFst { public: CudaFst() : d_e_offsets_(nullptr), d_ne_offsets_(nullptr), d_arc_weights_(nullptr), d_arc_nextstates_(nullptr), d_arc_pdf_ilabels_(nullptr), d_final_(nullptr){}; // Creates a CSR representation of the FST, // then copies it to the GPU // If a TransitionModel is passed, we'll use it to convert the ilabels id // indexes into pdf indexes // If no TransitionModel is passed, we'll assume TransitionModel == identity // Important: The CudaDecodable won't apply the TransitionModel. If you use a // TransitionModel, you need to apply it now void Initialize(const fst::Fst &fst, const TransitionModel *trans_model = NULL); void Finalize(); inline uint32_t NumStates() const { return num_states_; } inline StateId Start() const { return start_; } private: friend class CudaDecoder; // Counts arcs and computes offsets of the fst passed in void ComputeOffsets(const fst::Fst &fst); // Allocates memory to store FST void AllocateData(const fst::Fst &fst); // Populate the arcs data (arc.destination, arc.weights, etc.) void PopulateArcs(const fst::Fst &fst); // Converting the id ilabels into pdf ilabels using the transition model // It allows the CudaDecoder to read the acoustic model loglikelihoods at the // right indexes void ApplyTransitionModelOnIlabels(const TransitionModel &trans_model); // Copies fst to device into the pre-allocated datastructures void CopyDataToDevice(); // Total number of states unsigned int num_states_; // Starting state of the FST // Computation should start from state start_ StateId start_; // Number of emitting, non-emitting, and total number of arcs unsigned int e_count_, ne_count_, arc_count_; // This data structure is similar to a CSR matrix format // with 2 offsets matrices (one emitting one non-emitting). // Offset arrays are num_states_+1 in size (last state needs // its +1 arc_offset) // Arc values for state i are stored in the range of [offset[i],offset[i+1][ unsigned int *d_e_offsets_; // Emitting offset arrays std::vector h_e_offsets_; unsigned int *d_ne_offsets_; // Non-emitting offset arrays std::vector h_ne_offsets_; // These are the values for each arc. // Arcs belonging to state i are found in the range of [offsets[i], // offsets[i+1][ // Use e_offsets or ne_offsets depending on what you need // (emitting/nonemitting) // The ilabels arrays are of size e_count_, not arc_count_ std::vector h_arc_weights_; CostType *d_arc_weights_; std::vector h_arc_nextstate_; StateId *d_arc_nextstates_; std::vector h_arc_id_ilabels_; int32 *d_arc_pdf_ilabels_; std::vector h_arc_olabels_; // Final costs // final cost of state i is h_final_[i] std::vector h_final_; CostType *d_final_; // ilabels (pdf indexing) // only populate during CSR generation, cleared after (not needed on host) std::vector h_arc_pdf_ilabels_; }; } // end namespace cuda_decoder } // end namespace kaldi #endif // KALDI_CUDA_DECODER_CUDA_FST_H_