Blame view
src/cudadecoder/cuda-fst.h
4.84 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
// cudadecoder/cuda-fst.h // // Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. // Hugo Braun, Justin Luitjens, Ryan Leary // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef KALDI_CUDA_DECODER_CUDA_FST_H_ #define KALDI_CUDA_DECODER_CUDA_FST_H_ #include "cudadecoder/cuda-decoder-common.h" #include "cudamatrix/cu-device.h" #include "lat/kaldi-lattice.h" #include "nnet3/decodable-online-looped.h" // TransitionModel namespace kaldi { namespace cuda_decoder { typedef fst::StdArc StdArc; typedef StdArc::Weight StdWeight; typedef StdArc::Label Label; // FST in both device and host memory // Converting the OpenFst format to the CSR Compressed Sparse Row (CSR) Matrix // format. // https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format) // Where states = rows and arcs = columns. // This format allows us to store the FST in a compact form, and leads to clean // memory accesses // For instance, when loading the arcs from a given source, we can load all arc // informations (destination, weight, etc.) with coalesced reads // Emitting arcs and non-emitting arcs are stored as separate matrices for // efficiency // We then copy the FST to the device (while keeping its original copy on host) class CudaFst { public: CudaFst() : d_e_offsets_(nullptr), d_ne_offsets_(nullptr), d_arc_weights_(nullptr), d_arc_nextstates_(nullptr), d_arc_pdf_ilabels_(nullptr), d_final_(nullptr){}; // Creates a CSR representation of the FST, // then copies it to the GPU // If a TransitionModel is passed, we'll use it to convert the ilabels id // indexes into pdf indexes // If no TransitionModel is passed, we'll assume TransitionModel == identity // Important: The CudaDecodable won't apply the TransitionModel. If you use a // TransitionModel, you need to apply it now void Initialize(const fst::Fst<StdArc> &fst, const TransitionModel *trans_model = NULL); void Finalize(); inline uint32_t NumStates() const { return num_states_; } inline StateId Start() const { return start_; } private: friend class CudaDecoder; // Counts arcs and computes offsets of the fst passed in void ComputeOffsets(const fst::Fst<StdArc> &fst); // Allocates memory to store FST void AllocateData(const fst::Fst<StdArc> &fst); // Populate the arcs data (arc.destination, arc.weights, etc.) void PopulateArcs(const fst::Fst<StdArc> &fst); // Converting the id ilabels into pdf ilabels using the transition model // It allows the CudaDecoder to read the acoustic model loglikelihoods at the // right indexes void ApplyTransitionModelOnIlabels(const TransitionModel &trans_model); // Copies fst to device into the pre-allocated datastructures void CopyDataToDevice(); // Total number of states unsigned int num_states_; // Starting state of the FST // Computation should start from state start_ StateId start_; // Number of emitting, non-emitting, and total number of arcs unsigned int e_count_, ne_count_, arc_count_; // This data structure is similar to a CSR matrix format // with 2 offsets matrices (one emitting one non-emitting). // Offset arrays are num_states_+1 in size (last state needs // its +1 arc_offset) // Arc values for state i are stored in the range of [offset[i],offset[i+1][ unsigned int *d_e_offsets_; // Emitting offset arrays std::vector<unsigned int> h_e_offsets_; unsigned int *d_ne_offsets_; // Non-emitting offset arrays std::vector<unsigned int> h_ne_offsets_; // These are the values for each arc. // Arcs belonging to state i are found in the range of [offsets[i], // offsets[i+1][ // Use e_offsets or ne_offsets depending on what you need // (emitting/nonemitting) // The ilabels arrays are of size e_count_, not arc_count_ std::vector<CostType> h_arc_weights_; CostType *d_arc_weights_; std::vector<StateId> h_arc_nextstate_; StateId *d_arc_nextstates_; std::vector<int32> h_arc_id_ilabels_; int32 *d_arc_pdf_ilabels_; std::vector<int32> h_arc_olabels_; // Final costs // final cost of state i is h_final_[i] std::vector<CostType> h_final_; CostType *d_final_; // ilabels (pdf indexing) // only populate during CSR generation, cleared after (not needed on host) std::vector<int32> h_arc_pdf_ilabels_; }; } // end namespace cuda_decoder } // end namespace kaldi #endif // KALDI_CUDA_DECODER_CUDA_FST_H_ |