Blame view

src/cudadecoder/cuda-fst.h 4.84 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
  // cudadecoder/cuda-fst.h
  //
  // Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
  // Hugo Braun, Justin Luitjens, Ryan Leary
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //     http://www.apache.org/licenses/LICENSE-2.0
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS,
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  // See the License for the specific language governing permissions and
  // limitations under the License.
  
  #ifndef KALDI_CUDA_DECODER_CUDA_FST_H_
  #define KALDI_CUDA_DECODER_CUDA_FST_H_
  #include "cudadecoder/cuda-decoder-common.h"
  #include "cudamatrix/cu-device.h"
  #include "lat/kaldi-lattice.h"
  #include "nnet3/decodable-online-looped.h"  // TransitionModel
  
  namespace kaldi {
  namespace cuda_decoder {
  
  // Short aliases for the OpenFst arc type used throughout the CUDA decoder,
  // together with its weight and label types.
  using StdArc = fst::StdArc;
  using StdWeight = StdArc::Weight;
  using Label = StdArc::Label;
  
  // FST stored in both device and host memory.
  // The OpenFst representation is converted to the Compressed Sparse Row (CSR)
  // matrix format:
  // https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)
  // where states = rows and arcs = columns.
  // This format allows us to store the FST in a compact form, and leads to clean
  // memory accesses.
  // For instance, when loading the arcs from a given source state, we can load
  // all arc information (destination, weight, etc.) with coalesced reads.
  // Emitting arcs and non-emitting arcs are stored as separate matrices for
  // efficiency.
  // We then copy the FST to the device (while keeping its original copy on host).
  class CudaFst {
   public:
    CudaFst()
        : d_e_offsets_(nullptr),
          d_ne_offsets_(nullptr),
          d_arc_weights_(nullptr),
          d_arc_nextstates_(nullptr),
          d_arc_pdf_ilabels_(nullptr),
          d_final_(nullptr){};
    // Creates a CSR representation of the FST,
    // then copies it to the GPU
    // If a TransitionModel is passed, we'll use it to convert the ilabels id
    // indexes into pdf indexes
    // If no TransitionModel is passed, we'll assume TransitionModel == identity
    // Important: The CudaDecodable won't apply the TransitionModel. If you use a
    // TransitionModel, you need to apply it now
    void Initialize(const fst::Fst<StdArc> &fst,
                    const TransitionModel *trans_model = NULL);
    void Finalize();
  
    inline uint32_t NumStates() const { return num_states_; }
    inline StateId Start() const { return start_; }
  
   private:
    friend class CudaDecoder;
    // Counts arcs and computes offsets of the fst passed in
    void ComputeOffsets(const fst::Fst<StdArc> &fst);
    // Allocates memory to store FST
    void AllocateData(const fst::Fst<StdArc> &fst);
    // Populate the arcs data (arc.destination, arc.weights, etc.)
    void PopulateArcs(const fst::Fst<StdArc> &fst);
    // Converting the id ilabels into pdf ilabels using the transition model
    // It allows the CudaDecoder to read the acoustic model loglikelihoods at the
    // right indexes
    void ApplyTransitionModelOnIlabels(const TransitionModel &trans_model);
    // Copies fst to device into the pre-allocated datastructures
    void CopyDataToDevice();
    // Total number of states
    unsigned int num_states_;
    // Starting state of the FST
    // Computation should start from state start_
    StateId start_;
    // Number of emitting, non-emitting, and total number of arcs
    unsigned int e_count_, ne_count_, arc_count_;
    // This data structure is similar to a CSR matrix format
    // with 2 offsets matrices (one emitting one non-emitting).
    // Offset arrays are num_states_+1 in size (last state needs
    // its +1 arc_offset)
    // Arc values for state i are stored in the range of [offset[i],offset[i+1][
    unsigned int *d_e_offsets_;  // Emitting offset arrays
    std::vector<unsigned int> h_e_offsets_;
    unsigned int *d_ne_offsets_;  // Non-emitting offset arrays
    std::vector<unsigned int> h_ne_offsets_;
    // These are the values for each arc.
    // Arcs belonging to state i are found in the range of [offsets[i],
    // offsets[i+1][
    // Use e_offsets or ne_offsets depending on what you need
    // (emitting/nonemitting)
    // The ilabels arrays are of size e_count_, not arc_count_
    std::vector<CostType> h_arc_weights_;
    CostType *d_arc_weights_;
    std::vector<StateId> h_arc_nextstate_;
    StateId *d_arc_nextstates_;
    std::vector<int32> h_arc_id_ilabels_;
    int32 *d_arc_pdf_ilabels_;
    std::vector<int32> h_arc_olabels_;
    // Final costs
    // final cost of state i is h_final_[i]
    std::vector<CostType> h_final_;
    CostType *d_final_;
  
    // ilabels (pdf indexing)
    // only populate during CSR generation, cleared after (not needed on host)
    std::vector<int32> h_arc_pdf_ilabels_;
  };
  
  }  // end namespace cuda_decoder
  }  // end namespace kaldi
  #endif  // KALDI_CUDA_DECODER_CUDA_FST_H_