cu-block-matrix.cc 11.2 KB
// cudamatrix/cu-block-matrix.cc

// Copyright 2013      Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#if HAVE_CUDA == 1
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
#endif

#include <algorithm>
#include "base/timer.h"
#include "cudamatrix/cu-block-matrix.h"
#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-device.h"

namespace kaldi {

template<class Real>
CuBlockMatrix<Real>::CuBlockMatrix() {
#if HAVE_CUDA == 1
  cu_data_ = NULL;
#endif
}

template<class Real>
CuBlockMatrix<Real>::CuBlockMatrix(const std::vector<CuMatrix<Real> >&data) {
#if HAVE_CUDA == 1
  cu_data_ = NULL;
#endif
  block_data_.resize(data.size());
  MatrixIndexT row_offset = 0, col_offset = 0, max_num_rows = 0;
  for (size_t b = 0; b < data.size(); b++) {
    MatrixIndexT num_rows = data[b].NumRows(), num_cols = data[b].NumCols();
    KALDI_ASSERT(num_rows > 0 && num_cols > 0);
    BlockMatrixData block_data;
    block_data.num_rows = num_rows;
    block_data.num_cols = num_cols;
    block_data.row_offset = row_offset;
    block_data.col_offset = col_offset;
    row_offset += num_rows;
    col_offset += num_cols;
    max_num_rows = std::max(max_num_rows, num_rows);
    block_data_[b] = block_data;
  }
  num_rows_ = row_offset;
  data_.Resize(max_num_rows, col_offset);
  for (int32 b = 0; b < NumBlocks(); b++)
    Block(b).CopyFromMat(data[b]);
  SetCudaData();
}


template<class Real>
const CuSubMatrix<Real> CuBlockMatrix<Real>::Block(int32 b) const {
  KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
  const BlockMatrixData &block_data = block_data_[b];
  return CuSubMatrix<Real>(data_, 0, block_data.num_rows,
                           block_data.col_offset, block_data.num_cols);
}

template<class Real>
CuSubMatrix<Real> CuBlockMatrix<Real>::Block(int32 b) {
  KALDI_ASSERT(static_cast<size_t>(b) < block_data_.size());
  BlockMatrixData &block_data = block_data_[b];
  return CuSubMatrix<Real>(data_, 0, block_data.num_rows,
                           block_data.col_offset, block_data.num_cols);
}


template<class Real>
CuBlockMatrix<Real>::CuBlockMatrix(const CuBlockMatrix<Real> &other):
    data_(other.data_), block_data_(other.block_data_), num_rows_(other.num_rows_) {
#if HAVE_CUDA == 1
  cu_data_ = NULL;
#endif
  SetCudaData();
}

template<class Real>
CuBlockMatrix<Real> &CuBlockMatrix<Real>::operator =(const CuBlockMatrix<Real> &other) {
  FreeCudaData();
  data_ = other.data_;
  block_data_ = other.block_data_;
  num_rows_ = other.num_rows_;
  SetCudaData();
  return *this;
}

template<class Real>
void CuBlockMatrix<Real>::FreeCudaData() {
#if HAVE_CUDA == 1
  if (cu_data_ != NULL) {
    if (CuDevice::Instantiate().Enabled()) {
      CuDevice::Instantiate().Free(cu_data_);
      cu_data_ = NULL;
    } else {
      KALDI_ERR << "CuBlockMatrix: you have CUDA data pointer but "
                << "no GPU is enabled: likely code error.";
    }
  }
#endif
}


template<class Real>
void CuBlockMatrix<Real>::SetCudaData() {
#if HAVE_CUDA == 1
  KALDI_ASSERT(cu_data_ == NULL);
  if (block_data_.size() == 0) return; // Nothing to do.
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;
    std::vector<CuBlockMatrixData> tmp_cu_data(NumBlocks());
    int32 row_offset = 0, col_offset = 0;
    for (size_t b = 0; b < NumBlocks(); b++) {
      CuSubMatrix<Real> this_mat = Block(b);
      CuBlockMatrixData &this_cu_data = tmp_cu_data[b];
      this_cu_data.row_offset = row_offset;
      this_cu_data.col_offset = col_offset;
      this_cu_data.matrix_dim = this_mat.Dim();
      this_cu_data.matrix_data = static_cast<void*>(this_mat.Data());
      row_offset += this_mat.NumRows();
      col_offset += this_mat.NumCols();
    }
    size_t size = NumBlocks() * sizeof(CuBlockMatrixData);
    cu_data_ = static_cast<CuBlockMatrixData*>(
        CuDevice::Instantiate().Malloc(size));
    CU_SAFE_CALL(cudaMemcpyAsync(cu_data_, &(tmp_cu_data[0]), size, 
                                 cudaMemcpyHostToDevice, cudaStreamPerThread));
    CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
    CuDevice::Instantiate().AccuProfile(__func__, tim);    
  }
#endif
}

template<class Real>
void CuBlockMatrix<Real>::Swap(CuBlockMatrix<Real> *other) {
  data_.Swap(&other->data_);
  block_data_.swap(other->block_data_);
  std::swap(num_rows_, other->num_rows_);
#if HAVE_CUDA == 1
  std::swap(cu_data_, other->cu_data_);
#endif
}

template<class Real>
void CuBlockMatrix<Real>::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<CuBlockMatrix>");
  int32 num_blocks = NumBlocks();
  WriteBasicType(os, binary, num_blocks);
  for (int32 b = 0; b < num_blocks; b++)
    this->Block(b).Write(os, binary);
  WriteToken(os, binary, "</CuBlockMatrix>");  
}


template<class Real>
void CuBlockMatrix<Real>::Read(std::istream &is, bool binary) {
  Destroy();
  int i = Peek(is, binary);
  std::vector<CuMatrix<Real> > data;
  if (i != static_cast<int>('<')) {
    // back-compatibility code so we can read the older format of
    // MixtureProbComponent.  This code should be deleted eventually.
    int32 size;
    ReadBasicType(is, binary, &size);
    KALDI_ASSERT(size >= 0);
    data.resize(size);
    for (int32 i = 0; i < size; i++)
      data[i].Read(is, binary);
  } else {
    ExpectToken(is, binary, "<CuBlockMatrix>");
    int32 size;
    ReadBasicType(is, binary, &size);
    KALDI_ASSERT(size >= 0);
    data.resize(size);
    for (int32 i = 0; i < size; i++)
      data[i].Read(is, binary);
    ExpectToken(is, binary, "</CuBlockMatrix>");    
  }

  CuBlockMatrix<Real> block_mat(data); // initializer from std::vector<CuMatrix<Real> > does
  // the main job of initialization.
  this->Swap(&block_mat);
}

template<class Real>
void CuBlockMatrix<Real>::Destroy() {
  data_.Resize(0, 0);
  block_data_.clear();
  num_rows_ = 0;  
  FreeCudaData();
}

// Does *this = alpha A B + beta * *this, discarding elements outside
// the block structure of the *this matrix. 
template<class Real>
void CuBlockMatrix<Real>::AddMatMat(
    BaseFloat alpha,
    const CuMatrix<Real> &A, MatrixTransposeType transA,
    const CuMatrix<Real> &B, MatrixTransposeType transB,
    BaseFloat beta) {
  MatrixIndexT A_num_rows = A.NumRows(), A_num_cols = A.NumCols(),
      A_row_stride = A.Stride(), A_col_stride = 1,
      B_num_rows = B.NumRows(), B_num_cols = B.NumCols(),
      B_row_stride = B.Stride(), B_col_stride = 1;
  if (transA == kTrans) {
    std::swap(A_num_rows, A_num_cols);
    std::swap(A_row_stride, A_col_stride);
  }
  if (transB == kTrans) {
    std::swap(B_num_rows, B_num_cols);
    std::swap(B_row_stride, B_col_stride);
  }
  KALDI_ASSERT(A_num_rows == NumRows() && B_num_cols == NumCols()
               && A_num_cols == B_num_rows);
  if (NumBlocks() == 0) return; // empty matrix.
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;

    // (x,y,z) dimensions are (block-id, row-of-block, col-of-block)
    // First some logic to choose block dims...
    // we assume (which we can, safely) that CU1DBLOCK is <= the max threads per block.
    int32 x_blocksize = std::min(CU1DBLOCK, NumBlocks()); // x dim corresponds to block-idx.
    int32 max_block_rows = MaxBlockRows(), max_block_cols = MaxBlockCols();
    int32 y_blocksize = max_block_rows;
    while (y_blocksize * x_blocksize > CU1DBLOCK || y_blocksize > CU2DBLOCK)
      y_blocksize--;
    int32 z_blocksize = max_block_cols;
    while (z_blocksize * x_blocksize * y_blocksize > CU1DBLOCK || z_blocksize > CU2DBLOCK)
      z_blocksize--;
    
    dim3 dimBlock(x_blocksize, y_blocksize, z_blocksize);
    dim3 dimGrid(n_blocks(NumBlocks(), x_blocksize),
                 n_blocks(max_block_rows, y_blocksize),
                 n_blocks(max_block_cols, z_blocksize));
    cuda_block_add_mat_mat(dimGrid, dimBlock, cu_data_, NumBlocks(),
                           A.Data(), A_num_cols, A_row_stride, A_col_stride,
                           B.Data(), B_row_stride, B_col_stride, alpha, beta);
    CU_SAFE_CALL(cudaGetLastError());    
    CuDevice::Instantiate().AccuProfile(__func__, tim);    
  } else
#endif
  {
    int32 row_offset = 0, col_offset = 0;    
    for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
      CuSubMatrix<Real> this_block = Block(b);
      MatrixIndexT this_num_rows = this_block.NumRows(),
          this_num_cols = this_block.NumCols();
      CuSubMatrix<Real> A_part = (transA == kNoTrans ?
                                  A.Range(row_offset, this_num_rows,
                                          0, A.NumCols()) :
                                  A.Range(0, A.NumRows(),
                                          row_offset, this_num_rows)),
          B_part = (transB == kNoTrans ?
                    B.Range(0, B.NumRows(),
                            col_offset, this_num_cols) :
                    B.Range(col_offset, this_num_cols,
                            0, B.NumCols()));
      this_block.AddMatMat(alpha, A_part, transA, B_part, transB, beta);
      row_offset += this_num_rows;
      col_offset += this_num_cols;
    }
    KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
  }
}

template<class Real>
MatrixIndexT CuBlockMatrix<Real>::MaxBlockCols() const {
  MatrixIndexT max_cols = 0;
  for (size_t i = 0; i < block_data_.size(); i++)
    max_cols = std::max(max_cols, block_data_[i].num_cols);
  return max_cols;
}

template<class Real>
MatrixIndexT CuBlockMatrix<Real>::MaxBlockRows() const {
  return data_.NumRows();
}

template<class Real>
void CuBlockMatrix<Real>::CopyFromMat(const CuMatrix<Real> &M) {
  KALDI_ASSERT(NumRows() == M.NumRows() && NumCols() == M.NumCols());
  MatrixIndexT row_offset = 0, col_offset = 0;
  for (MatrixIndexT b = 0; b < NumBlocks(); b++) {
    CuSubMatrix<Real> this_block = Block(b);
    MatrixIndexT this_num_rows = this_block.NumRows(),
        this_num_cols = this_block.NumCols();
    const CuSubMatrix<Real> src(M, row_offset, this_num_rows,
                                col_offset, this_num_cols);
    this_block.CopyFromMat(src);
    row_offset += this_num_rows;
    col_offset += this_num_cols;
  }
  KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
}

/**
 * Print the matrix to stream
 */
template<typename Real>
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<Real> &mat) {
  bool binary = false;
  mat.Write(out, binary);
  return out;
}
// instantiate the template
template
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<float> &mat);
template 
std::ostream &operator << (std::ostream &out, const CuBlockMatrix<double> &mat);

// Instantiate the class for float and double.
template class CuBlockMatrix<float>;
template class CuBlockMatrix<double>;

} // namespace kaldi