Yannick Estève / ONTRAC-Kaldi

Blame view

src/cudamatrix/cu-sp-matrix.cc 7.64 KB
  // cudamatrix/cu-sp-matrix.cc
  
  // Copyright      2013  Karel Vesely
  //           2014-2015  Johns Hopkins University (author: Daniel Povey)
  
  // See ../../COPYING for clarification regarding multiple authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //  http://www.apache.org/licenses/LICENSE-2.0
  //
  // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  // MERCHANTABLITY OR NON-INFRINGEMENT.
  // See the Apache 2 License for the specific language governing permissions and
  // limitations under the License.
  
  #if HAVE_CUDA == 1
  #include <cuda_runtime_api.h>
  #include <cublas_v2.h>
  #endif
  
  #include "base/timer.h"
  #include "cudamatrix/cu-common.h"
  #include "cudamatrix/cu-vector.h"
  #include "cudamatrix/cu-device.h"
  #include "cudamatrix/cu-kernels.h"
  #include "cudamatrix/cu-math.h"
  #include "cudamatrix/cu-sp-matrix.h"
  #include "cudamatrix/cu-matrix.h"
  #include "cudamatrix/cublas-wrappers.h"
  
  namespace kaldi {
  
  template<typename Real>
  void CuSpMatrix<Real>::CopyFromMat(const CuMatrixBase<Real> &M,
                                     SpCopyType copy_type) {
    KALDI_ASSERT(this->num_rows_ == M.NumRows() &&
                 this->num_rows_ == M.NumCols());
    if (this->num_rows_ == 0)
      return;
  #if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      CuTimer tim;
      MatrixIndexT D = this->NumRows();
      if (D == 0)
        return;
      switch (copy_type) {
        case kTakeMeanAndCheck:
          KALDI_ERR << "kTakeMeanAndCheck not supported!";
        // The grid/block dimensions have been very roughly tuned for the
        // individual cases.
        case kTakeMean:
          {
            dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
            dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK));
            cuda_take_mean(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
            CU_SAFE_CALL(cudaGetLastError());
          }
          break;
        case kTakeLower:
          {
            int32 block_size = std::min(CU1DBLOCK, this->num_rows_);
            dim3 dimBlock(1, block_size);
            dim3 dimGrid(D, n_blocks(D, block_size));
            cuda_take_lower(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
            CU_SAFE_CALL(cudaGetLastError());
          }
          break;
        case kTakeUpper:
          {
            dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
            dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK));
            cuda_take_upper(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
            CU_SAFE_CALL(cudaGetLastError());
          }
          break;
        default:
          KALDI_ASSERT("Invalid argument to CuSpMatrix::CopyFromMat");
      }
      CuDevice::Instantiate().AccuProfile("CuSpMatrix::CopyFromMat(from CuMatrixBase)", tim);
    } else
  #endif
    {
      Mat().CopyFromMat(M.Mat(), copy_type);
    }
  }
  
  template<typename Real>
  void CuSpMatrix<Real>::Invert() {
  #if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      CuMatrix<Real> mat(this->num_rows_, this->num_rows_);
      mat.CopyFromSp(*this);
      mat.SymInvertPosDef();
      this->CopyFromMat(mat);
    } else
  #endif
    { // Use inversion of CPU-based SpMatrix.
      Mat().Invert();
    }
  }
  
  template<typename Real>
  void CuSpMatrix<Real>::AddVec2(const Real alpha, const CuVectorBase<Real> &v) {
    KALDI_ASSERT(v.Dim() == this->NumRows());
  #if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      if (this->num_rows_ == 0) return;
      CuTimer tim;
      size_t nr = this->num_rows_;
      dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
      dim3 dimGrid(n_blocks(nr, CU2DBLOCK), n_blocks(nr, CU2DBLOCK));
  
      CUBLAS_SAFE_CALL(cublas_spr(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, this->num_rows_, alpha, v.Data(),
                                  1, this->Data()));
      
      CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddVec2", tim);
    } else
  #endif
    {
      Mat().AddVec2(alpha, v.Vec());
    }
  }
  
  template<typename Real>
  void CuSpMatrix<Real>::AddMat2(const Real alpha, const CuMatrixBase<Real> &M,
                                 MatrixTransposeType transM, const Real beta) {
    KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows())
                 || (transM == kTrans && this->NumRows() == M.NumCols()));
  
  #if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      if (this->num_rows_ == 0) return;
      CuTimer tim;
      MatrixIndexT this_dim = this->NumRows(),
          m_other_dim = (transM == kNoTrans ? M.NumCols() : M.NumRows());
  
      if (this_dim == 0) return;
      if (alpha == 0.0) {
        if (beta != 1.0) this->Scale(beta);
        return;
      }
  
      cublasOperation_t trans = (transM == kTrans ? CUBLAS_OP_N : CUBLAS_OP_T);
  
      CuMatrix<Real> tmp_mat(*this);
      cublas_syrk(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, trans, this_dim, m_other_dim, alpha, M.Data(),
                  M.Stride(), beta, tmp_mat.Data(), tmp_mat.Stride());
      this->CopyFromMat(tmp_mat, kTakeLower);
      
      CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddMat2", tim);
    } else
  #endif
    {
      Mat().AddMat2(alpha, M.Mat(), transM, beta);
    }
  }
  
  /**
   * C++ templatd wrapper of ANSI-C CUBLAS function GEMM (matrix multiply)
   */
  
  template<typename Real, typename OtherReal>
  Real TraceSpSp(const CuSpMatrix<Real> &A, const CuSpMatrix<OtherReal> &B) {
    KALDI_ASSERT(A.NumRows() == B.NumRows());
  #if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      if (A.NumRows() == 0) return 0.0;
      MatrixIndexT nr = A.NumRows(), size = nr * (nr+1) / 2;
      CuVector<Real> Adiag(nr, kUndefined);
      CuVector<OtherReal> Bdiag(nr, kUndefined);
      Adiag.CopyDiagFromPacked(A);
      Bdiag.CopyDiagFromPacked(B);
      CuSubVector<Real> Aall(A.Data(), size);
      CuSubVector<OtherReal> Ball(B.Data(), size);
      // Below, we subtrace VecVec(Adiag, Bdiag) to remove double-counting
      // on the diagonal.
      return 2.0 * VecVec(Aall, Ball) - VecVec(Adiag, Bdiag);
    } else
  #endif
    {
      return TraceSpSp(A.Mat(), B.Mat());
    }
  }
  template
  float TraceSpSp(const CuSpMatrix<float> &A, const CuSpMatrix<float> &B);
  template
  float TraceSpSp(const CuSpMatrix<float> &A, const CuSpMatrix<double> &B);
  template
  double TraceSpSp(const CuSpMatrix<double> &A, const CuSpMatrix<float> &B);
  template
  double TraceSpSp(const CuSpMatrix<double> &A, const CuSpMatrix<double> &B);
  
  
  template<typename Real>
  bool CuSpMatrix<Real>::ApproxEqual(const CuSpMatrix<Real> &B, Real tol) const {
    KALDI_ASSERT(this->NumRows() == B.NumRows());
    CuSpMatrix<Real> diff(*this);
    diff.AddSp(-1.0, B);
    Real a = this->FrobeniusNorm(), b = B.FrobeniusNorm(),
        d = diff.FrobeniusNorm();
    return (d <= tol * std::max(a, b));
  }
  
  template<typename Real>
  bool CuSpMatrix<Real>::IsUnit(Real tol) const {
    // want to return:
    //FrobeniusNorm(*this - I) <= tol * NumRows(), i.e.:
    //sqrt (trace((*this - I)(*this-I)) <= tol * NumRows()
    //    trace((*this - I)(*this - I)) <= tol * NumRows()
    // trace(*this * *this) + trace(I) - 2 * trace(*this) <= tol * NumRows()
    // trace(*this * *this) + dim - 2*this.Trace() <= tol * NumRows()
  
    // Note: we could do this more efficiently still, by slightly changing the
    // definition of IsUnit and getting rid of the extra stuff inside TraceSpSp
    // that corrects for the diagonal being counted twice.
    return (TraceSpSp(*this, *this) + this->NumRows() - 2.0 * this->Trace() <=
            tol * this->NumRows());
  }
  
  template <class Real>
  CuSpMatrix<Real>& CuSpMatrix<Real>::operator = (const CuSpMatrix<Real> &in) {
    this->Resize(in.NumRows(), kUndefined);
    this->CopyFromPacked(in);
    return *this;
  }
  
  template class CuSpMatrix<float>;
  template class CuSpMatrix<double>;
  
  
  
  } // namespace