Yannick Estève / ONTRAC-Kaldi

Blame view

src/cudamatrix/cu-rand.cc 7.92 KB
  // cudamatrix/cu-rand.cc
  
  // Copyright 2016-2017  Brno University of Technology (author Karel Vesely)
  
  // See ../../COPYING for clarification regarding multiple authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //  http://www.apache.org/licenses/LICENSE-2.0
  //
  // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  // MERCHANTABLITY OR NON-INFRINGEMENT.
  // See the Apache 2 License for the specific language governing permissions and
  // limitations under the License.
  
  #include "cudamatrix/cu-rand.h"
  
  namespace kaldi {
  
  #if HAVE_CUDA == 1
  /// Wrappers of curand functions to interface both float and double as 1 function,
  
  /// Wrapper of curandGenerateUniform(), curandGenerateUniformDouble(),
  template<typename Real>
  curandStatus_t curandGenerateUniformWrap(curandGenerator_t gen, Real *ptr, size_t num);
  //
  template<>
  curandStatus_t curandGenerateUniformWrap(curandGenerator_t gen, float *ptr, size_t num) {
    return curandGenerateUniform(gen, ptr, num);
  }
  template<>
  curandStatus_t curandGenerateUniformWrap(curandGenerator_t gen, double *ptr, size_t num) {
    return curandGenerateUniformDouble(gen, ptr, num);
  }
  
  /// Wrapper of curandGenerateNormal(), curandGenerateNormalDouble(),
  template<typename Real>
  curandStatus_t curandGenerateNormalWrap(
      curandGenerator_t gen, Real *ptr, size_t num);
  //
  template<>
  curandStatus_t curandGenerateNormalWrap<float>(
      curandGenerator_t gen, float *ptr, size_t num) {
    return curandGenerateNormal(gen, ptr, num, 0.0 /*mean*/, 1.0 /*stddev*/);
  }
  template<>
  curandStatus_t curandGenerateNormalWrap<double>(
      curandGenerator_t gen, double *ptr, size_t num) {
    return curandGenerateNormalDouble(gen, ptr, num, 0.0 /*mean*/, 1.0 /*stddev*/);
  }
  /// End of wrappers.
  #endif
  
  
  template<typename Real>
  void CuRand<Real>::RandUniform(CuMatrixBase<Real> *tgt) {
  #if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      CuTimer tim;
      // Better use 'tmp' matrix, 'tgt' can be a window into a larger matrix,
      // so we should not use it to generate random numbers over whole stride.
      // Use the option kStrideEqualNumCols to ensure consistency
      // (because when memory is nearly exhausted, the stride of CudaMallocPitch
      // may vary).
      CuMatrix<Real> tmp(tgt->NumRows(), tgt->NumCols(), kUndefined,
                         kStrideEqualNumCols);
      size_t s = static_cast<size_t>(tmp.NumRows()) * static_cast<size_t>(tmp.Stride());
      CURAND_SAFE_CALL(curandGenerateUniformWrap(
            GetCurandHandle(), tmp.Data(), s));
      tgt->CopyFromMat(tmp);
      CuDevice::Instantiate().AccuProfile(__func__, tim);
    } else
  #endif
    {
      tgt->Mat().SetRandUniform();
    }
  }
  
  template<typename Real>
  void CuRand<Real>::RandUniform(CuMatrix<Real> *tgt) {
  #if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      CuTimer tim;
      // Here we don't need to use 'tmp' matrix,
      size_t s = static_cast<size_t>(tgt->NumRows()) * static_cast<size_t>(tgt->Stride());
      CURAND_SAFE_CALL(curandGenerateUniformWrap(
            GetCurandHandle(), tgt->Data(), s));
      CuDevice::Instantiate().AccuProfile(__func__, tim);
    } else
  #endif
    {
      tgt->Mat().SetRandUniform();
    }
  }
  
  template<typename Real>
  void CuRand<Real>::RandUniform(CuVectorBase<Real> *tgt) {
  #if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      CuTimer tim;
      CURAND_SAFE_CALL(curandGenerateUniformWrap(
            GetCurandHandle(), tgt->Data(), tgt->Dim()));
      CuDevice::Instantiate().AccuProfile(__func__, tim);
    } else
  #endif
    {
      tgt->Vec().SetRandUniform();
    }
  }
  
  template<typename Real>
  void CuRand<Real>::RandGaussian(CuMatrixBase<Real> *tgt) {
  #if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      CuTimer tim;
      // Better use 'tmp' matrix, 'tgt' can be a window into a larger matrix,
      // so we should not use it to generate random numbers over whole stride.
      // Also, we ensure to have 'even' number of elements for calling 'curand'
      // by possibly adding one column. Even number of elements is required by
      // curandGenerateUniform(), curandGenerateUniformDouble().
      // Use the option kStrideEqualNumCols to ensure consistency
      // (because when memory is nearly exhausted, the stride of CudaMallocPitch
      // may vary).
      MatrixIndexT num_cols_even = tgt->NumCols() + (tgt->NumCols() % 2); // + 0 or 1,
      CuMatrix<Real> tmp(tgt->NumRows(), num_cols_even, kUndefined,
                         kStrideEqualNumCols);
      CURAND_SAFE_CALL(curandGenerateNormalWrap(
            GetCurandHandle(), tmp.Data(), tmp.NumRows()*tmp.Stride()));
      tgt->CopyFromMat(tmp.ColRange(0,tgt->NumCols()));
      CuDevice::Instantiate().AccuProfile(__func__, tim);
    } else
  #endif
    {
      tgt->Mat().SetRandn();
    }
  }
  
  template<typename Real>
  void CuRand<Real>::RandGaussian(CuMatrix<Real> *tgt) {
  #if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      CuTimer tim;
      // Here we don't need to use 'tmp' matrix, if the number of elements is even,
      MatrixIndexT num_elements = tgt->NumRows() * tgt->Stride();
      if (0 == (num_elements % 2)) {
        CURAND_SAFE_CALL(curandGenerateNormalWrap(
              GetCurandHandle(), tgt->Data(), num_elements));
      } else {
        // We use 'tmp' matrix with one column added, this guarantees an even
        // number of elements.  Use the option kStrideEqualNumCols to ensure
        // consistency (because when memory is nearly exhausted, the stride of
        // CudaMallocPitch may vary).
        MatrixIndexT num_cols_even = tgt->NumCols() + (tgt->NumCols() % 2); // + 0 or 1,
        CuMatrix<Real> tmp(tgt->NumRows(), num_cols_even, kUndefined,
                           kStrideEqualNumCols);
        CURAND_SAFE_CALL(curandGenerateNormalWrap(
              GetCurandHandle(), tmp.Data(), tmp.NumRows() * tmp.Stride()));
        tgt->CopyFromMat(tmp.ColRange(0,tgt->NumCols()));
      }
      CuDevice::Instantiate().AccuProfile(__func__, tim);
    } else
  #endif
    {
      tgt->Mat().SetRandn();
    }
  }
  
  template<typename Real>
  void CuRand<Real>::RandGaussian(CuVectorBase<Real> *tgt) {
  #if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      CuTimer tim;
      // To ensure 'even' number of elements, we use 'tmp' vector of even length.
      // Even number of elements is required by 'curand' functions:
      // curandGenerateUniform(), curandGenerateUniformDouble().
      MatrixIndexT num_elements = tgt->Dim();
      if (0 == (num_elements % 2)) {
        CURAND_SAFE_CALL(curandGenerateNormalWrap(
              GetCurandHandle(), tgt->Data(), tgt->Dim()));
      } else {
        MatrixIndexT dim_even = tgt->Dim() + (tgt->Dim() % 2); // + 0 or 1,
        CuVector<Real> tmp(dim_even, kUndefined);
        CURAND_SAFE_CALL(curandGenerateNormalWrap(
              GetCurandHandle(), tmp.Data(), tmp.Dim()));
        tgt->CopyFromVec(tmp.Range(0,tgt->Dim()));
      }
      CuDevice::Instantiate().AccuProfile(__func__, tim);
    } else
  #endif
    {
      tgt->Vec().SetRandn();
    }
  }
  
  /// convert probabilities binary values,
  template<typename Real>
  void CuRand<Real>::BinarizeProbs(const CuMatrix<Real> &probs, CuMatrix<Real> *states) {
    CuMatrix<Real> tmp(probs.NumRows(), probs.NumCols());
    this->RandUniform(&tmp);  // [0..1]
    tmp.Scale(-1.0);  // [-1..0]
    tmp.AddMat(1.0, probs);  // [-1..+1]
    states->Heaviside(tmp);  // negative
  }
  
  /// add gaussian noise to each element
  template<typename Real>
  void CuRand<Real>::AddGaussNoise(CuMatrix<Real> *tgt, Real gscale) {
    // Use the option kStrideEqualNumCols to ensure consistency (because when
    // memory is nearly exhausted, the stride of CudaMallocPitch may vary).
    CuMatrix<Real> tmp(tgt->NumRows(), tgt->NumCols(),
                       kUndefined, kStrideEqualNumCols);
    this->RandGaussian(&tmp);
    tgt->AddMat(gscale, tmp);
  }
  
  // explicit instantiation,
  template class CuRand<float>;
  template class CuRand<double>;
  
  }  // namespace,