// cudamatrix/cu-rand.cc

// Copyright 2016-2017  Brno University of Technology (author Karel Vesely)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "cudamatrix/cu-rand.h"

namespace kaldi {

#if HAVE_CUDA == 1
/// Wrappers of curand functions to interface both float and double as 1 function,

/// Wrapper of curandGenerateUniform(), curandGenerateUniformDouble(),
template<typename Real>
curandStatus_t curandGenerateUniformWrap(curandGenerator_t gen, Real *ptr,
                                         size_t num);
//
template<>
curandStatus_t curandGenerateUniformWrap(curandGenerator_t gen, float *ptr,
                                         size_t num) {
  return curandGenerateUniform(gen, ptr, num);
}
template<>
curandStatus_t curandGenerateUniformWrap(curandGenerator_t gen, double *ptr,
                                         size_t num) {
  return curandGenerateUniformDouble(gen, ptr, num);
}

/// Wrapper of curandGenerateNormal(), curandGenerateNormalDouble(),
template<typename Real>
curandStatus_t curandGenerateNormalWrap(
    curandGenerator_t gen, Real *ptr, size_t num);
//
template<>
curandStatus_t curandGenerateNormalWrap(
    curandGenerator_t gen, float *ptr, size_t num) {
  return curandGenerateNormal(gen, ptr, num, 0.0 /*mean*/, 1.0 /*stddev*/);
}
template<>
curandStatus_t curandGenerateNormalWrap(
    curandGenerator_t gen, double *ptr, size_t num) {
  return curandGenerateNormalDouble(gen, ptr, num, 0.0 /*mean*/, 1.0 /*stddev*/);
}
/// End of wrappers.
#endif
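// A minimal usage sketch of the wrappers above (illustrative comment only,
// not compiled here; 'gen' is assumed to be a curandGenerator_t already
// created with curandCreateGenerator(), and 'd_ptr' a device pointer holding
// 'num' elements):
//
//   float *d_ptr;   // device memory from cudaMalloc(),
//   size_t num = 1024;
//   CURAND_SAFE_CALL(curandGenerateUniformWrap(gen, d_ptr, num));
//
// The same call compiles unchanged for 'double *', which is the point of
// hiding the type-suffixed curand API behind one templated name.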
template<typename Real>
void CuRand<Real>::RandUniform(CuMatrixBase<Real> *tgt) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;
    // Better to use a 'tmp' matrix: 'tgt' can be a window into a larger
    // matrix, so we should not generate random numbers over its whole stride.
    // Use the option kStrideEqualNumCols to ensure consistency
    // (because when memory is nearly exhausted, the stride of CudaMallocPitch
    // may vary).
    CuMatrix<Real> tmp(tgt->NumRows(), tgt->NumCols(), kUndefined,
                       kStrideEqualNumCols);
    size_t s = static_cast<size_t>(tmp.NumRows()) *
               static_cast<size_t>(tmp.Stride());
    CURAND_SAFE_CALL(curandGenerateUniformWrap(
          GetCurandHandle(), tmp.Data(), s));
    tgt->CopyFromMat(tmp);
    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    tgt->Mat().SetRandUniform();
  }
}

template<typename Real>
void CuRand<Real>::RandUniform(CuMatrix<Real> *tgt) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;
    // Here we don't need a 'tmp' matrix: 'tgt' owns its whole stride,
    size_t s = static_cast<size_t>(tgt->NumRows()) *
               static_cast<size_t>(tgt->Stride());
    CURAND_SAFE_CALL(curandGenerateUniformWrap(
          GetCurandHandle(), tgt->Data(), s));
    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    tgt->Mat().SetRandUniform();
  }
}

template<typename Real>
void CuRand<Real>::RandUniform(CuVectorBase<Real> *tgt) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;
    CURAND_SAFE_CALL(curandGenerateUniformWrap(
          GetCurandHandle(), tgt->Data(), tgt->Dim()));
    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    tgt->Vec().SetRandUniform();
  }
}

template<typename Real>
void CuRand<Real>::RandGaussian(CuMatrixBase<Real> *tgt) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;
    // Better to use a 'tmp' matrix: 'tgt' can be a window into a larger
    // matrix, so we should not generate random numbers over its whole stride.
    // Also, we ensure an 'even' number of elements for calling 'curand' by
    // possibly adding one column. An even number of elements is required by
    // curandGenerateNormal(), curandGenerateNormalDouble().
    // Use the option kStrideEqualNumCols to ensure consistency
    // (because when memory is nearly exhausted, the stride of CudaMallocPitch
    // may vary).
    MatrixIndexT num_cols_even = tgt->NumCols() + (tgt->NumCols() % 2);  // + 0 or 1,
    CuMatrix<Real> tmp(tgt->NumRows(), num_cols_even, kUndefined,
                       kStrideEqualNumCols);
    CURAND_SAFE_CALL(curandGenerateNormalWrap(
          GetCurandHandle(), tmp.Data(), tmp.NumRows() * tmp.Stride()));
    tgt->CopyFromMat(tmp.ColRange(0, tgt->NumCols()));
    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    tgt->Mat().SetRandn();
  }
}

template<typename Real>
void CuRand<Real>::RandGaussian(CuMatrix<Real> *tgt) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;
    // Here we don't need a 'tmp' matrix, as long as the number of elements
    // is even,
    MatrixIndexT num_elements = tgt->NumRows() * tgt->Stride();
    if (0 == (num_elements % 2)) {
      CURAND_SAFE_CALL(curandGenerateNormalWrap(
            GetCurandHandle(), tgt->Data(), num_elements));
    } else {
      // We use a 'tmp' matrix with one column added, which guarantees an even
      // number of elements. Use the option kStrideEqualNumCols to ensure
      // consistency (because when memory is nearly exhausted, the stride of
      // CudaMallocPitch may vary).
      MatrixIndexT num_cols_even = tgt->NumCols() + (tgt->NumCols() % 2);  // + 0 or 1,
      CuMatrix<Real> tmp(tgt->NumRows(), num_cols_even, kUndefined,
                         kStrideEqualNumCols);
      CURAND_SAFE_CALL(curandGenerateNormalWrap(
            GetCurandHandle(), tmp.Data(), tmp.NumRows() * tmp.Stride()));
      tgt->CopyFromMat(tmp.ColRange(0, tgt->NumCols()));
    }
    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    tgt->Mat().SetRandn();
  }
}
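// Sketch of the even-length padding used above (a hypothetical standalone
// example in comment form, not part of this file's control flow; 'gen' is
// assumed to be an initialized curandGenerator_t):
//
//   size_t n = 5;                  // odd number of values wanted,
//   size_t n_even = n + (n % 2);   // pad to 6; curand needs an even count,
//   float *buf;
//   cudaMalloc(&buf, n_even * sizeof(float));
//   curandGenerateNormal(gen, buf, n_even, 0.0f, 1.0f);
//   // ... then consume only the first n values, as the functions above do
//   // via ColRange() / Range().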
template<typename Real>
void CuRand<Real>::RandGaussian(CuVectorBase<Real> *tgt) {
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    CuTimer tim;
    // To ensure an 'even' number of elements, we use a 'tmp' vector of even
    // length. An even number of elements is required by the 'curand'
    // functions curandGenerateNormal(), curandGenerateNormalDouble().
    MatrixIndexT num_elements = tgt->Dim();
    if (0 == (num_elements % 2)) {
      CURAND_SAFE_CALL(curandGenerateNormalWrap(
            GetCurandHandle(), tgt->Data(), tgt->Dim()));
    } else {
      MatrixIndexT dim_even = tgt->Dim() + (tgt->Dim() % 2);  // + 0 or 1,
      CuVector<Real> tmp(dim_even, kUndefined);
      CURAND_SAFE_CALL(curandGenerateNormalWrap(
            GetCurandHandle(), tmp.Data(), tmp.Dim()));
      tgt->CopyFromVec(tmp.Range(0, tgt->Dim()));
    }
    CuDevice::Instantiate().AccuProfile(__func__, tim);
  } else
#endif
  {
    tgt->Vec().SetRandn();
  }
}

/// Convert probabilities into binary values by sampling: each element of
/// 'states' becomes 1.0 with the probability given by the matching element
/// of 'probs', and 0.0 otherwise,
template<typename Real>
void CuRand<Real>::BinarizeProbs(const CuMatrix<Real> &probs,
                                 CuMatrix<Real> *states) {
  CuMatrix<Real> tmp(probs.NumRows(), probs.NumCols());
  this->RandUniform(&tmp);  // [0..1]
  tmp.Scale(-1.0);  // [-1..0]
  tmp.AddMat(1.0, probs);  // [-1..+1]
  states->Heaviside(tmp);  // negative -> 0.0, positive -> 1.0,
}

/// Add Gaussian noise to each element,
template<typename Real>
void CuRand<Real>::AddGaussNoise(CuMatrix<Real> *tgt, Real gscale) {
  // Use the option kStrideEqualNumCols to ensure consistency (because when
  // memory is nearly exhausted, the stride of CudaMallocPitch may vary).
  CuMatrix<Real> tmp(tgt->NumRows(), tgt->NumCols(), kUndefined,
                     kStrideEqualNumCols);
  this->RandGaussian(&tmp);
  tgt->AddMat(gscale, tmp);
}

// explicit instantiation,
template class CuRand<float>;
template class CuRand<double>;

}  // namespace kaldi
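// Example usage of CuRand (a minimal sketch; it assumes a GPU was selected
// beforehand via CuDevice::Instantiate().SelectGpuId("yes"), as Kaldi
// binaries normally do -- otherwise the CPU fallbacks above are taken):
//
//   kaldi::CuRand<kaldi::BaseFloat> rand;
//   kaldi::CuMatrix<kaldi::BaseFloat> m(100, 200);
//   rand.RandGaussian(&m);        // fill with N(0, 1) samples,
//   rand.AddGaussNoise(&m, 0.1);  // m += 0.1 * noise, noise ~ N(0, 1),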