// cudamatrix/cu-array-inl.h // Copyright 2009-2016 Karel Vesely // 2013 Johns Hopkins University (author: Daniel Povey) // 2017 Shiyin Kang // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #ifndef KALDI_CUDAMATRIX_CU_ARRAY_INL_H_ #define KALDI_CUDAMATRIX_CU_ARRAY_INL_H_ #include #if HAVE_CUDA == 1 #include #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" #endif #include "base/timer.h" namespace kaldi { template void CuArray::Resize(MatrixIndexT dim, MatrixResizeType resize_type) { KALDI_ASSERT((resize_type == kSetZero || resize_type == kUndefined) && dim >= 0); if (this->dim_ == dim) { if (resize_type == kSetZero) this->SetZero(); return; } Destroy(); if (dim == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; this->data_ = static_cast(CuDevice::Instantiate().Malloc(dim * sizeof(T))); this->dim_ = dim; if (resize_type == kSetZero) this->SetZero(); CuDevice::Instantiate().AccuProfile("CuArray::Resize", tim); } else #endif { this->data_ = static_cast(malloc(dim * sizeof(T))); // We allocate with malloc because we don't want constructors being called. // We basically ignore memory alignment issues here-- we assume the malloc // implementation is forgiving enough that it will automatically align on // sensible boundaries. if (this->data_ == 0) KALDI_ERR << "Memory allocation failed when initializing CuVector " << "with dimension " << dim << " object size in bytes: " << sizeof(T); } this->dim_ = dim; if (resize_type == kSetZero) this->SetZero(); } template void CuArray::Destroy() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (this->data_ != NULL) { CuDevice::Instantiate().Free(this->data_); } } else #endif { if (this->data_ != NULL) free(this->data_); } this->dim_ = 0; this->data_ = NULL; } template void CuArrayBase::CopyFromVec(const std::vector &src) { KALDI_ASSERT(dim_ == src.size()); if (src.empty()) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL( cudaMemcpyAsync(data_, &src.front(), src.size() * sizeof(T), cudaMemcpyHostToDevice, cudaStreamPerThread)); CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { memcpy(data_, &src.front(), src.size() * sizeof(T)); } } template void CuArray::CopyFromVec(const std::vector &src) { Resize(src.size(), kUndefined); if (src.empty()) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL(cudaMemcpyAsync(this->data_, &src.front(), src.size()*sizeof(T), cudaMemcpyHostToDevice, cudaStreamPerThread)); CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { memcpy(this->data_, &src.front(), src.size()*sizeof(T)); } } template void CuArray::CopyFromArray(const CuArrayBase &src) { this->Resize(src.Dim(), kUndefined); if (this->dim_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL(cudaMemcpyAsync(this->data_, src.data_, this->dim_ * sizeof(T), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { memcpy(this->data_, src.data_, this->dim_ * sizeof(T)); } } template void CuArrayBase::CopyFromArray(const CuArrayBase &src) { KALDI_ASSERT(src.Dim() == Dim()); if (dim_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL( cudaMemcpyAsync(this->data_, src.data_, dim_ * sizeof(T), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif { memcpy(this->data_, src.data_, dim_ * sizeof(T)); } } template void CuArrayBase::CopyToVec(std::vector *dst) const { if (static_cast(dst->size()) != this->dim_) { dst->resize(this->dim_); } if (this->dim_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL(cudaMemcpyAsync(&dst->front(), Data(), this->dim_ * sizeof(T), cudaMemcpyDeviceToHost, cudaStreamPerThread)); CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim); } else #endif { memcpy(&dst->front(), this->data_, this->dim_ * sizeof(T)); } } template void CuArrayBase::CopyToHost(T *dst) const { if (this->dim_ == 0) return; KALDI_ASSERT(dst != NULL); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL(cudaMemcpyAsync(dst, Data(), this->dim_ * sizeof(T), cudaMemcpyDeviceToHost, cudaStreamPerThread)); CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim); } else #endif { memcpy(dst, this->data_, this->dim_ * sizeof(T)); } } template void CuArrayBase::SetZero() { if (this->dim_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL(cudaMemsetAsync(this->data_, 0, this->dim_ * sizeof(T), cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuArray::SetZero", tim); } else #endif { memset(static_cast(this->data_), 0, this->dim_ * sizeof(T)); } } template void CuArrayBase::Set(const T &value) { // This is not implemented yet, we'll do so if it's needed. KALDI_ERR << "CuArray::Set not implemented yet for this type."; } // int32 specialization implemented in 'cudamatrix/cu-array.cc', template<> void CuArrayBase::Set(const int32 &value); template void CuArrayBase::Sequence(const T base) { // This is not implemented yet, we'll do so if it's needed. KALDI_ERR << "CuArray::Sequence not implemented yet for this type."; } // int32 specialization implemented in 'cudamatrix/cu-array.cc', template<> void CuArrayBase::Sequence(const int32 base); template void CuArrayBase::Add(const T &value) { // This is not implemented yet, we'll do so if it's needed. KALDI_ERR << "CuArray::Add not implemented yet for this type."; } // int32 specialization implemented in 'cudamatrix/cu-array.cc', template<> void CuArrayBase::Add(const int32 &value); template inline T CuArrayBase::Min() const { KALDI_ASSERT(this->Dim() > 0); #if HAVE_CUDA == 1 CuTimer tim; #endif std::vector tmp(Dim()); CopyToVec(&tmp); T ans = *std::min_element(tmp.begin(), tmp.end()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuDevice::Instantiate().AccuProfile(__func__, tim); } #endif return ans; } template inline T CuArrayBase::Max() const { KALDI_ASSERT(this->Dim() > 0); #if HAVE_CUDA == 1 CuTimer tim; #endif std::vector tmp(Dim()); CopyToVec(&tmp); T ans = *std::max_element(tmp.begin(), tmp.end()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuDevice::Instantiate().AccuProfile(__func__, tim); } #endif return ans; } template void CuArray::Read(std::istream& in, bool binary) { std::vector tmp; ReadIntegerVector(in, binary, &tmp); (*this) = tmp; } template void CuArray::Write(std::ostream& out, bool binary) const { std::vector tmp(this->Dim()); this->CopyToVec(&tmp); WriteIntegerVector(out, binary, tmp); } template CuSubArray::CuSubArray(const CuArrayBase &src, MatrixIndexT offset, MatrixIndexT dim) { KALDI_ASSERT(offset >= 0 && dim >= 0 && offset + dim <= src.Dim()); this->data_ = src.data_ + offset; this->dim_ = dim; } /** * Print the vector to stream */ template std::ostream &operator << (std::ostream &out, const CuArray &vec) { std::vector tmp; vec.CopyToVec(&tmp); out << "["; for(int32 i=0; i void CuArray::Swap(CuArray *other) { std::swap(this->dim_, other->dim_); std::swap(this->data_, other->data_); } } // namespace kaldi #endif