////////////////////////////////////////////////////////////////////
// UtilCUDA.cpp
//
// Copyright 2007 cDc@seacave
// Distributed under the Boost Software License, Version 1.0
// (See http://www.boost.org/LICENSE_1_0.txt)

#include "Common.h"
#include "UtilCUDA.h"

#ifdef _USE_CUDA


// D E F I N E S ///////////////////////////////////////////////////


namespace SEACAVE {

namespace CUDA {

// S T R U C T S ///////////////////////////////////////////////////

int desiredDeviceID = -1;
Devices devices;

// GPU Architecture definitions
int _convertSMVer2Cores(int major, int minor)
{
	if (major == 9999 && minor == 9999)
		return 1;

	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
	struct sSMtoCores {
		int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
		int Cores;
	};
	const sSMtoCores nGpuArchCoresPerSM[] = {
		{0x20, 32}, // Fermi Generation (SM 2.0) GF100 class
		{0x21, 48}, // Fermi Generation (SM 2.1) GF10x class
		{0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
		{0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
		{0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
		{0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
		{0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
		{0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
		{0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class
		{0x60, 64 }, // Pascal Generation (SM 6.0) GP100 class
		{0x61, 128}, // Pascal Generation (SM 6.1) GP10x class
		{0x62, 128}, // Pascal Generation (SM 6.2) GP10x class
		{0x70, 64 }, // Volta Generation (SM 7.0) GV100 class
		{0x72, 64 }, // Volta Generation (SM 7.2) GV10B class
		{0x75, 64 }, // Turing Generation (SM 7.5) TU1xx class
		{0x80, 64 }, // Ampere Generation (SM 8.0) GA100 class
		{-1, -1}
	};

	int index(0);
	while (nGpuArchCoresPerSM[index].SM != -1) {
		if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
			return nGpuArchCoresPerSM[index].Cores;
		index++;
	}

	// If we don't find the value, default to the last known entry to run properly
	VERBOSE("MapSMtoCores for SM %d.%d is undefined; default to use %d cores/SM", major, minor, nGpuArchCoresPerSM[index-1].Cores);
	return nGpuArchCoresPerSM[index-1].Cores;
}

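// Illustrative sketch (not part of the original file): the table above keys on
// the SM version packed as 0xMm, so a compute capability like 7.5 is looked up
// as (7<<4)+5 == 0x75, which maps to 64 cores per multiprocessor. A minimal
// self-check, assuming only _convertSMVer2Cores() as defined above:
#if 0
static void exampleSMVer2Cores()
{
	ASSERT(_convertSMVer2Cores(7, 5) == 64);  // Turing TU1xx: 0x75
	ASSERT(_convertSMVer2Cores(6, 1) == 128); // Pascal GP10x: 0x61
	ASSERT(_convertSMVer2Cores(2, 0) == 32);  // Fermi GF100:  0x20
}
#endif
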
// checks that the given device ID is valid;
// if successful returns the device info in the given structure
CUresult _gpuCheckDeviceId(int devID, Device& device)
{
	int device_count;
	checkCudaError(cuDeviceGetCount(&device_count));
	if (device_count == 0) {
		VERBOSE("CUDA error: no devices supporting CUDA");
		return CUDA_ERROR_NO_DEVICE;
	}
	if (devID < 0)
		devID = 0;
	if (devID >= device_count) {
		VERBOSE("CUDA error: device [%d] is not a valid GPU device (%d CUDA capable GPU device(s) detected)", devID, device_count);
		return CUDA_ERROR_INVALID_DEVICE;
	}
	checkCudaError(cuDeviceGetAttribute(&device.computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, devID));
	if (device.computeMode == CU_COMPUTEMODE_PROHIBITED) {
		VERBOSE("CUDA error: device is running in <Compute Mode Prohibited>");
		return CUDA_ERROR_PROFILER_DISABLED;
	}
	checkCudaError(cuDeviceComputeCapability(&device.major, &device.minor, devID));
	if (device.major < 1) {
		VERBOSE("CUDA error: GPU device does not support CUDA");
		return CUDA_ERROR_INVALID_DEVICE;
	}
	checkCudaError(cuDeviceGetProperties(&device.prop, devID));
	device.ID = (CUdevice)devID;
	return CUDA_SUCCESS;
}

// finds the best GPU (with maximum GFLOPS);
// if successful returns the device info in the given structure
CUresult _gpuGetMaxGflopsDeviceId(Device& bestDevice)
{
	int device_count = 0;
	checkCudaError(cuDeviceGetCount(&device_count));
	if (device_count == 0) {
		VERBOSE("CUDA error: no devices supporting CUDA");
		return CUDA_ERROR_NO_DEVICE;
	}

	// Find the best major SM Architecture GPU device
	Devices devices;
	int best_SM_arch = 0;
	for (int current_device = 0; current_device < device_count; ++current_device) {
		Device device;
		if (reportCudaError(cuDeviceGetAttribute(&device.computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device)) != CUDA_SUCCESS)
			continue;
		// If this GPU is not running on Compute Mode prohibited, then we can add it to the list
		if (device.computeMode == CU_COMPUTEMODE_PROHIBITED)
			continue;
		if (reportCudaError(cuDeviceComputeCapability(&device.major, &device.minor, current_device)) != CUDA_SUCCESS)
			continue;
		if (device.major > 0 && device.major < 9999) {
			best_SM_arch = MAXF(best_SM_arch, device.major);
			device.ID = (CUdevice)current_device;
			devices.Insert(device);
		}
	}
	if (devices.IsEmpty()) {
		VERBOSE("CUDA error: all devices have compute mode prohibited");
		return CUDA_ERROR_PROFILER_DISABLED;
	}

	// Find the best CUDA capable GPU device
	Device* max_perf_device = NULL;
	size_t max_compute_perf = 0;
	FOREACHPTR(pDevice, devices) {
		ASSERT(pDevice->computeMode != CU_COMPUTEMODE_PROHIBITED);
		int sm_per_multiproc = _convertSMVer2Cores(pDevice->major, pDevice->minor);
		int multiProcessorCount;
		if (reportCudaError(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, (CUdevice)pDevice->ID)) != CUDA_SUCCESS)
			continue;
		int clockRate;
		if (reportCudaError(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, pDevice->ID)) != CUDA_SUCCESS)
			continue;
		size_t compute_perf = (size_t)multiProcessorCount * sm_per_multiproc * clockRate;
		if (compute_perf > max_compute_perf &&
			(best_SM_arch < 3 || // if we find GPU with SM major > 2, search only these
			 pDevice->major == best_SM_arch)) // if our device == best_SM_arch, choose this, or else pass
		{
			max_compute_perf = compute_perf;
			max_perf_device = pDevice;
		}
	}
	if (max_perf_device == NULL)
		return CUDA_ERROR_INVALID_DEVICE;

	bestDevice = *max_perf_device;
	checkCudaError(cuDeviceGetProperties(&bestDevice.prop, bestDevice.ID));
	return CUDA_SUCCESS;
}

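// Illustrative sketch (not part of the original file): the selection metric
// above is a relative throughput proxy, not literal GFLOPS: cores-per-SM times
// SM count times clock rate (CU_DEVICE_ATTRIBUTE_CLOCK_RATE is in kHz). For a
// hypothetical 28-SM, 128-cores/SM device at 1683000 kHz the score would be:
#if 0
static size_t exampleComputePerf()
{
	// same formula _gpuGetMaxGflopsDeviceId() uses to rank candidate devices
	return (size_t)28 * 128 * 1683000;
}
#endif
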
// initialize the given CUDA device and add it to the array of initialized devices;
// if the given device is -1, the best available device is selected
CUresult initDevice(int deviceID)
{
	if (deviceID < -1)
		return CUDA_ERROR_INVALID_DEVICE;

	checkCudaError(cuInit(0));

	Device device;
	if (deviceID >= 0) {
		checkCudaError(_gpuCheckDeviceId(deviceID, device));
	} else {
		// Otherwise pick the device with the highest Gflops/s
		checkCudaError(_gpuGetMaxGflopsDeviceId(device));
	}
	if (device.major < 3) {
		VERBOSE("CUDA error: compute capability 3.2 or greater required (available %d.%d for device[%d])", device.major, device.minor, device.ID);
		return CUDA_ERROR_INVALID_DEVICE;
	}
	devices.Insert(device);
	checkCudaError(cuCtxCreate(&devices.Last().ctx, CU_CTX_SCHED_AUTO, device.ID));

	#if TD_VERBOSE != TD_VERBOSE_OFF
	char name[2048];
	checkCudaError(cuDeviceGetName(name, 2048, device.ID));
	size_t memSize;
	checkCudaError(cuDeviceTotalMem(&memSize, device.ID));
	DEBUG("CUDA device %d initialized: %s (compute capability %d.%d; memory %s)", device.ID, name, device.major, device.minor, Util::formatBytes(memSize).c_str());
	#endif
	return CUDA_SUCCESS;
}

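// Illustrative sketch (not part of the original file): typical start-up using
// only the public API above; -1 requests the best device by the GFLOPS proxy.
#if 0
static bool exampleStartup()
{
	// -1 selects the highest-throughput device via _gpuGetMaxGflopsDeviceId()
	if (initDevice(-1) != CUDA_SUCCESS)
		return false; // no usable CUDA device
	// the initialized device and its context are now the last entry in devices
	return true;
}
#endif
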
// load/read module (program) from file/string and compile it
CUresult ptxJIT(LPCSTR program, CUmodule& hModule, int mode)
{
	CUlinkState lState;
	CUjit_option options[6];
	void *optionVals[6];
	float walltime(0);
	const unsigned logSize(8192);
	char error_log[logSize], info_log[logSize];
	void *cuOut;
	size_t outSize;

	// Setup linker options
	// Return walltime from JIT compilation
	options[0] = CU_JIT_WALL_TIME;
	optionVals[0] = (void*)&walltime;
	// Pass a buffer for info messages
	options[1] = CU_JIT_INFO_LOG_BUFFER;
	optionVals[1] = (void*)info_log;
	// Pass the size of the info buffer
	options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
	optionVals[2] = (void*)(long)logSize;
	// Pass a buffer for error message
	options[3] = CU_JIT_ERROR_LOG_BUFFER;
	optionVals[3] = (void*)error_log;
	// Pass the size of the error buffer
	options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
	optionVals[4] = (void*)(long)logSize;
	// Make the linker verbose
	options[5] = CU_JIT_LOG_VERBOSE;
	optionVals[5] = (void*)1;

	// Create a pending linker invocation
	checkCudaError(cuLinkCreate(6, options, optionVals, &lState));

	const size_t programLen(strlen(program));
	CUresult myErr;
	if (mode == JIT::FILE || (mode == JIT::AUTO && programLen < 256)) {
		// Load the PTX from the file
		myErr = cuLinkAddFile(lState, CU_JIT_INPUT_PTX, program, 0, 0, 0);
	} else {
		// Load the PTX from the string
		myErr = cuLinkAddData(lState, CU_JIT_INPUT_PTX, (void*)program, programLen+1, 0, 0, 0, 0);
	}
	if (myErr != CUDA_SUCCESS) {
		// Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above
		VERBOSE("PTX Linker Error: %s", error_log);
		return myErr;
	}

	// Complete the linker step
	checkCudaError(cuLinkComplete(lState, &cuOut, &outSize));

	// Linker walltime and info_log were requested in options above
	DEBUG_LEVEL(3, "CUDA link completed (%gms):\n%s", walltime, info_log);

	// Load resulting cuBin into module
	checkCudaError(cuModuleLoadData(&hModule, cuOut));

	// Destroy the linker invocation
	return reportCudaError(cuLinkDestroy(lState));
}

// get the requested function (kernel) from the module (program)
CUresult ptxGetFunc(const CUmodule& hModule, LPCSTR functionName, CUfunction& hKernel)
{
	// Locate the kernel entry point
	checkCudaError(cuModuleGetFunction(&hKernel, hModule, functionName));
	DEBUG_LEVEL(3, "Kernel '%s' loaded", functionName);
	return CUDA_SUCCESS;
}
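
// Illustrative sketch (not part of the original file): JIT-compile PTX and
// look up a kernel; "kernel_ptx" and "MyKernel" are hypothetical placeholders.
#if 0
static CUresult exampleLoadKernel(CUmodule& hModule, CUfunction& hKernel)
{
	extern const char* kernel_ptx; // hypothetical PTX source string
	// JIT::AUTO (seen above) treats short inputs as file names, longer as PTX text
	CUresult result(ptxJIT(kernel_ptx, hModule, JIT::AUTO));
	if (result != CUDA_SUCCESS)
		return result;
	return ptxGetFunc(hModule, "MyKernel", hKernel);
}
#endif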
/*----------------------------------------------------------------*/


void MemDevice::Release() {
	if (pData) {
		reportCudaError(cuMemFree(pData));
		pData = 0;
	}
}
CUresult MemDevice::Reset(size_t size) {
	if (pData) {
		if (nSize == size)
			return CUDA_SUCCESS;
		Release();
	}
	if (cuMemAlloc(&pData, size) != CUDA_SUCCESS) {
		pData = 0;
		return CUDA_ERROR_OUT_OF_MEMORY;
	}
	nSize = size;
	return CUDA_SUCCESS;
}
CUresult MemDevice::Reset(const void* pDataHost, size_t size) {
	if (pData && nSize != size)
		Release();
	if (!pData && cuMemAlloc(&pData, size) != CUDA_SUCCESS) {
		pData = 0;
		return CUDA_ERROR_OUT_OF_MEMORY;
	}
	nSize = size;
	return cuMemcpyHtoD(pData, pDataHost, size);
}

CUresult MemDevice::SetData(const void* pDataHost, size_t size) {
	ASSERT(IsValid());
	return cuMemcpyHtoD(pData, pDataHost, size);
}

CUresult MemDevice::GetData(void* pDataHost, size_t size) const {
	ASSERT(IsValid());
	return cuMemcpyDtoH(pDataHost, pData, size);
}
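
// Illustrative sketch (not part of the original file): round-trip a host
// buffer through device memory with MemDevice, using only the interface above.
#if 0
static bool exampleMemDeviceRoundTrip()
{
	const float hostIn[4] = {1.f, 2.f, 3.f, 4.f};
	float hostOut[4];
	MemDevice mem;
	if (mem.Reset(hostIn, sizeof(hostIn)) != CUDA_SUCCESS) // alloc + upload
		return false;
	return mem.GetData(hostOut, sizeof(hostOut)) == CUDA_SUCCESS; // download
}
#endif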
/*----------------------------------------------------------------*/


void EventRT::Release() {
	if (hEvent) {
		reportCudaError(cuEventDestroy(hEvent));
		hEvent = NULL;
	}
}
CUresult EventRT::Reset(unsigned flags) {
	CUresult ret(cuEventCreate(&hEvent, flags));
	if (ret != CUDA_SUCCESS)
		hEvent = NULL;
	return ret;
}
/*----------------------------------------------------------------*/


void StreamRT::Release() {
	if (hStream) {
		reportCudaError(cuStreamDestroy(hStream));
		hStream = NULL;
	}
}
CUresult StreamRT::Reset(unsigned flags) {
	CUresult ret(cuStreamCreate(&hStream, flags));
	if (ret != CUDA_SUCCESS)
		hStream = NULL;
	return ret;
}

CUresult StreamRT::Wait(CUevent hEvent) {
	ASSERT(IsValid());
	return cuStreamWaitEvent(hStream, hEvent, 0);
}
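
// Illustrative sketch (not part of the original file): make one stream wait on
// an event recorded in another; assumes the hEvent/hStream members are
// accessible from here, which may not hold in the actual class declarations.
#if 0
static void exampleStreamSync(StreamRT& producer, StreamRT& consumer, EventRT& ev)
{
	// record the event at the current tail of the producer stream...
	reportCudaError(cuEventRecord(ev.hEvent, producer.hStream));
	// ...and block the consumer stream until that recorded point is reached
	reportCudaError(consumer.Wait(ev.hEvent));
}
#endif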
/*----------------------------------------------------------------*/


void ModuleRT::Release() {
	if (hModule) {
		reportCudaError(cuModuleUnload(hModule));
		hModule = NULL;
	}
}
CUresult ModuleRT::Reset(LPCSTR program, int mode) {
	// compile the module (program) from PTX and get its handle (Driver API)
	CUresult result(ptxJIT(program, hModule, mode));
	if (result != CUDA_SUCCESS)
		hModule = NULL;
	return result;
}
/*----------------------------------------------------------------*/


void KernelRT::Release() {
	inDatas.Release();
	outDatas.Release();
	ptrModule.Release();
	hKernel = NULL;
}
void KernelRT::Reset() {
	paramOffset = 0;
	inDatas.Empty();
	outDatas.Empty();
}
CUresult KernelRT::Reset(LPCSTR functionName) {
	// get the function handle (Driver API)
	ASSERT(ptrModule != NULL && ptrModule->IsValid());
	CUresult result(ptxGetFunc(*ptrModule, functionName, hKernel));
	if (result != CUDA_SUCCESS) {
		ptrModule.Release();
		hKernel = NULL;
	}
	return result;
}
CUresult KernelRT::Reset(const ModuleRTPtr& _ptrModule, LPCSTR functionName) {
	// set module
	ptrModule = _ptrModule;
	// set function
	return Reset(functionName);
}
CUresult KernelRT::Reset(LPCSTR program, LPCSTR functionName, int mode) {
	// compile the module (program) from PTX and get the function handle (Driver API)
	ptrModule = new ModuleRT(program, mode);
	if (!ptrModule->IsValid()) {
		ptrModule.Release();
		hKernel = NULL;
		return CUDA_ERROR_INVALID_HANDLE;
	}
	return Reset(functionName);
}


// append a generic input parameter (allocate & copy input buffer)
CUresult KernelRT::_AddParam(const InputParam& param) {
	MemDevice& data = inDatas.AddEmpty();
	if (data.Reset(param.data, param.size) != CUDA_SUCCESS)
		return CUDA_ERROR_OUT_OF_MEMORY;
	return addKernelParam(hKernel, paramOffset, (CUdeviceptr)data);
}
// append a generic output parameter (allocate output buffer)
CUresult KernelRT::_AddParam(const OutputParam& param) {
	MemDevice& data = outDatas.AddEmpty();
	if (data.Reset(param.size) != CUDA_SUCCESS)
		return CUDA_ERROR_OUT_OF_MEMORY;
	return addKernelParam(hKernel, paramOffset, (CUdeviceptr)data);
}


// copy result from the given output device memory back to the host
CUresult KernelRT::GetResult(const CUdeviceptr data, const ReturnParam& param) const {
	return fetchMemDevice(param.data, param.size, data);
}
// read the variadic output parameters back from the device
CUresult KernelRT::GetResult(const std::initializer_list<ReturnParam>& params) const {
	MemDeviceArr::IDX idx(0);
	for (auto param : params)
		if (outDatas[idx++].GetData(param.data, param.size) != CUDA_SUCCESS)
			return CUDA_ERROR_INVALID_VALUE;
	return CUDA_SUCCESS;
}
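
// Illustrative sketch (not part of the original file): compile a kernel from
// PTX and read an output back; "kernel_ptx" and "MyKernel" are hypothetical,
// launching is elided (the launch helpers live outside this file), and
// ReturnParam is assumed to be an aggregate of {data, size} as used above.
#if 0
static bool exampleKernelRoundTrip(LPCSTR kernel_ptx)
{
	KernelRT kernel;
	if (kernel.Reset(kernel_ptx, "MyKernel", JIT::AUTO) != CUDA_SUCCESS)
		return false;
	// ... set parameters and launch the kernel here ...
	float result[16];
	// fetch the first output parameter back to the host
	return kernel.GetResult({ReturnParam{result, sizeof(result)}}) == CUDA_SUCCESS;
}
#endif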
/*----------------------------------------------------------------*/

} // namespace CUDA

} // namespace SEACAVE

#endif // _USE_CUDA