You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

285 lines
8.8 KiB

#include "MeshTextureCUDA.h"
#include <iostream>
// Process-wide flag: set to true by Initialize() on success and reset to false by Cleanup().
bool MeshTextureCUDA::cudaInitialized = false;
// CUDA kernel: per-pixel linear blend of src into dst, gated by mask.
//
// For every pixel whose mask value is non-zero, each channel becomes
//     dst = src * bias + dst * (1 - bias)
// Pixels whose mask value is 0 are left untouched.
//
// Layout expected by the indexing below:
//   - dst / src : densely packed, width * height * channels floats (no row padding)
//   - mask      : densely packed, width * height bytes
// Launch with a 2D grid covering at least width x height threads; threads that fall
// outside the image exit immediately.
__global__ void poissonBlendKernel(float* dst, const float* src, const uchar* mask,
                                   int width, int height, int channels, float bias) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    // Index arithmetic is done in size_t: (y * width + x) * channels does not fit in a
    // 32-bit int once the image exceeds roughly 2^31 / channels pixels.
    const size_t pixel = static_cast<size_t>(y) * static_cast<size_t>(width) + static_cast<size_t>(x);
    if (mask[pixel] == 0) return;  // empty pixel: keep dst as is

    const size_t base = pixel * static_cast<size_t>(channels);
    for (int c = 0; c < channels; c++) {
        const float src_val = src[base + c];
        const float dst_val = dst[base + c];
        dst[base + c] = src_val * bias + dst_val * (1 - bias);
    }
}
// CUDA kernel: zeroes every mask pixel that lies inside a border band of
// stripWidth pixels along any of the four image edges.
// mask is indexed as row * width + col (one byte per pixel, no row padding).
// One thread handles one pixel; threads outside width x height do nothing.
__global__ void processMaskKernel(uchar* mask, int width, int height, int stripWidth) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < width && row < height) {
        // Simplified mask handling - refine according to the actual requirements.
        const bool inLeftOrRightBand = (col < stripWidth) || (col >= width - stripWidth);
        const bool inTopOrBottomBand = (row < stripWidth) || (row >= height - stripWidth);

        if (inLeftOrRightBand || inTopOrBottomBand) {
            mask[row * width + col] = 0;  // mark as empty
        }
    }
}
// CUDA kernel: converts a 3-channel interleaved uint8 image to a 3-channel
// interleaved float image, multiplying every sample by scale. Channel order is
// preserved (the original comments label the channels Blue, Green, Red).
//
//   srcStep : source row pitch, in uchar elements (i.e. bytes)
//   dstStep : destination row pitch, in float elements (not bytes)
// Both pitches are expected to be non-negative.
// Launch with a 2D grid covering at least width x height threads; threads that fall
// outside the image exit immediately.
__global__ void convertToFloatKernel(const uchar* src, float* dst,
                                     int width, int height,
                                     int srcStep, int dstStep,
                                     float scale) {
    // Pixel coordinates handled by this thread.
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Skip threads outside the image.
    if (x >= width || y >= height) return;

    // Offsets are computed in size_t: y * step overflows a 32-bit int once an
    // image plane grows past 2^31 elements.
    const size_t srcIdx = static_cast<size_t>(y) * static_cast<size_t>(srcStep) + static_cast<size_t>(x) * 3;
    const size_t dstIdx = static_cast<size_t>(y) * static_cast<size_t>(dstStep) + static_cast<size_t>(x) * 3;

    // Convert and scale each of the three channels.
    dst[dstIdx]     = src[srcIdx]     * scale;
    dst[dstIdx + 1] = src[srcIdx + 1] * scale;
    dst[dstIdx + 2] = src[srcIdx + 2] * scale;
}
// Selects CUDA device 0 and records that initialization succeeded.
//
// Returns true if CUDA is (or already was) initialized, false if the device count
// could not be queried, no device is present, or device 0 could not be selected.
// A diagnostic is written to std::cerr on failure.
bool MeshTextureCUDA::Initialize() {
    if (cudaInitialized) return true;

    int deviceCount = 0;
    cudaError_t error = cudaGetDeviceCount(&deviceCount);
    if (error != cudaSuccess) {
        // Report the real cause (driver mismatch, etc.) rather than claiming there is no device.
        std::cerr << "CUDA initialization failed: " << cudaGetErrorString(error) << std::endl;
        return false;
    }
    if (deviceCount == 0) {
        std::cerr << "CUDA initialization failed: No CUDA devices found" << std::endl;
        return false;
    }

    // Use the first device; only mark as initialized if the selection actually worked.
    error = cudaSetDevice(0);
    if (error != cudaSuccess) {
        std::cerr << "CUDA initialization failed: " << cudaGetErrorString(error) << std::endl;
        return false;
    }

    cudaInitialized = true;
    return true;
}
// Blends src into dst on the GPU wherever mask is non-zero, using poissonBlendKernel
// (dst = src * bias + dst * (1 - bias) per channel).
//
// Parameters:
//   dst  - CV_32FC3 image, modified in place with the blended result
//   src  - CV_32FC3 image of the same size as dst
//   mask - CV_8U image of the same size; 0 means "leave dst untouched"
//   bias - weight given to src
// Returns true on success; false if CUDA could not be initialized, the inputs are
// empty or mismatched in size/type, or any CUDA call fails. On failure dst is not
// written back from the device.
bool MeshTextureCUDA::PoissonBlendCUDA(cv::Mat& dst, const cv::Mat& src, const cv::Mat& mask, float bias) {
    if (!cudaInitialized && !Initialize()) {
        return false;
    }

    // Validate the input matrices.
    if (dst.empty() ||
        dst.size() != src.size() || dst.size() != mask.size() ||
        dst.type() != CV_32FC3 || src.type() != CV_32FC3 || mask.type() != CV_8U) {
        return false;
    }

    const int width = dst.cols;
    const int height = dst.rows;
    const int channels = 3;

    // Sizes in size_t from the start so large images do not overflow int arithmetic.
    const size_t rows = static_cast<size_t>(height);
    const size_t imageRowBytes = static_cast<size_t>(width) * channels * sizeof(float);
    const size_t maskRowBytes = static_cast<size_t>(width) * sizeof(uchar);

    float* d_dst = nullptr;
    float* d_src = nullptr;
    uchar* d_mask = nullptr;

    // Single failure path: log, release whatever has been allocated so far
    // (cudaFree on a null pointer is a no-op) and tell the caller to bail out.
    auto failed = [&](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) return false;
        std::cerr << "CUDA " << what << ": " << cudaGetErrorString(status) << std::endl;
        cudaFree(d_dst);
        cudaFree(d_src);
        cudaFree(d_mask);
        return true;
    };

    // Allocate densely packed device buffers (the kernel assumes no row padding).
    if (failed(cudaMalloc(&d_dst, imageRowBytes * rows), "malloc error (dst)")) return false;
    if (failed(cudaMalloc(&d_src, imageRowBytes * rows), "malloc error (src)")) return false;
    if (failed(cudaMalloc(&d_mask, maskRowBytes * rows), "malloc error (mask)")) return false;

    // Upload row by row with the host pitch taken from cv::Mat::step, so padded or
    // ROI (non-continuous) matrices are packed correctly on the device.
    if (failed(cudaMemcpy2D(d_dst, imageRowBytes, dst.data, dst.step,
                            imageRowBytes, rows, cudaMemcpyHostToDevice),
               "memcpy error (dst to device)")) return false;
    if (failed(cudaMemcpy2D(d_src, imageRowBytes, src.data, src.step,
                            imageRowBytes, rows, cudaMemcpyHostToDevice),
               "memcpy error (src to device)")) return false;
    if (failed(cudaMemcpy2D(d_mask, maskRowBytes, mask.data, mask.step,
                            maskRowBytes, rows, cudaMemcpyHostToDevice),
               "memcpy error (mask to device)")) return false;

    // Launch configuration: one thread per pixel, 16x16 threads per block.
    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
                  (height + blockSize.y - 1) / blockSize.y);

    poissonBlendKernel<<<gridSize, blockSize>>>(d_dst, d_src, d_mask, width, height, channels, bias);

    // Launch-configuration errors show up here; faults during execution show up at the sync.
    if (failed(cudaGetLastError(), "kernel error")) return false;
    if (failed(cudaDeviceSynchronize(), "synchronize error")) return false;

    // Download the result back into dst, honouring its row pitch.
    if (failed(cudaMemcpy2D(dst.data, dst.step, d_dst, imageRowBytes,
                            imageRowBytes, rows, cudaMemcpyDeviceToHost),
               "memcpy error (dst to host)")) return false;

    // Release device memory.
    cudaFree(d_dst);
    cudaFree(d_src);
    cudaFree(d_mask);
    return true;
}
// Runs processMaskKernel on the GPU: every mask pixel within stripWidth pixels of
// any image edge is set to 0; the result is written back into mask.
//
// Parameters:
//   mask       - non-empty CV_8U image, modified in place
//   stripWidth - width of the border band, in pixels
// Returns true on success; false if CUDA could not be initialized, mask is empty or
// not CV_8U, or any CUDA call fails (mask is then left unmodified).
bool MeshTextureCUDA::ProcessMaskCUDA(cv::Mat& mask, int stripWidth) {
    if (!cudaInitialized && !Initialize()) {
        return false;
    }

    // The kernel addresses one byte per pixel, so anything but single-channel 8-bit
    // data would be processed incorrectly; an empty mask cannot be launched at all.
    if (mask.empty() || mask.type() != CV_8U) {
        std::cerr << "Invalid mask: must be a non-empty CV_8U image" << std::endl;
        return false;
    }

    const int width = mask.cols;
    const int height = mask.rows;

    // Sizes in size_t from the start so large masks do not overflow int arithmetic.
    const size_t rows = static_cast<size_t>(height);
    const size_t rowBytes = static_cast<size_t>(width) * sizeof(uchar);

    uchar* d_mask = nullptr;

    // Single failure path: log, release the device buffer (cudaFree on a null
    // pointer is a no-op) and tell the caller to bail out.
    auto failed = [&](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) return false;
        std::cerr << "CUDA " << what << ": " << cudaGetErrorString(status) << std::endl;
        cudaFree(d_mask);
        return true;
    };

    // Allocate a densely packed device buffer (the kernel assumes no row padding).
    if (failed(cudaMalloc(&d_mask, rowBytes * rows), "malloc error (mask)")) return false;

    // Upload with the host pitch from cv::Mat::step so non-continuous masks
    // (ROIs, padded rows) are packed correctly on the device.
    if (failed(cudaMemcpy2D(d_mask, rowBytes, mask.data, mask.step,
                            rowBytes, rows, cudaMemcpyHostToDevice),
               "memcpy error (mask to device)")) return false;

    // Launch configuration: one thread per pixel, 16x16 threads per block.
    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
                  (height + blockSize.y - 1) / blockSize.y);

    processMaskKernel<<<gridSize, blockSize>>>(d_mask, width, height, stripWidth);

    // Launch-configuration errors show up here; faults during execution show up at the sync.
    if (failed(cudaGetLastError(), "kernel error")) return false;
    if (failed(cudaDeviceSynchronize(), "synchronize error")) return false;

    // Download the processed mask, honouring the host row pitch.
    if (failed(cudaMemcpy2D(mask.data, mask.step, d_mask, rowBytes,
                            rowBytes, rows, cudaMemcpyDeviceToHost),
               "memcpy error (mask to host)")) return false;

    // Release device memory.
    cudaFree(d_mask);
    return true;
}
// Converts a uint8 image to a float image on the GPU, multiplying every sample by scale.
//
// Parameters:
//   src   - non-empty CV_8UC3 image
//   dst   - output; (re)allocated as CV_32FC3 with the size of src when it does not
//           already have that size and type
//   scale - factor applied to every channel value
// Returns true on success; false if CUDA could not be initialized, src is empty or
// not CV_8UC3, or any CUDA call fails. A diagnostic is written to std::cerr for
// invalid input and for CUDA errors.
bool MeshTextureCUDA::ConvertToCUDA(const cv::Mat& src, cv::Mat& dst, float scale) {
    // Same lazy-initialization guard as the other GPU entry points of this class.
    if (!cudaInitialized && !Initialize()) {
        return false;
    }

    // Validate the input image.
    if (src.empty() || src.type() != CV_8UC3) {
        std::cerr << "Invalid source image: must be CV_8UC3" << std::endl;
        return false;
    }

    // Hold our own reference to the input pixels: if dst aliases src, the
    // dst.create() below would otherwise release the buffer we are about to upload.
    const cv::Mat input = src;

    const int width = input.cols;
    const int height = input.rows;
    const int channels = 3;

    // Prepare the output image; cv::Mat::create is a no-op when size and type already match.
    dst.create(height, width, CV_32FC3);

    // Row pitches: bytes for the uint8 source, float elements for the destination
    // (these are the units convertToFloatKernel indexes with).
    const size_t srcStep = input.step;
    const size_t dstStep = dst.step / sizeof(float);

    uchar* d_src = nullptr;
    float* d_dst = nullptr;

    // Single failure path: log, release whatever has been allocated so far
    // (cudaFree on a null pointer is a no-op) and tell the caller to bail out.
    auto failed = [&](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) return false;
        std::cerr << "CUDA " << what << ": " << cudaGetErrorString(status) << std::endl;
        cudaFree(d_src);
        cudaFree(d_dst);
        return true;
    };

    // Device buffers mirror the host row pitch of each image.
    if (failed(cudaMalloc(&d_src, input.step * height), "malloc error (src)")) return false;
    if (failed(cudaMalloc(&d_dst, dst.step * height), "malloc error (dst)")) return false;

    // Upload the source image.
    if (failed(cudaMemcpy2D(d_src, input.step, input.data, input.step,
                            width * channels * sizeof(uchar), height,
                            cudaMemcpyHostToDevice),
               "memcpy error (src to device)")) return false;

    // Launch configuration: one thread per pixel, 16x16 threads per block.
    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
                  (height + blockSize.y - 1) / blockSize.y);

    // The kernel takes int pitches; make the narrowing explicit.
    convertToFloatKernel<<<gridSize, blockSize>>>(d_src, d_dst, width, height,
                                                  static_cast<int>(srcStep),
                                                  static_cast<int>(dstStep), scale);

    // Launch-configuration errors show up here; faults during execution show up at the sync.
    if (failed(cudaGetLastError(), "kernel error")) return false;
    if (failed(cudaDeviceSynchronize(), "synchronize error")) return false;

    // Download the result.
    if (failed(cudaMemcpy2D(dst.data, dst.step, d_dst, dst.step,
                            width * channels * sizeof(float), height,
                            cudaMemcpyDeviceToHost),
               "memcpy error (dst to host)")) return false;

    // Release device memory.
    cudaFree(d_src);
    cudaFree(d_dst);
    return true;
}
// Resets the CUDA device and clears the initialized flag.
// Does nothing when Initialize() has not succeeded (or Cleanup() already ran).
void MeshTextureCUDA::Cleanup() {
    if (!cudaInitialized) {
        return;
    }

    cudaDeviceReset();
    cudaInitialized = false;
}