diff --git a/libs/MVS/cuda/MeshTextureCUDA.cu b/libs/MVS/cuda/MeshTextureCUDA.cu
new file mode 100644
index 0000000..82298b4
--- /dev/null
+++ b/libs/MVS/cuda/MeshTextureCUDA.cu
@@ -0,0 +1,285 @@
+#include "MeshTextureCUDA.h"
+#include <iostream>
+
+bool MeshTextureCUDA::cudaInitialized = false; // set to true by Initialize(), reset to false by Cleanup()
+
+// Masked linear blend: dst = src * bias + dst * (1 - bias) wherever mask != 0 (no Poisson solve happens here).
+// All buffers are indexed as tightly packed rows (no pitch); each thread of a 2D launch handles one pixel.
+__global__ void poissonBlendKernel(float* dst, const float* src, const uchar* mask,
+                                  int width, int height, int channels, float bias) {
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return; // grid tail: this thread has no pixel
+
+    // Widen before multiplying so the flat index cannot wrap a 32-bit int on very large images.
+    const size_t pixel = static_cast<size_t>(y) * width + x;
+    if (mask[pixel] == 0) return; // empty mask pixel: dst keeps its value
+    const size_t base = pixel * channels;
+    const float keep = 1.0f - bias; // weight of the value already stored in dst
+    for (int c = 0; c < channels; c++)
+        dst[base + c] = src[base + c] * bias + dst[base + c] * keep;
+}
+
+// Zeroes every mask pixel that lies within stripWidth pixels of any image edge; other pixels are untouched.
+// Simplified mask-processing logic - refine it according to the actual requirements.
+// The mask is indexed as tightly packed rows; each thread of a 2D launch handles one pixel.
+__global__ void processMaskKernel(uchar* mask, int width, int height, int stripWidth) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= width || row >= height) return; // outside the image
+
+    const bool nearLeftOrRight = col < stripWidth || col >= width - stripWidth;
+    const bool nearTopOrBottom = row < stripWidth || row >= height - stripWidth;
+    if (!(nearLeftOrRight || nearTopOrBottom)) return; // interior pixel: leave it as is
+
+    mask[row * width + col] = 0; // mark as empty
+}
+
+// CUDA kernel: converts a 3-channel uint8 image into a 3-channel float image, multiplying each value by scale.
+// srcStep and dstStep are row strides counted in elements of their own buffer (uchar for src, float for dst).
+__global__ void convertToFloatKernel(const uchar* src, float* dst,
+                                    int width, int height,
+                                    int srcStep, int dstStep,
+                                    float scale) {
+    // Pixel handled by this thread
+    const int px = blockIdx.x * blockDim.x + threadIdx.x;
+    const int py = blockIdx.y * blockDim.y + threadIdx.y;
+
+    // Threads beyond the image bounds have nothing to convert
+    if (px >= width || py >= height) return;
+
+    // Start of the row plus three interleaved channels per pixel
+    const uchar* in = src + (py * srcStep + px * 3);
+    float* out = dst + (py * dstStep + px * 3);
+
+    // Convert and scale channels 0..2 (Blue, Green, Red for BGR input)
+    for (int c = 0; c < 3; ++c)
+        out[c] = in[c] * scale;
+}
+
+// Selects CUDA device 0 on first use and records success in cudaInitialized; later calls return true at once.
+bool MeshTextureCUDA::Initialize() {
+    if (cudaInitialized) return true;
+    int deviceCount = 0;
+    cudaError_t error = cudaGetDeviceCount(&deviceCount);
+    // Select the first device only when one exists; a failed cudaSetDevice must not be recorded as success.
+    if (error == cudaSuccess && deviceCount > 0) error = cudaSetDevice(0);
+    if (error != cudaSuccess || deviceCount == 0) {
+        std::cerr << "CUDA initialization failed: " << (error != cudaSuccess ? cudaGetErrorString(error) : "No CUDA devices found") << std::endl;
+        return false;
+    }
+    cudaInitialized = true;
+    return true;
+}
+
+// Blends src into dst on the GPU wherever mask is non-zero: dst = src * bias + dst * (1 - bias).
+// Requires non-empty, equally sized matrices: dst and src of type CV_32FC3, mask of type CV_8U.
+// Returns true on success; returns false when validation or any CUDA call fails.
+bool MeshTextureCUDA::PoissonBlendCUDA(cv::Mat& dst, const cv::Mat& src, const cv::Mat& mask, float bias) {
+    if (!cudaInitialized && !Initialize()) {
+        return false;
+    }
+
+    // Validate the input matrices (an empty image would otherwise lead to an invalid zero-sized launch)
+    if (dst.empty() || dst.size() != src.size() || dst.size() != mask.size() ||
+        dst.type() != CV_32FC3 || src.type() != CV_32FC3 || mask.type() != CV_8U) {
+        return false;
+    }
+
+    const int width = dst.cols;
+    const int height = dst.rows;
+    const int channels = 3;
+    // Row sizes in bytes, computed in size_t so large images cannot overflow int arithmetic
+    const size_t rowBytes = static_cast<size_t>(width) * channels * sizeof(float);
+    const size_t maskRowBytes = static_cast<size_t>(width) * sizeof(uchar);
+
+    float *d_dst = nullptr, *d_src = nullptr;
+    uchar *d_mask = nullptr;
+    // Reports a failed CUDA call and frees the device buffers (cudaFree accepts null pointers)
+    auto failed = [&](cudaError_t err, const char* what) -> bool {
+        if (err == cudaSuccess) return false;
+        std::cerr << what << cudaGetErrorString(err) << std::endl;
+        cudaFree(d_dst);
+        cudaFree(d_src);
+        cudaFree(d_mask);
+        return true;
+    };
+    // Allocate tightly packed device buffers
+    if (failed(cudaMalloc(&d_dst, rowBytes * height), "CUDA malloc error (dst): ")) return false;
+    if (failed(cudaMalloc(&d_src, rowBytes * height), "CUDA malloc error (src): ")) return false;
+    if (failed(cudaMalloc(&d_mask, maskRowBytes * height), "CUDA malloc error (mask): ")) return false;
+
+    // Upload row by row with each Mat's own step so non-continuous matrices (e.g. ROIs) are read correctly
+    if (failed(cudaMemcpy2D(d_dst, rowBytes, dst.data, dst.step, rowBytes, height,
+                            cudaMemcpyHostToDevice), "CUDA memcpy error (dst to device): ")) return false;
+    if (failed(cudaMemcpy2D(d_src, rowBytes, src.data, src.step, rowBytes, height,
+                            cudaMemcpyHostToDevice), "CUDA memcpy error (src to device): ")) return false;
+    if (failed(cudaMemcpy2D(d_mask, maskRowBytes, mask.data, mask.step, maskRowBytes, height,
+                            cudaMemcpyHostToDevice), "CUDA memcpy error (mask to device): ")) return false;
+
+    // Configure and launch the kernel: one thread per pixel
+    dim3 blockSize(16, 16);
+    dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
+                  (height + blockSize.y - 1) / blockSize.y);
+    poissonBlendKernel<<<gridSize, blockSize>>>(d_dst, d_src, d_mask, width, height, channels, bias);
+    if (failed(cudaGetLastError(), "CUDA kernel error: ")) return false;
+    if (failed(cudaDeviceSynchronize(), "CUDA synchronize error: ")) return false; // execution faults surface here
+
+    // Download the result into dst, again honouring its step
+    if (failed(cudaMemcpy2D(dst.data, dst.step, d_dst, rowBytes, rowBytes, height,
+                            cudaMemcpyDeviceToHost), "CUDA memcpy error (dst to host): ")) return false;
+    cudaFree(d_dst);
+    cudaFree(d_src);
+    cudaFree(d_mask);
+    return true;
+}
+
+// Sets to 0, on the GPU, every pixel of mask that lies within stripWidth pixels of the image border.
+// mask must be a non-empty CV_8U matrix; it is modified in place.
+// Returns true on success; returns false when validation or any CUDA call fails.
+bool MeshTextureCUDA::ProcessMaskCUDA(cv::Mat& mask, int stripWidth) {
+    if (!cudaInitialized && !Initialize()) {
+        return false;
+    }
+
+    // The kernel addresses one byte per pixel, so any other element type would be processed incorrectly
+    if (mask.empty() || mask.type() != CV_8U) {
+        return false;
+    }
+
+    const int width = mask.cols;
+    const int height = mask.rows;
+    const size_t rowBytes = static_cast<size_t>(width) * sizeof(uchar); // size_t avoids int overflow
+    uchar *d_mask = nullptr;
+    // Reports a failed CUDA call and frees the device buffer (cudaFree accepts a null pointer)
+    auto failed = [&](cudaError_t err, const char* what) -> bool {
+        if (err == cudaSuccess) return false;
+        std::cerr << what << cudaGetErrorString(err) << std::endl;
+        cudaFree(d_mask);
+        return true;
+    };
+    // Tightly packed device copy; the 2D copy honours mask.step for non-continuous matrices (e.g. ROIs)
+    if (failed(cudaMalloc(&d_mask, rowBytes * height), "CUDA malloc error (mask): ")) return false;
+    if (failed(cudaMemcpy2D(d_mask, rowBytes, mask.data, mask.step, rowBytes, height,
+                            cudaMemcpyHostToDevice), "CUDA memcpy error (mask to device): ")) return false;
+
+    // Configure and launch the kernel: one thread per pixel
+    dim3 blockSize(16, 16);
+    dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
+                  (height + blockSize.y - 1) / blockSize.y);
+    processMaskKernel<<<gridSize, blockSize>>>(d_mask, width, height, stripWidth);
+    if (failed(cudaGetLastError(), "CUDA kernel error: ")) return false;
+    if (failed(cudaDeviceSynchronize(), "CUDA synchronize error: ")) return false; // execution faults surface here
+
+    // Copy the processed mask back to the host
+    if (failed(cudaMemcpy2D(mask.data, mask.step, d_mask, rowBytes, rowBytes, height,
+                            cudaMemcpyDeviceToHost), "CUDA memcpy error (mask to host): ")) return false;
+    cudaFree(d_mask);
+    return true;
+}
+
+// Converts a CV_8UC3 image into a CV_32FC3 image on the GPU, multiplying every channel value by scale.
+// dst is (re)allocated unless it already has the size of src and type CV_32FC3.
+// src and dst may refer to the same Mat (in-place conversion).
+// Row padding (Mat::step) of either matrix is respected by the host/device copies.
+// Returns true on success; returns false when src is invalid or any CUDA call fails.
+bool MeshTextureCUDA::ConvertToCUDA(const cv::Mat& src, cv::Mat& dst, float scale) {
+    // Same lazy initialization as the other entry points of this class
+    if (!cudaInitialized && !Initialize()) {
+        return false;
+    }
+
+    // Validate the input image
+    if (src.empty() || src.type() != CV_8UC3) {
+        std::cerr << "Invalid source image: must be CV_8UC3" << std::endl;
+        return false;
+    }
+
+    const int width = src.cols;
+    const int height = src.rows;
+    const int channels = 3;
+
+    // Keep a second header on the source pixels: when the caller passes the same Mat as src and dst,
+    // dst.create() below would otherwise release the only reference to the data still to be uploaded.
+    const cv::Mat input = src;
+
+    // Prepare the output image
+    if (dst.empty() || dst.cols != width || dst.rows != height || dst.type() != CV_32FC3) {
+        dst.create(height, width, CV_32FC3);
+    }
+
+    // Tightly packed row sizes in bytes for the device-side buffers (size_t avoids int overflow)
+    const size_t srcRowBytes = static_cast<size_t>(width) * channels * sizeof(uchar);
+    const size_t dstRowBytes = static_cast<size_t>(width) * channels * sizeof(float);
+
+    // Row strides handed to the kernel, counted in elements (uchar for src, float for dst)
+    const int srcStep = width * channels;
+    const int dstStep = width * channels;
+
+    // Device buffers
+    uchar* d_src = nullptr;
+    float* d_dst = nullptr;
+
+    // Reports a failed CUDA call and frees both device buffers (cudaFree accepts null pointers)
+    auto failed = [&](cudaError_t err, const char* what) -> bool {
+        if (err == cudaSuccess) return false;
+        std::cerr << what << cudaGetErrorString(err) << std::endl;
+        cudaFree(d_src);
+        cudaFree(d_dst);
+        return true;
+    };
+
+    // Allocate device memory for the source and destination images
+    if (failed(cudaMalloc(&d_src, srcRowBytes * height), "CUDA malloc error (src): ")) {
+        return false;
+    }
+    if (failed(cudaMalloc(&d_dst, dstRowBytes * height), "CUDA malloc error (dst): ")) {
+        return false;
+    }
+
+    // Copy the source image to the device; input.step skips any row padding on the host side
+    if (failed(cudaMemcpy2D(d_src, srcRowBytes, input.data, input.step,
+                            srcRowBytes, height, cudaMemcpyHostToDevice),
+               "CUDA memcpy error (src to device): ")) {
+        return false;
+    }
+
+    // Configure the kernel: one thread per pixel
+    dim3 blockSize(16, 16);
+    dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
+                  (height + blockSize.y - 1) / blockSize.y);
+
+    // Launch the kernel
+    convertToFloatKernel<<<gridSize, blockSize>>>(d_src, d_dst, width, height,
+                                                  srcStep, dstStep, scale);
+
+    // Launch-configuration errors are reported by cudaGetLastError
+    if (failed(cudaGetLastError(), "CUDA kernel error: ")) {
+        return false;
+    }
+
+    // Wait for the kernel to finish; faults raised during execution surface here
+    if (failed(cudaDeviceSynchronize(), "CUDA synchronize error: ")) {
+        return false;
+    }
+
+    // Copy the result back to the host, honouring dst.step
+    if (failed(cudaMemcpy2D(dst.data, dst.step, d_dst, dstRowBytes,
+                            dstRowBytes, height, cudaMemcpyDeviceToHost),
+               "CUDA memcpy error (dst to host): ")) {
+        return false;
+    }
+
+    // Release device memory
+    cudaFree(d_src);
+    cudaFree(d_dst);
+
+    return true;
+}
+
+// Resets the current CUDA device and clears the initialized flag; does nothing while that flag is not set.
+void MeshTextureCUDA::Cleanup() {
+    if (!cudaInitialized) return;
+    cudaDeviceReset(); // NOTE(review): presumably also tears down other CUDA state in this process - confirm
+    cudaInitialized = false;
+}
diff --git a/libs/MVS/cuda/MeshTextureCUDA.h b/libs/MVS/cuda/MeshTextureCUDA.h
new file mode 100644
index 0000000..f59b31e
--- /dev/null
+++ b/libs/MVS/cuda/MeshTextureCUDA.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <opencv2/core.hpp>
+
+class MeshTextureCUDA {
+public:
+    // Initializes the CUDA environment (uses device 0); returns false when no CUDA device is available
+    static bool Initialize();
+    
+    // CUDA version of Poisson blending; dst and src must be CV_32FC3 and mask CV_8U, all of the same size
+    static bool PoissonBlendCUDA(cv::Mat& dst, const cv::Mat& src, const cv::Mat& mask, float bias);
+    
+    // CUDA version of mask processing; zeroes the mask pixels within stripWidth pixels of the image border
+    static bool ProcessMaskCUDA(cv::Mat& mask, int stripWidth);
+
+    // Converts a uint8 (CV_8UC3) image into a float (CV_32FC3) image, multiplying each value by scale
+    static bool ConvertToCUDA(const cv::Mat& src, cv::Mat& dst, float scale = 1.0f/255.0f);
+    
+    // Cleans up CUDA resources (resets the device if Initialize() had succeeded)
+    static void Cleanup();
+    
+private:
+    static bool cudaInitialized; // true between a successful Initialize() and the next Cleanup()
+};
+