/* * PatchMatchCUDA.cu * * Copyright (c) 2014-2021 SEACAVE * * Author(s): * * cDc * * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . * * * Additional Terms: * * You are required to preserve legal notices and author attributions in * that material or in the Appropriate Legal Notices displayed by works * containing it. */ #include "PatchMatchCUDA.inl" // static max supported views #define MAX_VIEWS 32 // samples used to perform views selection #define NUM_SAMPLES 32 // patch window radius #define nSizeHalfWindow 4 // patch stepping #define nSizeStep 2 using namespace MVS; typedef Eigen::Matrix Point2i; typedef Eigen::Matrix Point2; typedef Eigen::Matrix Point3; typedef Eigen::Matrix Point4; typedef Eigen::Matrix Matrix3; #define ImagePixels cudaTextureObject_t #define RandState curandState // square the given value __device__ inline constexpr float Square(float v) { return v * v; } // set/check a bit __device__ inline constexpr void SetBit(unsigned& input, unsigned i) { input |= (1u << i); } __device__ inline constexpr int IsBitSet(unsigned input, unsigned i) { return (input >> i) & 1u; } // swap the given values __device__ inline constexpr void Swap(float& v0, float& v1) { const float tmp = v0; v0 = v1; v1 = tmp; } // convert 2d to 1d coordinates and back __device__ inline int Point2Idx(const Point2i& p, int width) { return p.y() * width + p.x(); } __device__ inline Point2i Idx2Point(int idx, int width) { return Point2i(idx % width, idx / width); } // project and back-project a 3D point __device__ inline Point2 ProjectPoint(const PatchMatchCUDA::Camera& camera, const Point3& X) { const Point3 x = camera.K * camera.R * (X - camera.C); return x.hnormalized(); } __device__ inline Point3 BackProjectPointCamera(const PatchMatchCUDA::Camera& camera, const Point2& p, const float depth = 1.f) { return Point3( depth * (p.x() - camera.K(0,2)) / camera.K(0,0), depth * (p.y() - camera.K(1,2)) / camera.K(1,1), depth); } __device__ inline Point3 BackProjectPoint(const PatchMatchCUDA::Camera& camera, const Point2& p, const float depth) { const Point3 camX = BackProjectPointCamera(camera, p, depth); return camera.R.transpose() * camX + camera.C; } // compute camera ray direction for the given pixel __device__ inline Point3 ViewDirection(const PatchMatchCUDA::Camera& camera, const Point2i& p) { return BackProjectPointCamera(camera, p.cast()).normalized(); } // sort the given values array using bubble sort algorithm __device__ inline void Sort(const float* values, float* sortedValues, int n) { for (int i = 0; i < n; ++i) sortedValues[i] = values[i]; do { int newn = 0; for (int i = 1; i < n; ++i) { if (sortedValues[i-1] > sortedValues[i]) { Swap(sortedValues[i-1], sortedValues[i]); newn = i; } } n = newn; } while(n); } // find the index of the minimum value in the given values array __device__ inline int FindMinIndex(const float* values, const int n) { float minValue = values[0]; int minValueIdx = 0; for (int i = 1; i < n; ++i) { if (minValue > values[i]) { minValue = values[i]; minValueIdx = i; } } return minValueIdx; } // convert Probability Density Function (PDF) to Cumulative Distribution Function (CDF) __device__ inline void PDF2CDF(float* probs, const int numProbs) { float probSum = 0.f; for (int i = 0; i < numProbs; ++i) probSum += probs[i]; const float invProbSum = 1.f / probSum; float sumProb = 0.f; for (int i = 0; i < numProbs-1; ++i) { sumProb += probs[i] * invProbSum; probs[i] = sumProb; } probs[numProbs-1] = 1.f; } /*----------------------------------------------------------------*/ // generate a random normal __device__ inline Point3 GenerateRandomNormal(const PatchMatchCUDA::Camera& camera, const Point2i& p, RandState* randState) { float q1, q2, s; do { q1 = 2.f * curand_uniform(randState) - 1.f; q2 = 2.f * curand_uniform(randState) - 1.f; s = q1 * q1 + q2 * q2; } while (s >= 1.f); const float sq = sqrt(1.f - s); Point3 normal( 2.f * q1 * sq, 2.f * q2 * sq, 1.f - 2.f * s); const Point3 viewDirection = ViewDirection(camera, p); if (normal.dot(viewDirection) > 0.f) normal = -normal; return normal.normalized(); } // randomly perturb a normal __device__ inline Point3 GeneratePerturbedNormal(const PatchMatchCUDA::Camera& camera, const Point2i& p, const Point3& normal, RandState* randState, const float perturbation) { const Point3 viewDirection = ViewDirection(camera, p); const float a1 = (curand_uniform(randState) - 0.5f) * perturbation; const float a2 = (curand_uniform(randState) - 0.5f) * perturbation; const float a3 = (curand_uniform(randState) - 0.5f) * perturbation; const float sinA1 = sin(a1); const float sinA2 = sin(a2); const float sinA3 = sin(a3); const float cosA1 = cos(a1); const float cosA2 = cos(a2); const float cosA3 = cos(a3); Matrix3 perturb; perturb << cosA2 * cosA3, cosA3 * sinA1 * sinA2 - cosA1 * sinA3, sinA1 * sinA3 + cosA1 * cosA3 * sinA2, cosA2 * sinA3, cosA1 * cosA3 + sinA1 * sinA2 * sinA3, cosA1 * sinA2 * sinA3 - cosA3 * sinA1, -sinA2, cosA2 * sinA1, cosA1 * cosA2; Point3 normalPerturbed = perturb * normal.topLeftCorner<3,1>(); if (normalPerturbed.dot(viewDirection) >= 0.f) return normal; return normalPerturbed.normalized(); } // randomly perturb a normal __device__ inline float GeneratePerturbedDepth(float depth, RandState* randState, const float perturbation, const PatchMatchCUDA::Params& params) { const float depthMinPerturbed = (1.f - perturbation) * depth; const float depthMaxPerturbed = (1.f + perturbation) * depth; float depthPerturbed; do { depthPerturbed = curand_uniform(randState) * (depthMaxPerturbed - depthMinPerturbed) + depthMinPerturbed; } while (depthPerturbed < params.fDepthMin && depthPerturbed > params.fDepthMax); return depthPerturbed; } // interpolate given pixel's estimate to the current position __device__ inline float InterpolatePixel(const PatchMatchCUDA::Camera& camera, const Point2i& p, const Point2i& np, float depth, const Point3& normal, const PatchMatchCUDA::Params& params) { float depthNew; if (p.x() == np.x()) { const float nx1 = (p.y() - camera.K(1,2)) / camera.K(1,1); const float denom = normal.z() + nx1 * normal.y(); if (abs(denom) < FLT_EPSILON) return depth; const float x1 = (np.y() - camera.K(1,2)) / camera.K(1,1); const float nom = depth * (normal.z() + x1 * normal.y()); depthNew = nom / denom; } else if (p.y() == np.y()) { const float nx1 = (p.x() - camera.K(0,2)) / camera.K(0,0); const float denom = normal.z() + nx1 * normal.x(); if (abs(denom) < FLT_EPSILON) return depth; const float x1 = (np.x() - camera.K(0,2)) / camera.K(0,0); const float nom = depth * (normal.z() + x1 * normal.x()); depthNew = nom / denom; } else { const float planeD = normal.dot(BackProjectPointCamera(camera, np.cast(), depth)); depthNew = planeD / normal.dot(BackProjectPointCamera(camera, p.cast())); } return (depthNew >= params.fDepthMin && depthNew <= params.fDepthMax) ? depthNew : depth; } // compute normal to the surface given the 4 neighbors __device__ inline Point3 ComputeDepthGradient(const Matrix3& K, float depth, const Point2i& pos, const Point4& ndepth) { constexpr float2 nposg[4] = {{0,-1}, {0,1}, {-1,0}, {1,0}}; Point2 dg(0,0); // add neighbor depths at the gradient locations for (int i=0; i<4; ++i) dg += Point2(nposg[i].x,nposg[i].y) * (ndepth[i] - depth); // compute depth gradient const Point2 d = dg*0.5f; // compute normal from depth gradient return Point3( K(0,0)*d.x(), K(1,1)*d.y(), (K(0,2)-pos.x())*d.x()+(K(1,2)-pos.y())*d.y()-depth).normalized(); } // compose tho homography matrix that transforms a point from reference to source camera through the given plane __device__ inline Matrix3 ComputeHomography(const PatchMatchCUDA::Camera& refCamera, const PatchMatchCUDA::Camera& trgCamera, const Point2& p, const Point4& plane) { const Point3 X = BackProjectPointCamera(refCamera, p, plane.w()); const Point3 normal = plane.topLeftCorner<3,1>(); const Point3 t = (refCamera.C - trgCamera.C) / (normal.dot(X)); const Matrix3 H = trgCamera.R * (refCamera.R.transpose() + t*normal.transpose()); return trgCamera.K * H * refCamera.K.inverse(); } // weight a neighbor texel based on color similarity and distance to the center texel __device__ inline float ComputeBilateralWeight(int xDist, int yDist, float pix, float centerPix) { constexpr float sigmaSpatial = -1.f / (2.f * (nSizeHalfWindow-1)*(nSizeHalfWindow-1)); constexpr float sigmaColor = -1.f / (2.f * 25.f/255.f*25.f/255.f); const float spatialDistSq = float(xDist * xDist + yDist * yDist); const float colorDistSq = Square(pix - centerPix); return exp(spatialDistSq * sigmaSpatial + colorDistSq * sigmaColor); } // compute the geometric consistency weight __device__ inline float GeometricConsistencyWeight(const ImagePixels depthImage, const PatchMatchCUDA::Camera& refCamera, const PatchMatchCUDA::Camera& trgCamera, const Point4& plane, const Point2i& p) { if (depthImage == NULL) return 0.f; constexpr float maxDist = 4.f; const Point3 forwardPoint = BackProjectPoint(refCamera, p.cast(), plane.w()); const Point2 trgPt = ProjectPoint(trgCamera, forwardPoint); const float trgDepth = tex2D(depthImage, trgPt.x() + 0.5f, trgPt.y() + 0.5f); if (trgDepth == 0.f) return maxDist; const Point3 trgX = BackProjectPoint(trgCamera, trgPt, trgDepth); const Point2 backwardPoint = ProjectPoint(refCamera, trgX); const Point2 diff = p.cast() - backwardPoint; const float dist = diff.norm(); return min(maxDist, sqrt(dist*(dist+2.f))); } // compute photometric score using weighted ZNCC __device__ float ScorePlane(const ImagePixels refImage, const PatchMatchCUDA::Camera& refCamera, const ImagePixels trgImage, const PatchMatchCUDA::Camera& trgCamera, const Point2i& p, const Point4& plane, const float lowDepth, const PatchMatchCUDA::Params& params) { constexpr float maxCost = 1.2f; Matrix3 H = ComputeHomography(refCamera, trgCamera, p.cast(), plane); const Point2 pt = (H * p.cast().homogeneous()).hnormalized(); if (pt.x() >= trgCamera.width || pt.x() < 0.f || pt.y() >= trgCamera.height || pt.y() < 0.f) return maxCost; Point3 X = H * Point2(p.x()-nSizeHalfWindow, p.y()-nSizeHalfWindow).homogeneous(); Point3 baseX(X); H *= float(nSizeStep); float sumRef = 0.f; float sumRefRef = 0.f; float sumTrg = 0.f; float sumTrgTrg = 0.f; float sumRefTrg = 0.f; float bilateralWeightSum = 0.f; const float refCenterPix = tex2D(refImage, p.x() + 0.5f, p.y() + 0.5f); for (int i = -nSizeHalfWindow; i <= nSizeHalfWindow; i += nSizeStep) { for (int j = -nSizeHalfWindow; j <= nSizeHalfWindow; j += nSizeStep) { const Point2i refPt = Point2i(p.x() + j, p.y() + i); const Point2 trgPt = X.hnormalized(); const float refPix = tex2D(refImage, refPt.x() + 0.5f, refPt.y() + 0.5f); const float trgPix = tex2D(trgImage, trgPt.x() + 0.5f, trgPt.y() + 0.5f); const float weight = ComputeBilateralWeight(j, i, refPix, refCenterPix); const float weightRefPix = weight * refPix; const float weightTrgPix = weight * trgPix; sumRef += weightRefPix; sumTrg += weightTrgPix; sumRefRef += weightRefPix * refPix; sumTrgTrg += weightTrgPix * trgPix; sumRefTrg += weightRefPix * trgPix; bilateralWeightSum += weight; X += H.col(0); } baseX += H.col(1); X = baseX; } const float varRef = sumRefRef * bilateralWeightSum - sumRef * sumRef; if (lowDepth <= 0 && varRef < 1e-8f) return maxCost; const float varTrg = sumTrgTrg * bilateralWeightSum - sumTrg * sumTrg; const float varRefTrg = varRef * varTrg; if (varRefTrg < 1e-16f) return maxCost; const float covarTrgRef = sumRefTrg * bilateralWeightSum - sumRef * sumTrg; float ncc = 1.f - covarTrgRef / sqrt(varRefTrg); // apply depth prior weight based on patch textureless if (lowDepth > 0) { const float depth(plane.w()); const float deltaDepth(MIN((abs(lowDepth-depth) / lowDepth), 0.5f)); constexpr float smoothSigmaDepth(-1.f / (1.f * 0.02f)); // 0.12: patch texture variance below 0.02 (0.12^2) is considered texture-less const float factorDeltaDepth(exp(varRef * smoothSigmaDepth)); ncc = (1.f-factorDeltaDepth)*ncc + factorDeltaDepth*deltaDepth; } return max(0.f, min(2.f, ncc)); } // compute photometric score for all neighbor images __device__ inline void MultiViewScorePlane(const ImagePixels *images, const ImagePixels* depthImages, const PatchMatchCUDA::Camera* cameras, const Point2i& p, const Point4& plane, const float lowDepth, float* costVector, const PatchMatchCUDA::Params& params) { for (int imgId = 1; imgId <= params.nNumViews; ++imgId) costVector[imgId-1] = ScorePlane(images[0], cameras[0], images[imgId], cameras[imgId], p, plane, lowDepth, params); if (params.bGeomConsistency) for (int imgId = 0; imgId < params.nNumViews; ++imgId) costVector[imgId] += 0.1f * GeometricConsistencyWeight(depthImages[imgId], cameras[0], cameras[imgId+1], plane, p); } // same as above, but interpolate the plane to current pixel position __device__ inline float MultiViewScoreNeighborPlane(const ImagePixels* images, const ImagePixels* depthImages, const PatchMatchCUDA::Camera* cameras, const Point2i& p, const Point2i& np, Point4 plane, const float lowDepth, float* costVector, const PatchMatchCUDA::Params& params) { plane.w() = InterpolatePixel(cameras[0], p, np, plane.w(), plane.topLeftCorner<3,1>(), params); MultiViewScorePlane(images, depthImages, cameras, p, plane, lowDepth, costVector, params); return plane.w(); } // aggregate photometric score from all images __device__ inline float AggregateMultiViewScores(const unsigned* viewWeights, const float* costVector, int numViews) { float cost = 0; for (int imgId = 0; imgId < numViews; ++imgId) if (viewWeights[imgId]) cost += viewWeights[imgId] * costVector[imgId]; return cost / NUM_SAMPLES; } // propagate and refine the plane estimate for the current pixel employing the asymmetric approach described in: // "Multi-View Stereo with Asymmetric Checkerboard Propagation and Multi-Hypothesis Joint View Selection", 2018 __device__ void ProcessPixel(const ImagePixels* images, const ImagePixels* depthImages, const PatchMatchCUDA::Camera* cameras, Point4* planes, const float* lowDepths, float* costs, RandState* randStates, unsigned* selectedViews, const Point2i& p, const PatchMatchCUDA::Params& params, const int iter) { int width = cameras[0].width; int height = cameras[0].height; if (p.x() >= width || p.y() >= height) return; const int idx = Point2Idx(p, width); RandState* randState = &randStates[idx]; float lowDepth = 0; if (params.bLowResProcessed) lowDepth = lowDepths[idx]; // adaptive sampling: 0 up-near, 1 down-near, 2 left-near, 3 right-near, 4 up-far, 5 down-far, 6 left-far, 7 right-far static constexpr int2 dirs[8][11] = { {{ 0,-1},{-1,-2},{ 1,-2},{-2,-3},{ 2,-3},{-3,-4},{ 3,-4}}, {{ 0, 1},{-1, 2},{ 1, 2},{-2, 3},{ 2, 3},{-3, 4},{ 3, 4}}, {{-1, 0},{-2,-1},{-2, 1},{-3,-2},{-3, 2},{-4,-3},{-4, 3}}, {{ 1, 0},{ 2,-1},{ 2, 1},{ 3,-2},{ 3, 2},{ 4,-3},{ 4, 3}}, {{0,-3},{0,-5},{0,-7},{0,-9},{0,-11},{0,-13},{0,-15},{0,-17},{0,-19},{0,-21},{0,-23}}, {{0, 3},{0, 5},{0, 7},{0, 9},{0, 11},{0, 13},{0, 15},{0, 17},{0, 19},{0, 21},{0, 23}}, {{-3,0},{-5,0},{-7,0},{-9,0},{-11,0},{-13,0},{-15,0},{-17,0},{-19,0},{-21,0},{-23,0}}, {{ 3,0},{ 5,0},{ 7,0},{ 9,0},{ 11,0},{ 13,0},{ 15,0},{ 17,0},{ 19,0},{ 21,0},{ 23,0}} }; static constexpr int numDirs[8] = {7, 7, 7, 7, 11, 11, 11, 11}; const int neighborPositions[4] = { idx - width, idx + width, idx - 1, idx + 1, }; bool valid[8] = {false, false, false, false, false, false, false, false}; int positions[8]; float neighborDepths[8]; float costArray[8][MAX_VIEWS]; for (int posId=0; posId<8; ++posId) { const int2* samples = dirs[posId]; Point2i bestNx; float bestConf(FLT_MAX); for (int dirId=0; dirId=0 && np.y()>=0 && np.x() nconf) { bestNx = np; bestConf = nconf; } } if (bestConf < FLT_MAX) { valid[posId] = true; positions[posId] = Point2Idx(bestNx, width); neighborDepths[posId] = MultiViewScoreNeighborPlane(images, depthImages, cameras, p, bestNx, planes[positions[posId]], lowDepth, costArray[posId], params); } } // multi-hypothesis view selection float viewSelectionPriors[MAX_VIEWS] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; for (int posId = 0; posId < 4; ++posId) { if (valid[posId]) { const unsigned selectedView = selectedViews[neighborPositions[posId]]; for (int j = 0; j < params.nNumViews; ++j) viewSelectionPriors[j] += (IsBitSet(selectedView, j) ? 0.9f : 0.1f); } } float samplingProbs[MAX_VIEWS]; constexpr float thCostBad = 1.2f; const float thCost = 0.8f * exp(Square((float)iter) / (-2.f * 4.f*4.f)); for (int imgId = 0; imgId < params.nNumViews; ++imgId) { float sumW = 0; unsigned count = 0; unsigned countBad = 0; for (int posId = 0; posId < 8; posId++) { if (valid[posId]) { if (costArray[posId][imgId] < thCost) { sumW += exp(Square(costArray[posId][imgId]) / (-2.f * 0.3f*0.3f)); ++count; } else if (costArray[posId][imgId] > thCostBad) { ++countBad; } } } if (count > 2 && countBad < 3) { samplingProbs[imgId] = viewSelectionPriors[imgId] * sumW / count; } else if (countBad < 3) { samplingProbs[imgId] = viewSelectionPriors[imgId] * exp(Square(thCost) / (-2.f * 0.4f*0.4f)); } else { samplingProbs[imgId] = 0.f; } } PDF2CDF(samplingProbs, params.nNumViews); unsigned viewWeights[MAX_VIEWS] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; for (int sample = 0; sample < NUM_SAMPLES; ++sample) { const float randProb = curand_uniform(randState); for (int imgId = 0; imgId < params.nNumViews; ++imgId) { if (samplingProbs[imgId] > randProb) { ++viewWeights[imgId]; break; } } } // propagate best neighbor plane Point4& plane = planes[idx]; float& cost = costs[idx]; unsigned newSelectedViews = 0; for (int imgId = 0; imgId < params.nNumViews; ++imgId) if (viewWeights[imgId]) SetBit(newSelectedViews, imgId); float finalCosts[8]; for (int posId = 0; posId < 8; ++posId) finalCosts[posId] = AggregateMultiViewScores(viewWeights, costArray[posId], params.nNumViews); const int minCostIdx = FindMinIndex(finalCosts, 8); float costVector[MAX_VIEWS]; MultiViewScorePlane(images, depthImages, cameras, p, plane, lowDepth, costVector, params); cost = AggregateMultiViewScores(viewWeights, costVector, params.nNumViews); if (finalCosts[minCostIdx] < cost && valid[minCostIdx]) { plane = planes[positions[minCostIdx]]; plane.w() = neighborDepths[minCostIdx]; cost = finalCosts[minCostIdx]; selectedViews[idx] = newSelectedViews; } const float depth = plane.w(); // refine estimate constexpr float perturbationDepth = 0.005f; constexpr float perturbationNormal = 0.01f * (float)M_PI; const float depthPerturbed = GeneratePerturbedDepth(depth, randState, perturbationDepth, params); const Point3 perturbedNormal = GeneratePerturbedNormal(cameras[0], p, plane.topLeftCorner<3,1>(), randState, perturbationNormal); const Point3 normalRand = GenerateRandomNormal(cameras[0], p, randState); int numValidPlanes = 3; Point3 surfaceNormal; if (valid[0] && valid[1] && valid[2] && valid[3]) { // estimate normal from surrounding surface const Point4 ndepths( planes[neighborPositions[0]].w(), planes[neighborPositions[1]].w(), planes[neighborPositions[2]].w(), planes[neighborPositions[3]].w() ); surfaceNormal = ComputeDepthGradient(cameras[0].K, depth, p, ndepths); numValidPlanes = 4; } constexpr int numPlanes = 4; const float depths[numPlanes] = {depthPerturbed, depth, depth, depth}; const Point3 normals[numPlanes] = {plane.topLeftCorner<3,1>(), perturbedNormal, normalRand, surfaceNormal}; for (int i = 0; i < numValidPlanes; ++i) { Point4 newPlane; newPlane.topLeftCorner<3,1>() = normals[i]; newPlane.w() = depths[i]; MultiViewScorePlane(images, depthImages, cameras, p, newPlane, lowDepth, costVector, params); const float costPlane = AggregateMultiViewScores(viewWeights, costVector, params.nNumViews); if (cost > costPlane) { cost = costPlane; plane = newPlane; } } } // compute the score of the current plane estimate __device__ void InitializePixelScore(const ImagePixels *images, const ImagePixels* depthImages, const PatchMatchCUDA::Camera* cameras, Point4* planes, const float* lowDepths, float* costs, RandState* randStates, unsigned* selectedViews, const Point2i& p, const PatchMatchCUDA::Params params) { const int width = cameras[0].width; const int height = cameras[0].height; if (p.x() >= width || p.y() >= height) return; const int idx = Point2Idx(p, width); float lowDepth = 0; if (params.bLowResProcessed) lowDepth = lowDepths[idx]; // initialize estimate randomly if not set RandState* randState = &randStates[idx]; curand_init(1234/*threadIdx.x*/, p.y(), p.x(), randState); Point4& plane = planes[idx]; float depth = plane.w(); if (depth <= 0.f) { // generate random plane plane.topLeftCorner<3,1>() = GenerateRandomNormal(cameras[0], p, randState); plane.w() = curand_uniform(randState) * (params.fDepthMax - params.fDepthMin) + params.fDepthMin; } else if (plane.topLeftCorner<3,1>().dot(ViewDirection(cameras[0], p)) >= 0.f) { // generate random normal plane.topLeftCorner<3,1>() = GenerateRandomNormal(cameras[0], p, randState); } // compute costs float costVector[MAX_VIEWS]; MultiViewScorePlane(images, depthImages, cameras, p, plane, lowDepth, costVector, params); // select best views float costVectorSorted[MAX_VIEWS]; Sort(costVector, costVectorSorted, params.nNumViews); float cost = 0.f; for (int i = 0; i < params.nInitTopK; ++i) cost += costVectorSorted[i]; const float costThreshold = costVectorSorted[params.nInitTopK - 1]; unsigned& selectedView = selectedViews[idx]; selectedView = 0; for (int imgId = 0; imgId < params.nNumViews; ++imgId) if (costVector[imgId] <= costThreshold) SetBit(selectedView, imgId); costs[idx] = cost / params.nInitTopK; } __global__ void InitializeScore(const cudaTextureObject_t* textureImages, const cudaTextureObject_t* textureDepths, const PatchMatchCUDA::Camera* cameras, Point4* planes, const float* lowDepths, float* costs, curandState* randStates, unsigned* selectedViews, const PatchMatchCUDA::Params params) { const Point2i p = Point2i(blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y); InitializePixelScore((const ImagePixels*)textureImages, (const ImagePixels*)textureDepths, cameras, planes, lowDepths, costs, (RandState*)randStates, selectedViews, p, params); } // traverse image in a back/red checkerboard pattern __global__ void BlackPixelProcess(const cudaTextureObject_t* textureImages, const cudaTextureObject_t* textureDepths, const PatchMatchCUDA::Camera* cameras, Point4* planes, const float* lowDepths, float* costs, curandState* randStates, unsigned* selectedViews, const PatchMatchCUDA::Params params, const int iter) { Point2i p = Point2i(blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y); p.y() = p.y() * 2 + (threadIdx.x % 2 == 0 ? 0 : 1); ProcessPixel((const ImagePixels*)textureImages, (const ImagePixels*)textureDepths, cameras, planes, lowDepths, costs, (RandState*)randStates, selectedViews, p, params, iter); } __global__ void RedPixelProcess(const cudaTextureObject_t* textureImages, const cudaTextureObject_t* textureDepths, const PatchMatchCUDA::Camera* cameras, Point4* planes, const float* lowDepths, float* costs, curandState* randStates, unsigned* selectedViews, const PatchMatchCUDA::Params params, const int iter) { Point2i p = Point2i(blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y); p.y() = p.y() * 2 + (threadIdx.x % 2 == 0 ? 1 : 0); ProcessPixel((const ImagePixels*)textureImages, (const ImagePixels*)textureDepths, cameras, planes, lowDepths, costs, (RandState*)randStates, selectedViews, p, params, iter); } // filter depth/normals __global__ void FilterPlanes(Point4* planes, float* costs, unsigned* selectedViews, int width, int height, const PatchMatchCUDA::Params params) { const Point2i p = Point2i(blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y); if (p.x() >= width || p.y() >= height) return; const int idx = Point2Idx(p, width); // filter estimates if the score is not good enough Point4& plane = planes[idx]; float conf = costs[idx]; if (plane.w() <= 0 || conf >= params.fThresholdKeepCost) { conf = 0; plane = Point4::Zero(); selectedViews[idx] = 0; } } /*----------------------------------------------------------------*/ __host__ void PatchMatchCUDA::RunCUDA(float* ptrCostMap, uint32_t* ptrViewsMap) { const unsigned width = cameras[0].width; const unsigned height = cameras[0].height; constexpr unsigned BLOCK_W = 32; constexpr unsigned BLOCK_H = (BLOCK_W / 2); const dim3 blockSize(BLOCK_W, BLOCK_H, 1); const dim3 gridSizeFull((width + BLOCK_H - 1) / BLOCK_H, (height + BLOCK_H - 1) / BLOCK_H, 1); const dim3 gridSizeCheckerboard((width + BLOCK_W - 1) / BLOCK_W, ((height / 2) + BLOCK_H - 1) / BLOCK_H, 1); InitializeScore<<>>(cudaTextureImages, cudaTextureDepths, cudaCameras, cudaDepthNormalEstimates, cudaLowDepths, cudaDepthNormalCosts, cudaRandStates, cudaSelectedViews, params); cudaDeviceSynchronize(); for (int iter = 0; iter < params.nEstimationIters; ++iter) { BlackPixelProcess<<>>(cudaTextureImages, cudaTextureDepths, cudaCameras, cudaDepthNormalEstimates, cudaLowDepths, cudaDepthNormalCosts, cudaRandStates, cudaSelectedViews, params, iter); cudaDeviceSynchronize(); RedPixelProcess<<>>(cudaTextureImages, cudaTextureDepths, cudaCameras, cudaDepthNormalEstimates, cudaLowDepths, cudaDepthNormalCosts, cudaRandStates, cudaSelectedViews, params, iter); cudaDeviceSynchronize(); } if (params.fThresholdKeepCost > 0) FilterPlanes<<>>(cudaDepthNormalEstimates, cudaDepthNormalCosts, cudaSelectedViews, width, height, params); cudaMemcpy(depthNormalEstimates, cudaDepthNormalEstimates, sizeof(Point4) * width * height, cudaMemcpyDeviceToHost); if (ptrCostMap) cudaMemcpy(ptrCostMap, cudaDepthNormalCosts, sizeof(float) * width * height, cudaMemcpyDeviceToHost); if (ptrViewsMap) cudaMemcpy(ptrViewsMap, cudaSelectedViews, sizeof(uint32_t) * width * height, cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); } /*----------------------------------------------------------------*/