/*
* PatchMatchCUDA.cu
*
* Copyright (c) 2014-2021 SEACAVE
*
* Author(s):
*
* cDc <cdc.seacave@gmail.com>
*
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*
* Additional Terms:
*
* You are required to preserve legal notices and author attributions in
* that material or in the Appropriate Legal Notices displayed by works
* containing it.
*/
#include "PatchMatchCUDA.inl"
// static max supported views
#define MAX_VIEWS 32
// number of samples used to perform view selection
#define NUM_SAMPLES 32
// patch window radius
#define nSizeHalfWindow 4
// patch stepping
#define nSizeStep 2
using namespace MVS;
typedef Eigen::Matrix<int,2,1> Point2i;
typedef Eigen::Matrix<float,2,1> Point2;
typedef Eigen::Matrix<float,3,1> Point3;
typedef Eigen::Matrix<float,4,1> Point4;
typedef Eigen::Matrix<float,3,3> Matrix3;
#define ImagePixels cudaTextureObject_t
#define RandState curandState
// square the given value
__device__ inline constexpr float Square(float v) {
return v * v;
}
// set/check a bit
__device__ inline constexpr void SetBit(unsigned& input, unsigned i) {
input |= (1u << i);
}
__device__ inline constexpr int IsBitSet(unsigned input, unsigned i) {
return (input >> i) & 1u;
}
// swap the given values
__device__ inline constexpr void Swap(float& v0, float& v1) {
const float tmp = v0;
v0 = v1;
v1 = tmp;
}
// convert 2d to 1d coordinates and back
__device__ inline int Point2Idx(const Point2i& p, int width) {
return p.y() * width + p.x();
}
__device__ inline Point2i Idx2Point(int idx, int width) {
return Point2i(idx % width, idx / width);
}
// project and back-project a 3D point
__device__ inline Point2 ProjectPoint(const PatchMatchCUDA::Camera& camera, const Point3& X) {
const Point3 x = camera.K * camera.R * (X - camera.C);
return x.hnormalized();
}
__device__ inline Point3 BackProjectPointCamera(const PatchMatchCUDA::Camera& camera, const Point2& p, const float depth = 1.f) {
return Point3(
depth * (p.x() - camera.K(0,2)) / camera.K(0,0),
depth * (p.y() - camera.K(1,2)) / camera.K(1,1),
depth);
}
__device__ inline Point3 BackProjectPoint(const PatchMatchCUDA::Camera& camera, const Point2& p, const float depth) {
const Point3 camX = BackProjectPointCamera(camera, p, depth);
return camera.R.transpose() * camX + camera.C;
}
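// note: the camera model used throughout assumes x = K * R * (X - C), i.e. R maps world to
// camera coordinates and C is the camera center in world coordinates, so the inverse mapping
// is X = R^T * x_cam + C as implemented above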
// compute camera ray direction for the given pixel
__device__ inline Point3 ViewDirection(const PatchMatchCUDA::Camera& camera, const Point2i& p) {
return BackProjectPointCamera(camera, p.cast<float>()).normalized();
}
// sort the given values array using the bubble sort algorithm
__device__ inline void Sort(const float* values, float* sortedValues, int n) {
for (int i = 0; i < n; ++i)
sortedValues[i] = values[i];
do {
int newn = 0;
for (int i = 1; i < n; ++i) {
if (sortedValues[i-1] > sortedValues[i]) {
Swap(sortedValues[i-1], sortedValues[i]);
newn = i;
}
}
n = newn;
} while(n);
}
// find the index of the minimum value in the given values array
__device__ inline int FindMinIndex(const float* values, const int n) {
float minValue = values[0];
int minValueIdx = 0;
for (int i = 1; i < n; ++i) {
if (minValue > values[i]) {
minValue = values[i];
minValueIdx = i;
}
}
return minValueIdx;
}
// convert Probability Density Function (PDF) to Cumulative Distribution Function (CDF)
__device__ inline void PDF2CDF(float* probs, const int numProbs) {
float probSum = 0.f;
for (int i = 0; i < numProbs; ++i)
probSum += probs[i];
const float invProbSum = 1.f / probSum;
float sumProb = 0.f;
for (int i = 0; i < numProbs-1; ++i) {
sumProb += probs[i] * invProbSum;
probs[i] = sumProb;
}
probs[numProbs-1] = 1.f;
}
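// the resulting CDF is consumed later with a uniform random draw in (0,1]: the first view whose
// cumulative probability exceeds the draw is selected (see the view-weight sampling loop in
// ProcessPixel)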
/*----------------------------------------------------------------*/
// generate a random normal
__device__ inline Point3 GenerateRandomNormal(const PatchMatchCUDA::Camera& camera, const Point2i& p, RandState* randState)
{
float q1, q2, s;
do {
q1 = 2.f * curand_uniform(randState) - 1.f;
q2 = 2.f * curand_uniform(randState) - 1.f;
s = q1 * q1 + q2 * q2;
} while (s >= 1.f);
const float sq = sqrt(1.f - s);
Point3 normal(
2.f * q1 * sq,
2.f * q2 * sq,
1.f - 2.f * s);
const Point3 viewDirection = ViewDirection(camera, p);
if (normal.dot(viewDirection) > 0.f)
normal = -normal;
return normal.normalized();
}
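// the rejection loop above implements Marsaglia's (1972) method for sampling a point uniformly
// on the unit sphere; the normal is then flipped, if needed, so that it faces the camera
// (negative dot product with the viewing ray)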
// randomly perturb a normal
__device__ inline Point3 GeneratePerturbedNormal(const PatchMatchCUDA::Camera& camera, const Point2i& p, const Point3& normal, RandState* randState, const float perturbation)
{
const Point3 viewDirection = ViewDirection(camera, p);
const float a1 = (curand_uniform(randState) - 0.5f) * perturbation;
const float a2 = (curand_uniform(randState) - 0.5f) * perturbation;
const float a3 = (curand_uniform(randState) - 0.5f) * perturbation;
const float sinA1 = sin(a1);
const float sinA2 = sin(a2);
const float sinA3 = sin(a3);
const float cosA1 = cos(a1);
const float cosA2 = cos(a2);
const float cosA3 = cos(a3);
Matrix3 perturb; perturb <<
cosA2 * cosA3,
cosA3 * sinA1 * sinA2 - cosA1 * sinA3,
sinA1 * sinA3 + cosA1 * cosA3 * sinA2,
cosA2 * sinA3,
cosA1 * cosA3 + sinA1 * sinA2 * sinA3,
cosA1 * sinA2 * sinA3 - cosA3 * sinA1,
-sinA2,
cosA2 * sinA1,
cosA1 * cosA2;
Point3 normalPerturbed = perturb * normal.topLeftCorner<3,1>();
if (normalPerturbed.dot(viewDirection) >= 0.f)
return normal;
return normalPerturbed.normalized();
}
// randomly perturb a depth
__device__ inline float GeneratePerturbedDepth(float depth, RandState* randState, const float perturbation, const PatchMatchCUDA::Params& params)
{
const float depthMinPerturbed = (1.f - perturbation) * depth;
const float depthMaxPerturbed = (1.f + perturbation) * depth;
float depthPerturbed;
do {
depthPerturbed = curand_uniform(randState) * (depthMaxPerturbed - depthMinPerturbed) + depthMinPerturbed;
} while (depthPerturbed < params.fDepthMin || depthPerturbed > params.fDepthMax);
return depthPerturbed;
}
// interpolate given pixel's estimate to the current position
__device__ inline float InterpolatePixel(const PatchMatchCUDA::Camera& camera, const Point2i& p, const Point2i& np, float depth, const Point3& normal, const PatchMatchCUDA::Params& params)
{
float depthNew;
if (p.x() == np.x()) {
const float nx1 = (p.y() - camera.K(1,2)) / camera.K(1,1);
const float denom = normal.z() + nx1 * normal.y();
if (abs(denom) < FLT_EPSILON)
return depth;
const float x1 = (np.y() - camera.K(1,2)) / camera.K(1,1);
const float nom = depth * (normal.z() + x1 * normal.y());
depthNew = nom / denom;
} else if (p.y() == np.y()) {
const float nx1 = (p.x() - camera.K(0,2)) / camera.K(0,0);
const float denom = normal.z() + nx1 * normal.x();
if (abs(denom) < FLT_EPSILON)
return depth;
const float x1 = (np.x() - camera.K(0,2)) / camera.K(0,0);
const float nom = depth * (normal.z() + x1 * normal.x());
depthNew = nom / denom;
} else {
const float planeD = normal.dot(BackProjectPointCamera(camera, np.cast<float>(), depth));
depthNew = planeD / normal.dot(BackProjectPointCamera(camera, p.cast<float>()));
}
return (depthNew >= params.fDepthMin && depthNew <= params.fDepthMax) ? depthNew : depth;
}
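// the interpolation above intersects the viewing ray of pixel p with the plane defined by the
// neighbor's depth and normal at np; the first two branches are closed-form specializations for
// vertically and horizontally adjacent neighbors, the last branch handles the general case via
// the plane equation n . X = d, and the result is kept only if it falls inside the valid depth range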
// compute the normal to the surface given the 4 neighbor depths
__device__ inline Point3 ComputeDepthGradient(const Matrix3& K, float depth, const Point2i& pos, const Point4& ndepth) {
constexpr float2 nposg[4] = {{0,-1}, {0,1}, {-1,0}, {1,0}};
Point2 dg(0,0);
// add neighbor depths at the gradient locations
for (int i=0; i<4; ++i)
dg += Point2(nposg[i].x,nposg[i].y) * (ndepth[i] - depth);
// compute depth gradient
const Point2 d = dg*0.5f;
// compute normal from depth gradient
return Point3(
K(0,0)*d.x(),
K(1,1)*d.y(),
(K(0,2)-pos.x())*d.x()+(K(1,2)-pos.y())*d.y()-depth).normalized();
}
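// the returned normal is the normalization of
//   n ~ (fx*dx, fy*dy, (cx-u)*dx + (cy-v)*dy - depth)
// where (dx,dy) is the central-difference depth gradient computed from the 4 neighbor depths
// and (u,v) is the pixel position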
// compose the homography matrix that transforms a point from the reference to the source camera through the given plane
__device__ inline Matrix3 ComputeHomography(const PatchMatchCUDA::Camera& refCamera, const PatchMatchCUDA::Camera& trgCamera, const Point2& p, const Point4& plane)
{
const Point3 X = BackProjectPointCamera(refCamera, p, plane.w());
const Point3 normal = plane.topLeftCorner<3,1>();
const Point3 t = (refCamera.C - trgCamera.C) / (normal.dot(X));
const Matrix3 H = trgCamera.R * (refCamera.R.transpose() + t*normal.transpose());
return trgCamera.K * H * refCamera.K.inverse();
}
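// this is the standard plane-induced homography mapping reference pixels to target pixels through
// the plane with normal n passing through X = BackProjectPointCamera(refCamera, p, plane.w()):
//   H = K_trg * R_trg * (R_ref^T + (C_ref - C_trg) * n^T / (n . X)) * K_ref^-1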
// weight a neighbor texel based on color similarity and distance to the center texel
__device__ inline float ComputeBilateralWeight(int xDist, int yDist, float pix, float centerPix)
{
constexpr float sigmaSpatial = -1.f / (2.f * (nSizeHalfWindow-1)*(nSizeHalfWindow-1));
constexpr float sigmaColor = -1.f / (2.f * 25.f/255.f*25.f/255.f);
const float spatialDistSq = float(xDist * xDist + yDist * yDist);
const float colorDistSq = Square(pix - centerPix);
return exp(spatialDistSq * sigmaSpatial + colorDistSq * sigmaColor);
}
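// i.e. weight = exp(-distSq / (2*sigma_spatial^2) - colorDistSq / (2*sigma_color^2)) with
// sigma_spatial = nSizeHalfWindow-1 (in pixels) and sigma_color = 25/255 (for intensities in [0,1])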
// compute the geometric consistency weight
__device__ inline float GeometricConsistencyWeight(const ImagePixels depthImage, const PatchMatchCUDA::Camera& refCamera, const PatchMatchCUDA::Camera& trgCamera, const Point4& plane, const Point2i& p)
{
if (depthImage == NULL)
return 0.f;
constexpr float maxDist = 4.f;
const Point3 forwardPoint = BackProjectPoint(refCamera, p.cast<float>(), plane.w());
const Point2 trgPt = ProjectPoint(trgCamera, forwardPoint);
const float trgDepth = tex2D<float>(depthImage, trgPt.x() + 0.5f, trgPt.y() + 0.5f);
if (trgDepth == 0.f)
return maxDist;
const Point3 trgX = BackProjectPoint(trgCamera, trgPt, trgDepth);
const Point2 backwardPoint = ProjectPoint(refCamera, trgX);
const Point2 diff = p.cast<float>() - backwardPoint;
const float dist = diff.norm();
return min(maxDist, sqrt(dist*(dist+2.f)));
}
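// the weight is a forward-backward reprojection error: p is projected into the target view using
// the hypothesized depth, the target depth map is sampled there, the resulting 3D point is
// reprojected back into the reference view, and the pixel distance to p (robustified as
// sqrt(d*(d+2)) and clamped to maxDist) is returned; it is added to the photometric cost with a
// 0.1 factor in MultiViewScorePlane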
// compute photometric score using weighted ZNCC
__device__ float ScorePlane(const ImagePixels refImage, const PatchMatchCUDA::Camera& refCamera, const ImagePixels trgImage, const PatchMatchCUDA::Camera& trgCamera, const Point2i& p, const Point4& plane, const float lowDepth, const PatchMatchCUDA::Params& params)
{
constexpr float maxCost = 1.2f;
Matrix3 H = ComputeHomography(refCamera, trgCamera, p.cast<float>(), plane);
const Point2 pt = (H * p.cast<float>().homogeneous()).hnormalized();
if (pt.x() >= trgCamera.width || pt.x() < 0.f || pt.y() >= trgCamera.height || pt.y() < 0.f)
return maxCost;
Point3 X = H * Point2(p.x()-nSizeHalfWindow, p.y()-nSizeHalfWindow).homogeneous();
Point3 baseX(X);
H *= float(nSizeStep);
float sumRef = 0.f;
float sumRefRef = 0.f;
float sumTrg = 0.f;
float sumTrgTrg = 0.f;
float sumRefTrg = 0.f;
float bilateralWeightSum = 0.f;
const float refCenterPix = tex2D<float>(refImage, p.x() + 0.5f, p.y() + 0.5f);
for (int i = -nSizeHalfWindow; i <= nSizeHalfWindow; i += nSizeStep) {
for (int j = -nSizeHalfWindow; j <= nSizeHalfWindow; j += nSizeStep) {
const Point2i refPt = Point2i(p.x() + j, p.y() + i);
const Point2 trgPt = X.hnormalized();
const float refPix = tex2D<float>(refImage, refPt.x() + 0.5f, refPt.y() + 0.5f);
const float trgPix = tex2D<float>(trgImage, trgPt.x() + 0.5f, trgPt.y() + 0.5f);
const float weight = ComputeBilateralWeight(j, i, refPix, refCenterPix);
const float weightRefPix = weight * refPix;
const float weightTrgPix = weight * trgPix;
sumRef += weightRefPix;
sumTrg += weightTrgPix;
sumRefRef += weightRefPix * refPix;
sumTrgTrg += weightTrgPix * trgPix;
sumRefTrg += weightRefPix * trgPix;
bilateralWeightSum += weight;
X += H.col(0);
}
baseX += H.col(1);
X = baseX;
}
const float varRef = sumRefRef * bilateralWeightSum - sumRef * sumRef;
if (lowDepth <= 0 && varRef < 1e-8f)
return maxCost;
const float varTrg = sumTrgTrg * bilateralWeightSum - sumTrg * sumTrg;
const float varRefTrg = varRef * varTrg;
if (varRefTrg < 1e-16f)
return maxCost;
const float covarTrgRef = sumRefTrg * bilateralWeightSum - sumRef * sumTrg;
float ncc = 1.f - covarTrgRef / sqrt(varRefTrg);
// apply depth prior weight based on how texture-less the patch is
if (lowDepth > 0) {
const float depth(plane.w());
const float deltaDepth(MIN((abs(lowDepth-depth) / lowDepth), 0.5f));
constexpr float smoothSigmaDepth(-1.f / (1.f * 0.02f)); // patch texture variance below 0.02 is considered texture-less
const float factorDeltaDepth(exp(varRef * smoothSigmaDepth));
ncc = (1.f-factorDeltaDepth)*ncc + factorDeltaDepth*deltaDepth;
}
return max(0.f, min(2.f, ncc));
}
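// the score is 1 - weighted ZNCC over the patch, sampled every nSizeStep pixels and warped into
// the target view by the plane-induced homography:
//   score = 1 - cov_w(ref,trg) / sqrt(var_w(ref) * var_w(trg))
// clamped to [0,2]; maxCost (1.2) is returned for patches projecting outside the target image or
// with (near) zero variance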
// compute photometric score for all neighbor images
__device__ inline void MultiViewScorePlane(const ImagePixels *images, const ImagePixels* depthImages, const PatchMatchCUDA::Camera* cameras, const Point2i& p, const Point4& plane, const float lowDepth, float* costVector, const PatchMatchCUDA::Params& params)
{
for (int imgId = 1; imgId <= params.nNumViews; ++imgId)
costVector[imgId-1] = ScorePlane(images[0], cameras[0], images[imgId], cameras[imgId], p, plane, lowDepth, params);
if (params.bGeomConsistency)
for (int imgId = 0; imgId < params.nNumViews; ++imgId)
costVector[imgId] += 0.1f * GeometricConsistencyWeight(depthImages[imgId], cameras[0], cameras[imgId+1], plane, p);
}
// same as above, but interpolate the plane to current pixel position
__device__ inline float MultiViewScoreNeighborPlane(const ImagePixels* images, const ImagePixels* depthImages, const PatchMatchCUDA::Camera* cameras, const Point2i& p, const Point2i& np, Point4 plane, const float lowDepth, float* costVector, const PatchMatchCUDA::Params& params)
{
plane.w() = InterpolatePixel(cameras[0], p, np, plane.w(), plane.topLeftCorner<3,1>(), params);
MultiViewScorePlane(images, depthImages, cameras, p, plane, lowDepth, costVector, params);
return plane.w();
}
// aggregate photometric score from all images
__device__ inline float AggregateMultiViewScores(const unsigned* viewWeights, const float* costVector, int numViews)
{
float cost = 0;
for (int imgId = 0; imgId < numViews; ++imgId)
if (viewWeights[imgId])
cost += viewWeights[imgId] * costVector[imgId];
return cost / NUM_SAMPLES;
}
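// the normalization by NUM_SAMPLES matches the Monte Carlo view sampling in ProcessPixel, where
// each of the NUM_SAMPLES draws increments at most one per-view weight, so the weights sum to at
// most NUM_SAMPLES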
// propagate and refine the plane estimate for the current pixel employing the asymmetric approach described in:
// "Multi-View Stereo with Asymmetric Checkerboard Propagation and Multi-Hypothesis Joint View Selection", 2018
__device__ void ProcessPixel(const ImagePixels* images, const ImagePixels* depthImages, const PatchMatchCUDA::Camera* cameras, Point4* planes, const float* lowDepths, float* costs, RandState* randStates, unsigned* selectedViews, const Point2i& p, const PatchMatchCUDA::Params& params, const int iter)
{
int width = cameras[0].width;
int height = cameras[0].height;
if (p.x() >= width || p.y() >= height)
return;
const int idx = Point2Idx(p, width);
RandState* randState = &randStates[idx];
float lowDepth = 0;
if (params.bLowResProcessed)
lowDepth = lowDepths[idx];
// adaptive sampling: 0 up-near, 1 down-near, 2 left-near, 3 right-near, 4 up-far, 5 down-far, 6 left-far, 7 right-far
static constexpr int2 dirs[8][11] = {
{{ 0,-1},{-1,-2},{ 1,-2},{-2,-3},{ 2,-3},{-3,-4},{ 3,-4}},
{{ 0, 1},{-1, 2},{ 1, 2},{-2, 3},{ 2, 3},{-3, 4},{ 3, 4}},
{{-1, 0},{-2,-1},{-2, 1},{-3,-2},{-3, 2},{-4,-3},{-4, 3}},
{{ 1, 0},{ 2,-1},{ 2, 1},{ 3,-2},{ 3, 2},{ 4,-3},{ 4, 3}},
{{0,-3},{0,-5},{0,-7},{0,-9},{0,-11},{0,-13},{0,-15},{0,-17},{0,-19},{0,-21},{0,-23}},
{{0, 3},{0, 5},{0, 7},{0, 9},{0, 11},{0, 13},{0, 15},{0, 17},{0, 19},{0, 21},{0, 23}},
{{-3,0},{-5,0},{-7,0},{-9,0},{-11,0},{-13,0},{-15,0},{-17,0},{-19,0},{-21,0},{-23,0}},
{{ 3,0},{ 5,0},{ 7,0},{ 9,0},{ 11,0},{ 13,0},{ 15,0},{ 17,0},{ 19,0},{ 21,0},{ 23,0}}
};
static constexpr int numDirs[8] = {7, 7, 7, 7, 11, 11, 11, 11};
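// for each of the 8 regions the neighbor with the lowest current cost is picked as the
// propagation candidate: the 4 near regions sample a small V-shaped set of offsets around the
// pixel, while the 4 far regions sample along the same row/column at odd offsets 3..23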
const int neighborPositions[4] = {
idx - width,
idx + width,
idx - 1,
idx + 1,
};
bool valid[8] = {false, false, false, false, false, false, false, false};
int positions[8];
float neighborDepths[8];
float costArray[8][MAX_VIEWS];
for (int posId=0; posId<8; ++posId) {
const int2* samples = dirs[posId];
Point2i bestNx; float bestConf(FLT_MAX);
for (int dirId=0; dirId<numDirs[posId]; ++dirId) {
const int2& offset = samples[dirId];
const Point2i np(p.x()+offset.x, p.y()+offset.y);
if (!(np.x()>=0 && np.y()>=0 && np.x()<width && np.y()<height))
continue;
const int nidx = Point2Idx(np, width);
const float nconf = costs[nidx];
if (bestConf > nconf) {
bestNx = np;
bestConf = nconf;
}
}
if (bestConf < FLT_MAX) {
valid[posId] = true;
positions[posId] = Point2Idx(bestNx, width);
neighborDepths[posId] = MultiViewScoreNeighborPlane(images, depthImages, cameras, p, bestNx, planes[positions[posId]], lowDepth, costArray[posId], params);
}
}
// multi-hypothesis view selection
float viewSelectionPriors[MAX_VIEWS] = {0.f}; // value-initializes all MAX_VIEWS entries to 0
for (int posId = 0; posId < 4; ++posId) {
if (valid[posId]) {
const unsigned selectedView = selectedViews[neighborPositions[posId]];
for (int j = 0; j < params.nNumViews; ++j)
viewSelectionPriors[j] += (IsBitSet(selectedView, j) ? 0.9f : 0.1f);
}
}
float samplingProbs[MAX_VIEWS];
constexpr float thCostBad = 1.2f;
const float thCost = 0.8f * exp(Square((float)iter) / (-2.f * 4.f*4.f));
for (int imgId = 0; imgId < params.nNumViews; ++imgId) {
float sumW = 0;
unsigned count = 0;
unsigned countBad = 0;
for (int posId = 0; posId < 8; posId++) {
if (valid[posId]) {
if (costArray[posId][imgId] < thCost) {
sumW += exp(Square(costArray[posId][imgId]) / (-2.f * 0.3f*0.3f));
++count;
} else if (costArray[posId][imgId] > thCostBad) {
++countBad;
}
}
}
if (count > 2 && countBad < 3) {
samplingProbs[imgId] = viewSelectionPriors[imgId] * sumW / count;
} else if (countBad < 3) {
samplingProbs[imgId] = viewSelectionPriors[imgId] * exp(Square(thCost) / (-2.f * 0.4f*0.4f));
} else {
samplingProbs[imgId] = 0.f;
}
}
PDF2CDF(samplingProbs, params.nNumViews);
unsigned viewWeights[MAX_VIEWS] = {0}; // value-initializes all MAX_VIEWS entries to 0
for (int sample = 0; sample < NUM_SAMPLES; ++sample) {
const float randProb = curand_uniform(randState);
for (int imgId = 0; imgId < params.nNumViews; ++imgId) {
if (samplingProbs[imgId] > randProb) {
++viewWeights[imgId];
break;
}
}
}
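// viewWeights[] now holds, for each source view, how many of the NUM_SAMPLES draws from the CDF
// built above selected it; views that were never drawn are excluded from the aggregated cost and
// from the selected-views bitmask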
// propagate best neighbor plane
Point4& plane = planes[idx];
float& cost = costs[idx];
unsigned newSelectedViews = 0;
for (int imgId = 0; imgId < params.nNumViews; ++imgId)
if (viewWeights[imgId])
SetBit(newSelectedViews, imgId);
float finalCosts[8];
for (int posId = 0; posId < 8; ++posId)
finalCosts[posId] = AggregateMultiViewScores(viewWeights, costArray[posId], params.nNumViews);
const int minCostIdx = FindMinIndex(finalCosts, 8);
float costVector[MAX_VIEWS];
MultiViewScorePlane(images, depthImages, cameras, p, plane, lowDepth, costVector, params);
cost = AggregateMultiViewScores(viewWeights, costVector, params.nNumViews);
if (finalCosts[minCostIdx] < cost && valid[minCostIdx]) {
plane = planes[positions[minCostIdx]];
plane.w() = neighborDepths[minCostIdx];
cost = finalCosts[minCostIdx];
selectedViews[idx] = newSelectedViews;
}
const float depth = plane.w();
// refine estimate
constexpr float perturbationDepth = 0.005f;
constexpr float perturbationNormal = 0.01f * (float)M_PI;
const float depthPerturbed = GeneratePerturbedDepth(depth, randState, perturbationDepth, params);
const Point3 perturbedNormal = GeneratePerturbedNormal(cameras[0], p, plane.topLeftCorner<3,1>(), randState, perturbationNormal);
const Point3 normalRand = GenerateRandomNormal(cameras[0], p, randState);
int numValidPlanes = 3;
Point3 surfaceNormal;
if (valid[0] && valid[1] && valid[2] && valid[3]) {
// estimate normal from surrounding surface
const Point4 ndepths(
planes[neighborPositions[0]].w(),
planes[neighborPositions[1]].w(),
planes[neighborPositions[2]].w(),
planes[neighborPositions[3]].w()
);
surfaceNormal = ComputeDepthGradient(cameras[0].K, depth, p, ndepths);
numValidPlanes = 4;
}
constexpr int numPlanes = 4;
const float depths[numPlanes] = {depthPerturbed, depth, depth, depth};
const Point3 normals[numPlanes] = {plane.topLeftCorner<3,1>(), perturbedNormal, normalRand, surfaceNormal};
for (int i = 0; i < numValidPlanes; ++i) {
Point4 newPlane;
newPlane.topLeftCorner<3,1>() = normals[i];
newPlane.w() = depths[i];
MultiViewScorePlane(images, depthImages, cameras, p, newPlane, lowDepth, costVector, params);
const float costPlane = AggregateMultiViewScores(viewWeights, costVector, params.nNumViews);
if (cost > costPlane) {
cost = costPlane;
plane = newPlane;
}
}
}
// compute the score of the current plane estimate
__device__ void InitializePixelScore(const ImagePixels *images, const ImagePixels* depthImages, const PatchMatchCUDA::Camera* cameras, Point4* planes, const float* lowDepths, float* costs, RandState* randStates, unsigned* selectedViews, const Point2i& p, const PatchMatchCUDA::Params params)
{
const int width = cameras[0].width;
const int height = cameras[0].height;
if (p.x() >= width || p.y() >= height)
return;
const int idx = Point2Idx(p, width);
float lowDepth = 0;
if (params.bLowResProcessed)
lowDepth = lowDepths[idx];
// initialize estimate randomly if not set
RandState* randState = &randStates[idx];
curand_init(1234/*threadIdx.x*/, p.y(), p.x(), randState);
Point4& plane = planes[idx];
float depth = plane.w();
if (depth <= 0.f) {
// generate random plane
plane.topLeftCorner<3,1>() = GenerateRandomNormal(cameras[0], p, randState);
plane.w() = curand_uniform(randState) * (params.fDepthMax - params.fDepthMin) + params.fDepthMin;
} else if (plane.topLeftCorner<3,1>().dot(ViewDirection(cameras[0], p)) >= 0.f) {
// generate random normal
plane.topLeftCorner<3,1>() = GenerateRandomNormal(cameras[0], p, randState);
}
// compute costs
float costVector[MAX_VIEWS];
MultiViewScorePlane(images, depthImages, cameras, p, plane, lowDepth, costVector, params);
// select best views
float costVectorSorted[MAX_VIEWS];
Sort(costVector, costVectorSorted, params.nNumViews);
float cost = 0.f;
for (int i = 0; i < params.nInitTopK; ++i)
cost += costVectorSorted[i];
const float costThreshold = costVectorSorted[params.nInitTopK - 1];
unsigned& selectedView = selectedViews[idx];
selectedView = 0;
for (int imgId = 0; imgId < params.nNumViews; ++imgId)
if (costVector[imgId] <= costThreshold)
SetBit(selectedView, imgId);
costs[idx] = cost / params.nInitTopK;
}
__global__ void InitializeScore(const cudaTextureObject_t* textureImages, const cudaTextureObject_t* textureDepths, const PatchMatchCUDA::Camera* cameras, Point4* planes, const float* lowDepths, float* costs, curandState* randStates, unsigned* selectedViews, const PatchMatchCUDA::Params params)
{
const Point2i p = Point2i(blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y);
InitializePixelScore((const ImagePixels*)textureImages, (const ImagePixels*)textureDepths, cameras, planes, lowDepths, costs, (RandState*)randStates, selectedViews, p, params);
}
// traverse the image in a black/red checkerboard pattern
__global__ void BlackPixelProcess(const cudaTextureObject_t* textureImages, const cudaTextureObject_t* textureDepths, const PatchMatchCUDA::Camera* cameras, Point4* planes, const float* lowDepths, float* costs, curandState* randStates, unsigned* selectedViews, const PatchMatchCUDA::Params params, const int iter)
{
Point2i p = Point2i(blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y);
p.y() = p.y() * 2 + (threadIdx.x % 2 == 0 ? 0 : 1);
ProcessPixel((const ImagePixels*)textureImages, (const ImagePixels*)textureDepths, cameras, planes, lowDepths, costs, (RandState*)randStates, selectedViews, p, params, iter);
}
__global__ void RedPixelProcess(const cudaTextureObject_t* textureImages, const cudaTextureObject_t* textureDepths, const PatchMatchCUDA::Camera* cameras, Point4* planes, const float* lowDepths, float* costs, curandState* randStates, unsigned* selectedViews, const PatchMatchCUDA::Params params, const int iter)
{
Point2i p = Point2i(blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y);
p.y() = p.y() * 2 + (threadIdx.x % 2 == 0 ? 1 : 0);
ProcessPixel((const ImagePixels*)textureImages, (const ImagePixels*)textureDepths, cameras, planes, lowDepths, costs, (RandState*)randStates, selectedViews, p, params, iter);
}
// filter depth/normals
__global__ void FilterPlanes(Point4* planes, float* costs, unsigned* selectedViews, int width, int height, const PatchMatchCUDA::Params params)
{
const Point2i p = Point2i(blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y);
if (p.x() >= width || p.y() >= height)
return;
const int idx = Point2Idx(p, width);
// filter estimates if the score is not good enough
Point4& plane = planes[idx];
float conf = costs[idx];
if (plane.w() <= 0 || conf >= params.fThresholdKeepCost) {
conf = 0;
plane = Point4::Zero();
selectedViews[idx] = 0;
}
}
/*----------------------------------------------------------------*/
__host__ void PatchMatchCUDA::RunCUDA(float* ptrCostMap, uint32_t* ptrViewsMap)
{
const unsigned width = cameras[0].width;
const unsigned height = cameras[0].height;
constexpr unsigned BLOCK_W = 32;
constexpr unsigned BLOCK_H = (BLOCK_W / 2);
const dim3 blockSize(BLOCK_W, BLOCK_H, 1);
const dim3 gridSizeFull((width + BLOCK_W - 1) / BLOCK_W, (height + BLOCK_H - 1) / BLOCK_H, 1);
const dim3 gridSizeCheckerboard((width + BLOCK_W - 1) / BLOCK_W, ((height / 2) + BLOCK_H - 1) / BLOCK_H, 1);
InitializeScore<<<gridSizeFull, blockSize>>>(cudaTextureImages, cudaTextureDepths, cudaCameras, cudaDepthNormalEstimates, cudaLowDepths, cudaDepthNormalCosts, cudaRandStates, cudaSelectedViews, params);
cudaDeviceSynchronize();
for (int iter = 0; iter < params.nEstimationIters; ++iter) {
BlackPixelProcess<<<gridSizeCheckerboard, blockSize>>>(cudaTextureImages, cudaTextureDepths, cudaCameras, cudaDepthNormalEstimates, cudaLowDepths, cudaDepthNormalCosts, cudaRandStates, cudaSelectedViews, params, iter);
cudaDeviceSynchronize();
RedPixelProcess<<<gridSizeCheckerboard, blockSize>>>(cudaTextureImages, cudaTextureDepths, cudaCameras, cudaDepthNormalEstimates, cudaLowDepths, cudaDepthNormalCosts, cudaRandStates, cudaSelectedViews, params, iter);
cudaDeviceSynchronize();
}
if (params.fThresholdKeepCost > 0)
FilterPlanes<<<gridSizeFull, blockSize>>>(cudaDepthNormalEstimates, cudaDepthNormalCosts, cudaSelectedViews, width, height, params);
cudaMemcpy(depthNormalEstimates, cudaDepthNormalEstimates, sizeof(Point4) * width * height, cudaMemcpyDeviceToHost);
if (ptrCostMap)
cudaMemcpy(ptrCostMap, cudaDepthNormalCosts, sizeof(float) * width * height, cudaMemcpyDeviceToHost);
if (ptrViewsMap)
cudaMemcpy(ptrViewsMap, cudaSelectedViews, sizeof(uint32_t) * width * height, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
}
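/* Note: the kernel launches and cudaMemcpy calls above are not checked for errors. A minimal,
 * hypothetical sketch of how the launch sequence could be wrapped (the CUDA_CHECK macro below is
 * an assumption for illustration, not part of this file):
 *
 *   #define CUDA_CHECK(call) do { \
 *       const cudaError_t e = (call); \
 *       if (e != cudaSuccess) \
 *           printf("CUDA error %s at %s:%d\n", cudaGetErrorString(e), __FILE__, __LINE__); \
 *   } while (0)
 *
 *   InitializeScore<<<gridSizeFull, blockSize>>>(...);
 *   CUDA_CHECK(cudaGetLastError());
 *   CUDA_CHECK(cudaDeviceSynchronize());
 */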
/*----------------------------------------------------------------*/