You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2866 lines
89 KiB
2866 lines
89 KiB
/* |
|
* SceneRefineCUDA.cpp |
|
* |
|
* Copyright (c) 2014-2015 SEACAVE |
|
* |
|
* Author(s): |
|
* |
|
* cDc <cdc.seacave@gmail.com> |
|
* |
|
* |
|
* This program is free software: you can redistribute it and/or modify |
|
* it under the terms of the GNU Affero General Public License as published by |
|
* the Free Software Foundation, either version 3 of the License, or |
|
* (at your option) any later version. |
|
* |
|
* This program is distributed in the hope that it will be useful, |
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
* GNU Affero General Public License for more details. |
|
* |
|
* You should have received a copy of the GNU Affero General Public License |
|
* along with this program. If not, see <http://www.gnu.org/licenses/>. |
|
* |
|
* |
|
* Additional Terms: |
|
* |
|
* You are required to preserve legal notices and author attributions in |
|
* that material or in the Appropriate Legal Notices displayed by works |
|
* containing it. |
|
*/ |
|
|
|
#include "Common.h" |
|
#include "Scene.h" |
|
|
|
using namespace MVS; |
|
|
|
#ifdef _USE_CUDA |
|
|
|
// D E F I N E S /////////////////////////////////////////////////// |
|
|
|
// uncomment to enable multi-threading based on OpenMP |
|
#ifdef _USE_OPENMP |
|
#define MESHCUDAOPT_USE_OPENMP |
|
#endif |
|
|
|
// uncomment to ensure edge size and improve vertex valence |
|
// (should enable more stable flow) |
|
#define MESHOPT_ENSUREEDGESIZE 1 // 0 - at all resolution |
|
|
|
|
|
// S T R U C T S /////////////////////////////////////////////////// |
|
|
|
static LPCSTR const g_szMeshRefineModule = |
|
".version 3.2\n" |
|
".target sm_20\n" |
|
".address_size 64\n" |
|
"\n" |
|
".global .texref texImageRef;\n" |
|
".global .surfref surfImageRef;\n" |
|
".global .surfref surfImageProjRef;\n" |
|
"\n" |
|
// kernel used to project the given mesh to a given camera plane: |
|
// the depth-map is computed by rasterizing all triangles (using a brute force scan-line approach) |
|
// and storing only the closest ones; |
|
// additionally the face index and barycentric coordinates are stored for each pixel |
|
".visible .entry ProjectMesh(\n" |
|
" .param .u64 .ptr param_1, // array vertices (float*3 * numVertices)\n" |
|
" .param .u64 .ptr param_2, // array faces (uint32_t*3 * numFaces)\n" |
|
" .param .u64 .ptr param_3, // array face IDs (uint32_t * numFacesView)\n" |
|
" .param .u64 .ptr param_4, // depth-map (float) [out]\n" |
|
" .param .u64 .ptr param_5, // face-map (uint32_t) [out]\n" |
|
" .param .u64 .ptr param_6, // bary-map (hfloat*3) [out]\n" |
|
" .param .align 4 .b8 param_7[176], // camera\n" |
|
" .param .u32 param_8 // numFacesView (uint32_t)\n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<234>;\n" |
|
" .reg .pred %p<20>;\n" |
|
" .reg .s16 %rs<5>;\n" |
|
" .reg .s32 %r<105>;\n" |
|
" .reg .s64 %rl<42>;\n" |
|
"\n" |
|
" ld.param.u32 %r16, [param_8];\n" |
|
" mov.u32 %r17, %ntid.x;\n" |
|
" mov.u32 %r18, %ctaid.x;\n" |
|
" mov.u32 %r19, %tid.x;\n" |
|
" mad.lo.s32 %r5, %r17, %r18, %r19;\n" |
|
" setp.ge.s32 %p3, %r5, %r16;\n" |
|
" @%p3 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u64 %rl4, [param_3];\n" |
|
" cvta.to.global.u64 %rl3, %rl4;\n" |
|
" mul.wide.u32 %rl5, %r5, 4;\n" |
|
" add.u64 %rl6, %rl3, %rl5;\n" |
|
" ld.global.u32 %r6, [%rl6];\n" |
|
" mul.lo.s32 %r20, %r6, 3;\n" |
|
" ld.param.u64 %rl36, [param_2];\n" |
|
" cvta.to.global.u64 %rl10, %rl36;\n" |
|
" mul.wide.s32 %rl11, %r20, 4;\n" |
|
" add.s64 %rl12, %rl10, %rl11;\n" |
|
" mad.lo.s32 %r21, %r6, 3, 1;\n" |
|
" mul.wide.s32 %rl13, %r21, 4;\n" |
|
" add.s64 %rl14, %rl10, %rl13;\n" |
|
" ld.global.u32 %r22, [%rl12];\n" |
|
" mul.lo.s32 %r24, %r22, 3;\n" |
|
" ld.param.u64 %rl35, [param_1];\n" |
|
" cvta.to.global.u64 %rl15, %rl35;\n" |
|
" mul.wide.s32 %rl16, %r24, 4;\n" |
|
" add.s64 %rl17, %rl15, %rl16;\n" |
|
" ld.global.u32 %r25, [%rl14];\n" |
|
" mul.lo.s32 %r27, %r25, 3;\n" |
|
" mul.wide.s32 %rl18, %r27, 4;\n" |
|
" add.s64 %rl19, %rl15, %rl18;\n" |
|
" ld.global.f32 %f33, [%rl19];\n" |
|
" ld.global.f32 %f34, [%rl19+4];\n" |
|
" ld.global.f32 %f35, [%rl19+8];\n" |
|
" ld.global.u32 %r31, [%rl14+4];\n" |
|
" mul.lo.s32 %r33, %r31, 3;\n" |
|
" mul.wide.s32 %rl20, %r33, 4;\n" |
|
" add.s64 %rl21, %rl15, %rl20;\n" |
|
" ld.global.f32 %f36, [%rl21];\n" |
|
" ld.global.f32 %f37, [%rl21+4];\n" |
|
" ld.global.f32 %f38, [%rl21+8];\n" |
|
" ld.global.f32 %f39, [%rl17];\n" |
|
" ld.global.f32 %f40, [%rl17+4];\n" |
|
" ld.param.f32 %f217, [param_7+4];\n" |
|
" mul.f32 %f95, %f217, %f40;\n" |
|
" ld.param.f32 %f220, [param_7];\n" |
|
" fma.rn.f32 %f96, %f220, %f39, %f95;\n" |
|
" ld.global.f32 %f41, [%rl17+8];\n" |
|
" ld.param.f32 %f214, [param_7+8];\n" |
|
" fma.rn.f32 %f97, %f214, %f41, %f96;\n" |
|
" ld.param.f32 %f211, [param_7+12];\n" |
|
" add.f32 %f42, %f97, %f211;\n" |
|
" ld.param.f32 %f205, [param_7+20];\n" |
|
" mul.f32 %f98, %f205, %f40;\n" |
|
" ld.param.f32 %f208, [param_7+16];\n" |
|
" fma.rn.f32 %f99, %f208, %f39, %f98;\n" |
|
" ld.param.f32 %f202, [param_7+24];\n" |
|
" fma.rn.f32 %f100, %f202, %f41, %f99;\n" |
|
" ld.param.f32 %f197, [param_7+28];\n" |
|
" add.f32 %f43, %f100, %f197;\n" |
|
" ld.param.f32 %f191, [param_7+36];\n" |
|
" mul.f32 %f101, %f191, %f40;\n" |
|
" ld.param.f32 %f194, [param_7+32];\n" |
|
" fma.rn.f32 %f102, %f194, %f39, %f101;\n" |
|
" ld.param.f32 %f188, [param_7+40];\n" |
|
" fma.rn.f32 %f103, %f188, %f41, %f102;\n" |
|
" ld.param.f32 %f185, [param_7+44];\n" |
|
" add.f32 %f44, %f103, %f185;\n" |
|
" setp.gt.f32 %p4, %f44, 0f00000000;\n" |
|
" @%p4 bra BB00_8;\n" |
|
"\n" |
|
" mov.f32 %f222, 0fBF800000;\n" |
|
" mov.f32 %f221, %f222;\n" |
|
" bra.uni BB00_9;\n" |
|
"\n" |
|
" BB00_8:\n" |
|
" div.rn.f32 %f221, %f42, %f44;\n" |
|
" div.rn.f32 %f222, %f43, %f44;\n" |
|
"\n" |
|
" BB00_9:\n" |
|
" ld.param.f32 %f216, [param_7+4];\n" |
|
" mul.f32 %f106, %f216, %f34;\n" |
|
" ld.param.f32 %f219, [param_7];\n" |
|
" fma.rn.f32 %f107, %f219, %f33, %f106;\n" |
|
" ld.param.f32 %f213, [param_7+8];\n" |
|
" fma.rn.f32 %f108, %f213, %f35, %f107;\n" |
|
" ld.param.f32 %f210, [param_7+12];\n" |
|
" add.f32 %f49, %f108, %f210;\n" |
|
" ld.param.f32 %f204, [param_7+20];\n" |
|
" mul.f32 %f109, %f204, %f34;\n" |
|
" ld.param.f32 %f207, [param_7+16];\n" |
|
" fma.rn.f32 %f110, %f207, %f33, %f109;\n" |
|
" ld.param.f32 %f201, [param_7+24];\n" |
|
" fma.rn.f32 %f111, %f201, %f35, %f110;\n" |
|
" ld.param.f32 %f199, [param_7+28];\n" |
|
" add.f32 %f50, %f111, %f199;\n" |
|
" ld.param.f32 %f193, [param_7+36];\n" |
|
" mul.f32 %f112, %f193, %f34;\n" |
|
" ld.param.f32 %f196, [param_7+32];\n" |
|
" fma.rn.f32 %f113, %f196, %f33, %f112;\n" |
|
" ld.param.f32 %f190, [param_7+40];\n" |
|
" fma.rn.f32 %f114, %f190, %f35, %f113;\n" |
|
" ld.param.f32 %f187, [param_7+44];\n" |
|
" add.f32 %f51, %f114, %f187;\n" |
|
" setp.gt.f32 %p5, %f51, 0f00000000;\n" |
|
" @%p5 bra BB00_10;\n" |
|
"\n" |
|
" mov.f32 %f224, 0fBF800000;\n" |
|
" mov.f32 %f223, %f224;\n" |
|
" bra.uni BB00_11;\n" |
|
"\n" |
|
" BB00_10:\n" |
|
" div.rn.f32 %f223, %f49, %f51;\n" |
|
" div.rn.f32 %f224, %f50, %f51;\n" |
|
"\n" |
|
" BB00_11:\n" |
|
" ld.param.f32 %f215, [param_7+4];\n" |
|
" mul.f32 %f117, %f215, %f37;\n" |
|
" ld.param.f32 %f218, [param_7];\n" |
|
" fma.rn.f32 %f118, %f218, %f36, %f117;\n" |
|
" ld.param.f32 %f212, [param_7+8];\n" |
|
" fma.rn.f32 %f119, %f212, %f38, %f118;\n" |
|
" ld.param.f32 %f209, [param_7+12];\n" |
|
" add.f32 %f56, %f119, %f209;\n" |
|
" ld.param.f32 %f203, [param_7+20];\n" |
|
" mul.f32 %f120, %f203, %f37;\n" |
|
" ld.param.f32 %f206, [param_7+16];\n" |
|
" fma.rn.f32 %f121, %f206, %f36, %f120;\n" |
|
" ld.param.f32 %f200, [param_7+24];\n" |
|
" fma.rn.f32 %f122, %f200, %f38, %f121;\n" |
|
" ld.param.f32 %f198, [param_7+28];\n" |
|
" add.f32 %f57, %f122, %f198;\n" |
|
" ld.param.f32 %f192, [param_7+36];\n" |
|
" mul.f32 %f123, %f192, %f37;\n" |
|
" ld.param.f32 %f195, [param_7+32];\n" |
|
" fma.rn.f32 %f124, %f195, %f36, %f123;\n" |
|
" ld.param.f32 %f189, [param_7+40];\n" |
|
" fma.rn.f32 %f125, %f189, %f38, %f124;\n" |
|
" ld.param.f32 %f186, [param_7+44];\n" |
|
" add.f32 %f58, %f125, %f186;\n" |
|
" setp.gt.f32 %p6, %f58, 0f00000000;\n" |
|
" @%p6 bra BB00_12;\n" |
|
"\n" |
|
" mov.f32 %f226, 0fBF800000;\n" |
|
" mov.f32 %f225, %f226;\n" |
|
" bra.uni BB00_13;\n" |
|
"\n" |
|
" BB00_12:\n" |
|
" div.rn.f32 %f225, %f56, %f58;\n" |
|
" div.rn.f32 %f226, %f57, %f58;\n" |
|
"\n" |
|
" BB00_13:\n" |
|
" add.f32 %f2, %f221, 0fBF000000;\n" |
|
" cvt.rzi.s32.f32 %r40, %f2;\n" |
|
" add.f32 %f4, %f223, 0fBF000000;\n" |
|
" cvt.rzi.s32.f32 %r41, %f4;\n" |
|
" min.s32 %r42, %r40, %r41;\n" |
|
" add.f32 %f6, %f225, 0fBF000000;\n" |
|
" cvt.rzi.s32.f32 %r43, %f6;\n" |
|
" min.s32 %r44, %r42, %r43;\n" |
|
" add.s32 %r103, %r44, -1;\n" |
|
" add.f32 %f7, %f221, 0f3F000000;\n" |
|
" cvt.rzi.s32.f32 %r45, %f7;\n" |
|
" add.f32 %f8, %f223, 0f3F000000;\n" |
|
" cvt.rzi.s32.f32 %r46, %f8;\n" |
|
" max.s32 %r47, %r45, %r46;\n" |
|
" add.f32 %f9, %f225, 0f3F000000;\n" |
|
" cvt.rzi.s32.f32 %r48, %f9;\n" |
|
" max.s32 %r49, %r47, %r48;\n" |
|
" add.s32 %r7, %r49, 1;\n" |
|
" add.f32 %f11, %f222, 0fBF000000;\n" |
|
" cvt.rzi.s32.f32 %r50, %f11;\n" |
|
" add.f32 %f13, %f224, 0fBF000000;\n" |
|
" cvt.rzi.s32.f32 %r51, %f13;\n" |
|
" min.s32 %r52, %r50, %r51;\n" |
|
" add.f32 %f15, %f226, 0fBF000000;\n" |
|
" cvt.rzi.s32.f32 %r53, %f15;\n" |
|
" min.s32 %r54, %r52, %r53;\n" |
|
" add.s32 %r8, %r54, -1;\n" |
|
" add.f32 %f16, %f222, 0f3F000000;\n" |
|
" cvt.rzi.s32.f32 %r55, %f16;\n" |
|
" add.f32 %f17, %f224, 0f3F000000;\n" |
|
" cvt.rzi.s32.f32 %r56, %f17;\n" |
|
" max.s32 %r57, %r55, %r56;\n" |
|
" add.f32 %f18, %f226, 0f3F000000;\n" |
|
" cvt.rzi.s32.f32 %r58, %f18;\n" |
|
" max.s32 %r59, %r57, %r58;\n" |
|
" add.s32 %r9, %r59, 1;\n" |
|
" mov.s32 %r60, 10;\n" |
|
" setp.lt.s32 %p7, %r103, %r60;\n" |
|
" @%p7 bra BB00_1;\n" |
|
"\n" |
|
" mov.s32 %r61, 10;\n" |
|
" setp.lt.s32 %p8, %r8, %r61;\n" |
|
" @%p8 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u32 %r101, [param_7+168];\n" |
|
" add.s32 %r63, %r101, -10;\n" |
|
" setp.gt.s32 %p9, %r7, %r63;\n" |
|
" @%p9 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u32 %r102, [param_7+172];\n" |
|
" add.s32 %r65, %r102, -10;\n" |
|
" setp.gt.s32 %p10, %r9, %r65;\n" |
|
" @%p10 bra BB00_1;\n" |
|
"\n" |
|
" sub.f32 %f128, %f224, %f222;\n" |
|
" sub.f32 %f129, %f225, %f221;\n" |
|
" mul.f32 %f130, %f128, %f129;\n" |
|
" sub.f32 %f131, %f226, %f222;\n" |
|
" sub.f32 %f132, %f223, %f221;\n" |
|
" neg.f32 %f133, %f132;\n" |
|
" fma.rn.f32 %f134, %f133, %f131, %f130;\n" |
|
" rcp.rn.f32 %f63, %f134;\n" |
|
" sub.f32 %f153, %f221, %f223;\n" |
|
" mul.f32 %f67, %f63, %f153;\n" |
|
" mul.f32 %f68, %f63, %f128;\n" |
|
" mul.f32 %f69, %f63, %f129;\n" |
|
" sub.f32 %f154, %f222, %f226;\n" |
|
" mul.f32 %f70, %f63, %f154;\n" |
|
" cvt.rn.f32.s32 %f155, %r103;\n" |
|
" sub.f32 %f156, %f155, %f225;\n" |
|
" mul.f32 %f157, %f154, %f156;\n" |
|
" cvt.rn.f32.s32 %f158, %r8;\n" |
|
" sub.f32 %f159, %f158, %f226;\n" |
|
" sub.f32 %f160, %f221, %f225;\n" |
|
" neg.f32 %f161, %f160;\n" |
|
" fma.rn.f32 %f162, %f161, %f159, %f157;\n" |
|
" mul.f32 %f232, %f63, %f162;\n" |
|
" sub.f32 %f163, %f155, %f221;\n" |
|
" mul.f32 %f164, %f128, %f163;\n" |
|
" sub.f32 %f165, %f158, %f222;\n" |
|
" fma.rn.f32 %f166, %f133, %f165, %f164;\n" |
|
" mul.f32 %f229, %f63, %f166;\n" |
|
" setp.gt.s32 %p11, %r103, %r7;\n" |
|
" @%p11 bra BB00_1;\n" |
|
"\n" |
|
" setp.gt.s32 %p1, %r8, %r9;\n" |
|
" setp.gt.f32 %p2, %f63, 0f00000000;\n" |
|
" ld.param.u64 %rl37, [param_4];\n" |
|
" cvta.to.global.u64 %rl22, %rl37;\n" |
|
" ld.param.u64 %rl39, [param_5];\n" |
|
" cvta.to.global.u64 %rl28, %rl39;\n" |
|
" ld.param.u64 %rl40, [param_6];\n" |
|
" cvta.to.global.u64 %rl31, %rl40;\n" |
|
"\n" |
|
" BB00_2:\n" |
|
" mov.f32 %f230, %f232;\n" |
|
" mov.f32 %f74, %f230;\n" |
|
" mov.f32 %f227, %f229;\n" |
|
" mov.f32 %f73, %f227;\n" |
|
" @%p1 bra BB00_6;\n" |
|
"\n" |
|
" mov.u32 %r11, %r8;\n" |
|
" mov.f32 %f75, %f73;\n" |
|
" mov.f32 %f76, %f74;\n" |
|
"\n" |
|
" BB00_7:\n" |
|
" setp.ge.f32 %p12, %f75, 0f00000000;\n" |
|
" setp.ge.f32 %p13, %f76, 0f00000000;\n" |
|
" and.pred %p14, %p13, %p12;\n" |
|
" @!%p14 bra BB00_5;\n" |
|
"\n" |
|
" add.f32 %f167, %f76, %f75;\n" |
|
" setp.gtu.f32 %p15, %f167, 0f3F800000;\n" |
|
" @%p15 bra BB00_5;\n" |
|
"\n" |
|
" sub.f32 %f77, 0f3F800000, %f76;\n" |
|
" sub.f32 %f77, %f77, %f75;\n" |
|
" mul.f32 %f171, %f76, %f51;\n" |
|
" fma.rn.f32 %f172, %f77, %f44, %f171;\n" |
|
" fma.rn.f32 %f78, %f75, %f58, %f172;\n" |
|
" ld.param.u32 %r100, [param_7+168];\n" |
|
" mad.lo.s32 %r67, %r11, %r100, %r103;\n" |
|
" mul.wide.s32 %rl23, %r67, 4;\n" |
|
" mov.b32 %r13, %f78;\n" |
|
" add.s64 %rl24, %rl22, %rl23;\n" |
|
" ld.global.f32 %f233, [%rl24];\n" |
|
"\n" |
|
" BB00_3:\n" |
|
" setp.lt.f32 %p16, %f233, %f78;\n" |
|
" @%p16 bra BB00_5;\n" |
|
"\n" |
|
" mov.b32 %r72, %f233;\n" |
|
" add.s64 %rl27, %rl22, %rl23;\n" |
|
" atom.global.cas.b32 %r73, [%rl27], %r72, %r13;\n" |
|
" ld.global.f32 %f80, [%rl27];\n" |
|
" setp.neu.f32 %p17, %f233, %f80;\n" |
|
" mov.f32 %f233, %f80;\n" |
|
" @%p17 bra BB00_3;\n" |
|
"\n" |
|
" add.s64 %rl7, %rl28, %rl23;\n" |
|
" mul.wide.s32 %rl30, %r67, 6;\n" |
|
" add.s64 %rl8, %rl31, %rl30;\n" |
|
" @%p2 bra BB00_4;\n" |
|
"\n" |
|
" mov.s32 %r77, -1;\n" |
|
" st.global.u32 [%rl7], %r77;\n" |
|
" mov.u16 %rs4, 0;\n" |
|
" st.global.b16 [%rl8], %rs4;\n" |
|
" st.global.b16 [%rl8+2], %rs4;\n" |
|
" st.global.b16 [%rl8+4], %rs4;\n" |
|
" bra.uni BB00_5;\n" |
|
"\n" |
|
" BB00_4:\n" |
|
" st.global.u32 [%rl7], %r6;\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" cvt.rn.f16.f32 %temp, %f77;\n" |
|
" mov.b16 %rs1, %temp;\n" |
|
" }\n" |
|
" st.global.b16 [%rl8], %rs1;\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" cvt.rn.f16.f32 %temp, %f76;\n" |
|
" mov.b16 %rs2, %temp;\n" |
|
" }\n" |
|
" st.global.b16 [%rl8+2], %rs2;\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" cvt.rn.f16.f32 %temp, %f75;\n" |
|
" mov.b16 %rs3, %temp;\n" |
|
" }\n" |
|
" st.global.b16 [%rl8+4], %rs3;\n" |
|
"\n" |
|
" BB00_5:\n" |
|
" add.f32 %f76, %f76, %f69;\n" |
|
" add.f32 %f75, %f75, %f67;\n" |
|
" add.s32 %r11, %r11, 1;\n" |
|
" setp.le.s32 %p18, %r11, %r9;\n" |
|
" @%p18 bra BB00_7;\n" |
|
"\n" |
|
" BB00_6:\n" |
|
" add.f32 %f83, %f74, %f70;\n" |
|
" add.f32 %f84, %f73, %f68;\n" |
|
" add.s32 %r103, %r103, 1;\n" |
|
" setp.le.s32 %p19, %r103, %r7;\n" |
|
" mov.f32 %f229, %f84;\n" |
|
" mov.f32 %f232, %f83;\n" |
|
" @%p19 bra BB00_2;\n" |
|
"\n" |
|
" BB00_1:\n" |
|
" ret;\n" |
|
"}\n" |
|
"\n" |
|
// kernel used to invalidate pixels that don't have valid both depth and face index |
|
".visible .entry CrossCheckProjection(\n" |
|
" .param .u64 .ptr param_1, // depth-map (float) [in/out]\n" |
|
" .param .u64 .ptr param_2, // face-map (uint32_t) [in/out]\n" |
|
" .param .u32 param_3, // width\n" |
|
" .param .u32 param_4 // height\n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<2>;\n" |
|
" .reg .pred %p<10>;\n" |
|
" .reg .s32 %r<14>;\n" |
|
" .reg .s64 %rl<8>;\n" |
|
"\n" |
|
" ld.param.u32 %r1, [param_3];\n" |
|
" ld.param.u32 %r2, [param_4];\n" |
|
" mov.u32 %r6, %ntid.x;\n" |
|
" mov.u32 %r7, %ctaid.x;\n" |
|
" mov.u32 %r8, %tid.x;\n" |
|
" mad.lo.s32 %r3, %r6, %r7, %r8;\n" |
|
" mov.u32 %r9, %ntid.y;\n" |
|
" mov.u32 %r10, %ctaid.y;\n" |
|
" mov.u32 %r11, %tid.y;\n" |
|
" mad.lo.s32 %r4, %r9, %r10, %r11;\n" |
|
" setp.gt.s32 %p1, %r3, -1;\n" |
|
" setp.lt.s32 %p2, %r3, %r1;\n" |
|
" and.pred %p3, %p1, %p2;\n" |
|
" setp.gt.s32 %p4, %r4, -1;\n" |
|
" and.pred %p5, %p3, %p4;\n" |
|
" setp.lt.s32 %p6, %r4, %r2;\n" |
|
" and.pred %p7, %p5, %p6;\n" |
|
" @!%p7 bra BB00_1;\n" |
|
"\n" |
|
" mad.lo.s32 %r12, %r4, %r1, %r3;\n" |
|
" mul.wide.s32 %rl7, %r12, 4;\n" |
|
"\n" |
|
" ld.param.u64 %rl2, [param_1];\n" |
|
" cvta.to.global.u64 %rl1, %rl2;\n" |
|
" add.s64 %rl3, %rl1, %rl7;\n" |
|
"\n" |
|
" ld.param.u64 %rl5, [param_2];\n" |
|
" cvta.to.global.u64 %rl4, %rl5;\n" |
|
" add.s64 %rl6, %rl4, %rl7;\n" |
|
"\n" |
|
" ld.global.f32 %f1, [%rl3];\n" |
|
" setp.eq.f32 %p8, %f1, 0f7F7FFFFF;\n" |
|
" @%p8 bra BB00_2;\n" |
|
"\n" |
|
" ld.global.s32 %r13, [%rl6];\n" |
|
" setp.eq.s32 %p9, %r13, -1;\n" |
|
" @%p9 bra BB00_2;\n" |
|
"\n" |
|
" ret;\n" |
|
"\n" |
|
" BB00_2:\n" |
|
" st.global.f32 [%rl3], 0f00000000;\n" |
|
" st.global.s32 [%rl6], -1;\n" |
|
"\n" |
|
" BB00_1:\n" |
|
" ret;\n" |
|
"}\n" |
|
"\n" |
|
// kernel used to project image from view B to view A through the mesh; |
|
// additionally the mask is computed |
|
".visible .entry ImageMeshWarp(\n" |
|
" .param .u64 .ptr param_1, // depth-map A (float)\n" |
|
" .param .u64 .ptr param_2, // depth-map B (float)\n" |
|
" .param .u64 .ptr param_3, // mask [out]\n" |
|
" .param .align 4 .b8 param_4[176], // camera A \n" |
|
" .param .align 4 .b8 param_5[176] // camera B \n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<187>;\n" |
|
" .reg .pred %p<19>;\n" |
|
" .reg .s16 %rs<2>;\n" |
|
" .reg .s32 %r<36>;\n" |
|
" .reg .s64 %rl<23>;\n" |
|
" .reg .s16 %rc<2>;\n" |
|
"\n" |
|
" ld.param.u32 %r1, [param_4+168];\n" |
|
" ld.param.u32 %r2, [param_4+172];\n" |
|
" mov.u32 %r6, %ntid.x;\n" |
|
" mov.u32 %r7, %ctaid.x;\n" |
|
" mov.u32 %r8, %tid.x;\n" |
|
" mad.lo.s32 %r3, %r6, %r7, %r8;\n" |
|
" mov.u32 %r9, %ntid.y;\n" |
|
" mov.u32 %r10, %ctaid.y;\n" |
|
" mov.u32 %r11, %tid.y;\n" |
|
" mad.lo.s32 %r4, %r9, %r10, %r11;\n" |
|
" setp.gt.s32 %p1, %r3, -1;\n" |
|
" setp.lt.s32 %p2, %r3, %r1;\n" |
|
" and.pred %p3, %p1, %p2;\n" |
|
" setp.gt.s32 %p4, %r4, -1;\n" |
|
" and.pred %p5, %p3, %p4;\n" |
|
" setp.lt.s32 %p6, %r4, %r2;\n" |
|
" and.pred %p7, %p5, %p6;\n" |
|
" @!%p7 bra BB00_0;\n" |
|
"\n" |
|
" mov.f32 %f141, 0f00000000;\n" |
|
" mov.u16 %rc1, 0;\n" |
|
" shl.b32 %r13, %r3, 1;\n" |
|
" ld.param.u64 %rl18, [param_1];\n" |
|
" cvta.to.global.u64 %rl10, %rl18;\n" |
|
" mad.lo.s32 %r12, %r4, %r1, %r3;\n" |
|
" mul.wide.s32 %rl9, %r12, 4;\n" |
|
" add.s64 %rl11, %rl10, %rl9;\n" |
|
" ld.global.f32 %f51, [%rl11];\n" |
|
" setp.gt.f32 %p8, %f51, 0f00000000;\n" |
|
" @!%p8 bra BB00_1;\n" |
|
"\n" |
|
" cvt.rn.f32.s32 %f88, %r3;\n" |
|
" ld.param.f32 %f147, [param_4+104];\n" |
|
" sub.f32 %f89, %f88, %f147;\n" |
|
" ld.param.f32 %f148, [param_4+96];\n" |
|
" div.rn.f32 %f90, %f89, %f148;\n" |
|
" cvt.rn.f32.s32 %f91, %r4;\n" |
|
" ld.param.f32 %f145, [param_4+116];\n" |
|
" sub.f32 %f92, %f91, %f145;\n" |
|
" ld.param.f32 %f146, [param_4+112];\n" |
|
" div.rn.f32 %f93, %f92, %f146;\n" |
|
" ld.param.f32 %f157, [param_4+60];\n" |
|
" mul.f32 %f94, %f157, %f93;\n" |
|
" ld.param.f32 %f160, [param_4+48];\n" |
|
" fma.rn.f32 %f95, %f160, %f90, %f94;\n" |
|
" ld.param.f32 %f154, [param_4+72];\n" |
|
" add.f32 %f96, %f95, %f154;\n" |
|
" ld.param.f32 %f156, [param_4+64];\n" |
|
" mul.f32 %f97, %f156, %f93;\n" |
|
" ld.param.f32 %f159, [param_4+52];\n" |
|
" fma.rn.f32 %f98, %f159, %f90, %f97;\n" |
|
" ld.param.f32 %f153, [param_4+76];\n" |
|
" add.f32 %f99, %f98, %f153;\n" |
|
" ld.param.f32 %f155, [param_4+68];\n" |
|
" mul.f32 %f100, %f155, %f93;\n" |
|
" ld.param.f32 %f158, [param_4+56];\n" |
|
" fma.rn.f32 %f101, %f158, %f90, %f100;\n" |
|
" ld.param.f32 %f152, [param_4+80];\n" |
|
" add.f32 %f102, %f101, %f152;\n" |
|
" ld.param.f32 %f72, [param_4+84];\n" |
|
" fma.rn.f32 %f55, %f51, %f96, %f72;\n" |
|
" ld.param.f32 %f75, [param_4+88];\n" |
|
" fma.rn.f32 %f56, %f51, %f99, %f75;\n" |
|
" ld.param.f32 %f78, [param_4+92];\n" |
|
" fma.rn.f32 %f57, %f51, %f102, %f78;\n" |
|
" ld.param.f32 %f183, [param_5+4];\n" |
|
" mul.f32 %f110, %f183, %f56;\n" |
|
" ld.param.f32 %f184, [param_5];\n" |
|
" fma.rn.f32 %f111, %f184, %f55, %f110;\n" |
|
" ld.param.f32 %f182, [param_5+8];\n" |
|
" fma.rn.f32 %f112, %f182, %f57, %f111;\n" |
|
" ld.param.f32 %f181, [param_5+12];\n" |
|
" add.f32 %f58, %f112, %f181;\n" |
|
" ld.param.f32 %f179, [param_5+20];\n" |
|
" mul.f32 %f113, %f179, %f56;\n" |
|
" ld.param.f32 %f180, [param_5+16];\n" |
|
" fma.rn.f32 %f114, %f180, %f55, %f113;\n" |
|
" ld.param.f32 %f178, [param_5+24];\n" |
|
" fma.rn.f32 %f115, %f178, %f57, %f114;\n" |
|
" ld.param.f32 %f177, [param_5+28];\n" |
|
" add.f32 %f59, %f115, %f177;\n" |
|
" ld.param.f32 %f175, [param_5+36];\n" |
|
" mul.f32 %f116, %f175, %f56;\n" |
|
" ld.param.f32 %f176, [param_5+32];\n" |
|
" fma.rn.f32 %f117, %f176, %f55, %f116;\n" |
|
" ld.param.f32 %f174, [param_5+40];\n" |
|
" fma.rn.f32 %f118, %f174, %f57, %f117;\n" |
|
" ld.param.f32 %f173, [param_5+44];\n" |
|
" add.f32 %f60, %f118, %f173;\n" |
|
" setp.gt.f32 %p9, %f60, 0f00000000;\n" |
|
" @!%p9 bra BB00_1;\n" |
|
"\n" |
|
" div.rn.f32 %f185, %f58, %f60;\n" |
|
" div.rn.f32 %f186, %f59, %f60;\n" |
|
" setp.leu.f32 %p10, %f185, 0f41200000;\n" |
|
" @%p10 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u32 %r35, [param_5+168];\n" |
|
" add.s32 %r18, %r35, -10;\n" |
|
" cvt.rn.f32.s32 %f127, %r18;\n" |
|
" setp.lt.f32 %p11, %f185, %f127;\n" |
|
" setp.gt.f32 %p12, %f186, 0f41200000;\n" |
|
" and.pred %p13, %p11, %p12;\n" |
|
" @!%p13 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u32 %r14, [param_5+172];\n" |
|
" add.s32 %r19, %r14, -10;\n" |
|
" cvt.rn.f32.s32 %f128, %r19;\n" |
|
" setp.geu.f32 %p14, %f186, %f128;\n" |
|
" @%p14 bra BB00_1;\n" |
|
"\n" |
|
" cvt.rzi.s32.f32 %r20, %f186;\n" |
|
" cvt.rzi.s32.f32 %r21, %f185;\n" |
|
" mad.lo.s32 %r22, %r20, %r35, %r21;\n" |
|
" ld.param.u64 %rl20, [param_2];\n" |
|
" cvta.to.global.u64 %rl14, %rl20;\n" |
|
" mul.wide.s32 %rl15, %r22, 4;\n" |
|
" add.s64 %rl6, %rl14, %rl15;\n" |
|
" ld.global.f32 %f129, [%rl6];\n" |
|
" sub.f32 %f130, %f129, %f60;\n" |
|
" abs.f32 %f131, %f130;\n" |
|
" mul.f32 %f1, %f60, 0f3C23D70A;\n" |
|
" setp.lt.f32 %p15, %f131, %f1;\n" |
|
" @%p15 bra BB00_2;\n" |
|
"\n" |
|
" ld.global.f32 %f135, [%rl6+4];\n" |
|
" sub.f32 %f136, %f135, %f60;\n" |
|
" abs.f32 %f137, %f136;\n" |
|
" setp.lt.f32 %p16, %f137, %f1;\n" |
|
" @%p16 bra BB00_2;\n" |
|
"\n" |
|
" add.s32 %r23, %r22, %r35;\n" |
|
" mul.wide.s32 %rl17, %r23, 4;\n" |
|
" add.s64 %rl7, %rl14, %rl17;\n" |
|
" ld.global.f32 %f132, [%rl7];\n" |
|
" sub.f32 %f133, %f132, %f60;\n" |
|
" abs.f32 %f134, %f133;\n" |
|
" setp.lt.f32 %p17, %f134, %f1;\n" |
|
" @%p17 bra BB00_2;\n" |
|
"\n" |
|
" ld.global.f32 %f138, [%rl7+4];\n" |
|
" sub.f32 %f139, %f138, %f60;\n" |
|
" abs.f32 %f140, %f139;\n" |
|
" setp.lt.f32 %p18, %f140, %f1;\n" |
|
" @%p18 bra BB00_2;\n" |
|
"\n" |
|
" BB00_1:\n" |
|
" suld.b.2d.b16.trap {%rs1}, [surfImageRef, {%r13, %r4}];\n" |
|
" bra.uni BB00_3;\n" |
|
"\n" |
|
" BB00_2:\n" |
|
" tex.2d.v4.f32.f32 {%f141, %f142, %f143, %f144}, [texImageRef, {%f185, %f186}];\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" cvt.rn.f16.f32 %temp, %f141;\n" |
|
" mov.b16 %rs1, %temp;\n" |
|
" }\n" |
|
" mov.u16 %rc1, 1;\n" |
|
"\n" |
|
" BB00_3:\n" |
|
" sust.b.2d.b16.trap [surfImageProjRef, {%r13, %r4}], {%rs1};\n" |
|
" ld.param.u64 %rl1, [param_3];\n" |
|
" cvta.to.global.u64 %rl4, %rl1;\n" |
|
" cvt.s64.s32 %rl3, %r12;\n" |
|
" add.s64 %rl2, %rl4, %rl3;\n" |
|
" st.global.u8 [%rl2], %rc1;\n" |
|
" BB00_0:\n" |
|
" ret;\n" |
|
"}\n" |
|
"\n" |
|
// kernel used to compute the mean for all image pixels for a given windows size |
|
".visible .entry ComputeImageMean(\n" |
|
" .param .u64 .ptr param_1, // image mask\n" |
|
" .param .u64 .ptr param_2, // image pixels mean [out]\n" |
|
" .param .u32 param_3, // image width\n" |
|
" .param .u32 param_4, // image height\n" |
|
" .param .u32 param_5 // half-window size\n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<10>;\n" |
|
" .reg .pred %p<19>;\n" |
|
" .reg .s16 %rc<4>;\n" |
|
" .reg .s32 %r<40>;\n" |
|
" .reg .s64 %rl<13>;\n" |
|
"\n" |
|
" ld.param.u32 %r1, [param_3];\n" |
|
" ld.param.u32 %r2, [param_4];\n" |
|
" mov.u32 %r12, %ntid.x;\n" |
|
" mov.u32 %r13, %ctaid.x;\n" |
|
" mov.u32 %r14, %tid.x;\n" |
|
" mad.lo.s32 %r4, %r12, %r13, %r14;\n" |
|
" mov.u32 %r15, %ntid.y;\n" |
|
" mov.u32 %r16, %ctaid.y;\n" |
|
" mov.u32 %r17, %tid.y;\n" |
|
" mad.lo.s32 %r5, %r15, %r16, %r17;\n" |
|
" setp.gt.s32 %p1, %r4, -1;\n" |
|
" setp.lt.s32 %p2, %r4, %r1;\n" |
|
" and.pred %p3, %p1, %p2;\n" |
|
" setp.gt.s32 %p4, %r5, -1;\n" |
|
" and.pred %p5, %p3, %p4;\n" |
|
" setp.lt.s32 %p6, %r5, %r2;\n" |
|
" and.pred %p7, %p5, %p6;\n" |
|
" @!%p7 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u64 %rl7, [param_1];\n" |
|
" ld.param.u64 %rl8, [param_2];\n" |
|
" cvta.to.global.u64 %rl2, %rl7;\n" |
|
" cvta.to.global.u64 %rl3, %rl8;\n" |
|
" ld.param.u32 %r36, [param_5];\n" |
|
" shl.b32 %r18, %r36, 1;\n" |
|
" or.b32 %r19, %r18, 1;\n" |
|
" cvt.rn.f32.s32 %f6, %r19;\n" |
|
" mul.f32 %f1, %f6, %f6;\n" |
|
" ld.param.u32 %r31, [param_3];\n" |
|
" mad.lo.s32 %r20, %r5, %r31, %r4;\n" |
|
" cvt.s64.s32 %rl4, %r20;\n" |
|
" mul.wide.s32 %rl9, %r20, 4;\n" |
|
" add.s64 %rl5, %rl3, %rl9;\n" |
|
" mov.u32 %r21, 0;\n" |
|
" st.global.u32 [%rl5], %r21;\n" |
|
" sub.s32 %r23, %r31, %r36;\n" |
|
" setp.lt.s32 %p8, %r4, %r23;\n" |
|
" setp.ge.s32 %p9, %r4, %r36;\n" |
|
" and.pred %p10, %p8, %p9;\n" |
|
" setp.ge.s32 %p11, %r5, %r36;\n" |
|
" and.pred %p12, %p10, %p11;\n" |
|
" ld.param.u32 %r32, [param_4];\n" |
|
" sub.s32 %r24, %r32, %r36;\n" |
|
" setp.lt.s32 %p13, %r5, %r24;\n" |
|
" and.pred %p14, %p12, %p13;\n" |
|
" @!%p14 bra BB00_1;\n" |
|
"\n" |
|
" add.s64 %rl10, %rl2, %rl4;\n" |
|
" ld.global.u8 %rc1, [%rl10];\n" |
|
" cvt.s16.s8 %rc1, %rc1;\n" |
|
" mov.b16 %rc2, 1;\n" |
|
" cvt.s16.s8 %rc2, %rc2;\n" |
|
" setp.eq.s16 %p15, %rc1, %rc2;\n" |
|
" @!%p15 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u32 %r35, [param_5];\n" |
|
" neg.s32 %r6, %r35;\n" |
|
" setp.gt.s32 %p16, %r6, %r35;\n" |
|
" @%p16 bra BB00_5;\n" |
|
"\n" |
|
" mov.f32 %f9, 0f00000000;\n" |
|
" mov.u32 %r39, %r6;\n" |
|
"\n" |
|
" BB00_3:\n" |
|
" mov.u32 %r7, %r39;\n" |
|
" add.s32 %r8, %r7, %r4;\n" |
|
" mov.u32 %r38, %r6;\n" |
|
"\n" |
|
" BB00_4:\n" |
|
" add.s32 %r26, %r38, %r5;\n" |
|
" shl.b32 %r27, %r8, 1;\n" |
|
" suld.b.2d.b16.trap {%rc3}, [surfImageRef, {%r27, %r26}];\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" mov.b16 %temp, %rc3;\n" |
|
" cvt.f32.f16 %f7, %temp;\n" |
|
" }\n" |
|
" add.f32 %f9, %f9, %f7;\n" |
|
" add.s32 %r38, %r38, 1;\n" |
|
" ld.param.u32 %r34, [param_5];\n" |
|
" setp.le.s32 %p17, %r38, %r34;\n" |
|
" @%p17 bra BB00_4;\n" |
|
"\n" |
|
" add.s32 %r11, %r7, 1;\n" |
|
" ld.param.u32 %r33, [param_5];\n" |
|
" setp.le.s32 %p18, %r11, %r33;\n" |
|
" mov.u32 %r39, %r11;\n" |
|
" @%p18 bra BB00_3;\n" |
|
" bra.uni BB00_2;\n" |
|
"\n" |
|
" BB00_5:\n" |
|
" mov.f32 %f9, 0f00000000;\n" |
|
"\n" |
|
" BB00_2:\n" |
|
" div.rn.f32 %f8, %f9, %f1;\n" |
|
" st.global.f32 [%rl5], %f8;\n" |
|
"\n" |
|
" BB00_1:\n" |
|
" ret;\n" |
|
"}\n" |
|
"\n" |
|
// kernel used to compute the variance for all image pixels for a given windows size |
|
".visible .entry ComputeImageVar(\n" |
|
" .param .u64 .ptr param_1, // image pixels mean\n" |
|
" .param .u64 .ptr param_2, // image mask\n" |
|
" .param .u64 .ptr param_3, // image pixels variance [out]\n" |
|
" .param .u32 param_4, // image width\n" |
|
" .param .u32 param_5, // image height\n" |
|
" .param .u32 param_6 // half-window size\n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<15>;\n" |
|
" .reg .pred %p<19>;\n" |
|
" .reg .s16 %rc<4>;\n" |
|
" .reg .s32 %r<43>;\n" |
|
" .reg .s64 %rl<17>;\n" |
|
"\n" |
|
" ld.param.u32 %r1, [param_4];\n" |
|
" ld.param.u32 %r2, [param_5];\n" |
|
" mov.u32 %r12, %ntid.x;\n" |
|
" mov.u32 %r13, %ctaid.x;\n" |
|
" mov.u32 %r14, %tid.x;\n" |
|
" mad.lo.s32 %r4, %r12, %r13, %r14;\n" |
|
" mov.u32 %r15, %ntid.y;\n" |
|
" mov.u32 %r16, %ctaid.y;\n" |
|
" mov.u32 %r17, %tid.y;\n" |
|
" mad.lo.s32 %r5, %r15, %r16, %r17;\n" |
|
" setp.gt.s32 %p1, %r4, -1;\n" |
|
" setp.lt.s32 %p2, %r4, %r1;\n" |
|
" and.pred %p3, %p1, %p2;\n" |
|
" setp.gt.s32 %p4, %r5, -1;\n" |
|
" and.pred %p5, %p3, %p4;\n" |
|
" setp.lt.s32 %p6, %r5, %r2;\n" |
|
" and.pred %p7, %p5, %p6;\n" |
|
" @!%p7 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u64 %rl8, [param_1];\n" |
|
" ld.param.u64 %rl9, [param_2];\n" |
|
" ld.param.u64 %rl10, [param_3];\n" |
|
" cvta.to.global.u64 %rl1, %rl8;\n" |
|
" cvta.to.global.u64 %rl3, %rl9;\n" |
|
" cvta.to.global.u64 %rl4, %rl10;\n" |
|
" ld.param.u32 %r39, [param_6];\n" |
|
" shl.b32 %r18, %r39, 1;\n" |
|
" or.b32 %r19, %r18, 1;\n" |
|
" cvt.rn.f32.s32 %f7, %r19;\n" |
|
" mul.f32 %f1, %f7, %f7;\n" |
|
" ld.param.u32 %r34, [param_4];\n" |
|
" mad.lo.s32 %r20, %r5, %r34, %r4;\n" |
|
" cvt.s64.s32 %rl5, %r20;\n" |
|
" shl.b64 %rl11, %rl5, 2;\n" |
|
" add.s64 %rl6, %rl4, %rl11;\n" |
|
" mov.f32 %f3, 0f00000000;\n" |
|
" st.global.f32 [%rl6], %f3;\n" |
|
" sub.s32 %r23, %r34, %r39;\n" |
|
" setp.lt.s32 %p8, %r4, %r23;\n" |
|
" setp.ge.s32 %p9, %r4, %r39;\n" |
|
" and.pred %p10, %p8, %p9;\n" |
|
" setp.ge.s32 %p11, %r5, %r39;\n" |
|
" and.pred %p12, %p10, %p11;\n" |
|
" ld.param.u32 %r35, [param_5];\n" |
|
" sub.s32 %r24, %r35, %r39;\n" |
|
" setp.lt.s32 %p13, %r5, %r24;\n" |
|
" and.pred %p14, %p12, %p13;\n" |
|
" @!%p14 bra BB00_1;\n" |
|
"\n" |
|
" add.s64 %rl12, %rl3, %rl5;\n" |
|
" ld.global.u8 %rc1, [%rl12];\n" |
|
" cvt.s16.s8 %rc1, %rc1;\n" |
|
" mov.b16 %rc2, 1;\n" |
|
" cvt.s16.s8 %rc2, %rc2;\n" |
|
" setp.eq.s16 %p15, %rc1, %rc2;\n" |
|
" @!%p15 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u32 %r38, [param_6];\n" |
|
" neg.s32 %r6, %r38;\n" |
|
" add.s64 %rl14, %rl1, %rl11;\n" |
|
" ld.global.f32 %f2, [%rl14];\n" |
|
" mov.f32 %f14, 0f00000000;\n" |
|
" mov.u32 %r42, %r6;\n" |
|
"\n" |
|
" BB00_2:\n" |
|
" mov.u32 %r7, %r42;\n" |
|
" add.s32 %r8, %r7, %r4;\n" |
|
" mov.u32 %r41, %r6;\n" |
|
"\n" |
|
" BB00_3:\n" |
|
" add.s32 %r26, %r41, %r5;\n" |
|
" shl.b32 %r27, %r8, 1;\n" |
|
" suld.b.2d.b16.trap {%rc3}, [surfImageRef, {%r27, %r26}];\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" mov.b16 %temp, %rc3;\n" |
|
" cvt.f32.f16 %f9, %temp;\n" |
|
" }\n" |
|
" sub.f32 %f10, %f9, %f2;\n" |
|
" fma.rn.f32 %f14, %f10, %f10, %f14;\n" |
|
" add.s32 %r41, %r41, 1;\n" |
|
" ld.param.u32 %r37, [param_6];\n" |
|
" setp.le.s32 %p17, %r41, %r37;\n" |
|
" @%p17 bra BB00_3;\n" |
|
"\n" |
|
" add.s32 %r11, %r7, 1;\n" |
|
" ld.param.u32 %r36, [param_6];\n" |
|
" setp.le.s32 %p18, %r11, %r36;\n" |
|
" mov.u32 %r42, %r11;\n" |
|
" @%p18 bra BB00_2;\n" |
|
"\n" |
|
" div.rn.f32 %f12, %f14, %f1;\n" |
|
" max.f32 %f12, %f12, 0f38D1B717;\n" |
|
" st.global.f32 [%rl6], %f12;\n" |
|
"\n" |
|
" BB00_1:\n" |
|
" ret;\n" |
|
"}\n" |
|
"\n" |
|
// kernel used to compute the covariance for all image pixels for a given windows size |
|
".visible .entry ComputeImageCov(\n" |
|
" .param .u64 .ptr param_1, // meanA\n" |
|
" .param .u64 .ptr param_2, // meanB\n" |
|
" .param .u64 .ptr param_3, // mask\n" |
|
" .param .u64 .ptr param_4, // cov [out]\n" |
|
" .param .u32 param_5, // image width\n" |
|
" .param .u32 param_6, // image height\n" |
|
" .param .u32 param_7 // window size\n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<17>;\n" |
|
" .reg .pred %p<19>;\n" |
|
" .reg .s16 %rs<4>;\n" |
|
" .reg .s32 %r<53>;\n" |
|
" .reg .s64 %rl<27>;\n" |
|
"\n" |
|
" ld.param.u32 %r1, [param_5];\n" |
|
" ld.param.u32 %r2, [param_6];\n" |
|
" ld.param.u64 %rl10, [param_1];\n" |
|
" ld.param.u64 %rl12, [param_2];\n" |
|
" ld.param.u64 %rl13, [param_3];\n" |
|
" ld.param.u64 %rl1, [param_4];\n" |
|
" cvta.to.global.u64 %rl2, %rl12;\n" |
|
" cvta.to.global.u64 %rl4, %rl10;\n" |
|
" cvta.to.global.u64 %rl6, %rl13;\n" |
|
" cvta.to.global.u64 %rl7, %rl1;\n" |
|
" mov.u32 %r12, %ntid.x;\n" |
|
" mov.u32 %r13, %ctaid.x;\n" |
|
" mov.u32 %r14, %tid.x;\n" |
|
" mad.lo.s32 %r4, %r12, %r13, %r14;\n" |
|
" mov.u32 %r15, %ntid.y;\n" |
|
" mov.u32 %r16, %ctaid.y;\n" |
|
" mov.u32 %r17, %tid.y;\n" |
|
" mad.lo.s32 %r5, %r15, %r16, %r17;\n" |
|
" setp.gt.s32 %p1, %r4, -1;\n" |
|
" setp.lt.s32 %p2, %r4, %r1;\n" |
|
" and.pred %p3, %p1, %p2;\n" |
|
" setp.gt.s32 %p4, %r5, -1;\n" |
|
" and.pred %p5, %p3, %p4;\n" |
|
" setp.lt.s32 %p6, %r5, %r2;\n" |
|
" and.pred %p7, %p5, %p6;\n" |
|
" @!%p7 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u32 %r49, [param_7];\n" |
|
" shl.b32 %r18, %r49, 1;\n" |
|
" or.b32 %r19, %r18, 1;\n" |
|
" cvt.rn.f32.s32 %f8, %r19;\n" |
|
" mul.f32 %f1, %f8, %f8;\n" |
|
" ld.param.u32 %r44, [param_5];\n" |
|
" mad.lo.s32 %r20, %r5, %r44, %r4;\n" |
|
" cvt.s64.s32 %rl8, %r20;\n" |
|
" mul.wide.s32 %rl14, %r20, 4;\n" |
|
" add.s64 %rl15, %rl7, %rl14;\n" |
|
" mov.u32 %r21, 0;\n" |
|
" st.global.u32 [%rl15], %r21;\n" |
|
" sub.s32 %r23, %r44, %r49;\n" |
|
" setp.lt.s32 %p8, %r4, %r23;\n" |
|
" setp.ge.s32 %p9, %r4, %r49;\n" |
|
" and.pred %p10, %p8, %p9;\n" |
|
" setp.ge.s32 %p11, %r5, %r49;\n" |
|
" and.pred %p12, %p10, %p11;\n" |
|
" ld.param.u32 %r45, [param_6];\n" |
|
" sub.s32 %r24, %r45, %r49;\n" |
|
" setp.lt.s32 %p13, %r5, %r24;\n" |
|
" and.pred %p14, %p12, %p13;\n" |
|
" @!%p14 bra BB00_1;\n" |
|
"\n" |
|
" add.s64 %rl16, %rl6, %rl8;\n" |
|
" ld.global.u8 %rs3, [%rl16];\n" |
|
" {\n" |
|
" .reg .s16 %temp1;\n" |
|
" .reg .s16 %temp2;\n" |
|
" cvt.s16.s8 %temp1, %rs3;\n" |
|
" mov.b16 %temp2, 1;\n" |
|
" cvt.s16.s8 %temp2, %temp2;\n" |
|
" setp.eq.s16 %p15, %temp1, %temp2;\n" |
|
" }\n" |
|
" @!%p15 bra BB00_1;\n" |
|
"\n" |
|
" neg.s32 %r6, %r49;\n" |
|
" setp.gt.s32 %p16, %r6, %r49;\n" |
|
" @%p16 bra BB00_4;\n" |
|
"\n" |
|
" shl.b64 %rl17, %rl8, 2;\n" |
|
" add.s64 %rl18, %rl4, %rl17;\n" |
|
" ld.global.f32 %f2, [%rl18];\n" |
|
" add.s64 %rl19, %rl2, %rl17;\n" |
|
" ld.global.f32 %f3, [%rl19];\n" |
|
" mov.f32 %f16, 0f00000000;\n" |
|
" mov.u32 %r52, %r6;\n" |
|
"\n" |
|
" BB00_2:\n" |
|
" mov.u32 %r7, %r52;\n" |
|
" add.s32 %r8, %r7, %r4;\n" |
|
" mov.u32 %r51, %r6;\n" |
|
"\n" |
|
" BB00_3:\n" |
|
" add.s32 %r26, %r51, %r5;\n" |
|
" shl.b32 %r27, %r8, 1;\n" |
|
" suld.b.2d.b16.trap {%rs1}, [surfImageRef, {%r27, %r26}];\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" mov.b16 %temp, %rs1;\n" |
|
" cvt.f32.f16 %f10, %temp;\n" |
|
" }\n" |
|
" sub.f32 %f11, %f10, %f2;\n" |
|
" suld.b.2d.b16.trap {%rs2}, [surfImageProjRef, {%r27, %r26}];\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" mov.b16 %temp, %rs2;\n" |
|
" cvt.f32.f16 %f12, %temp;\n" |
|
" }\n" |
|
" sub.f32 %f13, %f12, %f3;\n" |
|
" fma.rn.f32 %f16, %f11, %f13, %f16;\n" |
|
" add.s32 %r51, %r51, 1;\n" |
|
" setp.le.s32 %p17, %r51, %r49;\n" |
|
" @%p17 bra BB00_3;\n" |
|
"\n" |
|
" add.s32 %r11, %r7, 1;\n" |
|
" setp.le.s32 %p18, %r11, %r49;\n" |
|
" mov.u32 %r52, %r11;\n" |
|
" @%p18 bra BB00_2;\n" |
|
" bra.uni BB00_5;\n" |
|
"\n" |
|
" BB00_4:\n" |
|
" mov.f32 %f16, 0f00000000;\n" |
|
"\n" |
|
" BB00_5:\n" |
|
" ld.param.u32 %r42, [param_5];\n" |
|
" mad.lo.s32 %r40, %r5, %r42, %r4;\n" |
|
" ld.param.u64 %rl26, [param_4];\n" |
|
" cvta.to.global.u64 %rl23, %rl26;\n" |
|
" mul.wide.s32 %rl24, %r40, 4;\n" |
|
" add.s64 %rl25, %rl23, %rl24;\n" |
|
" div.rn.f32 %f15, %f16, %f1;\n" |
|
" st.global.f32 [%rl25], %f15;\n" |
|
" BB00_1:\n" |
|
" ret;\n" |
|
"}\n" |
|
"\n" |
|
// kernel used to compute the ZNCC score for all image pixels |
|
".visible .entry ComputeImageZNCC(\n" |
|
" .param .u64 .ptr param_2, // cov\n" |
|
" .param .u64 .ptr param_3, // varA\n" |
|
" .param .u64 .ptr param_4, // varB\n" |
|
" .param .u64 .ptr param_5, // mask\n" |
|
" .param .u64 .ptr param_6, // ZNCC [out]\n" |
|
" .param .u32 param_0, // image width\n" |
|
" .param .u32 param_1, // image height\n" |
|
" .param .u32 param_7 // window size\n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<7>;\n" |
|
" .reg .pred %p<16>;\n" |
|
" .reg .s32 %r<25>;\n" |
|
" .reg .s64 %rl<19>;\n" |
|
" .reg .s16 %rc<2>;\n" |
|
"\n" |
|
" ld.param.u32 %r1, [param_0];\n" |
|
" ld.param.u32 %r2, [param_1];\n" |
|
" ld.param.u64 %rl8, [param_2];\n" |
|
" ld.param.u64 %rl9, [param_3];\n" |
|
" ld.param.u64 %rl10, [param_4];\n" |
|
" ld.param.u64 %rl11, [param_5];\n" |
|
" ld.param.u64 %rl12, [param_6];\n" |
|
" cvta.to.global.u64 %rl1, %rl10;\n" |
|
" cvta.to.global.u64 %rl2, %rl9;\n" |
|
" cvta.to.global.u64 %rl3, %rl8;\n" |
|
" cvta.to.global.u64 %rl4, %rl11;\n" |
|
" cvta.to.global.u64 %rl5, %rl12;\n" |
|
" mov.u32 %r6, %ntid.x;\n" |
|
" mov.u32 %r7, %ctaid.x;\n" |
|
" mov.u32 %r8, %tid.x;\n" |
|
" mad.lo.s32 %r4, %r6, %r7, %r8;\n" |
|
" mov.u32 %r9, %ntid.y;\n" |
|
" mov.u32 %r10, %ctaid.y;\n" |
|
" mov.u32 %r11, %tid.y;\n" |
|
" mad.lo.s32 %r5, %r9, %r10, %r11;\n" |
|
" setp.gt.s32 %p1, %r4, -1;\n" |
|
" setp.lt.s32 %p2, %r4, %r1;\n" |
|
" and.pred %p3, %p1, %p2;\n" |
|
" setp.gt.s32 %p4, %r5, -1;\n" |
|
" and.pred %p5, %p3, %p4;\n" |
|
" setp.lt.s32 %p6, %r5, %r2;\n" |
|
" and.pred %p7, %p5, %p6;\n" |
|
" @!%p7 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u32 %r22, [param_0];\n" |
|
" mad.lo.s32 %r12, %r5, %r22, %r4;\n" |
|
" cvt.s64.s32 %rl6, %r12;\n" |
|
" mul.wide.s32 %rl13, %r12, 4;\n" |
|
" add.s64 %rl7, %rl5, %rl13;\n" |
|
" mov.u32 %r13, 0;\n" |
|
" st.global.u32 [%rl7], %r13;\n" |
|
" ld.param.u32 %r24, [param_7];\n" |
|
" sub.s32 %r15, %r22, %r24;\n" |
|
" setp.lt.s32 %p8, %r4, %r15;\n" |
|
" setp.ge.s32 %p9, %r4, %r24;\n" |
|
" and.pred %p10, %p8, %p9;\n" |
|
" setp.ge.s32 %p11, %r5, %r24;\n" |
|
" and.pred %p12, %p10, %p11;\n" |
|
" ld.param.u32 %r23, [param_1];\n" |
|
" sub.s32 %r16, %r23, %r24;\n" |
|
" setp.lt.s32 %p13, %r5, %r16;\n" |
|
" and.pred %p14, %p12, %p13;\n" |
|
" @!%p14 bra BB00_1;\n" |
|
"\n" |
|
" add.s64 %rl14, %rl4, %rl6;\n" |
|
" ld.global.u8 %rc1, [%rl14];\n" |
|
" {\n" |
|
" .reg .s16 %temp1;\n" |
|
" .reg .s16 %temp2;\n" |
|
" cvt.s16.s8 %temp1, %rc1;\n" |
|
" mov.b16 %temp2, 1;\n" |
|
" cvt.s16.s8 %temp2, %temp2;\n" |
|
" setp.eq.s16 %p15, %temp1, %temp2;\n" |
|
" }\n" |
|
" @!%p15 bra BB00_1;\n" |
|
"\n" |
|
" shl.b64 %rl15, %rl6, 2;\n" |
|
" add.s64 %rl16, %rl3, %rl15;\n" |
|
" add.s64 %rl17, %rl1, %rl15;\n" |
|
" ld.global.f32 %f1, [%rl17];\n" |
|
" add.s64 %rl18, %rl2, %rl15;\n" |
|
" ld.global.f32 %f2, [%rl18];\n" |
|
" mul.f32 %f3, %f2, %f1;\n" |
|
" sqrt.rn.f32 %f4, %f3;\n" |
|
" ld.global.f32 %f5, [%rl16];\n" |
|
" div.rn.f32 %f6, %f5, %f4;\n" |
|
" st.global.f32 [%rl7], %f6;\n" |
|
" BB00_1:\n" |
|
" ret;\n" |
|
"}\n" |
|
"\n" |
|
// kernel used to compute the gradient of the ZNCC score for all image pixels |
|
".visible .entry ComputeImageDZNCC(\n" |
|
" .param .u64 .ptr param_1, // meanA\n" |
|
" .param .u64 .ptr param_2, // meanB\n" |
|
" .param .u64 .ptr param_3, // varA\n" |
|
" .param .u64 .ptr param_4, // varB\n" |
|
" .param .u64 .ptr param_5, // ZNCC\n" |
|
" .param .u64 .ptr param_6, // mask\n" |
|
" .param .u64 .ptr param_7, // NCCGrad [out]\n" |
|
" .param .u32 param_8, // image width\n" |
|
" .param .u32 param_9, // image height\n" |
|
" .param .u32 param_10 // window size\n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<69>;\n" |
|
" .reg .pred %p<20>;\n" |
|
" .reg .s16 %rc<7>;\n" |
|
" .reg .s32 %r<90>;\n" |
|
" .reg .s64 %rl<81>;\n" |
|
"\n" |
|
" ld.param.u32 %r1, [param_8];\n" |
|
" ld.param.u32 %r2, [param_9];\n" |
|
" mov.u32 %r17, %ntid.x;\n" |
|
" mov.u32 %r18, %ctaid.x;\n" |
|
" mov.u32 %r19, %tid.x;\n" |
|
" mad.lo.s32 %r5, %r17, %r18, %r19;\n" |
|
" mov.u32 %r20, %ntid.y;\n" |
|
" mov.u32 %r21, %ctaid.y;\n" |
|
" mov.u32 %r7, %tid.y;\n" |
|
" mad.lo.s32 %r8, %r20, %r21, %r7;\n" |
|
" setp.gt.s32 %p1, %r5, -1;\n" |
|
" setp.lt.s32 %p2, %r5, %r1;\n" |
|
" and.pred %p3, %p1, %p2;\n" |
|
" setp.gt.s32 %p4, %r8, -1;\n" |
|
" and.pred %p5, %p3, %p4;\n" |
|
" setp.lt.s32 %p6, %r8, %r2;\n" |
|
" and.pred %p7, %p5, %p6;\n" |
|
" @!%p7 bra BB00_1;\n" |
|
"\n" |
|
" mad.lo.s32 %r22, %r8, %r1, %r5;\n" |
|
" ld.param.u64 %rl55, [param_7];\n" |
|
" cvta.to.global.u64 %rl13, %rl55;\n" |
|
" mul.wide.s32 %rl14, %r22, 4;\n" |
|
" add.s64 %rl15, %rl13, %rl14;\n" |
|
" mov.u32 %r23, 0;\n" |
|
" st.global.u32 [%rl15], %r23;\n" |
|
" ld.param.u32 %r74, [param_10];\n" |
|
" sub.s32 %r27, %r1, %r74;\n" |
|
" setp.lt.s32 %p8, %r5, %r27;\n" |
|
" setp.ge.s32 %p9, %r5, %r74;\n" |
|
" and.pred %p10, %p8, %p9;\n" |
|
" setp.ge.s32 %p11, %r8, %r74;\n" |
|
" and.pred %p12, %p10, %p11;\n" |
|
" sub.s32 %r28, %r2, %r74;\n" |
|
" setp.lt.s32 %p13, %r8, %r28;\n" |
|
" and.pred %p14, %p12, %p13;\n" |
|
" @!%p14 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u64 %rl48, [param_3];\n" |
|
" cvta.to.global.u64 %rl23, %rl48;\n" |
|
" ld.param.u64 %rl50, [param_4];\n" |
|
" cvta.to.global.u64 %rl26, %rl50;\n" |
|
" ld.param.u64 %rl53, [param_6];\n" |
|
" cvta.to.global.u64 %rl18, %rl53;\n" |
|
" ld.param.u32 %r66, [param_8];\n" |
|
" mad.lo.s32 %r29, %r8, %r66, %r5;\n" |
|
" cvt.s64.s32 %rl19, %r29;\n" |
|
" add.s64 %rl20, %rl18, %rl19;\n" |
|
" ld.global.u8 %rc1, [%rl20];\n" |
|
" {\n" |
|
" .reg .s16 %temp1;\n" |
|
" .reg .s16 %temp2;\n" |
|
" cvt.s16.s8 %temp1, %rc1;\n" |
|
" mov.b16 %temp2, 1;\n" |
|
" cvt.s16.s8 %temp2, %temp2;\n" |
|
" setp.eq.s16 %p15, %temp1, %temp2;\n" |
|
" }\n" |
|
" @!%p15 bra BB00_1;\n" |
|
"\n" |
|
" neg.s32 %r31, %r74;\n" |
|
" setp.gt.s32 %p16, %r31, %r74;\n" |
|
" @%p16 bra BB00_6;\n" |
|
"\n" |
|
" neg.s32 %r77, %r74;\n" |
|
" mov.f32 %f56, 0f00000000;\n" |
|
" mov.f32 %f57, %f56;\n" |
|
" mov.f32 %f58, %f56;\n" |
|
" mov.f32 %f59, %f56;\n" |
|
"\n" |
|
" BB00_2:\n" |
|
" add.s32 %r12, %r77, %r5;\n" |
|
" neg.s32 %r78, %r74;\n" |
|
"\n" |
|
" BB00_3:\n" |
|
" ld.param.u64 %rl52, [param_6];\n" |
|
" cvta.to.global.u64 %rl21, %rl52;\n" |
|
" add.s32 %r36, %r78, %r8;\n" |
|
" ld.param.u32 %r65, [param_8];\n" |
|
" mad.lo.s32 %r37, %r36, %r65, %r12;\n" |
|
" cvt.s64.s32 %rl12, %r37;\n" |
|
" add.s64 %rl22, %rl21, %rl12;\n" |
|
" ld.global.u8 %rc2, [%rl22];\n" |
|
" {\n" |
|
" .reg .s16 %temp1;\n" |
|
" .reg .s16 %temp2;\n" |
|
" cvt.s16.s8 %temp1, %rc2;\n" |
|
" mov.b16 %temp2, 1;\n" |
|
" cvt.s16.s8 %temp2, %temp2;\n" |
|
" setp.eq.s16 %p17, %temp1, %temp2;\n" |
|
" }\n" |
|
" @%p17 bra BB00_4;\n" |
|
" bra.uni BB00_5;\n" |
|
"\n" |
|
" BB00_4:\n" |
|
" shl.b64 %rl24, %rl12, 2;\n" |
|
" add.s64 %rl25, %rl23, %rl24;\n" |
|
" add.s64 %rl27, %rl26, %rl24;\n" |
|
" ld.global.f32 %f25, [%rl27];\n" |
|
" ld.global.f32 %f26, [%rl25];\n" |
|
" mul.f32 %f27, %f26, %f25;\n" |
|
" sqrt.rn.f32 %f28, %f27;\n" |
|
" rcp.rn.f32 %f29, %f28;\n" |
|
" sub.f32 %f58, %f58, %f29;\n" |
|
" ld.param.u64 %rl51, [param_5];\n" |
|
" cvta.to.global.u64 %rl28, %rl51;\n" |
|
" add.s64 %rl29, %rl28, %rl24;\n" |
|
" ld.global.f32 %f30, [%rl29];\n" |
|
" div.rn.f32 %f31, %f30, %f25;\n" |
|
" add.f32 %f57, %f57, %f31;\n" |
|
" ld.param.u64 %rl45, [param_1];\n" |
|
" cvta.to.global.u64 %rl30, %rl45;\n" |
|
" add.s64 %rl31, %rl30, %rl24;\n" |
|
" ld.global.f32 %f32, [%rl31];\n" |
|
" ld.param.u64 %rl46, [param_2];\n" |
|
" cvta.to.global.u64 %rl32, %rl46;\n" |
|
" add.s64 %rl33, %rl32, %rl24;\n" |
|
" ld.global.f32 %f33, [%rl33];\n" |
|
" mul.f32 %f34, %f33, %f30;\n" |
|
" div.rn.f32 %f35, %f34, %f25;\n" |
|
" neg.f32 %f36, %f35;\n" |
|
" fma.rn.f32 %f37, %f32, %f29, %f36;\n" |
|
" add.f32 %f56, %f56, %f37;\n" |
|
" add.f32 %f59, %f59, 0f3F800000;\n" |
|
"\n" |
|
" BB00_5:\n" |
|
" add.s32 %r78, %r78, 1;\n" |
|
" setp.le.s32 %p18, %r78, %r74;\n" |
|
" @%p18 bra BB00_3;\n" |
|
"\n" |
|
" add.s32 %r77, %r77, 1;\n" |
|
" setp.le.s32 %p19, %r77, %r74;\n" |
|
" @%p19 bra BB00_2;\n" |
|
" bra.uni BB00_7;\n" |
|
"\n" |
|
" BB00_6:\n" |
|
" mov.f32 %f59, 0f00000000;\n" |
|
" mov.f32 %f58, %f59;\n" |
|
" mov.f32 %f57, %f59;\n" |
|
" mov.f32 %f56, %f59;\n" |
|
"\n" |
|
" BB00_7:\n" |
|
" shl.b32 %r9, %r5, 1;\n" |
|
" suld.b.2d.b16.trap {%rc5}, [surfImageProjRef, {%r9, %r8}];\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" mov.b16 %temp, %rc5;\n" |
|
" cvt.f32.f16 %f22, %temp;\n" |
|
" }\n" |
|
" div.rn.f32 %f43, %f57, %f59;\n" |
|
" mul.f32 %f65, %f43, %f22;\n" |
|
" div.rn.f32 %f42, %f58, %f59;\n" |
|
" suld.b.2d.b16.trap {%rc6}, [surfImageRef, {%r9, %r8}];\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" mov.b16 %temp, %rc6;\n" |
|
" cvt.f32.f16 %f21, %temp;\n" |
|
" }\n" |
|
" fma.rn.f32 %f66, %f42, %f21, %f65;\n" |
|
" div.rn.f32 %f44, %f56, %f59;\n" |
|
" add.f32 %f68, %f66, %f44;\n" |
|
" add.s64 %rl42, %rl23, %rl14;\n" |
|
" add.s64 %rl44, %rl26, %rl14;\n" |
|
" ld.global.f32 %f45, [%rl44];\n" |
|
" ld.global.f32 %f46, [%rl42];\n" |
|
" min.f32 %f47, %f46, %f45;\n" |
|
" add.f32 %f48, %f47, 0f3AC49BA6;\n" |
|
" div.rn.f32 %f49, %f47, %f48;\n" |
|
" mul.f32 %f51, %f68, %f49;\n" |
|
" st.global.f32 [%rl15], %f51;\n" |
|
" BB00_1:\n" |
|
" ret;\n" |
|
"}\n" |
|
"\n" |
|
// kernel used to compute the photometric gradient for all vertices seen by an image pair |
|
".visible .entry ComputePhotometricGradient(\n" |
|
" .param .u64 .ptr param_1, // faces\n" |
|
" .param .u64 .ptr param_2, // normals\n" |
|
" .param .u64 .ptr param_3, // depth-map A (float)\n" |
|
" .param .u64 .ptr param_4, // face-map A (uint32_t)\n" |
|
" .param .u64 .ptr param_5, // bary-map A (hfloat*3)\n" |
|
" .param .u64 .ptr param_6, // DZNCC\n" |
|
" .param .u64 .ptr param_7, // mask\n" |
|
" .param .u64 .ptr param_8, // photo-grad [in/out]\n" |
|
" .param .u64 .ptr param_9, // photo-grad-norm [in/out]\n" |
|
" .param .align 4 .b8 param_10[176], // camera A\n" |
|
" .param .align 4 .b8 param_11[176], // camera B\n" |
|
" .param .f32 param_12 // square(avg-depth/f) scale (float)\n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<232>;\n" |
|
" .reg .pred %p<11>;\n" |
|
" .reg .s16 %rs<5>;\n" |
|
" .reg .s32 %r<69>;\n" |
|
" .reg .s64 %rl<66>;\n" |
|
"\n" |
|
" ld.param.u32 %r1, [param_10+168];\n" |
|
" ld.param.u32 %r2, [param_10+172];\n" |
|
" mov.u32 %r9, %ntid.x;\n" |
|
" mov.u32 %r10, %ctaid.x;\n" |
|
" mov.u32 %r11, %tid.x;\n" |
|
" mad.lo.s32 %r3, %r9, %r10, %r11;\n" |
|
" mov.u32 %r12, %ntid.y;\n" |
|
" mov.u32 %r13, %ctaid.y;\n" |
|
" mov.u32 %r14, %tid.y;\n" |
|
" mad.lo.s32 %r4, %r12, %r13, %r14;\n" |
|
" setp.gt.s32 %p1, %r3, -1;\n" |
|
" setp.lt.s32 %p2, %r3, %r1;\n" |
|
" and.pred %p3, %p1, %p2;\n" |
|
" setp.gt.s32 %p4, %r4, -1;\n" |
|
" and.pred %p5, %p3, %p4;\n" |
|
" setp.lt.s32 %p6, %r4, %r2;\n" |
|
" and.pred %p7, %p5, %p6;\n" |
|
" @!%p7 bra BB00_1;\n" |
|
"\n" |
|
" mad.lo.s32 %r15, %r4, %r1, %r3;\n" |
|
" cvt.s64.s32 %rl11, %r15;\n" |
|
" ld.param.u64 %rl12, [param_7];\n" |
|
" cvta.to.global.u64 %rl10, %rl12;\n" |
|
" add.s64 %rl13, %rl10, %rl11;\n" |
|
" ld.global.u8 %rs4, [%rl13];\n" |
|
" {\n" |
|
" .reg .s16 %temp1;\n" |
|
" .reg .s16 %temp2;\n" |
|
" cvt.s16.s8 %temp1, %rs4;\n" |
|
" mov.b16 %temp2, 1;\n" |
|
" cvt.s16.s8 %temp2, %temp2;\n" |
|
" setp.ne.s16 %p8, %temp1, %temp2;\n" |
|
" }\n" |
|
" @%p8 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u64 %rl59, [param_3];\n" |
|
" cvta.to.global.u64 %rl14, %rl59;\n" |
|
" shl.b64 %rl15, %rl11, 2;\n" |
|
" add.s64 %rl16, %rl14, %rl15;\n" |
|
" ld.global.f32 %f4, [%rl16];\n" |
|
" ld.param.u64 %rl60, [param_4];\n" |
|
" cvta.to.global.u64 %rl17, %rl60;\n" |
|
" add.s64 %rl18, %rl17, %rl15;\n" |
|
" ld.param.u64 %rl61, [param_5];\n" |
|
" cvta.to.global.u64 %rl19, %rl61;\n" |
|
" mul.wide.s32 %rl9, %r15, 6;\n" |
|
" add.s64 %rl20, %rl19, %rl9;\n" |
|
" ld.global.b16 %rs1, [%rl20];\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" mov.b16 %temp, %rs1;\n" |
|
" cvt.f32.f16 %f41, %temp;\n" |
|
" }\n" |
|
" ld.global.b16 %rs2, [%rl20+2];\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" mov.b16 %temp, %rs2;\n" |
|
" cvt.f32.f16 %f42, %temp;\n" |
|
" }\n" |
|
" ld.global.b16 %rs3, [%rl20+4];\n" |
|
" {\n" |
|
" .reg .b16 %temp;\n" |
|
" mov.b16 %temp, %rs3;\n" |
|
" cvt.f32.f16 %f43, %temp;\n" |
|
" }\n" |
|
" ld.param.f32 %f44, [param_12];\n" |
|
" ld.global.u32 %r20, [%rl18];\n" |
|
" mul.lo.s32 %r22, %r20, 3;\n" |
|
" ld.param.u64 %rl57, [param_1];\n" |
|
" cvta.to.global.u64 %rl23, %rl57;\n" |
|
" mul.wide.s32 %rl24, %r22, 4;\n" |
|
" add.s64 %rl25, %rl23, %rl24;\n" |
|
" ld.global.u32 %r5, [%rl25];\n" |
|
" ld.global.u32 %r6, [%rl25+4];\n" |
|
" ld.global.u32 %r7, [%rl25+8];\n" |
|
" ld.param.u64 %rl58, [param_2];\n" |
|
" cvta.to.global.u64 %rl26, %rl58;\n" |
|
" add.s64 %rl27, %rl26, %rl24;\n" |
|
" cvt.rn.f32.s32 %f71, %r3;\n" |
|
" ld.param.f32 %f193, [param_10+104];\n" |
|
" sub.f32 %f72, %f71, %f193;\n" |
|
" ld.param.f32 %f194, [param_10+96];\n" |
|
" div.rn.f32 %f73, %f72, %f194;\n" |
|
" cvt.rn.f32.s32 %f74, %r4;\n" |
|
" ld.param.f32 %f191, [param_10+116];\n" |
|
" sub.f32 %f75, %f74, %f191;\n" |
|
" ld.param.f32 %f192, [param_10+112];\n" |
|
" div.rn.f32 %f76, %f75, %f192;\n" |
|
" ld.param.f32 %f203, [param_10+60];\n" |
|
" mul.f32 %f77, %f203, %f76;\n" |
|
" ld.param.f32 %f206, [param_10+48];\n" |
|
" fma.rn.f32 %f78, %f206, %f73, %f77;\n" |
|
" ld.param.f32 %f200, [param_10+72];\n" |
|
" add.f32 %f79, %f78, %f200;\n" |
|
" ld.param.f32 %f202, [param_10+64];\n" |
|
" mul.f32 %f80, %f202, %f76;\n" |
|
" ld.param.f32 %f205, [param_10+52];\n" |
|
" fma.rn.f32 %f81, %f205, %f73, %f80;\n" |
|
" ld.param.f32 %f199, [param_10+76];\n" |
|
" add.f32 %f82, %f81, %f199;\n" |
|
" ld.param.f32 %f201, [param_10+68];\n" |
|
" mul.f32 %f83, %f201, %f76;\n" |
|
" ld.param.f32 %f204, [param_10+56];\n" |
|
" fma.rn.f32 %f84, %f204, %f73, %f83;\n" |
|
" ld.param.f32 %f198, [param_10+80];\n" |
|
" add.f32 %f85, %f84, %f198;\n" |
|
" mul.f32 %f86, %f82, %f82;\n" |
|
" fma.rn.f32 %f87, %f79, %f79, %f86;\n" |
|
" fma.rn.f32 %f88, %f85, %f85, %f87;\n" |
|
" sqrt.rn.f32 %f89, %f88;\n" |
|
" div.rn.f32 %f45, %f79, %f89;\n" |
|
" div.rn.f32 %f46, %f82, %f89;\n" |
|
" div.rn.f32 %f47, %f85, %f89;\n" |
|
" ld.global.f32 %f48, [%rl27];\n" |
|
" ld.global.f32 %f49, [%rl27+4];\n" |
|
" mul.f32 %f90, %f49, %f46;\n" |
|
" fma.rn.f32 %f91, %f48, %f45, %f90;\n" |
|
" ld.global.f32 %f50, [%rl27+8];\n" |
|
" fma.rn.f32 %f51, %f50, %f47, %f91;\n" |
|
" setp.gt.f32 %p9, %f51, 0fBDCCCCCD;\n" |
|
" @%p9 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.f32 %f38, [param_10+84];\n" |
|
" fma.rn.f32 %f92, %f4, %f79, %f38;\n" |
|
" ld.param.f32 %f39, [param_10+88];\n" |
|
" fma.rn.f32 %f93, %f4, %f82, %f39;\n" |
|
" ld.param.f32 %f40, [param_10+92];\n" |
|
" fma.rn.f32 %f94, %f4, %f85, %f40;\n" |
|
" ld.param.f32 %f227, [param_11+4];\n" |
|
" mul.f32 %f95, %f227, %f93;\n" |
|
" ld.param.f32 %f229, [param_11];\n" |
|
" fma.rn.f32 %f96, %f229, %f92, %f95;\n" |
|
" ld.param.f32 %f225, [param_11+8];\n" |
|
" fma.rn.f32 %f97, %f225, %f94, %f96;\n" |
|
" ld.param.f32 %f223, [param_11+12];\n" |
|
" add.f32 %f52, %f97, %f223;\n" |
|
" ld.param.f32 %f220, [param_11+20];\n" |
|
" mul.f32 %f98, %f220, %f93;\n" |
|
" ld.param.f32 %f222, [param_11+16];\n" |
|
" fma.rn.f32 %f99, %f222, %f92, %f98;\n" |
|
" ld.param.f32 %f218, [param_11+24];\n" |
|
" fma.rn.f32 %f100, %f218, %f94, %f99;\n" |
|
" ld.param.f32 %f216, [param_11+28];\n" |
|
" add.f32 %f53, %f100, %f216;\n" |
|
" ld.param.f32 %f213, [param_11+36];\n" |
|
" mul.f32 %f101, %f213, %f93;\n" |
|
" ld.param.f32 %f215, [param_11+32];\n" |
|
" fma.rn.f32 %f102, %f215, %f92, %f101;\n" |
|
" ld.param.f32 %f211, [param_11+40];\n" |
|
" fma.rn.f32 %f103, %f211, %f94, %f102;\n" |
|
" ld.param.f32 %f209, [param_11+44];\n" |
|
" add.f32 %f54, %f103, %f209;\n" |
|
" setp.gt.f32 %p10, %f54, 0f00000000;\n" |
|
" @%p10 bra BB00_2;\n" |
|
"\n" |
|
" mov.f32 %f231, 0fBF800000;\n" |
|
" mov.f32 %f230, %f231;\n" |
|
" bra.uni BB00_3;\n" |
|
"\n" |
|
" BB00_2:\n" |
|
" div.rn.f32 %f230, %f52, %f54;\n" |
|
" div.rn.f32 %f231, %f53, %f54;\n" |
|
"\n" |
|
" BB00_3:\n" |
|
" ld.param.f32 %f228, [param_11];\n" |
|
" mul.f32 %f118, %f228, %f54;\n" |
|
" neg.f32 %f119, %f52;\n" |
|
" ld.param.f32 %f214, [param_11+32];\n" |
|
" fma.rn.f32 %f120, %f119, %f214, %f118;\n" |
|
" mul.f32 %f121, %f54, %f54;\n" |
|
" div.rn.f32 %f122, %f120, %f121;\n" |
|
" ld.param.f32 %f226, [param_11+4];\n" |
|
" mul.f32 %f123, %f226, %f54;\n" |
|
" ld.param.f32 %f212, [param_11+36];\n" |
|
" fma.rn.f32 %f124, %f119, %f212, %f123;\n" |
|
" div.rn.f32 %f125, %f124, %f121;\n" |
|
" ld.param.f32 %f224, [param_11+8];\n" |
|
" mul.f32 %f126, %f224, %f54;\n" |
|
" ld.param.f32 %f210, [param_11+40];\n" |
|
" fma.rn.f32 %f127, %f119, %f210, %f126;\n" |
|
" div.rn.f32 %f128, %f127, %f121;\n" |
|
" ld.param.f32 %f221, [param_11+16];\n" |
|
" mul.f32 %f129, %f221, %f54;\n" |
|
" neg.f32 %f130, %f53;\n" |
|
" fma.rn.f32 %f131, %f130, %f214, %f129;\n" |
|
" div.rn.f32 %f132, %f131, %f121;\n" |
|
" ld.param.f32 %f219, [param_11+20];\n" |
|
" mul.f32 %f133, %f219, %f54;\n" |
|
" fma.rn.f32 %f134, %f130, %f212, %f133;\n" |
|
" div.rn.f32 %f135, %f134, %f121;\n" |
|
" ld.param.f32 %f217, [param_11+24];\n" |
|
" mul.f32 %f136, %f217, %f54;\n" |
|
" fma.rn.f32 %f137, %f130, %f210, %f136;\n" |
|
" div.rn.f32 %f138, %f137, %f121;\n" |
|
" add.f32 %f106, %f230, 0f3F800000;\n" |
|
" tex.2d.v4.u32.f32 {%r29, %r30, %r31, %r32}, [texImageRef, {%f106, %f231}];\n" |
|
" mov.b32 %f140, %r29;\n" |
|
" tex.2d.v4.u32.f32 {%r34, %r35, %r36, %r37}, [texImageRef, {%f230, %f231}];\n" |
|
" mov.b32 %f141, %r34;\n" |
|
" sub.f32 %f142, %f140, %f141;\n" |
|
" add.f32 %f113, %f231, 0f3F800000;\n" |
|
" tex.2d.v4.u32.f32 {%r39, %r40, %r41, %r42}, [texImageRef, {%f230, %f113}];\n" |
|
" mov.b32 %f143, %r39;\n" |
|
" sub.f32 %f145, %f143, %f141;\n" |
|
" ld.param.u64 %rl63, [param_6];\n" |
|
" cvta.to.global.u64 %rl28, %rl63;\n" |
|
" mul.wide.s32 %rl29, %r15, 4;\n" |
|
" add.s64 %rl30, %rl28, %rl29;\n" |
|
" mul.f32 %f146, %f145, %f132;\n" |
|
" fma.rn.f32 %f147, %f142, %f122, %f146;\n" |
|
" ld.global.f32 %f148, [%rl30];\n" |
|
" mul.f32 %f149, %f148, %f147;\n" |
|
" mul.f32 %f150, %f145, %f135;\n" |
|
" fma.rn.f32 %f151, %f142, %f125, %f150;\n" |
|
" mul.f32 %f152, %f148, %f151;\n" |
|
" mul.f32 %f153, %f145, %f138;\n" |
|
" fma.rn.f32 %f154, %f142, %f128, %f153;\n" |
|
" mul.f32 %f155, %f148, %f154;\n" |
|
" mul.f32 %f156, %f152, %f46;\n" |
|
" fma.rn.f32 %f157, %f149, %f45, %f156;\n" |
|
" fma.rn.f32 %f158, %f155, %f47, %f157;\n" |
|
" div.rn.f32 %f159, %f158, %f51;\n" |
|
" mul.lo.s32 %r60, %r5, 3;\n" |
|
" ld.param.u64 %rl64, [param_8];\n" |
|
" cvta.to.global.u64 %rl31, %rl64;\n" |
|
" mul.wide.s32 %rl32, %r60, 4;\n" |
|
" add.s64 %rl33, %rl31, %rl32;\n" |
|
" mul.f32 %f162, %f44, %f41;\n" |
|
" mul.f32 %f163, %f162, %f159;\n" |
|
" mul.f32 %f164, %f163, %f48;\n" |
|
" atom.global.add.f32 %f165, [%rl33], %f164;\n" |
|
" mad.lo.s32 %r61, %r5, 3, 1;\n" |
|
" mul.wide.s32 %rl34, %r61, 4;\n" |
|
" add.s64 %rl35, %rl31, %rl34;\n" |
|
" mul.f32 %f166, %f163, %f49;\n" |
|
" atom.global.add.f32 %f167, [%rl35], %f166;\n" |
|
" mad.lo.s32 %r62, %r5, 3, 2;\n" |
|
" mul.wide.s32 %rl36, %r62, 4;\n" |
|
" add.s64 %rl37, %rl31, %rl36;\n" |
|
" mul.f32 %f168, %f163, %f50;\n" |
|
" atom.global.add.f32 %f169, [%rl37], %f168;\n" |
|
" mul.lo.s32 %r63, %r6, 3;\n" |
|
" mul.wide.s32 %rl38, %r63, 4;\n" |
|
" add.s64 %rl39, %rl31, %rl38;\n" |
|
" mul.f32 %f170, %f44, %f42;\n" |
|
" mul.f32 %f171, %f170, %f159;\n" |
|
" mul.f32 %f172, %f171, %f48;\n" |
|
" atom.global.add.f32 %f173, [%rl39], %f172;\n" |
|
" mad.lo.s32 %r64, %r6, 3, 1;\n" |
|
" mul.wide.s32 %rl40, %r64, 4;\n" |
|
" add.s64 %rl41, %rl31, %rl40;\n" |
|
" mul.f32 %f174, %f171, %f49;\n" |
|
" atom.global.add.f32 %f175, [%rl41], %f174;\n" |
|
" mad.lo.s32 %r65, %r6, 3, 2;\n" |
|
" mul.wide.s32 %rl42, %r65, 4;\n" |
|
" add.s64 %rl43, %rl31, %rl42;\n" |
|
" mul.f32 %f176, %f171, %f50;\n" |
|
" atom.global.add.f32 %f177, [%rl43], %f176;\n" |
|
" mul.lo.s32 %r66, %r7, 3;\n" |
|
" mul.wide.s32 %rl44, %r66, 4;\n" |
|
" add.s64 %rl45, %rl31, %rl44;\n" |
|
" mul.f32 %f178, %f44, %f43;\n" |
|
" mul.f32 %f179, %f178, %f159;\n" |
|
" mul.f32 %f180, %f179, %f48;\n" |
|
" atom.global.add.f32 %f181, [%rl45], %f180;\n" |
|
" mad.lo.s32 %r67, %r7, 3, 1;\n" |
|
" mul.wide.s32 %rl46, %r67, 4;\n" |
|
" add.s64 %rl47, %rl31, %rl46;\n" |
|
" mul.f32 %f182, %f179, %f49;\n" |
|
" atom.global.add.f32 %f183, [%rl47], %f182;\n" |
|
" mad.lo.s32 %r68, %r7, 3, 2;\n" |
|
" mul.wide.s32 %rl48, %r68, 4;\n" |
|
" add.s64 %rl49, %rl31, %rl48;\n" |
|
" mul.f32 %f184, %f179, %f50;\n" |
|
" atom.global.add.f32 %f185, [%rl49], %f184;\n" |
|
" ld.param.u64 %rl65, [param_9];\n" |
|
" cvta.to.global.u64 %rl50, %rl65;\n" |
|
" mul.wide.s32 %rl51, %r5, 4;\n" |
|
" add.s64 %rl52, %rl50, %rl51;\n" |
|
" atom.global.add.f32 %f186, [%rl52], 0f3F800000;\n" |
|
" mul.wide.s32 %rl53, %r6, 4;\n" |
|
" add.s64 %rl54, %rl50, %rl53;\n" |
|
" atom.global.add.f32 %f187, [%rl54], 0f3F800000;\n" |
|
" mul.wide.s32 %rl55, %r7, 4;\n" |
|
" add.s64 %rl56, %rl50, %rl55;\n" |
|
" atom.global.add.f32 %f188, [%rl56], 0f3F800000;\n" |
|
" BB00_1:\n" |
|
" ret;\n" |
|
"}\n" |
|
"\n" |
|
// kernel used to update the norm of the photo gradient for all vertices |
|
".visible .entry UpdatePhotoGradNorm(\n" |
|
" .param .u64 .ptr param_1, // photoGradNorm [in/out]\n" |
|
" .param .u64 .ptr param_2, // photoGradPixels [in]\n" |
|
" .param .u32 param_3 // numVertices\n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<4>;\n" |
|
" .reg .pred %p<3>;\n" |
|
" .reg .s32 %r<9>;\n" |
|
" .reg .s64 %rl<9>;\n" |
|
"\n" |
|
" ld.param.u32 %r2, [param_3];\n" |
|
" mov.u32 %r3, %ntid.x;\n" |
|
" mov.u32 %r4, %ctaid.x;\n" |
|
" mov.u32 %r5, %tid.x;\n" |
|
" mad.lo.s32 %r1, %r3, %r4, %r5;\n" |
|
" setp.ge.s32 %p1, %r1, %r2;\n" |
|
" @%p1 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u64 %rl5, [param_2];\n" |
|
" cvta.to.global.u64 %rl2, %rl5;\n" |
|
" mul.wide.s32 %rl6, %r1, 4;\n" |
|
" add.s64 %rl7, %rl2, %rl6;\n" |
|
" ld.global.f32 %f1, [%rl7];\n" |
|
" setp.le.f32 %p2, %f1, 0f00000000;\n" |
|
" @%p2 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u64 %rl4, [param_1];\n" |
|
" cvta.to.global.u64 %rl1, %rl4;\n" |
|
" add.s64 %rl8, %rl1, %rl6;\n" |
|
" ld.global.f32 %f2, [%rl8];\n" |
|
" add.f32 %f3, %f2, 0f3F800000;\n" |
|
" st.global.f32 [%rl8], %f3;\n" |
|
" BB00_1:\n" |
|
" ret;\n" |
|
"}\n" |
|
"\n" |
|
// kernel used to compute the smoothness gradient for all vertices |
|
".visible .entry ComputeSmoothnessGradient(\n" |
|
" .param .u64 .ptr param_1, // vertices\n" |
|
" .param .u64 .ptr param_2, // vert-vertices [in]\n" |
|
" .param .u64 .ptr param_3, // vert-sizes [in]\n" |
|
" .param .u64 .ptr param_4, // vert-pos [in]\n" |
|
" .param .u64 .ptr param_5, // smooth-grad [out]\n" |
|
" .param .u32 param_6, // numVertices\n" |
|
" .param .u8 param_7 // switch 0/1\n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<38>;\n" |
|
" .reg .pred %p<5>;\n" |
|
" .reg .s32 %r<69>;\n" |
|
" .reg .s64 %rl<42>;\n" |
|
" .reg .s16 %rc<4>;\n" |
|
"\n" |
|
" ld.param.u32 %r9, [param_6];\n" |
|
" ld.param.u64 %rl1, [param_1];\n" |
|
" ld.param.u64 %rl12, [param_2];\n" |
|
" ld.param.u64 %rl2, [param_3];\n" |
|
" ld.param.u64 %rl41, [param_4];\n" |
|
" ld.param.u64 %rl13, [param_5];\n" |
|
" cvta.to.global.u64 %rl4, %rl12;\n" |
|
" cvta.to.global.u64 %rl20, %rl41;\n" |
|
" cvta.to.global.u64 %rl5, %rl2;\n" |
|
" cvta.to.global.u64 %rl6, %rl13;\n" |
|
" cvta.to.global.u64 %rl7, %rl1;\n" |
|
" mov.u32 %r10, %ntid.x;\n" |
|
" mov.u32 %r11, %ctaid.x;\n" |
|
" mov.u32 %r12, %tid.x;\n" |
|
" mad.lo.s32 %r1, %r10, %r11, %r12;\n" |
|
" setp.ge.s32 %p1, %r1, %r9;\n" |
|
" @%p1 bra BB00_1;\n" |
|
"\n" |
|
" mul.lo.s32 %r13, %r1, 3;\n" |
|
" mul.wide.s32 %rl14, %r13, 4;\n" |
|
" add.s64 %rl8, %rl6, %rl14;\n" |
|
" mul.wide.s32 %rl19, %r1, 4;\n" |
|
" add.s64 %rl10, %rl5, %rl19;\n" |
|
" ld.global.u32 %r67, [%rl10];\n" |
|
" setp.gt.s32 %p2, %r67, 0;\n" |
|
" @%p2 bra BB00_5;\n" |
|
"\n" |
|
" mov.f32 %f37, 0f00000000;\n" |
|
" st.global.f32 [%rl8], %f37;\n" |
|
" st.global.f32 [%rl8+4], %f37;\n" |
|
" st.global.f32 [%rl8+8], %f37;\n" |
|
" BB00_1:\n" |
|
" ret;\n" |
|
"\n" |
|
" BB00_5:\n" |
|
" add.s64 %rl15, %rl7, %rl14;\n" |
|
" ld.global.f32 %f36, [%rl15];\n" |
|
" ld.global.f32 %f35, [%rl15+4];\n" |
|
" ld.global.f32 %f34, [%rl15+8];\n" |
|
" ld.param.u8 %rc3, [param_7];\n" |
|
" cvt.s16.s8 %rc2, %rc3;\n" |
|
" mov.b16 %rc1, 0;\n" |
|
" setp.eq.s16 %p3, %rc2, %rc1;\n" |
|
" mov.f32 %f37, 0f3F800000;\n" |
|
" add.s64 %rl11, %rl20, %rl19;\n" |
|
" ld.global.u32 %r27, [%rl11];\n" |
|
" cvt.rn.f32.s32 %f22, %r67;\n" |
|
" rcp.rn.f32 %f30, %f22;\n" |
|
" mov.u32 %r68, 0;\n" |
|
"\n" |
|
" BB00_2:\n" |
|
" add.s32 %r29, %r27, %r68;\n" |
|
" mul.wide.s32 %rl22, %r29, 4;\n" |
|
" add.s64 %rl23, %rl4, %rl22;\n" |
|
" ld.global.u32 %r30, [%rl23];\n" |
|
" mul.lo.s32 %r32, %r30, 3;\n" |
|
" mul.wide.s32 %rl25, %r32, 4;\n" |
|
" add.s64 %rl26, %rl7, %rl25;\n" |
|
" ld.global.f32 %f20, [%rl26];\n" |
|
" mul.f32 %f21, %f20, %f30;\n" |
|
" sub.f32 %f36, %f36, %f21;\n" |
|
" ld.global.f32 %f23, [%rl26+4];\n" |
|
" mul.f32 %f24, %f23, %f30;\n" |
|
" sub.f32 %f35, %f35, %f24;\n" |
|
" ld.global.f32 %f26, [%rl26+8];\n" |
|
" mul.f32 %f27, %f26, %f30;\n" |
|
" sub.f32 %f34, %f34, %f27;\n" |
|
" @%p3 bra BB00_3;\n" |
|
"\n" |
|
" mul.wide.s32 %rl39, %r30, 4;\n" |
|
" add.s64 %rl40, %rl5, %rl39;\n" |
|
" ld.global.u32 %r59, [%rl40];\n" |
|
" cvt.rn.f32.s32 %f28, %r59;\n" |
|
" rcp.rn.f32 %f29, %f28;\n" |
|
" fma.rn.f32 %f37, %f29, %f30, %f37;\n" |
|
"\n" |
|
" BB00_3:\n" |
|
" add.s32 %r68, %r68, 1;\n" |
|
" setp.lt.s32 %p4, %r68, %r67;\n" |
|
" @%p4 bra BB00_2;\n" |
|
"\n" |
|
" @%p3 bra BB00_4;\n" |
|
"\n" |
|
" div.rn.f32 %f36, %f36, %f37;\n" |
|
" div.rn.f32 %f35, %f35, %f37;\n" |
|
" div.rn.f32 %f34, %f34, %f37;\n" |
|
"\n" |
|
" BB00_4:\n" |
|
" st.global.f32 [%rl8], %f36;\n" |
|
" st.global.f32 [%rl8+4], %f35;\n" |
|
" st.global.f32 [%rl8+8], %f34;\n" |
|
" ret;\n" |
|
"}\n" |
|
"\n" |
|
// kernel used to combine the photo and smoothness gradient for all vertices |
|
".visible .entry CombineGradients(\n" |
|
" .param .u64 .ptr param_1, // photo-gradient [in/out]\n" |
|
" .param .u64 .ptr param_2, // photo-norm [in]\n" |
|
" .param .u64 .ptr param_3, // smoothness-gradient [in]\n" |
|
" .param .u32 param_4, // numVertices\n" |
|
" .param .f32 param_5 // smoothness-weight\n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<17>;\n" |
|
" .reg .pred %p<3>;\n" |
|
" .reg .s32 %r<6>;\n" |
|
" .reg .s64 %rl<15>;\n" |
|
"\n" |
|
" ld.param.u32 %r2, [param_4];\n" |
|
" mov.u32 %r3, %ntid.x;\n" |
|
" mov.u32 %r4, %ctaid.x;\n" |
|
" mov.u32 %r5, %tid.x;\n" |
|
" mad.lo.s32 %r1, %r3, %r4, %r5;\n" |
|
" setp.lt.s32 %p1, %r1, %r2;\n" |
|
" @!%p1 bra BB00_1;\n" |
|
"\n" |
|
" ld.param.u64 %rl4, [param_3];\n" |
|
" cvta.to.global.u64 %rl2, %rl4;\n" |
|
" ld.param.u64 %rl11, [param_2];\n" |
|
" cvta.to.global.u64 %rl12, %rl11;\n" |
|
" ld.param.u64 %rl13, [param_1];\n" |
|
" cvta.to.global.u64 %rl14, %rl13;\n" |
|
" mul.wide.s32 %rl4, %r1, 4;\n" |
|
" mul.wide.s32 %rl5, %r1, 12;\n" |
|
" add.s64 %rl9, %rl12, %rl4;\n" |
|
" add.s64 %rl6, %rl2, %rl5;\n" |
|
" add.s64 %rl8, %rl14, %rl5;\n" |
|
" ld.param.f32 %f8, [param_5];\n" |
|
" ld.global.f32 %f2, [%rl6];\n" |
|
" mul.f32 %f3, %f2, %f8;\n" |
|
" ld.global.f32 %f4, [%rl6+4];\n" |
|
" mul.f32 %f5, %f4, %f8;\n" |
|
" ld.global.f32 %f6, [%rl6+8];\n" |
|
" mul.f32 %f7, %f6, %f8;\n" |
|
" ld.global.f32 %f9, [%rl9];\n" |
|
" setp.gt.f32 %p2, %f9, 0f00000000;\n" |
|
" @%p2 bra BB00_2;\n" |
|
"\n" |
|
" st.global.f32 [%rl8], %f3;\n" |
|
" st.global.f32 [%rl8+4], %f5;\n" |
|
" st.global.f32 [%rl8+8], %f7;\n" |
|
" ret;\n" |
|
"\n" |
|
" BB00_2:\n" |
|
" rcp.rn.f32 %f10, %f9;\n" |
|
" ld.global.f32 %f11, [%rl8];\n" |
|
" fma.rn.f32 %f12, %f11, %f10, %f3;\n" |
|
" ld.global.f32 %f13, [%rl8+4];\n" |
|
" fma.rn.f32 %f14, %f13, %f10, %f5;\n" |
|
" ld.global.f32 %f15, [%rl8+8];\n" |
|
" fma.rn.f32 %f16, %f15, %f10, %f7;\n" |
|
" st.global.f32 [%rl8], %f12;\n" |
|
" st.global.f32 [%rl8+4], %f14;\n" |
|
" st.global.f32 [%rl8+8], %f16;\n" |
|
" BB00_1:\n" |
|
" ret;\n" |
|
"}\n" |
|
// kernel used to combine the photo and both smoothness gradients for all vertices |
|
".visible .entry CombineAllGradients(\n" |
|
" .param .u64 .ptr param_1, // photo-gradient [in/out]\n" |
|
" .param .u64 .ptr param_2, // photo-norm [in]\n" |
|
" .param .u64 .ptr param_3, // smoothness-gradient 1 [in]\n" |
|
" .param .u64 .ptr param_4, // smoothness-gradient 2 [in]\n" |
|
" .param .u32 param_5, // numVertices\n" |
|
" .param .f32 param_6, // rigidity-weight\n" |
|
" .param .f32 param_7 // elasticity-weight\n" |
|
")\n" |
|
"{\n" |
|
" .reg .f32 %f<19>;\n" |
|
" .reg .pred %p<3>;\n" |
|
" .reg .s32 %r<17>;\n" |
|
" .reg .s64 %rl<15>;\n" |
|
"\n" |
|
" ld.param.u32 %r2, [param_5];\n" |
|
" mov.u32 %r3, %ntid.x;\n" |
|
" mov.u32 %r4, %ctaid.x;\n" |
|
" mov.u32 %r5, %tid.x;\n" |
|
" mad.lo.s32 %r1, %r3, %r4, %r5;\n" |
|
" setp.lt.s32 %p1, %r1, %r2;\n" |
|
" @!%p1 bra BB00_1;\n" |
|
"\n" |
|
" ld.global.f32 %f11, [%rl7];\n" |
|
"\n" |
|
" ld.param.u64 %rl3, [param_4];\n" |
|
" cvta.to.global.u64 %rl1, %rl3;\n" |
|
" ld.param.u64 %rl4, [param_3];\n" |
|
" cvta.to.global.u64 %rl2, %rl4;\n" |
|
" ld.param.u64 %rl11, [param_2];\n" |
|
" cvta.to.global.u64 %rl12, %rl11;\n" |
|
" ld.param.u64 %rl13, [param_1];\n" |
|
" cvta.to.global.u64 %rl14, %rl13;\n" |
|
" mul.wide.s32 %rl4, %r1, 4;\n" |
|
" mul.wide.s32 %rl5, %r1, 12;\n" |
|
" add.s64 %rl7, %rl1, %rl5;\n" |
|
" add.s64 %rl6, %rl2, %rl5;\n" |
|
" add.s64 %rl9, %rl12, %rl4;\n" |
|
" add.s64 %rl8, %rl14, %rl5;\n" |
|
" ld.param.f32 %f8, [param_6];\n" |
|
" ld.param.f32 %f18, [param_7];\n" |
|
" ld.global.f32 %f2, [%rl6];\n" |
|
" mul.f32 %f3, %f2, %f8;\n" |
|
" ld.global.f32 %f12, [%rl7];\n" |
|
" fma.rn.f32 %f11, %f12, %f18, %f3;\n" |
|
" ld.global.f32 %f4, [%rl6+4];\n" |
|
" mul.f32 %f5, %f4, %f8;\n" |
|
" ld.global.f32 %f14, [%rl7+4];\n" |
|
" fma.rn.f32 %f13, %f14, %f18, %f5;\n" |
|
" ld.global.f32 %f6, [%rl6+8];\n" |
|
" mul.f32 %f7, %f6, %f8;\n" |
|
" ld.global.f32 %f16, [%rl7+8];\n" |
|
" fma.rn.f32 %f15, %f16, %f18, %f7;\n" |
|
" ld.global.f32 %f9, [%rl9];\n" |
|
" setp.gt.f32 %p2, %f9, 0f00000000;\n" |
|
" @%p2 bra BB00_2;\n" |
|
"\n" |
|
" st.global.f32 [%rl8], %f11;\n" |
|
" st.global.f32 [%rl8+4], %f13;\n" |
|
" st.global.f32 [%rl8+8], %f15;\n" |
|
" ret;\n" |
|
"\n" |
|
" BB00_2:\n" |
|
" rcp.rn.f32 %f10, %f9;\n" |
|
" ld.global.f32 %f2, [%rl8];\n" |
|
" fma.rn.f32 %f3, %f2, %f10, %f11;\n" |
|
" ld.global.f32 %f4, [%rl8+4];\n" |
|
" fma.rn.f32 %f5, %f4, %f10, %f13;\n" |
|
" ld.global.f32 %f6, [%rl8+8];\n" |
|
" fma.rn.f32 %f7, %f6, %f10, %f15;\n" |
|
" st.global.f32 [%rl8], %f3;\n" |
|
" st.global.f32 [%rl8+4], %f5;\n" |
|
" st.global.f32 [%rl8+8], %f7;\n" |
|
" BB00_1:\n" |
|
" ret;\n" |
|
"}\n"; |
|
|
|
|
|
// S T R U C T S /////////////////////////////////////////////////// |
|
|
|
typedef Mesh::Vertex Vertex; |
|
typedef Mesh::VIndex VIndex; |
|
typedef Mesh::Face Face; |
|
typedef Mesh::FIndex FIndex; |
|
|
|
class MeshRefineCUDA { |
|
public: |
|
typedef Mesh::FaceIdxArr CameraFaces; |
|
typedef CLISTDEF2(CameraFaces) CameraFacesArr; |
|
|
|
// store necessary data about a view |
|
struct View { |
|
Image32F imageHost; // store temporarily the image pixels |
|
Image8U::Size size; |
|
CUDA::ArrayRT16F image; |
|
CUDA::MemDevice depthMap; |
|
CUDA::MemDevice faceMap; |
|
CUDA::MemDevice baryMap; |
|
inline View() {} |
|
inline View(View&) {} |
|
}; |
|
typedef CLISTDEF2(View) ViewsArr; |
|
|
|
struct CameraCUDA { |
|
Matrix3x4f P; |
|
Matrix3x3f R; |
|
Point3f C; |
|
Matrix3x3f K; |
|
Matrix3x3f invK; |
|
Image8U::Size size; |
|
|
|
inline CameraCUDA() {} |
|
inline CameraCUDA(const Camera& camera, const Image8U::Size& _size) : P(camera.P), R(camera.R), C(camera.C), K(camera.K), invK(camera.GetInvK()), size(_size) {} |
|
}; |
|
|
|
|
|
public: |
|
MeshRefineCUDA(Scene& _scene, unsigned _nAlternatePair=true, float _weightRegularity=1.5f, float _ratioRigidityElasticity=0.8f, unsigned _nResolutionLevel=0, unsigned _nMinResolution=640, unsigned nMaxViews=8); |
|
~MeshRefineCUDA(); |
|
|
|
bool IsValid() const { return module != NULL && module->IsValid() && !pairs.IsEmpty(); } |
|
|
|
bool InitKernels(int device=-1); |
|
bool InitImages(float scale, float sigma=0); |
|
|
|
void ListVertexFacesPre(); |
|
void ListVertexFacesPost(); |
|
void ListCameraFaces(); |
|
|
|
void ListFaceAreas(Mesh::AreaArr& maxAreas); |
|
void SubdivideMesh(uint32_t maxArea, float fDecimate=1.f, unsigned nCloseHoles=15, unsigned nEnsureEdgeSize=1); |
|
|
|
void ComputeNormalFaces(); |
|
|
|
void ScoreMesh(float* gradients); |
|
|
|
void ProjectMesh( |
|
const CameraFaces& cameraFaces, |
|
const Camera& camera, const Image8U::Size& size, uint32_t idxImage); |
|
void ProcessPair(uint32_t idxImageA, uint32_t idxImageB); |
|
void ImageMeshWarp( |
|
const Camera& cameraA, const Camera& cameraB, const Image8U::Size& size, |
|
uint32_t idxImageA, uint32_t idxImageB); |
|
void ComputeLocalVariance(const CUDA::ArrayRT16F& image, const Image8U::Size& size, |
|
CUDA::MemDevice& imageMean, CUDA::MemDevice& imageVar); |
|
void ComputeLocalZNCC(const Image8U::Size& size); |
|
void ComputePhotometricGradient(const Camera& cameraA, const Camera& cameraB, const Image8U::Size& size, |
|
uint32_t idxImageA, uint32_t idxImageB, uint32_t numVertices, float RegularizationScale); |
|
void ComputeSmoothnessGradient(uint32_t numVertices); |
|
void CombineGradients(uint32_t numVertices); |
|
|
|
public: |
|
const float weightRegularity; // a scalar regularity weight to balance between photo-consistency and regularization terms |
|
float ratioRigidityElasticity; // a scalar ratio used to compute the regularity gradient as a combination of rigidity and elasticity |
|
const unsigned nResolutionLevel; // how many times to scale down the images before mesh optimization |
|
const unsigned nMinResolution; // how many times to scale down the images before mesh optimization |
|
unsigned nAlternatePair; // using an image pair alternatively as reference image (0 - both, 1 - alternate, 2 - only left, 3 - only right) |
|
unsigned iteration; // current refinement iteration |
|
|
|
Scene& scene; // the mesh vertices and faces |
|
|
|
// constant the entire time |
|
ImageArr& images; |
|
ViewsArr views; // views' data |
|
PairIdxArr pairs; // image pairs used to refine the mesh |
|
|
|
CUDA::ModuleRTPtr module; |
|
CUDA::KernelRT kernelProjectMesh; |
|
CUDA::KernelRT kernelCrossCheckProjection; |
|
CUDA::KernelRT kernelImageMeshWarp; |
|
CUDA::KernelRT kernelComputeImageMean; |
|
CUDA::KernelRT kernelComputeImageVar; |
|
CUDA::KernelRT kernelComputeImageCov; |
|
CUDA::KernelRT kernelComputeImageZNCC; |
|
CUDA::KernelRT kernelComputeImageDZNCC; |
|
CUDA::KernelRT kernelComputePhotometricGradient; |
|
CUDA::KernelRT kernelUpdatePhotoGradNorm; |
|
CUDA::KernelRT kernelComputeSmoothnessGradient; |
|
CUDA::KernelRT kernelCombineGradients; |
|
CUDA::KernelRT kernelCombineAllGradients; |
|
|
|
CUDA::MemDevice vertices; |
|
CUDA::MemDevice vertexVertices; |
|
CUDA::MemDevice faces; |
|
CUDA::MemDevice faceNormals; |
|
CUDA::TextureRT16F texImageRef; |
|
CUDA::SurfaceRT16F surfImageRef; |
|
CUDA::SurfaceRT16F surfImageProjRef; |
|
CUDA::MemDevice mask; |
|
CUDA::MemDevice imageMeanA; |
|
CUDA::MemDevice imageVarA; |
|
CUDA::ArrayRT16F imageAB; |
|
CUDA::MemDevice imageMeanAB; |
|
CUDA::MemDevice imageVarAB; |
|
CUDA::MemDevice imageCov; |
|
CUDA::MemDevice imageZNCC; |
|
CUDA::MemDevice imageDZNCC; |
|
CUDA::MemDevice photoGrad; |
|
CUDA::MemDevice photoGradNorm; |
|
CUDA::MemDevice photoGradPixels; |
|
CUDA::MemDevice vertexVerticesCont; |
|
CUDA::MemDevice vertexVerticesSizes; |
|
CUDA::MemDevice vertexVerticesPointers; |
|
CUDA::MemDevice smoothGrad1; |
|
CUDA::MemDevice smoothGrad2; |
|
|
|
enum { HalfSize = 2 }; // half window size used to compute ZNCC |
|
}; |
|
|
|
MeshRefineCUDA::MeshRefineCUDA(Scene& _scene, unsigned _nAlternatePair, float _weightRegularity, float _ratioRigidityElasticity, unsigned _nResolutionLevel, unsigned _nMinResolution, unsigned nMaxViews) |
|
: |
|
weightRegularity(_weightRegularity), |
|
ratioRigidityElasticity(_ratioRigidityElasticity), |
|
nResolutionLevel(_nResolutionLevel), |
|
nMinResolution(_nMinResolution), |
|
nAlternatePair(_nAlternatePair), |
|
scene(_scene), |
|
images(_scene.images) |
|
{ |
|
if (!InitKernels(CUDA::desiredDeviceID)) |
|
return; |
|
// keep only best neighbor views for each image |
|
std::unordered_set<uint64_t> mapPairs; |
|
mapPairs.reserve(images.GetSize()*nMaxViews); |
|
FOREACH(idxImage, images) { |
|
// keep only best neighbor views |
|
const float fMinArea(0.1f); |
|
const float fMinScale(0.2f), fMaxScale(3.2f); |
|
const float fMinAngle(FD2R(2.5f)), fMaxAngle(FD2R(45.f)); |
|
const Image& imageData = images[idxImage]; |
|
if (!imageData.IsValid()) |
|
continue; |
|
ViewScoreArr neighbors(imageData.neighbors); |
|
Scene::FilterNeighborViews(neighbors, fMinArea, fMinScale, fMaxScale, fMinAngle, fMaxAngle, nMaxViews); |
|
for (const ViewScore& neighbor: neighbors) { |
|
ASSERT(images[neighbor.ID].IsValid()); |
|
mapPairs.insert(MakePairIdx((uint32_t)idxImage, neighbor.ID)); |
|
} |
|
} |
|
pairs.Reserve(mapPairs.size()); |
|
for (uint64_t pair: mapPairs) |
|
pairs.AddConstruct(pair); |
|
} |
|
MeshRefineCUDA::~MeshRefineCUDA() |
|
{ |
|
scene.mesh.ReleaseExtra(); |
|
} |
|
|
|
bool MeshRefineCUDA::InitKernels(int device) |
|
{ |
|
STATIC_ASSERT(sizeof(CameraCUDA) == 176); |
|
|
|
// initialize CUDA device if needed |
|
if (CUDA::devices.IsEmpty() && CUDA::initDevice(device) != CUDA_SUCCESS) |
|
return false; |
|
|
|
// initialize CUDA kernels |
|
if (module != NULL && module->IsValid()) |
|
return true; |
|
module = new CUDA::ModuleRT(g_szMeshRefineModule); |
|
if (!module->IsValid()) { |
|
module.Release(); |
|
return false; |
|
} |
|
if (kernelProjectMesh.Reset(module, "ProjectMesh") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelProjectMesh.IsValid()); |
|
if (kernelCrossCheckProjection.Reset(module, "CrossCheckProjection") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelCrossCheckProjection.IsValid()); |
|
if (kernelImageMeshWarp.Reset(module, "ImageMeshWarp") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelImageMeshWarp.IsValid()); |
|
if (kernelComputeImageMean.Reset(module, "ComputeImageMean") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelComputeImageMean.IsValid()); |
|
if (kernelComputeImageVar.Reset(module, "ComputeImageVar") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelComputeImageVar.IsValid()); |
|
if (kernelComputeImageCov.Reset(module, "ComputeImageCov") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelComputeImageCov.IsValid()); |
|
if (kernelComputeImageZNCC.Reset(module, "ComputeImageZNCC") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelComputeImageZNCC.IsValid()); |
|
if (kernelComputeImageDZNCC.Reset(module, "ComputeImageDZNCC") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelComputeImageDZNCC.IsValid()); |
|
if (kernelComputePhotometricGradient.Reset(module, "ComputePhotometricGradient") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelComputePhotometricGradient.IsValid()); |
|
if (kernelUpdatePhotoGradNorm.Reset(module, "UpdatePhotoGradNorm") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelUpdatePhotoGradNorm.IsValid()); |
|
if (kernelComputeSmoothnessGradient.Reset(module, "ComputeSmoothnessGradient") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelComputeSmoothnessGradient.IsValid()); |
|
if (kernelCombineGradients.Reset(module, "CombineGradients") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelCombineGradients.IsValid()); |
|
if (kernelCombineAllGradients.Reset(module, "CombineAllGradients") != CUDA_SUCCESS) |
|
return false; |
|
ASSERT(kernelCombineAllGradients.IsValid()); |
|
|
|
// init textures |
|
if (texImageRef.Reset(module, "texImageRef", CU_TR_FILTER_MODE_LINEAR) != CUDA_SUCCESS) |
|
return false; |
|
if (surfImageRef.Reset(module, "surfImageRef") != CUDA_SUCCESS) |
|
return false; |
|
if (surfImageProjRef.Reset(module, "surfImageProjRef") != CUDA_SUCCESS) |
|
return false; |
|
return true; |
|
} |
|
|
|
// load and initialize all images at the given scale |
|
// and compute the gradient for each input image |
|
// optional: blur them using the given sigma |
|
bool MeshRefineCUDA::InitImages(float scale, float sigma) |
|
{ |
|
views.Resize(images.GetSize()); |
|
#ifdef MESHCUDAOPT_USE_OPENMP |
|
bool bAbort(false); |
|
#pragma omp parallel for |
|
for (int_t ID=0; ID<(int_t)images.GetSize(); ++ID) { |
|
#pragma omp flush (bAbort) |
|
if (bAbort) |
|
continue; |
|
const uint32_t idxImage((uint32_t)ID); |
|
#else |
|
FOREACH(idxImage, images) { |
|
#endif |
|
Image& imageData = images[idxImage]; |
|
if (!imageData.IsValid()) |
|
continue; |
|
// load and init image |
|
unsigned level(nResolutionLevel); |
|
const unsigned imageSize(imageData.RecomputeMaxResolution(level, nMinResolution)); |
|
if ((imageData.image.empty() || MAXF(imageData.width,imageData.height) != imageSize) && !imageData.ReloadImage(imageSize)) { |
|
#ifdef MESHCUDAOPT_USE_OPENMP |
|
bAbort = true; |
|
#pragma omp flush (bAbort) |
|
continue; |
|
#else |
|
return false; |
|
#endif |
|
} |
|
View& view = views[idxImage]; |
|
Image32F& img = view.imageHost; |
|
imageData.image.toGray(img, cv::COLOR_BGR2GRAY, true); |
|
imageData.image.release(); |
|
if (sigma > 0) |
|
cv::GaussianBlur(img, img, cv::Size(), sigma); |
|
if (scale < 1.0) { |
|
cv::resize(img, img, cv::Size(), scale, scale, cv::INTER_AREA); |
|
imageData.width = img.width(); imageData.height = img.height(); |
|
} |
|
imageData.UpdateCamera(scene.platforms); |
|
} |
|
#ifdef MESHCUDAOPT_USE_OPENMP |
|
if (bAbort) |
|
return false; |
|
#endif |
|
// init GPU memory |
|
Image8U::Size maxSize(0,0); |
|
FOREACH(idxImage, views) { |
|
View& view = views[idxImage]; |
|
if (view.imageHost.empty()) |
|
continue; |
|
Image8U::Size& size(view.size); |
|
size = view.imageHost.size(); |
|
reportCudaError(view.image.Reset(size, CUDA_ARRAY3D_SURFACE_LDST)); |
|
reportCudaError(view.image.SetData(cvtImage<float,hfloat>(view.imageHost))); |
|
view.imageHost.release(); |
|
const size_t area((size_t)size.area()); |
|
reportCudaError(view.depthMap.Reset(sizeof(float)*area)); |
|
reportCudaError(view.faceMap.Reset(sizeof(FIndex)*area)); |
|
reportCudaError(view.baryMap.Reset(sizeof(hfloat)*3*area)); |
|
if (maxSize.width < size.width) |
|
maxSize.width = size.width; |
|
if (maxSize.height < size.height) |
|
maxSize.height = size.height; |
|
} |
|
const size_t area(maxSize.area()); |
|
reportCudaError(mask.Reset(sizeof(uint8_t)*area)); |
|
reportCudaError(imageMeanA.Reset(sizeof(float)*area)); |
|
reportCudaError(imageVarA.Reset(sizeof(float)*area)); |
|
reportCudaError(imageAB.Reset(maxSize, CUDA_ARRAY3D_SURFACE_LDST)); |
|
reportCudaError(imageMeanAB.Reset(sizeof(float)*area)); |
|
reportCudaError(imageVarAB.Reset(sizeof(float)*area)); |
|
reportCudaError(imageCov.Reset(sizeof(float)*area)); |
|
reportCudaError(imageZNCC.Reset(sizeof(float)*area)); |
|
reportCudaError(imageDZNCC.Reset(sizeof(float)*area)); |
|
surfImageProjRef.Bind(imageAB); |
|
iteration = 0; |
|
return true; |
|
} |
|
|
|
// extract array of triangles incident to each vertex |
|
// and check each vertex if it is at the boundary or not |
|
void MeshRefineCUDA::ListVertexFacesPre() |
|
{ |
|
scene.mesh.EmptyExtra(); |
|
scene.mesh.ListIncidenteFaces(); |
|
reportCudaError(faces.Reset(scene.mesh.faces)); |
|
} |
|
void MeshRefineCUDA::ListVertexFacesPost() |
|
{ |
|
scene.mesh.ListIncidenteVertices(); |
|
scene.mesh.ListBoundaryVertices(); |
|
ASSERT(!scene.mesh.vertices.IsEmpty() && scene.mesh.vertices.GetSize() == scene.mesh.vertexVertices.GetSize()); |
|
// set vertex vertices |
|
reportCudaError(vertexVertices.Reset(scene.mesh.vertexVertices)); |
|
// list adjacent vertices for each vertex |
|
const size_t numVertices(scene.mesh.vertices.GetSize()); |
|
Unsigned32Arr _vertexVerticesCont(0, numVertices*6); |
|
Unsigned32Arr _vertexVerticesSizes(0, numVertices); |
|
Unsigned32Arr _vertexVerticesPointers(0, numVertices); |
|
uint32_t lastPosition(0); |
|
FOREACH(idxV, scene.mesh.vertices) { |
|
if (scene.mesh.vertexBoundary[idxV]) { |
|
_vertexVerticesSizes.Insert(0); |
|
_vertexVerticesPointers.Insert(lastPosition); |
|
continue; |
|
} |
|
const Mesh::VertexIdxArr& verts = scene.mesh.vertexVertices[idxV]; |
|
_vertexVerticesCont.Join(verts.GetData(), verts.GetSize()); |
|
_vertexVerticesSizes.Insert(verts.GetSize()); |
|
_vertexVerticesPointers.Insert(lastPosition); lastPosition += verts.GetSize(); |
|
} |
|
reportCudaError(vertexVerticesCont.Reset(_vertexVerticesCont)); |
|
reportCudaError(vertexVerticesSizes.Reset(_vertexVerticesSizes)); |
|
reportCudaError(vertexVerticesPointers.Reset(_vertexVerticesPointers)); |
|
// init memory |
|
reportCudaError(photoGrad.Reset(sizeof(Point3f)*numVertices)); |
|
reportCudaError(photoGradNorm.Reset(sizeof(float)*numVertices)); |
|
reportCudaError(photoGradPixels.Reset(sizeof(float)*numVertices)); |
|
reportCudaError(smoothGrad1.Reset(sizeof(Point3f)*numVertices)); |
|
reportCudaError(smoothGrad2.Reset(sizeof(Point3f)*numVertices)); |
|
} |
|
|
|
// extract array of faces viewed by each image |
|
void MeshRefineCUDA::ListCameraFaces() |
|
{ |
|
// extract array of faces viewed by each camera |
|
CameraFacesArr arrCameraFaces(images.GetSize()); { |
|
Mesh::Octree octree; |
|
Mesh::FacesInserter::CreateOctree(octree, scene.mesh); |
|
FOREACH(ID, images) { |
|
const Image& imageData = images[ID]; |
|
if (!imageData.IsValid()) |
|
continue; |
|
const TFrustum<float,5> frustum(Matrix3x4f(imageData.camera.P), (float)imageData.width, (float)imageData.height); |
|
Mesh::FacesInserter inserter(arrCameraFaces[ID]); |
|
octree.Traverse(frustum, inserter); |
|
} |
|
} |
|
|
|
// project mesh to each camera plane |
|
reportCudaError(vertices.Reset(scene.mesh.vertices)); |
|
FOREACH(idxImage, images) { |
|
const Image& imageData = images[idxImage]; |
|
if (imageData.IsValid()) |
|
ProjectMesh(arrCameraFaces[idxImage], imageData.camera, views[idxImage].size, idxImage); |
|
} |
|
} |
|
|
|
// compute for each face the projection area as the maximum area in both images of a pair |
|
// (make sure ListCameraFaces() was called before) |
|
void MeshRefineCUDA::ListFaceAreas(Mesh::AreaArr& maxAreas) |
|
{ |
|
ASSERT(maxAreas.IsEmpty()); |
|
// for each image, compute the projection area of visible faces |
|
typedef cList<Mesh::AreaArr> ImageAreaArr; |
|
ImageAreaArr viewAreas(images.GetSize()); |
|
FOREACH(idxImage, images) { |
|
const Image& imageData = images[idxImage]; |
|
if (!imageData.IsValid()) |
|
continue; |
|
Mesh::AreaArr& areas = viewAreas[idxImage]; |
|
areas.Resize(scene.mesh.faces.GetSize()); |
|
areas.Memset(0); |
|
// get faceMap from the GPU memory |
|
TImage<FIndex> faceMap(imageData.height, imageData.width); |
|
views[idxImage].faceMap.GetData(faceMap); |
|
// compute area covered by all vertices (incident faces) viewed by this image |
|
for (int j=0; j<faceMap.rows; ++j) { |
|
for (int i=0; i<faceMap.cols; ++i) { |
|
const FIndex idxFace(faceMap(j,i)); |
|
if (idxFace == NO_ID) |
|
continue; |
|
++areas[idxFace]; |
|
} |
|
} |
|
} |
|
// for each pair, mark the faces that have big projection areas in both images |
|
maxAreas.Resize(scene.mesh.faces.GetSize()); |
|
maxAreas.Memset(0); |
|
FOREACHPTR(pPair, pairs) { |
|
const Mesh::AreaArr& areasA = viewAreas[pPair->i]; |
|
const Mesh::AreaArr& areasB = viewAreas[pPair->j]; |
|
ASSERT(areasA.GetSize() == areasB.GetSize()); |
|
FOREACH(f, areasA) { |
|
const uint16_t minArea(MINF(areasA[f], areasB[f])); |
|
uint16_t& maxArea = maxAreas[f]; |
|
if (maxArea < minArea) |
|
maxArea = minArea; |
|
} |
|
} |
|
} |
|
|
|
// decimate or subdivide mesh such that for each face there is no image pair in which |
|
// its projection area is bigger than the given number of pixels in both images |
|
void MeshRefineCUDA::SubdivideMesh(uint32_t maxArea, float fDecimate, unsigned nCloseHoles, unsigned nEnsureEdgeSize) |
|
{ |
|
Mesh::AreaArr maxAreas; |
|
|
|
// first decimate if necessary |
|
const bool bNoDecimation(fDecimate >= 1.f); |
|
const bool bNoSimplification(maxArea == 0); |
|
if (!bNoDecimation) { |
|
if (fDecimate > 0.f) { |
|
// decimate to the desired resolution |
|
scene.mesh.Clean(fDecimate, 0.f, false, nCloseHoles, 0u, 0.f, false); |
|
scene.mesh.Clean(1.f, 0.f, false, nCloseHoles, 0u, 0.f, true); |
|
|
|
#ifdef MESHOPT_ENSUREEDGESIZE |
|
// make sure there are no edges too small or too long |
|
if (nEnsureEdgeSize > 0 && bNoSimplification) { |
|
scene.mesh.EnsureEdgeSize(); |
|
scene.mesh.Clean(1.f, 0.f, false, nCloseHoles, 0u, 0.f, true); |
|
} |
|
#endif |
|
|
|
// re-map vertex and camera faces |
|
ListVertexFacesPre(); |
|
} else { |
|
// extract array of faces viewed by each camera |
|
ListCameraFaces(); |
|
|
|
// estimate the faces' area that have big projection areas in both images of a pair |
|
ListFaceAreas(maxAreas); |
|
ASSERT(!maxAreas.IsEmpty()); |
|
|
|
const float maxAreaf((float)(maxArea > 0 ? maxArea : 64)); |
|
const float medianArea(6.f*(float)Mesh::AreaArr(maxAreas).GetMedian()); |
|
if (medianArea < maxAreaf) { |
|
maxAreas.Empty(); |
|
|
|
// decimate to the auto detected resolution |
|
scene.mesh.Clean(MAXF(0.1f, medianArea/maxAreaf), 0.f, false, nCloseHoles, 0u, 0.f, false); |
|
scene.mesh.Clean(1.f, 0.f, false, nCloseHoles, 0u, 0.f, true); |
|
|
|
#ifdef MESHOPT_ENSUREEDGESIZE |
|
// make sure there are no edges too small or too long |
|
if (nEnsureEdgeSize > 0 && bNoSimplification) { |
|
scene.mesh.EnsureEdgeSize(); |
|
scene.mesh.Clean(1.f, 0.f, false, nCloseHoles, 0u, 0.f, true); |
|
} |
|
#endif |
|
|
|
// re-map vertex and camera faces |
|
ListVertexFacesPre(); |
|
} |
|
} |
|
} |
|
if (bNoSimplification) |
|
return; |
|
|
|
if (maxAreas.IsEmpty()) { |
|
// extract array of faces viewed by each camera |
|
ListCameraFaces(); |
|
|
|
// estimate the faces' area that have big projection areas in both images of a pair |
|
ListFaceAreas(maxAreas); |
|
} |
|
|
|
// subdivide mesh faces if its projection area is bigger than the given number of pixels |
|
const size_t numVertsOld(scene.mesh.vertices.GetSize()); |
|
const size_t numFacesOld(scene.mesh.faces.GetSize()); |
|
scene.mesh.Subdivide(maxAreas, maxArea); |
|
|
|
#ifdef MESHOPT_ENSUREEDGESIZE |
|
// make sure there are no edges too small or too long |
|
#if MESHOPT_ENSUREEDGESIZE==1 |
|
if ((nEnsureEdgeSize == 1 && !bNoDecimation) || nEnsureEdgeSize > 1) |
|
#endif |
|
{ |
|
scene.mesh.EnsureEdgeSize(); |
|
scene.mesh.Clean(1.f, 0.f, false, nCloseHoles, 0u, 0.f, true); |
|
} |
|
#endif |
|
|
|
// re-map vertex and camera faces |
|
ListVertexFacesPre(); |
|
|
|
DEBUG_EXTRA("Mesh subdivided: %u/%u -> %u/%u vertices/faces", numVertsOld, numFacesOld, scene.mesh.vertices.GetSize(), scene.mesh.faces.GetSize()); |
|
|
|
#if TD_VERBOSE != TD_VERBOSE_OFF |
|
if (VERBOSITY_LEVEL > 3) |
|
scene.mesh.Save(MAKE_PATH("MeshSubdivided.ply")); |
|
#endif |
|
} |
|
|
|
|
|
// compute face normals |
|
void MeshRefineCUDA::ComputeNormalFaces() |
|
{ |
|
const FIndex numFaces(scene.mesh.faces.GetSize()); |
|
reportCudaError(faceNormals.Reset(sizeof(Point3f)*numFaces)); |
|
reportCudaError(Mesh::kernelComputeFaceNormal((int)numFaces, |
|
vertices, |
|
faces, |
|
faceNormals, |
|
numFaces |
|
)); |
|
} |
|
|
|
|
|
// score mesh using photo-consistency |
|
// and compute vertices gradient using analytical method |
|
void MeshRefineCUDA::ScoreMesh(float* gradients) |
|
{ |
|
// extract array of faces viewed by each camera |
|
ListCameraFaces(); |
|
|
|
// compute face normals |
|
ComputeNormalFaces(); |
|
|
|
// init memory |
|
const VIndex numVertices(scene.mesh.vertices.GetSize()); |
|
reportCudaError(cuMemsetD32(photoGrad, 0, numVertices*3)); |
|
reportCudaError(cuMemsetD32(photoGradNorm, 0, numVertices)); |
|
|
|
// for each pair of images, compute a photo-consistency score |
|
// between the reference image and the pixels of the second image |
|
// projected in the reference image through the mesh surface |
|
FOREACHPTR(pPair, pairs) { |
|
ASSERT(pPair->i < pPair->j); |
|
switch (nAlternatePair) { |
|
case 1: { |
|
const PairIdx pair(iteration%2 ? PairIdx(pPair->j,pPair->i) : PairIdx(pPair->i,pPair->j)); |
|
ProcessPair(pair.i, pair.j); |
|
break; } |
|
case 2: { |
|
ProcessPair(pPair->i, pPair->j); |
|
break; } |
|
case 3: { |
|
ProcessPair(pPair->j, pPair->i); |
|
break; } |
|
default: |
|
for (int ip=0; ip<2; ++ip) { |
|
const PairIdx pair(ip ? PairIdx(pPair->j,pPair->i) : PairIdx(pPair->i,pPair->j)); |
|
ProcessPair(pair.i, pair.j); |
|
} |
|
} |
|
} |
|
|
|
// loop through all vertices and compute the smoothing score |
|
ComputeSmoothnessGradient(numVertices); |
|
|
|
// set the final gradient as the combination of photometric and smoothness gradients |
|
CombineGradients(numVertices); |
|
reportCudaError(photoGrad.GetData(gradients, sizeof(Point3f)*numVertices)); |
|
} |
|
|
|
|
|
// project mesh to the given camera plane |
|
void MeshRefineCUDA::ProjectMesh( |
|
const CameraFaces& cameraFaces, |
|
const Camera& camera, const Image8U::Size& size, uint32_t idxImage) |
|
{ |
|
View& view = views[idxImage]; |
|
// init depth-map |
|
const float fltMax(FLT_MAX); |
|
reportCudaError(cuMemsetD32(view.depthMap, (uint32_t&)fltMax, size.area())); |
|
// fetch only the faces viewed by this camera |
|
Mesh::FaceIdxArr faceIDsView(0, (FIndex)cameraFaces.size()); |
|
for (auto idxFace : cameraFaces) |
|
faceIDsView.Insert(idxFace); |
|
// project mesh |
|
reportCudaError(kernelProjectMesh((int)faceIDsView.GetSize(), |
|
vertices, |
|
faces, |
|
faceIDsView, |
|
view.depthMap, |
|
view.faceMap, |
|
view.baryMap, |
|
CameraCUDA(camera, size), |
|
faceIDsView.GetSize() |
|
)); |
|
kernelProjectMesh.Reset(); |
|
// cross-check valid depth and face index |
|
reportCudaError(kernelCrossCheckProjection(size, |
|
view.depthMap, |
|
view.faceMap, |
|
size.width, size.height |
|
)); |
|
#if 0 |
|
// debug view |
|
DepthMap depthMap(size); |
|
TImage<FIndex> faceMap(size); |
|
TImage<Point3hf> baryMap(size); |
|
view.depthMap.GetData(depthMap); |
|
view.faceMap.GetData(faceMap); |
|
view.baryMap.GetData(baryMap); |
|
TImage<Point3f> _baryMap(cvtImage<Point3hf,Point3f>(baryMap)); |
|
#endif |
|
} |
|
|
|
void MeshRefineCUDA::ProcessPair(uint32_t idxImageA, uint32_t idxImageB) |
|
{ |
|
// fetch view A data |
|
const Image& imageDataA = images[idxImageA]; |
|
ASSERT(imageDataA.IsValid()); |
|
const Camera& cameraA = imageDataA.camera; |
|
const Image8U::Size& sizeA(views[idxImageA].size); |
|
// fetch view B data |
|
const Image& imageDataB = images[idxImageB]; |
|
ASSERT(imageDataB.IsValid()); |
|
const Camera& cameraB = imageDataB.camera; |
|
// warp imageB to imageA using the mesh |
|
ImageMeshWarp(cameraA, cameraB, sizeA, idxImageA, idxImageB); |
|
// init vertex textures |
|
ComputeLocalVariance(imageAB, sizeA, imageMeanAB, imageVarAB); |
|
ComputeLocalVariance(views[idxImageA].image, sizeA, imageMeanA, imageVarA); |
|
ComputeLocalZNCC(sizeA); |
|
const float RegularizationScale((float)((REAL)(imageDataA.avgDepth*imageDataB.avgDepth)/(cameraA.GetFocalLength()*cameraB.GetFocalLength()))); |
|
ComputePhotometricGradient(cameraA, cameraB, sizeA, idxImageA, idxImageB, scene.mesh.vertices.GetSize(), RegularizationScale); |
|
} |
|
|
|
// project image from view B to view A through the mesh; |
|
// the projected image is stored in imageA |
|
void MeshRefineCUDA::ImageMeshWarp( |
|
const Camera& cameraA, const Camera& cameraB, const Image8U::Size& size, |
|
uint32_t idxImageA, uint32_t idxImageB) |
|
{ |
|
// set image texture |
|
surfImageRef.Bind(views[idxImageA].image); |
|
texImageRef.Bind(views[idxImageB].image); |
|
// project image |
|
reportCudaError(kernelImageMeshWarp(size, |
|
views[idxImageA].depthMap, |
|
views[idxImageB].depthMap, |
|
mask, |
|
CameraCUDA(cameraA, size), |
|
CameraCUDA(cameraB, size) |
|
)); |
|
#if 0 |
|
// debug view |
|
Image16F _imageAB(size); |
|
Image8U _mask(size); |
|
imageAB.GetData(_imageAB); |
|
mask.GetData(_mask); |
|
Image32F __imageAB(cvtImage<hfloat,float>(_imageAB)); |
|
#endif |
|
} |
|
|
|
// compute local variance for each image pixel |
|
void MeshRefineCUDA::ComputeLocalVariance(const CUDA::ArrayRT16F& image, const Image8U::Size& size, |
|
CUDA::MemDevice& imageMean, CUDA::MemDevice& imageVar) |
|
{ |
|
surfImageRef.Bind(image); |
|
reportCudaError(kernelComputeImageMean(size, |
|
mask, |
|
imageMean, |
|
size.width, size.height, |
|
HalfSize |
|
)); |
|
reportCudaError(kernelComputeImageVar(size, |
|
imageMean, |
|
mask, |
|
imageVar, |
|
size.width, size.height, |
|
HalfSize |
|
)); |
|
#if 0 |
|
// debug view |
|
Image32F mean(size); |
|
Image32F var(size); |
|
imageMean.GetData(mean); |
|
imageVar.GetData(var); |
|
#endif |
|
} |
|
|
|
// compute local ZNCC and its gradient for each image pixel |
|
void MeshRefineCUDA::ComputeLocalZNCC(const Image8U::Size& size) |
|
{ |
|
reportCudaError(kernelComputeImageCov(size, |
|
imageMeanA, |
|
imageMeanAB, |
|
mask, |
|
imageCov, |
|
size.width, size.height, |
|
HalfSize |
|
)); |
|
reportCudaError(kernelComputeImageZNCC(size, |
|
imageCov, |
|
imageVarA, |
|
imageVarAB, |
|
mask, |
|
imageZNCC, |
|
size.width, size.height, |
|
HalfSize |
|
)); |
|
reportCudaError(kernelComputeImageDZNCC(size, |
|
imageMeanA, |
|
imageMeanAB, |
|
imageVarA, |
|
imageVarAB, |
|
imageZNCC, |
|
mask, |
|
imageDZNCC, |
|
size.width, size.height, |
|
HalfSize |
|
)); |
|
#if 0 |
|
// debug view |
|
Image32F _imageZNCC(size); |
|
Image32F _imageDZNCC(size); |
|
imageZNCC.GetData(_imageZNCC); |
|
imageDZNCC.GetData(_imageDZNCC); |
|
#endif |
|
} |
|
|
|
// compute the photometric gradient for all vertices seen by an image pair |
|
void MeshRefineCUDA::ComputePhotometricGradient(const Camera& cameraA, const Camera& cameraB, const Image8U::Size& size, |
|
uint32_t idxImageA, uint32_t idxImageB, uint32_t numVertices, float RegularizationScale) |
|
{ |
|
// compute photometric gradient for all visible vertices |
|
reportCudaError(cuMemsetD32(photoGradPixels, 0, numVertices)); |
|
reportCudaError(kernelComputePhotometricGradient(size, |
|
faces, faceNormals, |
|
views[idxImageA].depthMap, |
|
views[idxImageA].faceMap, |
|
views[idxImageA].baryMap, |
|
imageDZNCC, |
|
mask, |
|
photoGrad, photoGradPixels, |
|
CameraCUDA(cameraA, size), |
|
CameraCUDA(cameraB, size), |
|
RegularizationScale |
|
)); |
|
// update photometric gradient norm for all visible vertices |
|
reportCudaError(kernelUpdatePhotoGradNorm(numVertices, |
|
photoGradNorm, photoGradPixels, |
|
numVertices |
|
)); |
|
#if 0 |
|
// debug view |
|
Point3fArr _photoGrad(numVertices); |
|
FloatArr _photoGradPixels(numVertices); |
|
FloatArr _photoGradNorm(numVertices); |
|
photoGrad.GetData(_photoGrad); |
|
photoGradPixels.GetData(_photoGradPixels); |
|
photoGradNorm.GetData(_photoGradNorm); |
|
#endif |
|
} |
|
|
|
void MeshRefineCUDA::ComputeSmoothnessGradient(uint32_t numVertices) |
|
{ |
|
// compute smoothness gradient for all vertices |
|
reportCudaError(kernelComputeSmoothnessGradient((int)numVertices, |
|
vertices, |
|
vertexVerticesCont, |
|
vertexVerticesSizes, |
|
vertexVerticesPointers, |
|
smoothGrad1, |
|
numVertices, |
|
uint8_t(0) |
|
)); |
|
reportCudaError(kernelComputeSmoothnessGradient((int)numVertices, |
|
smoothGrad1, |
|
vertexVerticesCont, |
|
vertexVerticesSizes, |
|
vertexVerticesPointers, |
|
smoothGrad2, |
|
numVertices, |
|
uint8_t(1) |
|
)); |
|
#if 0 |
|
// debug view |
|
Point3fArr _smoothGrad1(numVertices); |
|
Point3fArr _smoothGrad2(numVertices); |
|
smoothGrad1.GetData(_smoothGrad1); |
|
smoothGrad2.GetData(_smoothGrad2); |
|
#endif |
|
} |
|
|
|
void MeshRefineCUDA::CombineGradients(uint32_t numVertices) |
|
{ |
|
// compute smoothness gradient for all vertices |
|
if (ratioRigidityElasticity >= 1.f) { |
|
reportCudaError(kernelCombineGradients((int)numVertices, |
|
photoGrad, |
|
photoGradNorm, |
|
smoothGrad2, |
|
numVertices, |
|
weightRegularity |
|
)); |
|
} else { |
|
// compute smoothing gradient as a combination of level 1 and 2 of the Laplacian operator; |
|
// (see page 105 of "Stereo and Silhouette Fusion for 3D Object Modeling from Uncalibrated Images Under Circular Motion" C. Hernandez, 2004) |
|
const float rigidity((1.f-ratioRigidityElasticity)*weightRegularity); |
|
const float elasticity(ratioRigidityElasticity*weightRegularity); |
|
reportCudaError(kernelCombineAllGradients((int)numVertices, |
|
photoGrad, |
|
photoGradNorm, |
|
smoothGrad1, |
|
smoothGrad2, |
|
numVertices, |
|
rigidity, |
|
elasticity |
|
)); |
|
} |
|
#if 0 |
|
// debug view |
|
Point3fArr _photoGrad(numVertices); |
|
photoGrad.GetData(_photoGrad); |
|
#endif |
|
} |
|
/*----------------------------------------------------------------*/ |
|
|
|
|
|
|
|
// S T R U C T S /////////////////////////////////////////////////// |
|
|
|
// optimize mesh using photo-consistency |
|
bool Scene::RefineMeshCUDA(unsigned nResolutionLevel, unsigned nMinResolution, unsigned nMaxViews, |
|
float fDecimateMesh, unsigned nCloseHoles, unsigned nEnsureEdgeSize, unsigned nMaxFaceArea, |
|
unsigned nScales, float fScaleStep, unsigned nAlternatePair, float fRegularityWeight, float fRatioRigidityElasticity, float fGradientStep) |
|
{ |
|
if (pointcloud.IsEmpty() && !ImagesHaveNeighbors()) |
|
SampleMeshWithVisibility(); |
|
|
|
MeshRefineCUDA refine(*this, nAlternatePair, fRegularityWeight, fRatioRigidityElasticity, nResolutionLevel, nMinResolution, nMaxViews); |
|
if (!refine.IsValid()) |
|
return false; |
|
|
|
// run the mesh optimization on multiple scales (coarse to fine) |
|
for (unsigned nScale=0; nScale<nScales; ++nScale) { |
|
// init images |
|
const float scale(POWI(fScaleStep, nScales-nScale-1)); |
|
const float step(POWI(2.f, nScales-nScale)); |
|
DEBUG_ULTIMATE("Refine mesh at: %.2f image scale", scale); |
|
if (!refine.InitImages(scale, 0.12f*step+0.2f)) |
|
return false; |
|
|
|
// extract array of triangles incident to each vertex |
|
refine.ListVertexFacesPre(); |
|
|
|
// automatic mesh subdivision |
|
refine.SubdivideMesh(nMaxFaceArea, nScale == 0 ? fDecimateMesh : 1.f, nCloseHoles, nEnsureEdgeSize); |
|
|
|
// extract array of triangle normals |
|
refine.ListVertexFacesPost(); |
|
|
|
#if TD_VERBOSE != TD_VERBOSE_OFF |
|
if (VERBOSITY_LEVEL > 2) |
|
mesh.Save(MAKE_PATH(String::FormatString("MeshRefine%u.ply", nScales-nScale-1))); |
|
#endif |
|
|
|
// loop a constant number of iterations and apply the gradient |
|
int iters(25); |
|
float gstep(0.05f); |
|
if (fGradientStep > 1) { |
|
iters = FLOOR2INT(fGradientStep); |
|
gstep = (fGradientStep-(float)iters)*10; |
|
} |
|
iters = MAXF(iters/(int)(nScale+1),8); |
|
const int iterStop(iters*7/10); |
|
Eigen::Matrix<float,Eigen::Dynamic,3,Eigen::RowMajor> gradients(mesh.vertices.GetSize(),3); |
|
Util::Progress progress(_T("Processed iterations"), iters); |
|
GET_LOGCONSOLE().Pause(); |
|
for (int iter=0; iter<iters; ++iter) { |
|
refine.iteration = (unsigned)iter; |
|
refine.nAlternatePair = (iter+1 < iters ? nAlternatePair : 0); |
|
refine.ratioRigidityElasticity = (iter <= iterStop ? fRatioRigidityElasticity : 1.f); |
|
// evaluate residuals and gradients |
|
refine.ScoreMesh(gradients.data()); |
|
// apply gradients |
|
float gv(0); |
|
FOREACH(v, mesh.vertices) { |
|
Vertex& vert = mesh.vertices[v]; |
|
const Point3f grad(gradients.row(v)); |
|
if (!ISFINITE(grad)) |
|
continue; |
|
vert -= Vertex(grad*gstep); |
|
gv += norm(grad); |
|
} |
|
DEBUG_EXTRA("\t%2d. g: %.5f (%.3e - %.3e)\ts: %.3f", iter+1, gradients.norm(), gradients.norm()/mesh.vertices.GetSize(), gv/mesh.vertices.GetSize(), gstep); |
|
gstep *= 0.98f; |
|
progress.display(iter); |
|
} |
|
GET_LOGCONSOLE().Play(); |
|
progress.close(); |
|
|
|
#if TD_VERBOSE != TD_VERBOSE_OFF |
|
if (VERBOSITY_LEVEL > 2) |
|
mesh.Save(MAKE_PATH(String::FormatString("MeshRefined%u.ply", nScales-nScale-1))); |
|
#endif |
|
} |
|
|
|
return true; |
|
} // RefineMeshCUDA |
|
/*----------------------------------------------------------------*/ |
|
|
|
#endif // _USE_CUDA
|
|
|