/* * SceneRefineCUDA.cpp * * Copyright (c) 2014-2015 SEACAVE * * Author(s): * * cDc * * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . * * * Additional Terms: * * You are required to preserve legal notices and author attributions in * that material or in the Appropriate Legal Notices displayed by works * containing it. */ #include "Common.h" #include "Scene.h" using namespace MVS; #ifdef _USE_CUDA // D E F I N E S /////////////////////////////////////////////////// // uncomment to enable multi-threading based on OpenMP #ifdef _USE_OPENMP #define MESHCUDAOPT_USE_OPENMP #endif // uncomment to ensure edge size and improve vertex valence // (should enable more stable flow) #define MESHOPT_ENSUREEDGESIZE 1 // 0 - at all resolution // S T R U C T S /////////////////////////////////////////////////// static LPCSTR const g_szMeshRefineModule = ".version 3.2\n" ".target sm_20\n" ".address_size 64\n" "\n" ".global .texref texImageRef;\n" ".global .surfref surfImageRef;\n" ".global .surfref surfImageProjRef;\n" "\n" // kernel used to project the given mesh to a given camera plane: // the depth-map is computed by rasterizing all triangles (using a brute force scan-line approach) // and storing only the closest ones; // additionally the face index and barycentric coordinates are stored for each pixel ".visible .entry ProjectMesh(\n" " .param .u64 .ptr param_1, // array vertices (float*3 * numVertices)\n" " .param .u64 .ptr param_2, // array faces (uint32_t*3 * numFaces)\n" " .param .u64 .ptr param_3, // array face IDs (uint32_t * numFacesView)\n" " .param .u64 .ptr param_4, // depth-map (float) [out]\n" " .param .u64 .ptr param_5, // face-map (uint32_t) [out]\n" " .param .u64 .ptr param_6, // bary-map (hfloat*3) [out]\n" " .param .align 4 .b8 param_7[176], // camera\n" " .param .u32 param_8 // numFacesView (uint32_t)\n" ")\n" "{\n" " .reg .f32 %f<234>;\n" " .reg .pred %p<20>;\n" " .reg .s16 %rs<5>;\n" " .reg .s32 %r<105>;\n" " .reg .s64 %rl<42>;\n" "\n" " ld.param.u32 %r16, [param_8];\n" " mov.u32 %r17, %ntid.x;\n" " mov.u32 %r18, %ctaid.x;\n" " mov.u32 %r19, %tid.x;\n" " mad.lo.s32 %r5, %r17, %r18, %r19;\n" " setp.ge.s32 %p3, %r5, %r16;\n" " @%p3 bra BB00_1;\n" "\n" " ld.param.u64 %rl4, [param_3];\n" " cvta.to.global.u64 %rl3, %rl4;\n" " mul.wide.u32 %rl5, %r5, 4;\n" " add.u64 %rl6, %rl3, %rl5;\n" " ld.global.u32 %r6, [%rl6];\n" " mul.lo.s32 %r20, %r6, 3;\n" " ld.param.u64 %rl36, [param_2];\n" " cvta.to.global.u64 %rl10, %rl36;\n" " mul.wide.s32 %rl11, %r20, 4;\n" " add.s64 %rl12, %rl10, %rl11;\n" " mad.lo.s32 %r21, %r6, 3, 1;\n" " mul.wide.s32 %rl13, %r21, 4;\n" " add.s64 %rl14, %rl10, %rl13;\n" " ld.global.u32 %r22, [%rl12];\n" " mul.lo.s32 %r24, %r22, 3;\n" " ld.param.u64 %rl35, [param_1];\n" " cvta.to.global.u64 %rl15, %rl35;\n" " mul.wide.s32 %rl16, %r24, 4;\n" " add.s64 %rl17, %rl15, %rl16;\n" " ld.global.u32 %r25, [%rl14];\n" " mul.lo.s32 %r27, %r25, 3;\n" " mul.wide.s32 %rl18, %r27, 4;\n" " add.s64 %rl19, %rl15, %rl18;\n" " ld.global.f32 %f33, [%rl19];\n" " ld.global.f32 %f34, [%rl19+4];\n" " ld.global.f32 %f35, [%rl19+8];\n" " ld.global.u32 %r31, [%rl14+4];\n" " mul.lo.s32 %r33, %r31, 3;\n" " mul.wide.s32 %rl20, %r33, 4;\n" " add.s64 %rl21, %rl15, %rl20;\n" " ld.global.f32 %f36, [%rl21];\n" " ld.global.f32 %f37, [%rl21+4];\n" " ld.global.f32 %f38, [%rl21+8];\n" " ld.global.f32 %f39, [%rl17];\n" " ld.global.f32 %f40, [%rl17+4];\n" " ld.param.f32 %f217, [param_7+4];\n" " mul.f32 %f95, %f217, %f40;\n" " ld.param.f32 %f220, [param_7];\n" " fma.rn.f32 %f96, %f220, %f39, %f95;\n" " ld.global.f32 %f41, [%rl17+8];\n" " ld.param.f32 %f214, [param_7+8];\n" " fma.rn.f32 %f97, %f214, %f41, %f96;\n" " ld.param.f32 %f211, [param_7+12];\n" " add.f32 %f42, %f97, %f211;\n" " ld.param.f32 %f205, [param_7+20];\n" " mul.f32 %f98, %f205, %f40;\n" " ld.param.f32 %f208, [param_7+16];\n" " fma.rn.f32 %f99, %f208, %f39, %f98;\n" " ld.param.f32 %f202, [param_7+24];\n" " fma.rn.f32 %f100, %f202, %f41, %f99;\n" " ld.param.f32 %f197, [param_7+28];\n" " add.f32 %f43, %f100, %f197;\n" " ld.param.f32 %f191, [param_7+36];\n" " mul.f32 %f101, %f191, %f40;\n" " ld.param.f32 %f194, [param_7+32];\n" " fma.rn.f32 %f102, %f194, %f39, %f101;\n" " ld.param.f32 %f188, [param_7+40];\n" " fma.rn.f32 %f103, %f188, %f41, %f102;\n" " ld.param.f32 %f185, [param_7+44];\n" " add.f32 %f44, %f103, %f185;\n" " setp.gt.f32 %p4, %f44, 0f00000000;\n" " @%p4 bra BB00_8;\n" "\n" " mov.f32 %f222, 0fBF800000;\n" " mov.f32 %f221, %f222;\n" " bra.uni BB00_9;\n" "\n" " BB00_8:\n" " div.rn.f32 %f221, %f42, %f44;\n" " div.rn.f32 %f222, %f43, %f44;\n" "\n" " BB00_9:\n" " ld.param.f32 %f216, [param_7+4];\n" " mul.f32 %f106, %f216, %f34;\n" " ld.param.f32 %f219, [param_7];\n" " fma.rn.f32 %f107, %f219, %f33, %f106;\n" " ld.param.f32 %f213, [param_7+8];\n" " fma.rn.f32 %f108, %f213, %f35, %f107;\n" " ld.param.f32 %f210, [param_7+12];\n" " add.f32 %f49, %f108, %f210;\n" " ld.param.f32 %f204, [param_7+20];\n" " mul.f32 %f109, %f204, %f34;\n" " ld.param.f32 %f207, [param_7+16];\n" " fma.rn.f32 %f110, %f207, %f33, %f109;\n" " ld.param.f32 %f201, [param_7+24];\n" " fma.rn.f32 %f111, %f201, %f35, %f110;\n" " ld.param.f32 %f199, [param_7+28];\n" " add.f32 %f50, %f111, %f199;\n" " ld.param.f32 %f193, [param_7+36];\n" " mul.f32 %f112, %f193, %f34;\n" " ld.param.f32 %f196, [param_7+32];\n" " fma.rn.f32 %f113, %f196, %f33, %f112;\n" " ld.param.f32 %f190, [param_7+40];\n" " fma.rn.f32 %f114, %f190, %f35, %f113;\n" " ld.param.f32 %f187, [param_7+44];\n" " add.f32 %f51, %f114, %f187;\n" " setp.gt.f32 %p5, %f51, 0f00000000;\n" " @%p5 bra BB00_10;\n" "\n" " mov.f32 %f224, 0fBF800000;\n" " mov.f32 %f223, %f224;\n" " bra.uni BB00_11;\n" "\n" " BB00_10:\n" " div.rn.f32 %f223, %f49, %f51;\n" " div.rn.f32 %f224, %f50, %f51;\n" "\n" " BB00_11:\n" " ld.param.f32 %f215, [param_7+4];\n" " mul.f32 %f117, %f215, %f37;\n" " ld.param.f32 %f218, [param_7];\n" " fma.rn.f32 %f118, %f218, %f36, %f117;\n" " ld.param.f32 %f212, [param_7+8];\n" " fma.rn.f32 %f119, %f212, %f38, %f118;\n" " ld.param.f32 %f209, [param_7+12];\n" " add.f32 %f56, %f119, %f209;\n" " ld.param.f32 %f203, [param_7+20];\n" " mul.f32 %f120, %f203, %f37;\n" " ld.param.f32 %f206, [param_7+16];\n" " fma.rn.f32 %f121, %f206, %f36, %f120;\n" " ld.param.f32 %f200, [param_7+24];\n" " fma.rn.f32 %f122, %f200, %f38, %f121;\n" " ld.param.f32 %f198, [param_7+28];\n" " add.f32 %f57, %f122, %f198;\n" " ld.param.f32 %f192, [param_7+36];\n" " mul.f32 %f123, %f192, %f37;\n" " ld.param.f32 %f195, [param_7+32];\n" " fma.rn.f32 %f124, %f195, %f36, %f123;\n" " ld.param.f32 %f189, [param_7+40];\n" " fma.rn.f32 %f125, %f189, %f38, %f124;\n" " ld.param.f32 %f186, [param_7+44];\n" " add.f32 %f58, %f125, %f186;\n" " setp.gt.f32 %p6, %f58, 0f00000000;\n" " @%p6 bra BB00_12;\n" "\n" " mov.f32 %f226, 0fBF800000;\n" " mov.f32 %f225, %f226;\n" " bra.uni BB00_13;\n" "\n" " BB00_12:\n" " div.rn.f32 %f225, %f56, %f58;\n" " div.rn.f32 %f226, %f57, %f58;\n" "\n" " BB00_13:\n" " add.f32 %f2, %f221, 0fBF000000;\n" " cvt.rzi.s32.f32 %r40, %f2;\n" " add.f32 %f4, %f223, 0fBF000000;\n" " cvt.rzi.s32.f32 %r41, %f4;\n" " min.s32 %r42, %r40, %r41;\n" " add.f32 %f6, %f225, 0fBF000000;\n" " cvt.rzi.s32.f32 %r43, %f6;\n" " min.s32 %r44, %r42, %r43;\n" " add.s32 %r103, %r44, -1;\n" " add.f32 %f7, %f221, 0f3F000000;\n" " cvt.rzi.s32.f32 %r45, %f7;\n" " add.f32 %f8, %f223, 0f3F000000;\n" " cvt.rzi.s32.f32 %r46, %f8;\n" " max.s32 %r47, %r45, %r46;\n" " add.f32 %f9, %f225, 0f3F000000;\n" " cvt.rzi.s32.f32 %r48, %f9;\n" " max.s32 %r49, %r47, %r48;\n" " add.s32 %r7, %r49, 1;\n" " add.f32 %f11, %f222, 0fBF000000;\n" " cvt.rzi.s32.f32 %r50, %f11;\n" " add.f32 %f13, %f224, 0fBF000000;\n" " cvt.rzi.s32.f32 %r51, %f13;\n" " min.s32 %r52, %r50, %r51;\n" " add.f32 %f15, %f226, 0fBF000000;\n" " cvt.rzi.s32.f32 %r53, %f15;\n" " min.s32 %r54, %r52, %r53;\n" " add.s32 %r8, %r54, -1;\n" " add.f32 %f16, %f222, 0f3F000000;\n" " cvt.rzi.s32.f32 %r55, %f16;\n" " add.f32 %f17, %f224, 0f3F000000;\n" " cvt.rzi.s32.f32 %r56, %f17;\n" " max.s32 %r57, %r55, %r56;\n" " add.f32 %f18, %f226, 0f3F000000;\n" " cvt.rzi.s32.f32 %r58, %f18;\n" " max.s32 %r59, %r57, %r58;\n" " add.s32 %r9, %r59, 1;\n" " mov.s32 %r60, 10;\n" " setp.lt.s32 %p7, %r103, %r60;\n" " @%p7 bra BB00_1;\n" "\n" " mov.s32 %r61, 10;\n" " setp.lt.s32 %p8, %r8, %r61;\n" " @%p8 bra BB00_1;\n" "\n" " ld.param.u32 %r101, [param_7+168];\n" " add.s32 %r63, %r101, -10;\n" " setp.gt.s32 %p9, %r7, %r63;\n" " @%p9 bra BB00_1;\n" "\n" " ld.param.u32 %r102, [param_7+172];\n" " add.s32 %r65, %r102, -10;\n" " setp.gt.s32 %p10, %r9, %r65;\n" " @%p10 bra BB00_1;\n" "\n" " sub.f32 %f128, %f224, %f222;\n" " sub.f32 %f129, %f225, %f221;\n" " mul.f32 %f130, %f128, %f129;\n" " sub.f32 %f131, %f226, %f222;\n" " sub.f32 %f132, %f223, %f221;\n" " neg.f32 %f133, %f132;\n" " fma.rn.f32 %f134, %f133, %f131, %f130;\n" " rcp.rn.f32 %f63, %f134;\n" " sub.f32 %f153, %f221, %f223;\n" " mul.f32 %f67, %f63, %f153;\n" " mul.f32 %f68, %f63, %f128;\n" " mul.f32 %f69, %f63, %f129;\n" " sub.f32 %f154, %f222, %f226;\n" " mul.f32 %f70, %f63, %f154;\n" " cvt.rn.f32.s32 %f155, %r103;\n" " sub.f32 %f156, %f155, %f225;\n" " mul.f32 %f157, %f154, %f156;\n" " cvt.rn.f32.s32 %f158, %r8;\n" " sub.f32 %f159, %f158, %f226;\n" " sub.f32 %f160, %f221, %f225;\n" " neg.f32 %f161, %f160;\n" " fma.rn.f32 %f162, %f161, %f159, %f157;\n" " mul.f32 %f232, %f63, %f162;\n" " sub.f32 %f163, %f155, %f221;\n" " mul.f32 %f164, %f128, %f163;\n" " sub.f32 %f165, %f158, %f222;\n" " fma.rn.f32 %f166, %f133, %f165, %f164;\n" " mul.f32 %f229, %f63, %f166;\n" " setp.gt.s32 %p11, %r103, %r7;\n" " @%p11 bra BB00_1;\n" "\n" " setp.gt.s32 %p1, %r8, %r9;\n" " setp.gt.f32 %p2, %f63, 0f00000000;\n" " ld.param.u64 %rl37, [param_4];\n" " cvta.to.global.u64 %rl22, %rl37;\n" " ld.param.u64 %rl39, [param_5];\n" " cvta.to.global.u64 %rl28, %rl39;\n" " ld.param.u64 %rl40, [param_6];\n" " cvta.to.global.u64 %rl31, %rl40;\n" "\n" " BB00_2:\n" " mov.f32 %f230, %f232;\n" " mov.f32 %f74, %f230;\n" " mov.f32 %f227, %f229;\n" " mov.f32 %f73, %f227;\n" " @%p1 bra BB00_6;\n" "\n" " mov.u32 %r11, %r8;\n" " mov.f32 %f75, %f73;\n" " mov.f32 %f76, %f74;\n" "\n" " BB00_7:\n" " setp.ge.f32 %p12, %f75, 0f00000000;\n" " setp.ge.f32 %p13, %f76, 0f00000000;\n" " and.pred %p14, %p13, %p12;\n" " @!%p14 bra BB00_5;\n" "\n" " add.f32 %f167, %f76, %f75;\n" " setp.gtu.f32 %p15, %f167, 0f3F800000;\n" " @%p15 bra BB00_5;\n" "\n" " sub.f32 %f77, 0f3F800000, %f76;\n" " sub.f32 %f77, %f77, %f75;\n" " mul.f32 %f171, %f76, %f51;\n" " fma.rn.f32 %f172, %f77, %f44, %f171;\n" " fma.rn.f32 %f78, %f75, %f58, %f172;\n" " ld.param.u32 %r100, [param_7+168];\n" " mad.lo.s32 %r67, %r11, %r100, %r103;\n" " mul.wide.s32 %rl23, %r67, 4;\n" " mov.b32 %r13, %f78;\n" " add.s64 %rl24, %rl22, %rl23;\n" " ld.global.f32 %f233, [%rl24];\n" "\n" " BB00_3:\n" " setp.lt.f32 %p16, %f233, %f78;\n" " @%p16 bra BB00_5;\n" "\n" " mov.b32 %r72, %f233;\n" " add.s64 %rl27, %rl22, %rl23;\n" " atom.global.cas.b32 %r73, [%rl27], %r72, %r13;\n" " ld.global.f32 %f80, [%rl27];\n" " setp.neu.f32 %p17, %f233, %f80;\n" " mov.f32 %f233, %f80;\n" " @%p17 bra BB00_3;\n" "\n" " add.s64 %rl7, %rl28, %rl23;\n" " mul.wide.s32 %rl30, %r67, 6;\n" " add.s64 %rl8, %rl31, %rl30;\n" " @%p2 bra BB00_4;\n" "\n" " mov.s32 %r77, -1;\n" " st.global.u32 [%rl7], %r77;\n" " mov.u16 %rs4, 0;\n" " st.global.b16 [%rl8], %rs4;\n" " st.global.b16 [%rl8+2], %rs4;\n" " st.global.b16 [%rl8+4], %rs4;\n" " bra.uni BB00_5;\n" "\n" " BB00_4:\n" " st.global.u32 [%rl7], %r6;\n" " {\n" " .reg .b16 %temp;\n" " cvt.rn.f16.f32 %temp, %f77;\n" " mov.b16 %rs1, %temp;\n" " }\n" " st.global.b16 [%rl8], %rs1;\n" " {\n" " .reg .b16 %temp;\n" " cvt.rn.f16.f32 %temp, %f76;\n" " mov.b16 %rs2, %temp;\n" " }\n" " st.global.b16 [%rl8+2], %rs2;\n" " {\n" " .reg .b16 %temp;\n" " cvt.rn.f16.f32 %temp, %f75;\n" " mov.b16 %rs3, %temp;\n" " }\n" " st.global.b16 [%rl8+4], %rs3;\n" "\n" " BB00_5:\n" " add.f32 %f76, %f76, %f69;\n" " add.f32 %f75, %f75, %f67;\n" " add.s32 %r11, %r11, 1;\n" " setp.le.s32 %p18, %r11, %r9;\n" " @%p18 bra BB00_7;\n" "\n" " BB00_6:\n" " add.f32 %f83, %f74, %f70;\n" " add.f32 %f84, %f73, %f68;\n" " add.s32 %r103, %r103, 1;\n" " setp.le.s32 %p19, %r103, %r7;\n" " mov.f32 %f229, %f84;\n" " mov.f32 %f232, %f83;\n" " @%p19 bra BB00_2;\n" "\n" " BB00_1:\n" " ret;\n" "}\n" "\n" // kernel used to invalidate pixels that don't have valid both depth and face index ".visible .entry CrossCheckProjection(\n" " .param .u64 .ptr param_1, // depth-map (float) [in/out]\n" " .param .u64 .ptr param_2, // face-map (uint32_t) [in/out]\n" " .param .u32 param_3, // width\n" " .param .u32 param_4 // height\n" ")\n" "{\n" " .reg .f32 %f<2>;\n" " .reg .pred %p<10>;\n" " .reg .s32 %r<14>;\n" " .reg .s64 %rl<8>;\n" "\n" " ld.param.u32 %r1, [param_3];\n" " ld.param.u32 %r2, [param_4];\n" " mov.u32 %r6, %ntid.x;\n" " mov.u32 %r7, %ctaid.x;\n" " mov.u32 %r8, %tid.x;\n" " mad.lo.s32 %r3, %r6, %r7, %r8;\n" " mov.u32 %r9, %ntid.y;\n" " mov.u32 %r10, %ctaid.y;\n" " mov.u32 %r11, %tid.y;\n" " mad.lo.s32 %r4, %r9, %r10, %r11;\n" " setp.gt.s32 %p1, %r3, -1;\n" " setp.lt.s32 %p2, %r3, %r1;\n" " and.pred %p3, %p1, %p2;\n" " setp.gt.s32 %p4, %r4, -1;\n" " and.pred %p5, %p3, %p4;\n" " setp.lt.s32 %p6, %r4, %r2;\n" " and.pred %p7, %p5, %p6;\n" " @!%p7 bra BB00_1;\n" "\n" " mad.lo.s32 %r12, %r4, %r1, %r3;\n" " mul.wide.s32 %rl7, %r12, 4;\n" "\n" " ld.param.u64 %rl2, [param_1];\n" " cvta.to.global.u64 %rl1, %rl2;\n" " add.s64 %rl3, %rl1, %rl7;\n" "\n" " ld.param.u64 %rl5, [param_2];\n" " cvta.to.global.u64 %rl4, %rl5;\n" " add.s64 %rl6, %rl4, %rl7;\n" "\n" " ld.global.f32 %f1, [%rl3];\n" " setp.eq.f32 %p8, %f1, 0f7F7FFFFF;\n" " @%p8 bra BB00_2;\n" "\n" " ld.global.s32 %r13, [%rl6];\n" " setp.eq.s32 %p9, %r13, -1;\n" " @%p9 bra BB00_2;\n" "\n" " ret;\n" "\n" " BB00_2:\n" " st.global.f32 [%rl3], 0f00000000;\n" " st.global.s32 [%rl6], -1;\n" "\n" " BB00_1:\n" " ret;\n" "}\n" "\n" // kernel used to project image from view B to view A through the mesh; // additionally the mask is computed ".visible .entry ImageMeshWarp(\n" " .param .u64 .ptr param_1, // depth-map A (float)\n" " .param .u64 .ptr param_2, // depth-map B (float)\n" " .param .u64 .ptr param_3, // mask [out]\n" " .param .align 4 .b8 param_4[176], // camera A \n" " .param .align 4 .b8 param_5[176] // camera B \n" ")\n" "{\n" " .reg .f32 %f<187>;\n" " .reg .pred %p<19>;\n" " .reg .s16 %rs<2>;\n" " .reg .s32 %r<36>;\n" " .reg .s64 %rl<23>;\n" " .reg .s16 %rc<2>;\n" "\n" " ld.param.u32 %r1, [param_4+168];\n" " ld.param.u32 %r2, [param_4+172];\n" " mov.u32 %r6, %ntid.x;\n" " mov.u32 %r7, %ctaid.x;\n" " mov.u32 %r8, %tid.x;\n" " mad.lo.s32 %r3, %r6, %r7, %r8;\n" " mov.u32 %r9, %ntid.y;\n" " mov.u32 %r10, %ctaid.y;\n" " mov.u32 %r11, %tid.y;\n" " mad.lo.s32 %r4, %r9, %r10, %r11;\n" " setp.gt.s32 %p1, %r3, -1;\n" " setp.lt.s32 %p2, %r3, %r1;\n" " and.pred %p3, %p1, %p2;\n" " setp.gt.s32 %p4, %r4, -1;\n" " and.pred %p5, %p3, %p4;\n" " setp.lt.s32 %p6, %r4, %r2;\n" " and.pred %p7, %p5, %p6;\n" " @!%p7 bra BB00_0;\n" "\n" " mov.f32 %f141, 0f00000000;\n" " mov.u16 %rc1, 0;\n" " shl.b32 %r13, %r3, 1;\n" " ld.param.u64 %rl18, [param_1];\n" " cvta.to.global.u64 %rl10, %rl18;\n" " mad.lo.s32 %r12, %r4, %r1, %r3;\n" " mul.wide.s32 %rl9, %r12, 4;\n" " add.s64 %rl11, %rl10, %rl9;\n" " ld.global.f32 %f51, [%rl11];\n" " setp.gt.f32 %p8, %f51, 0f00000000;\n" " @!%p8 bra BB00_1;\n" "\n" " cvt.rn.f32.s32 %f88, %r3;\n" " ld.param.f32 %f147, [param_4+104];\n" " sub.f32 %f89, %f88, %f147;\n" " ld.param.f32 %f148, [param_4+96];\n" " div.rn.f32 %f90, %f89, %f148;\n" " cvt.rn.f32.s32 %f91, %r4;\n" " ld.param.f32 %f145, [param_4+116];\n" " sub.f32 %f92, %f91, %f145;\n" " ld.param.f32 %f146, [param_4+112];\n" " div.rn.f32 %f93, %f92, %f146;\n" " ld.param.f32 %f157, [param_4+60];\n" " mul.f32 %f94, %f157, %f93;\n" " ld.param.f32 %f160, [param_4+48];\n" " fma.rn.f32 %f95, %f160, %f90, %f94;\n" " ld.param.f32 %f154, [param_4+72];\n" " add.f32 %f96, %f95, %f154;\n" " ld.param.f32 %f156, [param_4+64];\n" " mul.f32 %f97, %f156, %f93;\n" " ld.param.f32 %f159, [param_4+52];\n" " fma.rn.f32 %f98, %f159, %f90, %f97;\n" " ld.param.f32 %f153, [param_4+76];\n" " add.f32 %f99, %f98, %f153;\n" " ld.param.f32 %f155, [param_4+68];\n" " mul.f32 %f100, %f155, %f93;\n" " ld.param.f32 %f158, [param_4+56];\n" " fma.rn.f32 %f101, %f158, %f90, %f100;\n" " ld.param.f32 %f152, [param_4+80];\n" " add.f32 %f102, %f101, %f152;\n" " ld.param.f32 %f72, [param_4+84];\n" " fma.rn.f32 %f55, %f51, %f96, %f72;\n" " ld.param.f32 %f75, [param_4+88];\n" " fma.rn.f32 %f56, %f51, %f99, %f75;\n" " ld.param.f32 %f78, [param_4+92];\n" " fma.rn.f32 %f57, %f51, %f102, %f78;\n" " ld.param.f32 %f183, [param_5+4];\n" " mul.f32 %f110, %f183, %f56;\n" " ld.param.f32 %f184, [param_5];\n" " fma.rn.f32 %f111, %f184, %f55, %f110;\n" " ld.param.f32 %f182, [param_5+8];\n" " fma.rn.f32 %f112, %f182, %f57, %f111;\n" " ld.param.f32 %f181, [param_5+12];\n" " add.f32 %f58, %f112, %f181;\n" " ld.param.f32 %f179, [param_5+20];\n" " mul.f32 %f113, %f179, %f56;\n" " ld.param.f32 %f180, [param_5+16];\n" " fma.rn.f32 %f114, %f180, %f55, %f113;\n" " ld.param.f32 %f178, [param_5+24];\n" " fma.rn.f32 %f115, %f178, %f57, %f114;\n" " ld.param.f32 %f177, [param_5+28];\n" " add.f32 %f59, %f115, %f177;\n" " ld.param.f32 %f175, [param_5+36];\n" " mul.f32 %f116, %f175, %f56;\n" " ld.param.f32 %f176, [param_5+32];\n" " fma.rn.f32 %f117, %f176, %f55, %f116;\n" " ld.param.f32 %f174, [param_5+40];\n" " fma.rn.f32 %f118, %f174, %f57, %f117;\n" " ld.param.f32 %f173, [param_5+44];\n" " add.f32 %f60, %f118, %f173;\n" " setp.gt.f32 %p9, %f60, 0f00000000;\n" " @!%p9 bra BB00_1;\n" "\n" " div.rn.f32 %f185, %f58, %f60;\n" " div.rn.f32 %f186, %f59, %f60;\n" " setp.leu.f32 %p10, %f185, 0f41200000;\n" " @%p10 bra BB00_1;\n" "\n" " ld.param.u32 %r35, [param_5+168];\n" " add.s32 %r18, %r35, -10;\n" " cvt.rn.f32.s32 %f127, %r18;\n" " setp.lt.f32 %p11, %f185, %f127;\n" " setp.gt.f32 %p12, %f186, 0f41200000;\n" " and.pred %p13, %p11, %p12;\n" " @!%p13 bra BB00_1;\n" "\n" " ld.param.u32 %r14, [param_5+172];\n" " add.s32 %r19, %r14, -10;\n" " cvt.rn.f32.s32 %f128, %r19;\n" " setp.geu.f32 %p14, %f186, %f128;\n" " @%p14 bra BB00_1;\n" "\n" " cvt.rzi.s32.f32 %r20, %f186;\n" " cvt.rzi.s32.f32 %r21, %f185;\n" " mad.lo.s32 %r22, %r20, %r35, %r21;\n" " ld.param.u64 %rl20, [param_2];\n" " cvta.to.global.u64 %rl14, %rl20;\n" " mul.wide.s32 %rl15, %r22, 4;\n" " add.s64 %rl6, %rl14, %rl15;\n" " ld.global.f32 %f129, [%rl6];\n" " sub.f32 %f130, %f129, %f60;\n" " abs.f32 %f131, %f130;\n" " mul.f32 %f1, %f60, 0f3C23D70A;\n" " setp.lt.f32 %p15, %f131, %f1;\n" " @%p15 bra BB00_2;\n" "\n" " ld.global.f32 %f135, [%rl6+4];\n" " sub.f32 %f136, %f135, %f60;\n" " abs.f32 %f137, %f136;\n" " setp.lt.f32 %p16, %f137, %f1;\n" " @%p16 bra BB00_2;\n" "\n" " add.s32 %r23, %r22, %r35;\n" " mul.wide.s32 %rl17, %r23, 4;\n" " add.s64 %rl7, %rl14, %rl17;\n" " ld.global.f32 %f132, [%rl7];\n" " sub.f32 %f133, %f132, %f60;\n" " abs.f32 %f134, %f133;\n" " setp.lt.f32 %p17, %f134, %f1;\n" " @%p17 bra BB00_2;\n" "\n" " ld.global.f32 %f138, [%rl7+4];\n" " sub.f32 %f139, %f138, %f60;\n" " abs.f32 %f140, %f139;\n" " setp.lt.f32 %p18, %f140, %f1;\n" " @%p18 bra BB00_2;\n" "\n" " BB00_1:\n" " suld.b.2d.b16.trap {%rs1}, [surfImageRef, {%r13, %r4}];\n" " bra.uni BB00_3;\n" "\n" " BB00_2:\n" " tex.2d.v4.f32.f32 {%f141, %f142, %f143, %f144}, [texImageRef, {%f185, %f186}];\n" " {\n" " .reg .b16 %temp;\n" " cvt.rn.f16.f32 %temp, %f141;\n" " mov.b16 %rs1, %temp;\n" " }\n" " mov.u16 %rc1, 1;\n" "\n" " BB00_3:\n" " sust.b.2d.b16.trap [surfImageProjRef, {%r13, %r4}], {%rs1};\n" " ld.param.u64 %rl1, [param_3];\n" " cvta.to.global.u64 %rl4, %rl1;\n" " cvt.s64.s32 %rl3, %r12;\n" " add.s64 %rl2, %rl4, %rl3;\n" " st.global.u8 [%rl2], %rc1;\n" " BB00_0:\n" " ret;\n" "}\n" "\n" // kernel used to compute the mean for all image pixels for a given windows size ".visible .entry ComputeImageMean(\n" " .param .u64 .ptr param_1, // image mask\n" " .param .u64 .ptr param_2, // image pixels mean [out]\n" " .param .u32 param_3, // image width\n" " .param .u32 param_4, // image height\n" " .param .u32 param_5 // half-window size\n" ")\n" "{\n" " .reg .f32 %f<10>;\n" " .reg .pred %p<19>;\n" " .reg .s16 %rc<4>;\n" " .reg .s32 %r<40>;\n" " .reg .s64 %rl<13>;\n" "\n" " ld.param.u32 %r1, [param_3];\n" " ld.param.u32 %r2, [param_4];\n" " mov.u32 %r12, %ntid.x;\n" " mov.u32 %r13, %ctaid.x;\n" " mov.u32 %r14, %tid.x;\n" " mad.lo.s32 %r4, %r12, %r13, %r14;\n" " mov.u32 %r15, %ntid.y;\n" " mov.u32 %r16, %ctaid.y;\n" " mov.u32 %r17, %tid.y;\n" " mad.lo.s32 %r5, %r15, %r16, %r17;\n" " setp.gt.s32 %p1, %r4, -1;\n" " setp.lt.s32 %p2, %r4, %r1;\n" " and.pred %p3, %p1, %p2;\n" " setp.gt.s32 %p4, %r5, -1;\n" " and.pred %p5, %p3, %p4;\n" " setp.lt.s32 %p6, %r5, %r2;\n" " and.pred %p7, %p5, %p6;\n" " @!%p7 bra BB00_1;\n" "\n" " ld.param.u64 %rl7, [param_1];\n" " ld.param.u64 %rl8, [param_2];\n" " cvta.to.global.u64 %rl2, %rl7;\n" " cvta.to.global.u64 %rl3, %rl8;\n" " ld.param.u32 %r36, [param_5];\n" " shl.b32 %r18, %r36, 1;\n" " or.b32 %r19, %r18, 1;\n" " cvt.rn.f32.s32 %f6, %r19;\n" " mul.f32 %f1, %f6, %f6;\n" " ld.param.u32 %r31, [param_3];\n" " mad.lo.s32 %r20, %r5, %r31, %r4;\n" " cvt.s64.s32 %rl4, %r20;\n" " mul.wide.s32 %rl9, %r20, 4;\n" " add.s64 %rl5, %rl3, %rl9;\n" " mov.u32 %r21, 0;\n" " st.global.u32 [%rl5], %r21;\n" " sub.s32 %r23, %r31, %r36;\n" " setp.lt.s32 %p8, %r4, %r23;\n" " setp.ge.s32 %p9, %r4, %r36;\n" " and.pred %p10, %p8, %p9;\n" " setp.ge.s32 %p11, %r5, %r36;\n" " and.pred %p12, %p10, %p11;\n" " ld.param.u32 %r32, [param_4];\n" " sub.s32 %r24, %r32, %r36;\n" " setp.lt.s32 %p13, %r5, %r24;\n" " and.pred %p14, %p12, %p13;\n" " @!%p14 bra BB00_1;\n" "\n" " add.s64 %rl10, %rl2, %rl4;\n" " ld.global.u8 %rc1, [%rl10];\n" " cvt.s16.s8 %rc1, %rc1;\n" " mov.b16 %rc2, 1;\n" " cvt.s16.s8 %rc2, %rc2;\n" " setp.eq.s16 %p15, %rc1, %rc2;\n" " @!%p15 bra BB00_1;\n" "\n" " ld.param.u32 %r35, [param_5];\n" " neg.s32 %r6, %r35;\n" " setp.gt.s32 %p16, %r6, %r35;\n" " @%p16 bra BB00_5;\n" "\n" " mov.f32 %f9, 0f00000000;\n" " mov.u32 %r39, %r6;\n" "\n" " BB00_3:\n" " mov.u32 %r7, %r39;\n" " add.s32 %r8, %r7, %r4;\n" " mov.u32 %r38, %r6;\n" "\n" " BB00_4:\n" " add.s32 %r26, %r38, %r5;\n" " shl.b32 %r27, %r8, 1;\n" " suld.b.2d.b16.trap {%rc3}, [surfImageRef, {%r27, %r26}];\n" " {\n" " .reg .b16 %temp;\n" " mov.b16 %temp, %rc3;\n" " cvt.f32.f16 %f7, %temp;\n" " }\n" " add.f32 %f9, %f9, %f7;\n" " add.s32 %r38, %r38, 1;\n" " ld.param.u32 %r34, [param_5];\n" " setp.le.s32 %p17, %r38, %r34;\n" " @%p17 bra BB00_4;\n" "\n" " add.s32 %r11, %r7, 1;\n" " ld.param.u32 %r33, [param_5];\n" " setp.le.s32 %p18, %r11, %r33;\n" " mov.u32 %r39, %r11;\n" " @%p18 bra BB00_3;\n" " bra.uni BB00_2;\n" "\n" " BB00_5:\n" " mov.f32 %f9, 0f00000000;\n" "\n" " BB00_2:\n" " div.rn.f32 %f8, %f9, %f1;\n" " st.global.f32 [%rl5], %f8;\n" "\n" " BB00_1:\n" " ret;\n" "}\n" "\n" // kernel used to compute the variance for all image pixels for a given windows size ".visible .entry ComputeImageVar(\n" " .param .u64 .ptr param_1, // image pixels mean\n" " .param .u64 .ptr param_2, // image mask\n" " .param .u64 .ptr param_3, // image pixels variance [out]\n" " .param .u32 param_4, // image width\n" " .param .u32 param_5, // image height\n" " .param .u32 param_6 // half-window size\n" ")\n" "{\n" " .reg .f32 %f<15>;\n" " .reg .pred %p<19>;\n" " .reg .s16 %rc<4>;\n" " .reg .s32 %r<43>;\n" " .reg .s64 %rl<17>;\n" "\n" " ld.param.u32 %r1, [param_4];\n" " ld.param.u32 %r2, [param_5];\n" " mov.u32 %r12, %ntid.x;\n" " mov.u32 %r13, %ctaid.x;\n" " mov.u32 %r14, %tid.x;\n" " mad.lo.s32 %r4, %r12, %r13, %r14;\n" " mov.u32 %r15, %ntid.y;\n" " mov.u32 %r16, %ctaid.y;\n" " mov.u32 %r17, %tid.y;\n" " mad.lo.s32 %r5, %r15, %r16, %r17;\n" " setp.gt.s32 %p1, %r4, -1;\n" " setp.lt.s32 %p2, %r4, %r1;\n" " and.pred %p3, %p1, %p2;\n" " setp.gt.s32 %p4, %r5, -1;\n" " and.pred %p5, %p3, %p4;\n" " setp.lt.s32 %p6, %r5, %r2;\n" " and.pred %p7, %p5, %p6;\n" " @!%p7 bra BB00_1;\n" "\n" " ld.param.u64 %rl8, [param_1];\n" " ld.param.u64 %rl9, [param_2];\n" " ld.param.u64 %rl10, [param_3];\n" " cvta.to.global.u64 %rl1, %rl8;\n" " cvta.to.global.u64 %rl3, %rl9;\n" " cvta.to.global.u64 %rl4, %rl10;\n" " ld.param.u32 %r39, [param_6];\n" " shl.b32 %r18, %r39, 1;\n" " or.b32 %r19, %r18, 1;\n" " cvt.rn.f32.s32 %f7, %r19;\n" " mul.f32 %f1, %f7, %f7;\n" " ld.param.u32 %r34, [param_4];\n" " mad.lo.s32 %r20, %r5, %r34, %r4;\n" " cvt.s64.s32 %rl5, %r20;\n" " shl.b64 %rl11, %rl5, 2;\n" " add.s64 %rl6, %rl4, %rl11;\n" " mov.f32 %f3, 0f00000000;\n" " st.global.f32 [%rl6], %f3;\n" " sub.s32 %r23, %r34, %r39;\n" " setp.lt.s32 %p8, %r4, %r23;\n" " setp.ge.s32 %p9, %r4, %r39;\n" " and.pred %p10, %p8, %p9;\n" " setp.ge.s32 %p11, %r5, %r39;\n" " and.pred %p12, %p10, %p11;\n" " ld.param.u32 %r35, [param_5];\n" " sub.s32 %r24, %r35, %r39;\n" " setp.lt.s32 %p13, %r5, %r24;\n" " and.pred %p14, %p12, %p13;\n" " @!%p14 bra BB00_1;\n" "\n" " add.s64 %rl12, %rl3, %rl5;\n" " ld.global.u8 %rc1, [%rl12];\n" " cvt.s16.s8 %rc1, %rc1;\n" " mov.b16 %rc2, 1;\n" " cvt.s16.s8 %rc2, %rc2;\n" " setp.eq.s16 %p15, %rc1, %rc2;\n" " @!%p15 bra BB00_1;\n" "\n" " ld.param.u32 %r38, [param_6];\n" " neg.s32 %r6, %r38;\n" " add.s64 %rl14, %rl1, %rl11;\n" " ld.global.f32 %f2, [%rl14];\n" " mov.f32 %f14, 0f00000000;\n" " mov.u32 %r42, %r6;\n" "\n" " BB00_2:\n" " mov.u32 %r7, %r42;\n" " add.s32 %r8, %r7, %r4;\n" " mov.u32 %r41, %r6;\n" "\n" " BB00_3:\n" " add.s32 %r26, %r41, %r5;\n" " shl.b32 %r27, %r8, 1;\n" " suld.b.2d.b16.trap {%rc3}, [surfImageRef, {%r27, %r26}];\n" " {\n" " .reg .b16 %temp;\n" " mov.b16 %temp, %rc3;\n" " cvt.f32.f16 %f9, %temp;\n" " }\n" " sub.f32 %f10, %f9, %f2;\n" " fma.rn.f32 %f14, %f10, %f10, %f14;\n" " add.s32 %r41, %r41, 1;\n" " ld.param.u32 %r37, [param_6];\n" " setp.le.s32 %p17, %r41, %r37;\n" " @%p17 bra BB00_3;\n" "\n" " add.s32 %r11, %r7, 1;\n" " ld.param.u32 %r36, [param_6];\n" " setp.le.s32 %p18, %r11, %r36;\n" " mov.u32 %r42, %r11;\n" " @%p18 bra BB00_2;\n" "\n" " div.rn.f32 %f12, %f14, %f1;\n" " max.f32 %f12, %f12, 0f38D1B717;\n" " st.global.f32 [%rl6], %f12;\n" "\n" " BB00_1:\n" " ret;\n" "}\n" "\n" // kernel used to compute the covariance for all image pixels for a given windows size ".visible .entry ComputeImageCov(\n" " .param .u64 .ptr param_1, // meanA\n" " .param .u64 .ptr param_2, // meanB\n" " .param .u64 .ptr param_3, // mask\n" " .param .u64 .ptr param_4, // cov [out]\n" " .param .u32 param_5, // image width\n" " .param .u32 param_6, // image height\n" " .param .u32 param_7 // window size\n" ")\n" "{\n" " .reg .f32 %f<17>;\n" " .reg .pred %p<19>;\n" " .reg .s16 %rs<4>;\n" " .reg .s32 %r<53>;\n" " .reg .s64 %rl<27>;\n" "\n" " ld.param.u32 %r1, [param_5];\n" " ld.param.u32 %r2, [param_6];\n" " ld.param.u64 %rl10, [param_1];\n" " ld.param.u64 %rl12, [param_2];\n" " ld.param.u64 %rl13, [param_3];\n" " ld.param.u64 %rl1, [param_4];\n" " cvta.to.global.u64 %rl2, %rl12;\n" " cvta.to.global.u64 %rl4, %rl10;\n" " cvta.to.global.u64 %rl6, %rl13;\n" " cvta.to.global.u64 %rl7, %rl1;\n" " mov.u32 %r12, %ntid.x;\n" " mov.u32 %r13, %ctaid.x;\n" " mov.u32 %r14, %tid.x;\n" " mad.lo.s32 %r4, %r12, %r13, %r14;\n" " mov.u32 %r15, %ntid.y;\n" " mov.u32 %r16, %ctaid.y;\n" " mov.u32 %r17, %tid.y;\n" " mad.lo.s32 %r5, %r15, %r16, %r17;\n" " setp.gt.s32 %p1, %r4, -1;\n" " setp.lt.s32 %p2, %r4, %r1;\n" " and.pred %p3, %p1, %p2;\n" " setp.gt.s32 %p4, %r5, -1;\n" " and.pred %p5, %p3, %p4;\n" " setp.lt.s32 %p6, %r5, %r2;\n" " and.pred %p7, %p5, %p6;\n" " @!%p7 bra BB00_1;\n" "\n" " ld.param.u32 %r49, [param_7];\n" " shl.b32 %r18, %r49, 1;\n" " or.b32 %r19, %r18, 1;\n" " cvt.rn.f32.s32 %f8, %r19;\n" " mul.f32 %f1, %f8, %f8;\n" " ld.param.u32 %r44, [param_5];\n" " mad.lo.s32 %r20, %r5, %r44, %r4;\n" " cvt.s64.s32 %rl8, %r20;\n" " mul.wide.s32 %rl14, %r20, 4;\n" " add.s64 %rl15, %rl7, %rl14;\n" " mov.u32 %r21, 0;\n" " st.global.u32 [%rl15], %r21;\n" " sub.s32 %r23, %r44, %r49;\n" " setp.lt.s32 %p8, %r4, %r23;\n" " setp.ge.s32 %p9, %r4, %r49;\n" " and.pred %p10, %p8, %p9;\n" " setp.ge.s32 %p11, %r5, %r49;\n" " and.pred %p12, %p10, %p11;\n" " ld.param.u32 %r45, [param_6];\n" " sub.s32 %r24, %r45, %r49;\n" " setp.lt.s32 %p13, %r5, %r24;\n" " and.pred %p14, %p12, %p13;\n" " @!%p14 bra BB00_1;\n" "\n" " add.s64 %rl16, %rl6, %rl8;\n" " ld.global.u8 %rs3, [%rl16];\n" " {\n" " .reg .s16 %temp1;\n" " .reg .s16 %temp2;\n" " cvt.s16.s8 %temp1, %rs3;\n" " mov.b16 %temp2, 1;\n" " cvt.s16.s8 %temp2, %temp2;\n" " setp.eq.s16 %p15, %temp1, %temp2;\n" " }\n" " @!%p15 bra BB00_1;\n" "\n" " neg.s32 %r6, %r49;\n" " setp.gt.s32 %p16, %r6, %r49;\n" " @%p16 bra BB00_4;\n" "\n" " shl.b64 %rl17, %rl8, 2;\n" " add.s64 %rl18, %rl4, %rl17;\n" " ld.global.f32 %f2, [%rl18];\n" " add.s64 %rl19, %rl2, %rl17;\n" " ld.global.f32 %f3, [%rl19];\n" " mov.f32 %f16, 0f00000000;\n" " mov.u32 %r52, %r6;\n" "\n" " BB00_2:\n" " mov.u32 %r7, %r52;\n" " add.s32 %r8, %r7, %r4;\n" " mov.u32 %r51, %r6;\n" "\n" " BB00_3:\n" " add.s32 %r26, %r51, %r5;\n" " shl.b32 %r27, %r8, 1;\n" " suld.b.2d.b16.trap {%rs1}, [surfImageRef, {%r27, %r26}];\n" " {\n" " .reg .b16 %temp;\n" " mov.b16 %temp, %rs1;\n" " cvt.f32.f16 %f10, %temp;\n" " }\n" " sub.f32 %f11, %f10, %f2;\n" " suld.b.2d.b16.trap {%rs2}, [surfImageProjRef, {%r27, %r26}];\n" " {\n" " .reg .b16 %temp;\n" " mov.b16 %temp, %rs2;\n" " cvt.f32.f16 %f12, %temp;\n" " }\n" " sub.f32 %f13, %f12, %f3;\n" " fma.rn.f32 %f16, %f11, %f13, %f16;\n" " add.s32 %r51, %r51, 1;\n" " setp.le.s32 %p17, %r51, %r49;\n" " @%p17 bra BB00_3;\n" "\n" " add.s32 %r11, %r7, 1;\n" " setp.le.s32 %p18, %r11, %r49;\n" " mov.u32 %r52, %r11;\n" " @%p18 bra BB00_2;\n" " bra.uni BB00_5;\n" "\n" " BB00_4:\n" " mov.f32 %f16, 0f00000000;\n" "\n" " BB00_5:\n" " ld.param.u32 %r42, [param_5];\n" " mad.lo.s32 %r40, %r5, %r42, %r4;\n" " ld.param.u64 %rl26, [param_4];\n" " cvta.to.global.u64 %rl23, %rl26;\n" " mul.wide.s32 %rl24, %r40, 4;\n" " add.s64 %rl25, %rl23, %rl24;\n" " div.rn.f32 %f15, %f16, %f1;\n" " st.global.f32 [%rl25], %f15;\n" " BB00_1:\n" " ret;\n" "}\n" "\n" // kernel used to compute the ZNCC score for all image pixels ".visible .entry ComputeImageZNCC(\n" " .param .u64 .ptr param_2, // cov\n" " .param .u64 .ptr param_3, // varA\n" " .param .u64 .ptr param_4, // varB\n" " .param .u64 .ptr param_5, // mask\n" " .param .u64 .ptr param_6, // ZNCC [out]\n" " .param .u32 param_0, // image width\n" " .param .u32 param_1, // image height\n" " .param .u32 param_7 // window size\n" ")\n" "{\n" " .reg .f32 %f<7>;\n" " .reg .pred %p<16>;\n" " .reg .s32 %r<25>;\n" " .reg .s64 %rl<19>;\n" " .reg .s16 %rc<2>;\n" "\n" " ld.param.u32 %r1, [param_0];\n" " ld.param.u32 %r2, [param_1];\n" " ld.param.u64 %rl8, [param_2];\n" " ld.param.u64 %rl9, [param_3];\n" " ld.param.u64 %rl10, [param_4];\n" " ld.param.u64 %rl11, [param_5];\n" " ld.param.u64 %rl12, [param_6];\n" " cvta.to.global.u64 %rl1, %rl10;\n" " cvta.to.global.u64 %rl2, %rl9;\n" " cvta.to.global.u64 %rl3, %rl8;\n" " cvta.to.global.u64 %rl4, %rl11;\n" " cvta.to.global.u64 %rl5, %rl12;\n" " mov.u32 %r6, %ntid.x;\n" " mov.u32 %r7, %ctaid.x;\n" " mov.u32 %r8, %tid.x;\n" " mad.lo.s32 %r4, %r6, %r7, %r8;\n" " mov.u32 %r9, %ntid.y;\n" " mov.u32 %r10, %ctaid.y;\n" " mov.u32 %r11, %tid.y;\n" " mad.lo.s32 %r5, %r9, %r10, %r11;\n" " setp.gt.s32 %p1, %r4, -1;\n" " setp.lt.s32 %p2, %r4, %r1;\n" " and.pred %p3, %p1, %p2;\n" " setp.gt.s32 %p4, %r5, -1;\n" " and.pred %p5, %p3, %p4;\n" " setp.lt.s32 %p6, %r5, %r2;\n" " and.pred %p7, %p5, %p6;\n" " @!%p7 bra BB00_1;\n" "\n" " ld.param.u32 %r22, [param_0];\n" " mad.lo.s32 %r12, %r5, %r22, %r4;\n" " cvt.s64.s32 %rl6, %r12;\n" " mul.wide.s32 %rl13, %r12, 4;\n" " add.s64 %rl7, %rl5, %rl13;\n" " mov.u32 %r13, 0;\n" " st.global.u32 [%rl7], %r13;\n" " ld.param.u32 %r24, [param_7];\n" " sub.s32 %r15, %r22, %r24;\n" " setp.lt.s32 %p8, %r4, %r15;\n" " setp.ge.s32 %p9, %r4, %r24;\n" " and.pred %p10, %p8, %p9;\n" " setp.ge.s32 %p11, %r5, %r24;\n" " and.pred %p12, %p10, %p11;\n" " ld.param.u32 %r23, [param_1];\n" " sub.s32 %r16, %r23, %r24;\n" " setp.lt.s32 %p13, %r5, %r16;\n" " and.pred %p14, %p12, %p13;\n" " @!%p14 bra BB00_1;\n" "\n" " add.s64 %rl14, %rl4, %rl6;\n" " ld.global.u8 %rc1, [%rl14];\n" " {\n" " .reg .s16 %temp1;\n" " .reg .s16 %temp2;\n" " cvt.s16.s8 %temp1, %rc1;\n" " mov.b16 %temp2, 1;\n" " cvt.s16.s8 %temp2, %temp2;\n" " setp.eq.s16 %p15, %temp1, %temp2;\n" " }\n" " @!%p15 bra BB00_1;\n" "\n" " shl.b64 %rl15, %rl6, 2;\n" " add.s64 %rl16, %rl3, %rl15;\n" " add.s64 %rl17, %rl1, %rl15;\n" " ld.global.f32 %f1, [%rl17];\n" " add.s64 %rl18, %rl2, %rl15;\n" " ld.global.f32 %f2, [%rl18];\n" " mul.f32 %f3, %f2, %f1;\n" " sqrt.rn.f32 %f4, %f3;\n" " ld.global.f32 %f5, [%rl16];\n" " div.rn.f32 %f6, %f5, %f4;\n" " st.global.f32 [%rl7], %f6;\n" " BB00_1:\n" " ret;\n" "}\n" "\n" // kernel used to compute the gradient of the ZNCC score for all image pixels ".visible .entry ComputeImageDZNCC(\n" " .param .u64 .ptr param_1, // meanA\n" " .param .u64 .ptr param_2, // meanB\n" " .param .u64 .ptr param_3, // varA\n" " .param .u64 .ptr param_4, // varB\n" " .param .u64 .ptr param_5, // ZNCC\n" " .param .u64 .ptr param_6, // mask\n" " .param .u64 .ptr param_7, // NCCGrad [out]\n" " .param .u32 param_8, // image width\n" " .param .u32 param_9, // image height\n" " .param .u32 param_10 // window size\n" ")\n" "{\n" " .reg .f32 %f<69>;\n" " .reg .pred %p<20>;\n" " .reg .s16 %rc<7>;\n" " .reg .s32 %r<90>;\n" " .reg .s64 %rl<81>;\n" "\n" " ld.param.u32 %r1, [param_8];\n" " ld.param.u32 %r2, [param_9];\n" " mov.u32 %r17, %ntid.x;\n" " mov.u32 %r18, %ctaid.x;\n" " mov.u32 %r19, %tid.x;\n" " mad.lo.s32 %r5, %r17, %r18, %r19;\n" " mov.u32 %r20, %ntid.y;\n" " mov.u32 %r21, %ctaid.y;\n" " mov.u32 %r7, %tid.y;\n" " mad.lo.s32 %r8, %r20, %r21, %r7;\n" " setp.gt.s32 %p1, %r5, -1;\n" " setp.lt.s32 %p2, %r5, %r1;\n" " and.pred %p3, %p1, %p2;\n" " setp.gt.s32 %p4, %r8, -1;\n" " and.pred %p5, %p3, %p4;\n" " setp.lt.s32 %p6, %r8, %r2;\n" " and.pred %p7, %p5, %p6;\n" " @!%p7 bra BB00_1;\n" "\n" " mad.lo.s32 %r22, %r8, %r1, %r5;\n" " ld.param.u64 %rl55, [param_7];\n" " cvta.to.global.u64 %rl13, %rl55;\n" " mul.wide.s32 %rl14, %r22, 4;\n" " add.s64 %rl15, %rl13, %rl14;\n" " mov.u32 %r23, 0;\n" " st.global.u32 [%rl15], %r23;\n" " ld.param.u32 %r74, [param_10];\n" " sub.s32 %r27, %r1, %r74;\n" " setp.lt.s32 %p8, %r5, %r27;\n" " setp.ge.s32 %p9, %r5, %r74;\n" " and.pred %p10, %p8, %p9;\n" " setp.ge.s32 %p11, %r8, %r74;\n" " and.pred %p12, %p10, %p11;\n" " sub.s32 %r28, %r2, %r74;\n" " setp.lt.s32 %p13, %r8, %r28;\n" " and.pred %p14, %p12, %p13;\n" " @!%p14 bra BB00_1;\n" "\n" " ld.param.u64 %rl48, [param_3];\n" " cvta.to.global.u64 %rl23, %rl48;\n" " ld.param.u64 %rl50, [param_4];\n" " cvta.to.global.u64 %rl26, %rl50;\n" " ld.param.u64 %rl53, [param_6];\n" " cvta.to.global.u64 %rl18, %rl53;\n" " ld.param.u32 %r66, [param_8];\n" " mad.lo.s32 %r29, %r8, %r66, %r5;\n" " cvt.s64.s32 %rl19, %r29;\n" " add.s64 %rl20, %rl18, %rl19;\n" " ld.global.u8 %rc1, [%rl20];\n" " {\n" " .reg .s16 %temp1;\n" " .reg .s16 %temp2;\n" " cvt.s16.s8 %temp1, %rc1;\n" " mov.b16 %temp2, 1;\n" " cvt.s16.s8 %temp2, %temp2;\n" " setp.eq.s16 %p15, %temp1, %temp2;\n" " }\n" " @!%p15 bra BB00_1;\n" "\n" " neg.s32 %r31, %r74;\n" " setp.gt.s32 %p16, %r31, %r74;\n" " @%p16 bra BB00_6;\n" "\n" " neg.s32 %r77, %r74;\n" " mov.f32 %f56, 0f00000000;\n" " mov.f32 %f57, %f56;\n" " mov.f32 %f58, %f56;\n" " mov.f32 %f59, %f56;\n" "\n" " BB00_2:\n" " add.s32 %r12, %r77, %r5;\n" " neg.s32 %r78, %r74;\n" "\n" " BB00_3:\n" " ld.param.u64 %rl52, [param_6];\n" " cvta.to.global.u64 %rl21, %rl52;\n" " add.s32 %r36, %r78, %r8;\n" " ld.param.u32 %r65, [param_8];\n" " mad.lo.s32 %r37, %r36, %r65, %r12;\n" " cvt.s64.s32 %rl12, %r37;\n" " add.s64 %rl22, %rl21, %rl12;\n" " ld.global.u8 %rc2, [%rl22];\n" " {\n" " .reg .s16 %temp1;\n" " .reg .s16 %temp2;\n" " cvt.s16.s8 %temp1, %rc2;\n" " mov.b16 %temp2, 1;\n" " cvt.s16.s8 %temp2, %temp2;\n" " setp.eq.s16 %p17, %temp1, %temp2;\n" " }\n" " @%p17 bra BB00_4;\n" " bra.uni BB00_5;\n" "\n" " BB00_4:\n" " shl.b64 %rl24, %rl12, 2;\n" " add.s64 %rl25, %rl23, %rl24;\n" " add.s64 %rl27, %rl26, %rl24;\n" " ld.global.f32 %f25, [%rl27];\n" " ld.global.f32 %f26, [%rl25];\n" " mul.f32 %f27, %f26, %f25;\n" " sqrt.rn.f32 %f28, %f27;\n" " rcp.rn.f32 %f29, %f28;\n" " sub.f32 %f58, %f58, %f29;\n" " ld.param.u64 %rl51, [param_5];\n" " cvta.to.global.u64 %rl28, %rl51;\n" " add.s64 %rl29, %rl28, %rl24;\n" " ld.global.f32 %f30, [%rl29];\n" " div.rn.f32 %f31, %f30, %f25;\n" " add.f32 %f57, %f57, %f31;\n" " ld.param.u64 %rl45, [param_1];\n" " cvta.to.global.u64 %rl30, %rl45;\n" " add.s64 %rl31, %rl30, %rl24;\n" " ld.global.f32 %f32, [%rl31];\n" " ld.param.u64 %rl46, [param_2];\n" " cvta.to.global.u64 %rl32, %rl46;\n" " add.s64 %rl33, %rl32, %rl24;\n" " ld.global.f32 %f33, [%rl33];\n" " mul.f32 %f34, %f33, %f30;\n" " div.rn.f32 %f35, %f34, %f25;\n" " neg.f32 %f36, %f35;\n" " fma.rn.f32 %f37, %f32, %f29, %f36;\n" " add.f32 %f56, %f56, %f37;\n" " add.f32 %f59, %f59, 0f3F800000;\n" "\n" " BB00_5:\n" " add.s32 %r78, %r78, 1;\n" " setp.le.s32 %p18, %r78, %r74;\n" " @%p18 bra BB00_3;\n" "\n" " add.s32 %r77, %r77, 1;\n" " setp.le.s32 %p19, %r77, %r74;\n" " @%p19 bra BB00_2;\n" " bra.uni BB00_7;\n" "\n" " BB00_6:\n" " mov.f32 %f59, 0f00000000;\n" " mov.f32 %f58, %f59;\n" " mov.f32 %f57, %f59;\n" " mov.f32 %f56, %f59;\n" "\n" " BB00_7:\n" " shl.b32 %r9, %r5, 1;\n" " suld.b.2d.b16.trap {%rc5}, [surfImageProjRef, {%r9, %r8}];\n" " {\n" " .reg .b16 %temp;\n" " mov.b16 %temp, %rc5;\n" " cvt.f32.f16 %f22, %temp;\n" " }\n" " div.rn.f32 %f43, %f57, %f59;\n" " mul.f32 %f65, %f43, %f22;\n" " div.rn.f32 %f42, %f58, %f59;\n" " suld.b.2d.b16.trap {%rc6}, [surfImageRef, {%r9, %r8}];\n" " {\n" " .reg .b16 %temp;\n" " mov.b16 %temp, %rc6;\n" " cvt.f32.f16 %f21, %temp;\n" " }\n" " fma.rn.f32 %f66, %f42, %f21, %f65;\n" " div.rn.f32 %f44, %f56, %f59;\n" " add.f32 %f68, %f66, %f44;\n" " add.s64 %rl42, %rl23, %rl14;\n" " add.s64 %rl44, %rl26, %rl14;\n" " ld.global.f32 %f45, [%rl44];\n" " ld.global.f32 %f46, [%rl42];\n" " min.f32 %f47, %f46, %f45;\n" " add.f32 %f48, %f47, 0f3AC49BA6;\n" " div.rn.f32 %f49, %f47, %f48;\n" " mul.f32 %f51, %f68, %f49;\n" " st.global.f32 [%rl15], %f51;\n" " BB00_1:\n" " ret;\n" "}\n" "\n" // kernel used to compute the photometric gradient for all vertices seen by an image pair ".visible .entry ComputePhotometricGradient(\n" " .param .u64 .ptr param_1, // faces\n" " .param .u64 .ptr param_2, // normals\n" " .param .u64 .ptr param_3, // depth-map A (float)\n" " .param .u64 .ptr param_4, // face-map A (uint32_t)\n" " .param .u64 .ptr param_5, // bary-map A (hfloat*3)\n" " .param .u64 .ptr param_6, // DZNCC\n" " .param .u64 .ptr param_7, // mask\n" " .param .u64 .ptr param_8, // photo-grad [in/out]\n" " .param .u64 .ptr param_9, // photo-grad-norm [in/out]\n" " .param .align 4 .b8 param_10[176], // camera A\n" " .param .align 4 .b8 param_11[176], // camera B\n" " .param .f32 param_12 // square(avg-depth/f) scale (float)\n" ")\n" "{\n" " .reg .f32 %f<232>;\n" " .reg .pred %p<11>;\n" " .reg .s16 %rs<5>;\n" " .reg .s32 %r<69>;\n" " .reg .s64 %rl<66>;\n" "\n" " ld.param.u32 %r1, [param_10+168];\n" " ld.param.u32 %r2, [param_10+172];\n" " mov.u32 %r9, %ntid.x;\n" " mov.u32 %r10, %ctaid.x;\n" " mov.u32 %r11, %tid.x;\n" " mad.lo.s32 %r3, %r9, %r10, %r11;\n" " mov.u32 %r12, %ntid.y;\n" " mov.u32 %r13, %ctaid.y;\n" " mov.u32 %r14, %tid.y;\n" " mad.lo.s32 %r4, %r12, %r13, %r14;\n" " setp.gt.s32 %p1, %r3, -1;\n" " setp.lt.s32 %p2, %r3, %r1;\n" " and.pred %p3, %p1, %p2;\n" " setp.gt.s32 %p4, %r4, -1;\n" " and.pred %p5, %p3, %p4;\n" " setp.lt.s32 %p6, %r4, %r2;\n" " and.pred %p7, %p5, %p6;\n" " @!%p7 bra BB00_1;\n" "\n" " mad.lo.s32 %r15, %r4, %r1, %r3;\n" " cvt.s64.s32 %rl11, %r15;\n" " ld.param.u64 %rl12, [param_7];\n" " cvta.to.global.u64 %rl10, %rl12;\n" " add.s64 %rl13, %rl10, %rl11;\n" " ld.global.u8 %rs4, [%rl13];\n" " {\n" " .reg .s16 %temp1;\n" " .reg .s16 %temp2;\n" " cvt.s16.s8 %temp1, %rs4;\n" " mov.b16 %temp2, 1;\n" " cvt.s16.s8 %temp2, %temp2;\n" " setp.ne.s16 %p8, %temp1, %temp2;\n" " }\n" " @%p8 bra BB00_1;\n" "\n" " ld.param.u64 %rl59, [param_3];\n" " cvta.to.global.u64 %rl14, %rl59;\n" " shl.b64 %rl15, %rl11, 2;\n" " add.s64 %rl16, %rl14, %rl15;\n" " ld.global.f32 %f4, [%rl16];\n" " ld.param.u64 %rl60, [param_4];\n" " cvta.to.global.u64 %rl17, %rl60;\n" " add.s64 %rl18, %rl17, %rl15;\n" " ld.param.u64 %rl61, [param_5];\n" " cvta.to.global.u64 %rl19, %rl61;\n" " mul.wide.s32 %rl9, %r15, 6;\n" " add.s64 %rl20, %rl19, %rl9;\n" " ld.global.b16 %rs1, [%rl20];\n" " {\n" " .reg .b16 %temp;\n" " mov.b16 %temp, %rs1;\n" " cvt.f32.f16 %f41, %temp;\n" " }\n" " ld.global.b16 %rs2, [%rl20+2];\n" " {\n" " .reg .b16 %temp;\n" " mov.b16 %temp, %rs2;\n" " cvt.f32.f16 %f42, %temp;\n" " }\n" " ld.global.b16 %rs3, [%rl20+4];\n" " {\n" " .reg .b16 %temp;\n" " mov.b16 %temp, %rs3;\n" " cvt.f32.f16 %f43, %temp;\n" " }\n" " ld.param.f32 %f44, [param_12];\n" " ld.global.u32 %r20, [%rl18];\n" " mul.lo.s32 %r22, %r20, 3;\n" " ld.param.u64 %rl57, [param_1];\n" " cvta.to.global.u64 %rl23, %rl57;\n" " mul.wide.s32 %rl24, %r22, 4;\n" " add.s64 %rl25, %rl23, %rl24;\n" " ld.global.u32 %r5, [%rl25];\n" " ld.global.u32 %r6, [%rl25+4];\n" " ld.global.u32 %r7, [%rl25+8];\n" " ld.param.u64 %rl58, [param_2];\n" " cvta.to.global.u64 %rl26, %rl58;\n" " add.s64 %rl27, %rl26, %rl24;\n" " cvt.rn.f32.s32 %f71, %r3;\n" " ld.param.f32 %f193, [param_10+104];\n" " sub.f32 %f72, %f71, %f193;\n" " ld.param.f32 %f194, [param_10+96];\n" " div.rn.f32 %f73, %f72, %f194;\n" " cvt.rn.f32.s32 %f74, %r4;\n" " ld.param.f32 %f191, [param_10+116];\n" " sub.f32 %f75, %f74, %f191;\n" " ld.param.f32 %f192, [param_10+112];\n" " div.rn.f32 %f76, %f75, %f192;\n" " ld.param.f32 %f203, [param_10+60];\n" " mul.f32 %f77, %f203, %f76;\n" " ld.param.f32 %f206, [param_10+48];\n" " fma.rn.f32 %f78, %f206, %f73, %f77;\n" " ld.param.f32 %f200, [param_10+72];\n" " add.f32 %f79, %f78, %f200;\n" " ld.param.f32 %f202, [param_10+64];\n" " mul.f32 %f80, %f202, %f76;\n" " ld.param.f32 %f205, [param_10+52];\n" " fma.rn.f32 %f81, %f205, %f73, %f80;\n" " ld.param.f32 %f199, [param_10+76];\n" " add.f32 %f82, %f81, %f199;\n" " ld.param.f32 %f201, [param_10+68];\n" " mul.f32 %f83, %f201, %f76;\n" " ld.param.f32 %f204, [param_10+56];\n" " fma.rn.f32 %f84, %f204, %f73, %f83;\n" " ld.param.f32 %f198, [param_10+80];\n" " add.f32 %f85, %f84, %f198;\n" " mul.f32 %f86, %f82, %f82;\n" " fma.rn.f32 %f87, %f79, %f79, %f86;\n" " fma.rn.f32 %f88, %f85, %f85, %f87;\n" " sqrt.rn.f32 %f89, %f88;\n" " div.rn.f32 %f45, %f79, %f89;\n" " div.rn.f32 %f46, %f82, %f89;\n" " div.rn.f32 %f47, %f85, %f89;\n" " ld.global.f32 %f48, [%rl27];\n" " ld.global.f32 %f49, [%rl27+4];\n" " mul.f32 %f90, %f49, %f46;\n" " fma.rn.f32 %f91, %f48, %f45, %f90;\n" " ld.global.f32 %f50, [%rl27+8];\n" " fma.rn.f32 %f51, %f50, %f47, %f91;\n" " setp.gt.f32 %p9, %f51, 0fBDCCCCCD;\n" " @%p9 bra BB00_1;\n" "\n" " ld.param.f32 %f38, [param_10+84];\n" " fma.rn.f32 %f92, %f4, %f79, %f38;\n" " ld.param.f32 %f39, [param_10+88];\n" " fma.rn.f32 %f93, %f4, %f82, %f39;\n" " ld.param.f32 %f40, [param_10+92];\n" " fma.rn.f32 %f94, %f4, %f85, %f40;\n" " ld.param.f32 %f227, [param_11+4];\n" " mul.f32 %f95, %f227, %f93;\n" " ld.param.f32 %f229, [param_11];\n" " fma.rn.f32 %f96, %f229, %f92, %f95;\n" " ld.param.f32 %f225, [param_11+8];\n" " fma.rn.f32 %f97, %f225, %f94, %f96;\n" " ld.param.f32 %f223, [param_11+12];\n" " add.f32 %f52, %f97, %f223;\n" " ld.param.f32 %f220, [param_11+20];\n" " mul.f32 %f98, %f220, %f93;\n" " ld.param.f32 %f222, [param_11+16];\n" " fma.rn.f32 %f99, %f222, %f92, %f98;\n" " ld.param.f32 %f218, [param_11+24];\n" " fma.rn.f32 %f100, %f218, %f94, %f99;\n" " ld.param.f32 %f216, [param_11+28];\n" " add.f32 %f53, %f100, %f216;\n" " ld.param.f32 %f213, [param_11+36];\n" " mul.f32 %f101, %f213, %f93;\n" " ld.param.f32 %f215, [param_11+32];\n" " fma.rn.f32 %f102, %f215, %f92, %f101;\n" " ld.param.f32 %f211, [param_11+40];\n" " fma.rn.f32 %f103, %f211, %f94, %f102;\n" " ld.param.f32 %f209, [param_11+44];\n" " add.f32 %f54, %f103, %f209;\n" " setp.gt.f32 %p10, %f54, 0f00000000;\n" " @%p10 bra BB00_2;\n" "\n" " mov.f32 %f231, 0fBF800000;\n" " mov.f32 %f230, %f231;\n" " bra.uni BB00_3;\n" "\n" " BB00_2:\n" " div.rn.f32 %f230, %f52, %f54;\n" " div.rn.f32 %f231, %f53, %f54;\n" "\n" " BB00_3:\n" " ld.param.f32 %f228, [param_11];\n" " mul.f32 %f118, %f228, %f54;\n" " neg.f32 %f119, %f52;\n" " ld.param.f32 %f214, [param_11+32];\n" " fma.rn.f32 %f120, %f119, %f214, %f118;\n" " mul.f32 %f121, %f54, %f54;\n" " div.rn.f32 %f122, %f120, %f121;\n" " ld.param.f32 %f226, [param_11+4];\n" " mul.f32 %f123, %f226, %f54;\n" " ld.param.f32 %f212, [param_11+36];\n" " fma.rn.f32 %f124, %f119, %f212, %f123;\n" " div.rn.f32 %f125, %f124, %f121;\n" " ld.param.f32 %f224, [param_11+8];\n" " mul.f32 %f126, %f224, %f54;\n" " ld.param.f32 %f210, [param_11+40];\n" " fma.rn.f32 %f127, %f119, %f210, %f126;\n" " div.rn.f32 %f128, %f127, %f121;\n" " ld.param.f32 %f221, [param_11+16];\n" " mul.f32 %f129, %f221, %f54;\n" " neg.f32 %f130, %f53;\n" " fma.rn.f32 %f131, %f130, %f214, %f129;\n" " div.rn.f32 %f132, %f131, %f121;\n" " ld.param.f32 %f219, [param_11+20];\n" " mul.f32 %f133, %f219, %f54;\n" " fma.rn.f32 %f134, %f130, %f212, %f133;\n" " div.rn.f32 %f135, %f134, %f121;\n" " ld.param.f32 %f217, [param_11+24];\n" " mul.f32 %f136, %f217, %f54;\n" " fma.rn.f32 %f137, %f130, %f210, %f136;\n" " div.rn.f32 %f138, %f137, %f121;\n" " add.f32 %f106, %f230, 0f3F800000;\n" " tex.2d.v4.u32.f32 {%r29, %r30, %r31, %r32}, [texImageRef, {%f106, %f231}];\n" " mov.b32 %f140, %r29;\n" " tex.2d.v4.u32.f32 {%r34, %r35, %r36, %r37}, [texImageRef, {%f230, %f231}];\n" " mov.b32 %f141, %r34;\n" " sub.f32 %f142, %f140, %f141;\n" " add.f32 %f113, %f231, 0f3F800000;\n" " tex.2d.v4.u32.f32 {%r39, %r40, %r41, %r42}, [texImageRef, {%f230, %f113}];\n" " mov.b32 %f143, %r39;\n" " sub.f32 %f145, %f143, %f141;\n" " ld.param.u64 %rl63, [param_6];\n" " cvta.to.global.u64 %rl28, %rl63;\n" " mul.wide.s32 %rl29, %r15, 4;\n" " add.s64 %rl30, %rl28, %rl29;\n" " mul.f32 %f146, %f145, %f132;\n" " fma.rn.f32 %f147, %f142, %f122, %f146;\n" " ld.global.f32 %f148, [%rl30];\n" " mul.f32 %f149, %f148, %f147;\n" " mul.f32 %f150, %f145, %f135;\n" " fma.rn.f32 %f151, %f142, %f125, %f150;\n" " mul.f32 %f152, %f148, %f151;\n" " mul.f32 %f153, %f145, %f138;\n" " fma.rn.f32 %f154, %f142, %f128, %f153;\n" " mul.f32 %f155, %f148, %f154;\n" " mul.f32 %f156, %f152, %f46;\n" " fma.rn.f32 %f157, %f149, %f45, %f156;\n" " fma.rn.f32 %f158, %f155, %f47, %f157;\n" " div.rn.f32 %f159, %f158, %f51;\n" " mul.lo.s32 %r60, %r5, 3;\n" " ld.param.u64 %rl64, [param_8];\n" " cvta.to.global.u64 %rl31, %rl64;\n" " mul.wide.s32 %rl32, %r60, 4;\n" " add.s64 %rl33, %rl31, %rl32;\n" " mul.f32 %f162, %f44, %f41;\n" " mul.f32 %f163, %f162, %f159;\n" " mul.f32 %f164, %f163, %f48;\n" " atom.global.add.f32 %f165, [%rl33], %f164;\n" " mad.lo.s32 %r61, %r5, 3, 1;\n" " mul.wide.s32 %rl34, %r61, 4;\n" " add.s64 %rl35, %rl31, %rl34;\n" " mul.f32 %f166, %f163, %f49;\n" " atom.global.add.f32 %f167, [%rl35], %f166;\n" " mad.lo.s32 %r62, %r5, 3, 2;\n" " mul.wide.s32 %rl36, %r62, 4;\n" " add.s64 %rl37, %rl31, %rl36;\n" " mul.f32 %f168, %f163, %f50;\n" " atom.global.add.f32 %f169, [%rl37], %f168;\n" " mul.lo.s32 %r63, %r6, 3;\n" " mul.wide.s32 %rl38, %r63, 4;\n" " add.s64 %rl39, %rl31, %rl38;\n" " mul.f32 %f170, %f44, %f42;\n" " mul.f32 %f171, %f170, %f159;\n" " mul.f32 %f172, %f171, %f48;\n" " atom.global.add.f32 %f173, [%rl39], %f172;\n" " mad.lo.s32 %r64, %r6, 3, 1;\n" " mul.wide.s32 %rl40, %r64, 4;\n" " add.s64 %rl41, %rl31, %rl40;\n" " mul.f32 %f174, %f171, %f49;\n" " atom.global.add.f32 %f175, [%rl41], %f174;\n" " mad.lo.s32 %r65, %r6, 3, 2;\n" " mul.wide.s32 %rl42, %r65, 4;\n" " add.s64 %rl43, %rl31, %rl42;\n" " mul.f32 %f176, %f171, %f50;\n" " atom.global.add.f32 %f177, [%rl43], %f176;\n" " mul.lo.s32 %r66, %r7, 3;\n" " mul.wide.s32 %rl44, %r66, 4;\n" " add.s64 %rl45, %rl31, %rl44;\n" " mul.f32 %f178, %f44, %f43;\n" " mul.f32 %f179, %f178, %f159;\n" " mul.f32 %f180, %f179, %f48;\n" " atom.global.add.f32 %f181, [%rl45], %f180;\n" " mad.lo.s32 %r67, %r7, 3, 1;\n" " mul.wide.s32 %rl46, %r67, 4;\n" " add.s64 %rl47, %rl31, %rl46;\n" " mul.f32 %f182, %f179, %f49;\n" " atom.global.add.f32 %f183, [%rl47], %f182;\n" " mad.lo.s32 %r68, %r7, 3, 2;\n" " mul.wide.s32 %rl48, %r68, 4;\n" " add.s64 %rl49, %rl31, %rl48;\n" " mul.f32 %f184, %f179, %f50;\n" " atom.global.add.f32 %f185, [%rl49], %f184;\n" " ld.param.u64 %rl65, [param_9];\n" " cvta.to.global.u64 %rl50, %rl65;\n" " mul.wide.s32 %rl51, %r5, 4;\n" " add.s64 %rl52, %rl50, %rl51;\n" " atom.global.add.f32 %f186, [%rl52], 0f3F800000;\n" " mul.wide.s32 %rl53, %r6, 4;\n" " add.s64 %rl54, %rl50, %rl53;\n" " atom.global.add.f32 %f187, [%rl54], 0f3F800000;\n" " mul.wide.s32 %rl55, %r7, 4;\n" " add.s64 %rl56, %rl50, %rl55;\n" " atom.global.add.f32 %f188, [%rl56], 0f3F800000;\n" " BB00_1:\n" " ret;\n" "}\n" "\n" // kernel used to update the norm of the photo gradient for all vertices ".visible .entry UpdatePhotoGradNorm(\n" " .param .u64 .ptr param_1, // photoGradNorm [in/out]\n" " .param .u64 .ptr param_2, // photoGradPixels [in]\n" " .param .u32 param_3 // numVertices\n" ")\n" "{\n" " .reg .f32 %f<4>;\n" " .reg .pred %p<3>;\n" " .reg .s32 %r<9>;\n" " .reg .s64 %rl<9>;\n" "\n" " ld.param.u32 %r2, [param_3];\n" " mov.u32 %r3, %ntid.x;\n" " mov.u32 %r4, %ctaid.x;\n" " mov.u32 %r5, %tid.x;\n" " mad.lo.s32 %r1, %r3, %r4, %r5;\n" " setp.ge.s32 %p1, %r1, %r2;\n" " @%p1 bra BB00_1;\n" "\n" " ld.param.u64 %rl5, [param_2];\n" " cvta.to.global.u64 %rl2, %rl5;\n" " mul.wide.s32 %rl6, %r1, 4;\n" " add.s64 %rl7, %rl2, %rl6;\n" " ld.global.f32 %f1, [%rl7];\n" " setp.le.f32 %p2, %f1, 0f00000000;\n" " @%p2 bra BB00_1;\n" "\n" " ld.param.u64 %rl4, [param_1];\n" " cvta.to.global.u64 %rl1, %rl4;\n" " add.s64 %rl8, %rl1, %rl6;\n" " ld.global.f32 %f2, [%rl8];\n" " add.f32 %f3, %f2, 0f3F800000;\n" " st.global.f32 [%rl8], %f3;\n" " BB00_1:\n" " ret;\n" "}\n" "\n" // kernel used to compute the smoothness gradient for all vertices ".visible .entry ComputeSmoothnessGradient(\n" " .param .u64 .ptr param_1, // vertices\n" " .param .u64 .ptr param_2, // vert-vertices [in]\n" " .param .u64 .ptr param_3, // vert-sizes [in]\n" " .param .u64 .ptr param_4, // vert-pos [in]\n" " .param .u64 .ptr param_5, // smooth-grad [out]\n" " .param .u32 param_6, // numVertices\n" " .param .u8 param_7 // switch 0/1\n" ")\n" "{\n" " .reg .f32 %f<38>;\n" " .reg .pred %p<5>;\n" " .reg .s32 %r<69>;\n" " .reg .s64 %rl<42>;\n" " .reg .s16 %rc<4>;\n" "\n" " ld.param.u32 %r9, [param_6];\n" " ld.param.u64 %rl1, [param_1];\n" " ld.param.u64 %rl12, [param_2];\n" " ld.param.u64 %rl2, [param_3];\n" " ld.param.u64 %rl41, [param_4];\n" " ld.param.u64 %rl13, [param_5];\n" " cvta.to.global.u64 %rl4, %rl12;\n" " cvta.to.global.u64 %rl20, %rl41;\n" " cvta.to.global.u64 %rl5, %rl2;\n" " cvta.to.global.u64 %rl6, %rl13;\n" " cvta.to.global.u64 %rl7, %rl1;\n" " mov.u32 %r10, %ntid.x;\n" " mov.u32 %r11, %ctaid.x;\n" " mov.u32 %r12, %tid.x;\n" " mad.lo.s32 %r1, %r10, %r11, %r12;\n" " setp.ge.s32 %p1, %r1, %r9;\n" " @%p1 bra BB00_1;\n" "\n" " mul.lo.s32 %r13, %r1, 3;\n" " mul.wide.s32 %rl14, %r13, 4;\n" " add.s64 %rl8, %rl6, %rl14;\n" " mul.wide.s32 %rl19, %r1, 4;\n" " add.s64 %rl10, %rl5, %rl19;\n" " ld.global.u32 %r67, [%rl10];\n" " setp.gt.s32 %p2, %r67, 0;\n" " @%p2 bra BB00_5;\n" "\n" " mov.f32 %f37, 0f00000000;\n" " st.global.f32 [%rl8], %f37;\n" " st.global.f32 [%rl8+4], %f37;\n" " st.global.f32 [%rl8+8], %f37;\n" " BB00_1:\n" " ret;\n" "\n" " BB00_5:\n" " add.s64 %rl15, %rl7, %rl14;\n" " ld.global.f32 %f36, [%rl15];\n" " ld.global.f32 %f35, [%rl15+4];\n" " ld.global.f32 %f34, [%rl15+8];\n" " ld.param.u8 %rc3, [param_7];\n" " cvt.s16.s8 %rc2, %rc3;\n" " mov.b16 %rc1, 0;\n" " setp.eq.s16 %p3, %rc2, %rc1;\n" " mov.f32 %f37, 0f3F800000;\n" " add.s64 %rl11, %rl20, %rl19;\n" " ld.global.u32 %r27, [%rl11];\n" " cvt.rn.f32.s32 %f22, %r67;\n" " rcp.rn.f32 %f30, %f22;\n" " mov.u32 %r68, 0;\n" "\n" " BB00_2:\n" " add.s32 %r29, %r27, %r68;\n" " mul.wide.s32 %rl22, %r29, 4;\n" " add.s64 %rl23, %rl4, %rl22;\n" " ld.global.u32 %r30, [%rl23];\n" " mul.lo.s32 %r32, %r30, 3;\n" " mul.wide.s32 %rl25, %r32, 4;\n" " add.s64 %rl26, %rl7, %rl25;\n" " ld.global.f32 %f20, [%rl26];\n" " mul.f32 %f21, %f20, %f30;\n" " sub.f32 %f36, %f36, %f21;\n" " ld.global.f32 %f23, [%rl26+4];\n" " mul.f32 %f24, %f23, %f30;\n" " sub.f32 %f35, %f35, %f24;\n" " ld.global.f32 %f26, [%rl26+8];\n" " mul.f32 %f27, %f26, %f30;\n" " sub.f32 %f34, %f34, %f27;\n" " @%p3 bra BB00_3;\n" "\n" " mul.wide.s32 %rl39, %r30, 4;\n" " add.s64 %rl40, %rl5, %rl39;\n" " ld.global.u32 %r59, [%rl40];\n" " cvt.rn.f32.s32 %f28, %r59;\n" " rcp.rn.f32 %f29, %f28;\n" " fma.rn.f32 %f37, %f29, %f30, %f37;\n" "\n" " BB00_3:\n" " add.s32 %r68, %r68, 1;\n" " setp.lt.s32 %p4, %r68, %r67;\n" " @%p4 bra BB00_2;\n" "\n" " @%p3 bra BB00_4;\n" "\n" " div.rn.f32 %f36, %f36, %f37;\n" " div.rn.f32 %f35, %f35, %f37;\n" " div.rn.f32 %f34, %f34, %f37;\n" "\n" " BB00_4:\n" " st.global.f32 [%rl8], %f36;\n" " st.global.f32 [%rl8+4], %f35;\n" " st.global.f32 [%rl8+8], %f34;\n" " ret;\n" "}\n" "\n" // kernel used to combine the photo and smoothness gradient for all vertices ".visible .entry CombineGradients(\n" " .param .u64 .ptr param_1, // photo-gradient [in/out]\n" " .param .u64 .ptr param_2, // photo-norm [in]\n" " .param .u64 .ptr param_3, // smoothness-gradient [in]\n" " .param .u32 param_4, // numVertices\n" " .param .f32 param_5 // smoothness-weight\n" ")\n" "{\n" " .reg .f32 %f<17>;\n" " .reg .pred %p<3>;\n" " .reg .s32 %r<6>;\n" " .reg .s64 %rl<15>;\n" "\n" " ld.param.u32 %r2, [param_4];\n" " mov.u32 %r3, %ntid.x;\n" " mov.u32 %r4, %ctaid.x;\n" " mov.u32 %r5, %tid.x;\n" " mad.lo.s32 %r1, %r3, %r4, %r5;\n" " setp.lt.s32 %p1, %r1, %r2;\n" " @!%p1 bra BB00_1;\n" "\n" " ld.param.u64 %rl4, [param_3];\n" " cvta.to.global.u64 %rl2, %rl4;\n" " ld.param.u64 %rl11, [param_2];\n" " cvta.to.global.u64 %rl12, %rl11;\n" " ld.param.u64 %rl13, [param_1];\n" " cvta.to.global.u64 %rl14, %rl13;\n" " mul.wide.s32 %rl4, %r1, 4;\n" " mul.wide.s32 %rl5, %r1, 12;\n" " add.s64 %rl9, %rl12, %rl4;\n" " add.s64 %rl6, %rl2, %rl5;\n" " add.s64 %rl8, %rl14, %rl5;\n" " ld.param.f32 %f8, [param_5];\n" " ld.global.f32 %f2, [%rl6];\n" " mul.f32 %f3, %f2, %f8;\n" " ld.global.f32 %f4, [%rl6+4];\n" " mul.f32 %f5, %f4, %f8;\n" " ld.global.f32 %f6, [%rl6+8];\n" " mul.f32 %f7, %f6, %f8;\n" " ld.global.f32 %f9, [%rl9];\n" " setp.gt.f32 %p2, %f9, 0f00000000;\n" " @%p2 bra BB00_2;\n" "\n" " st.global.f32 [%rl8], %f3;\n" " st.global.f32 [%rl8+4], %f5;\n" " st.global.f32 [%rl8+8], %f7;\n" " ret;\n" "\n" " BB00_2:\n" " rcp.rn.f32 %f10, %f9;\n" " ld.global.f32 %f11, [%rl8];\n" " fma.rn.f32 %f12, %f11, %f10, %f3;\n" " ld.global.f32 %f13, [%rl8+4];\n" " fma.rn.f32 %f14, %f13, %f10, %f5;\n" " ld.global.f32 %f15, [%rl8+8];\n" " fma.rn.f32 %f16, %f15, %f10, %f7;\n" " st.global.f32 [%rl8], %f12;\n" " st.global.f32 [%rl8+4], %f14;\n" " st.global.f32 [%rl8+8], %f16;\n" " BB00_1:\n" " ret;\n" "}\n" // kernel used to combine the photo and both smoothness gradients for all vertices ".visible .entry CombineAllGradients(\n" " .param .u64 .ptr param_1, // photo-gradient [in/out]\n" " .param .u64 .ptr param_2, // photo-norm [in]\n" " .param .u64 .ptr param_3, // smoothness-gradient 1 [in]\n" " .param .u64 .ptr param_4, // smoothness-gradient 2 [in]\n" " .param .u32 param_5, // numVertices\n" " .param .f32 param_6, // rigidity-weight\n" " .param .f32 param_7 // elasticity-weight\n" ")\n" "{\n" " .reg .f32 %f<19>;\n" " .reg .pred %p<3>;\n" " .reg .s32 %r<17>;\n" " .reg .s64 %rl<15>;\n" "\n" " ld.param.u32 %r2, [param_5];\n" " mov.u32 %r3, %ntid.x;\n" " mov.u32 %r4, %ctaid.x;\n" " mov.u32 %r5, %tid.x;\n" " mad.lo.s32 %r1, %r3, %r4, %r5;\n" " setp.lt.s32 %p1, %r1, %r2;\n" " @!%p1 bra BB00_1;\n" "\n" " ld.global.f32 %f11, [%rl7];\n" "\n" " ld.param.u64 %rl3, [param_4];\n" " cvta.to.global.u64 %rl1, %rl3;\n" " ld.param.u64 %rl4, [param_3];\n" " cvta.to.global.u64 %rl2, %rl4;\n" " ld.param.u64 %rl11, [param_2];\n" " cvta.to.global.u64 %rl12, %rl11;\n" " ld.param.u64 %rl13, [param_1];\n" " cvta.to.global.u64 %rl14, %rl13;\n" " mul.wide.s32 %rl4, %r1, 4;\n" " mul.wide.s32 %rl5, %r1, 12;\n" " add.s64 %rl7, %rl1, %rl5;\n" " add.s64 %rl6, %rl2, %rl5;\n" " add.s64 %rl9, %rl12, %rl4;\n" " add.s64 %rl8, %rl14, %rl5;\n" " ld.param.f32 %f8, [param_6];\n" " ld.param.f32 %f18, [param_7];\n" " ld.global.f32 %f2, [%rl6];\n" " mul.f32 %f3, %f2, %f8;\n" " ld.global.f32 %f12, [%rl7];\n" " fma.rn.f32 %f11, %f12, %f18, %f3;\n" " ld.global.f32 %f4, [%rl6+4];\n" " mul.f32 %f5, %f4, %f8;\n" " ld.global.f32 %f14, [%rl7+4];\n" " fma.rn.f32 %f13, %f14, %f18, %f5;\n" " ld.global.f32 %f6, [%rl6+8];\n" " mul.f32 %f7, %f6, %f8;\n" " ld.global.f32 %f16, [%rl7+8];\n" " fma.rn.f32 %f15, %f16, %f18, %f7;\n" " ld.global.f32 %f9, [%rl9];\n" " setp.gt.f32 %p2, %f9, 0f00000000;\n" " @%p2 bra BB00_2;\n" "\n" " st.global.f32 [%rl8], %f11;\n" " st.global.f32 [%rl8+4], %f13;\n" " st.global.f32 [%rl8+8], %f15;\n" " ret;\n" "\n" " BB00_2:\n" " rcp.rn.f32 %f10, %f9;\n" " ld.global.f32 %f2, [%rl8];\n" " fma.rn.f32 %f3, %f2, %f10, %f11;\n" " ld.global.f32 %f4, [%rl8+4];\n" " fma.rn.f32 %f5, %f4, %f10, %f13;\n" " ld.global.f32 %f6, [%rl8+8];\n" " fma.rn.f32 %f7, %f6, %f10, %f15;\n" " st.global.f32 [%rl8], %f3;\n" " st.global.f32 [%rl8+4], %f5;\n" " st.global.f32 [%rl8+8], %f7;\n" " BB00_1:\n" " ret;\n" "}\n"; // S T R U C T S /////////////////////////////////////////////////// typedef Mesh::Vertex Vertex; typedef Mesh::VIndex VIndex; typedef Mesh::Face Face; typedef Mesh::FIndex FIndex; class MeshRefineCUDA { public: typedef Mesh::FaceIdxArr CameraFaces; typedef CLISTDEF2(CameraFaces) CameraFacesArr; // store necessary data about a view struct View { Image32F imageHost; // store temporarily the image pixels Image8U::Size size; CUDA::ArrayRT16F image; CUDA::MemDevice depthMap; CUDA::MemDevice faceMap; CUDA::MemDevice baryMap; inline View() {} inline View(View&) {} }; typedef CLISTDEF2(View) ViewsArr; struct CameraCUDA { Matrix3x4f P; Matrix3x3f R; Point3f C; Matrix3x3f K; Matrix3x3f invK; Image8U::Size size; inline CameraCUDA() {} inline CameraCUDA(const Camera& camera, const Image8U::Size& _size) : P(camera.P), R(camera.R), C(camera.C), K(camera.K), invK(camera.GetInvK()), size(_size) {} }; public: MeshRefineCUDA(Scene& _scene, unsigned _nAlternatePair=true, float _weightRegularity=1.5f, float _ratioRigidityElasticity=0.8f, unsigned _nResolutionLevel=0, unsigned _nMinResolution=640, unsigned nMaxViews=8); ~MeshRefineCUDA(); bool IsValid() const { return module != NULL && module->IsValid() && !pairs.IsEmpty(); } bool InitKernels(int device=-1); bool InitImages(float scale, float sigma=0); void ListVertexFacesPre(); void ListVertexFacesPost(); void ListCameraFaces(); void ListFaceAreas(Mesh::AreaArr& maxAreas); void SubdivideMesh(uint32_t maxArea, float fDecimate=1.f, unsigned nCloseHoles=15, unsigned nEnsureEdgeSize=1); void ComputeNormalFaces(); void ScoreMesh(float* gradients); void ProjectMesh( const CameraFaces& cameraFaces, const Camera& camera, const Image8U::Size& size, uint32_t idxImage); void ProcessPair(uint32_t idxImageA, uint32_t idxImageB); void ImageMeshWarp( const Camera& cameraA, const Camera& cameraB, const Image8U::Size& size, uint32_t idxImageA, uint32_t idxImageB); void ComputeLocalVariance(const CUDA::ArrayRT16F& image, const Image8U::Size& size, CUDA::MemDevice& imageMean, CUDA::MemDevice& imageVar); void ComputeLocalZNCC(const Image8U::Size& size); void ComputePhotometricGradient(const Camera& cameraA, const Camera& cameraB, const Image8U::Size& size, uint32_t idxImageA, uint32_t idxImageB, uint32_t numVertices, float RegularizationScale); void ComputeSmoothnessGradient(uint32_t numVertices); void CombineGradients(uint32_t numVertices); public: const float weightRegularity; // a scalar regularity weight to balance between photo-consistency and regularization terms float ratioRigidityElasticity; // a scalar ratio used to compute the regularity gradient as a combination of rigidity and elasticity const unsigned nResolutionLevel; // how many times to scale down the images before mesh optimization const unsigned nMinResolution; // how many times to scale down the images before mesh optimization unsigned nAlternatePair; // using an image pair alternatively as reference image (0 - both, 1 - alternate, 2 - only left, 3 - only right) unsigned iteration; // current refinement iteration Scene& scene; // the mesh vertices and faces // constant the entire time ImageArr& images; ViewsArr views; // views' data PairIdxArr pairs; // image pairs used to refine the mesh CUDA::ModuleRTPtr module; CUDA::KernelRT kernelProjectMesh; CUDA::KernelRT kernelCrossCheckProjection; CUDA::KernelRT kernelImageMeshWarp; CUDA::KernelRT kernelComputeImageMean; CUDA::KernelRT kernelComputeImageVar; CUDA::KernelRT kernelComputeImageCov; CUDA::KernelRT kernelComputeImageZNCC; CUDA::KernelRT kernelComputeImageDZNCC; CUDA::KernelRT kernelComputePhotometricGradient; CUDA::KernelRT kernelUpdatePhotoGradNorm; CUDA::KernelRT kernelComputeSmoothnessGradient; CUDA::KernelRT kernelCombineGradients; CUDA::KernelRT kernelCombineAllGradients; CUDA::MemDevice vertices; CUDA::MemDevice vertexVertices; CUDA::MemDevice faces; CUDA::MemDevice faceNormals; CUDA::TextureRT16F texImageRef; CUDA::SurfaceRT16F surfImageRef; CUDA::SurfaceRT16F surfImageProjRef; CUDA::MemDevice mask; CUDA::MemDevice imageMeanA; CUDA::MemDevice imageVarA; CUDA::ArrayRT16F imageAB; CUDA::MemDevice imageMeanAB; CUDA::MemDevice imageVarAB; CUDA::MemDevice imageCov; CUDA::MemDevice imageZNCC; CUDA::MemDevice imageDZNCC; CUDA::MemDevice photoGrad; CUDA::MemDevice photoGradNorm; CUDA::MemDevice photoGradPixels; CUDA::MemDevice vertexVerticesCont; CUDA::MemDevice vertexVerticesSizes; CUDA::MemDevice vertexVerticesPointers; CUDA::MemDevice smoothGrad1; CUDA::MemDevice smoothGrad2; enum { HalfSize = 2 }; // half window size used to compute ZNCC }; MeshRefineCUDA::MeshRefineCUDA(Scene& _scene, unsigned _nAlternatePair, float _weightRegularity, float _ratioRigidityElasticity, unsigned _nResolutionLevel, unsigned _nMinResolution, unsigned nMaxViews) : weightRegularity(_weightRegularity), ratioRigidityElasticity(_ratioRigidityElasticity), nResolutionLevel(_nResolutionLevel), nMinResolution(_nMinResolution), nAlternatePair(_nAlternatePair), scene(_scene), images(_scene.images) { if (!InitKernels(CUDA::desiredDeviceID)) return; // keep only best neighbor views for each image std::unordered_set mapPairs; mapPairs.reserve(images.GetSize()*nMaxViews); FOREACH(idxImage, images) { // keep only best neighbor views const float fMinArea(0.1f); const float fMinScale(0.2f), fMaxScale(3.2f); const float fMinAngle(FD2R(2.5f)), fMaxAngle(FD2R(45.f)); const Image& imageData = images[idxImage]; if (!imageData.IsValid()) continue; ViewScoreArr neighbors(imageData.neighbors); Scene::FilterNeighborViews(neighbors, fMinArea, fMinScale, fMaxScale, fMinAngle, fMaxAngle, nMaxViews); for (const ViewScore& neighbor: neighbors) { ASSERT(images[neighbor.ID].IsValid()); mapPairs.insert(MakePairIdx((uint32_t)idxImage, neighbor.ID)); } } pairs.Reserve(mapPairs.size()); for (uint64_t pair: mapPairs) pairs.AddConstruct(pair); } MeshRefineCUDA::~MeshRefineCUDA() { scene.mesh.ReleaseExtra(); } bool MeshRefineCUDA::InitKernels(int device) { STATIC_ASSERT(sizeof(CameraCUDA) == 176); // initialize CUDA device if needed if (CUDA::devices.IsEmpty() && CUDA::initDevice(device) != CUDA_SUCCESS) return false; // initialize CUDA kernels if (module != NULL && module->IsValid()) return true; module = new CUDA::ModuleRT(g_szMeshRefineModule); if (!module->IsValid()) { module.Release(); return false; } if (kernelProjectMesh.Reset(module, "ProjectMesh") != CUDA_SUCCESS) return false; ASSERT(kernelProjectMesh.IsValid()); if (kernelCrossCheckProjection.Reset(module, "CrossCheckProjection") != CUDA_SUCCESS) return false; ASSERT(kernelCrossCheckProjection.IsValid()); if (kernelImageMeshWarp.Reset(module, "ImageMeshWarp") != CUDA_SUCCESS) return false; ASSERT(kernelImageMeshWarp.IsValid()); if (kernelComputeImageMean.Reset(module, "ComputeImageMean") != CUDA_SUCCESS) return false; ASSERT(kernelComputeImageMean.IsValid()); if (kernelComputeImageVar.Reset(module, "ComputeImageVar") != CUDA_SUCCESS) return false; ASSERT(kernelComputeImageVar.IsValid()); if (kernelComputeImageCov.Reset(module, "ComputeImageCov") != CUDA_SUCCESS) return false; ASSERT(kernelComputeImageCov.IsValid()); if (kernelComputeImageZNCC.Reset(module, "ComputeImageZNCC") != CUDA_SUCCESS) return false; ASSERT(kernelComputeImageZNCC.IsValid()); if (kernelComputeImageDZNCC.Reset(module, "ComputeImageDZNCC") != CUDA_SUCCESS) return false; ASSERT(kernelComputeImageDZNCC.IsValid()); if (kernelComputePhotometricGradient.Reset(module, "ComputePhotometricGradient") != CUDA_SUCCESS) return false; ASSERT(kernelComputePhotometricGradient.IsValid()); if (kernelUpdatePhotoGradNorm.Reset(module, "UpdatePhotoGradNorm") != CUDA_SUCCESS) return false; ASSERT(kernelUpdatePhotoGradNorm.IsValid()); if (kernelComputeSmoothnessGradient.Reset(module, "ComputeSmoothnessGradient") != CUDA_SUCCESS) return false; ASSERT(kernelComputeSmoothnessGradient.IsValid()); if (kernelCombineGradients.Reset(module, "CombineGradients") != CUDA_SUCCESS) return false; ASSERT(kernelCombineGradients.IsValid()); if (kernelCombineAllGradients.Reset(module, "CombineAllGradients") != CUDA_SUCCESS) return false; ASSERT(kernelCombineAllGradients.IsValid()); // init textures if (texImageRef.Reset(module, "texImageRef", CU_TR_FILTER_MODE_LINEAR) != CUDA_SUCCESS) return false; if (surfImageRef.Reset(module, "surfImageRef") != CUDA_SUCCESS) return false; if (surfImageProjRef.Reset(module, "surfImageProjRef") != CUDA_SUCCESS) return false; return true; } // load and initialize all images at the given scale // and compute the gradient for each input image // optional: blur them using the given sigma bool MeshRefineCUDA::InitImages(float scale, float sigma) { views.Resize(images.GetSize()); #ifdef MESHCUDAOPT_USE_OPENMP bool bAbort(false); #pragma omp parallel for for (int_t ID=0; ID<(int_t)images.GetSize(); ++ID) { #pragma omp flush (bAbort) if (bAbort) continue; const uint32_t idxImage((uint32_t)ID); #else FOREACH(idxImage, images) { #endif Image& imageData = images[idxImage]; if (!imageData.IsValid()) continue; // load and init image unsigned level(nResolutionLevel); const unsigned imageSize(imageData.RecomputeMaxResolution(level, nMinResolution)); if ((imageData.image.empty() || MAXF(imageData.width,imageData.height) != imageSize) && !imageData.ReloadImage(imageSize)) { #ifdef MESHCUDAOPT_USE_OPENMP bAbort = true; #pragma omp flush (bAbort) continue; #else return false; #endif } View& view = views[idxImage]; Image32F& img = view.imageHost; imageData.image.toGray(img, cv::COLOR_BGR2GRAY, true); imageData.image.release(); if (sigma > 0) cv::GaussianBlur(img, img, cv::Size(), sigma); if (scale < 1.0) { cv::resize(img, img, cv::Size(), scale, scale, cv::INTER_AREA); imageData.width = img.width(); imageData.height = img.height(); } imageData.UpdateCamera(scene.platforms); } #ifdef MESHCUDAOPT_USE_OPENMP if (bAbort) return false; #endif // init GPU memory Image8U::Size maxSize(0,0); FOREACH(idxImage, views) { View& view = views[idxImage]; if (view.imageHost.empty()) continue; Image8U::Size& size(view.size); size = view.imageHost.size(); reportCudaError(view.image.Reset(size, CUDA_ARRAY3D_SURFACE_LDST)); reportCudaError(view.image.SetData(cvtImage(view.imageHost))); view.imageHost.release(); const size_t area((size_t)size.area()); reportCudaError(view.depthMap.Reset(sizeof(float)*area)); reportCudaError(view.faceMap.Reset(sizeof(FIndex)*area)); reportCudaError(view.baryMap.Reset(sizeof(hfloat)*3*area)); if (maxSize.width < size.width) maxSize.width = size.width; if (maxSize.height < size.height) maxSize.height = size.height; } const size_t area(maxSize.area()); reportCudaError(mask.Reset(sizeof(uint8_t)*area)); reportCudaError(imageMeanA.Reset(sizeof(float)*area)); reportCudaError(imageVarA.Reset(sizeof(float)*area)); reportCudaError(imageAB.Reset(maxSize, CUDA_ARRAY3D_SURFACE_LDST)); reportCudaError(imageMeanAB.Reset(sizeof(float)*area)); reportCudaError(imageVarAB.Reset(sizeof(float)*area)); reportCudaError(imageCov.Reset(sizeof(float)*area)); reportCudaError(imageZNCC.Reset(sizeof(float)*area)); reportCudaError(imageDZNCC.Reset(sizeof(float)*area)); surfImageProjRef.Bind(imageAB); iteration = 0; return true; } // extract array of triangles incident to each vertex // and check each vertex if it is at the boundary or not void MeshRefineCUDA::ListVertexFacesPre() { scene.mesh.EmptyExtra(); scene.mesh.ListIncidenteFaces(); reportCudaError(faces.Reset(scene.mesh.faces)); } void MeshRefineCUDA::ListVertexFacesPost() { scene.mesh.ListIncidenteVertices(); scene.mesh.ListBoundaryVertices(); ASSERT(!scene.mesh.vertices.IsEmpty() && scene.mesh.vertices.GetSize() == scene.mesh.vertexVertices.GetSize()); // set vertex vertices reportCudaError(vertexVertices.Reset(scene.mesh.vertexVertices)); // list adjacent vertices for each vertex const size_t numVertices(scene.mesh.vertices.GetSize()); Unsigned32Arr _vertexVerticesCont(0, numVertices*6); Unsigned32Arr _vertexVerticesSizes(0, numVertices); Unsigned32Arr _vertexVerticesPointers(0, numVertices); uint32_t lastPosition(0); FOREACH(idxV, scene.mesh.vertices) { if (scene.mesh.vertexBoundary[idxV]) { _vertexVerticesSizes.Insert(0); _vertexVerticesPointers.Insert(lastPosition); continue; } const Mesh::VertexIdxArr& verts = scene.mesh.vertexVertices[idxV]; _vertexVerticesCont.Join(verts.GetData(), verts.GetSize()); _vertexVerticesSizes.Insert(verts.GetSize()); _vertexVerticesPointers.Insert(lastPosition); lastPosition += verts.GetSize(); } reportCudaError(vertexVerticesCont.Reset(_vertexVerticesCont)); reportCudaError(vertexVerticesSizes.Reset(_vertexVerticesSizes)); reportCudaError(vertexVerticesPointers.Reset(_vertexVerticesPointers)); // init memory reportCudaError(photoGrad.Reset(sizeof(Point3f)*numVertices)); reportCudaError(photoGradNorm.Reset(sizeof(float)*numVertices)); reportCudaError(photoGradPixels.Reset(sizeof(float)*numVertices)); reportCudaError(smoothGrad1.Reset(sizeof(Point3f)*numVertices)); reportCudaError(smoothGrad2.Reset(sizeof(Point3f)*numVertices)); } // extract array of faces viewed by each image void MeshRefineCUDA::ListCameraFaces() { // extract array of faces viewed by each camera CameraFacesArr arrCameraFaces(images.GetSize()); { Mesh::Octree octree; Mesh::FacesInserter::CreateOctree(octree, scene.mesh); FOREACH(ID, images) { const Image& imageData = images[ID]; if (!imageData.IsValid()) continue; const TFrustum frustum(Matrix3x4f(imageData.camera.P), (float)imageData.width, (float)imageData.height); Mesh::FacesInserter inserter(arrCameraFaces[ID]); octree.Traverse(frustum, inserter); } } // project mesh to each camera plane reportCudaError(vertices.Reset(scene.mesh.vertices)); FOREACH(idxImage, images) { const Image& imageData = images[idxImage]; if (imageData.IsValid()) ProjectMesh(arrCameraFaces[idxImage], imageData.camera, views[idxImage].size, idxImage); } } // compute for each face the projection area as the maximum area in both images of a pair // (make sure ListCameraFaces() was called before) void MeshRefineCUDA::ListFaceAreas(Mesh::AreaArr& maxAreas) { ASSERT(maxAreas.IsEmpty()); // for each image, compute the projection area of visible faces typedef cList ImageAreaArr; ImageAreaArr viewAreas(images.GetSize()); FOREACH(idxImage, images) { const Image& imageData = images[idxImage]; if (!imageData.IsValid()) continue; Mesh::AreaArr& areas = viewAreas[idxImage]; areas.Resize(scene.mesh.faces.GetSize()); areas.Memset(0); // get faceMap from the GPU memory TImage faceMap(imageData.height, imageData.width); views[idxImage].faceMap.GetData(faceMap); // compute area covered by all vertices (incident faces) viewed by this image for (int j=0; ji]; const Mesh::AreaArr& areasB = viewAreas[pPair->j]; ASSERT(areasA.GetSize() == areasB.GetSize()); FOREACH(f, areasA) { const uint16_t minArea(MINF(areasA[f], areasB[f])); uint16_t& maxArea = maxAreas[f]; if (maxArea < minArea) maxArea = minArea; } } } // decimate or subdivide mesh such that for each face there is no image pair in which // its projection area is bigger than the given number of pixels in both images void MeshRefineCUDA::SubdivideMesh(uint32_t maxArea, float fDecimate, unsigned nCloseHoles, unsigned nEnsureEdgeSize) { Mesh::AreaArr maxAreas; // first decimate if necessary const bool bNoDecimation(fDecimate >= 1.f); const bool bNoSimplification(maxArea == 0); if (!bNoDecimation) { if (fDecimate > 0.f) { // decimate to the desired resolution scene.mesh.Clean(fDecimate, 0.f, false, nCloseHoles, 0u, 0.f, false); scene.mesh.Clean(1.f, 0.f, false, nCloseHoles, 0u, 0.f, true); #ifdef MESHOPT_ENSUREEDGESIZE // make sure there are no edges too small or too long if (nEnsureEdgeSize > 0 && bNoSimplification) { scene.mesh.EnsureEdgeSize(); scene.mesh.Clean(1.f, 0.f, false, nCloseHoles, 0u, 0.f, true); } #endif // re-map vertex and camera faces ListVertexFacesPre(); } else { // extract array of faces viewed by each camera ListCameraFaces(); // estimate the faces' area that have big projection areas in both images of a pair ListFaceAreas(maxAreas); ASSERT(!maxAreas.IsEmpty()); const float maxAreaf((float)(maxArea > 0 ? maxArea : 64)); const float medianArea(6.f*(float)Mesh::AreaArr(maxAreas).GetMedian()); if (medianArea < maxAreaf) { maxAreas.Empty(); // decimate to the auto detected resolution scene.mesh.Clean(MAXF(0.1f, medianArea/maxAreaf), 0.f, false, nCloseHoles, 0u, 0.f, false); scene.mesh.Clean(1.f, 0.f, false, nCloseHoles, 0u, 0.f, true); #ifdef MESHOPT_ENSUREEDGESIZE // make sure there are no edges too small or too long if (nEnsureEdgeSize > 0 && bNoSimplification) { scene.mesh.EnsureEdgeSize(); scene.mesh.Clean(1.f, 0.f, false, nCloseHoles, 0u, 0.f, true); } #endif // re-map vertex and camera faces ListVertexFacesPre(); } } } if (bNoSimplification) return; if (maxAreas.IsEmpty()) { // extract array of faces viewed by each camera ListCameraFaces(); // estimate the faces' area that have big projection areas in both images of a pair ListFaceAreas(maxAreas); } // subdivide mesh faces if its projection area is bigger than the given number of pixels const size_t numVertsOld(scene.mesh.vertices.GetSize()); const size_t numFacesOld(scene.mesh.faces.GetSize()); scene.mesh.Subdivide(maxAreas, maxArea); #ifdef MESHOPT_ENSUREEDGESIZE // make sure there are no edges too small or too long #if MESHOPT_ENSUREEDGESIZE==1 if ((nEnsureEdgeSize == 1 && !bNoDecimation) || nEnsureEdgeSize > 1) #endif { scene.mesh.EnsureEdgeSize(); scene.mesh.Clean(1.f, 0.f, false, nCloseHoles, 0u, 0.f, true); } #endif // re-map vertex and camera faces ListVertexFacesPre(); DEBUG_EXTRA("Mesh subdivided: %u/%u -> %u/%u vertices/faces", numVertsOld, numFacesOld, scene.mesh.vertices.GetSize(), scene.mesh.faces.GetSize()); #if TD_VERBOSE != TD_VERBOSE_OFF if (VERBOSITY_LEVEL > 3) scene.mesh.Save(MAKE_PATH("MeshSubdivided.ply")); #endif } // compute face normals void MeshRefineCUDA::ComputeNormalFaces() { const FIndex numFaces(scene.mesh.faces.GetSize()); reportCudaError(faceNormals.Reset(sizeof(Point3f)*numFaces)); reportCudaError(Mesh::kernelComputeFaceNormal((int)numFaces, vertices, faces, faceNormals, numFaces )); } // score mesh using photo-consistency // and compute vertices gradient using analytical method void MeshRefineCUDA::ScoreMesh(float* gradients) { // extract array of faces viewed by each camera ListCameraFaces(); // compute face normals ComputeNormalFaces(); // init memory const VIndex numVertices(scene.mesh.vertices.GetSize()); reportCudaError(cuMemsetD32(photoGrad, 0, numVertices*3)); reportCudaError(cuMemsetD32(photoGradNorm, 0, numVertices)); // for each pair of images, compute a photo-consistency score // between the reference image and the pixels of the second image // projected in the reference image through the mesh surface FOREACHPTR(pPair, pairs) { ASSERT(pPair->i < pPair->j); switch (nAlternatePair) { case 1: { const PairIdx pair(iteration%2 ? PairIdx(pPair->j,pPair->i) : PairIdx(pPair->i,pPair->j)); ProcessPair(pair.i, pair.j); break; } case 2: { ProcessPair(pPair->i, pPair->j); break; } case 3: { ProcessPair(pPair->j, pPair->i); break; } default: for (int ip=0; ip<2; ++ip) { const PairIdx pair(ip ? PairIdx(pPair->j,pPair->i) : PairIdx(pPair->i,pPair->j)); ProcessPair(pair.i, pair.j); } } } // loop through all vertices and compute the smoothing score ComputeSmoothnessGradient(numVertices); // set the final gradient as the combination of photometric and smoothness gradients CombineGradients(numVertices); reportCudaError(photoGrad.GetData(gradients, sizeof(Point3f)*numVertices)); } // project mesh to the given camera plane void MeshRefineCUDA::ProjectMesh( const CameraFaces& cameraFaces, const Camera& camera, const Image8U::Size& size, uint32_t idxImage) { View& view = views[idxImage]; // init depth-map const float fltMax(FLT_MAX); reportCudaError(cuMemsetD32(view.depthMap, (uint32_t&)fltMax, size.area())); // fetch only the faces viewed by this camera Mesh::FaceIdxArr faceIDsView(0, (FIndex)cameraFaces.size()); for (auto idxFace : cameraFaces) faceIDsView.Insert(idxFace); // project mesh reportCudaError(kernelProjectMesh((int)faceIDsView.GetSize(), vertices, faces, faceIDsView, view.depthMap, view.faceMap, view.baryMap, CameraCUDA(camera, size), faceIDsView.GetSize() )); kernelProjectMesh.Reset(); // cross-check valid depth and face index reportCudaError(kernelCrossCheckProjection(size, view.depthMap, view.faceMap, size.width, size.height )); #if 0 // debug view DepthMap depthMap(size); TImage faceMap(size); TImage baryMap(size); view.depthMap.GetData(depthMap); view.faceMap.GetData(faceMap); view.baryMap.GetData(baryMap); TImage _baryMap(cvtImage(baryMap)); #endif } void MeshRefineCUDA::ProcessPair(uint32_t idxImageA, uint32_t idxImageB) { // fetch view A data const Image& imageDataA = images[idxImageA]; ASSERT(imageDataA.IsValid()); const Camera& cameraA = imageDataA.camera; const Image8U::Size& sizeA(views[idxImageA].size); // fetch view B data const Image& imageDataB = images[idxImageB]; ASSERT(imageDataB.IsValid()); const Camera& cameraB = imageDataB.camera; // warp imageB to imageA using the mesh ImageMeshWarp(cameraA, cameraB, sizeA, idxImageA, idxImageB); // init vertex textures ComputeLocalVariance(imageAB, sizeA, imageMeanAB, imageVarAB); ComputeLocalVariance(views[idxImageA].image, sizeA, imageMeanA, imageVarA); ComputeLocalZNCC(sizeA); const float RegularizationScale((float)((REAL)(imageDataA.avgDepth*imageDataB.avgDepth)/(cameraA.GetFocalLength()*cameraB.GetFocalLength()))); ComputePhotometricGradient(cameraA, cameraB, sizeA, idxImageA, idxImageB, scene.mesh.vertices.GetSize(), RegularizationScale); } // project image from view B to view A through the mesh; // the projected image is stored in imageA void MeshRefineCUDA::ImageMeshWarp( const Camera& cameraA, const Camera& cameraB, const Image8U::Size& size, uint32_t idxImageA, uint32_t idxImageB) { // set image texture surfImageRef.Bind(views[idxImageA].image); texImageRef.Bind(views[idxImageB].image); // project image reportCudaError(kernelImageMeshWarp(size, views[idxImageA].depthMap, views[idxImageB].depthMap, mask, CameraCUDA(cameraA, size), CameraCUDA(cameraB, size) )); #if 0 // debug view Image16F _imageAB(size); Image8U _mask(size); imageAB.GetData(_imageAB); mask.GetData(_mask); Image32F __imageAB(cvtImage(_imageAB)); #endif } // compute local variance for each image pixel void MeshRefineCUDA::ComputeLocalVariance(const CUDA::ArrayRT16F& image, const Image8U::Size& size, CUDA::MemDevice& imageMean, CUDA::MemDevice& imageVar) { surfImageRef.Bind(image); reportCudaError(kernelComputeImageMean(size, mask, imageMean, size.width, size.height, HalfSize )); reportCudaError(kernelComputeImageVar(size, imageMean, mask, imageVar, size.width, size.height, HalfSize )); #if 0 // debug view Image32F mean(size); Image32F var(size); imageMean.GetData(mean); imageVar.GetData(var); #endif } // compute local ZNCC and its gradient for each image pixel void MeshRefineCUDA::ComputeLocalZNCC(const Image8U::Size& size) { reportCudaError(kernelComputeImageCov(size, imageMeanA, imageMeanAB, mask, imageCov, size.width, size.height, HalfSize )); reportCudaError(kernelComputeImageZNCC(size, imageCov, imageVarA, imageVarAB, mask, imageZNCC, size.width, size.height, HalfSize )); reportCudaError(kernelComputeImageDZNCC(size, imageMeanA, imageMeanAB, imageVarA, imageVarAB, imageZNCC, mask, imageDZNCC, size.width, size.height, HalfSize )); #if 0 // debug view Image32F _imageZNCC(size); Image32F _imageDZNCC(size); imageZNCC.GetData(_imageZNCC); imageDZNCC.GetData(_imageDZNCC); #endif } // compute the photometric gradient for all vertices seen by an image pair void MeshRefineCUDA::ComputePhotometricGradient(const Camera& cameraA, const Camera& cameraB, const Image8U::Size& size, uint32_t idxImageA, uint32_t idxImageB, uint32_t numVertices, float RegularizationScale) { // compute photometric gradient for all visible vertices reportCudaError(cuMemsetD32(photoGradPixels, 0, numVertices)); reportCudaError(kernelComputePhotometricGradient(size, faces, faceNormals, views[idxImageA].depthMap, views[idxImageA].faceMap, views[idxImageA].baryMap, imageDZNCC, mask, photoGrad, photoGradPixels, CameraCUDA(cameraA, size), CameraCUDA(cameraB, size), RegularizationScale )); // update photometric gradient norm for all visible vertices reportCudaError(kernelUpdatePhotoGradNorm(numVertices, photoGradNorm, photoGradPixels, numVertices )); #if 0 // debug view Point3fArr _photoGrad(numVertices); FloatArr _photoGradPixels(numVertices); FloatArr _photoGradNorm(numVertices); photoGrad.GetData(_photoGrad); photoGradPixels.GetData(_photoGradPixels); photoGradNorm.GetData(_photoGradNorm); #endif } void MeshRefineCUDA::ComputeSmoothnessGradient(uint32_t numVertices) { // compute smoothness gradient for all vertices reportCudaError(kernelComputeSmoothnessGradient((int)numVertices, vertices, vertexVerticesCont, vertexVerticesSizes, vertexVerticesPointers, smoothGrad1, numVertices, uint8_t(0) )); reportCudaError(kernelComputeSmoothnessGradient((int)numVertices, smoothGrad1, vertexVerticesCont, vertexVerticesSizes, vertexVerticesPointers, smoothGrad2, numVertices, uint8_t(1) )); #if 0 // debug view Point3fArr _smoothGrad1(numVertices); Point3fArr _smoothGrad2(numVertices); smoothGrad1.GetData(_smoothGrad1); smoothGrad2.GetData(_smoothGrad2); #endif } void MeshRefineCUDA::CombineGradients(uint32_t numVertices) { // compute smoothness gradient for all vertices if (ratioRigidityElasticity >= 1.f) { reportCudaError(kernelCombineGradients((int)numVertices, photoGrad, photoGradNorm, smoothGrad2, numVertices, weightRegularity )); } else { // compute smoothing gradient as a combination of level 1 and 2 of the Laplacian operator; // (see page 105 of "Stereo and Silhouette Fusion for 3D Object Modeling from Uncalibrated Images Under Circular Motion" C. Hernandez, 2004) const float rigidity((1.f-ratioRigidityElasticity)*weightRegularity); const float elasticity(ratioRigidityElasticity*weightRegularity); reportCudaError(kernelCombineAllGradients((int)numVertices, photoGrad, photoGradNorm, smoothGrad1, smoothGrad2, numVertices, rigidity, elasticity )); } #if 0 // debug view Point3fArr _photoGrad(numVertices); photoGrad.GetData(_photoGrad); #endif } /*----------------------------------------------------------------*/ // S T R U C T S /////////////////////////////////////////////////// // optimize mesh using photo-consistency bool Scene::RefineMeshCUDA(unsigned nResolutionLevel, unsigned nMinResolution, unsigned nMaxViews, float fDecimateMesh, unsigned nCloseHoles, unsigned nEnsureEdgeSize, unsigned nMaxFaceArea, unsigned nScales, float fScaleStep, unsigned nAlternatePair, float fRegularityWeight, float fRatioRigidityElasticity, float fGradientStep) { if (pointcloud.IsEmpty() && !ImagesHaveNeighbors()) SampleMeshWithVisibility(); MeshRefineCUDA refine(*this, nAlternatePair, fRegularityWeight, fRatioRigidityElasticity, nResolutionLevel, nMinResolution, nMaxViews); if (!refine.IsValid()) return false; // run the mesh optimization on multiple scales (coarse to fine) for (unsigned nScale=0; nScale 2) mesh.Save(MAKE_PATH(String::FormatString("MeshRefine%u.ply", nScales-nScale-1))); #endif // loop a constant number of iterations and apply the gradient int iters(25); float gstep(0.05f); if (fGradientStep > 1) { iters = FLOOR2INT(fGradientStep); gstep = (fGradientStep-(float)iters)*10; } iters = MAXF(iters/(int)(nScale+1),8); const int iterStop(iters*7/10); Eigen::Matrix gradients(mesh.vertices.GetSize(),3); Util::Progress progress(_T("Processed iterations"), iters); GET_LOGCONSOLE().Pause(); for (int iter=0; iter 2) mesh.Save(MAKE_PATH(String::FormatString("MeshRefined%u.ply", nScales-nScale-1))); #endif } return true; } // RefineMeshCUDA /*----------------------------------------------------------------*/ #endif // _USE_CUDA