From 841719e031ffc2ac8e278152583d0c1ffb91daaf Mon Sep 17 00:00:00 2001 From: Lars Ivar Hatledal Date: Wed, 6 May 2026 15:02:31 +0200 Subject: [PATCH] Removed the experimental two-level TLAS/BLAS acceleration structure from the WGPU path tracer --- examples/projects/Vehicle/main.cpp | 1 - examples/wgpu/wgpu_cornell_box.cpp | 5 +- examples/wgpu/wgpu_denoise.cpp | 1 - examples/wgpu/wgpu_gltf_samples.cpp | 4 - .../threepp/renderers/wgpu/WgpuPathTracer.hpp | 6 - .../wgpu/pathtracer/WgpuPathTracer.cpp | 395 +----------------- .../wgpu/pathtracer/WgpuPathTracerBvh.cpp | 211 +--------- .../wgpu/pathtracer/WgpuPathTracerBvh.hpp | 56 +-- .../pathtracer/WgpuPathTracerGeometry.hpp | 4 +- .../wgpu/pathtracer/WgpuPathTracerShaders.hpp | 2 - .../pathtracer/WgpuPathTracerShaders_Rt.cpp | 261 ------------ .../WgpuPathTracerShaders_VtRefit.cpp | 143 ------- .../wgpu/pathtracer/WgpuPathTracerTypes.hpp | 31 -- 13 files changed, 23 insertions(+), 1097 deletions(-) diff --git a/examples/projects/Vehicle/main.cpp b/examples/projects/Vehicle/main.cpp index de12fa13..e5d9a9a1 100644 --- a/examples/projects/Vehicle/main.cpp +++ b/examples/projects/Vehicle/main.cpp @@ -30,7 +30,6 @@ int main() { if (auto wgpu = dynamic_cast(renderer.get())) { wgpu->usePathTracer = true; auto& pt = wgpu->pathTracer(); - pt.setTlasEnabled(false); pt.setMaxBounces(1); pt.setDenoiserEnabled(false); } diff --git a/examples/wgpu/wgpu_cornell_box.cpp b/examples/wgpu/wgpu_cornell_box.cpp index 393ce766..e09fe10d 100644 --- a/examples/wgpu/wgpu_cornell_box.cpp +++ b/examples/wgpu/wgpu_cornell_box.cpp @@ -176,7 +176,6 @@ int main() { bool restdirOn = pathTracer.restirEnabled(); bool restirGIOn = pathTracer.restirGiEnabled(); bool foveatOn = pathTracer.foveatedRendering(); - bool tlasOn = pathTracer.tlasEnabled(); bool wobbleOn = false; int maxBounces = pathTracer.maxBounces(); float exposure = pathTracer.exposure(); @@ -221,8 +220,6 @@ int main() { pathTracer.setReSTIRGIEnabled(restirGIOn); if (ImGui::Checkbox("Foveated", &foveatOn)) pathTracer.setFoveatedRendering(foveatOn); - if (ImGui::Checkbox("TLAS", &tlasOn)) - pathTracer.setTlasEnabled(tlasOn); ImGui::Checkbox("Wobble back wall", &wobbleOn); if (ImGui::SliderInt("Max bounces", &maxBounces, 1, 8)) pathTracer.setMaxBounces(maxBounces); @@ -264,7 +261,7 @@ int main() { } // Animate back wall vertices — exercises the path tracer's per-frame - // geometry fast path (CPU repack + partial upload + BVH/BLAS refit). + // geometry fast path (CPU repack + partial upload + BVH refit). if (wobbleOn) { wobbleT += dt; const float wave = std::sin(math::TWO_PI * 0.5f * wobbleT); diff --git a/examples/wgpu/wgpu_denoise.cpp b/examples/wgpu/wgpu_denoise.cpp index 26612740..d70dcd7b 100644 --- a/examples/wgpu/wgpu_denoise.cpp +++ b/examples/wgpu/wgpu_denoise.cpp @@ -109,7 +109,6 @@ int main() { pathTracer.setMaxBounces(4); pathTracer.setReSTIREnabled(true); pathTracer.setReSTIRGIEnabled(true); - pathTracer.setTlasEnabled(true); // pathTracer.setFireflyClamp(0.001); // ---- Scene ---- diff --git a/examples/wgpu/wgpu_gltf_samples.cpp b/examples/wgpu/wgpu_gltf_samples.cpp index 7d8cacca..12292818 100644 --- a/examples/wgpu/wgpu_gltf_samples.cpp +++ b/examples/wgpu/wgpu_gltf_samples.cpp @@ -93,7 +93,6 @@ int main(int argc, char** argv) { pathTracer.setReSTIREnabled(false); pathTracer.setMaxBounces(4); pathTracer.setFoveatedRendering(false); - pathTracer.setTlasEnabled(false); pathTracer.setTextureResolution(1024); RGBELoader imgLoader; @@ -203,7 +202,6 @@ int main(int argc, char** argv) { int fpsFrames = 0; int aovMode = pathTracer.aovMode(); bool foveatOn = pathTracer.foveatedRendering(); - bool tlasOn = pathTracer.tlasEnabled(); bool dofEnabled = false; float lensFStop = 2.8f; @@ -279,8 +277,6 @@ int main(int argc, char** argv) { pathTracer.setReSTIRGIEnabled(restdirGIOn); if (ImGui::Checkbox("Foveated Rendering", &foveatOn)) pathTracer.setFoveatedRendering(foveatOn); - if (ImGui::Checkbox("TLAS/BLAS", &tlasOn)) - pathTracer.setTlasEnabled(tlasOn); if (ImGui::Checkbox("Show DirLight", &dirLight)) { light->visible = dirLight; diff --git a/include/threepp/renderers/wgpu/WgpuPathTracer.hpp b/include/threepp/renderers/wgpu/WgpuPathTracer.hpp index 4627ebf4..41f7cef8 100644 --- a/include/threepp/renderers/wgpu/WgpuPathTracer.hpp +++ b/include/threepp/renderers/wgpu/WgpuPathTracer.hpp @@ -83,12 +83,6 @@ namespace threepp { void setReSTIRGIEnabled(bool enabled); [[nodiscard]] bool restirGiEnabled() const; - /// Enable/disable the two-level TLAS/BLAS acceleration structure. Default: false. - /// Experimental — plumbing only in the current build; single-level BVH remains - /// the active traversal path until the shader rewrite lands. - void setTlasEnabled(bool enabled); - [[nodiscard]] bool tlasEnabled() const; - /// Samples per pixel per frame. Default: 1. Higher values reduce noise /// at the cost of proportionally more RT time per frame. void setSamplesPerPixel(int spp); diff --git a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracer.cpp b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracer.cpp index ecc6591c..3b80e238 100644 --- a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracer.cpp +++ b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracer.cpp @@ -139,9 +139,6 @@ struct WgpuPathTracer::Impl { bool denoiserEnabled_ = true; bool restirEnabled_ = true; bool restirGiEnabled_ = false; - // Two-level BVH (TLAS/BLAS) — plumbing only in PR1. When false (default) - // the path tracer uses the existing single-level BVH over world-space tris. - bool tlasEnabled_ = false; int spp_ = 1; float fogAnisotropy_ = 0.0f; // HG g: 0 isotropic, >0 forward (god rays), <0 back. int maxBounces_ = 4; @@ -184,13 +181,6 @@ struct WgpuPathTracer::Impl { int bvhCapacity_ = 2 * INIT_TRI_CAP - 1; int emissiveTriCapacity_ = 1; // currently-allocated emissiveTriBuf capacity (tri entries) - // PR2c.1 — capacities for TLAS/BLAS GPU buffers. Populated on topology - // rebuild when tlasEnabled_; allocated lazily so single-level builds pay - // nothing. Shader does not yet consume these (PR2c.2 wires traversal). - int blasNodeCapacity_ = 0; - int blasRecordCapacity_ = 0; - int tlasInstanceCapacity_ = 0; - // Actual scene counts (distinct from capacity which includes headroom) int matCount_ = 0; // number of unique materials/meshes in use int meshCount_ = 0; // number of mesh entries (instances) in use @@ -213,19 +203,6 @@ struct WgpuPathTracer::Impl { WgpuBuffer bvhNodeBuf; WgpuBuffer bvhCounterBuf; WgpuBuffer refitMetaBuf; - // PR2c.1 — BLAS/TLAS GPU buffers. Allocated lazily (see - // blasNodeCapacity_ et al). Kept unbound from pipelines in PR2c.1 — - // they become live once the shader traversal switches to TLAS in PR2c.2. - WgpuBuffer blasNodeBuf; - WgpuBuffer blasRecordBuf; - WgpuBuffer tlasInstanceBuf; - // BLAS refit — object-space version of the BVH refit pipeline. Reads - // deformed obj-space triangles from objTriBuf/objTriBuf2, updates BLAS - // leaf AABBs, propagates up per-BLAS parent chains (stops at parent=-1). - WgpuBuffer blasRefitMetaBuf; - WgpuBuffer blasRefitCounterBuf; - WgpuBuffer blasLeafIndexBuf; - WgpuBuffer blasRefitUniBuf; WgpuBuffer pathCounterBuf; // atomic work-queue head for rt_main (persistent-thread path regeneration) WgpuBuffer primaryCounterBuf; // separate atomic for rt_primary_main — needed because CPU-side counter writes can't interleave with GPU compute passes WgpuBuffer bounceCounterBuf; // atomic for rt_bounces_main @@ -268,7 +245,6 @@ struct WgpuPathTracer::Impl { // Compute pipelines WgpuComputePipeline vtPipeline; WgpuComputePipeline refitPipeline; - WgpuComputePipeline blasRefitPipeline; // TLAS mode: refits BLAS AABBs from objTri when geometry deforms WgpuComputePipeline primaryPipeline; // kernel-split step 1: BVH primary traversal → primaryHitBuf WgpuComputePipeline rtPipeline; // kernel-split step 2: primaryShade + serialize to pathStateBuf WgpuComputePipeline bouncesPipeline; // kernel-split step 3: runBounces + accumulation @@ -306,19 +282,6 @@ struct WgpuPathTracer::Impl { std::vector bvhIndices; std::vector leafIndices; - // PR2b plumbing — TLAS/BLAS CPU state. Populated only when tlasEnabled_. - // Not yet consumed by the shader (traversal rewrite lands in a later PR); - // holding it on Impl keeps subsequent refits / appends from rebuilding - // from scratch once the shader is switched over. - std::vector blasNodes; - std::vector blasLeafIndices; - std::vector blasRecords; - std::vector tlasInstances; - // Maps tlasInstances[j] back to rtEntries[i] so per-frame mesh-matrix - // updates can refresh each instance's objToWorld/worldToObj without - // re-walking expandMeshEntries + entryTriRanges. - std::vector tlasToEntryIdx; - // Per-entry cache used by the per-frame dirty-geometry fast path. // Populated at topology commit from the AsyncBuildResult; indexed by // expanded entry index (same ordering as rtEntries from expandMeshEntries). @@ -394,13 +357,6 @@ struct WgpuPathTracer::Impl { int objTriSplit = 0; // split point for two-buffer objTri scheme int matCount = 0; // actual number of unique materials (matCapacity ≥ matCount) int meshCount = 0; // actual number of mesh entries (meshCapacity ≥ meshCount) - // PR2b plumbing — populated when tlasEnabled_; consumed by the shader - // traversal in a later PR. Empty for single-level BVH builds. - std::vector blasNodes; - std::vector blasLeafIndices; - std::vector blasRecords; - std::vector tlasInstances; - std::vector tlasToEntryIdx; std::vector> entryTriRanges; }; #ifndef __EMSCRIPTEN__ @@ -444,9 +400,6 @@ struct WgpuPathTracer::Impl { int numBvhNodes_ = 0; uint32_t vtDispatchX_ = 1, vtDispatchY_ = 1; uint32_t rfDispatchX_ = 1, rfDispatchY_ = 1; - uint32_t blasRfDispatchX_ = 1, blasRfDispatchY_ = 1; - std::vector blasRefitCounterZeros; // zeroed each BLAS-refit dispatch (atomics scratch) - bool geometryDirtyThisFrame_ = false; // set by dirty-scan; gates BLAS refit dispatch float frameCount_ = 0.f; uint32_t globalFrameCounter_ = 0; int prevNLights_ = -1; @@ -528,16 +481,6 @@ struct WgpuPathTracer::Impl { WgpuBuffer::Usage::Storage), refitMetaBuf(r, static_cast(2 * INIT_TRI_CAP - 1) * BVH4_REFIT_INTS * sizeof(int32_t), WgpuBuffer::Usage::Storage), - // PR2c.1 — 1-element stub allocations; grown on demand in the - // topology-change path when tlasEnabled_. - blasNodeBuf(r, BVH4_GPU_U32S * sizeof(uint32_t), WgpuBuffer::Usage::Storage), - blasRecordBuf(r, sizeof(BlasRecord), WgpuBuffer::Usage::Storage), - tlasInstanceBuf(r, sizeof(TlasInstance), WgpuBuffer::Usage::Storage), - // BLAS refit placeholders — grown when tlasEnabled_ + blasNodes exist. - blasRefitMetaBuf(r, BVH4_REFIT_INTS * sizeof(int32_t), WgpuBuffer::Usage::Storage), - blasRefitCounterBuf(r, sizeof(uint32_t), WgpuBuffer::Usage::Storage), - blasLeafIndexBuf(r, sizeof(int), WgpuBuffer::Usage::Storage), - blasRefitUniBuf(r, sizeof(RefitGpuUniforms)), pathCounterBuf(r, sizeof(uint32_t), WgpuBuffer::Usage::Storage), primaryCounterBuf(r, sizeof(uint32_t), WgpuBuffer::Usage::Storage), bounceCounterBuf(r, sizeof(uint32_t), WgpuBuffer::Usage::Storage), @@ -576,7 +519,6 @@ struct WgpuPathTracer::Impl { // commit with the env-CDF flag; here we construct with false. vtPipeline(r, buildVtShader(), "vt_main"), refitPipeline(r, buildRefitShader(), "bvh_refit"), - blasRefitPipeline(r, buildBlasRefitShader(), "blas_refit"), primaryPipeline(r, buildRtShader(false), "rt_primary_main"), rtPipeline(r, buildRtShader(false), "rt_main"), bouncesPipeline(r, buildRtShader(false), "rt_bounces_main"), @@ -658,14 +600,6 @@ struct WgpuPathTracer::Impl { refitPipeline.setUniformBuffer(4, refitUniBuf); refitPipeline.setStorageBufferRead(5, refitMetaBuf); - blasRefitPipeline.setStorageBufferRead(0, objTriBuf); - blasRefitPipeline.setStorageBufferRead(1, objTriBuf2); - blasRefitPipeline.setStorageBuffer(2, blasNodeBuf); - blasRefitPipeline.setStorageBuffer(3, blasRefitCounterBuf); - blasRefitPipeline.setStorageBufferRead(4, blasLeafIndexBuf); - blasRefitPipeline.setUniformBuffer(5, blasRefitUniBuf); - blasRefitPipeline.setStorageBufferRead(6, blasRefitMetaBuf); - // RT pipelines — set ALL bindings upfront (per-frame ones get overwritten) rtPipeline.setUniformBuffer(0, rtUniformBuf); rtPipeline.setStorageBufferRead(3, bvhNodeBuf); @@ -715,11 +649,6 @@ struct WgpuPathTracer::Impl { rtPipeline.setStorageBuffer(48, sortedAliveQueueBuf); rtPipeline.setStorageBuffer(49, sortCounterBuf); rtPipeline.setTexture(50, ggxELutTex_); - rtPipeline.setStorageBufferRead(51, blasNodeBuf); - rtPipeline.setStorageBufferRead(52, blasRecordBuf); - rtPipeline.setStorageBufferRead(53, tlasInstanceBuf); - rtPipeline.setStorageBufferRead(54, objTriBuf); - rtPipeline.setStorageBufferRead(55, objTriBuf2); // Primary-hit kernel (kernel split step 1). Same shader source, different // entry point — so the full bind group layout matches rtPipeline. Most @@ -773,11 +702,6 @@ struct WgpuPathTracer::Impl { primaryPipeline.setStorageBuffer(48, sortedAliveQueueBuf); primaryPipeline.setStorageBuffer(49, sortCounterBuf); primaryPipeline.setTexture(50, ggxELutTex_); - primaryPipeline.setStorageBufferRead(51, blasNodeBuf); - primaryPipeline.setStorageBufferRead(52, blasRecordBuf); - primaryPipeline.setStorageBufferRead(53, tlasInstanceBuf); - primaryPipeline.setStorageBufferRead(54, objTriBuf); - primaryPipeline.setStorageBufferRead(55, objTriBuf2); // Bounces kernel (bounce-split step 3). Full mirror of rtPipeline's // bindings because runBounces + accumulation touches the same resources. @@ -832,11 +756,6 @@ struct WgpuPathTracer::Impl { bouncesPipeline.setStorageBuffer(48, sortedAliveQueueBuf); bouncesPipeline.setStorageBuffer(49, sortCounterBuf); bouncesPipeline.setTexture(50, ggxELutTex_); - bouncesPipeline.setStorageBufferRead(51, blasNodeBuf); - bouncesPipeline.setStorageBufferRead(52, blasRecordBuf); - bouncesPipeline.setStorageBufferRead(53, tlasInstanceBuf); - bouncesPipeline.setStorageBufferRead(54, objTriBuf); - bouncesPipeline.setStorageBufferRead(55, objTriBuf2); // Compaction kernel (Stage F1). Full bind-group mirror because it // uses the same shader source as primary/rt/bounces — bind-group layout @@ -892,11 +811,6 @@ struct WgpuPathTracer::Impl { compactPipeline.setStorageBuffer(48, sortedAliveQueueBuf); compactPipeline.setStorageBuffer(49, sortCounterBuf); compactPipeline.setTexture(50, ggxELutTex_); - compactPipeline.setStorageBufferRead(51, blasNodeBuf); - compactPipeline.setStorageBufferRead(52, blasRecordBuf); - compactPipeline.setStorageBufferRead(53, tlasInstanceBuf); - compactPipeline.setStorageBufferRead(54, objTriBuf); - compactPipeline.setStorageBufferRead(55, objTriBuf2); // F2a: rt_bounce1_main processes bounce 1 (i=1) as a separate kernel. // Full bind-group mirror (same shader source, different entry point). @@ -949,11 +863,6 @@ struct WgpuPathTracer::Impl { bounce1Pipeline.setStorageBuffer(48, sortedAliveQueueBuf); bounce1Pipeline.setStorageBuffer(49, sortCounterBuf); bounce1Pipeline.setTexture(50, ggxELutTex_); - bounce1Pipeline.setStorageBufferRead(51, blasNodeBuf); - bounce1Pipeline.setStorageBufferRead(52, blasRecordBuf); - bounce1Pipeline.setStorageBufferRead(53, tlasInstanceBuf); - bounce1Pipeline.setStorageBufferRead(54, objTriBuf); - bounce1Pipeline.setStorageBufferRead(55, objTriBuf2); // F2b: rt_accum_main runs the accumulation / temporal-reprojection // pipeline over aliveQueue. Reads PathStateEntry (final radiance + @@ -1008,11 +917,6 @@ struct WgpuPathTracer::Impl { accumPipeline.setStorageBuffer(48, sortedAliveQueueBuf); accumPipeline.setStorageBuffer(49, sortCounterBuf); accumPipeline.setTexture(50, ggxELutTex_); - accumPipeline.setStorageBufferRead(51, blasNodeBuf); - accumPipeline.setStorageBufferRead(52, blasRecordBuf); - accumPipeline.setStorageBufferRead(53, tlasInstanceBuf); - accumPipeline.setStorageBufferRead(54, objTriBuf); - accumPipeline.setStorageBufferRead(55, objTriBuf2); // F2c: rt_sort_prefix_main runs a tiny 1-thread prefix sum over // matBucketCount. Full bind-group mirror — WebGPU layout validation. @@ -1065,11 +969,6 @@ struct WgpuPathTracer::Impl { sortPrefixPipeline.setStorageBuffer(48, sortedAliveQueueBuf); sortPrefixPipeline.setStorageBuffer(49, sortCounterBuf); sortPrefixPipeline.setTexture(50, ggxELutTex_); - sortPrefixPipeline.setStorageBufferRead(51, blasNodeBuf); - sortPrefixPipeline.setStorageBufferRead(52, blasRecordBuf); - sortPrefixPipeline.setStorageBufferRead(53, tlasInstanceBuf); - sortPrefixPipeline.setStorageBufferRead(54, objTriBuf); - sortPrefixPipeline.setStorageBufferRead(55, objTriBuf2); // F2c: rt_sort_scatter_main bucket-scatters aliveQueue → sortedAliveQueue. sortScatterPipeline.setUniformBuffer(0, rtUniformBuf); @@ -1121,11 +1020,6 @@ struct WgpuPathTracer::Impl { sortScatterPipeline.setStorageBuffer(48, sortedAliveQueueBuf); sortScatterPipeline.setStorageBuffer(49, sortCounterBuf); sortScatterPipeline.setTexture(50, ggxELutTex_); - sortScatterPipeline.setStorageBufferRead(51, blasNodeBuf); - sortScatterPipeline.setStorageBufferRead(52, blasRecordBuf); - sortScatterPipeline.setStorageBufferRead(53, tlasInstanceBuf); - sortScatterPipeline.setStorageBufferRead(54, objTriBuf); - sortScatterPipeline.setStorageBufferRead(55, objTriBuf2); // Spatial filter — set ALL bindings upfront. // Binding 1 (colorIn) and binding 2 (colorOut) are rebound per-pass; seed @@ -1156,7 +1050,6 @@ struct WgpuPathTracer::Impl { // rebuilt synchronously on the main thread a second time. vtPipeline.startAsyncBuild(); refitPipeline.startAsyncBuild(); - blasRefitPipeline.startAsyncBuild(); atrousPipeline.startAsyncBuild(); upscalePipeline.startAsyncBuild(); std::cerr << "[PathTracer] Async shader compilation started for helper pipelines" << std::endl; @@ -2054,27 +1947,16 @@ void WgpuPathTracer::render(Object3D& scene, Camera& camera) { r.matBuffer.resize(static_cast(r.matCapacity) * MAT_TEX_HEIGHT * 4, 0.f); r.rawObjTriBuf.resize(static_cast(r.triCapacity) * 32, 0.f); r.matrixCpuBuf.resize(static_cast(r.meshCapacity) * 32, 0.f); - // Always populate entryTriRanges — the TLAS/BLAS builder needs it in - // TLAS mode, and the per-frame dirty-geometry fast path needs it in - // both modes to locate each entry's slice in rawObjTriBuf. + // Populate entryTriRanges — the per-frame dirty-geometry fast path + // uses it to locate each entry's slice in rawObjTriBuf. r.triCount = buildGeometryBuffers(entries, r.texSlotMap, r.triBuffer, r.matBuffer, r.rawObjTriBuf, r.matrixCpuBuf, r.triCapacity, r.matCapacity, r.meshCapacity, 0, 0, 0, &r.entryTriRanges); r.matCount = matCount; // upper-bound: number of unique meshes (some may share materials) r.meshCount = meshCount; // number of expanded mesh entries (instances) - // PR2b plumbing — when TLAS mode is on, build per-entry BLASes *before* - // the global single-level buildBVH reorders rawObjTriBuf. BlasRecords - // are CPU-only in PR2b; the GPU shader keeps using the single-level BVH - // until the TLAS traversal shader lands in PR2c. - if (d.tlasEnabled_) { - buildBlasesForEntries(entries, r.entryTriRanges, r.rawObjTriBuf, - r.blasNodes, r.blasLeafIndices, - r.blasRecords, r.tlasInstances, - r.tlasToEntryIdx); - } buildBVH(r.triBuffer, r.triCount, r.bvhNodes, r.bvhIndices, r.leafIndices, - r.rawObjTriBuf, d.tlasEnabled_); + r.rawObjTriBuf); r.numBvhNodes = static_cast(r.bvhNodes.size()); // BVH headroom: same philosophy as triCapacity — capped absolute, not 2×. // Typical overlay BVH append adds a few hundred nodes; 32k headroom absorbs @@ -2191,24 +2073,16 @@ void WgpuPathTracer::render(Object3D& scene, Camera& camera) { r.matBuffer.resize(static_cast(r.matCapacity) * MAT_TEX_HEIGHT * 4, 0.f); r.rawObjTriBuf.resize(static_cast(r.triCapacity) * 32, 0.f); r.matrixCpuBuf.resize(static_cast(r.meshCapacity) * 32, 0.f); - // Always populate entryTriRanges — the TLAS/BLAS builder needs it in - // TLAS mode, and the per-frame dirty-geometry fast path needs it in - // both modes to locate each entry's slice in rawObjTriBuf. + // Populate entryTriRanges — the per-frame dirty-geometry fast path + // uses it to locate each entry's slice in rawObjTriBuf. r.triCount = buildGeometryBuffers(entries, r.texSlotMap, r.triBuffer, r.matBuffer, r.rawObjTriBuf, r.matrixCpuBuf, r.triCapacity, r.matCapacity, r.meshCapacity, 0, 0, 0, &r.entryTriRanges); r.matCount = matCount; // upper-bound: number of unique meshes (some may share materials) r.meshCount = meshCount; // number of expanded mesh entries (instances) - // PR2b plumbing — see emscripten branch above for rationale. - if (d.tlasEnabled_) { - buildBlasesForEntries(entries, r.entryTriRanges, r.rawObjTriBuf, - r.blasNodes, r.blasLeafIndices, - r.blasRecords, r.tlasInstances, - r.tlasToEntryIdx); - } buildBVH(r.triBuffer, r.triCount, r.bvhNodes, r.bvhIndices, r.leafIndices, - r.rawObjTriBuf, d.tlasEnabled_); + r.rawObjTriBuf); r.numBvhNodes = static_cast(r.bvhNodes.size()); // BVH headroom: same philosophy as triCapacity — capped absolute, not 2×. // Typical overlay BVH append adds a few hundred nodes; 32k headroom absorbs @@ -2272,11 +2146,6 @@ void WgpuPathTracer::render(Object3D& scene, Camera& camera) { d.bvhNodes = std::move(r.bvhNodes); d.bvhIndices = std::move(r.bvhIndices); d.leafIndices = std::move(r.leafIndices); - d.blasNodes = std::move(r.blasNodes); - d.blasLeafIndices = std::move(r.blasLeafIndices); - d.blasRecords = std::move(r.blasRecords); - d.tlasInstances = std::move(r.tlasInstances); - d.tlasToEntryIdx = std::move(r.tlasToEntryIdx); d.entryTriRanges_ = std::move(r.entryTriRanges); d.bvhNodeCpuBuf = std::move(r.bvhNodeCpuBuf); d.refitMetaCpuBuf = std::move(r.refitMetaCpuBuf); @@ -2293,15 +2162,10 @@ void WgpuPathTracer::render(Object3D& scene, Camera& camera) { d.stableFramesSinceAppend_ = 0; // Build inverse-permutation (entry-order tri → post-reorder position). - // For TLAS mode (preserveObjTriOrder=true), rawObjTriBuf is NOT reordered - // so inv_perm is identity; we store it anyway to unify the dirty-repack - // code path in both modes. + // d.bvhIndices[j] is the pre-reorder index of the tri now at j. + // Inverse: invPerm[preIdx] = j. d.triInvPerm_.assign(d.triCount_, 0); - if (d.tlasEnabled_) { - for (int i = 0; i < d.triCount_; ++i) d.triInvPerm_[i] = i; - } else { - // d.bvhIndices[j] is the pre-reorder index of the tri now at j. - // Inverse: invPerm[preIdx] = j. + { const int n = static_cast(d.bvhIndices.size()); for (int j = 0; j < n; ++j) { const int pre = d.bvhIndices[j]; @@ -2422,8 +2286,6 @@ void WgpuPathTracer::render(Object3D& scene, Camera& camera) { d.vtPipeline.setStorageBufferRead(4, d.objTriBuf2); d.refitPipeline.setTexture(0, d.triTex); d.refitPipeline.setStorageBufferRead(3, d.leafIndexBuf); - d.blasRefitPipeline.setStorageBufferRead(0, d.objTriBuf); - d.blasRefitPipeline.setStorageBufferRead(1, d.objTriBuf2); d.rtPipeline.setTexture(5, d.triTex); d.primaryPipeline.setTexture(5, d.triTex); d.bouncesPipeline.setTexture(5, d.triTex); @@ -2432,26 +2294,6 @@ void WgpuPathTracer::render(Object3D& scene, Camera& camera) { d.accumPipeline.setTexture(5, d.triTex); d.sortPrefixPipeline.setTexture(5, d.triTex); d.sortScatterPipeline.setTexture(5, d.triTex); - // PR2c.2c.i — rebind objTriBuf/objTriBuf2 on all rt-family pipelines. - // PR2c.2c.ii's TLAS traversal reads object-space triangles from these - // bindings; in this PR they're only touched by the sceneHitRaw - // keepalive branch, which is never taken at runtime. - d.rtPipeline.setStorageBufferRead(54, d.objTriBuf); - d.rtPipeline.setStorageBufferRead(55, d.objTriBuf2); - d.primaryPipeline.setStorageBufferRead(54, d.objTriBuf); - d.primaryPipeline.setStorageBufferRead(55, d.objTriBuf2); - d.bouncesPipeline.setStorageBufferRead(54, d.objTriBuf); - d.bouncesPipeline.setStorageBufferRead(55, d.objTriBuf2); - d.compactPipeline.setStorageBufferRead(54, d.objTriBuf); - d.compactPipeline.setStorageBufferRead(55, d.objTriBuf2); - d.bounce1Pipeline.setStorageBufferRead(54, d.objTriBuf); - d.bounce1Pipeline.setStorageBufferRead(55, d.objTriBuf2); - d.accumPipeline.setStorageBufferRead(54, d.objTriBuf); - d.accumPipeline.setStorageBufferRead(55, d.objTriBuf2); - d.sortPrefixPipeline.setStorageBufferRead(54, d.objTriBuf); - d.sortPrefixPipeline.setStorageBufferRead(55, d.objTriBuf2); - d.sortScatterPipeline.setStorageBufferRead(54, d.objTriBuf); - d.sortScatterPipeline.setStorageBufferRead(55, d.objTriBuf2); } // Size emissiveTriBuf to the ACTUAL emissive-tri count (was previously @@ -2531,142 +2373,6 @@ void WgpuPathTracer::render(Object3D& scene, Camera& camera) { d.sortScatterPipeline.setStorageBufferRead(27, d.motionMatBuf); } - // PR2c.2b — upload BLAS/TLAS CPU state to GPU when tlasEnabled_, and - // rebind all 8 rt-family pipelines if any buffer was reallocated. The - // bindings themselves are declared in the shader at slots 51/52/53 and - // kept alive by a runtime-false branch in `sceneHitRaw` (rt.bvhAux.y is - // never set to 1 in this PR). The nested TLAS->BLAS walk lands in - // PR2c.2c; this PR just ensures the plumbing is ready end-to-end. - if (d.tlasEnabled_) { - const int blasNodeCount = static_cast(d.blasNodes.size()); - const int blasRecordCount = static_cast(d.blasRecords.size()); - const int tlasInstanceCount = static_cast(d.tlasInstances.size()); - - bool rebindBlasNodes = false; - bool rebindBlasRecords = false; - bool rebindTlasInstances = false; - if (blasNodeCount > d.blasNodeCapacity_) { - d.blasNodeCapacity_ = std::max(blasNodeCount, 1); - d.blasNodeBuf = WgpuBuffer(d.renderer, - static_cast(d.blasNodeCapacity_) * BVH4_GPU_U32S * sizeof(uint32_t), - WgpuBuffer::Usage::Storage); - rebindBlasNodes = true; - } - if (blasRecordCount > d.blasRecordCapacity_) { - d.blasRecordCapacity_ = std::max(blasRecordCount, 1); - d.blasRecordBuf = WgpuBuffer(d.renderer, - static_cast(d.blasRecordCapacity_) * sizeof(BlasRecord), - WgpuBuffer::Usage::Storage); - rebindBlasRecords = true; - } - if (tlasInstanceCount > d.tlasInstanceCapacity_) { - d.tlasInstanceCapacity_ = std::max(tlasInstanceCount, 1); - d.tlasInstanceBuf = WgpuBuffer(d.renderer, - static_cast(d.tlasInstanceCapacity_) * sizeof(TlasInstance), - WgpuBuffer::Usage::Storage); - rebindTlasInstances = true; - } - - if (rebindBlasNodes) { - d.rtPipeline.setStorageBufferRead(51, d.blasNodeBuf); - d.primaryPipeline.setStorageBufferRead(51, d.blasNodeBuf); - d.bouncesPipeline.setStorageBufferRead(51, d.blasNodeBuf); - d.compactPipeline.setStorageBufferRead(51, d.blasNodeBuf); - d.bounce1Pipeline.setStorageBufferRead(51, d.blasNodeBuf); - d.accumPipeline.setStorageBufferRead(51, d.blasNodeBuf); - d.sortPrefixPipeline.setStorageBufferRead(51, d.blasNodeBuf); - d.sortScatterPipeline.setStorageBufferRead(51, d.blasNodeBuf); - } - if (rebindBlasRecords) { - d.rtPipeline.setStorageBufferRead(52, d.blasRecordBuf); - d.primaryPipeline.setStorageBufferRead(52, d.blasRecordBuf); - d.bouncesPipeline.setStorageBufferRead(52, d.blasRecordBuf); - d.compactPipeline.setStorageBufferRead(52, d.blasRecordBuf); - d.bounce1Pipeline.setStorageBufferRead(52, d.blasRecordBuf); - d.accumPipeline.setStorageBufferRead(52, d.blasRecordBuf); - d.sortPrefixPipeline.setStorageBufferRead(52, d.blasRecordBuf); - d.sortScatterPipeline.setStorageBufferRead(52, d.blasRecordBuf); - } - if (rebindTlasInstances) { - d.rtPipeline.setStorageBufferRead(53, d.tlasInstanceBuf); - d.primaryPipeline.setStorageBufferRead(53, d.tlasInstanceBuf); - d.bouncesPipeline.setStorageBufferRead(53, d.tlasInstanceBuf); - d.compactPipeline.setStorageBufferRead(53, d.tlasInstanceBuf); - d.bounce1Pipeline.setStorageBufferRead(53, d.tlasInstanceBuf); - d.accumPipeline.setStorageBufferRead(53, d.tlasInstanceBuf); - d.sortPrefixPipeline.setStorageBufferRead(53, d.tlasInstanceBuf); - d.sortScatterPipeline.setStorageBufferRead(53, d.tlasInstanceBuf); - } - - if (blasNodeCount > 0) { - std::vector blasPacked( - static_cast(d.blasNodeCapacity_) * BVH4_GPU_U32S, 0u); - packBvh4Buffer(d.blasNodes, blasPacked, d.blasNodeCapacity_); - d.blasNodeBuf.write(blasPacked.data(), - static_cast(blasNodeCount) * BVH4_GPU_U32S * sizeof(uint32_t)); - } - if (blasRecordCount > 0) { - d.blasRecordBuf.write(d.blasRecords.data(), - static_cast(blasRecordCount) * sizeof(BlasRecord)); - } - if (tlasInstanceCount > 0) { - d.tlasInstanceBuf.write(d.tlasInstances.data(), - static_cast(tlasInstanceCount) * sizeof(TlasInstance)); - } - - // BLAS refit GPU resources — grown alongside blasNodeCapacity_ / blasLeafIndices. - // On grow, reallocate + rebind; always re-upload refit metadata & leaf indices - // since blasNodes content changed during this topology commit. - if (blasNodeCount > 0) { - const int blasLeafCount = static_cast(d.blasLeafIndices.size()); - const size_t metaBytes = static_cast(d.blasNodeCapacity_) * BVH4_REFIT_INTS * sizeof(int32_t); - const size_t counterBytes = static_cast(d.blasNodeCapacity_) * sizeof(uint32_t); - const size_t leafBytes = static_cast(std::max(blasLeafCount, 1)) * sizeof(int); - // We key the grow on blasNodeCapacity_ for meta/counter buffers and on - // leafCount for the leaf-index buffer. Reallocate eagerly — BLAS topology - // rebuilds are infrequent (only on true scene-level changes). - d.blasRefitMetaBuf = WgpuBuffer(d.renderer, metaBytes, WgpuBuffer::Usage::Storage); - d.blasRefitCounterBuf = WgpuBuffer(d.renderer, counterBytes, WgpuBuffer::Usage::Storage); - d.blasLeafIndexBuf = WgpuBuffer(d.renderer, leafBytes, WgpuBuffer::Usage::Storage); - d.blasRefitCounterZeros.assign(d.blasNodeCapacity_, 0u); - - // Rebind BLAS refit pipeline to new buffers. objTri / objTri2 bindings - // are refreshed below if those buffers were also resized. - d.blasRefitPipeline.setStorageBuffer(2, d.blasNodeBuf); - d.blasRefitPipeline.setStorageBuffer(3, d.blasRefitCounterBuf); - d.blasRefitPipeline.setStorageBufferRead(4, d.blasLeafIndexBuf); - d.blasRefitPipeline.setStorageBufferRead(6, d.blasRefitMetaBuf); - - // Pack refit metadata from blasNodes (parent absolute, childCount, - // numInternal, 0) — identical to single-level. BLAS roots carry - // parent = -1 so the shader's propagate-up loop stops there. - std::vector blasRefitMetaCpu( - static_cast(d.blasNodeCapacity_) * BVH4_REFIT_INTS, 0); - packRefitMetadata(d.blasNodes, blasRefitMetaCpu, d.blasNodeCapacity_); - d.blasRefitMetaBuf.write(blasRefitMetaCpu.data(), - static_cast(blasNodeCount) * BVH4_REFIT_INTS * sizeof(int32_t)); - - if (blasLeafCount > 0) { - d.blasLeafIndexBuf.write(d.blasLeafIndices.data(), - static_cast(blasLeafCount) * sizeof(int)); - } - - const uint32_t blasRfTotal = (static_cast(blasLeafCount) + 63u) / 64u; - const uint32_t blasRfGx = (std::min)(std::max(blasRfTotal, 1u), 65535u); - const uint32_t blasRfGy = (std::max(blasRfTotal, 1u) + blasRfGx - 1u) / blasRfGx; - d.blasRfDispatchX_ = blasRfGx; - d.blasRfDispatchY_ = blasRfGy; - - RefitGpuUniforms blasRfU{}; - blasRfU.leafCount = static_cast(blasLeafCount); - blasRfU.groupsX = blasRfGx; - // splitAt lives in _p[0] — reused for the objTri split index. - blasRfU._p[0] = static_cast(d.objTriSplit_); - blasRfU._p[1] = 0; - d.blasRefitUniBuf.write(&blasRfU, sizeof(blasRfU)); - } - } - // Upload atlas if (r.atlasLayers != d.atlasLayers_ || r.atlasCols != d.atlasCols_ || r.tileSize != d.tileSize_) { d.atlasLayers_ = r.atlasLayers; @@ -2770,7 +2476,6 @@ void WgpuPathTracer::render(Object3D& scene, Camera& camera) { // anyMeshMoved drives the vertex-transform and BVH-refit pipelines. uint32_t movedBits[4] = {0u, 0u, 0u, 0u}; bool anyMeshMoved = (d.prevEntryMatrices.size() != rtEntries.size()); - d.geometryDirtyThisFrame_ = false; if (topoJustFinished) { // Topology change: all pixels need to re-accumulate (mesh-to-triangle mapping changed) @@ -2907,7 +2612,6 @@ void WgpuPathTracer::render(Object3D& scene, Camera& camera) { } if (anyGeoDirty) { anyMeshMoved = true; - d.geometryDirtyThisFrame_ = true; // v1 upload strategy: re-push the entire rawObjTriBuf. Simple and // correct; the async queue write overlaps with compute on modern // drivers. A future optimisation can compress to contiguous dirty @@ -2984,47 +2688,6 @@ void WgpuPathTracer::render(Object3D& scene, Camera& camera) { } d.matrixBuf.write(d.matrixCpuBuf.data(), static_cast(d.meshCapacity_) * 32 * sizeof(float)); - // Per-frame TLAS instance refresh. tlasInstances[] is populated only - // at scene build time; without this refresh the ray traversal would - // keep transforming rays with stale object->world matrices, so moving - // meshes would render at their original position even though the - // single-level BVH refit + VT keep the emissive / shadow paths current - // (which is exactly the "shadow moves, mesh doesn't" symptom). - if (d.tlasEnabled_ && !topoChanged - && !d.tlasInstances.empty() - && d.tlasToEntryIdx.size() == d.tlasInstances.size()) { - auto writeMat3x4 = [](const Matrix4& m, float out[3][4]) { - const auto& e = m.elements; - for (int row = 0; row < 3; ++row) { - out[row][0] = e[row]; - out[row][1] = e[4 + row]; - out[row][2] = e[8 + row]; - out[row][3] = e[12 + row]; - } - }; - for (size_t j = 0; j < d.tlasInstances.size(); ++j) { - const uint32_t ei = d.tlasToEntryIdx[j]; - if (ei >= rtEntries.size()) continue; - const Matrix4& w = rtEntries[ei].worldMatrix; - auto& inst = d.tlasInstances[j]; - writeMat3x4(w, inst.objToWorld); - Matrix4 inv(w); - inv.invert(); - writeMat3x4(inv, inst.worldToObj); - const auto& e = w.elements; - const Vector3 bx(e[0], e[1], e[2]); - const Vector3 by(e[4], e[5], e[6]); - const Vector3 bz(e[8], e[9], e[10]); - const float lx = bx.length(), ly = by.length(), lz = bz.length(); - const float eps = 1e-4f * std::max({lx, ly, lz, 1.f}); - const bool nonUniform = - std::abs(lx - ly) > eps || std::abs(ly - lz) > eps; - inst.flags = nonUniform ? 1u : 0u; - } - d.tlasInstanceBuf.write(d.tlasInstances.data(), - d.tlasInstances.size() * sizeof(TlasInstance)); - } - // Compute 2D dispatch dimensions (WebGPU max per-dimension is 65535) const uint32_t vtTotal = (static_cast(d.triCount_) + 63u) / 64u; const uint32_t vtGx = (std::min)(vtTotal, 65535u); @@ -3338,12 +3001,9 @@ void WgpuPathTracer::render(Object3D& scene, Camera& camera) { u.restirParams[2] = anyEmissiveMoved ? 1.f : 0.f; // emissive source moved → tight accum cap u.restirParams[3] = 1.0f; // unused — emissive multiplier removed; per-material emissiveIntensity is pre-baked in the material atlas u.bvhAux[0] = static_cast(d.bvhRootIdx_); // traversal root (0=normal, >0=overlay) - // .y = tlas_enabled flag (selects the nested TLAS->BLAS walk in the shader). - // .z = tlasInstanceCount — consumed by sceneHitRawTlas / sceneAnyHitTlas. - // .w = objTriSplit for loadObjTri's buf1/buf2 routing. - u.bvhAux[1] = d.tlasEnabled_ ? 1u : 0u; - u.bvhAux[2] = static_cast(d.tlasInstances.size()); - u.bvhAux[3] = static_cast(d.objTriSplit_); + u.bvhAux[1] = 0u; + u.bvhAux[2] = 0u; + u.bvhAux[3] = 0u; u.lens[0] = d.lens_.fStop; u.lens[1] = d.lens_.focusDistance; u.lens[2] = static_cast(d.lens_.apertureBlades); @@ -3588,22 +3248,6 @@ void WgpuPathTracer::render(Object3D& scene, Camera& camera) { wgpuComputePassEncoderEnd(rfPass); wgpuComputePassEncoderRelease(rfPass); } - - // BLAS refit — only in TLAS mode and only when object-space geometry - // actually deformed this frame (matrix-only moves leave BLAS AABBs - // valid since BLAS is object-space). Zero the atomic counter first. - if (d.tlasEnabled_ && d.geometryDirtyThisFrame_ - && !topoJustFinished - && !d.blasLeafIndices.empty() - && d.blasNodeCapacity_ > 0) { - d.blasRefitCounterBuf.write(d.blasRefitCounterZeros.data(), - static_cast(d.blasNodeCapacity_) * sizeof(uint32_t)); - passDesc.label = WGPUStringView{"blas_rf_pass", WGPU_STRLEN}; - WGPUComputePassEncoder brfPass = wgpuCommandEncoderBeginComputePass(encoder, &passDesc); - d.blasRefitPipeline.encode(brfPass, d.blasRfDispatchX_, d.blasRfDispatchY_); - wgpuComputePassEncoderEnd(brfPass); - wgpuComputePassEncoderRelease(brfPass); - } } // Reset both work-queue counters to 0 before the encoder runs. @@ -4310,21 +3954,6 @@ bool WgpuPathTracer::restirGiEnabled() const { return pimpl_->restirGiEnabled_; } -void WgpuPathTracer::setTlasEnabled(bool enabled) { - if (pimpl_->tlasEnabled_ == enabled) return; - pimpl_->frameCount_ = 0.f; - pimpl_->tlasEnabled_ = enabled; - // Force a fresh topology build so tlasInstances/blasRecords are rebuilt - // (toggling on) or freshly invalidated (toggling off). Without this the - // shader's TLAS branch would run against whatever stale state the flag - // was last built for — at best rendering the scene as background. - pimpl_->prevMeshes.clear(); -} - -bool WgpuPathTracer::tlasEnabled() const { - return pimpl_->tlasEnabled_; -} - void WgpuPathTracer::setSamplesPerPixel(int spp) { pimpl_->spp_ = std::max(1, spp); } diff --git a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerBvh.cpp b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerBvh.cpp index ab15bc44..b1d1e44f 100644 --- a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerBvh.cpp +++ b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerBvh.cpp @@ -2,12 +2,8 @@ #include "threepp/renderers/wgpu/pathtracer/WgpuPathTracerAtlas.hpp" #include "threepp/renderers/wgpu/pathtracer/WgpuPathTracerGeometry.hpp" -#include "threepp/math/Matrix4.hpp" -#include "threepp/math/Vector3.hpp" - #include #include -#include #include #include #include @@ -441,8 +437,7 @@ void packRefitMetadata(const std::vector& nodes, std::vector& void buildBVH(std::vector& triBuffer, int triCount, std::vector& wideNodes, std::vector& indices, std::vector& leafIndices, - std::vector& rawObjTriBuf, - bool preserveObjTriOrder) { + std::vector& rawObjTriBuf) { indices.resize(triCount); std::iota(indices.begin(), indices.end(), 0); @@ -462,11 +457,6 @@ void buildBVH(std::vector& triBuffer, int triCount, // Sort triangle data to match BVH index ordering. // Uses in-place cycle-following permutation to avoid allocating two full copies // (~1.2 GB for large scenes). Extra memory: O(n/8) for the visited bitmap + 2 tri temps. - // - // When `preserveObjTriOrder` is set, the TLAS/BLAS path has already built - // BLAS records that address `rawObjTriBuf` by its pre-sort (entry-contiguous) - // indices. Reordering it here would silently invalidate those indices, so - // the secondary buffer is skipped and only `triBuffer` gets permuted. { std::vector visited(triCount, false); std::vector tmpTri(TRI_TEX_HEIGHT * 4); // one triangle's worth of triBuffer rows (8*4=32) @@ -481,9 +471,7 @@ void buildBVH(std::vector& triBuffer, int triCount, for (int row = 0; row < TRI_TEX_HEIGHT; row++) for (int c = 0; c < 4; c++) tmpTri[row * 4 + c] = triBuffer[pagedIdx(i, row) + c]; - if (!preserveObjTriOrder) { - std::memcpy(tmpObj.data(), rawObjTriBuf.data() + i * 32, 32 * sizeof(float)); - } + std::memcpy(tmpObj.data(), rawObjTriBuf.data() + i * 32, 32 * sizeof(float)); int j = i; while (true) { @@ -493,10 +481,8 @@ void buildBVH(std::vector& triBuffer, int triCount, for (int row = 0; row < TRI_TEX_HEIGHT; row++) for (int c = 0; c < 4; c++) triBuffer[pagedIdx(j, row) + c] = triBuffer[pagedIdx(k, row) + c]; - if (!preserveObjTriOrder) { - std::memcpy(rawObjTriBuf.data() + j * 32, - rawObjTriBuf.data() + k * 32, 32 * sizeof(float)); - } + std::memcpy(rawObjTriBuf.data() + j * 32, + rawObjTriBuf.data() + k * 32, 32 * sizeof(float)); visited[k] = true; j = k; } @@ -504,9 +490,7 @@ void buildBVH(std::vector& triBuffer, int triCount, for (int row = 0; row < TRI_TEX_HEIGHT; row++) for (int c = 0; c < 4; c++) triBuffer[pagedIdx(j, row) + c] = tmpTri[row * 4 + c]; - if (!preserveObjTriOrder) { - std::memcpy(rawObjTriBuf.data() + j * 32, tmpObj.data(), 32 * sizeof(float)); - } + std::memcpy(rawObjTriBuf.data() + j * 32, tmpObj.data(), 32 * sizeof(float)); } } } @@ -618,189 +602,4 @@ void buildOverlayBVH( } } -BlasRecord buildBlas( - std::vector& objTriBuf, - int triStartLocal, - int triCount, - std::vector& blasNodes, - std::vector& leafIndicesOut) { - - BlasRecord rec{}; - rec.triStart = static_cast(triStartLocal); - rec.triCount = static_cast(std::max(0, triCount)); - rec.rootNodeOffset = static_cast(blasNodes.size()); - rec.nodeCount = 0; - for (int c = 0; c < 3; c++) { rec.aabbMin[c] = 0.f; rec.aabbMax[c] = 0.f; } - rec.aabbMin[3] = rec.aabbMax[3] = 0.f; - - if (triCount <= 0) return rec; - - // Pack object-space vertex positions into a temporary paged buffer so the - // shared `buildBvhNode` code can read them via `triGet` without a second - // accessor. Only rows 0/1/2 (the three vertex positions) are needed by - // the builder — normals/UVs don't affect tree shape. - const int localPages = triTexPages(triCount); - const std::size_t localWords = - static_cast(localPages) * TEX_PAGE_WIDTH * TRI_TEX_HEIGHT * 4; - std::vector localPaged(localWords, 0.f); - for (int li = 0; li < triCount; li++) { - const int gi = triStartLocal + li; - const float* src = objTriBuf.data() + static_cast(gi) * 32; - for (int row = 0; row < 3; row++) { - const int lp = ((li / TEX_PAGE_WIDTH * TRI_TEX_HEIGHT + row) * TEX_PAGE_WIDTH - + li % TEX_PAGE_WIDTH) * 4; - localPaged[lp + 0] = src[row * 4 + 0]; - localPaged[lp + 1] = src[row * 4 + 1]; - localPaged[lp + 2] = src[row * 4 + 2]; - } - } - - // Phase 1: binary BVH over local indices. - std::vector localIdx(triCount); - std::iota(localIdx.begin(), localIdx.end(), 0); - std::vector binNodes; - binNodes.reserve(static_cast(triCount) * 2); - buildBvhNode(binNodes, localIdx, localPaged, 0, triCount, -1); - - // Phase 2: collapse into the shared `blasNodes` buffer. Because - // `collapseBvh4` uses `wide.size()` for self-indexing, child indices - // emitted into `blasNodes` are already absolute (they account for prior - // BLASes already in the buffer). `leafIndicesOut` receives absolute - // indices by the same mechanism. - const std::size_t nodeBase = blasNodes.size(); - if (!binNodes.empty()) { - collapseBvh4(binNodes, blasNodes, leafIndicesOut, 0, -1); - } - rec.nodeCount = static_cast(blasNodes.size() - nodeBase); - - // Offset leaf `triStart` values in the newly-emitted nodes from local - // (0..triCount-1) to global (triStartLocal + localStart), matching the - // overlay builder's convention. - for (std::size_t ni = nodeBase; ni < blasNodes.size(); ni++) { - auto& node = blasNodes[ni]; - for (int c = 0; c < 4; c++) { - const int ci = node.childIdx[c]; - if (ci >= 0 || ci == INT_MIN) continue; // internal or empty - const int raw = -ci; - const int lStart = (raw - 1) / MAX_LEAF_TRIS; - const int cnt = ((raw - 1) % MAX_LEAF_TRIS) + 1; - node.childIdx[c] = -(((lStart + triStartLocal) * MAX_LEAF_TRIS) + cnt); - } - } - - // Reorder the object-space triangle slice in `objTriBuf` into BVH leaf - // order. Same cycle-permutation pattern as `buildBVH` / `buildOverlayBVH` - // but operating on the 32-float linear layout only (no paged world buf). - { - std::vector visited(triCount, false); - std::array tmp{}; - for (int i = 0; i < triCount; i++) { - if (visited[i]) continue; - visited[i] = true; - if (localIdx[i] == i) continue; - - std::memcpy(tmp.data(), - objTriBuf.data() + static_cast(triStartLocal + i) * 32, - 32 * sizeof(float)); - int j = i; - while (true) { - const int k = localIdx[j]; - if (k == i) break; - std::memcpy(objTriBuf.data() + static_cast(triStartLocal + j) * 32, - objTriBuf.data() + static_cast(triStartLocal + k) * 32, - 32 * sizeof(float)); - visited[k] = true; - j = k; - } - std::memcpy(objTriBuf.data() + static_cast(triStartLocal + j) * 32, - tmp.data(), 32 * sizeof(float)); - } - } - - // Root AABB — union of the root node's children. Used later by the TLAS - // builder to produce each instance's world-space bounding box. - if (rec.nodeCount > 0) { - const auto& root = blasNodes[nodeBase]; - float mnX = 1e30f, mnY = 1e30f, mnZ = 1e30f; - float mxX = -1e30f, mxY = -1e30f, mxZ = -1e30f; - for (int c = 0; c < root.childCount; c++) { - if (root.childIdx[c] == INT_MIN) continue; - mnX = std::min(mnX, root.childMinX[c]); - mnY = std::min(mnY, root.childMinY[c]); - mnZ = std::min(mnZ, root.childMinZ[c]); - mxX = std::max(mxX, root.childMaxX[c]); - mxY = std::max(mxY, root.childMaxY[c]); - mxZ = std::max(mxZ, root.childMaxZ[c]); - } - rec.aabbMin[0] = mnX; rec.aabbMin[1] = mnY; rec.aabbMin[2] = mnZ; - rec.aabbMax[0] = mxX; rec.aabbMax[1] = mxY; rec.aabbMax[2] = mxZ; - } - - return rec; -} - -void buildBlasesForEntries( - const std::vector& entries, - const std::vector>& entryTriRanges, - std::vector& objTriBuf, - std::vector& blasNodes, - std::vector& blasLeafIndices, - std::vector& blasRecords, - std::vector& tlasInstances, - std::vector& tlasToEntryIdx) { - - auto writeMat3x4 = [](const Matrix4& m, float out[3][4]) { - const auto& e = m.elements; - for (int row = 0; row < 3; ++row) { - out[row][0] = e[row]; - out[row][1] = e[4 + row]; - out[row][2] = e[8 + row]; - out[row][3] = e[12 + row]; - } - }; - - const std::size_t n = std::min(entries.size(), entryTriRanges.size()); - for (std::size_t i = 0; i < n; ++i) { - const auto [triStart, triCount] = entryTriRanges[i]; - if (triCount <= 0) continue; - - const std::uint32_t blasIndex = - static_cast(blasRecords.size()); - BlasRecord rec = buildBlas(objTriBuf, triStart, triCount, - blasNodes, blasLeafIndices); - blasRecords.push_back(rec); - - // Pull matIdx / meshIdx out of the first tri in the slice. All tris - // in one entry share both, so any index in [triStart, triStart+triCount) - // is fine — and buildBlas only reorders within that slice. - const float* first = objTriBuf.data() + static_cast(triStart) * 32; - const auto matIdx = static_cast(first[3]); // field 0.w - const auto meshIdx = static_cast(first[7]); // field 1.w - - TlasInstance inst{}; - writeMat3x4(entries[i].worldMatrix, inst.objToWorld); - Matrix4 inv(entries[i].worldMatrix); - inv.invert(); - writeMat3x4(inv, inst.worldToObj); - inst.blasIndex = blasIndex; - inst.matIdx = matIdx; - inst.meshId = meshIdx; - - // Non-uniform scale flag (bit 0): compare basis-vector lengths from the - // 3x3 upper-left of the world matrix (column-major elements). - const auto& e = entries[i].worldMatrix.elements; - const Vector3 bx(e[0], e[1], e[2]); - const Vector3 by(e[4], e[5], e[6]); - const Vector3 bz(e[8], e[9], e[10]); - const float lx = bx.length(), ly = by.length(), lz = bz.length(); - const float eps = 1e-4f * std::max({lx, ly, lz, 1.f}); - const bool nonUniform = - std::abs(lx - ly) > eps || std::abs(ly - lz) > eps; - inst.flags = nonUniform ? 1u : 0u; - - tlasInstances.push_back(inst); - tlasToEntryIdx.push_back(static_cast(i)); - } -} - }// namespace threepp::wgpu_pt diff --git a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerBvh.hpp b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerBvh.hpp index 32dd4637..df5a9ed6 100644 --- a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerBvh.hpp +++ b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerBvh.hpp @@ -19,19 +19,12 @@ namespace threepp::wgpu_pt { /// Reorders `triBuffer` into BVH leaf order (cycle-permutation, in-place). /// `indices` is repurposed as scratch and also returned as the reorder /// permutation. `leafIndices` lists which BVH4 nodes contain leaf children - /// (needed by the GPU refit pass). - /// - /// If `preserveObjTriOrder` is false (default), `rawObjTriBuf` is reordered - /// alongside `triBuffer` — the single-level BVH path relies on this so its - /// leaf triStart values address both buffers consistently. When true, the - /// object-space buffer is left in its original entry-contiguous order — used - /// by the TLAS/BLAS path where per-entry BLAS records already address - /// `rawObjTriBuf` directly and must not be invalidated. + /// (needed by the GPU refit pass). `rawObjTriBuf` is reordered alongside + /// `triBuffer` so leaf triStart values address both buffers consistently. void buildBVH(std::vector& triBuffer, int triCount, std::vector& wideNodes, std::vector& indices, std::vector& leafIndices, - std::vector& rawObjTriBuf, - bool preserveObjTriOrder = false); + std::vector& rawObjTriBuf); /// Build a BVH for newly appended triangles at indices /// [oldTriCount, oldTriCount+newTriCount). On return `overlayNodes` @@ -45,49 +38,6 @@ namespace threepp::wgpu_pt { std::vector& overlayNodes, std::vector& overlayLeafIndices); - /// Build a BVH4 over a contiguous slice of object-space triangles in - /// `objTriBuf` at [triStartLocal, triStartLocal + triCount). Vertex - /// positions are read from 32-float fields 0/1/2 (xyz of each vertex). - /// - /// Nodes are appended to `blasNodes` (a shared buffer across all BLASes). - /// Emitted leaf `triStart` values are global — they index directly into - /// `objTriBuf`. The object-space slice is reordered in-place to match - /// BVH leaf order; the world-space `triBuffer` is *not* touched (world - /// positions will be regenerated per-frame from `objTriBuf` in the TLAS - /// path once PR2b lands). - /// - /// Returns a `BlasRecord` describing the new BLAS: `rootNodeOffset` - /// (absolute index into `blasNodes`), `nodeCount`, `triStart`/`triCount` - /// within `objTriBuf`, and the object-space root AABB. `leafIndicesOut` - /// receives absolute `blasNodes` indices of nodes with leaf children — - /// needed later for per-BLAS refit. - /// - /// PR2a plumbing — not yet wired into the scene build. Callable in - /// isolation for unit testing. - BlasRecord buildBlas( - std::vector& objTriBuf, - int triStartLocal, - int triCount, - std::vector& blasNodes, - std::vector& leafIndicesOut); - - /// PR2b plumbing — build one BLAS per `RtMeshEntry` with non-zero tris and - /// emit a matching TlasInstance record. `entryTriRanges` comes from - /// buildGeometryBuffers (see its docs); entries with zero tris are skipped. - /// `objTriBuf` is reordered in-place per-slice by each buildBlas call. - /// Outputs are appended to `blasNodes`, `blasLeafIndices`, `blasRecords`, - /// `tlasInstances`, and `tlasToEntryIdx` (per-instance index back into the - /// source `entries` vector — required for per-frame TLAS matrix refresh). - void buildBlasesForEntries( - const std::vector& entries, - const std::vector>& entryTriRanges, - std::vector& objTriBuf, - std::vector& blasNodes, - std::vector& blasLeafIndices, - std::vector& blasRecords, - std::vector& tlasInstances, - std::vector& tlasToEntryIdx); - /// Pack BVH4 nodes into a flat GPU buffer (7 × vec4 = 28 floats per node). /// `capacity` caps the number of nodes written; remaining bytes are zeroed. void packBvh4Buffer(const std::vector& nodes, std::vector& buf, int capacity); diff --git a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerGeometry.hpp b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerGeometry.hpp index 1433d33b..423c5b49 100644 --- a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerGeometry.hpp +++ b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerGeometry.hpp @@ -40,8 +40,8 @@ namespace threepp::wgpu_pt { /// If `entryTriRanges` is non-null it is resized to entries.size() and /// populated with (triStart, triCount) pairs per entry (into triBuffer / /// rawObjTriBuf). Entries skipped due to material/mesh/tri caps get a - /// zero-count range. Used by the TLAS/BLAS builder to locate each - /// entry's triangle slice for per-mesh BLAS construction. + /// zero-count range. Used by the per-frame dirty-geometry fast path to + /// locate each entry's triangle slice in rawObjTriBuf. int buildGeometryBuffers( const std::vector& entries, const std::unordered_map& texSlotMap, diff --git a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerShaders.hpp b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerShaders.hpp index 2a08090b..6e925595 100644 --- a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerShaders.hpp +++ b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerShaders.hpp @@ -26,10 +26,8 @@ namespace threepp::wgpu_pt { extern const char* const csSharedDefsWGSL; extern const char* const vtWGSL_; extern const char* const refitWGSL_; - extern const char* const blasRefitWGSL_; std::string buildVtShader(); std::string buildRefitShader(); - std::string buildBlasRefitShader(); // -- Denoiser pipeline fragments ------------------------------------------- extern const char* const svgfAtrousWGSL; diff --git a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerShaders_Rt.cpp b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerShaders_Rt.cpp index 4f59915a..b15e58b5 100644 --- a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerShaders_Rt.cpp +++ b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerShaders_Rt.cpp @@ -59,33 +59,6 @@ struct Bvh4NodeGpu { cIdx: vec4, // child indices (bitcast to i32 for leaf encoding) } -// PR2c.2b — TLAS/BLAS declarations + binding wiring. Bindings are reachable -// from `sceneHitRaw` through a runtime-gated branch (rt.bvhAux.y == 1u) so -// naga keeps them alive in the bindgroup layout. The branch is never taken -// at runtime in this PR — traversal is still single-level. The nested -// TLAS->BLAS walk that actually uses these bindings lands in PR2c.2c. -struct BlasRecord { - rootNodeOffset: u32, - nodeCount: u32, - triStart: u32, - triCount: u32, - aabbMin: vec4, - aabbMax: vec4, -}; - -struct TlasInstance { - objToWorld_r0: vec4, - objToWorld_r1: vec4, - objToWorld_r2: vec4, - worldToObj_r0: vec4, - worldToObj_r1: vec4, - worldToObj_r2: vec4, - blasIndex: u32, - matIdx: u32, - meshId: u32, - flags: u32, -}; - @group(0) @binding(0) var rt: RtUniforms; // bindings 1 and 2 were the combined main accum (read/write). Removed 2026-04-23 // — normal display reads diffAccum+specAccum directly; combined accum was dead @@ -379,49 +352,6 @@ R"( // retained by the VNDF-sampled GGX estimator (see WgpuPathTracerGgxLut.hpp). @group(0) @binding(50) var ggxELut: texture_2d; -// PR2c.2b — TLAS/BLAS storage buffers. `blasNodes` holds every BLAS's BVH4 -// nodes concatenated back-to-back; `blasRecords[i]` locates BLAS `i` inside -// that buffer and gives its object-space AABB. `tlasInstances[i]` carries -// the world↔object 3x4 matrices and a pointer (blasIndex) to the BLAS this -// instance uses. Consumed by the nested traversal in PR2c.2c; this PR keeps -// them live via a runtime-false branch in `sceneHitRaw` so naga retains them -// in the pipeline layouts. -@group(0) @binding(51) var blasNodes: array; -@group(0) @binding(52) var blasRecords: array; -@group(0) @binding(53) var tlasInstances: array; - -// Object-space triangle data — mirrors the CPU `RawObjTri` record. Populated -// by buildGeometryBuffers into objTriBuf/objTriBuf2 (split to stay below the -// WebGPU max-buffer limit); consumed by the VT pass (world-space triTex) and, -// from PR2c.2c.ii on, by the BLAS leaf intersection in the TLAS traversal path. -struct ObjTriData { - v0: vec4, // .xyz obj-space pos, .w matIdx - v1: vec4, // .xyz obj-space pos, .w meshIdx - v2: vec4, // .xyz obj-space pos, .w unused - n0: vec4, - n1: vec4, - n2: vec4, - uv01: vec4, - uv2: vec4, // .w unused -}; - -// PR2c.2c.i — object-space triangle storage. Split across two buffers -// (`objTris` + `objTris2`) to stay below the WebGPU max-buffer limit; the -// split point `objTriSplit` is delivered via `rt.bvhAux.w` at upload time. -// Used by the nested BLAS traversal in PR2c.2c.ii for obj-space triangle -// intersection. Kept alive in this PR through the `sceneHitRaw` keepalive -// branch so naga retains bindings 54/55 in the pipeline layouts. -@group(0) @binding(54) var objTris: array; -@group(0) @binding(55) var objTris2: array; - -// Fetch an object-space triangle by global index, routing through the -// primary/overflow buffer split. Mirrors the VT compute shader's addressing. -fn loadObjTri(ti: i32) -> ObjTriData { - let splitAt = i32(rt.bvhAux.w); - if (ti < splitAt) { return objTris[ti]; } - return objTris2[ti - splitAt]; -} - // Bilinear sample of the GGX LUT. cos_o and alpha are clamped to (0,1]. // The CPU builder places cells at (i+0.5)/N for cos_o (column) and (i+1)/N // for α (row), so we align UV to those centers before interpolating. @@ -1042,197 +972,9 @@ fn loadShadowHitMaterial(rh: RawHit, ray: Ray) -> ShadowHit { return h; } -// PR2c.2c.ii — TLAS-walking variants of sceneHitRaw / sceneAnyHit. -// Enabled when `rt.bvhAux.y == 1u`, which also signals that the single-level -// `bvhNodes` buffer wasn't built (see tlasEnabled_ in WgpuPathTracer). The -// traversal loops linearly over `tlasInstances`, transforms the world-space -// ray into each instance's object space via worldToObj (3x4 rows), and walks -// that instance's BLAS starting from `blasRecords[blasIndex].rootNodeOffset`. -// -// Ray parameter t is invariant under the (un-normalized) ray transform: -// `t_obj == t_world`. So we can compare and accumulate hits across instances -// using the same `rh.t` without any remap — even under non-uniform scale. -// -// Hit triangles carry the GLOBAL triangle index, same convention as the -// single-level path. `loadHitMaterial` reads world-space data from the -// triData texture which — under `preserveObjTriOrder=true` for tlasEnabled_ — -// is populated by the VT pass in objTris-order, so triData[ti] is the -// world-space version of the same triangle. - -fn testTriangleObj(ray: Ray, ti: i32, rh: ptr) { - let ot = loadObjTri(ti); - let v0 = ot.v0.xyz; - let v1 = ot.v1.xyz; - let v2 = ot.v2.xyz; - let isect = triIntersect(ray, v0, v1, v2); - if (isect.t >= (*rh).t) { return; } - - let matIdx = i32(ot.v0.w); - let mat3 = textureLoad(matData, vec2(matIdx, 3), 0); - - let sideFlag = mat3.z; - if (sideFlag < 0.5) { - let geoNormal = cross(v1 - v0, v2 - v0); - if (dot(ray.dir, geoNormal) > 0.0) { return; } - } else if (sideFlag > 1.5) { - let geoNormal = cross(v1 - v0, v2 - v0); - if (dot(ray.dir, geoNormal) < 0.0) { return; } - } - - let alphaTest = mat3.y; - let blendMode = mat3.w < 0.0; - let opacity = abs(mat3.w); - let needsAlpha = alphaTest > 0.0 || blendMode; - if (needsAlpha) { - var alpha = opacity; - let mat1 = textureLoad(matData, vec2(matIdx, 1), 0); - if (mat1.x >= 0.0) { - let w = 1.0 - isect.u - isect.v; - let iuv0 = vec2(ot.uv01.x, ot.uv01.y) * w - + vec2(ot.uv01.z, ot.uv01.w) * isect.u - + ot.uv2.xy * isect.v; - let tuv = transformUV(iuv0, iuv0, matIdx, 6); - alpha *= sampleAtlasAlpha(tuv, mat1.x); - } - if (alphaTest > 0.0) { - if (alpha < alphaTest) { return; } - } else { - if (alpha >= 0.99) { - // accept - } else if (alpha <= 0.01) { - return; - } else { - let h = pcg(pcg(u32(ti) ^ u32(rt.params.y) * 2654435761u) ^ pcg(bitcast(isect.t))); - let rng = f32(h) / 4294967295.0; - if (rng > alpha) { return; } - } - } - } - - (*rh).t = isect.t; - (*rh).triIdx = ti; - (*rh).u = isect.u; - (*rh).v = isect.v; -} - -fn decodeLeafObj(ci: i32, ray: Ray, rh: ptr) { - let raw = -ci; - let triStart = (raw - 1) / MAX_LEAF_TRIS; - let triCount = ((raw - 1) % MAX_LEAF_TRIS) + 1; - for (var t = triStart; t < triStart + triCount; t++) { - testTriangleObj(ray, t, rh); - } -} - -fn xformRay(inst: TlasInstance, ray: Ray) -> Ray { - var out: Ray; - out.origin = vec3( - dot(inst.worldToObj_r0.xyz, ray.origin) + inst.worldToObj_r0.w, - dot(inst.worldToObj_r1.xyz, ray.origin) + inst.worldToObj_r1.w, - dot(inst.worldToObj_r2.xyz, ray.origin) + inst.worldToObj_r2.w); - out.dir = vec3( - dot(inst.worldToObj_r0.xyz, ray.dir), - dot(inst.worldToObj_r1.xyz, ray.dir), - dot(inst.worldToObj_r2.xyz, ray.dir)); - return out; -} - -fn walkBlas(oRay: Ray, rootOffset: i32, rh: ptr) { - let invD = vec3(1.0) / oRay.dir; - var stack: array; - var top: i32 = 0; - stack[0] = rootOffset; top = 1; - while (top > 0) { - top -= 1; - let nd = blasNodes[stack[top]]; - let dists = aabbDist4(nd, oRay, invD, (*rh).t); - if (all(dists >= vec4(1e30))) { continue; } - - let ci0 = bitcast(nd.cIdx.x); - let ci1 = bitcast(nd.cIdx.y); - let ci2 = bitcast(nd.cIdx.z); - let ci3 = bitcast(nd.cIdx.w); - - if (dists.x < 1e30 && ci0 < 0 && ci0 != EMPTY_CHILD) { decodeLeafObj(ci0, oRay, rh); } - if (dists.y < 1e30 && ci1 < 0 && ci1 != EMPTY_CHILD) { decodeLeafObj(ci1, oRay, rh); } - if (dists.z < 1e30 && ci2 < 0 && ci2 != EMPTY_CHILD) { decodeLeafObj(ci2, oRay, rh); } - if (dists.w < 1e30 && ci3 < 0 && ci3 != EMPTY_CHILD) { decodeLeafObj(ci3, oRay, rh); } - - var n0 = dists.x; var n1 = dists.y; var n2 = dists.z; var n3 = dists.w; - var k0 = ci0; var k1 = ci1; var k2 = ci2; var k3 = ci3; - if (k0 < 0) { n0 = 1e30; } if (k1 < 0) { n1 = 1e30; } - if (k2 < 0) { n2 = 1e30; } if (k3 < 0) { n3 = 1e30; } - var c: bool; var tn: f32; var tk: i32; - c=n0) { - let invD = vec3(1.0) / oRay.dir; - var stack: array; - var top: i32 = 0; - stack[0] = rootOffset; top = 1; - while (top > 0) { - top -= 1; - let nd = blasNodes[stack[top]]; - let dists = aabbDist4(nd, oRay, invD, (*rh).t); - if (all(dists >= vec4(1e30))) { continue; } - - let ci0 = bitcast(nd.cIdx.x); - let ci1 = bitcast(nd.cIdx.y); - let ci2 = bitcast(nd.cIdx.z); - let ci3 = bitcast(nd.cIdx.w); - - if (dists.x < 1e30 && ci0 < 0 && ci0 != EMPTY_CHILD) { decodeLeafObj(ci0, oRay, rh); if ((*rh).triIdx >= 0) { return; } } - if (dists.y < 1e30 && ci1 < 0 && ci1 != EMPTY_CHILD) { decodeLeafObj(ci1, oRay, rh); if ((*rh).triIdx >= 0) { return; } } - if (dists.z < 1e30 && ci2 < 0 && ci2 != EMPTY_CHILD) { decodeLeafObj(ci2, oRay, rh); if ((*rh).triIdx >= 0) { return; } } - if (dists.w < 1e30 && ci3 < 0 && ci3 != EMPTY_CHILD) { decodeLeafObj(ci3, oRay, rh); if ((*rh).triIdx >= 0) { return; } } - - if (dists.x < 1e30 && ci0 >= 0) { stack[top] = ci0; top++; } - if (dists.y < 1e30 && ci1 >= 0) { stack[top] = ci1; top++; } - if (dists.z < 1e30 && ci2 >= 0) { stack[top] = ci2; top++; } - if (dists.w < 1e30 && ci3 >= 0) { stack[top] = ci3; top++; } - } -} - -fn sceneHitRawTlas(ray: Ray, maxT: f32) -> RawHit { - var rh: RawHit; rh.t = maxT; rh.triIdx = -1; - let instCount = i32(rt.bvhAux.z); - for (var i: i32 = 0; i < instCount; i++) { - let inst = tlasInstances[i]; - let oRay = xformRay(inst, ray); - let rec = blasRecords[inst.blasIndex]; - walkBlas(oRay, i32(rec.rootNodeOffset), &rh); - } - return rh; -} - -fn sceneAnyHitTlas(ray: Ray, maxT: f32) -> RawHit { - var rh: RawHit; rh.t = maxT; rh.triIdx = -1; - let instCount = i32(rt.bvhAux.z); - for (var i: i32 = 0; i < instCount; i++) { - let inst = tlasInstances[i]; - let oRay = xformRay(inst, ray); - let rec = blasRecords[inst.blasIndex]; - walkBlasAny(oRay, i32(rec.rootNodeOffset), &rh); - if (rh.triIdx >= 0) { return rh; } - } - return rh; -} fn sceneHitRaw(ray: Ray, maxT: f32) -> RawHit { var rh: RawHit; rh.t = maxT; rh.triIdx = -1; - if (rt.bvhAux.y == 1u) { - return sceneHitRawTlas(ray, maxT); - } let invD = vec3(1.0) / ray.dir; var stack: array; var top: i32 = 0; @@ -1299,9 +1041,6 @@ R"( // No sorting, no closest-hit search. Much faster for large scenes. fn sceneAnyHit(ray: Ray, maxT: f32) -> RawHit { var rh: RawHit; rh.t = maxT; rh.triIdx = -1; - if (rt.bvhAux.y == 1u) { - return sceneAnyHitTlas(ray, maxT); - } let invD = vec3(1.0) / ray.dir; var stack: array; var top: i32 = 0; diff --git a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerShaders_VtRefit.cpp b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerShaders_VtRefit.cpp index 7822cfad..2d2a9f7f 100644 --- a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerShaders_VtRefit.cpp +++ b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerShaders_VtRefit.cpp @@ -193,150 +193,7 @@ fn bvh_refit(@builtin(global_invocation_id) gid: vec3) { } )"; -// --------------------------------------------------------------------------- -// WGSL BLAS refit compute shader (TLAS mode) -// --------------------------------------------------------------------------- -// Refits BLAS leaf AABBs from object-space triangles in `objTris`/`objTris2`, -// then propagates up parent AABBs within each BLAS sub-tree. Propagation -// terminates at BLAS roots (parent == -1), so the single flat `blasNodes` -// buffer is walked correctly without crossing BLAS boundaries. -// -// Separate pipeline from bvh_refit because: -// - reads object-space tri vertices (not the world-space triTex) -// - writes to the `blasNodes` buffer (not `bvhNodes`) -// - needs its own atomic counter + leaf-index + refit-metadata buffers -const char* const blasRefitWGSL_ = R"( -struct ObjTriData { - v0: vec4, - v1: vec4, - v2: vec4, - n0: vec4, - n1: vec4, - n2: vec4, - uv01: vec4, - uv2: vec4, -} -struct Bvh4NodeGpu { - cMinX: vec4, - cMinY: vec4, - cMinZ: vec4, - cMaxX: vec4, - cMaxY: vec4, - cMaxZ: vec4, - cIdx: vec4, -} -struct BlasRefitUniforms { - leafCount: u32, - groupsX: u32, - splitAt: u32, // objTri split point (in tris); tis < splitAt read objTris, else objTris2 - _p: u32, -} - -@group(0) @binding(0) var objTris: array; -@group(0) @binding(1) var objTris2: array; -@group(0) @binding(2) var blasNodes: array; -@group(0) @binding(3) var blasCtrs: array>; -@group(0) @binding(4) var leafIdxBuf: array; -@group(0) @binding(5) var refitUni: BlasRefitUniforms; -@group(0) @binding(6) var refitMeta: array>; - -fn loadObjTri(ti: i32) -> ObjTriData { - let splitAt = i32(refitUni.splitAt); - if (ti < splitAt) { return objTris[ti]; } - return objTris2[ti - splitAt]; -} - -fn writeChildAABB(ni: i32, c: i32, bmin: vec3, bmax: vec3) { - let extent = max(abs(bmin), abs(bmax)); - let E = max(extent * vec3(1e-5), vec3(1e-6)); - var n = blasNodes[ni]; - n.cMinX[c] = bmin.x - E.x; - n.cMinY[c] = bmin.y - E.y; - n.cMinZ[c] = bmin.z - E.z; - n.cMaxX[c] = bmax.x + E.x; - n.cMaxY[c] = bmax.y + E.y; - n.cMaxZ[c] = bmax.z + E.z; - blasNodes[ni] = n; -} - -@compute @workgroup_size(64) -fn blas_refit(@builtin(global_invocation_id) gid: vec3) { - let linearId = gid.x + gid.y * refitUni.groupsX * 64u; - if (linearId >= refitUni.leafCount) { return; } - let wideNi = leafIdxBuf[i32(linearId)]; - let nfo = refitMeta[wideNi]; - let childCount = nfo.y; - - let cIdxVec = blasNodes[wideNi].cIdx; - let leafIdx = array(bitcast(cIdxVec.x), bitcast(cIdxVec.y), - bitcast(cIdxVec.z), bitcast(cIdxVec.w)); - - for (var c: i32 = 0; c < childCount; c++) { - let ci = leafIdx[c]; - if (ci >= 0) { continue; } - - let raw = -ci; - let triStart = (raw - 1) / MAX_LEAF_TRIS; - let triCount = ((raw - 1) % MAX_LEAF_TRIS) + 1; - - var bmin = vec3(1e30); - var bmax = vec3(-1e30); - for (var ti = triStart; ti < triStart + triCount; ti++) { - let ot = loadObjTri(ti); - bmin = min(bmin, ot.v0.xyz); bmax = max(bmax, ot.v0.xyz); - bmin = min(bmin, ot.v1.xyz); bmax = max(bmax, ot.v1.xyz); - bmin = min(bmin, ot.v2.xyz); bmax = max(bmax, ot.v2.xyz); - } - writeChildAABB(wideNi, c, bmin, bmax); - } - - let numInternal = nfo.z; - if (numInternal > 0) { - let cnt = atomicAdd(&blasCtrs[wideNi], 1u); - if (cnt < u32(numInternal)) { return; } - } - - // Propagate up to parent — parent == -1 is the BLAS root, stop there. - var curNi = nfo.x; - loop { - if (curNi < 0) { break; } - let curNfo = refitMeta[curNi]; - let numInt = u32(curNfo.z); - let cnt = atomicAdd(&blasCtrs[curNi], 1u); - let curChildCount = curNfo.y; - let hasLeaves = u32(curChildCount) > numInt; - let expected = select(numInt - 1u, numInt, hasLeaves); - if (cnt < expected) { break; } - - let pIdxVec = blasNodes[curNi].cIdx; - let pIdx = array(bitcast(pIdxVec.x), bitcast(pIdxVec.y), - bitcast(pIdxVec.z), bitcast(pIdxVec.w)); - for (var c: i32 = 0; c < curChildCount; c++) { - let ci = pIdx[c]; - if (ci < 0) { continue; } - let child = blasNodes[ci]; - let cc = refitMeta[ci].y; - var bmin = vec3(1e30); - var bmax = vec3(-1e30); - let cMinXa = array(child.cMinX.x, child.cMinX.y, child.cMinX.z, child.cMinX.w); - let cMinYa = array(child.cMinY.x, child.cMinY.y, child.cMinY.z, child.cMinY.w); - let cMinZa = array(child.cMinZ.x, child.cMinZ.y, child.cMinZ.z, child.cMinZ.w); - let cMaxXa = array(child.cMaxX.x, child.cMaxX.y, child.cMaxX.z, child.cMaxX.w); - let cMaxYa = array(child.cMaxY.x, child.cMaxY.y, child.cMaxY.z, child.cMaxY.w); - let cMaxZa = array(child.cMaxZ.x, child.cMaxZ.y, child.cMaxZ.z, child.cMaxZ.w); - for (var gc: i32 = 0; gc < cc; gc++) { - bmin = min(bmin, vec3(cMinXa[gc], cMinYa[gc], cMinZa[gc])); - bmax = max(bmax, vec3(cMaxXa[gc], cMaxYa[gc], cMaxZa[gc])); - } - writeChildAABB(curNi, c, bmin, bmax); - } - curNi = curNfo.x; - } -} -)"; - std::string buildVtShader() { return std::string(csSharedDefsWGSL) + "\n" + vtWGSL_; } std::string buildRefitShader() { return std::string(csSharedDefsWGSL) + "\n" + refitWGSL_; } -std::string buildBlasRefitShader() { return std::string(csSharedDefsWGSL) + "\n" + blasRefitWGSL_; } }// namespace threepp::wgpu_pt diff --git a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerTypes.hpp b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerTypes.hpp index 11f765a1..a79e8ba2 100644 --- a/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerTypes.hpp +++ b/src/threepp/renderers/wgpu/pathtracer/WgpuPathTracerTypes.hpp @@ -142,37 +142,6 @@ namespace threepp::wgpu_pt { int parent; }; - // ----------------------------------------------------------------------- - // TLAS/BLAS records (PR1 plumbing — unused until PR2 wires traversal) - // ----------------------------------------------------------------------- - // One per unique Geometry* — indexes into the shared bvhNodeBuf and object- - // space triangle buffer. `triStart` is the global tri-buffer offset of this - // mesh's first triangle; BLAS leaves encode triStart relative to 0 within - // the BLAS and the shader adds this offset when fetching. - struct alignas(16) BlasRecord { - std::uint32_t rootNodeOffset; // first Bvh4Node index belonging to this BLAS - std::uint32_t nodeCount; - std::uint32_t triStart; // global offset into triBuf/triTex - std::uint32_t triCount; - float aabbMin[4]; // BLAS root AABB in object space (w unused) - float aabbMax[4]; - }; - static_assert(sizeof(BlasRecord) == 48, "BlasRecord must be 48 bytes"); - - // One per mesh instance. `objToWorld` / `worldToObj` stored as mat3x4 - // (three vec4 rows, 48 B each) — std430-safe and saves 16 B over mat4x4. - // The shader transforms incoming world-space rays into BLAS object space - // with `worldToObj`, then returns hit position via `objToWorld`. - struct alignas(16) TlasInstance { - float objToWorld[3][4]; // 48 B — rows of mat3x4 (rotation*scale + translation) - float worldToObj[3][4]; // 48 B — inverse - std::uint32_t blasIndex; // index into blasRecords buffer - std::uint32_t matIdx; // material offset for this instance - std::uint32_t meshId; // stable scene-mesh identifier - std::uint32_t flags; // bit 0 = non-uniform scale (needs |det| for t-remap) - }; - static_assert(sizeof(TlasInstance) == 112, "TlasInstance must be 112 bytes"); - // ----------------------------------------------------------------------- // PingPong — ping-pong texture pair with read/write pointer aliases // -----------------------------------------------------------------------