diff --git a/.github/workflows/openxr_engine_ci.yml b/.github/workflows/openxr_engine_ci.yml new file mode 100644 index 00000000..f118382f --- /dev/null +++ b/.github/workflows/openxr_engine_ci.yml @@ -0,0 +1,189 @@ +name: OpenXR Engine CI + +on: + push: + paths: + - 'attachments/openxr_engine/**' + - 'attachments/simple_engine/**' + - '.github/workflows/openxr_engine_ci.yml' + pull_request: + paths: + - 'attachments/openxr_engine/**' + - 'attachments/simple_engine/**' + - '.github/workflows/openxr_engine_ci.yml' + workflow_dispatch: + +jobs: + desktop: + name: Build (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest] + + defaults: + run: + working-directory: attachments/openxr_engine + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Clang + Ninja + ccache (Linux) + if: runner.os == 'Linux' + shell: bash + run: | + set -euo pipefail + sudo apt-get update + sudo apt-get install -y clang ninja-build ccache libopenxr-dev + + - name: Select Clang toolchain (Linux) + if: runner.os == 'Linux' + shell: bash + run: | + set -euo pipefail + echo "CC=clang" >> "$GITHUB_ENV" + echo "CXX=clang++" >> "$GITHUB_ENV" + + - name: Set up MSVC dev environment + if: runner.os == 'Windows' + uses: ilammy/msvc-dev-cmd@v1 + + - name: Set up Ninja + sccache + if: runner.os == 'Windows' + shell: pwsh + run: | + choco install -y ninja sccache + $chocoBin = "C:\ProgramData\chocolatey\bin" + if (Test-Path $chocoBin) { + $chocoBin | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + } + "SCCACHE_DIR=$env:LOCALAPPDATA\Mozilla\sccache" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + + - name: ccache (Linux) + if: runner.os == 'Linux' + uses: actions/cache@v4 + with: + path: ~/.cache/ccache + key: ${{ runner.os }}-openxr-ccache-${{ github.sha }} + restore-keys: ${{ runner.os }}-openxr-ccache- + + - name: sccache (Windows) + if: runner.os == 'Windows' + uses: 
actions/cache@v4 + with: + path: ${{ env.SCCACHE_DIR }} + key: ${{ runner.os }}-openxr-sccache-${{ github.sha }} + restore-keys: ${{ runner.os }}-openxr-sccache- + + - name: Cache Vulkan SDK (Windows) + if: runner.os == 'Windows' + id: cache-vulkan-windows + uses: actions/cache@v4 + with: + path: C:\VulkanSDK + key: ${{ runner.os }}-vulkan-sdk + + - name: Install Vulkan SDK (Windows) + if: runner.os == 'Windows' + shell: pwsh + run: | + $ErrorActionPreference = 'Stop' + if ("${{ steps.cache-vulkan-windows.outputs.cache-hit }}" -ne "true") { + choco install -y aria2 + $installer = Join-Path $env:TEMP "vulkan-sdk.exe" + aria2c --split=8 --max-connection-per-server=8 --min-split-size=1M --dir="$env:TEMP" --out="vulkan-sdk.exe" "https://sdk.lunarg.com/sdk/download/latest/windows/vulkan-sdk.exe" + Start-Process -FilePath $installer -ArgumentList "--accept-licenses --default-answer --confirm-command install" -Wait -NoNewWindow + } + $vulkanPath = Get-ChildItem "C:\VulkanSDK" | Sort-Object -Property Name -Descending | Select-Object -First 1 -ExpandProperty FullName + "VULKAN_SDK=$vulkanPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + "$vulkanPath\Bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + "CMAKE_PREFIX_PATH=$vulkanPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + + - name: Install Vulkan SDK (Linux) + if: runner.os == 'Linux' + shell: bash + run: | + set -euo pipefail + sudo apt-get update + sudo apt-get install -y curl xz-utils + VULKAN_VERSION=$(curl -s https://vulkan.lunarg.com/sdk/latest/linux.txt) + SDK_TGZ="${RUNNER_TEMP}/vulkansdk-linux.tar.xz" + curl -L -o "$SDK_TGZ" "https://sdk.lunarg.com/sdk/download/$VULKAN_VERSION/linux/vulkansdk-linux-x86_64-$VULKAN_VERSION.tar.xz" + mkdir -p "${RUNNER_TEMP}/VulkanSDK" + tar -xJf "$SDK_TGZ" -C "${RUNNER_TEMP}/VulkanSDK" + VULKAN_SDK_PATH=$(find "${RUNNER_TEMP}/VulkanSDK" -maxdepth 1 -type d -name '1.*' | head -n 1) + SDK_SYSROOT="$VULKAN_SDK_PATH/x86_64" + echo 
"VULKAN_SDK=$VULKAN_SDK_PATH" >> "$GITHUB_ENV" + echo "VULKAN_SDK_SYSROOT=$SDK_SYSROOT" >> "$GITHUB_ENV" + echo "$SDK_SYSROOT/bin" >> "$GITHUB_PATH" + echo "CMAKE_PREFIX_PATH=$SDK_SYSROOT" >> "$GITHUB_ENV" + echo "Vulkan_INCLUDE_DIR=$SDK_SYSROOT/include" >> "$GITHUB_ENV" + + # Use the engine's dependency install scripts instead of calling vcpkg directly in CI. + - name: Cache vcpkg (Windows) + if: runner.os == 'Windows' + id: cache-vcpkg-windows + uses: actions/cache@v4 + with: + path: | + ${{ runner.temp }}/vcpkg + ${{ runner.temp }}/vcpkg-cache + key: ${{ runner.os }}-openxr-vcpkg-${{ hashFiles('attachments/simple_engine/vcpkg.json') }} + + - name: Bootstrap vcpkg (Windows) + if: runner.os == 'Windows' && steps.cache-vcpkg-windows.outputs.cache-hit != 'true' + shell: pwsh + run: | + $ErrorActionPreference = 'Stop' + $vcpkgRoot = Join-Path $env:RUNNER_TEMP "vcpkg" + if (-not (Test-Path $vcpkgRoot)) { + git clone https://github.com/microsoft/vcpkg $vcpkgRoot + } + Push-Location $vcpkgRoot + .\bootstrap-vcpkg.bat + Pop-Location + "VCPKG_INSTALLATION_ROOT=$vcpkgRoot" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + "$vcpkgRoot" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + "CMAKE_TOOLCHAIN_FILE=$vcpkgRoot\scripts\buildsystems\vcpkg.cmake" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + + - name: Set vcpkg env (Windows) + if: runner.os == 'Windows' + shell: pwsh + run: | + $vcpkgRoot = Join-Path $env:RUNNER_TEMP "vcpkg" + $vcpkgCache = Join-Path $env:RUNNER_TEMP "vcpkg-cache" + if (-not (Test-Path $vcpkgCache)) { + New-Item -Path $vcpkgCache -ItemType Directory + } + "VCPKG_INSTALLATION_ROOT=$vcpkgRoot" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + "$vcpkgRoot" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + "CMAKE_TOOLCHAIN_FILE=$vcpkgRoot\scripts\buildsystems\vcpkg.cmake" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + "VCPKG_DEFAULT_BINARY_CACHE=$vcpkgCache" | Out-File 
-FilePath $env:GITHUB_ENV -Encoding utf8 -Append + "VCPKG_BINARY_SOURCES=clear;files,$vcpkgCache,readwrite" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + + - name: Install dependencies (Windows) + if: runner.os == 'Windows' + shell: cmd + run: | + call install_dependencies_windows.bat + + - name: Install dependencies (Linux) + if: runner.os == 'Linux' + shell: bash + run: | + bash install_dependencies_linux.sh + + - name: Configure (Windows) + if: runner.os == 'Windows' + run: | + cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE="$env:CMAKE_TOOLCHAIN_FILE" -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache + + - name: Configure (Linux) + if: runner.os == 'Linux' + run: | + cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_PREFIX_PATH="/home/runner/.local;${{ env.VULKAN_SDK_SYSROOT }}" + + - name: Build + run: cmake --build build --target OpenXREngine --parallel 4 diff --git a/antora/modules/ROOT/nav.adoc b/antora/modules/ROOT/nav.adoc index 89c2dc6f..19625144 100644 --- a/antora/modules/ROOT/nav.adoc +++ b/antora/modules/ROOT/nav.adoc @@ -149,3 +149,104 @@ *** xref:Building_a_Simple_Engine/Advanced_Topics/Robustness2.adoc[Robustness2] ** Appendix *** xref:Building_a_Simple_Engine/Appendix/appendix.adoc[Appendix] + +* OpenXR and Vulkan 1.4 Spatial Computing +** xref:OpenXR_Vulkan_Spatial_Computing/introduction.adoc[Introduction] +** The OpenXR-Vulkan 1.4 Handshake +*** xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/02_system_integration.adoc[System Integration] +*** xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/03_hardware_alignment_luid.adoc[Hardware Alignment (LUID)] +*** 
xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/04_vulkan_1_4_feature_requirements.adoc[Vulkan 1.4 Feature Requirements] +*** xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/05_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Runtime-Owned Swapchains +*** xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/02_external_image_negotiation.adoc[External Image Negotiation] +*** xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/03_raii_resource_integration.adoc[RAII Resource Integration] +*** xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/04_memory_ownership_lifecycle.adoc[Memory Ownership Lifecycle] +*** xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/05_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Dynamic Rendering for Spatial Views +*** xref:OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/02_rendering_to_spatial_swapchains.adoc[Rendering into Spatial Swapchains] +*** xref:OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/03_stereo_viewport_scissor.adoc[Stereo Viewport & Scissor Management] +*** xref:OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** The Predictive Frame Loop +*** xref:OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/02_xr_lifecycle.adoc[The XR Lifecycle] +*** xref:OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/03_display_time_prediction.adoc[Display Time Prediction] +*** xref:OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Late 
Latching +*** xref:OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/02_last_second_update.adoc[The Last-Second Update] +*** xref:OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/03_implementation.adoc[Implementation] +*** xref:OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Action Spaces and Input +*** xref:OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/02_openxr_action_system.adoc[The OpenXR Action System] +*** xref:OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/03_space_manifolds.adoc[Space Manifolds] +*** xref:OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Slang for Spatial Shaders +*** xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/02_native_multiview.adoc[Native Multiview (N=2)] +*** xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/03_slang_architecture.adoc[Slang Architecture] +*** xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Quad-Views and Foveated Rendering +*** xref:OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/02_primary_stereo_with_insets.adoc[Primary Stereo with Insets] +*** xref:OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/03_multi_layer_composition.adoc[Multi-Layer Composition] +*** xref:OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Variable Rate Shading +*** 
xref:OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/02_fragment_density_control.adoc[Fragment Density Control] +*** xref:OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/03_gaze_driven_logic.adoc[Gaze-Driven Logic] +*** xref:OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Canted Displays +*** xref:OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/02_non_parallel_projections.adoc[Non-Parallel Projections] +*** xref:OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/03_viewport_swizzling.adoc[Viewport Swizzling] +*** xref:OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** CAVE Architecture +*** xref:OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/02_projector_based_spatial_tech.adoc[Projector-Based Spatial Tech] +*** xref:OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/03_hardware_sync.adoc[Hardware Sync] +*** xref:OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Warp and Blend +*** xref:OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/02_geometric_correction.adoc[Geometric Correction] +*** xref:OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/03_post_process_warping.adoc[Post-Process Warping] +*** xref:OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** LightField Theory +*** 
xref:OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/02_4d_lightfield_representation.adoc[4D LightField Representation] +*** xref:OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/03_high_density_view_arrays.adoc[High-Density View Arrays] +*** xref:OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Plenoptic Synthesis +*** xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/02_synthesis_shaders.adoc[Synthesis Shaders] +*** xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/03_ray_traced_synthesis.adoc[Ray Traced Synthesis] +*** xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Scene Understanding +*** xref:OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/02_environmental_ingestion.adoc[Environmental Ingestion] +*** xref:OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/03_zero_copy_hand_off.adoc[Zero-Copy Hand-off] +*** xref:OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** ML Inference for Spatial Data +*** xref:OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/02_on_gpu_inference.adoc[On-GPU Inference] +*** xref:OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/03_refining_spatial_data.adoc[Refining Spatial Data] +*** xref:OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Semantic Occlusion +*** 
xref:OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/02_ml_driven_segmentation.adoc[ML-Driven Segmentation] +*** xref:OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/03_per_pixel_masking.adoc[Per-Pixel Masking] +*** xref:OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Platform Divergence +*** xref:OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/02_desktop_high_end.adoc[Desktop High-End] +*** xref:OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/03_mobile_mastery.adoc[Mobile Mastery] +*** xref:OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** Spatial Diagnostics +*** xref:OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/01_introduction.adoc[Introduction] +*** xref:OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/02_spatial_debugging.adoc[Spatial Debugging] +*** xref:OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/03_automated_qa.adoc[Automated QA] +*** xref:OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/04_incorporating_into_the_engine.adoc[Incorporating into the Engine] +** xref:OpenXR_Vulkan_Spatial_Computing/conclusion.adoc[Conclusion] diff --git a/attachments/openxr_engine/CMakeLists.txt b/attachments/openxr_engine/CMakeLists.txt new file mode 100644 index 00000000..f8f2388d --- /dev/null +++ b/attachments/openxr_engine/CMakeLists.txt @@ -0,0 +1,254 @@ +cmake_minimum_required(VERSION 3.29) + +project(OpenXREngine VERSION 1.0.0 LANGUAGES CXX C) + +# Option to enable/disable Vulkan C++20 module support for this standalone project +option(ENABLE_CPP20_MODULE "Enable C++ 20 module support for Vulkan in OpenXREngine" OFF) + +# Enable C++ 
module dependency scanning only when modules are enabled +if(ENABLE_CPP20_MODULE) + set(CMAKE_CXX_SCAN_FOR_MODULES ON) +endif() + +# Path to the base Simple Engine +set(SIMPLE_ENGINE_DIR "${CMAKE_CURRENT_LIST_DIR}/../simple_engine") + +# Add CMake module path for custom find modules +list(APPEND CMAKE_MODULE_PATH "${SIMPLE_ENGINE_DIR}/CMake") + +# Find required packages +find_package (glm REQUIRED) +find_package (Vulkan REQUIRED) +find_package (tinygltf REQUIRED) +find_package (KTX REQUIRED) + +# Find or download Vulkan-Hpp headers matching the Vulkan SDK/NDK version +find_package(VulkanHpp REQUIRED) + +# OpenXR is REQUIRED for this engine variant +find_package(OpenXR REQUIRED) +if (NOT TARGET OpenXR::OpenXR AND TARGET OpenXR::openxr_loader) + add_library(OpenXR::OpenXR ALIAS OpenXR::openxr_loader) +endif() + +if(ENABLE_CPP20_MODULE) + # Set up Vulkan C++ module + add_library(VulkanCppModule) + add_library(Vulkan::cppm ALIAS VulkanCppModule) + + target_compile_definitions(VulkanCppModule + PUBLIC VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1 VULKAN_HPP_NO_STRUCT_CONSTRUCTORS=1 + ) + target_include_directories(VulkanCppModule + PUBLIC + "${Vulkan_INCLUDE_DIR}" + "${VulkanHpp_INCLUDE_DIRS}" + ) + target_link_libraries(VulkanCppModule + PUBLIC + Vulkan::Vulkan + ) + + set_target_properties(VulkanCppModule PROPERTIES CXX_STANDARD 20) + + target_sources(VulkanCppModule + PUBLIC + FILE_SET cxx_modules TYPE CXX_MODULES + BASE_DIRS + "${VulkanHpp_CPPM_DIR}" + FILES + "${VulkanHpp_CPPM_DIR}/vulkan/vulkan.cppm" + ) + + # MSVC-specific options to improve module support + if(MSVC) + target_compile_options(VulkanCppModule PRIVATE + /std:c++latest + /permissive- + /Zc:__cplusplus + /EHsc + /Zc:preprocessor + ) + endif() +else() + add_library(VulkanCppModule INTERFACE) + add_library(Vulkan::cppm ALIAS VulkanCppModule) + target_link_libraries(VulkanCppModule INTERFACE Vulkan::Vulkan) + target_compile_definitions(VulkanCppModule + INTERFACE VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1 
VULKAN_HPP_NO_STRUCT_CONSTRUCTORS=1
+    )
+    target_include_directories(VulkanCppModule INTERFACE "${VulkanHpp_INCLUDE_DIRS}")
+endif()
+
+# Platform-specific settings
+if(ANDROID)
+    # Android-specific settings
+    add_definitions(-DPLATFORM_ANDROID)
+    find_package(game-activity REQUIRED CONFIG)
+else()
+    # Desktop-specific settings
+    add_definitions(-DPLATFORM_DESKTOP)
+    find_package(glfw3 REQUIRED)
+    find_package(OpenAL REQUIRED)
+endif()
+
+# Shader compilation (using local shaders in openxr_engine)
+file(GLOB SLANG_SHADER_SOURCES ${CMAKE_CURRENT_LIST_DIR}/shaders/*.slang)
+list(FILTER SLANG_SHADER_SOURCES EXCLUDE REGEX ".*/(common_types|pbr_utils|lighting_utils|tonemapping_utils)\\.slang$")
+
+# Find slangc executable
+find_program(SLANGC_EXECUTABLE slangc HINTS $ENV{VULKAN_SDK}/bin)
+
+if(SLANGC_EXECUTABLE)
+    # Ensure the output directory for compiled shaders exists
+    file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/shaders)
+
+    # Compile Slang shaders using slangc
+    foreach(SHADER ${SLANG_SHADER_SOURCES})
+        get_filename_component(SHADER_NAME ${SHADER} NAME)
+        string(REGEX REPLACE "\\.slang$" "" OUTPUT_NAME ${SHADER_NAME})
+        add_custom_command(
+            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/shaders/${OUTPUT_NAME}.spv
+            COMMAND ${SLANGC_EXECUTABLE} ${SHADER} -target spirv -profile spirv_1_4 -emit-spirv-directly -o ${CMAKE_CURRENT_BINARY_DIR}/shaders/${OUTPUT_NAME}.spv
+            DEPENDS ${SHADER}
+            COMMENT "Compiling Slang shader ${SHADER_NAME} with slangc"
+        )
+        list(APPEND SHADER_SPVS ${CMAKE_CURRENT_BINARY_DIR}/shaders/${OUTPUT_NAME}.spv)
+    endforeach()
+
+    add_custom_target(shaders DEPENDS ${SHADER_SPVS})
+else()
+    add_custom_target(shaders)
+endif()
+
+# Source files
+# We reference most files from Simple Engine via SIMPLE_ENGINE_DIR.
+# Local files are those modified for OpenXR support.
+set(SOURCES_COMMON + engine.cpp # OpenXR version + renderer_core.cpp # OpenXR version + renderer_rendering.cpp # OpenXR version + renderer_resources.cpp # OpenXR version + camera_component.cpp # OpenXR version + xr_context.cpp # OpenXR only + + ${SIMPLE_ENGINE_DIR}/scene_loading.cpp + ${SIMPLE_ENGINE_DIR}/platform.cpp + renderer_pipelines.cpp + ${SIMPLE_ENGINE_DIR}/renderer_compute.cpp + ${SIMPLE_ENGINE_DIR}/renderer_utils.cpp + ${SIMPLE_ENGINE_DIR}/renderer_ray_query.cpp + memory_pool.cpp + ${SIMPLE_ENGINE_DIR}/resource_manager.cpp + ${SIMPLE_ENGINE_DIR}/entity.cpp + ${SIMPLE_ENGINE_DIR}/component.cpp + ${SIMPLE_ENGINE_DIR}/transform_component.cpp + ${SIMPLE_ENGINE_DIR}/mesh_component.cpp + ${SIMPLE_ENGINE_DIR}/animation_component.cpp + ${SIMPLE_ENGINE_DIR}/model_loader.cpp + ${SIMPLE_ENGINE_DIR}/audio_system.cpp + ${SIMPLE_ENGINE_DIR}/physics_system.cpp + ${SIMPLE_ENGINE_DIR}/imgui_system.cpp + ${SIMPLE_ENGINE_DIR}/imgui/imgui.cpp + ${SIMPLE_ENGINE_DIR}/imgui/imgui_draw.cpp + ${SIMPLE_ENGINE_DIR}/vulkan_device.cpp + ${SIMPLE_ENGINE_DIR}/pipeline.cpp + ${SIMPLE_ENGINE_DIR}/descriptor_manager.cpp + ${SIMPLE_ENGINE_DIR}/renderdoc_debug_system.cpp + ${SIMPLE_ENGINE_DIR}/mikktspace.c +) + +set(SOURCES_DESKTOP + main.cpp +) + +# Create target +if (ANDROID) + add_library(OpenXREngine STATIC ${SOURCES_COMMON}) +else () + add_executable(OpenXREngine ${SOURCES_COMMON} ${SOURCES_DESKTOP}) +endif () + +add_dependencies(OpenXREngine shaders) +set_target_properties (OpenXREngine PROPERTIES CXX_STANDARD 20) + +# Include directories +target_include_directories(OpenXREngine PRIVATE + . 
+    ${SIMPLE_ENGINE_DIR}
+    ${SIMPLE_ENGINE_DIR}/imgui
+)
+
+# Enable required defines
+target_compile_definitions(OpenXREngine PRIVATE
+    GLM_ENABLE_EXPERIMENTAL
+    _USE_MATH_DEFINES
+    VULKAN_HPP_NO_STRUCT_CONSTRUCTORS
+    VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1
+)
+
+# Link libraries
+target_link_libraries(OpenXREngine PUBLIC
+    Vulkan::cppm
+    glm::glm
+    tinygltf::tinygltf
+    KTX::ktx
+    OpenXR::OpenXR
+)
+
+if (ANDROID)
+    target_link_libraries(OpenXREngine PUBLIC game-activity::game-activity OpenSLES android log)
+else ()
+    target_link_libraries(OpenXREngine PRIVATE glfw OpenAL::OpenAL)
+endif()
+
+# Windows/MSVC portability and build settings
+if(MSVC)
+    target_compile_definitions(OpenXREngine PRIVATE
+        NOMINMAX
+        WIN32_LEAN_AND_MEAN
+        _CRT_SECURE_NO_WARNINGS
+    )
+    target_compile_options(OpenXREngine PRIVATE
+        /permissive-
+        /Zc:__cplusplus
+        /EHsc
+        /W3
+        /MP
+        /bigobj
+    )
+    target_link_libraries(OpenXREngine PRIVATE Dbghelp)
+elseif(WIN32)
+    target_compile_definitions(OpenXREngine PRIVATE
+        NOMINMAX
+        WIN32_LEAN_AND_MEAN
+        _CRT_SECURE_NO_WARNINGS
+    )
+endif()
+
+# Copy Assets from Simple Engine
+if(EXISTS ${SIMPLE_ENGINE_DIR}/Assets)
+    if (NOT ANDROID)
+        add_custom_command(TARGET OpenXREngine POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E copy_directory ${SIMPLE_ENGINE_DIR}/Assets ${CMAKE_CURRENT_BINARY_DIR}/Assets
+            COMMENT "Copying Assets from Simple Engine to output directory"
+        )
+    endif()
+endif ()
+
+# Packaging configuration
+# CPACK_* variables must be set BEFORE include(CPack); values set after the
+# include are ignored by CPack.
+set(CPACK_PACKAGE_NAME "OpenXREngine")
+set(CPACK_PACKAGE_VENDOR "OpenXREngine Team")
+set(CPACK_PACKAGE_VERSION "1.0.0")
+include(CPack)
+
+if (NOT ANDROID)
+    install(TARGETS OpenXREngine DESTINATION bin)
+    if(SLANGC_EXECUTABLE)
+        install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/shaders DESTINATION share/OpenXREngine)
+    endif()
+    # Assets are copied from Simple Engine
+    if(EXISTS ${SIMPLE_ENGINE_DIR}/Assets)
+        install(DIRECTORY ${SIMPLE_ENGINE_DIR}/Assets DESTINATION share/OpenXREngine)
+    endif()
+endif ()
diff --git 
a/attachments/openxr_engine/README.adoc b/attachments/openxr_engine/README.adoc new file mode 100644 index 00000000..6eec9ef5 --- /dev/null +++ b/attachments/openxr_engine/README.adoc @@ -0,0 +1,13 @@ += OpenXR Engine + +This directory contains a variant of the Simple Engine that incorporates OpenXR support for Vulkan. + +== Dependency + +This project depends on the `simple_engine` located in the sibling directory `../simple_engine`. +The `CMakeLists.txt` is configured to pull shared source files, headers, and assets directly from the `simple_engine` directory to avoid redundancy. + +== Building + +To build this project, ensure that you have the Vulkan SDK and OpenXR SDK installed. +Since it references files from `simple_engine`, do not move this directory relative to the `simple_engine` directory. diff --git a/attachments/openxr_engine/camera_component.cpp b/attachments/openxr_engine/camera_component.cpp new file mode 100644 index 00000000..1e6ecc84 --- /dev/null +++ b/attachments/openxr_engine/camera_component.cpp @@ -0,0 +1,142 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "camera_component.h"
+#include "xr_context.h"
+
+#include "entity.h"
+#include <cmath>
+
+// Most of the CameraComponent class implementation is in the header file
+// This file is mainly for any methods that might need additional implementation
+//
+// This implementation corresponds to the Camera_Transformations chapter in the tutorial:
+// @see en/Building_a_Simple_Engine/Camera_Transformations/03_camera_implementation.adoc
+
+// Initializes the camera by updating the view and projection matrices
+// @see en/Building_a_Simple_Engine/Camera_Transformations/03_camera_implementation.adoc#camera-initialization
+void CameraComponent::Initialize()
+{
+    UpdateViewMatrix();
+    UpdateProjectionMatrix();
+}
+
+// Returns the view matrix, updating it if necessary
+// @see en/Building_a_Simple_Engine/Camera_Transformations/03_camera_implementation.adoc#accessing-camera-matrices
+const glm::mat4 &CameraComponent::GetViewMatrix()
+{
+    if (viewMatrixDirty)
+    {
+        UpdateViewMatrix();
+    }
+    return viewMatrix;
+}
+
+// Returns the projection matrix, updating it if necessary
+// @see en/Building_a_Simple_Engine/Camera_Transformations/03_camera_implementation.adoc#accessing-camera-matrices
+const glm::mat4 &CameraComponent::GetProjectionMatrix()
+{
+    if (projectionMatrixDirty)
+    {
+        UpdateProjectionMatrix();
+    }
+    return projectionMatrix;
+}
+
+// Updates the view matrix based on the camera's position and orientation
+// @see en/Building_a_Simple_Engine/Camera_Transformations/04_transformation_matrices.adoc#view-matrix
+void CameraComponent::UpdateViewMatrix()
+{
+    // NOTE(review): template argument reconstructed (angle-bracket content was
+    // stripped in this paste); the result is used via GetPosition()/GetRotation(),
+    // so TransformComponent is the grounded choice — confirm against original.
+    auto transformComponent = owner->GetComponent<TransformComponent>();
+    if (transformComponent)
+    {
+        // Build camera world transform (T * R) from the camera entity's transform
+        // and compute the view matrix as its inverse. This ensures consistency
+        // with rasterization and avoids relying on an external target vector.
+ const glm::vec3 position = transformComponent->GetPosition(); + const glm::vec3 euler = transformComponent->GetRotation(); // radians + + const glm::quat qx = glm::angleAxis(euler.x, glm::vec3(1.0f, 0.0f, 0.0f)); + const glm::quat qy = glm::angleAxis(euler.y, glm::vec3(0.0f, 1.0f, 0.0f)); + const glm::quat qz = glm::angleAxis(euler.z, glm::vec3(0.0f, 0.0f, 1.0f)); + const glm::quat q = qz * qy * qx; // match TransformComponent's ZYX composition + + const glm::mat4 T = glm::translate(glm::mat4(1.0f), position); + const glm::mat4 R = glm::mat4_cast(q); + const glm::mat4 worldNoScale = T * R; + + viewMatrix = glm::inverse(worldNoScale); + } + else + { + // Fallback: default camera at origin looking towards +Z with Y up + // Note: keep consistent with right-handed convention used elsewhere + const glm::vec3 position(0.0f); + const glm::vec3 forward(0.0f, 0.0f, 1.0f); + const glm::vec3 upVec(0.0f, 1.0f, 0.0f); + viewMatrix = glm::lookAt(position, position + forward, upVec); + } + viewMatrixDirty = false; +} + +// Updates the projection matrix based on the camera's projection type and parameters +// @see en/Building_a_Simple_Engine/Camera_Transformations/04_transformation_matrices.adoc#projection-matrix +void CameraComponent::UpdateProjectionMatrix() +{ + if (projectionType == ProjectionType::Perspective) + { + projectionMatrix = glm::perspective(glm::radians(fieldOfView), aspectRatio, nearPlane, farPlane); + } + else + { + float halfWidth = orthoWidth * 0.5f; + float halfHeight = orthoHeight * 0.5f; + projectionMatrix = glm::ortho(-halfWidth, halfWidth, -halfHeight, halfHeight, nearPlane, farPlane); + } + projectionMatrixDirty = false; +} + +// Helper: Build asymmetric projection from FOV tangents (Chapter 5) +inline glm::mat4 getAsymmetricProjection(const XrFovf& fov, float nearZ, float farZ) { + float tanLeft = std::tan(fov.angleLeft); + float tanRight = std::tan(fov.angleRight); + float tanUp = std::tan(fov.angleUp); + float tanDown = std::tan(fov.angleDown); + + 
float tanWidth = tanRight - tanLeft; + float tanHeight = tanUp - tanDown; + + glm::mat4 projection(0.0f); + projection[0][0] = 2.0f / tanWidth; + projection[1][1] = 2.0f / tanHeight; + projection[2][0] = (tanRight + tanLeft) / tanWidth; + projection[2][1] = (tanUp + tanDown) / tanHeight; + projection[2][2] = -farZ / (farZ - nearZ); + projection[2][3] = -1.0f; + projection[3][2] = -(farZ * nearZ) / (farZ - nearZ); + + return projection; +} + +void CameraComponent::SetStereoViews(const XrView& left, const XrView& right) { + // 1. Convert OpenXR poses to 4x4 matrices + eyeViewMatrices[0] = xrPoseToMatrix(left.pose); + eyeViewMatrices[1] = xrPoseToMatrix(right.pose); + + // 2. Build asymmetric projection matrices from FOV tangents + eyeProjectionMatrices[0] = getAsymmetricProjection(left.fov, nearPlane, farPlane); + eyeProjectionMatrices[1] = getAsymmetricProjection(right.fov, nearPlane, farPlane); +} diff --git a/attachments/openxr_engine/camera_component.h b/attachments/openxr_engine/camera_component.h new file mode 100644 index 00000000..b2675ce5 --- /dev/null +++ b/attachments/openxr_engine/camera_component.h @@ -0,0 +1,290 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +#include "component.h" +#include "entity.h" +#include "transform_component.h" + +/** + * @brief Component that handles the camera view and projection. + * + * This class implements the camera system as described in the Camera_Transformations chapter: + * @see en/Building_a_Simple_Engine/Camera_Transformations/03_camera_implementation.adoc + */ +class CameraComponent : public Component +{ + public: + enum class ProjectionType + { + Perspective, + Orthographic + }; + + private: + ProjectionType projectionType = ProjectionType::Perspective; + + // Perspective projection parameters + float fieldOfView = 45.0f; + float aspectRatio = 16.0f / 9.0f; + + // Orthographic projection parameters + float orthoWidth = 10.0f; + float orthoHeight = 10.0f; + + // Common parameters + float nearPlane = 0.1f; + float farPlane = 100.0f; + + // Matrices + glm::mat4 viewMatrix = glm::mat4(1.0f); + glm::mat4 projectionMatrix = glm::mat4(1.0f); + + // Stereo Matrices (Chapter 5) + std::array eyeViewMatrices = {glm::mat4(1.0f), glm::mat4(1.0f)}; + std::array eyeProjectionMatrices = {glm::mat4(1.0f), glm::mat4(1.0f)}; + + // Camera properties + glm::vec3 target = {0.0f, 0.0f, 0.0f}; + glm::vec3 up = {0.0f, 1.0f, 0.0f}; + + bool viewMatrixDirty = true; + bool projectionMatrixDirty = true; + + public: + /** + * @brief Constructor with optional name. + * @param componentName The name of the component. + */ + explicit CameraComponent(const std::string &componentName = "CameraComponent") : + Component(componentName) + {} + + /** + * @brief Initialize the camera component. + */ + void Initialize() override; + + /** + * @brief Set the projection type. + * @param type The projection type. + */ + void SetProjectionType(ProjectionType type) + { + projectionType = type; + projectionMatrixDirty = true; + } + + /** + * @brief Get the projection type. + * @return The projection type. 
+ */ + ProjectionType GetProjectionType() const + { + return projectionType; + } + + /** + * @brief Set the field of view for perspective projection. + * @param fov The field of view in degrees. + */ + void SetFieldOfView(float fov) + { + fieldOfView = fov; + projectionMatrixDirty = true; + } + + /** + * @brief Get the field of view. + * @return The field of view in degrees. + */ + float GetFieldOfView() const + { + return fieldOfView; + } + + /** + * @brief Set the aspect ratio for perspective projection. + * @param ratio The aspect ratio (width / height). + */ + void SetAspectRatio(float ratio) + { + aspectRatio = ratio; + projectionMatrixDirty = true; + } + + /** + * @brief Get the aspect ratio. + * @return The aspect ratio. + */ + float GetAspectRatio() const + { + return aspectRatio; + } + + /** + * @brief Set the orthographic width and height. + * @param width The width of the orthographic view. + * @param height The height of the orthographic view. + */ + void SetOrthographicSize(float width, float height) + { + orthoWidth = width; + orthoHeight = height; + projectionMatrixDirty = true; + } + + /** + * @brief Set the near and far planes. + * @param near The near plane distance. + * @param far The far plane distance. + */ + void SetClipPlanes(float near, float far) + { + nearPlane = near; + farPlane = far; + projectionMatrixDirty = true; + } + + float GetNearPlane() const + { + return nearPlane; + } + float GetFarPlane() const + { + return farPlane; + } + + /** + * @brief Set the camera target. + * @param newTarget The new target position. + */ + void SetTarget(const glm::vec3 &newTarget) + { + target = newTarget; + viewMatrixDirty = true; + } + + /** + * @brief Set the camera up vector. + * @param newUp The new up vector. + */ + void SetUp(const glm::vec3 &newUp) + { + up = newUp; + viewMatrixDirty = true; + } + + /** + * @brief Make the camera look at a specific target position. + * @param targetPosition The position to look at. 
+ * @param upVector The up vector (optional, defaults to current up vector). + */ + void LookAt(const glm::vec3 &targetPosition, const glm::vec3 &upVector = glm::vec3(0.0f, 1.0f, 0.0f)) + { + target = targetPosition; + up = upVector; + viewMatrixDirty = true; + } + + /** + * @brief Get the view matrix. + * @return The view matrix. + */ + const glm::mat4 &GetViewMatrix(); + + /** + * @brief Get the projection matrix. + * @return The projection matrix. + */ + const glm::mat4 &GetProjectionMatrix(); + + /** + * @brief Get the camera position. + * @return The camera position. + */ + glm::vec3 GetPosition() const + { + auto transform = GetOwner()->GetComponent(); + return transform ? transform->GetPosition() : glm::vec3(0.0f, 0.0f, 0.0f); + } + + /** + * @brief Get the camera target. + * @return The camera target. + */ + const glm::vec3 &GetTarget() const + { + return target; + } + + /** + * @brief Get the camera up vector. + * @return The camera up vector. + */ + const glm::vec3 &GetUp() const + { + return up; + } + + /** + * @brief Force view matrix recalculation without modifying camera orientation. + * This is used when the camera's transform position changes externally (e.g., from GLTF loading). + */ + void ForceViewMatrixUpdate() + { + viewMatrixDirty = true; + } + + /** + * @brief Set the stereo views for XR (Chapter 5). + * @param left The left eye view data. + * @param right The right eye view data. + */ + void SetStereoViews(const XrView& left, const XrView& right); + + /** + * @brief Get the eye-specific view matrix. + * @param eye The eye index (0 for left, 1 for right). + * @return The view matrix. + */ + const glm::mat4& GetViewMatrix(uint32_t eye) const { return eyeViewMatrices[eye]; } + + /** + * @brief Get the eye-specific projection matrix. + * @param eye The eye index (0 for left, 1 for right). + * @return The projection matrix. 
+ */ + const glm::mat4& GetProjectionMatrix(uint32_t eye) const { return eyeProjectionMatrices[eye]; } + + private: + /** + * @brief Update the view matrix based on the camera position and target. + */ + void UpdateViewMatrix(); + + /** + * @brief Update the projection matrix based on the projection type and parameters. + */ + void UpdateProjectionMatrix(); +}; diff --git a/attachments/openxr_engine/engine.cpp b/attachments/openxr_engine/engine.cpp new file mode 100644 index 00000000..6c5303e6 --- /dev/null +++ b/attachments/openxr_engine/engine.cpp @@ -0,0 +1,1098 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "engine.h" +#include "mesh_component.h" +#include "scene_loading.h" + +#include +#include +#include +#include +#include +#include + +// This implementation corresponds to the Engine_Architecture chapter in the tutorial: +// @see en/Building_a_Simple_Engine/Engine_Architecture/02_architectural_patterns.adoc + +Engine::Engine() : resourceManager(std::make_unique()) { +} + +bool Engine::IsMainThread() const { + return std::this_thread::get_id() == mainThreadId; +} + +void Engine::ProcessPendingEntityRemovals() { + std::vector names; { + std::lock_guard lk(pendingEntityRemovalsMutex); + if (pendingEntityRemovalNames.empty()) + return; + names.swap(pendingEntityRemovalNames); + } + + // Process on the main thread only (safety) + if (!IsMainThread()) { + // Put them back; we'll retry next main-thread tick + std::lock_guard lk(pendingEntityRemovalsMutex); + pendingEntityRemovalNames.insert(pendingEntityRemovalNames.end(), names.begin(), names.end()); + return; + } + + // Apply removals using the normal API (which takes the appropriate locks). 
+ for (const auto& name : names) { + (void) RemoveEntity(name); + } +} + +Engine::~Engine() { + Cleanup(); +} + +bool Engine::Initialize(const std::string& appName, int width, int height, bool enableValidationLayers, bool useXR) { + // Create platform +#if defined(PLATFORM_ANDROID) + // For Android, the platform is created with the android_app + // This will be handled in the android_main function + return false; +#else + // Record main thread identity for deferring destructive operations from background threads + mainThreadId = std::this_thread::get_id(); + + platform = CreatePlatform(); + if (!platform->Initialize(appName, width, height)) { + return false; + } + + // Set resize callback + platform->SetResizeCallback([this](int width, int height) { + HandleResize(width, height); + }); + + // Set mouse callback + platform->SetMouseCallback([this](float x, float y, uint32_t buttons) { + handleMouseInput(x, y, buttons); + }); + + // Set keyboard callback + platform->SetKeyboardCallback([this](uint32_t key, bool pressed) { + handleKeyInput(key, pressed); + }); + + // Set char callback + platform->SetCharCallback([this](uint32_t c) { + if (imguiSystem) { + imguiSystem->HandleChar(c); + } + }); + + // Create renderer + renderer = std::make_unique(platform.get()); + if (!renderer->Initialize(appName, enableValidationLayers, useXR)) { + return false; + } + + try { + // Model loader via constructor; also wire into renderer + modelLoader = std::make_unique(renderer.get()); + renderer->SetModelLoader(modelLoader.get()); + + // Audio system via constructor + audioSystem = std::make_unique(this, renderer.get()); + + // Physics system via constructor (GPU enabled) + physicsSystem = std::make_unique(renderer.get(), true); + + // ImGui via constructor, then connect audio system + imguiSystem = std::make_unique(renderer.get(), width, height); + imguiSystem->SetAudioSystem(audioSystem.get()); + } catch (const std::exception& e) { + std::cerr << "Subsystem initialization failed: " 
<< e.what() << std::endl; + return false; + } + + // Generate ball material properties once at load time + GenerateBallMaterial(); + + // Initialize physics scaling system + InitializePhysicsScaling(); + + initialized = true; + return true; +#endif +} + +void Engine::Run() { + if (!initialized) { + throw std::runtime_error("Engine not initialized"); + } + + running = true; + + // Main loop + while (running) { + // Process platform events + if (!platform->ProcessEvents()) { + running = false; + break; + } + + if (renderer->IsXrMode()) { + auto& xrContext = renderer->GetXrContext(); + auto frameState = xrContext.waitFrame(); + xrContext.beginFrame(); + + deltaTimeMs = CalculateDeltaTimeMs(); + Update(frameState.predictedDisplayTime); + Render(frameState.predictedDisplayTime); + + xrContext.endFrame(renderer->GetXrImageViews()); + + // Update frame counter and FPS for window title (companion window) + frameCount++; + fpsUpdateTimer += deltaTimeMs.count() * 0.001f; + if (fpsUpdateTimer >= 1.0f) { + UpdateFPSWindowTitle(); + } + } else { + // Calculate delta time + deltaTimeMs = CalculateDeltaTimeMs(); + + // Update frame counter and FPS + frameCount++; + fpsUpdateTimer += deltaTimeMs.count() * 0.001f; + + // Update window title with FPS and frame time every second + if (fpsUpdateTimer >= 1.0f) { + UpdateFPSWindowTitle(); + } + + // Update + Update(deltaTimeMs); + + // Render + Render(); + } + } +} + +void Engine::Cleanup() { + if (initialized) { + // Wait for the device to be idle before cleaning up + if (renderer) { + renderer->WaitIdle(); + } + + // Clear entities + { + std::unique_lock lk(entitiesMutex); + entities.clear(); + entityMap.clear(); + } + + // Clean up subsystems in reverse order of creation + imguiSystem.reset(); + physicsSystem.reset(); + audioSystem.reset(); + modelLoader.reset(); + renderer.reset(); + platform.reset(); + + initialized = false; + } +} + +Entity* Engine::CreateEntity(const std::string& name) { + std::unique_lock lk(entitiesMutex); + // 
Always allow duplicate names; map stores a representative entity + // Create the entity + auto entity = std::make_unique(name); + // Add to the vector and map + entities.push_back(std::move(entity)); + Entity* rawPtr = entities.back().get(); + // Update the map to point to the most recently created entity with this name + entityMap[name] = rawPtr; + + return rawPtr; +} + +Entity* Engine::GetEntity(const std::string& name) { + std::shared_lock lk(entitiesMutex); + auto it = entityMap.find(name); + if (it != entityMap.end()) { + return it->second; + } + return nullptr; +} + +bool Engine::RemoveEntity(Entity* entity) { + if (!entity) { + return false; + } + + // If called from a background thread, defer removal to avoid deleting entities + // while the render thread may be iterating a snapshot. + if (!IsMainThread()) { + std::lock_guard lk(pendingEntityRemovalsMutex); + pendingEntityRemovalNames.push_back(entity->GetName()); + return true; + } + + std::unique_lock lk(entitiesMutex); + + // Remember the name before erasing ownership + std::string name = entity->GetName(); + + // Find the entity in the vector + auto it = std::ranges::find_if(entities, + [entity](const std::unique_ptr& e) { + return e.get() == entity; + }); + + if (it != entities.end()) { + // Remove from the vector (ownership) + entities.erase(it); + + // Update the map: point to another entity with the same name if one exists + auto remainingIt = std::ranges::find_if(entities, + [&name](const std::unique_ptr& e) { + return e->GetName() == name; + }); + + if (remainingIt != entities.end()) { + entityMap[name] = remainingIt->get(); + } else { + entityMap.erase(name); + } + + return true; + } + + return false; +} + +bool Engine::RemoveEntity(const std::string& name) { + // If called from a background thread, defer removal to avoid deleting entities + // while the render thread may be iterating a snapshot. 
+ if (!IsMainThread()) { + std::lock_guard lk(pendingEntityRemovalsMutex); + pendingEntityRemovalNames.push_back(name); + return true; + } + + std::unique_lock lk(entitiesMutex); + auto it = entityMap.find(name); + if (it == entityMap.end()) + return false; + Entity* entity = it->second; + if (!entity) + return false; + + // Find the entity in the vector + auto vecIt = std::ranges::find_if(entities, + [entity](const std::unique_ptr& e) { + return e.get() == entity; + }); + if (vecIt == entities.end()) { + entityMap.erase(name); + return false; + } + + entities.erase(vecIt); + + // Update the map: point to another entity with the same name if one exists + auto remainingIt = std::ranges::find_if(entities, + [&name](const std::unique_ptr& e) { + return e && e->GetName() == name; + }); + if (remainingIt != entities.end()) { + entityMap[name] = remainingIt->get(); + } else { + entityMap.erase(name); + } + return true; +} + +void Engine::SetActiveCamera(CameraComponent* cameraComponent) { + activeCamera = cameraComponent; +} + +const CameraComponent* Engine::GetActiveCamera() const { + return activeCamera; +} + +const ResourceManager* Engine::GetResourceManager() const { + return resourceManager.get(); +} + +const Platform* Engine::GetPlatform() const { + return platform.get(); +} + +Renderer* Engine::GetRenderer() { + return renderer.get(); +} + +ModelLoader* Engine::GetModelLoader() { + return modelLoader.get(); +} + +const AudioSystem* Engine::GetAudioSystem() const { + return audioSystem.get(); +} + +PhysicsSystem* Engine::GetPhysicsSystem() { + return physicsSystem.get(); +} + +const ImGuiSystem* Engine::GetImGuiSystem() const { + return imguiSystem.get(); +} + +void Engine::handleMouseInput(float x, float y, uint32_t buttons) { + // Check if ImGui wants to capture mouse input first + bool imguiWantsMouse = imguiSystem && imguiSystem->WantCaptureMouse(); + + // Suppress right-click while loading + if (renderer&& renderer + + -> + IsLoading() + ) { + buttons &= ~2u; 
// clear right button bit + } + + if (!imguiWantsMouse) { + // Handle mouse click for ball throwing (right mouse button) + if (buttons & 2) { + // Right mouse button (bit 1) + if (!cameraControl.mouseRightPressed) { + cameraControl.mouseRightPressed = true; + // Throw a ball on mouse click + ThrowBall(x, y); + } + } else { + cameraControl.mouseRightPressed = false; + } + + // Handle camera rotation when left mouse button is pressed + if (buttons & 1) { + // Left mouse button (bit 0) + if (!cameraControl.mouseLeftPressed) { + cameraControl.mouseLeftPressed = true; + cameraControl.firstMouse = true; + } + + if (cameraControl.firstMouse) { + cameraControl.lastMouseX = x; + cameraControl.lastMouseY = y; + cameraControl.firstMouse = false; + } + + float xOffset = x - cameraControl.lastMouseX; + float yOffset = y - cameraControl.lastMouseY; + cameraControl.lastMouseX = x; + cameraControl.lastMouseY = y; + + xOffset *= cameraControl.mouseSensitivity; + yOffset *= cameraControl.mouseSensitivity; + + // Mouse look: positive X moves view to the right; positive Y moves view up. + // Platform mouse coordinates increase downward, so invert Y. 
+ cameraControl.yaw -= xOffset; + cameraControl.pitch -= yOffset; + + // Constrain pitch to avoid gimbal lock + if (cameraControl.pitch > 89.0f) + cameraControl.pitch = 89.0f; + if (cameraControl.pitch < -89.0f) + cameraControl.pitch = -89.0f; + } else { + cameraControl.mouseLeftPressed = false; + } + } + + if (imguiSystem) { + imguiSystem->HandleMouse(x, y, buttons); + } + + // Always perform hover detection (even when ImGui is active) + HandleMouseHover(x, y); +} +void Engine::handleKeyInput(uint32_t key, bool pressed) { +#if !defined(PLATFORM_ANDROID) + switch (key) { + case GLFW_KEY_W: + case GLFW_KEY_UP: + cameraControl.moveForward = pressed; + break; + case GLFW_KEY_S: + case GLFW_KEY_DOWN: + cameraControl.moveBackward = pressed; + break; + case GLFW_KEY_A: + case GLFW_KEY_LEFT: + cameraControl.moveLeft = pressed; + break; + case GLFW_KEY_D: + case GLFW_KEY_RIGHT: + cameraControl.moveRight = pressed; + break; + case GLFW_KEY_Q: + case GLFW_KEY_PAGE_UP: + cameraControl.moveUp = pressed; + break; + case GLFW_KEY_E: + case GLFW_KEY_PAGE_DOWN: + cameraControl.moveDown = pressed; + break; + default: + break; + } + + if (imguiSystem) { + imguiSystem->HandleKeyboard(key, pressed); + } +#else + // Android uses different input handling via touch events + (void) key; + (void) pressed; +#endif +} + +void Engine::Update(XrTime predictedTime) { + if (renderer && renderer->IsXrMode()) { + auto& xrContext = renderer->GetXrContext(); + + // 1. Poll OpenXR action states (Grab, Teleport, etc.) (Chapter 7) + xrContext.pollActions(); + + // 2. Retrieve the predicted views (poses and FOVs) from OpenXR (Chapter 5) + xrContext.locateViews(predictedTime); + + // 3. 
Update the camera with the spatial data + if (activeCamera) { + auto views = xrContext.getLatestViews(); + if (views.size() >= 2) { + activeCamera->SetStereoViews(views[0], views[1]); + } + } + } + + // Use the standard update with the last calculated delta for simulation consistency + Update(deltaTimeMs); +} + +void Engine::Update(TimeDelta deltaTime) { + // Apply any entity removals requested by background threads. + ProcessPendingEntityRemovals(); + + // During background scene loading we avoid touching the live entity + // list from the main thread. This lets the loading thread construct + // entities/components safely while the main thread only drives the + // UI/loading overlay. + if (renderer&& renderer + + -> + IsLoading() + ) { + if (imguiSystem) { + imguiSystem->NewFrame(); + } + return; + } + + // Process pending ball creations (outside rendering loop to avoid memory pool constraints) + ProcessPendingBalls(); + + if (activeCamera) { + glm::vec3 currentCameraPosition = activeCamera->GetPosition(); + physicsSystem->SetCameraPosition(currentCameraPosition); + } + + // Use real deltaTime for physics to maintain proper timing + physicsSystem->Update(deltaTime); + + // Update audio system + audioSystem->Update(deltaTime); + + // Update ImGui system + imguiSystem->NewFrame(); + + // Update camera controls + if (activeCamera) { + UpdateCameraControls(deltaTime); + } + + // Update all entities. + // Do not hold `entitiesMutex` while calling `Entity::Update()`. + // Background threads may need the unique lock to add entities during loading, + // and holding a shared lock for a long time can starve them. 
+ std::vector snapshot; { + std::shared_lock lk(entitiesMutex); + snapshot.reserve(entities.size()); + for (auto& uptr : entities) { + snapshot.push_back(uptr.get()); + } + } + for (Entity* entity : snapshot) { + if (!entity || !entity->IsActive()) + continue; + entity->Update(deltaTime); + } +} + +void Engine::Render(XrTime predictedTime) { + // Ensure renderer is ready + if (!renderer || !renderer->IsInitialized()) { + return; + } + + // Apply any entity removals requested by background threads before taking a snapshot. + ProcessPendingEntityRemovals(); + + // Snapshot entity pointers under a short shared lock, then release the lock + // before rendering. This prevents starving the background loader/physics threads + // that need the unique lock to create entities/components. + std::vector snapshot; { + std::shared_lock lk(entitiesMutex); + snapshot.reserve(entities.size()); + for (auto& uptr : entities) { + snapshot.push_back(uptr.get()); + } + } + + // Render the scene for XR (per-eye) + renderer->Render(snapshot, activeCamera, imguiSystem.get(), predictedTime); +} + +void Engine::Render() { + // Ensure renderer is ready + if (!renderer || !renderer->IsInitialized()) { + return; + } + + // Check if we have an active camera + if (!activeCamera) { + return; + } + + // Apply any entity removals requested by background threads before taking a snapshot. + ProcessPendingEntityRemovals(); + + // Snapshot entity pointers under a short shared lock, then release the lock + // before rendering. This prevents starving the background loader/physics threads + // that need the unique lock to create entities/components. 
+ std::vector snapshot; { + std::shared_lock lk(entitiesMutex); + snapshot.reserve(entities.size()); + for (auto& uptr : entities) { + snapshot.push_back(uptr.get()); + } + } + + // Render the scene (ImGui will be rendered within the render pass) + renderer->Render(snapshot, activeCamera, imguiSystem.get()); +} + +void Engine::UpdateFPSWindowTitle() { + uint64_t framesSinceLastUpdate = frameCount - lastFPSUpdateFrame; + double avgMs = 0.0; + if (framesSinceLastUpdate > 0 && fpsUpdateTimer > 0.0f) { + currentFPS = static_cast(static_cast(framesSinceLastUpdate) / static_cast(fpsUpdateTimer)); + avgMs = (fpsUpdateTimer / static_cast(framesSinceLastUpdate)) * 1000.0; + } else { + // Avoid divide-by-zero; keep previous FPS and estimate avgMs from last delta + currentFPS = std::max(currentFPS, 1.0f); + avgMs = static_cast(deltaTimeMs.count()); + } + + // Update window title with frame count, FPS, and frame time + std::string title = (renderer && renderer->IsXrMode() ? "Simple Engine [XR] - Frame: " : "Simple Engine - Frame: ") + + std::to_string(frameCount) + + " | FPS: " + std::to_string(static_cast(currentFPS)) + + " | ms: " + std::to_string(static_cast(avgMs)); + platform->SetWindowTitle(title); + + // Reset timer and frame counter for next update + fpsUpdateTimer = 0.0f; + lastFPSUpdateFrame = frameCount; +} + +std::chrono::milliseconds Engine::CalculateDeltaTimeMs() { + // Get current time using a steady clock to avoid system time jumps + uint64_t currentTime = static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count()); + + // Initialize lastFrameTimeMs on first call + if (lastFrameTimeMs == 0) { + lastFrameTimeMs = currentTime; + return std::chrono::milliseconds(16); // ~16ms as a sane initial guess + } + + // Calculate delta time in milliseconds + uint64_t delta = currentTime - lastFrameTimeMs; + + // Update last frame time + lastFrameTimeMs = currentTime; + + return std::chrono::milliseconds(static_cast(delta)); 
+} + +void Engine::HandleResize(int width, int height) const { + if (height <= 0 || width <= 0) { + return; + } + // Update the active camera's aspect ratio + if (activeCamera) { + activeCamera->SetAspectRatio(static_cast(width) / static_cast(height)); + } + + // Notify the renderer that the framebuffer has been resized + if (renderer) { + renderer->SetFramebufferResized(); + } + + // Notify ImGui system about the resize + if (imguiSystem) { + imguiSystem->HandleResize(static_cast(width), static_cast(height)); + } +} + +void Engine::UpdateCameraControls(TimeDelta deltaTime) { + if (!activeCamera) + return; + + // Get a camera transform component + auto* cameraTransform = activeCamera->GetOwner()->GetComponent(); + if (!cameraTransform) + return; + + // Check if camera tracking is enabled + if (imguiSystem&& imguiSystem + + -> + IsCameraTrackingEnabled() + ) { + // Find the first active ball entity + Entity* ballEntity = nullptr; { + std::shared_lock lk(entitiesMutex); + auto ballEntityIt = std::ranges::find_if(entities, + [](auto const& entity) { + return entity && entity->IsActive() && (entity->GetName().find("Ball_") != std::string::npos); + }); + ballEntity = (ballEntityIt != entities.end()) ? 
ballEntityIt->get() : nullptr; + } + + if (ballEntity) { + // Get ball's transform component + auto* ballTransform = ballEntity->GetComponent(); + if (ballTransform) { + glm::vec3 ballPosition = ballTransform->GetPosition(); + + // Position camera at a fixed offset from the ball for good viewing + glm::vec3 cameraOffset = glm::vec3(2.0f, 1.5f, 2.0f); // Behind and above the ball + glm::vec3 cameraPosition = ballPosition + cameraOffset; + + // Update camera position and target + cameraTransform->SetPosition(cameraPosition); + activeCamera->SetTarget(ballPosition); + + return; // Skip manual controls when tracking + } + } + } + + // Manual camera controls (only when tracking is disabled) + // Calculate movement speed + float velocity = cameraControl.cameraSpeed * deltaTime.count() * .001f; + + // Capture base orientation from GLTF camera once and then apply mouse deltas relative to it + if (!cameraControl.baseOrientationCaptured) { + // TransformComponent stores Euler in radians; convert to quaternion + glm::vec3 baseEuler = cameraTransform->GetRotation(); + const glm::quat qx = glm::angleAxis(baseEuler.x, glm::vec3(1.0f, 0.0f, 0.0f)); + const glm::quat qy = glm::angleAxis(baseEuler.y, glm::vec3(0.0f, 1.0f, 0.0f)); + const glm::quat qz = glm::angleAxis(baseEuler.z, glm::vec3(0.0f, 0.0f, 1.0f)); + // Match CameraComponent::UpdateViewMatrix composition (q = qz * qy * qx) + cameraControl.baseOrientation = qz * qy * qx; + cameraControl.baseOrientationCaptured = true; + } + + // Build delta orientation from yaw/pitch mouse deltas (degrees -> radians) + const float yawRad = glm::radians(cameraControl.yaw); + const float pitchRad = glm::radians(cameraControl.pitch); + const glm::quat qDeltaY = glm::angleAxis(yawRad, glm::vec3(0.0f, 1.0f, 0.0f)); + const glm::quat qDeltaX = glm::angleAxis(pitchRad, glm::vec3(1.0f, 0.0f, 0.0f)); + // Apply yaw then pitch in the same convention as CameraComponent (ZYX overall), so delta = Ry * Rx + glm::quat qDelta = qDeltaY * qDeltaX; + 
glm::quat qFinal = cameraControl.baseOrientation * qDelta; + + // Derive camera basis directly from rotated axes to avoid ambiguity + glm::vec3 right = glm::normalize(qFinal * glm::vec3(1.0f, 0.0f, 0.0f)); + glm::vec3 up = glm::normalize(qFinal * glm::vec3(0.0f, 1.0f, 0.0f)); + // Camera forward in world space. + // Our view/projection conventions assume the camera looks down -Z in its local space. + glm::vec3 front = glm::normalize(qFinal * glm::vec3(0.0f, 0.0f, -1.0f)); + + // Get the current camera position + glm::vec3 position = cameraTransform->GetPosition(); + + // Apply movement based on input + if (cameraControl.moveForward) { + position += front * velocity; + } + if (cameraControl.moveBackward) { + position -= front * velocity; + } + if (cameraControl.moveLeft) { + position -= right * velocity; + } + if (cameraControl.moveRight) { + position += right * velocity; + } + if (cameraControl.moveUp) { + position += up * velocity; + } + if (cameraControl.moveDown) { + position -= up * velocity; + } + + // Update camera position + cameraTransform->SetPosition(position); + // Apply rotation to the camera transform based on GLTF base orientation plus mouse deltas + // TransformComponent expects radians Euler (ZYX order in our CameraComponent). 
+ cameraTransform->SetRotation(glm::eulerAngles(qFinal)); + + // Update camera target based on a direction + glm::vec3 target = position + front; + activeCamera->SetTarget(target); + + // Ensure the camera view matrix reflects the new transform immediately this frame + activeCamera->ForceViewMatrixUpdate(); +} + +void Engine::GenerateBallMaterial() { + // Generate 8 random material properties for PBR + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(0.0f, 1.0f); + + // Generate bright, vibrant albedo colors for better visibility + std::uniform_real_distribution brightDis(0.6f, 1.0f); // Ensure bright colors + ballMaterial.albedo = glm::vec3(brightDis(gen), brightDis(gen), brightDis(gen)); + + // Random metallic value (0.0 to 1.0) + ballMaterial.metallic = dis(gen); + + // Random roughness value (0.0 to 1.0) + ballMaterial.roughness = dis(gen); + + // Random ambient occlusion (typically 0.8 to 1.0 for good lighting) + ballMaterial.ao = 0.8f + dis(gen) * 0.2f; + + // Random emissive color (usually subtle) + ballMaterial.emissive = glm::vec3(dis(gen) * 0.3f, dis(gen) * 0.3f, dis(gen) * 0.3f); + + // Decent bounciness (0.6 to 0.9) so bounces are clearly visible + ballMaterial.bounciness = 0.6f + dis(gen) * 0.3f; +} + +void Engine::InitializePhysicsScaling() { + // Based on issue analysis: balls reaching 120+ m/s and extreme positions like (-244, -360, -244) + // The previous 200.0f force scale was causing supersonic speeds and balls flying out of scene + // Need much more conservative scaling for realistic visual gameplay + + // Use smaller game unit scale for more controlled physics + physicsScaling.gameUnitsToMeters = 0.1f; // 1 game unit = 0.1 meter (10cm) - smaller scale + + // Much reduced force scaling to prevent extreme speeds + // With base forces 0.01f-0.05f, this gives final forces of 0.001f-0.005f + physicsScaling.forceScale = 1.0f; // Minimal force scaling for realistic movement + physicsScaling.physicsTimeScale = 1.0f; 
// Keep time scale normal + physicsScaling.gravityScale = 1.0f; // Keep gravity proportional to scale + + // Apply scaled gravity to physics system + glm::vec3 realWorldGravity(0.0f, -9.81f, 0.0f); + glm::vec3 scaledGravity = ScaleGravityForPhysics(realWorldGravity); + physicsSystem->SetGravity(scaledGravity); +} + +float Engine::ScaleForceForPhysics(float gameForce) const { + // Scale force based on the relationship between game units and real world + // and the force scaling factor to make physics feel right + return gameForce * physicsScaling.forceScale * physicsScaling.gameUnitsToMeters; +} + +glm::vec3 Engine::ScaleGravityForPhysics(const glm::vec3& realWorldGravity) const { + // Scale gravity based on game unit scale and gravity scaling factor + // If 1 game unit = 1 meter, then gravity should remain -9.81 + // If 1 game unit = 0.1 meter, then gravity should be -0.981 + return realWorldGravity * physicsScaling.gravityScale * physicsScaling.gameUnitsToMeters; +} + +float Engine::ScaleTimeForPhysics(float deltaTime) const { + // Scale time for physics simulation if needed + // This can be used to slow down or speed up physics relative to rendering + return deltaTime * physicsScaling.physicsTimeScale; +} + +void Engine::ThrowBall(float mouseX, float mouseY) { + if (!activeCamera || !physicsSystem) { + return; + } + + // Get window dimensions + int windowWidth, windowHeight; + platform->GetWindowSize(&windowWidth, &windowHeight); + + // Convert mouse coordinates to normalized device coordinates (-1 to 1) + float ndcX = (2.0f * mouseX) / static_cast(windowWidth) - 1.0f; + float ndcY = 1.0f - (2.0f * mouseY) / static_cast(windowHeight); + + // Get camera matrices + glm::mat4 viewMatrix = activeCamera->GetViewMatrix(); + glm::mat4 projMatrix = activeCamera->GetProjectionMatrix(); + + // Calculate inverse matrices + glm::mat4 invView = glm::inverse(viewMatrix); + glm::mat4 invProj = glm::inverse(projMatrix); + + // Convert NDC to world space for direction + glm::vec4 
rayClip = glm::vec4(ndcX, ndcY, -1.0f, 1.0f); + glm::vec4 rayEye = invProj * rayClip; + rayEye = glm::vec4(rayEye.x, rayEye.y, -1.0f, 0.0f); + glm::vec4 rayWorld = invView * rayEye; + + // Calculate screen center in world coordinates + // Screen center is at NDC (0, 0) which corresponds to the center of the view + glm::vec4 screenCenterClip = glm::vec4(0.0f, 0.0f, -1.0f, 1.0f); + glm::vec4 screenCenterEye = invProj * screenCenterClip; + screenCenterEye = glm::vec4(screenCenterEye.x, screenCenterEye.y, -1.0f, 0.0f); + glm::vec4 screenCenterWorld = invView * screenCenterEye; + glm::vec3 screenCenterDirection = glm::normalize(glm::vec3(screenCenterWorld)); + + // Calculate world position for screen center at a reasonable distance from camera + glm::vec3 cameraPosition = activeCamera->GetPosition(); + glm::vec3 screenCenterWorldPos = cameraPosition + screenCenterDirection * 2.0f; // 2 units in front of camera + + // Calculate throw direction from screen center toward mouse position + glm::vec3 throwDirection = glm::normalize(glm::vec3(rayWorld)); + + // Add upward component for realistic arc trajectory + throwDirection.y += 0.3f; // Add upward bias for throwing arc + throwDirection = glm::normalize(throwDirection); // Re-normalize after modification + + // Generate ball properties now + static int ballCounter = 0; + std::string ballName = "Ball_" + std::to_string(ballCounter++); + + std::random_device rd; + std::mt19937 gen(rd()); + + // Launch balls from screen center toward mouse cursor + glm::vec3 spawnPosition = screenCenterWorldPos; + + // Add small random variation to avoid identical paths + std::uniform_real_distribution posDis(-0.1f, 0.1f); + spawnPosition.x += posDis(gen); + spawnPosition.y += posDis(gen); + spawnPosition.z += posDis(gen); + + std::uniform_real_distribution spinDis(-10.0f, 10.0f); + std::uniform_real_distribution forceDis(15.0f, 35.0f); // Stronger force range for proper throwing feel + + // Store ball creation data for processing outside 
rendering loop + PendingBall pendingBall; + pendingBall.spawnPosition = spawnPosition; + pendingBall.throwDirection = throwDirection; // This is now the corrected direction toward geometry + pendingBall.throwForce = ScaleForceForPhysics(forceDis(gen)); // Apply physics scaling to force + pendingBall.randomSpin = glm::vec3(spinDis(gen), spinDis(gen), spinDis(gen)); + pendingBall.ballName = ballName; + + pendingBalls.push_back(pendingBall); +} + +void Engine::ProcessPendingBalls() { + // Process all pending balls + for (const auto& pendingBall : pendingBalls) { + // Create ball entity + Entity* ballEntity = CreateEntity(pendingBall.ballName); + if (!ballEntity) { + std::cerr << "Failed to create ball entity: " << pendingBall.ballName << std::endl; + continue; + } + + // Add transform component + auto* transform = ballEntity->AddComponent(); + if (!transform) { + std::cerr << "Failed to add TransformComponent to ball: " << pendingBall.ballName << std::endl; + continue; + } + transform->SetPosition(pendingBall.spawnPosition); + transform->SetScale(glm::vec3(1.0f)); // Tennis ball size scale + + // Add mesh component with sphere geometry + auto* mesh = ballEntity->AddComponent(); + if (!mesh) { + std::cerr << "Failed to add MeshComponent to ball: " << pendingBall.ballName << std::endl; + continue; + } + // Create tennis ball-sized, bright red sphere + glm::vec3 brightRed(1.0f, 0.0f, 0.0f); + mesh->CreateSphere(0.0335f, brightRed, 32); // Tennis ball radius, bright color, high detail + mesh->SetTexturePath(renderer->SHARED_BRIGHT_RED_ID); // Use bright red texture for visibility + + // Verify mesh geometry was created + const auto& vertices = mesh->GetVertices(); + const auto& indices = mesh->GetIndices(); + if (vertices.empty() || indices.empty()) { + std::cerr << "ERROR: CreateSphere failed to generate geometry!" 
<< std::endl; + continue; + } + + // Pre-allocate Vulkan resources for this entity (now outside rendering loop) + if (!renderer->preAllocateEntityResources(ballEntity)) { + std::cerr << "Failed to pre-allocate resources for ball: " << pendingBall.ballName << std::endl; + continue; + } + + // Create rigid body with sphere collision shape + RigidBody* rigidBody = physicsSystem->CreateRigidBody(ballEntity, CollisionShape::Sphere, 1.0f); + if (rigidBody) { + // Set bounciness from material + rigidBody->SetRestitution(ballMaterial.bounciness); + + // Request an acceleration structure build so the new ball is included in Ray Query mode. + // We do this after creating the rigid body and initializing the entity. + renderer->RequestAccelerationStructureBuild("Ball spawned"); + + // Apply throw force and spin + glm::vec3 throwImpulse = pendingBall.throwDirection * pendingBall.throwForce; + rigidBody->ApplyImpulse(throwImpulse, glm::vec3(0.0f)); + rigidBody->SetAngularVelocity(pendingBall.randomSpin); + } + } + + // Clear processed balls + pendingBalls.clear(); +} + +void Engine::HandleMouseHover(float mouseX, float mouseY) { + // Update current mouse position for any systems that might need it + currentMouseX = mouseX; + currentMouseY = mouseY; +} + +#if defined(PLATFORM_ANDROID) +// Android-specific implementation +bool Engine::InitializeAndroid(android_app* app, const std::string& appName, bool enableValidationLayers, bool useXR) { + // Create platform + platform = CreatePlatform(app); + if (!platform->Initialize(appName, 0, 0)) { + return false; + } + + // Set resize callback + platform->SetResizeCallback([this](int width, int height) { + HandleResize(width, height); + }); + + // Set mouse callback + platform->SetMouseCallback([this](float x, float y, uint32_t buttons) { + // Check if ImGui wants to capture mouse input first + bool imguiWantsMouse = imguiSystem && imguiSystem->WantCaptureMouse(); + + if (!imguiWantsMouse) { + // Handle mouse click for ball throwing (right 
mouse button) + if (buttons & 2) { + // Right mouse button (bit 1) + if (!cameraControl.mouseRightPressed) { + cameraControl.mouseRightPressed = true; + // Throw a ball on mouse click + ThrowBall(x, y); + } + } else { + cameraControl.mouseRightPressed = false; + } + } + + if (imguiSystem) { + imguiSystem->HandleMouse(x, y, buttons); + } + }); + + // Set keyboard callback + platform->SetKeyboardCallback([this](uint32_t key, bool pressed) { + if (imguiSystem) { + imguiSystem->HandleKeyboard(key, pressed); + } + }); + + // Set char callback + platform->SetCharCallback([this](uint32_t c) { + if (imguiSystem) { + imguiSystem->HandleChar(c); + } + }); + + // Create renderer + renderer = std::make_unique(platform.get()); + if (useXR) { + renderer->SetAndroidApp(app); + } + if (!renderer->Initialize(appName, enableValidationLayers, useXR)) { + return false; + } + + // Get window dimensions from platform for ImGui initialization + int width, height; + platform->GetWindowSize(&width, &height); + + try { + // Model loader via constructor; also wire into renderer + modelLoader = std::make_unique(renderer.get()); + renderer->SetModelLoader(modelLoader.get()); + + // Audio system via constructor + audioSystem = std::make_unique(this, renderer.get()); + + // Physics system via constructor (GPU enabled) + physicsSystem = std::make_unique(renderer.get(), true); + + // ImGui via constructor, then connect audio system + imguiSystem = std::make_unique(renderer.get(), width, height); + imguiSystem->SetAudioSystem(audioSystem.get()); + } catch (const std::exception& e) { + std::cerr << "Subsystem initialization failed: " << e.what() << std::endl; + return false; + } + + // Generate ball material properties once at load time + GenerateBallMaterial(); + + // Initialize physics scaling system + InitializePhysicsScaling(); + + initialized = true; + return true; +} + +void Engine::RunAndroid() { + Run(); +} +#endif diff --git a/attachments/openxr_engine/engine.h 
b/attachments/openxr_engine/engine.h new file mode 100644 index 00000000..65567b44 --- /dev/null +++ b/attachments/openxr_engine/engine.h @@ -0,0 +1,417 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "audio_system.h" +#include "camera_component.h" +#include "entity.h" +#include "imgui_system.h" +#include "model_loader.h" +#include "physics_system.h" +#include "platform.h" +#include "renderer.h" +#include "resource_manager.h" + +/** + * @brief Main engine class that manages the game loop and subsystems. + * + * This class implements the core engine architecture as described in the Engine_Architecture chapter: + * @see en/Building_a_Simple_Engine/Engine_Architecture/02_architectural_patterns.adoc + */ +class Engine +{ + public: + using TimeDelta = std::chrono::milliseconds; + /** + * @brief Default constructor. + */ + Engine(); + + /** + * @brief Destructor for proper cleanup. + */ + ~Engine(); + + /** + * @brief Initialize the engine. + * @param appName The name of the application. + * @param width The width of the window. + * @param height The height of the window. + * @param enableValidationLayers Whether to enable Vulkan validation layers. + * @return True if initialization was successful, false otherwise. 
+ */ + bool Initialize(const std::string &appName, int width, int height, bool enableValidationLayers = true, bool useXR = false); + + /** + * @brief Run the main game loop. + */ + void Run(); + + /** + * @brief Clean up engine resources. + */ + void Cleanup(); + + /** + * @brief Create a new entity. + * @param name The name of the entity. + * @return A pointer to the newly created entity. + */ + Entity *CreateEntity(const std::string &name); + + /** + * @brief Get an entity by name. + * @param name The name of the entity. + * @return A pointer to the entity, or nullptr if not found. + */ + Entity *GetEntity(const std::string &name); + + /** + * @brief Get all entities. + * @return A const reference to the vector of entities. + */ + const std::vector> &GetEntities() const + { + return entities; + } + + /** + * @brief Remove an entity. + * @param entity The entity to remove. + * @return True if the entity was removed, false otherwise. + */ + bool RemoveEntity(Entity *entity); + + /** + * @brief Remove an entity by name. + * @param name The name of the entity to remove. + * @return True if the entity was removed, false otherwise. + */ + bool RemoveEntity(const std::string &name); + + /** + * @brief Set the active camera. + * @param cameraComponent The camera component to set as active. + */ + void SetActiveCamera(CameraComponent *cameraComponent); + + /** + * @brief Get the active camera. + * @return A pointer to the active camera component, or nullptr if none is set. + */ + const CameraComponent *GetActiveCamera() const; + + /** + * @brief Get the resource manager. + * @return A pointer to the resource manager. + */ + const ResourceManager *GetResourceManager() const; + + /** + * @brief Get the platform. + * @return A pointer to the platform. + */ + const Platform *GetPlatform() const; + + /** + * @brief Get the renderer. + * @return A pointer to the renderer. + */ + Renderer *GetRenderer(); + + /** + * @brief Get the model loader. 
+ * @return A pointer to the model loader. + */ + ModelLoader *GetModelLoader(); + + /** + * @brief Get the audio system. + * @return A pointer to the audio system. + */ + const AudioSystem *GetAudioSystem() const; + + /** + * @brief Get the physics system. + * @return A pointer to the physics system. + */ + PhysicsSystem *GetPhysicsSystem(); + + /** + * @brief Get the ImGui system. + * @return A pointer to the ImGui system. + */ + const ImGuiSystem *GetImGuiSystem() const; + + /** + * @brief Handles mouse input for interaction and camera control. + * + * This method processes mouse input for various functionalities, including interacting with the scene, + * camera rotation, and delegating handling to ImGui or hover systems. + * + * @param x The x-coordinate of the mouse position. + * @param y The y-coordinate of the mouse position. + * @param buttons A bitmask representing the state of mouse buttons. + * Bit 0 corresponds to the left button, and Bit 1 corresponds to the right button. + */ + void handleMouseInput(float x, float y, uint32_t buttons); + + /** + * @brief Handles keyboard input events for controlling the camera and other subsystems. + * + * This method processes key press and release events to update the camera's movement state. + * It also forwards the input to other subsystems like the ImGui interface if applicable. + * + * @param key The key code of the keyboard input. + * @param pressed Indicates whether the key is pressed (true) or released (false). + */ + void handleKeyInput(uint32_t key, bool pressed); + +#if defined(PLATFORM_ANDROID) +/** + * @brief Initialize the engine for Android. + * @param app The Android app. + * @param appName The name of the application. + * @param enableValidationLayers Whether to enable Vulkan validation layers. + * @return True if initialization was successful, false otherwise. 
+ */ +# if defined(NDEBUG) + bool InitializeAndroid(android_app *app, const std::string &appName, bool enableValidationLayers = false, bool useXR = false); +# else + bool InitializeAndroid(android_app *app, const std::string &appName, bool enableValidationLayers = true, bool useXR = false); +# endif + + /** + * @brief Run the engine on Android. + */ + void RunAndroid(); +#endif + + private: + // Subsystems + std::unique_ptr platform; + std::unique_ptr renderer; + std::unique_ptr resourceManager; + std::unique_ptr modelLoader; + std::unique_ptr audioSystem; + std::unique_ptr physicsSystem; + std::unique_ptr imguiSystem; + + // Entities + // NOTE: Entities can be created from a background loading thread (see `main.cpp`). + // Protect the containers to avoid iterator invalidation/data races while the render thread + // iterates them. + mutable std::shared_mutex entitiesMutex; + std::vector> entities; + std::unordered_map entityMap; + + // Main thread identity (used to defer destructive operations from background threads) + std::thread::id mainThreadId{}; + + // Background threads may request entity removal while the render thread is iterating snapshots. + // To keep `Entity*` snapshots safe, defer removals to the main thread at a safe point. 
+ std::mutex pendingEntityRemovalsMutex; + std::vector pendingEntityRemovalNames; + void ProcessPendingEntityRemovals(); + bool IsMainThread() const; + void UpdateFPSWindowTitle(); + + // Active camera + CameraComponent *activeCamera = nullptr; + + // Engine state + bool initialized = false; + bool running = false; + + // Delta time calculation + // deltaTimeMs: time since last frame in milliseconds (for clarity) + std::chrono::milliseconds deltaTimeMs{0}; + uint64_t lastFrameTimeMs = 0; + + // Frame counter and FPS calculation + uint64_t frameCount = 0; + float fpsUpdateTimer = 0.0f; + float currentFPS = 0.0f; + uint64_t lastFPSUpdateFrame = 0; + + // Camera control state + struct CameraControlState + { + bool moveForward = false; + bool moveBackward = false; + bool moveLeft = false; + bool moveRight = false; + bool moveUp = false; + bool moveDown = false; + bool mouseLeftPressed = false; + bool mouseRightPressed = false; + float lastMouseX = 0.0f; + float lastMouseY = 0.0f; + float yaw = 0.0f; + float pitch = 0.0f; + bool firstMouse = true; + float cameraSpeed = 5.0f; + float mouseSensitivity = 0.1f; + bool baseOrientationCaptured = false; + glm::quat baseOrientation{1.0f, 0.0f, 0.0f, 0.0f}; + } cameraControl; + + // Mouse position tracking + float currentMouseX = 0.0f; + float currentMouseY = 0.0f; + + // Ball material properties for PBR + struct BallMaterial + { + glm::vec3 albedo; + float metallic; + float roughness; + float ao; + glm::vec3 emissive; + float bounciness; + }; + + BallMaterial ballMaterial; + + // Physics scaling configuration + // The bistro scene spans roughly 20 game units and represents a realistic cafe/bistro space + // Based on issue feedback: game units should NOT equal 1m and need proper scaling + // Analysis shows bistro geometry pieces are much smaller than assumed + struct PhysicsScaling + { + float gameUnitsToMeters = 0.1f; // 1 game unit = 0.1 meter (10cm) - more realistic scale + float physicsTimeScale = 1.0f; // Normal time scale 
for stable physics + float forceScale = 2.0f; // Much reduced force scaling for visual gameplay (was 10.0f) + float gravityScale = 0.1f; // Scaled gravity for smaller world scale + }; + + PhysicsScaling physicsScaling; + + // Pending ball creation data + struct PendingBall + { + glm::vec3 spawnPosition; + glm::vec3 throwDirection; + float throwForce; + glm::vec3 randomSpin; + std::string ballName; + }; + + std::vector pendingBalls; + + /** + * @brief Update the engine state. + * @param deltaTime The time elapsed since the last update. + */ + // Accepts a time delta in milliseconds for clarity + void Update(TimeDelta deltaTime); + + /** + * @brief Update the engine state for XR. + * @param predictedTime The predicted display time from OpenXR. + */ + void Update(XrTime predictedTime); + + /** + * @brief Render the scene. + */ + void Render(); + + /** + * @brief Render the scene for XR. + * @param predictedTime The predicted display time from OpenXR. + */ + void Render(XrTime predictedTime); + + /** + * @brief Calculate the time delta between frames. + * @return The delta time in milliseconds (steady_clock based). + */ + std::chrono::milliseconds CalculateDeltaTimeMs(); + + /** + * @brief Handle window resize events. + * @param width The new width of the window. + * @param height The new height of the window. + */ + void HandleResize(int width, int height) const; + + /** + * @brief Update camera controls based on input state. + * @param deltaTime The time elapsed since the last update. + */ + void UpdateCameraControls(TimeDelta deltaTime); + + /** + * @brief Generate random PBR material properties for the ball. + */ + void GenerateBallMaterial(); + + /** + * @brief Initialize physics scaling based on scene analysis. + */ + void InitializePhysicsScaling(); + + /** + * @brief Convert a force value from game units to physics units. + * @param gameForce Force in game units. + * @return Force scaled for physics simulation. 
+ */ + float ScaleForceForPhysics(float gameForce) const; + + /** + * @brief Convert gravity from real-world units to game physics units. + * @param realWorldGravity Gravity in m/s². + * @return Gravity scaled for game physics. + */ + glm::vec3 ScaleGravityForPhysics(const glm::vec3 &realWorldGravity) const; + + /** + * @brief Convert time delta for physics simulation. + * @param deltaTime Real delta time. + * @return Scaled delta time for physics. + */ + float ScaleTimeForPhysics(float deltaTime) const; + + /** + * @brief Throw a ball into the scene with random properties. + * @param mouseX The x-coordinate of the mouse click. + * @param mouseY The y-coordinate of the mouse click. + */ + void ThrowBall(float mouseX, float mouseY); + + /** + * @brief Process pending ball creations outside the rendering loop. + */ + void ProcessPendingBalls(); + + /** + * @brief Handle mouse hover to track current mouse position. + * @param mouseX The x-coordinate of the mouse position. + * @param mouseY The y-coordinate of the mouse position. + */ + void HandleMouseHover(float mouseX, float mouseY); +}; diff --git a/attachments/openxr_engine/install_dependencies_linux.sh b/attachments/openxr_engine/install_dependencies_linux.sh new file mode 100755 index 00000000..276fb42f --- /dev/null +++ b/attachments/openxr_engine/install_dependencies_linux.sh @@ -0,0 +1,39 @@ +#!/bin/bash +set -e +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +SIMPLE_ENGINE_DIR="$SCRIPT_DIR/../simple_engine" + +if [ ! -d "$SIMPLE_ENGINE_DIR" ]; then + echo "Error: simple_engine directory not found at $SIMPLE_ENGINE_DIR" + exit 1 +fi + +# Run simple engine dependencies first +# We pass the work root to avoid conflicts if needed, but the script uses a fixed one anyway. +bash "$SIMPLE_ENGINE_DIR/install_dependencies_linux.sh" + +# Install OpenXR +if [ -f /etc/os-release ]; then + . /etc/os-release + OS=$NAME +fi + +echo "Installing OpenXR dependencies for $OS..." 
+ +case "$OS" in + "Ubuntu"* | "Debian"* | "Linux Mint"*) + sudo apt-get update + sudo apt-get install -y libopenxr-dev + ;; + "Fedora"* | "Red Hat"* | "CentOS"* | "Rocky"*) + sudo dnf install -y openxr-devel + ;; + "Arch"* | "Manjaro"*) + sudo pacman -S --noconfirm openxr + ;; + *) + echo "Warning: Unsupported OS for automatic OpenXR installation. Please install OpenXR SDK manually." + ;; +esac + +echo "OpenXR dependencies installation completed!" diff --git a/attachments/openxr_engine/install_dependencies_windows.bat b/attachments/openxr_engine/install_dependencies_windows.bat new file mode 100644 index 00000000..8ce0334c --- /dev/null +++ b/attachments/openxr_engine/install_dependencies_windows.bat @@ -0,0 +1,37 @@ +@echo off +set SCRIPT_DIR=%~dp0 +set SIMPLE_ENGINE_DIR=%SCRIPT_DIR%..\simple_engine + +if not exist "%SIMPLE_ENGINE_DIR%" ( + echo Error: simple_engine directory not found at %SIMPLE_ENGINE_DIR% + exit /b 1 +) + +:: Ensure vcpkg is accessible +where vcpkg >nul 2>nul +if %ERRORLEVEL% neq 0 ( + if defined VCPKG_INSTALLATION_ROOT ( + if exist "%VCPKG_INSTALLATION_ROOT%\vcpkg.exe" ( + set "PATH=%VCPKG_INSTALLATION_ROOT%;%PATH%" + ) + ) +) + +echo Calling simple_engine dependencies installer... +call "%SIMPLE_ENGINE_DIR%\install_dependencies_windows.bat" +if %ERRORLEVEL% neq 0 ( + echo Error: simple_engine dependencies installation failed. + exit /b %ERRORLEVEL% +) + +echo. +echo Installing OpenXR loader... +vcpkg install openxr-loader --triplet=x64-windows +if %ERRORLEVEL% neq 0 ( + echo Error: Failed to install openxr-loader. + exit /b %ERRORLEVEL% +) + +echo. +echo OpenXR dependencies installation completed! 
+exit /b 0 diff --git a/attachments/openxr_engine/main.cpp b/attachments/openxr_engine/main.cpp new file mode 100644 index 00000000..ee822408 --- /dev/null +++ b/attachments/openxr_engine/main.cpp @@ -0,0 +1,157 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "camera_component.h" +#include "crash_reporter.h" +#include "engine.h" +#include "scene_loading.h" +#include "transform_component.h" +#include "xr_context.h" + +#include +#include +#include + +// Constants +constexpr int WINDOW_WIDTH = 800; +constexpr int WINDOW_HEIGHT = 600; +#if defined(NDEBUG) +constexpr bool ENABLE_VALIDATION_LAYERS = false; +#else +constexpr bool ENABLE_VALIDATION_LAYERS = true; +#endif + +/** + * @brief Set up a simple scene with a camera and some objects. + * @param engine The engine to set up the scene in. 
+ */ +void SetupScene(Engine *engine) +{ + // Create a camera entity + Entity *cameraEntity = engine->CreateEntity("Camera"); + if (!cameraEntity) + { + throw std::runtime_error("Failed to create camera entity"); + } + + // Add a transform component to the camera + auto *cameraTransform = cameraEntity->AddComponent(); + cameraTransform->SetPosition(glm::vec3(0.0f, 0.0f, 3.0f)); + + // Add a camera component to the camera entity + auto *camera = cameraEntity->AddComponent(); + camera->SetAspectRatio(static_cast(WINDOW_WIDTH) / static_cast(WINDOW_HEIGHT)); + + // Set the camera as the active camera + engine->SetActiveCamera(camera); + + // Kick off GLTF model loading on a background thread so the main loop + // can start and render the UI/progress bar while the scene is being + // constructed. Engine::Update will avoid updating entities while + // loading is in progress to prevent data races. + if (auto *renderer = engine->GetRenderer()) + { + renderer->SetLoading(true); + renderer->SetLoadingPhase(Renderer::LoadingPhase::Textures); + } + std::thread([engine] { + LoadGLTFModel(engine, "../Assets/bistro/bistro.gltf"); + }).detach(); +} + +#if defined(PLATFORM_ANDROID) +/** + * @brief Android entry point. + * @param app The Android app. + */ +void android_main(android_app *app) +{ + try + { + // Create the engine + Engine engine; + + // Test if we are able to run in XR mode + bool useXR = false; + if (XrContext::checkRuntimeAvailable()) { + std::cout << "OpenXR Runtime detected. Attempting spatial initialization..." << std::endl; + useXR = true; + } + + // Initialize the engine + if (!engine.InitializeAndroid(app, "Simple Engine", ENABLE_VALIDATION_LAYERS, useXR)) + { + throw std::runtime_error("Failed to initialize engine"); + } + + // Set up the scene + SetupScene(&engine); + + // Run the engine + engine.RunAndroid(); + } + catch (const std::exception &e) + { + LOGE("Exception: %s", e.what()); + } +} +#else +/** + * @brief Desktop entry point. 
+ * @return The exit code. + */ +int main(int, char *[]) +{ + try + { + // Enable minidump generation for Release-only crashes (e.g., stack cookie failures / fast-fail). + // Writes dumps under the current working directory (the build/run directory). + CrashReporter::GetInstance().Initialize("crashes", "SimpleEngine", "1.0.0"); + + // Create the engine + Engine engine; + + // Test if we are able to run in XR mode + bool useXR = false; + if (XrContext::checkRuntimeAvailable()) { + std::cout << "OpenXR Runtime detected. Attempting spatial initialization..." << std::endl; + useXR = true; + } + + // Initialize the engine + if (!engine.Initialize("Simple Engine", WINDOW_WIDTH, WINDOW_HEIGHT, ENABLE_VALIDATION_LAYERS, useXR)) + { + throw std::runtime_error("Failed to initialize engine"); + } + + // Set up the scene + SetupScene(&engine); + + // Run the engine + engine.Run(); + + CrashReporter::GetInstance().Cleanup(); + + return 0; + } + catch (const std::exception &e) + { + std::cerr << "Exception: " << e.what() << std::endl; + CrashReporter::GetInstance().Cleanup(); + return 1; + } +} +#endif diff --git a/attachments/openxr_engine/memory_pool.cpp b/attachments/openxr_engine/memory_pool.cpp new file mode 100644 index 00000000..32c60333 --- /dev/null +++ b/attachments/openxr_engine/memory_pool.cpp @@ -0,0 +1,589 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "memory_pool.h" +#include +#include +#include +#include + +MemoryPool::MemoryPool(const vk::raii::Device& device, const vk::raii::PhysicalDevice& physicalDevice) : device(device), physicalDevice(physicalDevice) { +} + +MemoryPool::~MemoryPool() { + // RAII will handle cleanup automatically + std::lock_guard lock(poolMutex); + pools.clear(); +} + +bool MemoryPool::initialize() { + std::lock_guard lock(poolMutex); + + try { + // Configure default pool settings based on typical usage patterns + + // Vertex buffer pool: Large allocations, device-local (increased for large models like bistro) + configurePool( + PoolType::VERTEX_BUFFER, + 128 * 1024 * 1024, + // 128MB blocks (doubled) + 4096, + // 4KB allocation units + vk::MemoryPropertyFlagBits::eDeviceLocal); + + // Index buffer pool: Medium allocations, device-local (increased for large models like bistro) + configurePool( + PoolType::INDEX_BUFFER, + 64 * 1024 * 1024, + // 64MB blocks (doubled) + 2048, + // 2KB allocation units + vk::MemoryPropertyFlagBits::eDeviceLocal); + + // Uniform buffer pool: Small allocations, host-visible + // Use 64-byte alignment to match nonCoherentAtomSize and prevent validation errors + configurePool( + PoolType::UNIFORM_BUFFER, + 4 * 1024 * 1024, + // 4MB blocks + 64, + // 64B allocation units (aligned to nonCoherentAtomSize) + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + // Staging buffer pool: Variable allocations, host-visible + // Use 64-byte alignment to match nonCoherentAtomSize and prevent validation errors + configurePool( + PoolType::STAGING_BUFFER, + 16 * 1024 * 1024, + // 16MB blocks + 64, + // 64B allocation units (aligned to nonCoherentAtomSize) + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + // Texture image pool: Use moderate block sizes to reduce allocation failures on mid-range GPUs + configurePool( + PoolType::TEXTURE_IMAGE, + 64 * 1024 * 1024, + // 64MB blocks 
(smaller blocks reduce contiguous allocation pressure) + 4096, + // 4KB allocation units + vk::MemoryPropertyFlagBits::eDeviceLocal); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to initialize memory pool: " << e.what() << std::endl; + return false; + } +} + +void MemoryPool::configurePool( + const PoolType poolType, + const vk::DeviceSize blockSize, + const vk::DeviceSize allocationUnit, + const vk::MemoryPropertyFlags properties) { + PoolConfig config; + config.blockSize = blockSize; + config.allocationUnit = allocationUnit; + config.properties = properties; + + poolConfigs[poolType] = config; +} + +uint32_t MemoryPool::findMemoryType(const uint32_t typeFilter, const vk::MemoryPropertyFlags properties) const { + const vk::PhysicalDeviceMemoryProperties memProperties = physicalDevice.getMemoryProperties(); + + for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { + if ((typeFilter & (1 << i)) && + (memProperties.memoryTypes[i].propertyFlags & properties) == properties) { + return i; + } + } + + throw std::runtime_error("Failed to find suitable memory type"); +} + +std::unique_ptr MemoryPool::createMemoryBlock(PoolType poolType, vk::DeviceSize size, vk::MemoryAllocateFlags allocFlags) { + auto configIt = poolConfigs.find(poolType); + if (configIt == poolConfigs.end()) { + throw std::runtime_error("Pool type not configured"); + } + + const PoolConfig& config = configIt->second; + + // Use the larger of the requested size or configured block size + const vk::DeviceSize blockSize = std::max(size, config.blockSize); + + // Create a dummy buffer to get memory requirements for the memory type + vk::BufferCreateInfo bufferInfo{ + .size = blockSize, + .usage = vk::BufferUsageFlagBits::eVertexBuffer | vk::BufferUsageFlagBits::eIndexBuffer | + vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eTransferSrc | + vk::BufferUsageFlagBits::eTransferDst, + .sharingMode = vk::SharingMode::eExclusive + }; + + vk::raii::Buffer 
dummyBuffer(device, bufferInfo); + vk::MemoryRequirements memRequirements = dummyBuffer.getMemoryRequirements(); + + uint32_t memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, config.properties); + + // Allocate the memory block using the device-required size + vk::MemoryAllocateInfo allocInfo{ + .allocationSize = memRequirements.size, + .memoryTypeIndex = memoryTypeIndex + }; + + // Add allocation flags (e.g., VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT) if needed + vk::MemoryAllocateFlagsInfo flagsInfo{}; + if (allocFlags != vk::MemoryAllocateFlags{}) { + flagsInfo.flags = allocFlags; + allocInfo.pNext = &flagsInfo; + } + + // Create MemoryBlock with proper initialization to avoid default constructor issues + auto block = std::unique_ptr(new MemoryBlock{ + .memory = vk::raii::DeviceMemory(device, allocInfo), + .size = memRequirements.size, + .used = 0, + .memoryTypeIndex = memoryTypeIndex, + .isMapped = false, + .mappedPtr = nullptr, + .freeList = {}, + .allocationUnit = config.allocationUnit + }); + + // Map memory if it's host-visible + block->isMapped = (config.properties & vk::MemoryPropertyFlagBits::eHostVisible) != vk::MemoryPropertyFlags{}; + if (block->isMapped) { + block->mappedPtr = block->memory.mapMemory(0, memRequirements.size); + } else { + block->mappedPtr = nullptr; + } + + // Initialize a free list based on the actual allocated size + const size_t numUnits = static_cast(block->size / config.allocationUnit); + block->freeList.resize(numUnits, true); // All units initially free + + return block; +} + +std::unique_ptr MemoryPool::createMemoryBlockWithType(PoolType poolType, vk::DeviceSize size, uint32_t memoryTypeIndex, vk::MemoryAllocateFlags allocFlags) { + auto configIt = poolConfigs.find(poolType); + if (configIt == poolConfigs.end()) { + throw std::runtime_error("Pool type not configured"); + } + const PoolConfig& config = configIt->second; + + // Allocate the memory block with the exact requested size + vk::MemoryAllocateInfo 
allocInfo{ + .allocationSize = size, + .memoryTypeIndex = memoryTypeIndex + }; + + // Add allocation flags (e.g., VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT) if needed + vk::MemoryAllocateFlagsInfo flagsInfo{}; + if (allocFlags != vk::MemoryAllocateFlags{}) { + flagsInfo.flags = allocFlags; + allocInfo.pNext = &flagsInfo; + } + + // Determine properties from the chosen memory type + const auto memProps = physicalDevice.getMemoryProperties(); + if (memoryTypeIndex >= memProps.memoryTypeCount) { + throw std::runtime_error("Invalid memoryTypeIndex for createMemoryBlockWithType"); + } + const vk::MemoryPropertyFlags typeProps = memProps.memoryTypes[memoryTypeIndex].propertyFlags; + + auto block = std::unique_ptr(new MemoryBlock{ + .memory = vk::raii::DeviceMemory(device, allocInfo), + .size = size, + .used = 0, + .memoryTypeIndex = memoryTypeIndex, + .isMapped = false, + .mappedPtr = nullptr, + .freeList = {}, + .allocationUnit = config.allocationUnit + }); + + block->isMapped = (typeProps & vk::MemoryPropertyFlagBits::eHostVisible) != vk::MemoryPropertyFlags{}; + if (block->isMapped) { + block->mappedPtr = block->memory.mapMemory(0, size); + } + + const size_t numUnits = static_cast(block->size / config.allocationUnit); + block->freeList.resize(numUnits, true); + + return block; +} + +std::pair MemoryPool::findSuitableBlock(PoolType poolType, vk::DeviceSize size, vk::DeviceSize alignment) { + auto poolIt = pools.find(poolType); + if (poolIt == pools.end()) { + poolIt = pools.try_emplace(poolType).first; + } + + auto& poolBlocks = poolIt->second; + const PoolConfig& config = poolConfigs[poolType]; + + // Calculate required units (accounting for size alignment) + const vk::DeviceSize alignedSize = ((size + alignment - 1) / alignment) * alignment; + const size_t requiredUnits = static_cast((alignedSize + config.allocationUnit - 1) / config.allocationUnit); + + // Search existing blocks for sufficient free space with proper offset alignment + for (const auto& block : 
poolBlocks) { + const vk::DeviceSize unit = config.allocationUnit; + const size_t totalUnits = block->freeList.size(); + + size_t i = 0; + while (i < totalUnits) { + // Ensure starting unit produces an offset aligned to 'alignment' + vk::DeviceSize startOffset = static_cast(i) * unit; + if ((alignment > 0) && (startOffset % alignment != 0)) { + // Advance i to the next unit that aligns with 'alignment' + const vk::DeviceSize remainder = startOffset % alignment; + const vk::DeviceSize advanceBytes = alignment - remainder; + const size_t advanceUnits = static_cast((advanceBytes + unit - 1) / unit); + i += std::max(advanceUnits, 1); + continue; + } + + // From aligned i, check for consecutive free units + size_t consecutiveFree = 0; + size_t j = i; + while (j < totalUnits && block->freeList[j] && consecutiveFree < requiredUnits) { + ++consecutiveFree; + ++j; + } + + if (consecutiveFree >= requiredUnits) { + return {block.get(), i}; + } + + // Move past the checked range + i = (j > i) ? j : (i + 1); + } + } + + // No suitable block found; create a new one on demand (no hard limits, allowed during rendering) + try { + auto newBlock = createMemoryBlock(poolType, alignedSize); + poolBlocks.push_back(std::move(newBlock)); + std::cout << "Created new memory block (pool type: " + << static_cast(poolType) << ")" << std::endl; + return {poolBlocks.back().get(), 0}; + } catch (const std::exception& e) { + std::cerr << "Failed to create new memory block: " << e.what() << std::endl; + return {nullptr, 0}; + } +} + +std::unique_ptr MemoryPool::allocate(PoolType poolType, vk::DeviceSize size, vk::DeviceSize alignment) { + std::lock_guard lock(poolMutex); + + auto [block, startUnit] = findSuitableBlock(poolType, size, alignment); + if (!block) { + return nullptr; + } + + const PoolConfig& config = poolConfigs[poolType]; + + // Calculate required units (accounting for alignment) + const vk::DeviceSize alignedSize = ((size + alignment - 1) / alignment) * alignment; + const size_t 
requiredUnits = (alignedSize + config.allocationUnit - 1) / config.allocationUnit; + + // Mark units as used + for (size_t i = startUnit; i < startUnit + requiredUnits; ++i) { + block->freeList[i] = false; + } + + // Create allocation info + auto allocation = std::make_unique(); + allocation->memory = *block->memory; + allocation->offset = startUnit * config.allocationUnit; + allocation->size = alignedSize; + allocation->memoryTypeIndex = block->memoryTypeIndex; + allocation->isMapped = block->isMapped; + allocation->mappedPtr = block->isMapped ? static_cast(block->mappedPtr) + allocation->offset : nullptr; + + block->used += alignedSize; + + return allocation; +} + +void MemoryPool::deallocate(std::unique_ptr allocation) { + if (!allocation) { + return; + } + + std::lock_guard lock(poolMutex); + + // Find the block that contains this allocation + for (auto& [poolType, poolBlocks] : pools) { + const PoolConfig& config = poolConfigs[poolType]; + + for (auto& block : poolBlocks) { + if (*block->memory == allocation->memory) { + // Calculate which units to free + size_t startUnit = allocation->offset / config.allocationUnit; + size_t numUnits = (allocation->size + config.allocationUnit - 1) / config.allocationUnit; + + // Mark units as free + for (size_t i = startUnit; i < startUnit + numUnits; ++i) { + block->freeList[i] = true; + } + + block->used -= allocation->size; + return; + } + } + } + + std::cerr << "Warning: Could not find memory block for deallocation" << std::endl; +} + +std::pair> MemoryPool::createBuffer( + const vk::DeviceSize size, + const vk::BufferUsageFlags usage, + const vk::MemoryPropertyFlags properties) { + // Determine a pool type based on usage and properties + PoolType poolType = PoolType::VERTEX_BUFFER; + + // Check for host-visible requirements first (for instance buffers and staging) + if (properties & vk::MemoryPropertyFlagBits::eHostVisible) { + poolType = PoolType::STAGING_BUFFER; + } else if (usage & 
vk::BufferUsageFlagBits::eVertexBuffer) { + poolType = PoolType::VERTEX_BUFFER; + } else if (usage & vk::BufferUsageFlagBits::eIndexBuffer) { + poolType = PoolType::INDEX_BUFFER; + } else if (usage & vk::BufferUsageFlagBits::eUniformBuffer) { + poolType = PoolType::UNIFORM_BUFFER; + } + + // Create the buffer + const vk::BufferCreateInfo bufferInfo{ + .size = size, + .usage = usage, + .sharingMode = vk::SharingMode::eExclusive + }; + + vk::raii::Buffer buffer(device, bufferInfo); + + // Get memory requirements + vk::MemoryRequirements memRequirements = buffer.getMemoryRequirements(); + + // Check if buffer requires device address support (for ray tracing) + const bool needsDeviceAddress = (usage & vk::BufferUsageFlagBits::eShaderDeviceAddress) != vk::BufferUsageFlags{}; + + std::unique_ptr allocation; + + if (needsDeviceAddress) { + // Buffers with device address usage require VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT flag + // Create a dedicated memory block for this buffer (similar to image allocation) + uint32_t memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties); + + std::lock_guard lock(poolMutex); + auto poolIt = pools.find(poolType); + if (poolIt == pools.end()) { + poolIt = pools.try_emplace(poolType).first; + } + auto& poolBlocks = poolIt->second; + auto block = createMemoryBlockWithType(poolType, + memRequirements.size, + memoryTypeIndex, + vk::MemoryAllocateFlagBits::eDeviceAddress); + + // Prepare allocation that uses the new block from offset 0 + allocation = std::make_unique(); + allocation->memory = *block->memory; + allocation->offset = 0; + allocation->size = memRequirements.size; + allocation->memoryTypeIndex = memoryTypeIndex; + allocation->isMapped = block->isMapped; + allocation->mappedPtr = block->mappedPtr; + + // Mark the entire block as used + block->used = memRequirements.size; + const size_t units = block->freeList.size(); + for (size_t i = 0; i < units; ++i) { + block->freeList[i] = false; + } + + // Keep the block 
owned by the pool for lifetime management + poolBlocks.push_back(std::move(block)); + } else { + // Normal pooled allocation path + allocation = allocate(poolType, memRequirements.size, memRequirements.alignment); + if (!allocation) { + throw std::runtime_error("Failed to allocate memory from pool"); + } + } + + // Bind memory to buffer + buffer.bindMemory(allocation->memory, allocation->offset); + + return {std::move(buffer), std::move(allocation)}; +} + +std::pair> MemoryPool::createImage( + uint32_t width, + uint32_t height, + vk::Format format, + vk::ImageTiling tiling, + vk::ImageUsageFlags usage, + vk::MemoryPropertyFlags properties, + uint32_t mipLevels, + vk::SharingMode sharingMode, + const std::vector& queueFamilyIndices, + uint32_t arrayLayers) { + // Create the image + vk::ImageCreateInfo imageInfo{ + .imageType = vk::ImageType::e2D, + .format = format, + .extent = {width, height, 1}, + .mipLevels = std::max(1u, mipLevels), + .arrayLayers = arrayLayers, + .samples = vk::SampleCountFlagBits::e1, + .tiling = tiling, + .usage = usage, + .sharingMode = sharingMode, + .initialLayout = vk::ImageLayout::eUndefined + }; + + // If concurrent sharing is requested, provide queue family indices + std::vector fam = queueFamilyIndices; + if (sharingMode == vk::SharingMode::eConcurrent && !fam.empty()) { + imageInfo.queueFamilyIndexCount = static_cast(fam.size()); + imageInfo.pQueueFamilyIndices = fam.data(); + } + + vk::raii::Image image(device, imageInfo); + + // Get memory requirements for this image + vk::MemoryRequirements memRequirements = image.getMemoryRequirements(); + + // Pick a memory type compatible with this image + uint32_t memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties); + + // Create a dedicated memory block for this image with the exact type and size + std::unique_ptr allocation; { + std::lock_guard lock(poolMutex); + auto poolIt = pools.find(PoolType::TEXTURE_IMAGE); + if (poolIt == pools.end()) { + poolIt = 
pools.try_emplace(PoolType::TEXTURE_IMAGE).first; + } + auto& poolBlocks = poolIt->second; + auto block = createMemoryBlockWithType(PoolType::TEXTURE_IMAGE, memRequirements.size, memoryTypeIndex); + + // Prepare allocation that uses the new block from offset 0 + allocation = std::make_unique(); + allocation->memory = *block->memory; + allocation->offset = 0; + allocation->size = memRequirements.size; + allocation->memoryTypeIndex = memoryTypeIndex; + allocation->isMapped = block->isMapped; + allocation->mappedPtr = block->mappedPtr; + + // Mark the entire block as used + block->used = memRequirements.size; + const size_t units = block->freeList.size(); + for (size_t i = 0; i < units; ++i) { + block->freeList[i] = false; + } + + // Keep the block owned by the pool for lifetime management and deallocation support + poolBlocks.push_back(std::move(block)); + } + + // Bind memory to image + image.bindMemory(allocation->memory, allocation->offset); + + return {std::move(image), std::move(allocation)}; +} + +std::pair MemoryPool::getMemoryUsage(PoolType poolType) const { + std::lock_guard lock(poolMutex); + + auto poolIt = pools.find(poolType); + if (poolIt == pools.end()) { + return {0, 0}; + } + + auto [used, total] = std::accumulate( + poolIt->second.begin(), + poolIt->second.end(), + std::pair{0, 0}, + [](const auto& acc, const auto& block) { + return std::pair{acc.first + block->used, acc.second + block->size}; + }); + + return {used, total}; +} + +std::pair MemoryPool::getTotalMemoryUsage() const { + std::lock_guard lock(poolMutex); + + vk::DeviceSize totalUsed = 0; + vk::DeviceSize totalAllocated = 0; + + for (const auto& [poolType, poolBlocks] : pools) { + for (const auto& block : poolBlocks) { + totalUsed += block->used; + totalAllocated += block->size; + } + } + + return {totalUsed, totalAllocated}; +} + +bool MemoryPool::preAllocatePools() { + std::lock_guard lock(poolMutex); + + try { + std::cout << "Pre-allocating initial memory blocks for pools..." 
<< std::endl; + + // Pre-allocate at least one block for each pool type + for (const auto& [poolType, config] : poolConfigs) { + auto poolIt = pools.find(poolType); + if (poolIt == pools.end()) { + poolIt = pools.try_emplace(poolType).first; + } + + auto& poolBlocks = poolIt->second; + if (poolBlocks.empty()) { + // Create initial block for this pool type + auto newBlock = createMemoryBlock(poolType, config.blockSize); + poolBlocks.push_back(std::move(newBlock)); + std::cout << " Pre-allocated block for pool type " << static_cast(poolType) << std::endl; + } + } + + std::cout << "Memory pool pre-allocation completed successfully" << std::endl; + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to pre-allocate memory pools: " << e.what() << std::endl; + return false; + } +} + +void MemoryPool::setRenderingActive(bool active) { + std::lock_guard lock(poolMutex); + renderingActive = active; +} + +bool MemoryPool::isRenderingActive() const { + std::lock_guard lock(poolMutex); + return renderingActive; +} \ No newline at end of file diff --git a/attachments/openxr_engine/memory_pool.h b/attachments/openxr_engine/memory_pool.h new file mode 100644 index 00000000..04dd1f8c --- /dev/null +++ b/attachments/openxr_engine/memory_pool.h @@ -0,0 +1,218 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +/** + * @brief Memory pool allocator for Vulkan resources + * + * This class implements a memory pool system to reduce memory fragmentation + * and improve allocation performance by pre-allocating large chunks of memory + * and sub-allocating from them. + */ +class MemoryPool +{ + public: + /** + * @brief Types of memory pools based on usage patterns + */ + enum class PoolType + { + VERTEX_BUFFER, // Device-local memory for vertex data + INDEX_BUFFER, // Device-local memory for index data + UNIFORM_BUFFER, // Host-visible memory for uniform data + STAGING_BUFFER, // Host-visible memory for staging operations + TEXTURE_IMAGE // Device-local memory for texture images + }; + + /** + * @brief Allocation information for a memory block + */ + struct Allocation + { + vk::DeviceMemory memory; // The underlying device memory + vk::DeviceSize offset; // Offset within the memory block + vk::DeviceSize size; // Size of the allocation + uint32_t memoryTypeIndex; // Memory type index + bool isMapped; // Whether the memory is persistently mapped + void *mappedPtr; // Mapped pointer (if applicable) + }; + + /** + * @brief Memory block within a pool + */ + struct MemoryBlock + { + vk::raii::DeviceMemory memory; // RAII wrapper for device memory + vk::DeviceSize size; // Total size of the block + vk::DeviceSize used; // Currently used bytes + uint32_t memoryTypeIndex; // Memory type index + bool isMapped; // Whether the block is mapped + void *mappedPtr; // Mapped pointer (if applicable) + std::vector freeList; // Free list for sub-allocations + vk::DeviceSize allocationUnit; // Size of each allocation unit + }; + + private: + const vk::raii::Device &device; + const vk::raii::PhysicalDevice &physicalDevice; + vk::PhysicalDeviceMemoryProperties memPropsCache{}; + + // Pool configurations + struct PoolConfig + { + vk::DeviceSize blockSize; // Size of each memory block + vk::DeviceSize 
allocationUnit; // Minimum allocation unit + vk::MemoryPropertyFlags properties; // Memory properties + }; + + // Memory pools for different types + std::unordered_map>> pools; + std::unordered_map poolConfigs; + + // Thread safety + mutable std::mutex poolMutex; + + // Optional rendering state flag (no allocation restrictions enforced) + bool renderingActive = false; + + // Helper methods + uint32_t findMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties) const; + std::unique_ptr createMemoryBlock(PoolType poolType, vk::DeviceSize size, vk::MemoryAllocateFlags allocFlags = {}); + // Create a memory block with an explicit memory type index (used for images requiring a specific type) + std::unique_ptr createMemoryBlockWithType(PoolType poolType, vk::DeviceSize size, uint32_t memoryTypeIndex, vk::MemoryAllocateFlags allocFlags = {}); + std::pair findSuitableBlock(PoolType poolType, vk::DeviceSize size, vk::DeviceSize alignment); + + public: + /** + * @brief Constructor + * @param device Vulkan device + * @param physicalDevice Vulkan physical device + */ + MemoryPool(const vk::raii::Device &device, const vk::raii::PhysicalDevice &physicalDevice); + + /** + * @brief Destructor + */ + ~MemoryPool(); + + /** + * @brief Initialize the memory pool with default configurations + * @return True if initialization was successful + */ + bool initialize(); + + /** + * @brief Allocate memory from a specific pool + * @param poolType Type of pool to allocate from + * @param size Size of the allocation + * @param alignment Required alignment + * @return Allocation information, or nullptr if allocation failed + */ + std::unique_ptr allocate(PoolType poolType, vk::DeviceSize size, vk::DeviceSize alignment = 1); + + /** + * @brief Free a previously allocated memory block + * @param allocation The allocation to free + */ + void deallocate(std::unique_ptr allocation); + + /** + * @brief Create a buffer using pooled memory + * @param size Size of the buffer + * @param usage 
Buffer usage flags + * @param properties Memory properties + * @return Pair of buffer and allocation info + */ + std::pair> createBuffer( + vk::DeviceSize size, + vk::BufferUsageFlags usage, + vk::MemoryPropertyFlags properties); + + /** + * @brief Create an image using pooled memory + * @param width Image width + * @param height Image height + * @param format Image format + * @param tiling Image tiling + * @param usage Image usage flags + * @param properties Memory properties + * @return Pair of image and allocation info + */ + std::pair> createImage( + uint32_t width, + uint32_t height, + vk::Format format, + vk::ImageTiling tiling, + vk::ImageUsageFlags usage, + vk::MemoryPropertyFlags properties, + uint32_t mipLevels = 1, + vk::SharingMode sharingMode = vk::SharingMode::eExclusive, + const std::vector &queueFamilyIndices = {}, + uint32_t arrayLayers = 1); + + /** + * @brief Get memory usage statistics + * @param poolType Type of pool to query + * @return Pair of (used bytes, total bytes) + */ + std::pair getMemoryUsage(PoolType poolType) const; + + /** + * @brief Get total memory usage across all pools + * @return Pair of (used bytes, total bytes) + */ + std::pair getTotalMemoryUsage() const; + + /** + * @brief Configure a specific pool type + * @param poolType Type of pool to configure + * @param blockSize Size of each memory block + * @param allocationUnit Minimum allocation unit + * @param properties Memory properties + */ + void configurePool( + PoolType poolType, + vk::DeviceSize blockSize, + vk::DeviceSize allocationUnit, + vk::MemoryPropertyFlags properties); + + /** + * @brief Pre-allocate initial memory blocks for configured pools + * @return True if pre-allocation was successful + */ + bool preAllocatePools(); + + /** + * @brief Set rendering active state flag (informational only) + * @param active Whether rendering is currently active + */ + void setRenderingActive(bool active); + + /** + * @brief Check if rendering is currently active (informational 
only) + * @return True if rendering is active + */ + bool isRenderingActive() const; +}; diff --git a/attachments/openxr_engine/renderer.h b/attachments/openxr_engine/renderer.h new file mode 100644 index 00000000..ab476393 --- /dev/null +++ b/attachments/openxr_engine/renderer.h @@ -0,0 +1,1954 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xr_context.h" +#include "camera_component.h" +#include "entity.h" +#include "memory_pool.h" +#include "mesh_component.h" +#include "model_loader.h" +#include "platform.h" +#include "thread_pool.h" + +// Fallback defines for optional extension names (allow compiling against older headers) +#ifndef VK_EXT_ROBUSTNESS_2_EXTENSION_NAME +# define VK_EXT_ROBUSTNESS_2_EXTENSION_NAME "VK_EXT_robustness2" +#endif +#ifndef VK_KHR_DYNAMIC_RENDERING_LOCAL_READ_EXTENSION_NAME +# define VK_KHR_DYNAMIC_RENDERING_LOCAL_READ_EXTENSION_NAME "VK_KHR_dynamic_rendering_local_read" +#endif +#ifndef VK_EXT_SHADER_TILE_IMAGE_EXTENSION_NAME +# define VK_EXT_SHADER_TILE_IMAGE_EXTENSION_NAME "VK_EXT_shader_tile_image" +#endif + +// Forward declarations +class ImGuiSystem; + +/** + * @brief Structure for Vulkan queue family 
indices. + */ +struct QueueFamilyIndices { + std::optional graphicsFamily; + std::optional presentFamily; + std::optional computeFamily; + std::optional transferFamily; // optional dedicated transfer queue family + + [[nodiscard]] bool isComplete() const { + return graphicsFamily.has_value() && presentFamily.has_value() && computeFamily.has_value(); + } +}; + +/** + * @brief Structure for swap chain support details. + */ +struct SwapChainSupportDetails { + vk::SurfaceCapabilitiesKHR capabilities; + std::vector formats; + std::vector presentModes; +}; + +/** + * @brief Structure for individual light data in the storage buffer. + */ +struct LightData { + alignas(16) glm::vec4 position; // Light position (w component used for direction vs position) + alignas(16) glm::vec4 color; // Light color and intensity + alignas(16) glm::mat4 lightSpaceMatrix; // Light space matrix for shadow mapping + alignas(16) glm::vec4 direction; // Light direction (for directional/spotlights) + alignas(4) int lightType; // 0=Point, 1=Directional, 2=Spot, 3=Emissive + alignas(4) float range; // Light range + alignas(4) float innerConeAngle; // For spotlights + alignas(4) float outerConeAngle; // For spotlights +}; + +struct ShadowUniforms { + alignas(16) glm::mat4 view; + alignas(16) glm::mat4 proj; +}; + +struct ShadowPushConstants { + alignas(16) glm::mat4 model; +}; + +/** + * @brief Structure for the uniform buffer object (now without fixed light arrays). 
+ */ +struct UniformBufferObject { + alignas(16) glm::mat4 model; + union { + alignas(16) glm::mat4 views[4]; // Supporting up to 4 views (stereo + quad views) + alignas(16) glm::mat4 view; // Single-view alias + }; + union { + alignas(16) glm::mat4 projs[4]; + alignas(16) glm::mat4 proj; // Single-view alias + }; + union { + alignas(16) glm::vec4 camPoses[4]; + alignas(16) glm::vec4 camPos; // Single-view alias + }; + alignas(16) glm::mat4 viewProjections[4]; + alignas(4) float exposure; + alignas(4) float gamma; + alignas(4) float prefilteredCubeMipLevels; + alignas(4) float scaleIBLAmbient; + alignas(4) int lightCount; + alignas(4) int padding0; // match shader UBO layout + alignas(4) float padding1; // match shader UBO layout + alignas(4) float padding2; // match shader UBO layout + alignas(8) glm::vec2 screenDimensions; + alignas(4) float nearZ; + alignas(4) float farZ; + alignas(4) float slicesZ; + alignas(4) float _uboPad3; + // Planar reflections + alignas(16) glm::mat4 reflectionVP; // projection * mirroredView + alignas(4) int reflectionEnabled; // 1 when sampling reflection in main pass + alignas(4) int reflectionPass; // 1 during reflection render pass + alignas(8) glm::vec2 _reflectPad0; + alignas(16) glm::vec4 clipPlaneWS; // world-space plane ax+by+cz+d=0 + // Controls + alignas(4) float reflectionIntensity; // scales reflection mix in glass + alignas(4) int enableRayQueryReflections = 1; // 1 to enable reflections in ray query mode + alignas(4) int enableRayQueryTransparency = 1; // 1 to enable transparency/refraction in ray query mode + alignas(4) float _padReflect[1]{}; + // Ray-query specific: number of per-instance geometry infos in buffer + alignas(4) int geometryInfoCount{0}; + alignas(4) int _padGeo0{0}; + alignas(4) int _padGeo1{0}; + alignas(4) int _padGeo2{0}; + alignas(16) glm::vec4 _rqReservedWorldPos{0.0f, 0.0f, 0.0f, 0.0f}; + // Ray-query specific: number of materials in materialBuffer + alignas(4) int materialCount{0}; + alignas(4) 
int _padMat0{0}; + alignas(4) int _padMat1{0}; + alignas(4) int _padMat2{0}; +}; + +// Ray Query uses a dedicated uniform buffer with its own tightly-defined layout. +// This avoids relying on the (much larger) shared raster UBO layout and prevents +// CPU↔shader layout drift from breaking Ray Query-only fields. +// +// IMPORTANT: This layout must match `RayQueryUniforms` in `shaders/ray_query.slang`. +struct RayQueryUniformBufferObject { + alignas(16) glm::mat4 model; + alignas(16) glm::mat4 view; + alignas(16) glm::mat4 proj; + alignas(16) glm::vec4 camPos; + + alignas(4) float exposure; + alignas(4) float gamma; + // Match raster UBO conventions so Ray Query can run the same lighting math. + alignas(4) float scaleIBLAmbient; + alignas(4) int lightCount; + alignas(4) int enableRayQueryReflections; + alignas(4) int enableRayQueryTransparency; + + alignas(8) glm::vec2 screenDimensions; + alignas(4) int geometryInfoCount; + alignas(4) int materialCount; + alignas(4) int _pad0; // used for rayQueryMaxBounces + // Thick-glass controls (RQ-only) + alignas(4) int enableThickGlass; // 0/1 toggle + alignas(4) float thicknessClamp; // max thickness in meters + alignas(4) float absorptionScale; // scales sigma_a + alignas(4) int _pad1; // Ray Query: enable hard shadows for direct lighting (0/1) + // Ray Query soft shadows (area-light approximation) + alignas(4) int shadowSampleCount; // 1 = hard shadows; >1 = multi-sample + alignas(4) float shadowSoftness; // 0 = hard; otherwise scales effective light radius (fraction of range) + alignas(4) float reflectionIntensity; // User control for glass reflection strength + alignas(4) float _padShadow[2]{}; +}; + +static_assert(sizeof(RayQueryUniformBufferObject) == 288, "RayQueryUniformBufferObject size must match shader layout"); +static_assert(offsetof(RayQueryUniformBufferObject, model) == 0); +static_assert(offsetof(RayQueryUniformBufferObject, view) == 64); +static_assert(offsetof(RayQueryUniformBufferObject, proj) == 128); 
+static_assert(offsetof(RayQueryUniformBufferObject, camPos) == 192); +static_assert(offsetof(RayQueryUniformBufferObject, exposure) == 208); +static_assert(offsetof(RayQueryUniformBufferObject, gamma) == 212); +static_assert(offsetof(RayQueryUniformBufferObject, scaleIBLAmbient) == 216); +static_assert(offsetof(RayQueryUniformBufferObject, lightCount) == 220); +static_assert(offsetof(RayQueryUniformBufferObject, enableRayQueryReflections) == 224); +static_assert(offsetof(RayQueryUniformBufferObject, enableRayQueryTransparency) == 228); +static_assert(offsetof(RayQueryUniformBufferObject, screenDimensions) == 232); +static_assert(offsetof(RayQueryUniformBufferObject, geometryInfoCount) == 240); +static_assert(offsetof(RayQueryUniformBufferObject, materialCount) == 244); +static_assert(offsetof(RayQueryUniformBufferObject, _pad0) == 248); +static_assert(offsetof(RayQueryUniformBufferObject, enableThickGlass) == 252); +static_assert(offsetof(RayQueryUniformBufferObject, thicknessClamp) == 256); +static_assert(offsetof(RayQueryUniformBufferObject, absorptionScale) == 260); +static_assert(offsetof(RayQueryUniformBufferObject, _pad1) == 264); +static_assert(offsetof(RayQueryUniformBufferObject, shadowSampleCount) == 268); +static_assert(offsetof(RayQueryUniformBufferObject, shadowSoftness) == 272); + +/** + * @brief Structure for PBR material properties. + * This structure must match the PushConstants structure in the PBR shader. 
+ */ +struct MaterialProperties { + alignas(16) glm::vec4 baseColorFactor; + alignas(4) float metallicFactor; + alignas(4) float roughnessFactor; + alignas(4) int baseColorTextureSet; + alignas(4) int physicalDescriptorTextureSet; + alignas(4) int normalTextureSet; + alignas(4) int occlusionTextureSet; + alignas(4) int emissiveTextureSet; + alignas(4) float alphaMask; + alignas(4) float alphaMaskCutoff; + alignas(16) glm::vec3 emissiveFactor; // Emissive factor for HDR emissive sources + alignas(4) float emissiveStrength; // KHR_materials_emissive_strength extension + alignas(4) float transmissionFactor; // KHR_materials_transmission + alignas(4) int useSpecGlossWorkflow; // 1 if using KHR_materials_pbrSpecularGlossiness + alignas(4) float glossinessFactor; // SpecGloss glossiness scalar + alignas(16) glm::vec3 specularFactor; // SpecGloss specular color factor + alignas(4) float ior = 1.5f; // index of refraction + alignas(4) bool hasEmissiveStrengthExtension; +}; + +/** + * @brief Rendering mode selection + */ +enum class RenderMode { + Rasterization, // Traditional rasterization pipeline + RayQuery // Ray query compute shader +}; + +/** + * @brief Class for managing Vulkan rendering. + * + * This class implements the rendering pipeline as described in the Engine_Architecture chapter: + * @see en/Building_a_Simple_Engine/Engine_Architecture/05_rendering_pipeline.adoc + */ +class Renderer { + public: + /** + * @brief Constructor with a platform. + * @param platform The platform to use for rendering. + */ + explicit Renderer(Platform* platform); + + /** + * @brief Destructor for proper cleanup. + */ + ~Renderer(); + + /** + * @brief Initialize the renderer. + * @param appName The name of the application. + * @param enableValidationLayers Whether to enable validation layers. + * @return True if initialization was successful, false otherwise. 
+ */ + bool Initialize(const std::string& appName, bool enableValidationLayers = true, bool useXR = false); + + /** + * @brief Clean up renderer resources. + */ + void Cleanup(); + + /** + * @brief Render the scene. + * @param entities The entities to render. + * @param camera The camera to use for rendering. + * @param imguiSystem The ImGui system for UI rendering (optional). + */ + void Render(const std::vector>& entities, CameraComponent* camera, ImGuiSystem* imguiSystem = nullptr); + + // Render overload that accepts a snapshot of raw entity pointers. + // This allows the Engine to release its entity-container lock before rendering + // (avoiding writer starvation of background loading/physics threads). + void Render(const std::vector& entities, CameraComponent* camera, ImGuiSystem* imguiSystem = nullptr); + void Render(const std::vector& entities, CameraComponent* camera, ImGuiSystem* imguiSystem, XrTime predictedTime); + + /** + * @brief Wait for the device to be idle. + */ + void WaitIdle(); + + /** + * @brief Wait for fences with periodic watchdog kicks to prevent false hang detection. + * Must be called from the render thread. + */ + vk::Result waitForFencesSafe(const std::vector& fences, vk::Bool32 waitAll, uint64_t timeoutNs = 100'000'000ULL); + + /** + * @brief Wait for fences with periodic watchdog kicks to prevent false hang detection. + * Must be called from the render thread. Overload for a single fence. + */ + vk::Result waitForFencesSafe(vk::Fence fence, vk::Bool32 waitAll, uint64_t timeoutNs = 100'000'000ULL); + + /** + * @brief Dispatch a compute shader. + * @param groupCountX The number of local workgroups to dispatch in the X dimension. + * @param groupCountY The number of local workgroups to dispatch in the Y dimension. + * @param groupCountZ The number of local workgroups to dispatch in the Z dimension. + * @param inputBuffer The input buffer. + * @param outputBuffer The output buffer. + * @param hrtfBuffer The HRTF data buffer. 
+ * @param paramsBuffer The parameters buffer. + * @return A fence that can be used to synchronize with the compute operation. + */ + vk::raii::Fence DispatchCompute(uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ, + vk::Buffer inputBuffer, + vk::Buffer outputBuffer, + vk::Buffer hrtfBuffer, + vk::Buffer paramsBuffer); + + /** + * @brief Check if the renderer is initialized. + * @return True if the renderer is initialized, false otherwise. + */ + bool IsInitialized() const { + return initialized; + } + + /** + * @brief Get the Vulkan device. + * @return The Vulkan device. + */ + vk::Device GetDevice() const { + return *device; + } + + // Expose max frames in flight for per-frame resource duplication + uint32_t GetMaxFramesInFlight() const { + return MAX_FRAMES_IN_FLIGHT; + } + + /** + * @brief Get the Vulkan RAII device. + * @return The Vulkan RAII device. + */ + const vk::raii::Device& GetRaiiDevice() const { + return device; + } + +#if defined(PLATFORM_ANDROID) + void SetAndroidApp(struct android_app* app) { xrContext.setAndroidApp(app); } +#endif + + // Expose uploads timeline semaphore and last value for external waits + vk::Semaphore GetUploadsTimelineSemaphore() const { + return *uploadsTimeline; + } + uint64_t GetUploadsTimelineValue() const { + return uploadTimelineLastSubmitted.load(std::memory_order_relaxed); + } + + /** + * @brief Get the compute queue. + * @return The compute queue. + */ + vk::Queue GetComputeQueue() const { + std::lock_guard lock(queueMutex); + return *computeQueue; + } + + /** + * @brief Find a suitable memory type. + * @param typeFilter The type filter. + * @param properties The memory properties. + * @return The memory type index. + */ + uint32_t FindMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties) const { + return findMemoryType(typeFilter, properties); + } + + /** + * @brief Get the compute queue family index. + * @return The compute queue family index. 
+ */ + uint32_t GetComputeQueueFamilyIndex() const { + if (queueFamilyIndices.computeFamily.has_value()) { + return queueFamilyIndices.computeFamily.value(); + } + // Fallback to graphics family to avoid crashes on devices without a separate compute queue + return queueFamilyIndices.graphicsFamily.value(); + } + + /** + * @brief Submit a command buffer to the compute queue with proper dispatch loader preservation. + * @param commandBuffer The command buffer to submit. + * @param fence The fence to signal when the operation completes. + */ + void SubmitToComputeQueue(vk::CommandBuffer commandBuffer, vk::Fence fence) const { + // Use mutex to ensure thread-safe access to queues + vk::SubmitInfo submitInfo{ + .commandBufferCount = 1, + .pCommandBuffers = &commandBuffer + }; + std::lock_guard lock(queueMutex); + // Prefer compute queue when available; otherwise, fall back to graphics queue to avoid crashes + if (*computeQueue) { + computeQueue.submit(submitInfo, fence); + } else { + graphicsQueue.submit(submitInfo, fence); + } + } + + /** + * @brief Create a shader module from SPIR-V code. + * @param code The SPIR-V code. + * @return The shader module. + */ + vk::raii::ShaderModule CreateShaderModule(const std::vector& code) { + return createShaderModule(code); + } + + /** + * @brief Create a shader module from a file. + * @param filename The filename. + * @return The shader module. + */ + vk::raii::ShaderModule CreateShaderModule(const std::string& filename) { + auto code = readFile(filename); + return createShaderModule(code); + } + + /** + * @brief Load a texture from a file. + * @param texturePath The path to the texture file. + * @return True if the texture was loaded successfully, false otherwise. + */ + bool LoadTexture(const std::string& texturePath); + + // Asynchronous texture loading APIs (thread-pool backed). 
+ // The 'critical' flag is used to front-load important textures (e.g., + // baseColor/albedo) so the scene looks mostly correct before the loading + // screen disappears. Non-critical textures (normals, MR, AO, emissive) + // can stream in after geometry is visible. + std::future LoadTextureAsync(const std::string& texturePath, bool critical = false); + + /** + * @brief Load a texture from raw image data in memory. + * @param textureId The identifier for the texture. + * @param imageData The raw image data. + * @param width The width of the image. + * @param height The height of the image. + * @param channels The number of channels in the image. + * @return True if the texture was loaded successfully, false otherwise. + */ + bool LoadTextureFromMemory(const std::string& textureId, + const unsigned char* imageData, + int width, + int height, + int channels); + + // Asynchronous upload from memory (RGBA/RGB/other). Safe for concurrent calls. + std::future LoadTextureFromMemoryAsync(const std::string& textureId, + const unsigned char* imageData, + int width, + int height, + int channels, + bool critical = false); + + // Progress query for UI + uint32_t GetTextureTasksScheduled() const { + return textureTasksScheduled.load(); + } + uint32_t GetTextureTasksCompleted() const { + return textureTasksCompleted.load(); + } + + // GPU upload progress (per-texture jobs processed on the main thread). + uint32_t GetUploadJobsTotal() const { + return uploadJobsTotal.load(); + } + uint32_t GetUploadJobsCompleted() const { + return uploadJobsCompleted.load(); + } + + // --- Acceleration structure build progress (for UI) --- + // Exposed so the loading overlay can show meaningful progress when + // BLAS/TLAS builds take a long time (>= ~10 seconds). 
+ bool IsASBuildInProgress() const { + return asBuildUiActive.load(std::memory_order_relaxed); + } + float GetASBuildProgress() const { + return asBuildUiProgress.load(std::memory_order_relaxed); + } + uint32_t GetASBuildItemsDone() const { + return asBuildUiDone.load(std::memory_order_relaxed); + } + uint32_t GetASBuildItemsTotal() const { + return asBuildUiTotal.load(std::memory_order_relaxed); + } + const char* GetASBuildStage() const { + return asBuildUiStage.load(std::memory_order_relaxed); + } + double GetASBuildElapsedSeconds() const { + const uint64_t start = asBuildUiStartNs.load(std::memory_order_relaxed); + if (start == 0) + return 0.0; + const uint64_t now = static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count()); + if (now <= start) + return 0.0; + return static_cast(now - start) / 1'000'000'000.0; + } + bool ShouldShowASBuildProgressInUI() const { + return IsASBuildInProgress() && GetASBuildElapsedSeconds() >= 10.0; + } + + // Block until all currently-scheduled texture tasks have completed. + // Intended for use during initial scene loading so that descriptor + // creation sees the final textureResources instead of fallbacks. + void WaitForAllTextureTasks(); + + // Process pending texture GPU uploads on the calling thread. + // This should be invoked from the main/render thread so that all + // Vulkan work happens from a single thread while worker threads + // perform only CPU-side decoding. + // + // Parameters allow us to: + // - limit the number of jobs processed per call (for streaming), and + // - choose whether to include critical and/or non-critical jobs. + void ProcessPendingTextureJobs(uint32_t maxJobs = UINT32_MAX, + bool includeCritical = true, + bool includeNonCritical = true); + + // Track which entities use a given texture ID so that descriptor sets + // can be refreshed when textures finish streaming in. 
+ void RegisterTextureUser(const std::string& textureId, Entity* entity); + void OnTextureUploaded(const std::string& textureId); + + // Global loading state (model/scene). Consider the scene "loading" while + // either the model is being parsed/instantiated OR there are still + // outstanding critical texture uploads (e.g., baseColor/albedo). + // Loading state: show blocking loading overlay only until the initial scene is ready. + // Background streaming may continue after that without blocking the scene. + enum class LoadingPhase : uint32_t { + Scene = 0, + Textures, + Physics, + AccelerationStructures, + Finalizing + }; + LoadingPhase GetLoadingPhase() const { + return static_cast(loadingPhase.load(std::memory_order_relaxed)); + } + const char* GetLoadingPhaseName() const { + switch (GetLoadingPhase()) { + case LoadingPhase::Scene: + return "Scene"; + case LoadingPhase::Textures: + return "Textures"; + case LoadingPhase::Physics: + return "Physics"; + case LoadingPhase::AccelerationStructures: + return "Acceleration Structures"; + case LoadingPhase::Finalizing: + return "Finalizing"; + default: + return "Loading"; + } + } + float GetLoadingPhaseProgress() const { + return std::clamp(loadingPhaseProgress.load(std::memory_order_relaxed), 0.0f, 1.0f); + } + void SetLoadingPhase(LoadingPhase phase) { + loadingPhase.store(static_cast(phase), std::memory_order_relaxed); + loadingPhaseProgress.store(0.0f, std::memory_order_relaxed); + } + void SetLoadingPhaseProgress(float v) { + loadingPhaseProgress.store(std::clamp(v, 0.0f, 1.0f), std::memory_order_relaxed); + } + void MarkInitialLoadComplete() { + initialLoadComplete.store(true, std::memory_order_relaxed); + SetLoadingPhase(LoadingPhase::Finalizing); + loadingPhaseProgress.store(1.0f, std::memory_order_relaxed); + } + bool IsLoading() const { + // Keep the blocking overlay visible until the engine has finished + // post-load blockers (AS build, descriptor cold-init, etc.). 
+ return (loadingFlag.load(std::memory_order_relaxed) || criticalJobsOutstanding.load(std::memory_order_relaxed) > 0u || + !initialLoadComplete.load(std::memory_order_relaxed)); + } + // True only while the model/scene is still being constructed or while critical + // texture jobs remain outstanding. This excludes the "finalizing" stage where + // the render thread may still be doing post-load work (AS build, descriptor init). + // + // IMPORTANT: Do NOT use critical texture completion as a gate for starting TLAS/BLAS builds. + // AS builds depend on geometry buffers and instance transforms, not on texture readiness. + bool IsSceneLoaderActive() const { + return loadingFlag.load(std::memory_order_relaxed); + } + void SetLoading(bool v) { + loadingFlag.store(v, std::memory_order_relaxed); + if (v) { + // New load cycle starting + initialLoadComplete.store(false, std::memory_order_relaxed); + SetLoadingPhase(LoadingPhase::Scene); + } + } + + // Descriptor set deferred update machinery + void MarkEntityDescriptorsDirty(Entity *entity); + void ProcessDirtyDescriptorsForFrame(uint32_t frameIndex); + + // Texture aliasing: map canonical IDs to actual loaded keys (e.g., file paths) to avoid duplicates + inline void RegisterTextureAlias(const std::string& aliasId, const std::string& targetId) { + std::unique_lock lock(textureResourcesMutex); + if (aliasId.empty() || targetId.empty()) + return; + // Resolve targetId without re-locking by walking the alias map directly + std::string resolved = targetId; + for (int i = 0; i < 8; ++i) { + auto it = textureAliases.find(resolved); + if (it == textureAliases.end()) + break; + if (it->second == resolved) + break; + resolved = it->second; + } + if (aliasId == resolved) { + textureAliases.erase(aliasId); + } else { + textureAliases[aliasId] = resolved; + } + } + inline std::string ResolveTextureId(const std::string& id) const { + std::shared_lock lock(textureResourcesMutex); + std::string cur = id; + for (int i = 0; i < 8; ++i) { + 
// prevent pathological cycles + auto it = textureAliases.find(cur); + if (it == textureAliases.end()) + break; + if (it->second == cur) + break; // self-alias guard + cur = it->second; + } + return cur; + } + + /** + * @brief Transition an image layout. + * @param image The image. + * @param format The image format. + * @param oldLayout The old layout. + * @param newLayout The new layout. + */ + void TransitionImageLayout(vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout) { + transitionImageLayout(image, format, oldLayout, newLayout, 1, 1); + } + + /** + * @brief Copy a buffer to an image. + * @param buffer The buffer. + * @param image The image. + * @param width The image width. + * @param height The image height. + */ + void CopyBufferToImage(vk::Buffer buffer, vk::Image image, uint32_t width, uint32_t height) { + // Create a default single region for backward compatibility + std::vector regions = { + { + .bufferOffset = 0, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = { + .aspectMask = vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1 + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {width, height, 1} + } + }; + copyBufferToImage(buffer, image, width, height, regions); + } + + /** + * @brief Get the current command buffer. + * @return The current command buffer. + */ + vk::raii::CommandBuffer& GetCurrentCommandBuffer() { + return commandBuffers[currentFrame]; + } + + /** + * @brief Get the swap chain image format. + * @return The swap chain image format. + */ + /** + * @brief Get the OpenXR image views. + * @return Array of two vectors of references to image views (one per eye). + */ + std::array, 2> GetXrImageViews() { + // In this implementaiton, we use a single swapchain with 2 layers for multiview. + // We return raw vk::ImageView to avoid copy/ownership issues with vk::raii::ImageView. 
+ std::vector views; + for (const auto& raiiView : swapChainImageViews) { + views.push_back(*raiiView); + } + return { views, views }; + } + + /** + * This should be called when the window is resized to trigger swap chain recreation. + */ + void SetFramebufferResized() { + framebufferResized.store(true, std::memory_order_relaxed); + } + + /** + * @brief Set the model loader reference for accessing extracted lights. + * @param _modelLoader Pointer to the model loader. + */ + void SetModelLoader(ModelLoader* _modelLoader) { + modelLoader = _modelLoader; + // Materials are resolved via ModelLoader; invalidate cached per-entity material info. + for (auto& kv : entityResources) { + kv.second.materialCacheValid = false; + kv.second.cachedMaterial = nullptr; + kv.second.cachedIsBlended = false; + kv.second.cachedIsGlass = false; + kv.second.cachedIsLiquid = false; + kv.second.cachedMaterialProps = MaterialProperties{}; + } + } + + /** + * @brief Set static lights loaded during model initialization. + * @param lights The lights to store statically. + */ + void SetStaticLights(const std::vector& lights) { + staticLights = lights; + std::cout << "[Lights] staticLights set: " << staticLights.size() << " entries" << std::endl; + } + + /** + * @brief Set the gamma correction value for PBR rendering. + * @param _gamma The gamma correction value (typically 2.2). + */ + void SetGamma(float _gamma) { + gamma = _gamma; + } + + /** + * @brief Set the exposure value for HDR tone mapping. + * @param _exposure The exposure value (1.0 = no adjustment). 
+ */ + void SetExposure(float _exposure) { + exposure = _exposure; + } + + // Reflection intensity (UI + shader control) + void SetReflectionIntensity(float v) { + reflectionIntensity = v; + } + float GetReflectionIntensity() const { + return reflectionIntensity; + } + + void SetPlanarReflectionsEnabled(bool enabled); + void TogglePlanarReflections(); + bool IsPlanarReflectionsEnabled() const { + return enablePlanarReflections; + } + + // Ray query rendering mode control + void SetRenderMode(RenderMode mode) { + currentRenderMode = mode; + } + RenderMode GetRenderMode() const { + return currentRenderMode; + } + void ToggleRenderMode() { + currentRenderMode = (currentRenderMode == RenderMode::Rasterization) ? RenderMode::RayQuery : RenderMode::Rasterization; + } + + // Ray query capability getters + bool GetRayQueryEnabled() const { + return rayQueryEnabled; + } + bool GetAccelerationStructureEnabled() const { + return accelerationStructureEnabled; + } + + // Ray Query static-only mode (disable animation/physics updates and TLAS refits to render a static opaque scene) + void SetRayQueryStaticOnly(bool v) { + rayQueryStaticOnly = v; + } + bool IsRayQueryStaticOnly() const { + return rayQueryStaticOnly; + } + + /** + * @brief Request acceleration structure build at next safe frame point. + * Safe to call from any thread (e.g., background loading thread). + */ + void RequestAccelerationStructureBuild() { + if (!accelerationStructureEnabled || !rayQueryEnabled) + return; + // Record when the request was made so the render loop can enforce a bounded deferral + // policy (avoid getting stuck waiting for “perfect” readiness forever). + // NOTE: `asBuildRequested` may already be true due to other triggers; still ensure + // the request timestamp is armed so the timeout logic can work. 
+ if (asBuildRequestStartNs.load(std::memory_order_relaxed) == 0) { + const uint64_t nowNs = static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count()); + asBuildRequestStartNs.store(nowNs, std::memory_order_relaxed); + } + // Allow AS build to take longer than the watchdog threshold (large scenes in Debug). + watchdogSuppressed.store(true, std::memory_order_relaxed); + asBuildRequested.store(true, std::memory_order_release); + } + // Overload with reason tracking for diagnostics + void RequestAccelerationStructureBuild(const char* reason) { + if (!accelerationStructureEnabled || !rayQueryEnabled) + return; + if (asBuildRequestStartNs.load(std::memory_order_relaxed) == 0) { + const uint64_t nowNs = static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count()); + asBuildRequestStartNs.store(nowNs, std::memory_order_relaxed); + } + if (reason) { + lastASBuildRequestReason = reason; + std::cout << "[AS] Requesting rebuild. Reason: " << reason << std::endl; + } else { + lastASBuildRequestReason = "(no reason)"; + } + + // Explicit requests bypass the freeze to ensure dynamic objects (like balls) are added + asDevOverrideAllowRebuild = true; + + watchdogSuppressed.store(true, std::memory_order_relaxed); + asBuildRequested.store(true, std::memory_order_release); + } + + /** + * @brief Build acceleration structures for ray query rendering. + * @param entities The entities to include in the acceleration structures. + * @return True if successful, false otherwise. + */ + bool buildAccelerationStructures(const std::vector& entities); + + // Refit/UPDATE the TLAS with latest entity transforms (no rebuild) + bool refitTopLevelAS(const std::vector& entities, CameraComponent* camera); + + /** + * @brief Update ray query descriptor sets with current resources. + * @param frameIndex The frame index to update (or all frames if not specified). 
+ * @return True if successful, false otherwise. + */ + bool updateRayQueryDescriptorSets(uint32_t frameIndex, const std::vector& entities); + + /** + * @brief Create or resize light storage buffers to accommodate the given number of lights. + * @param lightCount The number of lights to accommodate. + * @return True if successful, false otherwise. + */ + bool createOrResizeLightStorageBuffers(size_t lightCount); + + /** + * @brief Update the light storage buffer with current light data. + * @param frameIndex The current frame index. + * @param lights The light data to upload. + * @return True if successful, false otherwise. + */ + bool updateLightStorageBuffer(uint32_t frameIndex, const std::vector& lights, CameraComponent* camera = nullptr); + + /** + * @brief Update all existing descriptor sets with new light storage buffer references. + * Called when light storage buffers are recreated to ensure descriptor sets reference valid buffers. + */ + // Update PBR descriptor sets to point to the latest light SSBOs. + // When allFrames=true, refresh all frames (use only when the device is idle — e.g., after waitIdle()). + // Otherwise, refresh only the current frame at the frame safe point to avoid touching in‑flight frames. + void updateAllDescriptorSetsWithNewLightBuffers(bool allFrames = false); + + // Upload helper: record both layout transitions and the copy in a single submit with a fence + void uploadImageFromStaging(vk::Buffer staging, + vk::Image image, + vk::Format format, + const std::vector& regions, + uint32_t mipLevels = 1); + + // Generate full mip chain for a 2D color image using GPU blits + void generateMipmaps(vk::Image image, + vk::Format format, + int32_t texWidth, + int32_t texHeight, + uint32_t mipLevels); + + vk::Format findDepthFormat(); + + /** + * @brief Pre-allocate all Vulkan resources for an entity during scene loading. + * @param entity The entity to pre-allocate resources for. 
+ * @return True if pre-allocation was successful, false otherwise. + */ + bool preAllocateEntityResources(Entity* entity); + + /** + * @brief Pre-allocate Vulkan resources for a batch of entities, batching mesh uploads. + * + * This variant is optimized for large scene loads (e.g., GLTF Bistro). It will: + * - Create per-mesh GPU buffers as usual, but record all buffer copy commands + * into a single command buffer and submit them in one batch. + * - Then create uniform buffers and descriptor sets per entity. + * + * Callers that load many geometry entities at once (like GLTF scene loading) + * should prefer this over repeated preAllocateEntityResources() calls. + */ + bool preAllocateEntityResourcesBatch(const std::vector& entities); + + // Thread-safe: enqueue entities that need GPU-side resource preallocation. + // The actual Vulkan work will be performed on the render thread at the frame-start safe point. + void EnqueueEntityPreallocationBatch(const std::vector& entities); + void EnqueueInstanceBufferRecreation(Entity* entity); + + /** + * @brief Recreate the instance buffer for an entity that had its instances cleared. + * + * When an entity that was originally set up for instanced rendering needs to be + * converted to a single non-instanced entity (e.g., for animation), this method + * recreates the GPU instance buffer with a single identity instance. + * + * @param entity The entity whose instance buffer should be recreated. + * @return True if successful, false otherwise. 
+ */ + bool recreateInstanceBuffer(Entity* entity); + + // Shared default PBR texture identifiers (to avoid creating hundreds of identical textures) + static const std::string SHARED_DEFAULT_ALBEDO_ID; + static const std::string SHARED_DEFAULT_NORMAL_ID; + static const std::string SHARED_DEFAULT_METALLIC_ROUGHNESS_ID; + static const std::string SHARED_DEFAULT_OCCLUSION_ID; + static const std::string SHARED_DEFAULT_EMISSIVE_ID; + static const std::string SHARED_BRIGHT_RED_ID; + + /** + * @brief Determine the appropriate texture format based on the texture type. + * @param textureId The texture identifier to analyze. + * @return The appropriate Vulkan format (sRGB for baseColor, linear for others). + */ + static vk::Format determineTextureFormat(const std::string& textureId); + + public: + // OpenXR support + bool IsXrMode() const { return xrMode; } + XrContext& GetXrContext() { return xrContext; } + + private: + XrContext xrContext; + bool xrMode = false; + + // Platform + Platform* platform = nullptr; + + // Model loader reference for accessing extracted lights + class ModelLoader* modelLoader = nullptr; + + // PBR rendering parameters + float gamma = 2.2f; // Gamma correction value + float exposure = 1.2f; // HDR exposure value (default tuned to avoid washout) + float reflectionIntensity = 1.0f; // User control for glass reflection strength + // Raster shadows (experimental): use ray queries in the raster PBR fragment shader. + // Wired through `UniformBufferObject.padding2` to avoid UBO layout churn. 
+ bool enableRasterRayQueryShadows = false; + + // Ray Query tuning + int rayQueryMaxBounces = 1; // 0 = no secondary rays, 1 = one-bounce reflection/refraction + bool enableRayQueryShadows = true; // Hard shadows for Ray Query direct lighting (shadow rays) + int rayQueryShadowSampleCount = 1; // 1 = hard; >1 enables soft-shadow sampling in the shader + float rayQueryShadowSoftness = 0.0f; // 0 = hard; otherwise scales effective light radius (fraction of range) + // Thick-glass controls (RQ-only) + bool enableThickGlass = true; + float thickGlassAbsorptionScale = 1.0f; + float thickGlassThicknessClamp = 0.2f; // meters + + // Vulkan RAII context + vk::raii::Context context; + + // Vulkan instance and debug messenger + vk::raii::Instance instance = nullptr; + vk::raii::DebugUtilsMessengerEXT debugMessenger = nullptr; + + // Vulkan device + vk::raii::PhysicalDevice physicalDevice = nullptr; + vk::raii::Device device = nullptr; + + // Memory pool for efficient memory management + std::unique_ptr memoryPool; + + // Vulkan queues + vk::raii::Queue graphicsQueue = nullptr; + vk::raii::Queue presentQueue = nullptr; + vk::raii::Queue computeQueue = nullptr; + + // Vulkan surface + vk::raii::SurfaceKHR surface = nullptr; + + // Swap chain + vk::raii::SwapchainKHR swapChain = nullptr; + std::vector swapChainImages; + vk::Format swapChainImageFormat = vk::Format::eUndefined; + vk::Extent2D swapChainExtent = {0, 0}; + std::vector swapChainImageViews; + // OpenXR Swapchains + std::vector eyeSwapchainImages[2]; + std::vector eyeSwapchainImageViews[2]; + + // Tracked layouts for swapchain images (VVL requires correct oldLayout in barriers). + // Initialized at swapchain creation and updated as we transition. 
+ std::vector swapChainImageLayouts; + + // Dynamic rendering info + vk::RenderingInfo renderingInfo; + std::vector colorAttachments; + vk::RenderingAttachmentInfo depthAttachment; + + // Pipelines + vk::raii::PipelineLayout pipelineLayout = nullptr; + vk::raii::Pipeline graphicsPipeline = nullptr; + vk::raii::PipelineLayout pbrPipelineLayout = nullptr; + vk::raii::Pipeline pbrGraphicsPipeline = nullptr; + vk::raii::Pipeline pbrBlendGraphicsPipeline = nullptr; + // Transparent PBR pipeline variant for premultiplied alpha content + vk::raii::Pipeline pbrPremulBlendGraphicsPipeline = nullptr; + // Opaque PBR pipeline variant used after a depth pre-pass (depth read-only, compare with pre-pass depth) + vk::raii::Pipeline pbrPrepassGraphicsPipeline = nullptr; + // Reflection PBR pipeline used for mirrored off-screen pass (cull none to avoid winding issues) + vk::raii::Pipeline pbrReflectionGraphicsPipeline = nullptr; + // Specialized pipeline for architectural glass (windows, lamp glass, etc.). + // Shares descriptor layouts and vertex input with the PBR pipelines but uses + // a dedicated fragment shader entry point for more stable glass shading. + vk::raii::Pipeline glassGraphicsPipeline = nullptr; + vk::raii::PipelineLayout lightingPipelineLayout = nullptr; + vk::raii::Pipeline lightingPipeline = nullptr; + + // Fullscreen composite pipeline to draw the opaque off-screen color to the swapchain + // (used to avoid gamma-incorrect vkCmdCopyImage and to apply tone mapping when desired). 
+ vk::raii::PipelineLayout compositePipelineLayout = nullptr; + vk::raii::Pipeline compositePipeline = nullptr; + vk::raii::DescriptorSetLayout compositeDescriptorSetLayout = nullptr; // not used; reuse transparentDescriptorSetLayout + std::vector compositeDescriptorSets; // unused; reuse transparentDescriptorSets + + // Pipeline rendering create info structures (for proper lifetime management) + vk::PipelineRenderingCreateInfo mainPipelineRenderingCreateInfo; + vk::PipelineRenderingCreateInfo pbrPipelineRenderingCreateInfo; + vk::PipelineRenderingCreateInfo lightingPipelineRenderingCreateInfo; + vk::PipelineRenderingCreateInfo compositePipelineRenderingCreateInfo; + + // Create composite pipeline + bool createCompositePipeline(); + + // Compute pipeline + vk::raii::PipelineLayout computePipelineLayout = nullptr; + vk::raii::Pipeline computePipeline = nullptr; + vk::raii::DescriptorSetLayout computeDescriptorSetLayout = nullptr; + vk::raii::DescriptorPool computeDescriptorPool = nullptr; + std::vector computeDescriptorSets; + vk::raii::CommandPool computeCommandPool = nullptr; + + // Thread safety for queue access - unified mutex since queues may share the same underlying VkQueue + mutable std::mutex queueMutex; + // Thread safety for descriptor pool/set operations across all engine threads + mutable std::mutex descriptorMutex; + // Monotonic generation counter for descriptor pool rebuilds (future use for hardening) + std::atomic descriptorPoolGeneration{0}; + + // Command pool and buffers + vk::raii::CommandPool commandPool = nullptr; + std::vector commandBuffers; + // Protect usage of shared commandPool for transient command buffers + mutable std::mutex commandMutex; + + // Dedicated transfer queue (falls back to graphics if unavailable) + vk::raii::Queue transferQueue = nullptr; + + // Synchronization objects + std::vector imageAvailableSemaphores; + std::vector renderFinishedSemaphores; + std::vector inFlightFences; + + // Upload timeline semaphore for transfer 
-> graphics handoff (signaled per upload) + vk::raii::Semaphore uploadsTimeline = nullptr; + // Tracks last timeline value that has been submitted for signaling on uploadsTimeline + std::atomic uploadTimelineLastSubmitted{0}; + + // Depth buffer + vk::raii::Image depthImage = nullptr; + std::unique_ptr depthImageAllocation = nullptr; + vk::raii::DeviceMemory depthImageMemory = nullptr; + vk::raii::ImageView depthImageView = nullptr; + + // Forward+ configuration + bool useForwardPlus = true; // default enabled + uint32_t forwardPlusTileSizeX = 16; + uint32_t forwardPlusTileSizeY = 16; + uint32_t forwardPlusSlicesZ = 16; // clustered depth slices + static constexpr uint32_t MAX_LIGHTS_PER_TILE = 256; // conservative cap + + struct TileHeader { + uint32_t offset; // into tileLightIndices + uint32_t count; // number of indices for this tile + uint32_t pad0; + uint32_t pad1; + }; + + struct ForwardPlusPerFrame { + // SSBOs for per-tile light lists + vk::raii::Buffer tileHeaders = nullptr; + std::unique_ptr tileHeadersAlloc = nullptr; + vk::raii::Buffer tileLightIndices = nullptr; + std::unique_ptr tileLightIndicesAlloc = nullptr; + size_t tilesCapacity = 0; // number of tiles allocated + size_t indicesCapacity = 0; // number of indices allocated + + // Uniform buffer with view/proj, screen size, tile size, etc. 
+ vk::raii::Buffer params = nullptr; + std::unique_ptr paramsAlloc = nullptr; + void* paramsMapped = nullptr; + + // Optional compute debug output buffer (uints), host-visible + vk::raii::Buffer debugOut = nullptr; + std::unique_ptr debugOutAlloc = nullptr; + bool debugOutAwaitingReadback = false; + + // One-frame color probes (host-visible, small buffers) + vk::raii::Buffer probeOffscreen = nullptr; + std::unique_ptr probeOffscreenAlloc = nullptr; + vk::raii::Buffer probeSwapchain = nullptr; + std::unique_ptr probeSwapchainAlloc = nullptr; + bool probeAwaitingReadback = false; + + // Compute descriptor set for culling + vk::raii::DescriptorSet computeSet = nullptr; + }; + std::vector forwardPlusPerFrame; // size MAX_FRAMES_IN_FLIGHT + // Per-frame light count used by shaders (set once before main pass) + uint32_t lastFrameLightCount = 0; + + // Forward+ compute resources + vk::raii::PipelineLayout forwardPlusPipelineLayout = nullptr; + vk::raii::Pipeline forwardPlusPipeline = nullptr; + vk::raii::DescriptorSetLayout forwardPlusDescriptorSetLayout = nullptr; + + // Depth pre-pass pipeline + vk::raii::Pipeline depthPrepassPipeline = nullptr; + + // Ray query rendering mode + RenderMode currentRenderMode = RenderMode::RayQuery; + + // Ray query pipeline and resources + vk::raii::PipelineLayout rayQueryPipelineLayout = nullptr; + vk::raii::Pipeline rayQueryPipeline = nullptr; + vk::raii::DescriptorSetLayout rayQueryDescriptorSetLayout = nullptr; + std::vector rayQueryDescriptorSets; + // Track when the ray query descriptor set for each frame has been written. + // Updating binding 6 (large texture table) can be expensive; avoid doing it every frame. + std::vector rayQueryDescriptorsWritten; // size = MAX_FRAMES_IN_FLIGHT + // Bitmask of frames whose ray query descriptor set needs a refresh (e.g., after TLAS rebuild or texture upload). 
+ std::atomic rayQueryDescriptorsDirtyMask{0}; + + // Dedicated ray query UBO (one per frame in flight) - separate from entity UBOs + std::vector rayQueryUniformBuffers; + std::vector> rayQueryUniformAllocations; + std::vector rayQueryUniformBuffersMapped; + + // Ray query output image (storage image for compute shader output) + vk::raii::Image rayQueryOutputImage = nullptr; + std::unique_ptr rayQueryOutputImageAllocation = nullptr; + vk::raii::ImageView rayQueryOutputImageView = nullptr; + + // Acceleration structures for ray query + struct AccelerationStructure { + vk::raii::Buffer buffer = nullptr; + std::unique_ptr allocation = nullptr; + vk::raii::AccelerationStructureKHR handle = nullptr; // Use RAII for proper lifetime management + vk::DeviceAddress deviceAddress = 0; + }; + std::vector blasStructures; // Bottom-level AS (one per mesh) + AccelerationStructure tlasStructure; // Top-level AS (scene) + + // Deferred deletion queue for old AS structures + // Keeps old AS buffers alive until all frames in flight have finished using them + struct PendingASDelete { + std::vector blasStructures; + AccelerationStructure tlasStructure; + uint32_t framesSinceDestroy = 0; // Increment each frame, delete when >= MAX_FRAMES_IN_FLIGHT + }; + std::vector pendingASDeletions; + + // GPU data structures for ray query proper normal and material access + struct GeometryInfo { + uint64_t vertexBufferAddress; // Device address of vertex buffer + uint64_t indexBufferAddress; // Device address of index buffer + uint32_t vertexCount; // Number of vertices + uint32_t materialIndex; // Index into material buffer + uint32_t indexCount; // Number of indices (to bound primitiveIndex in shader) + uint32_t _pad0; + // Instance-space -> world-space normal transform (3 columns). Matches raster convention. + // Stored as float4 columns (xyz used, w unused) for stable std430 layout. 
+ alignas(16) glm::vec4 normalMatrix0; + alignas(16) glm::vec4 normalMatrix1; + alignas(16) glm::vec4 normalMatrix2; + }; + + struct MaterialData { + alignas(16) glm::vec3 albedo; + alignas(4) float metallic; + alignas(16) glm::vec3 emissive; + alignas(4) float roughness; + alignas(4) float ao; + alignas(4) float ior; + alignas(4) float emissiveStrength; + alignas(4) float alpha; + alignas(4) float transmissionFactor; + alignas(4) float alphaCutoff; + // glTF alpha mode encoding (matches shader): 0=OPAQUE, 1=MASK, 2=BLEND + alignas(4) int32_t alphaMode; + alignas(4) uint32_t isGlass; // bool as uint32 + alignas(4) uint32_t isLiquid; // bool as uint32 + + // Thick-glass parameters (RQ-only) + alignas(16) glm::vec3 absorptionColor{1.0f, 1.0f, 1.0f}; + alignas(4) float absorptionDistance = 1.0f; // meters + alignas(4) uint32_t thinWalled = 1u; // 1 = thin surface, 0 = thick volume + + // Raster parity: texture-set flags (-1 = no texture; 0 = sample from binding 6 table). + // Ray Query uses a single texture table (binding 6); indices are always valid even when + // the set flag is -1, so the shader can choose the correct no-texture behavior. + alignas(4) int32_t baseColorTextureSet; + alignas(4) int32_t physicalDescriptorTextureSet; + alignas(4) int32_t normalTextureSet; + alignas(4) int32_t occlusionTextureSet; + alignas(4) int32_t emissiveTextureSet; + + // Ray Query texture table indices (binding 6). These always reference a valid descriptor + // (real streamed texture or a shared default slot). 
+ alignas(4) int32_t baseColorTexIndex; + alignas(4) int32_t normalTexIndex; + alignas(4) int32_t physicalTexIndex; // metallic-roughness (default) or spec-gloss when useSpecGlossWorkflow=1 + alignas(4) int32_t occlusionTexIndex; + alignas(4) int32_t emissiveTexIndex; + + // Specular-glossiness workflow support (KHR_materials_pbrSpecularGlossiness) + alignas(4) int32_t useSpecGlossWorkflow; // 1 if SpecGloss + alignas(4) float glossinessFactor; + alignas(16) glm::vec3 specularFactor; + alignas(4) int32_t hasEmissiveStrengthExt; + alignas(4) uint32_t _padMat[3]; + }; + + // Ray query geometry and material buffers + vk::raii::Buffer geometryInfoBuffer = nullptr; + std::unique_ptr geometryInfoAllocation = nullptr; + vk::raii::Buffer materialBuffer = nullptr; + std::unique_ptr materialAllocation = nullptr; + + // Ray query baseColor texture array (binding 6) + static constexpr uint32_t RQ_MAX_TEX = 2048; + // Reserved slots in the Ray Query texture table (binding 6) + static constexpr uint32_t RQ_SLOT_DEFAULT_BASECOLOR = 0; + static constexpr uint32_t RQ_SLOT_DEFAULT_NORMAL = 1; + static constexpr uint32_t RQ_SLOT_DEFAULT_METALROUGH = 2; + static constexpr uint32_t RQ_SLOT_DEFAULT_OCCLUSION = 3; + static constexpr uint32_t RQ_SLOT_DEFAULT_EMISSIVE = 4; + // NOTE: Textures can stream in asynchronously and their underlying VkImageView/VkSampler + // can be destroyed/recreated. Therefore, the Ray Query texture table must NOT cache + // VkDescriptorImageInfo (which contains raw handles). Instead, cache only the canonical + // texture key per slot and rebuild VkDescriptorImageInfo each descriptor update. + // + // Slots 0..4 are reserved for shared default PBR textures. 
+ std::vector rayQueryTexKeys; // slot -> canonical texture key + std::vector rayQueryTexFallbackSlots; // slot -> fallback slot (type-appropriate default) + uint32_t rayQueryTexCount = 0; // number of valid slots in rayQueryTexKeys + std::unordered_map rayQueryTexIndex; // canonicalKey -> slot + + // Per-material texture path mapping captured at AS build time; used for streaming requests + // and debugging, but Ray Query primarily uses per-material texture indices. + struct RQMaterialTexPaths { + std::string baseColor; + std::string normal; + std::string physical; + std::string occlusion; + std::string emissive; + }; + std::vector rqMaterialTexPaths; + + // Count of GeometryInfo instances currently uploaded (CPU-side tracking) + size_t geometryInfoCountCPU = 0; + // Count of materials currently uploaded (CPU-side tracking) + size_t materialCountCPU = 0; + + // --- Pending GPU uploads (to be executed on the render thread safe point) --- + std::mutex pendingMeshUploadsMutex; + std::vector pendingMeshUploads; // meshes with staged data to copy + + struct InFlightMeshUploadBatch { + uint64_t signalValue = 0; + std::vector meshes; + std::unique_ptr commandPool; + std::unique_ptr commandBuffers; + }; + std::mutex inFlightMeshUploadsMutex; + std::deque inFlightMeshUploads; + + // Enqueue mesh uploads collected on background/loading threads + void EnqueueMeshUploads(const std::vector& meshes); + // Execute pending mesh uploads on the render thread (called from Render after fence wait) + void ProcessPendingMeshUploads(); + + // --- Pending entity GPU preallocation (enqueued by scene loader thread; executed on render thread) --- + std::mutex pendingEntityPreallocMutex; + std::vector pendingEntityPrealloc; + std::vector pendingInstanceBufferRecreations; + std::atomic pendingEntityPreallocQueued{false}; + void ProcessPendingEntityPreallocations(); + + // Descriptor set layouts (declared before pools and sets) + vk::raii::DescriptorSetLayout descriptorSetLayout = nullptr; + 
vk::raii::DescriptorSetLayout pbrDescriptorSetLayout = nullptr; + vk::raii::DescriptorSetLayout transparentDescriptorSetLayout = nullptr; + vk::raii::PipelineLayout pbrTransparentPipelineLayout = nullptr; + + // The texture that will hold a snapshot of the opaque scene + // One off-screen color image per frame-in-flight to avoid cross-frame read/write hazards. + std::vector opaqueSceneColorImages; + std::vector> opaqueSceneColorImageAllocations; + std::vector opaqueSceneColorImageViews; + // Track the current layout per frame (initialized to eUndefined at creation) + std::vector opaqueSceneColorImageLayouts; + vk::raii::Sampler opaqueSceneColorSampler{nullptr}; + + // A descriptor set for the opaque scene color texture. One per frame in flight. + std::vector transparentDescriptorSets; + // Fallback descriptor sets for opaque pass (binds a default SHADER_READ_ONLY texture as Set 1) + std::vector transparentFallbackDescriptorSets; + + // Ray Query composite descriptor sets: sample the rayQueryOutputImage in a fullscreen pass + std::vector rqCompositeDescriptorSets; + // Fallback sampler for the RQ composite if no other sampler is available at init time + vk::raii::Sampler rqCompositeSampler{nullptr}; + + // Mesh resources + struct MeshResources { + // Device-local vertex/index buffers used for rendering + vk::raii::Buffer vertexBuffer = nullptr; + std::unique_ptr vertexBufferAllocation = nullptr; + vk::raii::Buffer indexBuffer = nullptr; + std::unique_ptr indexBufferAllocation = nullptr; + uint32_t indexCount = 0; + + // Optional per-mesh staging buffers used when uploads are batched. + // These are populated when createMeshResources(..., deferUpload=true) is used + // and are consumed and cleared by preAllocateEntityResourcesBatch(). 
+ vk::raii::Buffer stagingVertexBuffer = nullptr; + vk::raii::DeviceMemory stagingVertexBufferMemory = nullptr; + vk::DeviceSize vertexBufferSizeBytes = 0; + + vk::raii::Buffer stagingIndexBuffer = nullptr; + vk::raii::DeviceMemory stagingIndexBufferMemory = nullptr; + vk::DeviceSize indexBufferSizeBytes = 0; + + // Material index for ray query (extracted from entity name or MaterialMesh) + int32_t materialIndex = -1; // -1 = no material/default + }; + std::unordered_map meshResources; + + // Texture resources + struct TextureResources { + vk::raii::Image textureImage = nullptr; + std::unique_ptr textureImageAllocation = nullptr; + vk::raii::ImageView textureImageView = nullptr; + vk::raii::Sampler textureSampler = nullptr; + vk::Format format = vk::Format::eR8G8B8A8Srgb; // Store texture format for proper color space handling + uint32_t mipLevels = 1; // Store number of mipmap levels + // Hint: true if source texture appears to use alpha masking (any alpha < ~1.0) + bool alphaMaskedHint = false; + }; + std::unordered_map textureResources; + + // Pending texture jobs that require GPU-side work. Worker threads + // enqueue these jobs; the main thread drains them and performs the + // actual LoadTexture/LoadTextureFromMemory calls. + struct PendingTextureJob { + enum class Type { + FromFile, + FromMemory + } type; + enum class Priority { + Critical, + NonCritical + } priority; + std::string idOrPath; + std::vector data; // only used for FromMemory + int width = 0; + int height = 0; + int channels = 0; + }; + + std::mutex pendingTextureJobsMutex; + std::condition_variable pendingTextureCv; + std::vector pendingTextureJobs; + // Track outstanding critical texture jobs (for IsLoading) + std::atomic criticalJobsOutstanding{0}; + + // Background uploader worker controls (multiple workers) + std::atomic stopUploadsWorker{false}; + std::vector uploadsWorkerThreads; + + // Track how many texture upload jobs have been scheduled vs completed + // on the GPU side. 
Used only for UI feedback during streaming. + std::atomic uploadJobsTotal{0}; + std::atomic uploadJobsCompleted{0}; + // When true, initial scene load is complete and the loading overlay should be hidden + std::atomic initialLoadComplete{false}; + // Loading-phase UI state (atomic because ImGui may query at any point) + std::atomic loadingPhase{static_cast(LoadingPhase::Scene)}; + std::atomic loadingPhaseProgress{0.0f}; + + // Performance counters for texture uploads + std::atomic bytesUploadedTotal{0}; + // Streaming window start time in nanoseconds from steady_clock epoch (0 when inactive) + std::atomic uploadWindowStartNs{0}; + // Aggregate per-texture CPU upload durations (nanoseconds) and count + std::atomic totalUploadNs{0}; + std::atomic uploadCount{0}; + + // Reverse mapping from texture ID to entities that reference it. Used to + // update descriptor sets when a streamed texture finishes uploading. + std::mutex textureUsersMutex; + std::unordered_map> textureToEntities; + + // Entities needing descriptor set refresh due to streamed textures + std::mutex dirtyEntitiesMutex; + // Map of entity -> bitmask of frames-in-flight that still need a descriptor refresh. + // This avoids the “frame 0 updated / frame 1 still default” oscillation when + // MAX_FRAMES_IN_FLIGHT > 1 and a texture becomes available mid-stream. 
+ std::unordered_map descriptorDirtyEntities; + + // Protect concurrent access to textureResources + mutable std::shared_mutex textureResourcesMutex; + + // Texture aliasing: maps alias (canonical) IDs to actual loaded keys + std::unordered_map textureAliases; + + // Per-texture load de-duplication (serialize loads of the same texture ID only) + mutable std::mutex textureLoadStateMutex; + std::condition_variable textureLoadStateCv; + std::unordered_set texturesLoading; + + // Serialize GPU-side texture upload (image/buffer creation, transitions) to avoid driver/memory pool races + mutable std::mutex textureUploadMutex; + + // Thread pool for background background tasks (textures, etc.) + std::unique_ptr threadPool; + // Mutex to protect threadPool access during initialization/cleanup + mutable std::shared_mutex threadPoolMutex; + + // Texture loading progress (for UI) + std::atomic textureTasksScheduled{0}; + std::atomic textureTasksCompleted{0}; + std::atomic loadingFlag{false}; + + // Acceleration structure build UI progress (written on render thread). + // Kept as atomics because ImGui can query at any point during the frame. 
+ std::atomic asBuildUiActive{false}; + std::atomic asBuildUiProgress{0.0f}; + std::atomic asBuildUiDone{0}; + std::atomic asBuildUiTotal{0}; + std::atomic asBuildUiStage{"idle"}; + std::atomic asBuildUiStartNs{0}; + + // Default texture resources (used when no texture is provided) + TextureResources defaultTextureResources; + + // Performance clamps (to reduce per-frame cost) + static constexpr uint32_t MAX_ACTIVE_LIGHTS = 1024; // Limit the number of lights processed per frame + + // Static lights loaded during model initialization + std::vector staticLights; + + // Dynamic lighting system using storage buffers + struct LightStorageBuffer { + vk::raii::Buffer buffer = nullptr; + std::unique_ptr allocation = nullptr; + void* mapped = nullptr; + size_t capacity = 0; // Current capacity in number of lights + size_t size = 0; // Current number of lights + }; + std::vector lightStorageBuffers; // One per frame in flight + + // Entity resources (contains descriptor sets - must be declared before descriptor pool) + struct EntityResources { + std::vector uniformBuffers; + std::vector> uniformBufferAllocations; + std::vector uniformBuffersMapped; + std::vector basicDescriptorSets; // For basic pipeline + std::vector pbrDescriptorSets; // For PBR pipeline + + // Instance buffer for instanced rendering + vk::raii::Buffer instanceBuffer = nullptr; + std::unique_ptr instanceBufferAllocation = nullptr; + void* instanceBufferMapped = nullptr; + + // Tracks whether binding 0 (UBO) has been written at least once for each frame + // for each pipeline type. Descriptor sets for non-current frames are allocated + // but not necessarily initialized immediately (to avoid update-after-bind hazards), + // so each frame needs a one-time initialization at its safe point. + std::vector pbrUboBindingWritten; // size = MAX_FRAMES_IN_FLIGHT + std::vector basicUboBindingWritten; // size = MAX_FRAMES_IN_FLIGHT + + // Tracks whether image bindings have been written at least once for each frame. 
+ // If false for the current frame at the safe point, we cold-initialize the + // image bindings (PBR: b1..b5 [+b6 when applicable], Basic: b1) with either + // real textures or shared defaults to avoid per-frame "black" flashes. + std::vector pbrImagesWritten; // size = MAX_FRAMES_IN_FLIGHT + std::vector basicImagesWritten; // size = MAX_FRAMES_IN_FLIGHT + + // Tracks whether the remaining required bindings in the PBR set 0 layout have + // been written at least once for each frame. + // This includes bindings like Forward+ tile buffers (7/8), reflection sampler (10), + // and TLAS (11). These bindings are required by the pipeline layout and must be + // valid before any draw that uses the PBR/glass pipelines. + std::vector pbrFixedBindingsWritten; // size = MAX_FRAMES_IN_FLIGHT + + // Cached material lookup/classification for raster rendering. + // Avoids per-frame string parsing of entity names ("_Material_") and repeated + // ModelLoader material lookups across culling, sorting, and draw loops. 
+ bool materialCacheValid = false; + const Material* cachedMaterial = nullptr; + // Derived flags used by render queues and sorting heuristics + bool cachedIsBlended = false; + bool cachedIsGlass = false; + bool cachedIsLiquid = false; + // Material-derived push constants defaults (static per-entity unless material changes) + MaterialProperties cachedMaterialProps{}; + }; + + // Cached job for rendering a single entity in a frame + struct RenderJob + { + Entity *entity; + EntityResources *entityRes; + MeshResources *meshRes; + MeshComponent *meshComp; + TransformComponent *transformComp; + bool isAlphaMasked; + }; + std::unordered_map entityResources; + + // Descriptor pool (declared after entity resources to ensure proper destruction order) + vk::raii::DescriptorPool descriptorPool = nullptr; + + // Current frame index + uint32_t currentFrame = 0; + + // Queue family indices + QueueFamilyIndices queueFamilyIndices; + + // Validation layers + const std::vector validationLayers = { + "VK_LAYER_KHRONOS_validation" + }; + + // Required device extensions + const std::vector requiredDeviceExtensions = { + VK_KHR_SWAPCHAIN_EXTENSION_NAME + }; + + // Optional device extensions + const std::vector optionalDeviceExtensions = { + VK_KHR_DYNAMIC_RENDERING_EXTENSION_NAME, + VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME, + VK_KHR_DEPTH_STENCIL_RESOLVE_EXTENSION_NAME, + VK_EXT_DESCRIPTOR_INDEXING_EXTENSION_NAME, + // Robustness and safety + VK_EXT_ROBUSTNESS_2_EXTENSION_NAME, + // Tile/local memory friendly dynamic rendering readback + VK_KHR_DYNAMIC_RENDERING_LOCAL_READ_EXTENSION_NAME, + // Shader tile image for fast tile access + VK_EXT_SHADER_TILE_IMAGE_EXTENSION_NAME, + // Ray query support for ray-traced rendering + VK_KHR_DEFERRED_HOST_OPERATIONS_EXTENSION_NAME, + VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME, + VK_KHR_RAY_QUERY_EXTENSION_NAME + }; + + // All device extensions (required + optional) + std::vector deviceExtensions; + + // Initialization flag + bool 
initialized = false; + // Whether VK_EXT_descriptor_indexing (update-after-bind) path is enabled + bool descriptorIndexingEnabled = false; + bool storageAfterBindEnabled = false; + // Feature toggles detected/enabled at device creation + bool robustness2Enabled = false; + bool dynamicRenderingLocalReadEnabled = false; + bool shaderTileImageEnabled = false; + bool rayQueryEnabled = false; + bool accelerationStructureEnabled = false; + + // When true and current render mode is RayQuery, the engine renders a static opaque scene: + // - Animation/physics updates are suppressed by the Engine (input/Update hook) + // - TLAS refit per-frame is skipped to avoid any animation-driven changes + // - The AS is built once after loading completes + // Default now OFF so animation is enabled again for AS (per user request) + bool rayQueryStaticOnly = false; + + // (No debug-only TLAS filtering in production.) + + // Framebuffer resized flag (atomic to handle platform callback vs. render thread) + std::atomic framebufferResized{false}; + // Guard to prevent descriptor updates while a command buffer is recording + std::atomic isRecordingCmd{false}; + // Descriptor sets may be temporarily invalid during swapchain recreation; suppress updates then. + std::atomic descriptorSetsValid{true}; + // Request flag for acceleration structure build (set by loading thread, cleared by render thread) + std::atomic asBuildRequested{false}; + // Timestamp of the most recent AS build request (steady_clock ns). Used to prevent infinite deferral. + std::atomic asBuildRequestStartNs{0}; + + // Track last successfully built AS sizes to avoid rebuilding with a smaller subset + // (e.g., during incremental streaming where not all meshes are ready yet). + // We only accept AS builds that are monotonically non-decreasing in counts. + size_t lastASBuiltBLASCount = 0; + // NOTE: This is the number of renderable ENTITIES included in the AS build (not TLAS instances). 
+ size_t lastASBuiltInstanceCount = 0; + // TLAS instance count (includes per-mesh instancing). Used for logging and shader bounds. + size_t lastASBuiltTlasInstanceCount = 0; + + // Freeze TLAS rebuilds after a full build to prevent regressions (e.g., animation-only TLAS) + bool asFreezeAfterFullBuild = true; // enable freezing behavior + bool asFrozen = false; // once frozen, ignore rebuilds unless explicitly overridden + // Optional developer override to allow rebuild while frozen + bool asDevOverrideAllowRebuild = false; + // Reason string for the last time a build was requested (for logging) + std::string lastASBuildRequestReason; + + // Opportunistic rebuilds (when counts increase) can cause unintended TLAS churn during animation. + // Leave this disabled by default; TLAS builds should be explicit (on mode switch / scene ready). + bool asOpportunisticRebuildEnabled = false; + + // --- AS UPDATE/Refit state --- + // Persistent TLAS instances buffer & order for UPDATE (refit) + struct TlasInstanceRef { + class Entity* entity{nullptr}; + uint32_t instanceIndex{0}; // valid only when instanced==true + bool instanced{false}; // true when this TLAS entry comes from MeshComponent instancing + }; + vk::raii::Buffer tlasInstancesBuffer{nullptr}; + std::unique_ptr tlasInstancesAllocation; + uint32_t tlasInstanceCount = 0; + std::vector tlasInstanceOrder; // order must match buffer instances + + // Scratch buffer for TLAS UPDATE operations + vk::raii::Buffer tlasUpdateScratchBuffer{nullptr}; + std::unique_ptr tlasUpdateScratchAllocation; + + // Maximum number of frames in flight + // More than 1 allows CPU/GPU overlap and reduce per-frame stalls. + // All per-frame resources (UBOs, descriptor sets, reflection RTs, etc.) + // are sized dynamically based on this value. 
+ const uint32_t MAX_FRAMES_IN_FLIGHT = 2u; + + // --- Performance & diagnostics --- + UniformBufferObject frameUboTemplate{}; + bool enableFrustumCulling = true; + uint32_t lastCullingVisibleCount = 0; + uint32_t lastCullingCulledCount = 0; + // Distance-based LOD (projected-size skip in pixels) + bool enableDistanceLOD = true; + float lodPixelThresholdOpaque = 1.5f; + float lodPixelThresholdTransparent = 2.5f; + // Sampler anisotropy preference (clamped to device limits) + float samplerMaxAnisotropy = 8.0f; + // Upper bound on auto-generated mip levels (to avoid excessive VRAM use on huge textures) + uint32_t maxAutoGeneratedMipLevels = 4; + + // --- Planar reflections (scaffolding) --- + bool enablePlanarReflections = false; // UI toggle to enable/disable planar reflections + float reflectionResolutionScale = 0.5f; // Scale relative to swapchain size + // Cached per-frame reflection data used by UBO population + // Current frame's reflection VP (for rendering the reflection pass) + glm::mat4 currentReflectionVP{1.0f}; + glm::vec4 currentReflectionPlane{0.0f, 1.0f, 0.0f, 0.0f}; + // Per-frame stored reflection VP (written during reflection pass) + std::vector reflectionVPs; // size MAX_FRAMES_IN_FLIGHT + // The VP to sample in the main pass (prev-frame VP to match prev-frame texture) + glm::mat4 sampleReflectionVP{1.0f}; + bool reflectionResourcesDirty = false; // recreate reflection RTs at safe point + + // --- Ray query rendering options --- + bool enableRayQueryReflections = true; // UI toggle to enable reflections in ray query mode + bool enableRayQueryTransparency = true; // UI toggle to enable transparency/refraction in ray query mode + + // === Watchdog system to detect application hangs === + // Atomic timestamp updated every frame - watchdog thread checks if stale + std::atomic lastFrameUpdateTime; + // Low-noise progress marker to pinpoint where the render thread stalled when the watchdog fires + std::atomic watchdogProgressLabel{"init"}; + // Optional 
numeric marker to help pinpoint stalls inside large loops + std::atomic watchdogProgressIndex{0}; + std::thread watchdogThread; + std::atomic watchdogRunning{false}; + // Some operations (notably BLAS/TLAS builds in Debug on large scenes) can legitimately take + // longer than the watchdog threshold. When set, the watchdog will not abort. + std::atomic watchdogSuppressed{false}; + + // === Descriptor update deferral while recording === + struct PendingDescOp { + Entity* entity; + std::string texPath; + bool usePBR; + uint32_t frameIndex; + bool imagesOnly; + }; + std::mutex pendingDescMutex; + std::vector pendingDescOps; // flushed at frame safe point + std::atomic descriptorRefreshPending{false}; + + struct ReflectionRT { + vk::raii::Image color{nullptr}; + std::unique_ptr colorAlloc{nullptr}; + vk::raii::ImageView colorView{nullptr}; + vk::raii::Sampler colorSampler{nullptr}; + + vk::raii::Image depth{nullptr}; + std::unique_ptr depthAlloc{nullptr}; + vk::raii::ImageView depthView{nullptr}; + + uint32_t width{0}; + uint32_t height{0}; + }; + std::vector reflections; // one per frame-in-flight + + // Private methods + bool createInstance(const std::string& appName, bool enableValidationLayers); + bool setupDebugMessenger(bool enableValidationLayers); + bool createSurface(); + bool checkValidationLayerSupport() const; + bool pickPhysicalDevice(); + void addSupportedOptionalExtensions(); + bool createLogicalDevice(bool enableValidationLayers); + bool createSwapChain(); + bool createImageViews(); + bool setupDynamicRendering(); + bool createDescriptorSetLayout(); + bool createPBRDescriptorSetLayout(); + bool createGraphicsPipeline(); + + bool createPBRPipeline(); + bool createLightingPipeline(); + bool createDepthPrepassPipeline(); + bool createForwardPlusPipelinesAndResources(); + + // Ray query pipeline creation + bool createRayQueryDescriptorSetLayout(); + bool createRayQueryPipeline(); + bool createRayQueryResources(); + // If updateOnlyCurrentFrame is true, only 
descriptor sets for currentFrame will be updated. + // Use updateOnlyCurrentFrame=false during initialization/swapchain recreation when the device is idle. + bool createOrResizeForwardPlusBuffers(uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ, bool updateOnlyCurrentFrame = false); + void updateForwardPlusParams(uint32_t frameIndex, const glm::mat4& view, const glm::mat4& proj, uint32_t lightCount, uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ, float nearZ, float farZ); + void dispatchForwardPlus(vk::raii::CommandBuffer& cmd, uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ); + // Ensure Forward+ compute descriptor set binding 0 (lights SSBO) is bound for a frame + void refreshForwardPlusComputeLightsBindingForFrame(uint32_t frameIndex); + bool createComputePipeline(); + void pushMaterialProperties(vk::CommandBuffer commandBuffer, const MaterialProperties& material) const; + bool createCommandPool(); + + // Shadow mapping methods + bool createComputeCommandPool(); + bool createDepthResources(); + bool createTextureImage(const std::string& texturePath, TextureResources& resources); + bool createTextureImageView(TextureResources& resources); + bool createTextureSampler(TextureResources& resources); + bool createDefaultTextureResources(); + bool createSharedDefaultPBRTextures(); + bool createMeshResources(MeshComponent* meshComponent, bool deferUpload = false); + bool createUniformBuffers(Entity* entity); + bool createDescriptorPool(); + bool createDescriptorSets(Entity* entity, const std::string& texturePath, bool usePBR = false); + bool createDescriptorSets(Entity *entity, EntityResources &res, const std::string &texturePath, bool usePBR = false); + bool updateDescriptorSetsForFrame(Entity *entity, + const std::string &texturePath, + bool usePBR, + uint32_t frameIndex, + bool imagesOnly = false, + bool uboOnly = false); + bool updateDescriptorSetsForFrame(Entity *entity, + EntityResources &res, + const std::string &texturePath, + bool usePBR, + uint32_t 
frameIndex, + bool imagesOnly = false, + bool uboOnly = false); + // Refresh only the currentFrame PBR descriptor set bindings that Forward+ relies on + // (b6 = lights SSBO, b7 = tile headers, b8 = tile indices). Safe to call after + // we've waited on the frame fence at the start of Render(). + void refreshPBRForwardPlusBindingsForFrame(uint32_t frameIndex); + bool createCommandBuffers(); + bool createSyncObjects(); + + void cleanupSwapChain(); + + // Planar reflection helpers (initial scaffolding) + bool createReflectionResources(uint32_t width, uint32_t height); + void destroyReflectionResources(); + // Render the scene into the reflection RT (mirrored about a plane) — to be fleshed out next step + void renderReflectionPass(vk::raii::CommandBuffer& cmd, + const glm::vec4& planeWS, + CameraComponent* camera, + const std::vector &jobs); + + // Ensure Vulkan-Hpp dispatcher is initialized for the current thread when using RAII objects on worker threads + void ensureThreadLocalVulkanInit() const; + + // Cache and classify an entity's material for raster rendering (opaque vs blended, glass/liquid flags, + // and push-constant defaults). This avoids repeated per-frame string parsing and material lookups. 
+ void ensureEntityMaterialCache(Entity* entity, EntityResources &res); + + // ===================== Culling helpers ===================== + struct FrustumPlanes { + // Plane equation ax + by + cz + d >= 0 considered inside + glm::vec4 planes[6]{}; // 0=L,1=R,2=B,3=T,4=N,5=F + }; + + static FrustumPlanes extractFrustumPlanes(const glm::mat4& vp); + + static void transformAABB(const glm::mat4& M, + const glm::vec3& localMin, + const glm::vec3& localMax, + glm::vec3& outMin, + glm::vec3& outMax); + + static bool aabbIntersectsFrustum(const glm::vec3& worldMin, + const glm::vec3& worldMax, + const FrustumPlanes& frustum); + void recreateSwapChain(); + + void updateUniformBuffer(uint32_t currentImage, Entity* entity, EntityResources *entityRes, CameraComponent* camera, TransformComponent *tc = nullptr); + void updateUniformBuffer(uint32_t currentImage, Entity* entity, EntityResources *entityRes, CameraComponent* camera, const glm::mat4& customTransform); + void updateUniformBuffer(uint32_t currentFrame, uint32_t eye, const glm::mat4& view, const glm::mat4& proj, const glm::vec3& camPos); + void updateUniformBufferInternal(uint32_t currentImage, Entity* entity, EntityResources *entityRes, CameraComponent* camera, UniformBufferObject& ubo); + void prepareFrameUboTemplate(CameraComponent *camera); + void drawRenderJob(const vk::raii::CommandBuffer& cmd, const RenderJob& job, uint32_t currentFrame, uint32_t eye, bool transparent); + + vk::raii::ShaderModule createShaderModule(const std::vector& code); + + QueueFamilyIndices findQueueFamilies(const vk::raii::PhysicalDevice& device); + SwapChainSupportDetails querySwapChainSupport(const vk::raii::PhysicalDevice& device); + bool isDeviceSuitable(vk::raii::PhysicalDevice& device); + bool checkDeviceExtensionSupport(vk::raii::PhysicalDevice& device); + + vk::SurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector& availableFormats); + vk::PresentModeKHR chooseSwapPresentMode(const std::vector& availablePresentModes); + 
vk::Extent2D chooseSwapExtent(const vk::SurfaceCapabilitiesKHR& capabilities); + + uint32_t findMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties) const; + + std::pair createBuffer(vk::DeviceSize size, vk::BufferUsageFlags usage, vk::MemoryPropertyFlags properties); + bool createOpaqueSceneColorResources(); + void createTransparentDescriptorSets(); + void createTransparentFallbackDescriptorSets(); + std::pair> createBufferPooled(vk::DeviceSize size, vk::BufferUsageFlags usage, vk::MemoryPropertyFlags properties); + void copyBuffer(vk::raii::Buffer& srcBuffer, vk::raii::Buffer& dstBuffer, vk::DeviceSize size); + + std::pair createImage(uint32_t width, uint32_t height, vk::Format format, vk::ImageTiling tiling, vk::ImageUsageFlags usage, vk::MemoryPropertyFlags properties, uint32_t arrayLayers = 1); + std::pair> createImagePooled(uint32_t width, uint32_t height, vk::Format format, vk::ImageTiling tiling, vk::ImageUsageFlags usage, vk::MemoryPropertyFlags properties, uint32_t mipLevels = 1, uint32_t arrayLayers = 1, vk::SharingMode sharingMode = vk::SharingMode::eExclusive, const std::vector& queueFamilies = {}); + void transitionImageLayout(vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout, uint32_t mipLevels = 1); + void transitionImageLayout(vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout, uint32_t mipLevels, uint32_t layerCount); + void transitionImageLayout(vk::CommandBuffer cmd, vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout, uint32_t mipLevels = 1, uint32_t layerCount = 1); + void copyBufferToImage(vk::Buffer buffer, vk::Image image, uint32_t width, uint32_t height, vk::ArrayProxy regions); + // Extended: track stagedBytes for perf stats + void uploadImageFromStaging(vk::Buffer staging, + vk::Image image, + vk::Format format, + vk::ArrayProxy regions, + uint32_t mipLevels, + vk::DeviceSize stagedBytes); + + 
vk::raii::ImageView createImageView(vk::Image image, vk::Format format, vk::ImageAspectFlags aspectFlags, uint32_t mipLevels = 1, uint32_t layerCount = 1); + vk::Format findSupportedFormat(const std::vector& candidates, vk::ImageTiling tiling, vk::FormatFeatureFlags features); + bool hasStencilComponent(vk::Format format); + + std::vector readFile(const std::string& filename); + + // Background uploader helpers + void StartUploadsWorker(size_t workerCount = 0); + void StopUploadsWorker(); + + // Serialize descriptor writes vs command buffer recording to avoid mid-record updates during recording + std::mutex renderRecordMutex; + + // (Descriptor API wrappers were considered but avoided here to keep RAII types intact.) + + // Upload perf getters + public: + uint64_t GetBytesUploadedTotal() const { + return bytesUploadedTotal.load(std::memory_order_relaxed); + } + double GetAverageUploadMs() const { + uint64_t ns = totalUploadNs.load(std::memory_order_relaxed); + uint32_t cnt = uploadCount.load(std::memory_order_relaxed); + if (cnt == 0) + return 0.0; + return static_cast(ns) / 1e6 / static_cast(cnt); + } + double GetUploadThroughputMBps() const { + uint64_t startNs = uploadWindowStartNs.load(std::memory_order_relaxed); + if (startNs == 0) + return 0.0; + auto now = std::chrono::steady_clock::now().time_since_epoch(); + uint64_t nowNs = static_cast(std::chrono::duration_cast(now).count()); + if (nowNs <= startNs) + return 0.0; + double seconds = static_cast(nowNs - startNs) / 1e9; + double mb = static_cast(bytesUploadedTotal.load(std::memory_order_relaxed)) / (1024.0 * 1024.0); + return seconds > 0.0 ? 
(mb / seconds) : 0.0; + } +}; diff --git a/attachments/openxr_engine/renderer_core.cpp b/attachments/openxr_engine/renderer_core.cpp new file mode 100644 index 00000000..c362744f --- /dev/null +++ b/attachments/openxr_engine/renderer_core.cpp @@ -0,0 +1,1193 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "renderer.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE; // In a .cpp file + +#include +#include // For PFN_vkGetInstanceProcAddr and C types +#include + +// Debug callback for vk::raii - uses raw Vulkan C types for cross-platform compatibility +static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallbackVkRaii( + VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, + [[maybe_unused]] VkDebugUtilsMessageTypeFlagsEXT messageType, + const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData, + [[maybe_unused]] void* pUserData) { + if (messageSeverity >= VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) { + // Print a message to the console + std::cerr << "Validation layer: " << pCallbackData->pMessage << std::endl; + } else { + // Print a message to the console + std::cout << "Validation layer: " << pCallbackData->pMessage << std::endl; + } + + return VK_FALSE; +} + +// Vulkan-Hpp style callback signature for newer headers expecting vk:: types +static VKAPI_ATTR 
vk::Bool32 VKAPI_CALL debugCallbackVkHpp( + vk::DebugUtilsMessageSeverityFlagBitsEXT messageSeverity, + [[maybe_unused]] vk::DebugUtilsMessageTypeFlagsEXT messageType, + const vk::DebugUtilsMessengerCallbackDataEXT* pCallbackData, + [[maybe_unused]] void* pUserData) { + if (messageSeverity >= vk::DebugUtilsMessageSeverityFlagBitsEXT::eWarning) { + std::cerr << "Validation layer: " << pCallbackData->pMessage << std::endl; + } else { + std::cout << "Validation layer: " << pCallbackData->pMessage << std::endl; + } + return vk::False; +} + +// Watchdog thread function - monitors frame updates and aborts if application hangs +static void WatchdogThreadFunc(std::atomic* lastFrameTime, + std::atomic* running, + std::atomic* suppressed, + std::atomic* progressLabel, + std::atomic* progressIndex) { + while (running->load(std::memory_order_relaxed)) { + std::this_thread::sleep_for(std::chrono::seconds(5)); + + if (!running->load(std::memory_order_relaxed)) { + break; // Shutdown requested + } + + // Check if frame timestamp was updated recently. + // Some operations (e.g., BLAS/TLAS builds in Debug on large scenes) can legitimately take + // much longer than 5 or 10 seconds. When suppressed, allow a longer grace period. + auto now = std::chrono::steady_clock::now(); + auto lastUpdate = lastFrameTime->load(std::memory_order_relaxed); + auto elapsed = std::chrono::duration_cast(now - lastUpdate).count(); + const int64_t allowedSeconds = (suppressed && suppressed->load(std::memory_order_relaxed)) ? 
60 : 10; + + if (elapsed >= allowedSeconds) { + // APPLICATION HAS HUNG - no frame updates for 10+ seconds + const char* label = nullptr; + if (progressLabel) { + label = progressLabel->load(std::memory_order_relaxed); + } + uint32_t idx = 0; + if (progressIndex) { + idx = progressIndex->load(std::memory_order_relaxed); + } + + std::cerr << "\n\n"; + std::cerr << "========================================\n"; + std::cerr << "WATCHDOG: APPLICATION HAS HUNG!\n"; + std::cerr << "========================================\n"; + std::cerr << "Last frame update was " << elapsed << " seconds ago.\n"; + if (label && label[0] != '\0') { + std::cerr << "Last progress marker: " << label << "\n"; + } + if (progressIndex) { + std::cerr << "Progress index: " << idx << "\n"; + } + std::cerr << "The render loop is not progressing.\n"; + std::cerr << "Aborting to generate stack trace...\n"; + std::cerr << "========================================\n\n"; + std::abort(); // Force crash with stack trace + } + } + + std::cout << "[Watchdog] Stopped\n"; +} + +// Renderer core implementation for the "Rendering Pipeline" chapter of the tutorial. +Renderer::Renderer(Platform* platform) : platform(platform) { + // Initialize deviceExtensions with required extensions only + // Optional extensions will be added later after checking device support + deviceExtensions = requiredDeviceExtensions; +} + +// Destructor +Renderer::~Renderer() { + Cleanup(); +} + +// Initialize the renderer +bool Renderer::Initialize(const std::string& appName, bool enableValidationLayers, bool useXR) { + xrMode = useXR; + + // Initialize OpenXR early to get required Vulkan extensions + if (xrMode) { + if (!xrContext.createInstance(appName)) { + std::cerr << "Failed to create OpenXR instance" << std::endl; + return false; + } + } + + // Initialize the Vulkan-Hpp default dispatcher using the global symbol directly. + // This avoids differences across Vulkan-Hpp versions for DynamicLoader placement. 
+ VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr); + // Create a Vulkan instance + if (!createInstance(appName, enableValidationLayers)) { + std::cerr << "Failed to create Vulkan instance" << std::endl; + return false; + } + + // Setup debug messenger + if (!setupDebugMessenger(enableValidationLayers)) { + std::cerr << "Failed to setup debug messenger" << std::endl; + return false; + } + + // Create surface + if (!createSurface()) { + std::cerr << "Failed to create surface" << std::endl; + return false; + } + + // Pick the physical device + if (!pickPhysicalDevice()) { + std::cerr << "Failed to pick physical device" << std::endl; + return false; + } + + // Create logical device + if (!createLogicalDevice(enableValidationLayers)) { + std::cerr << "Failed to create logical device" << std::endl; + return false; + } + + // Initialize memory pool for efficient memory management + try { + memoryPool = std::make_unique(device, physicalDevice); + if (!memoryPool->initialize()) { + std::cerr << "Failed to initialize memory pool" << std::endl; + return false; + } + + // Optionally pre-allocate initial memory blocks for pools. + // For large scenes (e.g., Bistro) on mid-range GPUs this can cause early OOM. + // Skip pre-allocation to reduce peak memory pressure; blocks will be created on demand. 
+ // if (!memoryPool->preAllocatePools()) { /* non-fatal */ } + } catch (const std::exception& e) { + std::cerr << "Failed to create memory pool: " << e.what() << std::endl; + return false; + } + + // Create swap chain + if (!createSwapChain()) { + std::cerr << "Failed to create swap chain" << std::endl; + return false; + } + + // Create image views + if (!createImageViews()) { + std::cerr << "Failed to create image views" << std::endl; + return false; + } + + // Setup dynamic rendering + if (!setupDynamicRendering()) { + std::cerr << "Failed to setup dynamic rendering" << std::endl; + return false; + } + + // Create the descriptor set layout + if (!createDescriptorSetLayout()) { + std::cerr << "Failed to create descriptor set layout" << std::endl; + return false; + } + + // Create the graphics pipeline + if (!createGraphicsPipeline()) { + std::cerr << "Failed to create graphics pipeline" << std::endl; + return false; + } + + // Create PBR pipeline + if (!createPBRPipeline()) { + std::cerr << "Failed to create PBR pipeline" << std::endl; + return false; + } + + // Create the lighting pipeline + if (!createLightingPipeline()) { + std::cerr << "Failed to create lighting pipeline" << std::endl; + return false; + } + + // Create composite pipeline (fullscreen pass for off-screen → swapchain) + if (!createCompositePipeline()) { + std::cerr << "Failed to create composite pipeline" << std::endl; + return false; + } + + // Create compute pipeline + if (!createComputePipeline()) { + std::cerr << "Failed to create compute pipeline" << std::endl; + return false; + } + + // Ensure light storage buffers exist before creating Forward+ resources + // so that compute descriptor binding 0 (lights SSBO) can be populated safely. 
+ if (!createOrResizeLightStorageBuffers(1)) { + std::cerr << "Failed to create initial light storage buffers" << std::endl; + return false; + } + + // Create Forward+ compute and depth pre-pass pipelines/resources + if (useForwardPlus) { + if (!createForwardPlusPipelinesAndResources()) { + std::cerr << "Failed to create Forward+ resources" << std::endl; + return false; + } + } + + // Create ray query descriptor set layout and pipeline (but not resources yet - need descriptor pool first) + if (!createRayQueryDescriptorSetLayout()) { + std::cerr << "Failed to create ray query descriptor set layout" << std::endl; + return false; + } + if (!createRayQueryPipeline()) { + std::cerr << "Failed to create ray query pipeline" << std::endl; + return false; + } + + // Create the command pool + if (!createCommandPool()) { + std::cerr << "Failed to create command pool" << std::endl; + return false; + } + + // Create depth resources + if (!createDepthResources()) { + std::cerr << "Failed to create depth resources" << std::endl; + return false; + } + + if (useForwardPlus) { + if (!createDepthPrepassPipeline()) { + std::cerr << "Failed to create depth prepass pipeline" << std::endl; + return false; + } + } + + // Create the descriptor pool + if (!createDescriptorPool()) { + std::cerr << "Failed to create descriptor pool" << std::endl; + return false; + } + + // Create ray query resources AFTER descriptor pool (needs pool for descriptor set allocation) + if (!createRayQueryResources()) { + std::cerr << "Failed to create ray query resources" << std::endl; + return false; + } + + // Note: Acceleration structure build is requested by scene_loading.cpp after entities load + // No need to request it here during init + + // Light storage buffers were already created earlier to satisfy Forward+ binding requirements + + if (!createOpaqueSceneColorResources()) { + std::cerr << "Failed to create opaque scene color resources" << std::endl; + return false; + } + + 
createTransparentDescriptorSets(); + + // Create default texture resources + if (!createDefaultTextureResources()) { + std::cerr << "Failed to create default texture resources" << std::endl; + return false; + } + + // Create fallback transparent descriptor sets (must occur after default textures exist) + createTransparentFallbackDescriptorSets(); + + // Create shared default PBR textures (to avoid creating hundreds of identical textures) + if (!createSharedDefaultPBRTextures()) { + std::cerr << "Failed to create shared default PBR textures" << std::endl; + return false; + } + + // Create command buffers + if (!createCommandBuffers()) { + std::cerr << "Failed to create command buffers" << std::endl; + return false; + } + + // Create sync objects + if (!createSyncObjects()) { + std::cerr << "Failed to create sync objects" << std::endl; + return false; + } + + // Initialize background thread pool for async tasks (textures, etc.) AFTER all Vulkan resources are ready + try { + // Size the thread pool based on hardware concurrency, clamped to a sensible range + unsigned int hw = std::max(2u, std::min(8u, std::thread::hardware_concurrency() ? 
std::thread::hardware_concurrency() : 4u)); + threadPool = std::make_unique(hw); + } catch (const std::exception& e) { + std::cerr << "Failed to create thread pool: " << e.what() << std::endl; + return false; + } + + // Start background uploads worker now that queues/semaphores exist + StartUploadsWorker(); + + // Start watchdog thread to detect application hangs + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + watchdogRunning.store(true, std::memory_order_relaxed); + watchdogThread = std::thread(WatchdogThreadFunc, &lastFrameUpdateTime, &watchdogRunning, &watchdogSuppressed, &watchdogProgressLabel, &watchdogProgressIndex); + + std::cout << "[Watchdog] Started - will abort if no frame updates for 10+ seconds\n"; + + initialized = true; + return true; +} + +void Renderer::ensureThreadLocalVulkanInit() const { + // Initialize Vulkan-Hpp dispatcher per-thread; required for multi-threaded RAII usage + static thread_local bool s_tlsInitialized = false; + if (s_tlsInitialized) + return; + try { + // Initialize the dispatcher for this thread using the global symbol. + VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr); + if (*instance) { + VULKAN_HPP_DEFAULT_DISPATCHER.init(*instance); + } + if (*device) { + VULKAN_HPP_DEFAULT_DISPATCHER.init(*device); + } + s_tlsInitialized = true; + } catch (...) { + // best-effort + } +} + +// Clean up renderer resources +void Renderer::Cleanup() { + // Stop watchdog thread first to prevent false hang detection during shutdown + if (watchdogRunning.load(std::memory_order_relaxed)) { + watchdogRunning.store(false, std::memory_order_relaxed); + if (watchdogThread.joinable()) { + watchdogThread.join(); + } + } + + // Ensure background workers are stopped before tearing down Vulkan resources + StopUploadsWorker(); + + // Disallow any further descriptor writes during shutdown. + // This prevents late updates/frees racing against pool destruction. 
+ descriptorSetsValid.store(false, std::memory_order_relaxed); { + std::lock_guard lk(pendingDescMutex); + pendingDescOps.clear(); + descriptorRefreshPending.store(false, std::memory_order_relaxed); + } { + std::unique_lock lock(threadPoolMutex); + if (threadPool) { + threadPool.reset(); + } + } + + if (!initialized) { + return; + } + + std::cout << "Starting renderer cleanup..." << std::endl; + + // Wait for the device to be idle before cleaning up + try { + WaitIdle(); + } catch (...) { + } + + // 1) Clean up any swapchain-scoped resources first + cleanupSwapChain(); + + // 2) Clear per-entity resources (descriptor sets and buffers) while descriptor pools still exist + for (auto& kv : entityResources) { + auto& resources = kv.second; + resources.basicDescriptorSets.clear(); + resources.pbrDescriptorSets.clear(); + resources.uniformBuffers.clear(); + resources.uniformBufferAllocations.clear(); + resources.uniformBuffersMapped.clear(); + resources.instanceBuffer = nullptr; + resources.instanceBufferAllocation = nullptr; + resources.instanceBufferMapped = nullptr; + } + entityResources.clear(); + + // 3) Clear any global descriptor sets that are allocated from pools to avoid dangling refs + transparentDescriptorSets.clear(); + transparentFallbackDescriptorSets.clear(); + compositeDescriptorSets.clear(); + computeDescriptorSets.clear(); + rqCompositeDescriptorSets.clear(); + + // 3.5) Clear ray query descriptor sets BEFORE destroying descriptor pool + // Without this, rayQueryDescriptorSets' RAII destructor tries to free them after + // the pool is destroyed, causing "Invalid VkDescriptorPool Object" validation errors + rayQueryDescriptorSets.clear(); + + // Ray Query composite sampler/sets are allocated from the shared descriptor pool. + // Ensure they are released before destroying the pool. 
+ rqCompositeSampler = nullptr; + + // 4) Destroy/Reset pipelines and pipeline layouts (graphics/compute/forward+) + graphicsPipeline = nullptr; + pbrGraphicsPipeline = nullptr; + pbrBlendGraphicsPipeline = nullptr; + pbrPremulBlendGraphicsPipeline = nullptr; + pbrPrepassGraphicsPipeline = nullptr; + glassGraphicsPipeline = nullptr; + lightingPipeline = nullptr; + compositePipeline = nullptr; + forwardPlusPipeline = nullptr; + depthPrepassPipeline = nullptr; + + pipelineLayout = nullptr; + pbrPipelineLayout = nullptr; + lightingPipelineLayout = nullptr; + compositePipelineLayout = nullptr; + pbrTransparentPipelineLayout = nullptr; + forwardPlusPipelineLayout = nullptr; + + // 4.3) Ray query pipelines and layouts + rayQueryPipeline = nullptr; + rayQueryPipelineLayout = nullptr; + + // 4.5) Forward+ per-frame resources (including descriptor sets) must be released + // BEFORE destroying descriptor pools to avoid vkFreeDescriptorSets with invalid pool + for (auto& fp : forwardPlusPerFrame) { + fp.tileHeaders = nullptr; + fp.tileHeadersAlloc = nullptr; + fp.tileLightIndices = nullptr; + fp.tileLightIndicesAlloc = nullptr; + fp.params = nullptr; + fp.paramsAlloc = nullptr; + fp.paramsMapped = nullptr; + fp.debugOut = nullptr; + fp.debugOutAlloc = nullptr; + fp.probeOffscreen = nullptr; + fp.probeOffscreenAlloc = nullptr; + fp.probeSwapchain = nullptr; + fp.probeSwapchainAlloc = nullptr; + fp.computeSet = nullptr; // descriptor set allocated from compute/graphics pools + } + forwardPlusPerFrame.clear(); + + // 5) Destroy descriptor set layouts and pools (compute + graphics) + descriptorSetLayout = nullptr; + pbrDescriptorSetLayout = nullptr; + transparentDescriptorSetLayout = nullptr; + compositeDescriptorSetLayout = nullptr; + forwardPlusDescriptorSetLayout = nullptr; + computeDescriptorSetLayout = nullptr; + rayQueryDescriptorSetLayout = nullptr; + + // Pools last, after sets are cleared + computeDescriptorPool = nullptr; + descriptorPool = nullptr; + + // 6) Clear 
textures and aliases, including default resources + { + std::unique_lock lk(textureResourcesMutex); + textureResources.clear(); + textureAliases.clear(); + } + // Reset default texture resources + defaultTextureResources.textureSampler = nullptr; + defaultTextureResources.textureImageView = nullptr; + defaultTextureResources.textureImage = nullptr; + defaultTextureResources.textureImageAllocation = nullptr; + + // 7) Opaque scene color and related descriptors + opaqueSceneColorSampler = nullptr; + opaqueSceneColorImages.clear(); + opaqueSceneColorImageAllocations.clear(); + opaqueSceneColorImageViews.clear(); + opaqueSceneColorImageLayouts.clear(); + + // 7.5) Ray query output image and acceleration structures + rayQueryOutputImageView = nullptr; + rayQueryOutputImage = nullptr; + rayQueryOutputImageAllocation = nullptr; + + // Clear acceleration structures (BLAS and TLAS buffers) + blasStructures.clear(); + tlasStructure = AccelerationStructure{}; + + // 8) (moved above) Forward+ per-frame buffers cleared prior to pool destruction + + // 9) Command buffers/pools + commandBuffers.clear(); + commandPool = nullptr; + computeCommandPool = nullptr; + + // 10) Sync objects + imageAvailableSemaphores.clear(); + renderFinishedSemaphores.clear(); + inFlightFences.clear(); + uploadsTimeline = nullptr; + + // 11) Queues and surface (RAII handles will release upon reset; keep device alive until the end) + graphicsQueue = nullptr; + presentQueue = nullptr; + computeQueue = nullptr; + transferQueue = nullptr; + surface = nullptr; + + // 12) Memory pool last + memoryPool.reset(); + + // Finally mark uninitialized + initialized = false; + std::cout << "Renderer cleanup completed." 
<< std::endl; +} + +// Create instance +bool Renderer::createInstance(const std::string& appName, bool enableValidationLayers) { + try { + // Create application info + vk::ApplicationInfo appInfo{ + .pApplicationName = appName.c_str(), + .applicationVersion = VK_MAKE_VERSION(1, 0, 0), + .pEngineName = "Simple Engine", + .engineVersion = VK_MAKE_VERSION(1, 0, 0), + .apiVersion = VK_API_VERSION_1_3 + }; + + // Get required extensions + std::vector extensions; + + // Add required extensions for GLFW +#if defined(PLATFORM_DESKTOP) + uint32_t glfwExtensionCount = 0; + const char** glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); + extensions.insert(extensions.end(), glfwExtensions, glfwExtensions + glfwExtensionCount); +#endif + + // Add debug extension if validation layers are enabled + if (enableValidationLayers) { + extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + } + + // NEW: Add OpenXR mandatory extensions + if (xrMode) { + auto xrExtensions = xrContext.getVulkanInstanceExtensions(); + extensions.insert(extensions.end(), xrExtensions.begin(), xrExtensions.end()); + } + + // Create instance info + vk::InstanceCreateInfo createInfo{ + .pApplicationInfo = &appInfo, + .enabledExtensionCount = static_cast(extensions.size()), + .ppEnabledExtensionNames = extensions.data() + }; + + // Enable validation layers if requested + vk::ValidationFeaturesEXT validationFeatures{}; + std::vector enabledValidationFeatures; + + if (enableValidationLayers) { + if (!checkValidationLayerSupport()) { + std::cerr << "Validation layers requested, but not available" << std::endl; + return false; + } + + createInfo.enabledLayerCount = static_cast(validationLayers.size()); + createInfo.ppEnabledLayerNames = validationLayers.data(); + + // Keep validation output quiet by default (no DebugPrintf feature). + // Ray Query debugPrintf/printf diagnostics are intentionally removed. 
+ + validationFeatures.enabledValidationFeatureCount = static_cast(enabledValidationFeatures.size()); + validationFeatures.pEnabledValidationFeatures = enabledValidationFeatures.data(); + + createInfo.pNext = &validationFeatures; + } + + // Create instance + instance = vk::raii::Instance(context, createInfo); + + if (xrMode) { + xrContext.setVulkanInstance(*instance); + } + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create instance: " << e.what() << std::endl; + return false; + } +} + +// Setup debug messenger +bool Renderer::setupDebugMessenger(bool enableValidationLayers) { + if (!enableValidationLayers) { + return true; + } + + try { + // Create debug messenger info + vk::DebugUtilsMessengerCreateInfoEXT createInfo{}; + createInfo.messageSeverity = vk::DebugUtilsMessageSeverityFlagBitsEXT::eVerbose | + vk::DebugUtilsMessageSeverityFlagBitsEXT::eInfo | + vk::DebugUtilsMessageSeverityFlagBitsEXT::eWarning | + vk::DebugUtilsMessageSeverityFlagBitsEXT::eError; + createInfo.messageType = vk::DebugUtilsMessageTypeFlagBitsEXT::eGeneral | + vk::DebugUtilsMessageTypeFlagBitsEXT::eValidation | + vk::DebugUtilsMessageTypeFlagBitsEXT::ePerformance; + + // Select callback via simple platform macro: Android typically expects C PFN types in headers + // while desktop (newer Vulkan-Hpp) expects vk:: types. 
+#if defined(__ANDROID__) + createInfo.pfnUserCallback = &debugCallbackVkRaii; +#else + createInfo.pfnUserCallback = &debugCallbackVkHpp; +#endif + + // Create debug messenger + debugMessenger = vk::raii::DebugUtilsMessengerEXT(instance, createInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to set up debug messenger: " << e.what() << std::endl; + return false; + } +} + +// Create surface +bool Renderer::createSurface() { + try { + // Create surface + VkSurfaceKHR _surface; + if (!platform->CreateVulkanSurface(*instance, &_surface)) { + std::cerr << "Failed to create window surface" << std::endl; + return false; + } + + surface = vk::raii::SurfaceKHR(instance, _surface); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create surface: " << e.what() << std::endl; + return false; + } +} + +// Pick a physical device +bool Renderer::pickPhysicalDevice() { + try { + // Get available physical devices + std::vector devices = instance.enumeratePhysicalDevices(); + + if (devices.empty()) { + std::cerr << "Failed to find GPUs with Vulkan support" << std::endl; + return false; + } + + // Prioritize discrete GPUs (like NVIDIA RTX 2080) over integrated GPUs (like Intel UHD Graphics) + // First, collect all suitable devices with their suitability scores + std::multimap suitableDevices; + + for (auto& _device : devices) { + // Print device properties for debugging + vk::PhysicalDeviceProperties deviceProperties = _device.getProperties(); + std::cout << "Checking device: " << deviceProperties.deviceName + << " (Type: " << vk::to_string(deviceProperties.deviceType) << ")" << std::endl; + + if (xrMode) { + // Match the LUID provided by OpenXR + auto props2 = _device.getProperties2(); + const auto& idProps = props2.get(); + + const uint8_t* requiredLuid = xrContext.getRequiredLUID(); + if (requiredLuid && std::memcmp(idProps.deviceLUID, requiredLuid, VK_LUID_SIZE) != 0) { + std::cout << " - LUID mismatch for OpenXR" << 
std::endl; + continue; // Not the right GPU for XR! + } + } + + // Check if the device supports Vulkan 1.3 + bool supportsVulkan1_3 = deviceProperties.apiVersion >= VK_API_VERSION_1_3; + if (!supportsVulkan1_3) { + std::cout << " - Does not support Vulkan 1.3" << std::endl; + continue; + } + + // Check queue families + QueueFamilyIndices indices = findQueueFamilies(_device); + bool supportsGraphics = indices.isComplete(); + if (!supportsGraphics) { + std::cout << " - Missing required queue families" << std::endl; + continue; + } + + // Check device extensions + bool supportsAllRequiredExtensions = checkDeviceExtensionSupport(_device); + if (!supportsAllRequiredExtensions) { + std::cout << " - Missing required extensions" << std::endl; + continue; + } + + // Check swap chain support + SwapChainSupportDetails swapChainSupport = querySwapChainSupport(_device); + bool swapChainAdequate = !swapChainSupport.formats.empty() && !swapChainSupport.presentModes.empty(); + if (!swapChainAdequate) { + std::cout << " - Inadequate swap chain support" << std::endl; + continue; + } + + // Check for required features + auto features = _device.getFeatures2(); + bool supportsRequiredFeatures = features.get().dynamicRendering; + if (!supportsRequiredFeatures) { + std::cout << " - Does not support required features (dynamicRendering)" << std::endl; + continue; + } + + // Calculate suitability score - prioritize discrete GPUs + int score = 0; + + // Discrete GPUs get the highest priority (NVIDIA RTX 2080, AMD, etc.) + if (deviceProperties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) { + score += 1000; + std::cout << " - Discrete GPU: +1000 points" << std::endl; + } + // Integrated GPUs get lower priority (Intel UHD Graphics, etc.) 
+ else if (deviceProperties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) { + score += 100; + std::cout << " - Integrated GPU: +100 points" << std::endl; + } + + // Add points for memory size (more VRAM is better) + vk::PhysicalDeviceMemoryProperties memProperties = _device.getMemoryProperties(); + for (uint32_t i = 0; i < memProperties.memoryHeapCount; i++) { + if (memProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) { + // Add 1 point per GB of VRAM + score += static_cast(memProperties.memoryHeaps[i].size / (1024 * 1024 * 1024)); + break; + } + } + + std::cout << " - Device is suitable with score: " << score << std::endl; + suitableDevices.emplace(score, _device); + } + + if (!suitableDevices.empty()) { + // Select the device with the highest score (discrete GPU with most VRAM) + physicalDevice = suitableDevices.rbegin()->second; + + vk::PhysicalDeviceProperties deviceProperties = physicalDevice.getProperties(); + std::cout << "Selected device: " << deviceProperties.deviceName + << " (Type: " << vk::to_string(deviceProperties.deviceType) + << ", Score: " << suitableDevices.rbegin()->first << ")" << std::endl; + + // Store queue family indices for the selected device + queueFamilyIndices = findQueueFamilies(physicalDevice); + + // Add supported optional extensions + addSupportedOptionalExtensions(); + + return true; + } + std::cerr << "Failed to find a suitable GPU. Make sure your GPU supports Vulkan and has the required extensions." 
<< std::endl; + return false; + } catch (const std::exception& e) { + std::cerr << "Failed to pick physical device: " << e.what() << std::endl; + return false; + } +} + +// Add supported optional extensions +void Renderer::addSupportedOptionalExtensions() { + try { + // Get available extensions + auto availableExtensions = physicalDevice.enumerateDeviceExtensionProperties(); + + // Build a set of available extension names for quick lookup + std::set avail; + for (const auto& e : availableExtensions) { + avail.insert(e.extensionName); + } + + for (const auto& optionalExt : optionalDeviceExtensions) { + if (avail.contains(optionalExt)) { + deviceExtensions.push_back(optionalExt); + std::cout << "Adding optional extension: " << optionalExt << std::endl; + } + } + + // NEW: Add OpenXR mandatory device extensions + if (xrMode) { + auto xrDevExtensions = xrContext.getVulkanDeviceExtensions(*physicalDevice); + for (const auto& ext : xrDevExtensions) { + // Ensure we don't duplicate + if (std::find(deviceExtensions.begin(), deviceExtensions.end(), std::string(ext)) == deviceExtensions.end()) { + deviceExtensions.push_back(ext); + } + } + } + } catch (const std::exception& e) { + std::cerr << "Warning: Failed to add optional extensions: " << e.what() << std::endl; + } +} + +// Create logical device +bool Renderer::createLogicalDevice(bool enableValidationLayers) { + try { + // Create queue create info for each unique queue family + std::vector queueCreateInfos; + std::set uniqueQueueFamilies = { + queueFamilyIndices.graphicsFamily.value(), + queueFamilyIndices.presentFamily.value(), + queueFamilyIndices.computeFamily.value(), + queueFamilyIndices.transferFamily.value() + }; + + float queuePriority = 1.0f; + for (uint32_t queueFamily : uniqueQueueFamilies) { + vk::DeviceQueueCreateInfo queueCreateInfo{ + .queueFamilyIndex = queueFamily, + .queueCount = 1, + .pQueuePriorities = &queuePriority + }; + queueCreateInfos.push_back(queueCreateInfo); + } + + // Query supported 
features before enabling them + auto supportedFeatures = physicalDevice.getFeatures2< + vk::PhysicalDeviceFeatures2, + vk::PhysicalDeviceTimelineSemaphoreFeatures, + vk::PhysicalDeviceVulkanMemoryModelFeatures, + vk::PhysicalDeviceBufferDeviceAddressFeatures, + vk::PhysicalDevice8BitStorageFeatures, + vk::PhysicalDeviceVulkan11Features, + vk::PhysicalDeviceVulkan13Features>(); + + // Verify critical features are supported + const auto& coreSupported = supportedFeatures.get().features; + const auto& timelineSupported = supportedFeatures.get(); + const auto& memoryModelSupported = supportedFeatures.get(); + const auto& bufferAddressSupported = supportedFeatures.get(); + const auto& storage8BitSupported = supportedFeatures.get(); + const auto& vulkan11Supported = supportedFeatures.get(); + const auto& vulkan13Supported = supportedFeatures.get(); + + // Check for required features + if (!coreSupported.samplerAnisotropy || + !timelineSupported.timelineSemaphore || + !memoryModelSupported.vulkanMemoryModel || + !bufferAddressSupported.bufferDeviceAddress || + !vulkan11Supported.shaderDrawParameters || + !vulkan13Supported.dynamicRendering || + !vulkan13Supported.synchronization2) { + throw std::runtime_error("Required Vulkan features not supported by physical device"); + } + + // Enable required features (now verified to be supported) + auto features = physicalDevice.getFeatures2(); + features.features.samplerAnisotropy = vk::True; + features.features.depthBiasClamp = coreSupported.depthBiasClamp ? 
vk::True : vk::False; + + // Explicitly configure device features to prevent validation layer warnings + // These features are required by extensions or other features, so we enable them explicitly + + // Timeline semaphore features (required for synchronization2) + vk::PhysicalDeviceTimelineSemaphoreFeatures timelineSemaphoreFeatures; + timelineSemaphoreFeatures.timelineSemaphore = vk::True; + + // NEW: Enable Multiview if in XR mode (Chapter 8) + vk::PhysicalDeviceMultiviewFeatures multiviewFeatures; + multiviewFeatures.multiview = xrMode ? vk::True : vk::False; + multiviewFeatures.pNext = &timelineSemaphoreFeatures; + + // Vulkan memory model features (required for some shader operations) + vk::PhysicalDeviceVulkanMemoryModelFeatures memoryModelFeatures; + memoryModelFeatures.vulkanMemoryModel = vk::True; + memoryModelFeatures.vulkanMemoryModelDeviceScope = memoryModelSupported.vulkanMemoryModelDeviceScope ? vk::True : vk::False; + memoryModelFeatures.pNext = &multiviewFeatures; + + // Buffer device address features (required for some buffer operations) + vk::PhysicalDeviceBufferDeviceAddressFeatures bufferDeviceAddressFeatures; + bufferDeviceAddressFeatures.bufferDeviceAddress = vk::True; + + // 8-bit storage features (required for some shader storage operations) + vk::PhysicalDevice8BitStorageFeatures storage8BitFeatures; + storage8BitFeatures.storageBuffer8BitAccess = storage8BitSupported.storageBuffer8BitAccess ? 
vk::True : vk::False; + + // Enable Vulkan 1.3 features + vk::PhysicalDeviceVulkan13Features vulkan13Features; + vulkan13Features.dynamicRendering = vk::True; + vulkan13Features.synchronization2 = vk::True; + + // Vulkan 1.1 features: shaderDrawParameters to satisfy SPIR-V DrawParameters capability + vk::PhysicalDeviceVulkan11Features vulkan11Features{}; + vulkan11Features.shaderDrawParameters = vk::True; + // Query extended feature support +#if !defined(PLATFORM_ANDROID) + auto featureChain = physicalDevice.getFeatures2< + vk::PhysicalDeviceFeatures2, + vk::PhysicalDeviceDescriptorIndexingFeatures, + vk::PhysicalDeviceRobustness2FeaturesEXT, + vk::PhysicalDeviceDynamicRenderingLocalReadFeaturesKHR, + vk::PhysicalDeviceShaderTileImageFeaturesEXT, + vk::PhysicalDeviceAccelerationStructureFeaturesKHR, + vk::PhysicalDeviceRayQueryFeaturesKHR>(); + const auto& localReadSupported = featureChain.get(); + const auto& tileImageSupported = featureChain.get(); +#else + auto featureChain = physicalDevice.getFeatures2< + vk::PhysicalDeviceFeatures2, + vk::PhysicalDeviceDescriptorIndexingFeatures, + vk::PhysicalDeviceRobustness2FeaturesEXT, + vk::PhysicalDeviceAccelerationStructureFeaturesKHR, + vk::PhysicalDeviceRayQueryFeaturesKHR>(); +#endif + const auto& coreFeaturesSupported = featureChain.get().features; + const auto& indexingFeaturesSupported = featureChain.get(); + const auto& robust2Supported = featureChain.get(); + const auto& accelerationStructureSupported = featureChain.get(); + const auto& rayQuerySupported = featureChain.get(); + + // Ray Query shader uses indexing into a (large) sampled-image array. + // Some drivers require this core feature to be explicitly enabled. 
+ if (coreFeaturesSupported.shaderSampledImageArrayDynamicIndexing) { + features.features.shaderSampledImageArrayDynamicIndexing = vk::True; + } + + // Prepare descriptor indexing features to enable if supported + vk::PhysicalDeviceDescriptorIndexingFeatures indexingFeaturesEnable{}; + descriptorIndexingEnabled = false; + // Enable non-uniform indexing of sampled image arrays when supported — required for + // `NonUniformResourceIndex()` in the ray-query shader to actually take effect. + if (indexingFeaturesSupported.shaderSampledImageArrayNonUniformIndexing) { + indexingFeaturesEnable.shaderSampledImageArrayNonUniformIndexing = vk::True; + descriptorIndexingEnabled = true; + } + + // These are not strictly required when writing a fully-populated descriptor array, + // but enabling them when available avoids edge-case driver behavior for large arrays. + if (descriptorIndexingEnabled) { + if (indexingFeaturesSupported.descriptorBindingPartiallyBound) { + indexingFeaturesEnable.descriptorBindingPartiallyBound = vk::True; + } + if (indexingFeaturesSupported.descriptorBindingUpdateUnusedWhilePending) { + indexingFeaturesEnable.descriptorBindingUpdateUnusedWhilePending = vk::True; + } + } + // Optionally enable UpdateAfterBind flags when supported (not strictly required for RQ textures) + if (indexingFeaturesSupported.descriptorBindingSampledImageUpdateAfterBind) + indexingFeaturesEnable.descriptorBindingSampledImageUpdateAfterBind = vk::True; + if (indexingFeaturesSupported.descriptorBindingUniformBufferUpdateAfterBind) + indexingFeaturesEnable.descriptorBindingUniformBufferUpdateAfterBind = vk::True; + if (indexingFeaturesSupported.descriptorBindingUpdateUnusedWhilePending) + indexingFeaturesEnable.descriptorBindingUpdateUnusedWhilePending = vk::True; + + // Helper to check if an extension is enabled (using string comparison) + auto hasExtension = [&](const char* name) { + return std::find_if(deviceExtensions.begin(), + deviceExtensions.end(), + [&](const char* ext) { 
+ return std::strcmp(ext, name) == 0; + }) != deviceExtensions.end(); + }; + + // Prepare Robustness2 features if the extension is enabled and device supports + auto hasRobust2 = hasExtension(VK_EXT_ROBUSTNESS_2_EXTENSION_NAME); + vk::PhysicalDeviceRobustness2FeaturesEXT robust2Enable{}; + if (hasRobust2) { + if (robust2Supported.robustBufferAccess2) + robust2Enable.robustBufferAccess2 = vk::True; + if (robust2Supported.robustImageAccess2) + robust2Enable.robustImageAccess2 = vk::True; + if (robust2Supported.nullDescriptor) + robust2Enable.nullDescriptor = vk::True; + } + +#if !defined(PLATFORM_ANDROID) + // Prepare Dynamic Rendering Local Read features if extension is enabled and supported + auto hasLocalRead = hasExtension(VK_KHR_DYNAMIC_RENDERING_LOCAL_READ_EXTENSION_NAME); + vk::PhysicalDeviceDynamicRenderingLocalReadFeaturesKHR localReadEnable{}; + if (hasLocalRead && localReadSupported.dynamicRenderingLocalRead) { + localReadEnable.dynamicRenderingLocalRead = vk::True; + } + + // Prepare Shader Tile Image features if extension is enabled and supported + auto hasTileImage = hasExtension(VK_EXT_SHADER_TILE_IMAGE_EXTENSION_NAME); + vk::PhysicalDeviceShaderTileImageFeaturesEXT tileImageEnable{}; + if (hasTileImage) { + if (tileImageSupported.shaderTileImageColorReadAccess) + tileImageEnable.shaderTileImageColorReadAccess = vk::True; + if (tileImageSupported.shaderTileImageDepthReadAccess) + tileImageEnable.shaderTileImageDepthReadAccess = vk::True; + if (tileImageSupported.shaderTileImageStencilReadAccess) + tileImageEnable.shaderTileImageStencilReadAccess = vk::True; + } +#endif + + // Prepare Acceleration Structure features if extension is enabled and supported + auto hasAccelerationStructure = hasExtension(VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME); + vk::PhysicalDeviceAccelerationStructureFeaturesKHR accelerationStructureEnable{}; + if (hasAccelerationStructure && accelerationStructureSupported.accelerationStructure) { + 
accelerationStructureEnable.accelerationStructure = vk::True; + } + + // Prepare Ray Query features if extension is enabled and supported + auto hasRayQuery = hasExtension(VK_KHR_RAY_QUERY_EXTENSION_NAME); + vk::PhysicalDeviceRayQueryFeaturesKHR rayQueryEnable{}; + if (hasRayQuery && rayQuerySupported.rayQuery) { + rayQueryEnable.rayQuery = vk::True; + } + + // Chain the feature structures together (build pNext chain explicitly) + // Base + features.pNext = &multiviewFeatures; + multiviewFeatures.pNext = &timelineSemaphoreFeatures; + timelineSemaphoreFeatures.pNext = &memoryModelFeatures; + memoryModelFeatures.pNext = &bufferDeviceAddressFeatures; + bufferDeviceAddressFeatures.pNext = &storage8BitFeatures; + storage8BitFeatures.pNext = &vulkan11Features; // link 1.1 first + vulkan11Features.pNext = &vulkan13Features; // then 1.3 features + + // Build tail chain starting at Vulkan 1.3 features + void** tailNext = reinterpret_cast(&vulkan13Features.pNext); + if (descriptorIndexingEnabled) { + *tailNext = &indexingFeaturesEnable; + tailNext = reinterpret_cast(&indexingFeaturesEnable.pNext); + } + if (hasRobust2) { + *tailNext = &robust2Enable; + tailNext = reinterpret_cast(&robust2Enable.pNext); + } +#if !defined(PLATFORM_ANDROID) + if (hasLocalRead) { + *tailNext = &localReadEnable; + tailNext = reinterpret_cast(&localReadEnable.pNext); + } + if (hasTileImage) { + *tailNext = &tileImageEnable; + tailNext = reinterpret_cast(&tileImageEnable.pNext); + } +#endif + if (hasAccelerationStructure) { + *tailNext = &accelerationStructureEnable; + tailNext = reinterpret_cast(&accelerationStructureEnable.pNext); + } + if (hasRayQuery) { + *tailNext = &rayQueryEnable; + tailNext = reinterpret_cast(&rayQueryEnable.pNext); + } + + // Record which features ended up enabled (for runtime decisions/tutorial diagnostics) + robustness2Enabled = hasRobust2 && (robust2Enable.robustBufferAccess2 == vk::True || + robust2Enable.robustImageAccess2 == vk::True || + robust2Enable.nullDescriptor 
== vk::True); +#if !defined(PLATFORM_ANDROID) + dynamicRenderingLocalReadEnabled = hasLocalRead && (localReadEnable.dynamicRenderingLocalRead == vk::True); + shaderTileImageEnabled = hasTileImage && (tileImageEnable.shaderTileImageColorReadAccess == vk::True || + tileImageEnable.shaderTileImageDepthReadAccess == vk::True || + tileImageEnable.shaderTileImageStencilReadAccess == vk::True); +#else + dynamicRenderingLocalReadEnabled = false; + shaderTileImageEnabled = false; +#endif + accelerationStructureEnabled = hasAccelerationStructure && (accelerationStructureEnable.accelerationStructure == vk::True); + rayQueryEnabled = hasRayQuery && (rayQueryEnable.rayQuery == vk::True); + + // One-time startup diagnostics (Ray Query + texture array indexing) + static bool printedFeatureDiag = false; + if (!printedFeatureDiag) { + printedFeatureDiag = true; + std::cout << "[DeviceFeatures] shaderSampledImageArrayDynamicIndexing=" + << (features.features.shaderSampledImageArrayDynamicIndexing == vk::True ? "ON" : "OFF") + << ", shaderSampledImageArrayNonUniformIndexing=" + << (indexingFeaturesEnable.shaderSampledImageArrayNonUniformIndexing == vk::True ? "ON" : "OFF") + << ", descriptorIndexingEnabled=" + << (descriptorIndexingEnabled ? "true" : "false") + << "\n"; + } + + // Create a device. Device layers are deprecated and ignored, so we + // only configure extensions and features here; validation is enabled + // via instance layers. 
+ vk::DeviceCreateInfo createInfo{ + .pNext = &features, + .queueCreateInfoCount = static_cast(queueCreateInfos.size()), + .pQueueCreateInfos = queueCreateInfos.data(), + .enabledExtensionCount = static_cast(deviceExtensions.size()), + .ppEnabledExtensionNames = deviceExtensions.data(), + .pEnabledFeatures = nullptr // Using pNext for features + }; + + // Create the logical device + device = vk::raii::Device(physicalDevice, createInfo); + + // After logical device is created, we can initialize the OpenXR session + if (xrMode) { + if (!xrContext.createSession(*physicalDevice, *device, queueFamilyIndices.graphicsFamily.value(), 0)) { + std::cerr << "Failed to create OpenXR session" << std::endl; + return false; + } + } + + // Get queue handles + graphicsQueue = vk::raii::Queue(device, queueFamilyIndices.graphicsFamily.value(), 0); + presentQueue = vk::raii::Queue(device, queueFamilyIndices.presentFamily.value(), 0); + computeQueue = vk::raii::Queue(device, queueFamilyIndices.computeFamily.value(), 0); + transferQueue = vk::raii::Queue(device, queueFamilyIndices.transferFamily.value(), 0); + + // Create global timeline semaphore for uploads early (needed before default texture creation) + vk::StructureChain timelineChain( + {}, + {.semaphoreType = vk::SemaphoreType::eTimeline, .initialValue = 0}); + uploadsTimeline = vk::raii::Semaphore(device, timelineChain.get()); + uploadTimelineLastSubmitted.store(0, std::memory_order_relaxed); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create logical device: " << e.what() << std::endl; + return false; + } +} + +// Check validation layer support +bool Renderer::checkValidationLayerSupport() const { + // Get available layers + std::vector availableLayers = context.enumerateInstanceLayerProperties(); + + // Check if all requested layers are available + for (const char* layerName : validationLayers) { + bool layerFound = false; + + for (const auto& layerProperties : availableLayers) { + if 
(strcmp(layerName, layerProperties.layerName) == 0) { + layerFound = true; + break; + } + } + + if (!layerFound) { + return false; + } + } + + return true; +} diff --git a/attachments/openxr_engine/renderer_pipelines.cpp b/attachments/openxr_engine/renderer_pipelines.cpp new file mode 100644 index 00000000..e9dccc9b --- /dev/null +++ b/attachments/openxr_engine/renderer_pipelines.cpp @@ -0,0 +1,1412 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "mesh_component.h" +#include "renderer.h" +#include +#include +#include + +// This file contains pipeline-related methods from the Renderer class + +// Create a descriptor set layout +bool Renderer::createDescriptorSetLayout() { + try { + // Create binding for a uniform buffer + vk::DescriptorSetLayoutBinding uboLayoutBinding{ + .binding = 0, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }; + + // Create binding for texture sampler + vk::DescriptorSetLayoutBinding samplerLayoutBinding{ + .binding = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }; + + // Create a descriptor set layout + std::array bindings = {uboLayoutBinding, samplerLayoutBinding}; + + // Descriptor indexing: set per-binding flags for UPDATE_AFTER_BIND if enabled + vk::DescriptorSetLayoutBindingFlagsCreateInfo bindingFlagsInfo{}; + std::array bindingFlags{}; + if (descriptorIndexingEnabled) { + bindingFlags[0] = vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending; + bindingFlags[1] = vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending; + bindingFlagsInfo.bindingCount = static_cast(bindingFlags.size()); + bindingFlagsInfo.pBindingFlags = bindingFlags.data(); + } + + vk::DescriptorSetLayoutCreateInfo layoutInfo{}; + layoutInfo.bindingCount = static_cast(bindings.size()); + layoutInfo.pBindings = bindings.data(); + if (descriptorIndexingEnabled) { + layoutInfo.flags |= vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool; + layoutInfo.pNext = &bindingFlagsInfo; + } + + descriptorSetLayout = vk::raii::DescriptorSetLayout(device, layoutInfo); + return true; + } catch (const std::exception& e) { 
+ std::cerr << "Failed to create descriptor set layout: " << e.what() << std::endl; + return false; + } +} + +// Create PBR descriptor set layout +bool Renderer::createPBRDescriptorSetLayout() { + try { + // Create descriptor set layout bindings for PBR shader + std::array bindings = { + // Binding 0: Uniform buffer (UBO) + vk::DescriptorSetLayoutBinding{ + .binding = 0, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 1: Base color map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 2: Metallic roughness map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 2, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 3: Normal map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 3, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 4: Occlusion map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 4, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 5: Emissive map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 5, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 6: Light storage buffer (shadows removed) + 
vk::DescriptorSetLayoutBinding{ + .binding = 6, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 7: Forward+ tile headers SSBO + vk::DescriptorSetLayoutBinding{ + .binding = 7, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 8: Forward+ tile light indices SSBO + vk::DescriptorSetLayoutBinding{ + .binding = 8, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 9: Fragment debug output buffer (optional) + vk::DescriptorSetLayoutBinding{ + .binding = 9, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 10: Reflection texture (planar reflections) + vk::DescriptorSetLayoutBinding{ + .binding = 10, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 11: TLAS (ray-query shadows in raster fragment shader) + vk::DescriptorSetLayoutBinding{ + .binding = 11, + .descriptorType = vk::DescriptorType::eAccelerationStructureKHR, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 12: Ray-query geometry info buffer (per-instance addresses + material indices) + vk::DescriptorSetLayoutBinding{ + .binding = 12, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 13: Ray-query material buffer (PBR material properties) + 
vk::DescriptorSetLayoutBinding{ + .binding = 13, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + } + }; + + // Create a descriptor set layout + // Descriptor indexing: set per-binding flags for UPDATE_AFTER_BIND on UBO (0) and sampled images (1..5) + vk::DescriptorSetLayoutBindingFlagsCreateInfo bindingFlagsInfo{}; + std::array bindingFlags{}; + if (descriptorIndexingEnabled) { + bindingFlags[0] = vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending; + bindingFlags[1] = vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending; + bindingFlags[10] = vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending; + bindingFlagsInfo.bindingCount = static_cast(bindingFlags.size()); + bindingFlagsInfo.pBindingFlags = bindingFlags.data(); + } + + vk::DescriptorSetLayoutCreateInfo layoutInfo{}; + layoutInfo.bindingCount = static_cast(bindings.size()); + layoutInfo.pBindings = bindings.data(); + if (descriptorIndexingEnabled) { + layoutInfo.flags |= vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool; + layoutInfo.pNext = &bindingFlagsInfo; + } + + pbrDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, layoutInfo); + + // Binding 7: transparent passes input + // Layout for Set 1: Just the scene color texture + vk::DescriptorSetLayoutBinding sceneColorBinding{ + .binding = 0, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .descriptorCount = 1, .stageFlags = vk::ShaderStageFlagBits::eFragment + }; + vk::DescriptorSetLayoutCreateInfo transparentLayoutInfo{.bindingCount = 1, .pBindings = &sceneColorBinding}; + if (descriptorIndexingEnabled) { + // Make this sampler binding update-after-bind safe as well (optional) + vk::DescriptorSetLayoutBindingFlagsCreateInfo transBindingFlagsInfo{}; + 
vk::DescriptorBindingFlags transFlags = vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending; + transBindingFlagsInfo.bindingCount = 1; + transBindingFlagsInfo.pBindingFlags = &transFlags; + transparentLayoutInfo.flags |= vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool; + transparentLayoutInfo.pNext = &transBindingFlagsInfo; + + // Create the layout while the pNext chain is still valid (avoid dangling pointer) + transparentDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, transparentLayoutInfo); + } else { + // Create without extra binding flags + transparentDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, transparentLayoutInfo); + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create PBR descriptor set layout: " << e.what() << std::endl; + return false; + } +} + +// Create a graphics pipeline +bool Renderer::createGraphicsPipeline() { + try { + // Read shader code + auto shaderCode = readFile("shaders/texturedMesh.spv"); + + // Create shader modules + vk::raii::ShaderModule shaderModule = createShaderModule(shaderCode); + + // Create shader stage info + vk::PipelineShaderStageCreateInfo vertShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *shaderModule, + .pName = "VSMain" + }; + + vk::PipelineShaderStageCreateInfo fragShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *shaderModule, + .pName = "PSMain" + }; + + // Fragment entry point specialized for architectural glass + vk::PipelineShaderStageCreateInfo fragGlassStageInfo{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *shaderModule, + .pName = "GlassPSMain" + }; + + vk::PipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo}; + + // Create vertex input info with instancing support + auto vertexBindingDescription = Vertex::getBindingDescription(); + auto instanceBindingDescription = 
InstanceData::getBindingDescription(); + std::array bindingDescriptions = { + vertexBindingDescription, + instanceBindingDescription + }; + + auto vertexAttributeDescriptions = Vertex::getAttributeDescriptions(); + auto instanceAttributeDescriptions = InstanceData::getAttributeDescriptions(); + + // Combine all attribute descriptions (no duplicates) + std::vector allAttributeDescriptions; + allAttributeDescriptions.insert(allAttributeDescriptions.end(), vertexAttributeDescriptions.begin(), vertexAttributeDescriptions.end()); + allAttributeDescriptions.insert(allAttributeDescriptions.end(), instanceAttributeDescriptions.begin(), instanceAttributeDescriptions.end()); + + // Note: materialIndex attribute (Location 11) is not used by current shaders + + vk::PipelineVertexInputStateCreateInfo vertexInputInfo{ + .vertexBindingDescriptionCount = static_cast(bindingDescriptions.size()), + .pVertexBindingDescriptions = bindingDescriptions.data(), + .vertexAttributeDescriptionCount = static_cast(allAttributeDescriptions.size()), + .pVertexAttributeDescriptions = allAttributeDescriptions.data() + }; + + // Create input assembly info + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{ + .topology = vk::PrimitiveTopology::eTriangleList, + .primitiveRestartEnable = VK_FALSE + }; + + // Create viewport state info + vk::PipelineViewportStateCreateInfo viewportState{ + .viewportCount = 1, + .scissorCount = 1 + }; + + // Create rasterization state info + vk::PipelineRasterizationStateCreateInfo rasterizer{ + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = vk::PolygonMode::eFill, + .cullMode = vk::CullModeFlagBits::eNone, + .frontFace = vk::FrontFace::eCounterClockwise, + .depthBiasEnable = VK_FALSE, + .lineWidth = 1.0f + }; + + // Create multisample state info + vk::PipelineMultisampleStateCreateInfo multisampling{ + .rasterizationSamples = vk::SampleCountFlagBits::e1, + .sampleShadingEnable = VK_FALSE + }; + + // Create depth stencil 
state info + vk::PipelineDepthStencilStateCreateInfo depthStencil{ + .depthTestEnable = VK_TRUE, + .depthWriteEnable = VK_TRUE, + // Use LessOrEqual so that the main shading pass works after a depth pre-pass + .depthCompareOp = vk::CompareOp::eLessOrEqual, + .depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_FALSE + }; + + // Create a color blend attachment state + vk::PipelineColorBlendAttachmentState colorBlendAttachment{ + .blendEnable = VK_FALSE, + .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA + }; + + // Create color blend state info + vk::PipelineColorBlendStateCreateInfo colorBlending{ + .logicOpEnable = VK_FALSE, + .logicOp = vk::LogicOp::eCopy, + .attachmentCount = 1, + .pAttachments = &colorBlendAttachment + }; + + // Create dynamic state info + std::vector dynamicStates = { + vk::DynamicState::eViewport, + vk::DynamicState::eScissor + }; + + vk::PipelineDynamicStateCreateInfo dynamicState{ + .dynamicStateCount = static_cast(dynamicStates.size()), + .pDynamicStates = dynamicStates.data() + }; + + // Create pipeline layout + vk::PipelineLayoutCreateInfo pipelineLayoutInfo{ + .setLayoutCount = 1, + .pSetLayouts = &*descriptorSetLayout, + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr + }; + + pipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo); + + // Create pipeline rendering info + vk::Format depthFormat = findDepthFormat(); + std::cout << "Creating main graphics pipeline with depth format: " << static_cast(depthFormat) << std::endl; + + // Initialize member variable for proper lifetime management + mainPipelineRenderingCreateInfo = vk::PipelineRenderingCreateInfo{ + .viewMask = xrMode ? 
0x3u : 0x0u, + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainImageFormat, + .depthAttachmentFormat = depthFormat, + .stencilAttachmentFormat = vk::Format::eUndefined + }; + + // Create the graphics pipeline + vk::PipelineRasterizationStateCreateInfo rasterizerBack = rasterizer; + // Disable back-face culling for opaque PBR to avoid disappearing geometry when + // instance/model transforms flip winding (ensures PASS 1 actually shades pixels) + rasterizerBack.cullMode = vk::CullModeFlagBits::eNone; + + vk::GraphicsPipelineCreateInfo pipelineInfo{ + .pNext = &mainPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizerBack, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencil, + .pColorBlendState = &colorBlending, + .pDynamicState = &dynamicState, + .layout = *pipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + + graphicsPipeline = vk::raii::Pipeline(device, nullptr, pipelineInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create graphics pipeline: " << e.what() << std::endl; + return false; + } +} + +// Create PBR pipeline +bool Renderer::createPBRPipeline() { + try { + // Create PBR descriptor set layout + if (!createPBRDescriptorSetLayout()) { + return false; + } + + // Read shader code + auto shaderCode = readFile("shaders/pbr.spv"); + + // Create shader modules + vk::raii::ShaderModule shaderModule = createShaderModule(shaderCode); + + // Create shader stage info + vk::PipelineShaderStageCreateInfo vertShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *shaderModule, + .pName = "VSMain" + }; + + vk::PipelineShaderStageCreateInfo fragShaderStageInfo{ + .stage = 
vk::ShaderStageFlagBits::eFragment, + .module = *shaderModule, + .pName = "PSMain" + }; + + // Fragment entry point specialized for architectural glass + vk::PipelineShaderStageCreateInfo fragGlassStageInfo{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *shaderModule, + .pName = "GlassPSMain" + }; + + vk::PipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo}; + + // Define vertex and instance binding descriptions + auto vertexBindingDescription = Vertex::getBindingDescription(); + auto instanceBindingDescription = InstanceData::getBindingDescription(); + std::array bindingDescriptions = { + vertexBindingDescription, + instanceBindingDescription + }; + + // Define vertex and instance attribute descriptions + auto vertexAttributeDescriptions = Vertex::getAttributeDescriptions(); + auto instanceModelMatrixAttributes = InstanceData::getModelMatrixAttributeDescriptions(); + auto instanceNormalMatrixAttributes = InstanceData::getNormalMatrixAttributeDescriptions(); + + // Combine all attribute descriptions + std::vector allAttributeDescriptions; + allAttributeDescriptions.insert(allAttributeDescriptions.end(), vertexAttributeDescriptions.begin(), vertexAttributeDescriptions.end()); + allAttributeDescriptions.insert(allAttributeDescriptions.end(), instanceModelMatrixAttributes.begin(), instanceModelMatrixAttributes.end()); + allAttributeDescriptions.insert(allAttributeDescriptions.end(), instanceNormalMatrixAttributes.begin(), instanceNormalMatrixAttributes.end()); + + vk::PipelineVertexInputStateCreateInfo vertexInputInfo{ + .vertexBindingDescriptionCount = static_cast(bindingDescriptions.size()), + .pVertexBindingDescriptions = bindingDescriptions.data(), + .vertexAttributeDescriptionCount = static_cast(allAttributeDescriptions.size()), + .pVertexAttributeDescriptions = allAttributeDescriptions.data() + }; + + // Create input assembly info + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{ + .topology = 
vk::PrimitiveTopology::eTriangleList, + .primitiveRestartEnable = VK_FALSE + }; + + // Create viewport state info + vk::PipelineViewportStateCreateInfo viewportState{ + .viewportCount = 1, + .scissorCount = 1 + }; + + // Create rasterization state info + vk::PipelineRasterizationStateCreateInfo rasterizer{ + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = vk::PolygonMode::eFill, + .cullMode = vk::CullModeFlagBits::eNone, + .frontFace = vk::FrontFace::eCounterClockwise, + .depthBiasEnable = VK_FALSE, + .lineWidth = 1.0f + }; + + // Create multisample state info + vk::PipelineMultisampleStateCreateInfo multisampling{ + .rasterizationSamples = vk::SampleCountFlagBits::e1, + .sampleShadingEnable = VK_FALSE + }; + + // Create depth stencil state info + vk::PipelineDepthStencilStateCreateInfo depthStencil{ + .depthTestEnable = VK_TRUE, + .depthWriteEnable = VK_TRUE, + .depthCompareOp = vk::CompareOp::eLess, + .depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_FALSE + }; + + // Create a color blend attachment state + vk::PipelineColorBlendAttachmentState colorBlendAttachment{ + .blendEnable = VK_FALSE, + .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA + }; + + // Create color blend state info + vk::PipelineColorBlendStateCreateInfo colorBlending{ + .logicOpEnable = VK_FALSE, + .logicOp = vk::LogicOp::eCopy, + .attachmentCount = 1, + .pAttachments = &colorBlendAttachment + }; + + // Create dynamic state info + std::vector dynamicStates = { + vk::DynamicState::eViewport, + vk::DynamicState::eScissor + }; + + vk::PipelineDynamicStateCreateInfo dynamicState{ + .dynamicStateCount = static_cast(dynamicStates.size()), + .pDynamicStates = dynamicStates.data() + }; + + // Create push constant range for material properties + vk::PushConstantRange pushConstantRange{ + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .offset = 0, + 
.size = sizeof(MaterialProperties) + }; + + std::array transparentSetLayouts = {*pbrDescriptorSetLayout, *transparentDescriptorSetLayout}; + // Create a pipeline layout for opaque PBR with only the PBR descriptor set (set 0) + std::array pbrOnlySetLayouts = {*pbrDescriptorSetLayout}; + // Create BOTH pipeline layouts with two descriptor sets (PBR set 0 + scene color set 1) + vk::PipelineLayoutCreateInfo pipelineLayoutInfo{ + .setLayoutCount = static_cast(transparentSetLayouts.size()), + .pSetLayouts = transparentSetLayouts.data(), + .pushConstantRangeCount = 1, + .pPushConstantRanges = &pushConstantRange + }; + + pbrPipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo); + + // Transparent PBR layout uses the same two-set layout + vk::PipelineLayoutCreateInfo transparentPipelineLayoutInfo{.setLayoutCount = static_cast(transparentSetLayouts.size()), .pSetLayouts = transparentSetLayouts.data(), .pushConstantRangeCount = 1, .pPushConstantRanges = &pushConstantRange}; + pbrTransparentPipelineLayout = vk::raii::PipelineLayout(device, transparentPipelineLayoutInfo); + + // Create pipeline rendering info + vk::Format depthFormat = findDepthFormat(); + + // Initialize member variable for proper lifetime management + pbrPipelineRenderingCreateInfo = vk::PipelineRenderingCreateInfo{ + .viewMask = xrMode ? 
0x3u : 0x0u, + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainImageFormat, + .depthAttachmentFormat = depthFormat, + .stencilAttachmentFormat = vk::Format::eUndefined + }; + + // 1) Opaque PBR pipeline (no blending, depth writes enabled) + vk::PipelineColorBlendAttachmentState opaqueBlendAttachment = colorBlendAttachment; + opaqueBlendAttachment.blendEnable = VK_FALSE; + vk::PipelineColorBlendStateCreateInfo colorBlendingOpaque{ + .logicOpEnable = VK_FALSE, + .logicOp = vk::LogicOp::eCopy, + .attachmentCount = 1, + .pAttachments = &opaqueBlendAttachment + }; + vk::PipelineDepthStencilStateCreateInfo depthStencilOpaque = depthStencil; + depthStencilOpaque.depthWriteEnable = VK_TRUE; + + vk::PipelineRasterizationStateCreateInfo rasterizerBack = rasterizer; + rasterizerBack.cullMode = vk::CullModeFlagBits::eBack; + + // For architectural glass we often want to see both the inner and outer + // walls of thin shells (e.g., bar glasses viewed from above). Use + // no culling for the glass pipeline to render both sides, while + // keeping back-face culling for the generic PBR pipelines. 
+ vk::PipelineRasterizationStateCreateInfo rasterizerGlass = rasterizer; + rasterizerGlass.cullMode = vk::CullModeFlagBits::eNone; + + vk::GraphicsPipelineCreateInfo opaquePipelineInfo{ + + .pNext = &pbrPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizerBack, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencilOpaque, + .pColorBlendState = &colorBlendingOpaque, + .pDynamicState = &dynamicState, + .layout = *pbrPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + pbrGraphicsPipeline = vk::raii::Pipeline(device, nullptr, opaquePipelineInfo); + + // 1b) Opaque PBR pipeline variant for color pass after a depth pre-pass. + // Depth writes disabled (read-only) and compare against pre-pass depth. 
+ vk::PipelineDepthStencilStateCreateInfo depthStencilAfterPrepass = depthStencil; + depthStencilAfterPrepass.depthTestEnable = VK_TRUE; + depthStencilAfterPrepass.depthWriteEnable = VK_FALSE; + depthStencilAfterPrepass.depthCompareOp = vk::CompareOp::eEqual; + + vk::GraphicsPipelineCreateInfo opaqueAfterPrepassInfo{ + + .pNext = &pbrPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizerBack, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencilAfterPrepass, + .pColorBlendState = &colorBlendingOpaque, + .pDynamicState = &dynamicState, + .layout = *pbrPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + pbrPrepassGraphicsPipeline = vk::raii::Pipeline(device, nullptr, opaqueAfterPrepassInfo); + + // 1c) Reflection PBR pipeline for mirrored off-screen pass (cull none to avoid winding issues) + vk::PipelineRasterizationStateCreateInfo rasterizerReflection = rasterizer; + rasterizerReflection.cullMode = vk::CullModeFlagBits::eNone; + vk::GraphicsPipelineCreateInfo reflectionPipelineInfo{ + + .pNext = &pbrPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizerReflection, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencilOpaque, + .pColorBlendState = &colorBlendingOpaque, + .pDynamicState = &dynamicState, + .layout = *pbrPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + pbrReflectionGraphicsPipeline = vk::raii::Pipeline(device, nullptr, reflectionPipelineInfo); + + 
// 2) Blended PBR pipeline (straight alpha blending, depth writes disabled for translucency) + vk::PipelineColorBlendAttachmentState blendedAttachment = colorBlendAttachment; + blendedAttachment.blendEnable = VK_TRUE; + // Straight alpha blending: out.rgb = src.rgb*src.a + dst.rgb*(1-src.a) + blendedAttachment.srcColorBlendFactor = vk::BlendFactor::eSrcAlpha; + blendedAttachment.dstColorBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha; + // Alpha channel keeps destination scaled by inverse src alpha + blendedAttachment.srcAlphaBlendFactor = vk::BlendFactor::eOne; + blendedAttachment.dstAlphaBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha; + vk::PipelineColorBlendStateCreateInfo colorBlendingBlended{.attachmentCount = 1, .pAttachments = &blendedAttachment}; + vk::PipelineDepthStencilStateCreateInfo depthStencilBlended = depthStencil; + depthStencilBlended.depthWriteEnable = VK_FALSE; + depthStencilBlended.depthCompareOp = vk::CompareOp::eLessOrEqual; + + vk::GraphicsPipelineCreateInfo blendedPipelineInfo{ + + .pNext = &pbrPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + // Use back-face culling for the blended (glass) pipeline to avoid + // rendering both front and back faces of thin glass geometry, which + // can cause flickering as the camera rotates due to overlapping + // transparent surfaces passing the depth test. 
+ .pRasterizationState = &rasterizerBack, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencilBlended, + .pColorBlendState = &colorBlendingBlended, + .pDynamicState = &dynamicState, + .layout = *pbrTransparentPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + pbrBlendGraphicsPipeline = vk::raii::Pipeline(device, nullptr, blendedPipelineInfo); + + // 3) Glass pipeline (architectural glass) - uses the same vertex input and + // descriptor layouts, but a dedicated fragment shader entry point + // (GlassPSMain) for more stable glass shading. + vk::PipelineShaderStageCreateInfo glassStages[] = {vertShaderStageInfo, fragGlassStageInfo}; + + vk::GraphicsPipelineCreateInfo glassPipelineInfo{ + + .pNext = &pbrPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = glassStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizerGlass, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencilBlended, + .pColorBlendState = &colorBlendingBlended, + .pDynamicState = &dynamicState, + .layout = *pbrTransparentPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + glassGraphicsPipeline = vk::raii::Pipeline(device, nullptr, glassPipelineInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create PBR pipeline: " << e.what() << std::endl; + return false; + } +} + +// Create fullscreen composite pipeline (samples off-screen color and writes to swapchain) +bool Renderer::createCompositePipeline() { + try { + // Reuse the transparent descriptor set layout (binding 0 = combined image sampler) + if (*transparentDescriptorSetLayout == nullptr) { + // Ensure PBR pipeline path created it + if (!createPBRPipeline()) { + return false; + 
} + } + + // Read composite shader code + auto shaderCode = readFile("shaders/composite.spv"); + vk::raii::ShaderModule shaderModule = createShaderModule(shaderCode); + + // Shader stages + vk::PipelineShaderStageCreateInfo vert{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *shaderModule, + .pName = "VSMain" + }; + vk::PipelineShaderStageCreateInfo frag{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *shaderModule, + .pName = "PSMain" + }; + vk::PipelineShaderStageCreateInfo stages[] = {vert, frag}; + + // No vertex inputs (fullscreen triangle via SV_VertexID) + vk::PipelineVertexInputStateCreateInfo vertexInput{}; + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{.topology = vk::PrimitiveTopology::eTriangleList}; + vk::PipelineViewportStateCreateInfo viewportState{.viewportCount = 1, .scissorCount = 1}; + vk::PipelineRasterizationStateCreateInfo rasterizer{.polygonMode = vk::PolygonMode::eFill, .cullMode = vk::CullModeFlagBits::eNone, .frontFace = vk::FrontFace::eCounterClockwise, .lineWidth = 1.0f}; + vk::PipelineMultisampleStateCreateInfo multisampling{.rasterizationSamples = vk::SampleCountFlagBits::e1}; + // No depth + vk::PipelineDepthStencilStateCreateInfo depthStencil{.depthTestEnable = VK_FALSE, .depthWriteEnable = VK_FALSE}; + // No blending (we clear swapchain before this and blend transparents later) + vk::PipelineColorBlendAttachmentState attachment{ + .blendEnable = VK_FALSE, + .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | + vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA + }; + vk::PipelineColorBlendStateCreateInfo colorBlending{.attachmentCount = 1, .pAttachments = &attachment}; + std::array dynStates = {vk::DynamicState::eViewport, vk::DynamicState::eScissor}; + vk::PipelineDynamicStateCreateInfo dynamicState{.dynamicStateCount = static_cast(dynStates.size()), .pDynamicStates = dynStates.data()}; + + // Pipeline layout: single set (combined image sampler) + push 
constants for exposure/gamma/srgb flag + vk::DescriptorSetLayout setLayouts[] = {*transparentDescriptorSetLayout}; + vk::PushConstantRange pushRange{.stageFlags = vk::ShaderStageFlagBits::eFragment, .offset = 0, .size = 16}; // matches struct Push in composite.slang + vk::PipelineLayoutCreateInfo plInfo{.setLayoutCount = 1, .pSetLayouts = setLayouts, .pushConstantRangeCount = 1, .pPushConstantRanges = &pushRange}; + compositePipelineLayout = vk::raii::PipelineLayout(device, plInfo); + + // Dynamic rendering info + compositePipelineRenderingCreateInfo = vk::PipelineRenderingCreateInfo{ + .viewMask = xrMode ? 0x3u : 0x0u, + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainImageFormat, + .depthAttachmentFormat = vk::Format::eUndefined, + .stencilAttachmentFormat = vk::Format::eUndefined + }; + + vk::GraphicsPipelineCreateInfo pipeInfo{ + + .pNext = &compositePipelineRenderingCreateInfo, + .stageCount = 2, + .pStages = stages, + .pVertexInputState = &vertexInput, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizer, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencil, + .pColorBlendState = &colorBlending, + .pDynamicState = &dynamicState, + .layout = *compositePipelineLayout, + .renderPass = nullptr, + .subpass = 0 + }; + + compositePipeline = vk::raii::Pipeline(device, nullptr, pipeInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create composite pipeline: " << e.what() << std::endl; + return false; + } +} + +// Create Depth Pre-pass pipeline (depth-only) +bool Renderer::createDepthPrepassPipeline() { + try { + // Use the same descriptor set layout and pipeline layout as PBR for UBOs and instancing + if (*pbrDescriptorSetLayout == nullptr || *pbrPipelineLayout == nullptr) { + if (!createPBRPipeline()) { + return false; + } + } + + // Read PBR shader (vertex only) + auto shaderCode = readFile("shaders/pbr.spv"); + 
vk::raii::ShaderModule shaderModule = createShaderModule(shaderCode); + + // Stages: Vertex only + vk::PipelineShaderStageCreateInfo vertStage{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *shaderModule, + .pName = "VSMain" + }; + + // Vertex/instance bindings & attributes same as PBR + auto vertexBindingDescription = Vertex::getBindingDescription(); + auto instanceBindingDescription = InstanceData::getBindingDescription(); + std::array bindingDescriptions = { + vertexBindingDescription, + instanceBindingDescription + }; + + auto vertexAttributeDescriptions = Vertex::getAttributeDescriptions(); + auto instanceModelMatrixAttributes = InstanceData::getModelMatrixAttributeDescriptions(); + auto instanceNormalMatrixAttributes = InstanceData::getNormalMatrixAttributeDescriptions(); + std::vector allAttributes; + allAttributes.insert(allAttributes.end(), vertexAttributeDescriptions.begin(), vertexAttributeDescriptions.end()); + allAttributes.insert(allAttributes.end(), instanceModelMatrixAttributes.begin(), instanceModelMatrixAttributes.end()); + allAttributes.insert(allAttributes.end(), instanceNormalMatrixAttributes.begin(), instanceNormalMatrixAttributes.end()); + + vk::PipelineVertexInputStateCreateInfo vertexInputInfo{ + .vertexBindingDescriptionCount = static_cast(bindingDescriptions.size()), + .pVertexBindingDescriptions = bindingDescriptions.data(), + .vertexAttributeDescriptionCount = static_cast(allAttributes.size()), + .pVertexAttributeDescriptions = allAttributes.data() + }; + + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{ + .topology = vk::PrimitiveTopology::eTriangleList, + .primitiveRestartEnable = VK_FALSE + }; + + // Dummy viewport/scissor (dynamic) + vk::PipelineViewportStateCreateInfo viewportState{ + .viewportCount = 1, + .scissorCount = 1 + }; + + vk::PipelineRasterizationStateCreateInfo rasterizer{ + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = vk::PolygonMode::eFill, + .cullMode = 
vk::CullModeFlagBits::eBack, + .frontFace = vk::FrontFace::eCounterClockwise, + .depthBiasEnable = VK_FALSE, + .lineWidth = 1.0f + }; + + vk::PipelineMultisampleStateCreateInfo multisampling{ + .rasterizationSamples = vk::SampleCountFlagBits::e1 + }; + + vk::PipelineDepthStencilStateCreateInfo depthStencil{ + .depthTestEnable = VK_TRUE, + .depthWriteEnable = VK_TRUE, + .depthCompareOp = vk::CompareOp::eLessOrEqual, + .depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_FALSE + }; + + // No color attachments + vk::PipelineColorBlendStateCreateInfo colorBlending{ + .logicOpEnable = VK_FALSE, + .attachmentCount = 0, + .pAttachments = nullptr + }; + + std::array dynamicStates = {vk::DynamicState::eViewport, vk::DynamicState::eScissor}; + vk::PipelineDynamicStateCreateInfo dynamicState{ + .dynamicStateCount = static_cast(dynamicStates.size()), + .pDynamicStates = dynamicStates.data() + }; + + vk::Format depthFormat = findDepthFormat(); + vk::PipelineRenderingCreateInfo renderingInfo{ + .viewMask = xrMode ? 
0x3u : 0x0u, + .colorAttachmentCount = 0, + .pColorAttachmentFormats = nullptr, + .depthAttachmentFormat = depthFormat + }; + + vk::GraphicsPipelineCreateInfo pipelineInfo{ + .pNext = &renderingInfo, + .stageCount = 1, + .pStages = &vertStage, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizer, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencil, + .pColorBlendState = &colorBlending, + .pDynamicState = &dynamicState, + .layout = *pbrPipelineLayout + }; + + depthPrepassPipeline = vk::raii::Pipeline(device, nullptr, pipelineInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create depth pre-pass pipeline: " << e.what() << std::endl; + return false; + } +} + +// Create a lighting pipeline +bool Renderer::createLightingPipeline() { + try { + // Read shader code + auto shaderCode = readFile("shaders/lighting.spv"); + + // Create shader modules + vk::raii::ShaderModule shaderModule = createShaderModule(shaderCode); + + // Create shader stage info + vk::PipelineShaderStageCreateInfo vertShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *shaderModule, + .pName = "VSMain" + }; + + vk::PipelineShaderStageCreateInfo fragShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *shaderModule, + .pName = "PSMain" + }; + + vk::PipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo}; + + // Create vertex input info + auto bindingDescription = Vertex::getBindingDescription(); + auto attributeDescriptions = Vertex::getAttributeDescriptions(); + + vk::PipelineVertexInputStateCreateInfo vertexInputInfo{ + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &bindingDescription, + .vertexAttributeDescriptionCount = static_cast(attributeDescriptions.size()), + .pVertexAttributeDescriptions = attributeDescriptions.data() + }; + + // 
Create input assembly info + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{ + .topology = vk::PrimitiveTopology::eTriangleList, + .primitiveRestartEnable = VK_FALSE + }; + + // Create viewport state info + vk::PipelineViewportStateCreateInfo viewportState{ + .viewportCount = 1, + .scissorCount = 1 + }; + + // Create rasterization state info + vk::PipelineRasterizationStateCreateInfo rasterizer{ + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = vk::PolygonMode::eFill, + .cullMode = vk::CullModeFlagBits::eNone, + .frontFace = vk::FrontFace::eCounterClockwise, + .depthBiasEnable = VK_FALSE, + .lineWidth = 1.0f + }; + + // Create multisample state info + vk::PipelineMultisampleStateCreateInfo multisampling{ + .rasterizationSamples = vk::SampleCountFlagBits::e1, + .sampleShadingEnable = VK_FALSE + }; + + // Create depth stencil state info + vk::PipelineDepthStencilStateCreateInfo depthStencil{ + .depthTestEnable = VK_TRUE, + .depthWriteEnable = VK_TRUE, + .depthCompareOp = vk::CompareOp::eLess, + .depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_FALSE + }; + + // Create a color blend attachment state + vk::PipelineColorBlendAttachmentState colorBlendAttachment{ + .blendEnable = VK_TRUE, + .srcColorBlendFactor = vk::BlendFactor::eSrcAlpha, + .dstColorBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha, + .colorBlendOp = vk::BlendOp::eAdd, + .srcAlphaBlendFactor = vk::BlendFactor::eOne, + .dstAlphaBlendFactor = vk::BlendFactor::eZero, + .alphaBlendOp = vk::BlendOp::eAdd, + .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA + }; + + // Create color blend state info + vk::PipelineColorBlendStateCreateInfo colorBlending{ + .logicOpEnable = VK_FALSE, + .logicOp = vk::LogicOp::eCopy, + .attachmentCount = 1, + .pAttachments = &colorBlendAttachment + }; + + // Create dynamic state info + std::vector dynamicStates = { + 
vk::DynamicState::eViewport, + vk::DynamicState::eScissor + }; + + vk::PipelineDynamicStateCreateInfo dynamicState{ + .dynamicStateCount = static_cast(dynamicStates.size()), + .pDynamicStates = dynamicStates.data() + }; + + // Create push constant range for material properties + vk::PushConstantRange pushConstantRange{ + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .offset = 0, + .size = sizeof(MaterialProperties) + }; + + // Create pipeline layout + vk::PipelineLayoutCreateInfo pipelineLayoutInfo{ + .setLayoutCount = 1, + .pSetLayouts = &*descriptorSetLayout, + .pushConstantRangeCount = 1, + .pPushConstantRanges = &pushConstantRange + }; + + lightingPipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo); + + // Create pipeline rendering info + vk::Format depthFormat = findDepthFormat(); + + // Initialize member variable for proper lifetime management + lightingPipelineRenderingCreateInfo = vk::PipelineRenderingCreateInfo{ + .viewMask = xrMode ? 0x3u : 0x0u, + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainImageFormat, + .depthAttachmentFormat = depthFormat, + .stencilAttachmentFormat = vk::Format::eUndefined + }; + + // Create a graphics pipeline + vk::PipelineRasterizationStateCreateInfo rasterizerBack = rasterizer; + rasterizerBack.cullMode = vk::CullModeFlagBits::eBack; + + vk::GraphicsPipelineCreateInfo pipelineInfo{ + + .pNext = &lightingPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizerBack, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencil, + .pColorBlendState = &colorBlending, + .pDynamicState = &dynamicState, + .layout = *lightingPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + + lightingPipeline = 
vk::raii::Pipeline(device, nullptr, pipelineInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create lighting pipeline: " << e.what() << std::endl; + return false; + } +} + +// Push material properties to the pipeline +void Renderer::pushMaterialProperties(vk::CommandBuffer commandBuffer, const MaterialProperties& material) const { + commandBuffer.pushConstants(*pbrPipelineLayout, vk::ShaderStageFlagBits::eFragment, 0, sizeof(MaterialProperties), &material); +} + +bool Renderer::createRayQueryDescriptorSetLayout() { + // Production layout: 7 bindings (0..6), no debug buffer at 7 + std::array bindings{}; + + // Binding 0: UBO (UniformBufferObject) + bindings[0].binding = 0; + bindings[0].descriptorType = vk::DescriptorType::eUniformBuffer; + bindings[0].descriptorCount = 1; + bindings[0].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Binding 1: TLAS (Top-Level Acceleration Structure) + bindings[1].binding = 1; + bindings[1].descriptorType = vk::DescriptorType::eAccelerationStructureKHR; + bindings[1].descriptorCount = 1; + bindings[1].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Binding 2: Output image (storage image) + bindings[2].binding = 2; + bindings[2].descriptorType = vk::DescriptorType::eStorageImage; + bindings[2].descriptorCount = 1; + bindings[2].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Binding 3: Light buffer (storage buffer) + bindings[3].binding = 3; + bindings[3].descriptorType = vk::DescriptorType::eStorageBuffer; + bindings[3].descriptorCount = 1; + bindings[3].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Binding 4: Geometry info buffer (maps BLAS geometry index to vertex/index buffer addresses) + bindings[4].binding = 4; + bindings[4].descriptorType = vk::DescriptorType::eStorageBuffer; + bindings[4].descriptorCount = 1; + bindings[4].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Binding 5: Material buffer (array of material properties) + bindings[5].binding = 5; 
+ bindings[5].descriptorType = vk::DescriptorType::eStorageBuffer; + bindings[5].descriptorCount = 1; + bindings[5].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Binding 6: BaseColor textures array (combined image samplers) + bindings[6].binding = 6; + bindings[6].descriptorType = vk::DescriptorType::eCombinedImageSampler; + bindings[6].descriptorCount = RQ_MAX_TEX; // large static array + bindings[6].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Descriptor indexing / update-after-bind support: + // The ray query shader indexes a large `eCombinedImageSampler` array with a per-pixel varying index. + // On some drivers this requires descriptor indexing features + layout binding flags to avoid the + // array collapsing to slot 0 (resulting in "no textures" even when `texIndex>0`). + std::array bindingFlags{}; + if (descriptorIndexingEnabled) { + // Binding 6 is the large sampled texture array. + bindingFlags[6] = vk::DescriptorBindingFlagBits::eUpdateAfterBind | + vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending | + vk::DescriptorBindingFlagBits::ePartiallyBound; + } + + vk::DescriptorSetLayoutBindingFlagsCreateInfo bindingFlagsInfo{}; + if (descriptorIndexingEnabled) { + bindingFlagsInfo.bindingCount = static_cast(bindingFlags.size()); + bindingFlagsInfo.pBindingFlags = bindingFlags.data(); + } + + vk::DescriptorSetLayoutCreateInfo layoutInfo{}; + if (descriptorIndexingEnabled) { + layoutInfo.pNext = &bindingFlagsInfo; + layoutInfo.flags = vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool; + } + layoutInfo.bindingCount = static_cast(bindings.size()); + layoutInfo.pBindings = bindings.data(); + + try { + rayQueryDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, layoutInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create ray query descriptor set layout: " << e.what() << std::endl; + return false; + } +} + +bool Renderer::createRayQueryPipeline() { + // Check if ray query is supported 
on this device + if (!rayQueryEnabled || !accelerationStructureEnabled) { + std::cout << "Ray query rendering not available on this device (missing VK_KHR_ray_query or VK_KHR_acceleration_structure support)\n"; + return true; // Not an error - just skip ray query pipeline creation + } + + // Load compiled shader module + auto shaderCode = readFile("shaders/ray_query.spv"); + if (shaderCode.empty()) { + std::cerr << "Failed to load ray query shader\n"; + return false; + } + + vk::ShaderModuleCreateInfo createInfo{}; + createInfo.codeSize = shaderCode.size(); + createInfo.pCode = reinterpret_cast(shaderCode.data()); + + vk::raii::ShaderModule shaderModule(device, createInfo); + + vk::PipelineShaderStageCreateInfo shaderStage{}; + shaderStage.stage = vk::ShaderStageFlagBits::eCompute; + shaderStage.module = *shaderModule; + shaderStage.pName = "main"; + + // Create pipeline layout + vk::PipelineLayoutCreateInfo pipelineLayoutInfo{}; + pipelineLayoutInfo.setLayoutCount = 1; + pipelineLayoutInfo.pSetLayouts = &(*rayQueryDescriptorSetLayout); + + rayQueryPipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo); + + // Create compute pipeline + vk::ComputePipelineCreateInfo pipelineInfo{}; + pipelineInfo.stage = shaderStage; + pipelineInfo.layout = *rayQueryPipelineLayout; + + try { + rayQueryPipeline = vk::raii::Pipeline(device, nullptr, pipelineInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create ray query pipeline: " << e.what() << std::endl; + return false; + } +} + +bool Renderer::createRayQueryResources() { + try { + // Create output image using memory pool (storage image for compute shader) + // Use an HDR-capable format for Ray Query so PBR lighting can accumulate in linear space + // before composite applies exposure/gamma. + // Fall back to R8G8B8A8_UNORM if the device does not support storage-image usage. 
+ vk::Format rqFormat = vk::Format::eR16G16B16A16Sfloat; { + auto props = physicalDevice.getFormatProperties(rqFormat); + if (!(props.optimalTilingFeatures & vk::FormatFeatureFlagBits::eStorageImage)) { + rqFormat = vk::Format::eR8G8B8A8Unorm; + } + } + auto [image, allocation] = memoryPool->createImage( + swapChainExtent.width, + swapChainExtent.height, + rqFormat, + vk::ImageTiling::eOptimal, + vk::ImageUsageFlagBits::eStorage | vk::ImageUsageFlagBits::eTransferSrc | vk::ImageUsageFlagBits::eSampled, + vk::MemoryPropertyFlagBits::eDeviceLocal, + 1, + // mipLevels + vk::SharingMode::eExclusive, + {} // queueFamilies + ); + + rayQueryOutputImage = std::move(image); + rayQueryOutputImageAllocation = std::move(allocation); + + // Create image view + vk::ImageViewCreateInfo viewInfo{}; + viewInfo.image = *rayQueryOutputImage; + viewInfo.viewType = vk::ImageViewType::e2D; + viewInfo.format = rqFormat; + viewInfo.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor; + viewInfo.subresourceRange.baseMipLevel = 0; + viewInfo.subresourceRange.levelCount = 1; + viewInfo.subresourceRange.baseArrayLayer = 0; + viewInfo.subresourceRange.layerCount = 1; + + rayQueryOutputImageView = vk::raii::ImageView(device, viewInfo); + + // Transition output image to GENERAL layout for compute shader writes + transitionImageLayout(*rayQueryOutputImage, + rqFormat, + vk::ImageLayout::eUndefined, + vk::ImageLayout::eGeneral, + 1); + + // Allocate descriptor sets (one per frame in flight) + std::vector layouts(MAX_FRAMES_IN_FLIGHT, *rayQueryDescriptorSetLayout); + vk::DescriptorSetAllocateInfo allocInfo{}; + allocInfo.descriptorPool = *descriptorPool; + allocInfo.descriptorSetCount = MAX_FRAMES_IN_FLIGHT; + allocInfo.pSetLayouts = layouts.data(); + + // Allocate into a temporary owning container, then move the individual RAII sets into our vector. + // (Avoid assigning `vk::raii::DescriptorSets` directly into `std::vector`.) 
+ { + auto sets = vk::raii::DescriptorSets(device, allocInfo); + rayQueryDescriptorSets.clear(); + rayQueryDescriptorSets.reserve(sets.size()); + for (auto& s : sets) { + rayQueryDescriptorSets.emplace_back(std::move(s)); + } + } + + // Create descriptor sets for composite pass to sample the rayQueryOutputImage + // Reuse the transparentDescriptorSetLayout (binding 0 = combined image sampler) + if (*transparentDescriptorSetLayout == nullptr) { + // Ensure it exists (created by PBR path); + createPBRPipeline(); + } + if (*transparentDescriptorSetLayout != nullptr) { + // Ensure we have a valid sampler for sampling the ray-query output image + if (*rqCompositeSampler == nullptr) { + vk::SamplerCreateInfo sci{ + .magFilter = vk::Filter::eLinear, + .minFilter = vk::Filter::eLinear, + .mipmapMode = vk::SamplerMipmapMode::eNearest, + .addressModeU = vk::SamplerAddressMode::eClampToEdge, + .addressModeV = vk::SamplerAddressMode::eClampToEdge, + .addressModeW = vk::SamplerAddressMode::eClampToEdge, + .mipLodBias = 0.0f, + .anisotropyEnable = VK_FALSE, + .maxAnisotropy = 1.0f, + .compareEnable = VK_FALSE, + .compareOp = vk::CompareOp::eAlways, + .minLod = 0.0f, + .maxLod = 0.0f, + .borderColor = vk::BorderColor::eIntOpaqueBlack, + .unnormalizedCoordinates = VK_FALSE + }; + rqCompositeSampler = vk::raii::Sampler(device, sci); + } + std::vector rqLayouts(MAX_FRAMES_IN_FLIGHT, *transparentDescriptorSetLayout); + vk::DescriptorSetAllocateInfo rqAllocInfo{ + .descriptorPool = *descriptorPool, + .descriptorSetCount = MAX_FRAMES_IN_FLIGHT, + .pSetLayouts = rqLayouts.data() + }; { + auto sets = vk::raii::DescriptorSets(device, rqAllocInfo); + rqCompositeDescriptorSets.clear(); + rqCompositeDescriptorSets.reserve(sets.size()); + for (auto& s : sets) { + rqCompositeDescriptorSets.emplace_back(std::move(s)); + } + } + + // Update each set to sample the rayQueryOutputImage + for (size_t i = 0; i < rqCompositeDescriptorSets.size(); ++i) { + // Use a dedicated sampler to avoid null 
sampler issues during early init + vk::Sampler samplerHandle = *rqCompositeSampler; + vk::DescriptorImageInfo imgInfo{ + .sampler = samplerHandle, + .imageView = *rayQueryOutputImageView, + .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal + }; + vk::WriteDescriptorSet write{ + .dstSet = *rqCompositeDescriptorSets[i], + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .pImageInfo = &imgInfo + }; + device.updateDescriptorSets({write}, {}); + } + } + + // Create dedicated UBO buffers for ray query (one per frame in flight) + rayQueryUniformBuffers.clear(); + rayQueryUniformAllocations.clear(); + rayQueryUniformBuffersMapped.clear(); + + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { + auto [uboBuffer, uboAlloc] = createBufferPooled( + sizeof(RayQueryUniformBufferObject), + vk::BufferUsageFlagBits::eUniformBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + rayQueryUniformBuffers.push_back(std::move(uboBuffer)); + rayQueryUniformAllocations.push_back(std::move(uboAlloc)); + rayQueryUniformBuffersMapped.push_back(rayQueryUniformAllocations.back()->mappedPtr); + } + + std::cout << "Ray query resources created successfully (including " << MAX_FRAMES_IN_FLIGHT << " dedicated UBOs)\n"; + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create ray query resources: " << e.what() << std::endl; + return false; + } +} diff --git a/attachments/openxr_engine/renderer_rendering.cpp b/attachments/openxr_engine/renderer_rendering.cpp new file mode 100644 index 00000000..783fed89 --- /dev/null +++ b/attachments/openxr_engine/renderer_rendering.cpp @@ -0,0 +1,2971 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "imgui/imgui.h" +#include "imgui_system.h" +#include "mesh_component.h" +#include "model_loader.h" +#include "renderer.h" +#include "transform_component.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ===================== Culling helpers implementation ===================== + +Renderer::FrustumPlanes Renderer::extractFrustumPlanes(const glm::mat4& vp) { + // Work in row-major form for standard plane extraction by transposing GLM's column-major matrix + glm::mat4 m = glm::transpose(vp); + FrustumPlanes fp{}; + // Left : m[3] + m[0] + fp.planes[0] = m[3] + m[0]; + // Right : m[3] - m[0] + fp.planes[1] = m[3] - m[0]; + // Bottom : m[3] + m[1] + fp.planes[2] = m[3] + m[1]; + // Top : m[3] - m[1] + fp.planes[3] = m[3] - m[1]; + // Near : m[2] (matches Vulkan [0, 1] clip range) + fp.planes[4] = m[2]; + // Far : m[3] - m[2] + fp.planes[5] = m[3] - m[2]; + + // Normalize planes + for (auto& p : fp.planes) { + glm::vec3 n(p.x, p.y, p.z); + float len = glm::length(n); + if (len > 0.0f) { + p /= len; + } + } + return fp; +} + +void Renderer::transformAABB(const glm::mat4& M, + const glm::vec3& localMin, + const glm::vec3& localMax, + glm::vec3& outMin, + glm::vec3& outMax) { + // OBB (from model) to world AABB using center/extents and absolute 3x3 + const glm::vec3 c = 0.5f * (localMin + localMax); + const glm::vec3 e = 0.5f * (localMax - localMin); + + const glm::vec3 worldCenter = glm::vec3(M * glm::vec4(c, 1.0f)); + // Upper-left 3x3 + const glm::mat3 
A = glm::mat3(M); + const glm::mat3 AbsA = glm::mat3(glm::abs(A[0]), glm::abs(A[1]), glm::abs(A[2])); + const glm::vec3 worldExtents = AbsA * e; // component-wise combination + + outMin = worldCenter - worldExtents; + outMax = worldCenter + worldExtents; +} + +bool Renderer::aabbIntersectsFrustum(const glm::vec3& worldMin, + const glm::vec3& worldMax, + const FrustumPlanes& frustum) { + // Use the p-vertex test against each plane; if outside any plane → culled + for (const auto& p : frustum.planes) { + const glm::vec3 n(p.x, p.y, p.z); + // Choose positive vertex (furthest in direction of normal) + glm::vec3 v{ + n.x >= 0.0f ? worldMax.x : worldMin.x, + n.y >= 0.0f ? worldMax.y : worldMin.y, + n.z >= 0.0f ? worldMax.z : worldMin.z + }; + + // If the most positive vertex is still on the negative side of the plane, + // then the entire box is on the negative side. + // Use a small epsilon to avoid numerical issues. + if (glm::dot(n, v) + p.w < -0.01f) { + return false; // completely outside + } + } + return true; +} + +// This file contains rendering-related methods from the Renderer class + +// Create swap chain +bool Renderer::createSwapChain() { + try { + if (xrMode) { + // NEW: Negotiate with OpenXR instead of the windowing system + // Query swap chain support (though XR often dictates this) + vk::Extent2D xrExtent = xrContext.getRecommendedExtent(); + xrContext.createSwapchains(*device, vk::Format::eB8G8R8A8Srgb, xrExtent); + + swapChainImageFormat = xrContext.getSwapchainFormat(); + swapChainExtent = xrContext.getSwapchainExtent(); + + // Use a single swapchain with 2 layers for multiview + eyeSwapchainImages[0] = xrContext.enumerateSwapchainImages(); + eyeSwapchainImages[1].clear(); // Not used in multiview mode + + // We still use swapChainImages as a dummy or to track common state if needed + return true; + } + + // Query swap chain support + SwapChainSupportDetails swapChainSupport = querySwapChainSupport(physicalDevice); + + // Choose swap surface format, 
present mode, and extent + vk::SurfaceFormatKHR surfaceFormat = chooseSwapSurfaceFormat(swapChainSupport.formats); + vk::PresentModeKHR presentMode = chooseSwapPresentMode(swapChainSupport.presentModes); + vk::Extent2D extent = chooseSwapExtent(swapChainSupport.capabilities); + + // Choose image count + uint32_t imageCount = swapChainSupport.capabilities.minImageCount + 1; + if (swapChainSupport.capabilities.maxImageCount > 0 && imageCount > swapChainSupport.capabilities.maxImageCount) { + imageCount = swapChainSupport.capabilities.maxImageCount; + } + + // Create swap chain info + vk::SwapchainCreateInfoKHR createInfo{ + .surface = *surface, + .minImageCount = imageCount, + .imageFormat = surfaceFormat.format, + .imageColorSpace = surfaceFormat.colorSpace, + .imageExtent = extent, + .imageArrayLayers = 1, + .imageUsage = vk::ImageUsageFlagBits::eColorAttachment | vk::ImageUsageFlagBits::eTransferDst, + .preTransform = swapChainSupport.capabilities.currentTransform, + .compositeAlpha = vk::CompositeAlphaFlagBitsKHR::eOpaque, + .presentMode = presentMode, + .clipped = VK_TRUE, + .oldSwapchain = nullptr + }; + + // Find queue families + QueueFamilyIndices indices = findQueueFamilies(physicalDevice); + std::array queueFamilyIndicesLoc = {indices.graphicsFamily.value(), indices.presentFamily.value()}; + + // Set sharing mode + if (indices.graphicsFamily != indices.presentFamily) { + createInfo.imageSharingMode = vk::SharingMode::eConcurrent; + createInfo.queueFamilyIndexCount = static_cast(queueFamilyIndicesLoc.size()); + createInfo.pQueueFamilyIndices = queueFamilyIndicesLoc.data(); + } else { + createInfo.imageSharingMode = vk::SharingMode::eExclusive; + createInfo.queueFamilyIndexCount = 0; + createInfo.pQueueFamilyIndices = nullptr; + } + + // Create swap chain + swapChain = vk::raii::SwapchainKHR(device, createInfo); + + // Get swap chain images + swapChainImages = swapChain.getImages(); + + // Swapchain images start in UNDEFINED layout; track per-image layout for 
correct barriers. + swapChainImageLayouts.assign(swapChainImages.size(), vk::ImageLayout::eUndefined); + + // Store swap chain format and extent + swapChainImageFormat = surfaceFormat.format; + swapChainExtent = extent; + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create swap chain: " << e.what() << std::endl; + return false; + } +} + +// ===================== Planar reflections resources ===================== +bool Renderer::createReflectionResources(uint32_t width, uint32_t height) { + try { + destroyReflectionResources(); + reflections.clear(); + reflections.resize(MAX_FRAMES_IN_FLIGHT); + reflectionVPs.clear(); + reflectionVPs.resize(MAX_FRAMES_IN_FLIGHT, glm::mat4(1.0f)); + sampleReflectionVP = glm::mat4(1.0f); + + for (uint32_t i = 0; i < MAX_FRAMES_IN_FLIGHT; ++i) { + auto& rt = reflections[i]; + rt.width = width; + rt.height = height; + + // Color RT: use swapchain format to match existing PBR pipeline rendering formats + vk::Format colorFmt = swapChainImageFormat; + auto [colorImg, colorAlloc] = createImagePooled( + width, + height, + colorFmt, + vk::ImageTiling::eOptimal, + // Allow sampling in glass and blitting to swapchain for diagnostics + vk::ImageUsageFlagBits::eColorAttachment | vk::ImageUsageFlagBits::eSampled | vk::ImageUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eDeviceLocal, + 1, + 1, + vk::SharingMode::eExclusive, + {}); + rt.color = std::move(colorImg); + rt.colorAlloc = std::move(colorAlloc); + rt.colorView = createImageView(rt.color, colorFmt, vk::ImageAspectFlagBits::eColor, 1); + // Simple sampler for sampling reflection texture (no mips) + vk::SamplerCreateInfo sampInfo{.magFilter = vk::Filter::eLinear, .minFilter = vk::Filter::eLinear, .mipmapMode = vk::SamplerMipmapMode::eNearest, .addressModeU = vk::SamplerAddressMode::eClampToEdge, .addressModeV = vk::SamplerAddressMode::eClampToEdge, .addressModeW = vk::SamplerAddressMode::eClampToEdge, .minLod = 0.0f, .maxLod = 0.0f}; + 
rt.colorSampler = vk::raii::Sampler(device, sampInfo); + + // Depth RT + vk::Format depthFmt = findDepthFormat(); + auto [depthImg, depthAlloc] = createImagePooled( + width, + height, + depthFmt, + vk::ImageTiling::eOptimal, + vk::ImageUsageFlagBits::eDepthStencilAttachment, + vk::MemoryPropertyFlagBits::eDeviceLocal, + 1, + 1, + vk::SharingMode::eExclusive, + {}); + rt.depth = std::move(depthImg); + rt.depthAlloc = std::move(depthAlloc); + rt.depthView = createImageView(rt.depth, depthFmt, vk::ImageAspectFlagBits::eDepth, 1); + } + + // One-time initialization: transition all per-frame reflection color images + // from UNDEFINED to SHADER_READ_ONLY_OPTIMAL so that the first frame can + // legally sample the "previous" frame's image. + if (!reflections.empty()) { + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value() + }; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{.commandPool = *tempPool, .level = vk::CommandBufferLevel::ePrimary, .commandBufferCount = 1}; + vk::raii::CommandBuffers cbs(device, allocInfo); + vk::raii::CommandBuffer& cb = cbs[0]; + cb.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}); + + std::vector barriers; + barriers.reserve(reflections.size()); + for (auto& rt : reflections) { + if (!!*rt.color) { + barriers.push_back(vk::ImageMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, + .srcAccessMask = vk::AccessFlagBits2::eNone, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *rt.color, + .subresourceRange = 
{vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }); + } + } + if (!barriers.empty()) { + vk::DependencyInfo depInfo{.imageMemoryBarrierCount = static_cast(barriers.size()), .pImageMemoryBarriers = barriers.data()}; + cb.pipelineBarrier2(depInfo); + } + cb.end(); + vk::SubmitInfo submit{.commandBufferCount = 1, .pCommandBuffers = &*cb}; + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); { + std::lock_guard lock(queueMutex); + graphicsQueue.submit(submit, *fence); + } + vk::Result result = waitForFencesSafe(*fence, VK_TRUE); + if (result != vk::Result::eSuccess) { + std::cerr << "Error: Failed to wait for reflection resource fence: " << vk::to_string(result) << std::endl; + } + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create reflection resources: " << e.what() << std::endl; + destroyReflectionResources(); + return false; + } +} + +void Renderer::destroyReflectionResources() { + for (auto& rt : reflections) { + rt.colorSampler = vk::raii::Sampler(nullptr); + rt.colorView = vk::raii::ImageView(nullptr); + rt.colorAlloc = nullptr; + rt.color = vk::raii::Image(nullptr); + rt.depthView = vk::raii::ImageView(nullptr); + rt.depthAlloc = nullptr; + rt.depth = vk::raii::Image(nullptr); + rt.width = rt.height = 0; + } +} + +void Renderer::renderReflectionPass(vk::raii::CommandBuffer& cmd, + const glm::vec4& planeWS, + CameraComponent* camera, + const std::vector& jobs) { + if (reflections.empty()) + return; + auto& rt = reflections[currentFrame]; + if (rt.width == 0 || rt.height == 0 || !*rt.colorView || !*rt.depthView) + return; + + // Transition reflection color to COLOR_ATTACHMENT_OPTIMAL (Sync2) + vk::ImageMemoryBarrier2 toColor2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, + .srcAccessMask = {}, + .dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead, + .oldLayout = 
vk::ImageLayout::eShaderReadOnlyOptimal, + .newLayout = vk::ImageLayout::eColorAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *rt.color, + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + // Transition reflection depth to DEPTH_STENCIL_ATTACHMENT_OPTIMAL (Sync2) + vk::ImageMemoryBarrier2 toDepth2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, + .srcAccessMask = {}, + .dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests, + .dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite | vk::AccessFlagBits2::eDepthStencilAttachmentRead, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eDepthAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *rt.depth, + .subresourceRange = {vk::ImageAspectFlagBits::eDepth, 0, 1, 0, 1} + }; + std::array preBarriers{toColor2, toDepth2}; + vk::DependencyInfo depInfoToColor{.imageMemoryBarrierCount = static_cast(preBarriers.size()), .pImageMemoryBarriers = preBarriers.data()}; + cmd.pipelineBarrier2(depInfoToColor); + + vk::RenderingAttachmentInfo colorAtt{ + .imageView = *rt.colorView, + .imageLayout = vk::ImageLayout::eColorAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eClear, + .storeOp = vk::AttachmentStoreOp::eStore, + // Clear to black so scene content dominates reflections + .clearValue = vk::ClearValue{vk::ClearColorValue{std::array < float, 4 >{0.0f, 0.0f, 0.0f, 1.0f}}} + }; + vk::RenderingAttachmentInfo depthAtt{ + .imageView = *rt.depthView, + .imageLayout = vk::ImageLayout::eDepthStencilAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eClear, + .storeOp = vk::AttachmentStoreOp::eDontCare, + .clearValue = vk::ClearValue{vk::ClearDepthStencilValue{1.0f, 0}} + }; + vk::RenderingInfo rinfo{ + .renderArea = vk::Rect2D({0, 0}, {rt.width, rt.height}), + .layerCount = 1, + 
.colorAttachmentCount = 1, + .pColorAttachments = &colorAtt, + .pDepthAttachment = &depthAtt + }; + cmd.beginRendering(rinfo); + // Compute mirrored view matrix about planeWS (default Y=0 plane) + glm::mat4 reflectM(1.0f); + // For Y=0 plane, reflection is simply flip Y + if (glm::length(glm::vec3(planeWS.x, planeWS.y, planeWS.z)) > 0.5f && fabsf(planeWS.y - 1.0f) < 1e-3f && fabsf(planeWS.x) < 1e-3f && fabsf(planeWS.z) < 1e-3f) { + reflectM[1][1] = -1.0f; + } else { + // General plane reflection matrix R = I - 2*n*n^T for normalized plane; ignore translation for now + glm::vec3 n = glm::normalize(glm::vec3(planeWS)); + glm::mat3 R = glm::mat3(1.0f) - 2.0f * glm::outerProduct(n, n); + reflectM = glm::mat4(R); + } + + glm::mat4 viewReflected = camera ? (camera->GetViewMatrix() * reflectM) : reflectM; + glm::mat4 projReflected = camera ? camera->GetProjectionMatrix() : glm::mat4(1.0f); + currentReflectionVP = projReflected * viewReflected; + currentReflectionPlane = planeWS; + if (currentFrame < reflectionVPs.size()) { + reflectionVPs[currentFrame] = currentReflectionVP; + } + + // Set viewport/scissor to reflection RT size + vk::Viewport rv(0.0f, 0.0f, static_cast(rt.width), static_cast(rt.height), 0.0f, 1.0f); + cmd.setViewport(0, rv); + vk::Rect2D rs({0, 0}, {rt.width, rt.height}); + cmd.setScissor(0, rs); + + // Draw opaque entities with mirrored view + // Use reflection-specific pipeline (cull none) to avoid mirrored winding issues. 
+ if (!!*pbrReflectionGraphicsPipeline) { + cmd.bindPipeline(vk::PipelineBindPoint::eGraphics, *pbrReflectionGraphicsPipeline); + } else if (!!*pbrGraphicsPipeline) { + cmd.bindPipeline(vk::PipelineBindPoint::eGraphics, *pbrGraphicsPipeline); + } + + // Prepare frustum for mirrored view to allow culling + FrustumPlanes reflectFrustum = extractFrustumPlanes(currentReflectionVP); + + // Render all jobs (skip transparency) + for (const auto& job : jobs) { + Entity* entity = job.entity; + MeshComponent* meshComponent = job.meshComp; + EntityResources* entityRes = job.entityRes; + MeshResources* meshRes = job.meshRes; + + if (entityRes->cachedIsBlended) + continue; + + // Frustum culling for mirrored view + if (meshComponent->HasLocalAABB()) { + const glm::mat4 model = job.transformComp ? job.transformComp->GetModelMatrix() : glm::mat4(1.0f); + glm::vec3 wmin, wmax; + transformAABB(model, meshComponent->GetLocalAABBMin(), meshComponent->GetLocalAABBMax(), wmin, wmax); + if (!aabbIntersectsFrustum(wmin, wmax, reflectFrustum)) { + continue; // culled from reflection + } + } + + // Bind geometry + std::array buffers = {*meshRes->vertexBuffer, *entityRes->instanceBuffer}; + std::array offsets = {0, 0}; + cmd.bindVertexBuffers(0, buffers, offsets); + cmd.bindIndexBuffer(*meshRes->indexBuffer, 0, vk::IndexType::eUint32); + + // Populate UBO with mirrored view + clip plane and reflection flags + UniformBufferObject ubo{}; + if (job.transformComp) + ubo.model = job.transformComp->GetModelMatrix(); + else + ubo.model = glm::mat4(1.0f); + ubo.view = viewReflected; + ubo.proj = projReflected; + ubo.camPos = glm::vec4(camera ? camera->GetPosition() : glm::vec3(0), 1.0f); + ubo.reflectionPass = 1; + ubo.reflectionEnabled = 0; + ubo.reflectionVP = currentReflectionVP; + ubo.clipPlaneWS = planeWS; + // Ray query shadows in reflection pass + ubo.padding2 = enableRasterRayQueryShadows ? 
1.0f : 0.0f; + + updateUniformBufferInternal(currentFrame, entity, entityRes, camera, ubo); + + // Bind descriptor set (PBR set 0) + cmd.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, + *pbrPipelineLayout, + 0, + *entityRes->pbrDescriptorSets[currentFrame], + nullptr); + + // Push material properties + MaterialProperties mp = entityRes->cachedMaterialProps; + // Transmission suppressed during reflection pass via UBO (reflectionPass=1) + mp.transmissionFactor = 0.0f; + pushMaterialProperties(*cmd, mp); + + // Issue draw + uint32_t instanceCount = std::max(1u, static_cast(meshComponent->GetInstanceCount())); + cmd.drawIndexed(meshRes->indexCount, instanceCount, 0, 0, 0); + } + + cmd.endRendering(); + + // Transition reflection color to SHADER_READ_ONLY for sampling in main pass (Sync2) + vk::ImageMemoryBarrier2 toSample2{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eColorAttachmentOptimal, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *rt.color, + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depInfoToSample{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &toSample2}; + cmd.pipelineBarrier2(depInfoToSample); +} + +// Create image views +bool Renderer::createImageViews() { + try { + if (xrMode) { + eyeSwapchainImageViews[0].clear(); + eyeSwapchainImageViews[1].clear(); + for (size_t i = 0; i < eyeSwapchainImages[0].size(); ++i) { + vk::Image img = eyeSwapchainImages[0][i]; + eyeSwapchainImageViews[0].emplace_back(createImageView(img, swapChainImageFormat, vk::ImageAspectFlagBits::eColor, 1, 2)); // 2 layers for multiview + } + return true; + } + + 
opaqueSceneColorImages.clear(); + opaqueSceneColorImageAllocations.clear(); + opaqueSceneColorImageViews.clear(); + opaqueSceneColorImageLayouts.clear(); + opaqueSceneColorSampler.clear(); + // Resize image views vector + swapChainImageViews.clear(); + swapChainImageViews.reserve(swapChainImages.size()); + + // Create image view info template (image will be set per iteration) + vk::ImageViewCreateInfo createInfo{ + .viewType = vk::ImageViewType::e2D, + .format = swapChainImageFormat, + .components = { + .r = vk::ComponentSwizzle::eIdentity, + .g = vk::ComponentSwizzle::eIdentity, + .b = vk::ComponentSwizzle::eIdentity, + .a = vk::ComponentSwizzle::eIdentity + }, + .subresourceRange = {.aspectMask = vk::ImageAspectFlagBits::eColor, .baseMipLevel = 0, .levelCount = 1, .baseArrayLayer = 0, .layerCount = 1} + }; + + // Create image view for each swap chain image + for (const auto& image : swapChainImages) { + createInfo.image = image; + swapChainImageViews.emplace_back(device, createInfo); + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create image views: " << e.what() << std::endl; + return false; + } +} + +// Setup dynamic rendering +bool Renderer::setupDynamicRendering() { + try { + // Create color attachment + colorAttachments = { + vk::RenderingAttachmentInfo{ + .imageLayout = vk::ImageLayout::eColorAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eClear, + .storeOp = vk::AttachmentStoreOp::eStore, + .clearValue = vk::ClearColorValue(std::array < float, 4 >{0.0f, 0.0f, 0.0f, 1.0f}) + + } + }; + + // Create depth attachment + depthAttachment = vk::RenderingAttachmentInfo{ + .imageLayout = vk::ImageLayout::eDepthStencilAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eClear, + .storeOp = vk::AttachmentStoreOp::eStore, + .clearValue = vk::ClearDepthStencilValue(1.0f, 0) + }; + + // Create rendering info + renderingInfo = vk::RenderingInfo{ + .renderArea = vk::Rect2D(vk::Offset2D(0, 0), swapChainExtent), + .layerCount = 1, 
+ .viewMask = xrMode ? 0x3u : 0x0u, // 0x3 enables views 0 and 1 for multiview + .colorAttachmentCount = static_cast(colorAttachments.size()), + .pColorAttachments = colorAttachments.data(), + .pDepthAttachment = &depthAttachment + }; + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to setup dynamic rendering: " << e.what() << std::endl; + return false; + } +} + +// Create command pool +bool Renderer::createCommandPool() { + try { + // Find queue families + QueueFamilyIndices queueFamilyIndicesLoc = findQueueFamilies(physicalDevice); + + // Create command pool info + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndicesLoc.graphicsFamily.value() + }; + + // Create command pool + commandPool = vk::raii::CommandPool(device, poolInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create command pool: " << e.what() << std::endl; + return false; + } +} + +// Create command buffers +bool Renderer::createCommandBuffers() { + try { + // Resize command buffers vector + commandBuffers.clear(); + commandBuffers.reserve(MAX_FRAMES_IN_FLIGHT); + + // Create command buffer allocation info + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = *commandPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = static_cast(MAX_FRAMES_IN_FLIGHT) + }; + + // Allocate command buffers + commandBuffers = vk::raii::CommandBuffers(device, allocInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create command buffers: " << e.what() << std::endl; + return false; + } +} + +// Create sync objects +bool Renderer::createSyncObjects() { + try { + // Resize semaphores and fences vectors + imageAvailableSemaphores.clear(); + renderFinishedSemaphores.clear(); + inFlightFences.clear(); + + // Semaphores per swapchain image (indexed by imageIndex from acquireNextImage) + // The presentation 
engine holds semaphores until the image is re-acquired, so we need + // one semaphore per swapchain image to avoid reuse conflicts. See Vulkan spec: + // https://docs.vulkan.org/guide/latest/swapchain_semaphore_reuse.html + const auto semaphoreCount = static_cast(swapChainImages.size()); + imageAvailableSemaphores.reserve(semaphoreCount); + renderFinishedSemaphores.reserve(semaphoreCount); + + // Fences per frame-in-flight for CPU-GPU synchronization (indexed by currentFrame) + inFlightFences.reserve(MAX_FRAMES_IN_FLIGHT); + + // Create semaphore info + vk::SemaphoreCreateInfo semaphoreInfo{}; + + // Create semaphores per swapchain image (indexed by imageIndex for presentation sync) + for (uint32_t i = 0; i < semaphoreCount; i++) { + imageAvailableSemaphores.emplace_back(device, semaphoreInfo); + renderFinishedSemaphores.emplace_back(device, semaphoreInfo); + } + + // Create fences per frame-in-flight (indexed by currentFrame for CPU-GPU pacing) + vk::FenceCreateInfo fenceInfo{ + .flags = vk::FenceCreateFlagBits::eSignaled + }; + for (uint32_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { + inFlightFences.emplace_back(device, fenceInfo); + } + + // Ensure uploads timeline semaphore exists (created early in createLogicalDevice) + // No action needed here unless reinitializing after swapchain recreation. + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create sync objects: " << e.what() << std::endl; + return false; + } +} + +// Clean up swap chain +void Renderer::cleanupSwapChain() { + // Clean up depth resources + depthImageView = vk::raii::ImageView(nullptr); + depthImage = vk::raii::Image(nullptr); + depthImageAllocation = nullptr; + + // Clean up swap chain image views + swapChainImageViews.clear(); + + // Note: Keep descriptor pool alive here to ensure descriptor sets remain valid during swapchain recreation. + // descriptorPool is preserved; it will be managed during full renderer teardown. 
+ + // Destroy reflection render targets if present + destroyReflectionResources(); + + // Clean up pipelines + graphicsPipeline = vk::raii::Pipeline(nullptr); + pbrGraphicsPipeline = vk::raii::Pipeline(nullptr); + lightingPipeline = vk::raii::Pipeline(nullptr); + + // Clean up pipeline layouts + pipelineLayout = vk::raii::PipelineLayout(nullptr); + pbrPipelineLayout = vk::raii::PipelineLayout(nullptr); + lightingPipelineLayout = vk::raii::PipelineLayout(nullptr); + + // Clean up sync objects (they need to be recreated with new swap chain image count) + imageAvailableSemaphores.clear(); + renderFinishedSemaphores.clear(); + inFlightFences.clear(); + + // Clean up swap chain + swapChain = vk::raii::SwapchainKHR(nullptr); +} + +// Recreate swap chain +void Renderer::recreateSwapChain() { + // Prevent background uploads worker from mutating descriptors while we rebuild + StopUploadsWorker(); + + // Block descriptor writes while we rebuild swapchain and descriptor pools + descriptorSetsValid.store(false, std::memory_order_relaxed); { + // Drop any deferred descriptor updates that target old descriptor sets + std::lock_guard lk(pendingDescMutex); + pendingDescOps.clear(); + descriptorRefreshPending.store(false, std::memory_order_relaxed); + } + + // Wait for all frames in flight to complete before recreating the swap chain + std::vector allFences; + allFences.reserve(inFlightFences.size()); + for (const auto& fence : inFlightFences) { + allFences.push_back(*fence); + } + if (!allFences.empty()) { + vk::Result result = waitForFencesSafe(allFences, VK_TRUE); + if (result != vk::Result::eSuccess) { + std::cerr << "Error: Failed to wait for in-flight fences during swap chain recreation: " << vk::to_string(result) << std::endl; + } + } + + // Wait for the device to be idle before recreating the swap chain + // External synchronization required (VVL): serialize against queue submits/present. 
+ WaitIdle(); + + // Clean up old swap chain resources + cleanupSwapChain(); + + // Recreate swap chain and related resources + createSwapChain(); + createImageViews(); + setupDynamicRendering(); + createDepthResources(); + + // (Re)create reflection resources if enabled + if (enablePlanarReflections) { + uint32_t rw = std::max(1u, static_cast(static_cast(swapChainExtent.width) * reflectionResolutionScale)); + uint32_t rh = std::max(1u, static_cast(static_cast(swapChainExtent.height) * reflectionResolutionScale)); + createReflectionResources(rw, rh); + } + + // Recreate sync objects with correct sizing for new swap chain + createSyncObjects(); + + // Recreate off-screen opaque scene color and descriptor sets needed by transparent pass + createOpaqueSceneColorResources(); + createTransparentDescriptorSets(); + createTransparentFallbackDescriptorSets(); + + // Wait for all command buffers to complete before clearing resources + for (const auto& fence : inFlightFences) { + vk::Result result = waitForFencesSafe(*fence, VK_TRUE); + if (result != vk::Result::eSuccess) { + std::cerr << "Error: Failed to wait for fence before clearing resources: " << vk::to_string(result) << std::endl; + } + } + + // Clear all entity descriptor sets since they're now invalid (allocated from the old pool) + { + // Serialize descriptor frees against any other descriptor operations + std::lock_guard lk(descriptorMutex); + for (auto& kv : entityResources) { + auto& resources = kv.second; + resources.basicDescriptorSets.clear(); + resources.pbrDescriptorSets.clear(); + // Descriptor initialization flags must be reset because new descriptor sets + // will be allocated and only the current frame will be initialized at runtime. 
+ resources.pbrUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.basicUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.pbrImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.basicImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + } + + // Clear ray query descriptor sets - they reference the old output image which will be destroyed + // Must clear before recreating to avoid descriptor set corruption + rayQueryDescriptorSets.clear(); + rayQueryDescriptorsWritten.clear(); + rayQueryDescriptorsDirtyMask.store(0u, std::memory_order_relaxed); + + // Destroy ray query output image resources - they're sized to old swapchain dimensions + rayQueryOutputImageView = vk::raii::ImageView(nullptr); + rayQueryOutputImage = vk::raii::Image(nullptr); + rayQueryOutputImageAllocation = nullptr; + + createGraphicsPipeline(); + createPBRPipeline(); + createLightingPipeline(); + createCompositePipeline(); + + // Recreate Forward+ specific pipelines/resources and resize tile buffers for new extent + if (useForwardPlus) { + createDepthPrepassPipeline(); + uint32_t tilesX = (swapChainExtent.width + forwardPlusTileSizeX - 1) / forwardPlusTileSizeX; + uint32_t tilesY = (swapChainExtent.height + forwardPlusTileSizeY - 1) / forwardPlusTileSizeY; + createOrResizeForwardPlusBuffers(tilesX, tilesY, forwardPlusSlicesZ); + } + + // Re-create command buffers to ensure fresh recording against new swapchain state + commandBuffers.clear(); + createCommandBuffers(); + currentFrame = 0; + + // Recreate ray query resources with new swapchain dimensions + // This must happen after descriptor pool is valid but before marking descriptor sets valid + if (rayQueryEnabled && accelerationStructureEnabled) { + if (!createRayQueryResources()) { + std::cerr << "Warning: Failed to recreate ray query resources after swapchain recreation\n"; + } + } + + // Recreate descriptor sets for all 
entities after swapchain/pipeline rebuild + for (const auto& kv : entityResources) { + const auto& entity = kv.first; + if (!entity) + continue; + auto meshComponent = entity->GetComponent(); + if (!meshComponent) + continue; + + std::string texturePath = meshComponent->GetTexturePath(); + // Fallback for basic pipeline: use baseColor when legacy path is empty + if (texturePath.empty()) { + const std::string& baseColor = meshComponent->GetBaseColorTexturePath(); + if (!baseColor.empty()) { + texturePath = baseColor; + } + } + // Recreate basic descriptor sets (ignore failures here to avoid breaking resize) + createDescriptorSets(entity, texturePath, false); + // Recreate PBR descriptor sets + createDescriptorSets(entity, texturePath, true); + } + + // Descriptor sets are now valid again + descriptorSetsValid.store(true, std::memory_order_relaxed); + + // Resume background uploads worker now that swapchain and descriptors are recreated + StartUploadsWorker(); +} + +void Renderer::prepareFrameUboTemplate(CameraComponent* camera) { + frameUboTemplate = UniformBufferObject{}; + if (!camera) return; + + frameUboTemplate.view = camera->GetViewMatrix(); + frameUboTemplate.proj = camera->GetProjectionMatrix(); + frameUboTemplate.proj[1][1] *= -1; // Flip Y for Vulkan + frameUboTemplate.camPos = glm::vec4(camera->GetPosition(), 1.0f); + + frameUboTemplate.lightCount = static_cast(lastFrameLightCount); + frameUboTemplate.exposure = std::clamp(this->exposure, 0.2f, 4.0f); + frameUboTemplate.gamma = this->gamma; + frameUboTemplate.screenDimensions = glm::vec2(swapChainExtent.width, swapChainExtent.height); + frameUboTemplate.nearZ = camera->GetNearPlane(); + frameUboTemplate.farZ = camera->GetFarPlane(); + frameUboTemplate.slicesZ = static_cast(forwardPlusSlicesZ); + + int outputIsSRGB = (swapChainImageFormat == vk::Format::eR8G8B8A8Srgb || + swapChainImageFormat == vk::Format::eB8G8R8A8Srgb) ? 
1 : 0; + frameUboTemplate.padding0 = outputIsSRGB; + // Raster PBR shader uses padding1 as the Forward+ enable flag. + // 0 = disabled (always use global light loop), non-zero = enabled (use culled tile lists). + frameUboTemplate.padding1 = useForwardPlus ? 1.0f : 0.0f; + frameUboTemplate.padding2 = enableRasterRayQueryShadows ? 1.0f : 0.0f; + + bool reflReady = false; + if (enablePlanarReflections && !reflections.empty()) { + const uint32_t count = static_cast(reflections.size()); + const uint32_t prev = (currentFrame + count - 1u) % count; + auto& rtPrev = reflections[prev]; + reflReady = (!!*rtPrev.colorView) && (!!*rtPrev.colorSampler); + } + frameUboTemplate.reflectionEnabled = reflReady ? 1 : 0; + frameUboTemplate.reflectionVP = sampleReflectionVP; + frameUboTemplate.clipPlaneWS = currentReflectionPlane; + frameUboTemplate.reflectionIntensity = std::clamp(reflectionIntensity, 0.0f, 2.0f); + frameUboTemplate.enableRayQueryReflections = enableRayQueryReflections ? 1 : 0; + frameUboTemplate.enableRayQueryTransparency = enableRayQueryTransparency ? 1 : 0; + + // Ray-query shared buffers are also used by raster PBR when doing ray-query shadows. + // Populate counts so shaders can bounds-check even when running in raster mode. + frameUboTemplate.geometryInfoCount = static_cast(geometryInfoCountCPU); + frameUboTemplate.materialCount = static_cast(materialCountCPU); +} + +// Update uniform buffer +void Renderer::updateUniformBuffer(uint32_t currentImage, Entity* entity, EntityResources* entityRes, CameraComponent* camera, TransformComponent* tc) { + if (!entityRes) { + return; + } + + // Get transform component + auto transformComponent = tc ? tc : (entity ? 
entity->GetComponent() : nullptr); + if (!transformComponent) { + return; + } + + // Create uniform buffer object + UniformBufferObject ubo{}; + ubo.model = transformComponent->GetModelMatrix(); + ubo.view = camera->GetViewMatrix(); + ubo.proj = camera->GetProjectionMatrix(); + ubo.proj[1][1] *= -1; // Flip Y for Vulkan + + // Continue with the rest of the uniform buffer setup + updateUniformBufferInternal(currentImage, entity, entityRes, camera, ubo); +} + +// Overloaded version that accepts a custom transform matrix +void Renderer::updateUniformBuffer(uint32_t currentImage, Entity* entity, EntityResources* entityRes, CameraComponent* camera, const glm::mat4& customTransform) { + if (!entityRes) return; + // Create the uniform buffer object with custom transform + UniformBufferObject ubo{}; + ubo.model = customTransform; + ubo.view = camera->GetViewMatrix(); + ubo.proj = camera->GetProjectionMatrix(); + ubo.proj[1][1] *= -1; // Flip Y for Vulkan + + // Continue with the rest of the uniform buffer setup + updateUniformBufferInternal(currentImage, entity, entityRes, camera, ubo); +} + +// Internal helper function to complete uniform buffer setup +void Renderer::updateUniformBufferInternal(uint32_t currentImage, Entity* entity, EntityResources* entityRes, CameraComponent* camera, UniformBufferObject& ubo) { + if (!entityRes) { + return; + } + + // Use frame template for most fields + UniformBufferObject finalUbo = frameUboTemplate; + finalUbo.model = ubo.model; + + // For reflection pass, we must override view/proj/reflection flags + if (ubo.reflectionPass == 1) { + finalUbo.views[0] = ubo.views[0]; + finalUbo.projs[0] = ubo.projs[0]; + finalUbo.viewProjections[0] = ubo.projs[0] * ubo.views[0]; + finalUbo.reflectionPass = 1; + finalUbo.reflectionEnabled = 0; + finalUbo.reflectionVP = ubo.reflectionVP; + finalUbo.clipPlaneWS = ubo.clipPlaneWS; + finalUbo.padding2 = ubo.padding2; + } + + // Copy to uniform buffer (guard against null mapped pointer) + void* dst = 
entityRes->uniformBuffersMapped[currentImage]; + if (!dst) { + std::cerr << "Warning: UBO mapped ptr null for entity '" << (entity ? entity->GetName() : "unknown") << "' frame " << currentImage << std::endl; + return; + } + std::memcpy(dst, &finalUbo, sizeof(UniformBufferObject)); +} + +void Renderer::drawRenderJob(const vk::raii::CommandBuffer& cmd, const RenderJob& job, uint32_t currentFrame, uint32_t eye, bool transparent) { + EntityResources* entityRes = job.entityRes; + MeshResources* meshRes = job.meshRes; + MeshComponent* meshComponent = job.meshComp; + + // 1. Determine descriptor set index + // In multiview, we only need one set per frame as both views share the same UBO + uint32_t setIndex = currentFrame; + + // 2. Update UBO using the current template (which has correct multiview matrices if in XR) + updateUniformBufferInternal(setIndex, job.entity, entityRes, nullptr, frameUboTemplate); + + // 3. Bind geometry + std::array buffers = {*meshRes->vertexBuffer, *entityRes->instanceBuffer}; + std::array offsets = {0, 0}; + cmd.bindVertexBuffers(0, buffers, offsets); + cmd.bindIndexBuffer(*meshRes->indexBuffer, 0, vk::IndexType::eUint32); + + // 4. Bind descriptor set + cmd.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, + *pbrPipelineLayout, + 0, + *entityRes->pbrDescriptorSets[setIndex], + nullptr); + + // 5. Push material properties + pushMaterialProperties(*cmd, entityRes->cachedMaterialProps); + + // 6. 
Issue draw + uint32_t instanceCount = std::max(1u, static_cast(meshComponent->GetInstanceCount())); + cmd.drawIndexed(meshRes->indexCount, instanceCount, 0, 0, 0); +} + +void Renderer::ensureEntityMaterialCache(Entity* entity, EntityResources& res) { + if (!entity) + return; + + if (res.materialCacheValid) + return; + + res.materialCacheValid = true; + res.cachedMaterial = nullptr; + res.cachedIsBlended = false; + res.cachedIsGlass = false; + res.cachedIsLiquid = false; + + // Defaults represent the common case (no explicit material); textures come from descriptor bindings. + MaterialProperties mp{}; + // Sensible defaults for entities without explicit material + mp.baseColorFactor = glm::vec4(1.0f); + mp.metallicFactor = 0.0f; + mp.roughnessFactor = 1.0f; + mp.baseColorTextureSet = 0; + mp.physicalDescriptorTextureSet = 0; + mp.normalTextureSet = -1; + mp.occlusionTextureSet = -1; + mp.emissiveTextureSet = -1; + mp.alphaMask = 0.0f; + mp.alphaMaskCutoff = 0.5f; + mp.emissiveFactor = glm::vec3(0.0f); + mp.emissiveStrength = 1.0f; + mp.transmissionFactor = 0.0f; + mp.useSpecGlossWorkflow = 0; + mp.glossinessFactor = 0.0f; + mp.specularFactor = glm::vec3(1.0f); + mp.ior = 1.5f; + mp.hasEmissiveStrengthExtension = 0; + + if (modelLoader) { + const std::string& entityName = entity->GetName(); + const size_t tagPos = entityName.find("_Material_"); + if (tagPos != std::string::npos) { + const size_t afterTag = tagPos + std::string("_Material_").size(); + if (afterTag < entityName.length()) { + // Entity name format: "modelName_Material__" + const std::string remainder = entityName.substr(afterTag); + const size_t nextUnderscore = remainder.find('_'); + if (nextUnderscore != std::string::npos && nextUnderscore + 1 < remainder.length()) { + const std::string materialName = remainder.substr(nextUnderscore + 1); + if (const Material* material = modelLoader->GetMaterial(materialName)) { + res.cachedMaterial = material; + res.cachedIsGlass = material->isGlass; + 
res.cachedIsLiquid = material->isLiquid; + + // Base factors + mp.baseColorFactor = glm::vec4(material->albedo, material->alpha); + mp.metallicFactor = material->metallic; + mp.roughnessFactor = material->roughness; + + // Texture set flags (-1 = no texture) + mp.baseColorTextureSet = material->albedoTexturePath.empty() ? -1 : 0; + // physical descriptor: MR or SpecGloss + if (material->useSpecularGlossiness) { + mp.useSpecGlossWorkflow = 1; + mp.physicalDescriptorTextureSet = material->specGlossTexturePath.empty() ? -1 : 0; + mp.glossinessFactor = material->glossinessFactor; + mp.specularFactor = material->specularFactor; + } else { + mp.useSpecGlossWorkflow = 0; + mp.physicalDescriptorTextureSet = material->metallicRoughnessTexturePath.empty() ? -1 : 0; + } + mp.normalTextureSet = material->normalTexturePath.empty() ? -1 : 0; + mp.occlusionTextureSet = material->occlusionTexturePath.empty() ? -1 : 0; + mp.emissiveTextureSet = material->emissiveTexturePath.empty() ? -1 : 0; + + // Emissive and transmission/IOR + mp.emissiveFactor = material->emissive; + mp.emissiveStrength = material->emissiveStrength; + // Heuristic: consider emissive strength extension present when strength != 1.0 + mp.hasEmissiveStrengthExtension = (std::abs(material->emissiveStrength - 1.0f) > 1e-6f) ? 1 : 0; + mp.transmissionFactor = material->transmissionFactor; + mp.ior = material->ior; + + // Alpha mask handling + mp.alphaMask = (material->alphaMode == "MASK") ? 
1.0f : 0.0f; + mp.alphaMaskCutoff = material->alphaCutoff; + + // Blended classification (opaque materials stay in the opaque pass) + const bool alphaBlend = (material->alphaMode == "BLEND"); + const bool highTransmission = (material->transmissionFactor > 0.2f); + res.cachedIsBlended = alphaBlend || highTransmission || res.cachedIsGlass || res.cachedIsLiquid; + } + } + } + } + } + + res.cachedMaterialProps = mp; +} + +// Render the scene (unique_ptr container overload) +// Convert to a raw-pointer snapshot so callers can safely release their container locks. +void Renderer::Render(const std::vector>& entities, CameraComponent* camera, ImGuiSystem* imguiSystem) { + std::vector snapshot; + snapshot.reserve(entities.size()); + for (const auto& uptr : entities) { + snapshot.push_back(uptr.get()); + } + Render(snapshot, camera, imguiSystem); +} + +// Render the scene (raw pointer snapshot overload) +void Renderer::Render(const std::vector& entities, CameraComponent* camera, ImGuiSystem* imguiSystem, XrTime predictedTime) { + // Update watchdog timestamp to prove frame is progressing + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + watchdogProgressLabel.store("RenderXR: frame begin", std::memory_order_relaxed); + + if (memoryPool) + memoryPool->setRenderingActive(true); + + // Prepare frame-constant UBO data once for both eyes + // We'll update the eye-specific view/proj matrices inside the eye loop + prepareFrameUboTemplate(camera); + + // Wait for the previous frame's work on this frame slot to complete + vk::Result fenceResult = waitForFencesSafe(*inFlightFences[currentFrame], VK_TRUE); + if (fenceResult != vk::Result::eSuccess) { + std::cerr << "Error: Failed to wait for in-flight fence (XR): " << vk::to_string(fenceResult) << std::endl; + } + device.resetFences(*inFlightFences[currentFrame]); + + // Execute any pending GPU uploads/preallocations + ProcessPendingMeshUploads(); + ProcessPendingEntityPreallocations(); + + // 
Preparation pass: Culling and classification + // In a real XR engine, we might do this per eye, but for a simple engine, + // we can use a combined frustum or just render everything for both eyes. + std::vector opaqueJobs; + std::vector transparentJobs; + // ... Simplified: just use all active entities for both eyes for now ... + for (Entity* entity : entities) { + if (!entity || !entity->IsActive()) continue; + auto meshComponent = entity->GetComponent(); + if (!meshComponent) continue; + + auto entityIt = entityResources.find(entity); + if (entityIt == entityResources.end()) continue; + + EntityResources& entityRes = entityIt->second; + ensureEntityMaterialCache(entity, entityRes); + + RenderJob job{ + .entity = entity, + .entityRes = &entityRes, + .meshRes = &meshResources[meshComponent], + .meshComp = meshComponent, + .transformComp = entity->GetComponent(), + .isAlphaMasked = entityRes.cachedIsBlended + }; + if (entityRes.cachedIsBlended) { + transparentJobs.push_back(job); + } else { + opaqueJobs.push_back(job); + } + } + + // --- Start Command Buffer --- + vk::raii::CommandBuffer& cmd = commandBuffers[currentFrame]; + cmd.reset(); + cmd.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}); + + // 1. Acquire and Wait for OpenXR swapchain image + uint32_t imageIndex = xrContext.acquireSwapchainImage(); + xrContext.waitSwapchainImage(); + + vk::Image swapchainImage = eyeSwapchainImages[0][imageIndex]; + vk::ImageView swapchainView = *eyeSwapchainImageViews[0][imageIndex]; + + // 2. Transition image to COLOR_ATTACHMENT_OPTIMAL + transitionImageLayout(*cmd, swapchainImage, swapChainImageFormat, vk::ImageLayout::eUndefined, vk::ImageLayout::eColorAttachmentOptimal, + 1, 2); // 2 layers + + // 3. Prepare Multiview UBO Template + for (uint32_t eye = 0; eye < 2; ++eye) { + glm::mat4 view = camera ? camera->GetViewMatrix(eye) : glm::mat4(1.0f); + glm::mat4 proj = camera ? 
camera->GetProjectionMatrix(eye) : glm::mat4(1.0f); + proj[1][1] *= -1.0f; // Vulkan Y-flip + frameUboTemplate.views[eye] = view; + frameUboTemplate.projs[eye] = proj; + frameUboTemplate.viewProjections[eye] = proj * view; + frameUboTemplate.camPoses[eye] = glm::vec4(xrContext.getEyePosition(eye), 1.0f); + } + + // 4. Begin Rendering (Single-Pass Multiview) + vk::RenderingAttachmentInfo colorAtt{ + .imageView = swapchainView, + .imageLayout = vk::ImageLayout::eColorAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eClear, + .storeOp = vk::AttachmentStoreOp::eStore, + .clearValue = vk::ClearValue{vk::ClearColorValue{std::array{0.1f, 0.1f, 0.1f, 1.0f}}} + }; + + // Note: depth buffer must be 2 layers for multiview + vk::RenderingAttachmentInfo depthAtt{ + .imageView = *depthImageView, + .imageLayout = vk::ImageLayout::eDepthStencilAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eClear, + .storeOp = vk::AttachmentStoreOp::eDontCare, + .clearValue = vk::ClearValue{vk::ClearDepthStencilValue{1.0f, 0}} + }; + + vk::RenderingInfo rinfo{ + .renderArea = vk::Rect2D({0, 0}, xrContext.getSwapchainExtent()), + .layerCount = 1, + .viewMask = 0x3u, // Enable view 0 and 1 + .colorAttachmentCount = 1, + .pColorAttachments = &colorAtt, + .pDepthAttachment = &depthAtt + }; + + cmd.beginRendering(rinfo); + + // Set Viewport and Scissor (shared for both eyes in multiview) + cmd.setViewport(0, xrContext.getViewport(0)); + cmd.setScissor(0, xrContext.getScissor(0)); + + // Draw opaque objects + for (const auto& job : opaqueJobs) { + drawRenderJob(cmd, job, currentFrame, 0, false); + } + + // Draw transparent objects + for (const auto& job : transparentJobs) { + drawRenderJob(cmd, job, currentFrame, 0, true); + } + + cmd.endRendering(); + + // 5. 
Transition back (OpenXR often expects this or TransferSrcOptimal for blitting) + transitionImageLayout(*cmd, swapchainImage, swapChainImageFormat, vk::ImageLayout::eColorAttachmentOptimal, vk::ImageLayout::eTransferSrcOptimal, + 1, 2); + + // 6. Release OpenXR swapchain image + xrContext.releaseSwapchainImage(); + + cmd.end(); + + // --- Submit to Graphics Queue --- + vk::PipelineStageFlags waitStages[] = {vk::PipelineStageFlagBits::eColorAttachmentOutput}; + vk::SubmitInfo submitInfo{ + .waitSemaphoreCount = 0, + .pWaitSemaphores = nullptr, + .pWaitDstStageMask = waitStages, + .commandBufferCount = 1, + .pCommandBuffers = &*cmd, + .signalSemaphoreCount = 0, + .pSignalSemaphores = nullptr + }; + + { + std::lock_guard lock(queueMutex); + graphicsQueue.submit(submitInfo, *inFlightFences[currentFrame]); + } + + currentFrame = (currentFrame + 1) % MAX_FRAMES_IN_FLIGHT; +} + +void Renderer::Render(const std::vector& entities, CameraComponent* camera, ImGuiSystem* imguiSystem) { + // Update watchdog timestamp to prove frame is progressing + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + watchdogProgressLabel.store("Render: frame begin", std::memory_order_relaxed); + + if (memoryPool) + memoryPool->setRenderingActive(true); + struct RenderingStateGuard { + MemoryPool* pool; + explicit RenderingStateGuard(MemoryPool* p) : pool(p) { + } + ~RenderingStateGuard() { + if (pool) + pool->setRenderingActive(false); + } + } guard(memoryPool.get()); + + // Track if ray query rendered successfully this frame to skip rasterization code path + bool rayQueryRenderedThisFrame = false; + + // --- Extract lights for the frame --- + // Build a single light list once per frame (emissive lights only for this scene) + std::vector lightsSubset; + if (!staticLights.empty()) { + lightsSubset.reserve(std::min(staticLights.size(), static_cast(MAX_ACTIVE_LIGHTS))); + for (const auto& L : staticLights) { + // Include all lights (Directional, Point, 
Emissive) up to the limit + lightsSubset.push_back(L); + if (lightsSubset.size() >= MAX_ACTIVE_LIGHTS) + break; + } + } + lastFrameLightCount = static_cast(lightsSubset.size()); + if (!lightsSubset.empty()) { + updateLightStorageBuffer(currentFrame, lightsSubset, camera); + } + + // Pre-calculate frame-constant UBO data + prepareFrameUboTemplate(camera); + + // Wait for the previous frame's work on this frame slot to complete + // Use a finite timeout loop so we can keep the watchdog alive during long GPU work + // (e.g., acceleration structure builds/refits can legitimately take seconds on large scenes). + watchdogProgressLabel.store("Render: wait inFlightFence", std::memory_order_relaxed); + vk::Result fenceResult = waitForFencesSafe(*inFlightFences[currentFrame], VK_TRUE); + if (fenceResult != vk::Result::eSuccess) { + std::cerr << "Error: Failed to wait for in-flight fence: " << vk::to_string(fenceResult) << std::endl; + } + + // Reset the fence immediately after successful wait, before any new work + watchdogProgressLabel.store("Render: reset inFlightFence", std::memory_order_relaxed); + device.resetFences(*inFlightFences[currentFrame]); + + // Execute any pending GPU uploads (enqueued by worker/loading threads) on the render thread + // at this safe point to ensure all Vulkan submits happen on a single thread. + // This prevents validation/GPU-AV PostSubmit crashes due to cross-thread queue usage. + watchdogProgressLabel.store("Render: ProcessPendingMeshUploads", std::memory_order_relaxed); + ProcessPendingMeshUploads(); + // Execute any pending per-entity GPU resource preallocation requested by the scene loader. + // This prevents background threads from mutating `entityResources`/`meshResources` concurrently + // with rendering (which can corrupt unordered_map internals and crash). 
+ watchdogProgressLabel.store("Render: ProcessPendingEntityPreallocations", std::memory_order_relaxed); + ProcessPendingEntityPreallocations(); + watchdogProgressLabel.store("Render: after ProcessPendingEntityPreallocations", std::memory_order_relaxed); + + // Process deferred AS deletion queue at safe point (after fence wait) + // Increment frame counters and delete AS structures that are no longer in use + // Wait for MAX_FRAMES_IN_FLIGHT + 1 frames to ensure GPU has finished all work + // (The +1 ensures we've waited through a full cycle of all frame slots) + { + auto it = pendingASDeletions.begin(); + while (it != pendingASDeletions.end()) { + it->framesSinceDestroy++; + if (it->framesSinceDestroy > MAX_FRAMES_IN_FLIGHT) { + // Safe to delete - all frames have finished using these AS structures + it = pendingASDeletions.erase(it); + } else { + ++it; + } + } + } + watchdogProgressLabel.store("Render: after pendingASDeletions", std::memory_order_relaxed); + + // Opportunistically request AS rebuild when more meshes become ready than in the last built AS. + // This makes the TLAS grow as streaming/allocations complete, then settle (no rebuild spam). + // NOTE: This scan can be relatively heavy and is not needed for the default startup path. + // Only run it when opportunistic rebuilds are enabled. + // While loading, allow opportunistic AS rebuild scanning even if the user-facing toggle is off. + // This prevents nondeterministic “missing outdoor props” across app restarts when the first TLAS + // build happens before all entities exist. 
+ if (rayQueryEnabled && accelerationStructureEnabled && (asOpportunisticRebuildEnabled || IsLoading())) { + watchdogProgressLabel.store("Render: AS readiness scan", std::memory_order_relaxed); + size_t readyRenderableCount = 0; + size_t readyUniqueMeshCount = 0; { + auto lastKick = std::chrono::steady_clock::now(); + auto kickWatchdog = [&]() { + auto now = std::chrono::steady_clock::now(); + if (now - lastKick > std::chrono::milliseconds(200)) { + lastFrameUpdateTime.store(now, std::memory_order_relaxed); + lastKick = now; + } + }; + std::map meshToBLASProbe; + for (Entity* e : entities) { + kickWatchdog(); + if (!e || !e->IsActive()) + continue; + // In Ray Query static-only mode, ignore dynamic/animated entities for readiness + if (IsRayQueryStaticOnly()) { + const std::string& nm = e->GetName(); + if (nm.find("_AnimNode_") != std::string::npos) + continue; + if (!nm.empty() && nm.rfind("Ball_", 0) == 0) + continue; + } + auto meshComp = e->GetComponent(); + if (!meshComp) + continue; + try { + auto it = meshResources.find(meshComp); + if (it == meshResources.end()) + continue; + const auto& res = it->second; + // STRICT readiness: uploads must be finished (staging sizes zero) + if (res.vertexBufferSizeBytes != 0 || res.indexBufferSizeBytes != 0) + continue; + if (!*res.vertexBuffer || !*res.indexBuffer) + continue; + if (res.indexCount == 0) + continue; + } catch (...) { + continue; + } + readyRenderableCount++; + if (meshToBLASProbe.find(meshComp) == meshToBLASProbe.end()) { + meshToBLASProbe[meshComp] = static_cast(meshToBLASProbe.size()); + } + } + readyUniqueMeshCount = meshToBLASProbe.size(); + } + // During scene loading/finalization, the TLAS may be built before all entities exist. + // Allow rebuilds even if AS is "frozen" so the TLAS converges to the full scene across restarts. 
+ if ((!asFrozen || IsLoading()) && (readyRenderableCount > lastASBuiltInstanceCount || readyUniqueMeshCount > lastASBuiltBLASCount) && !asBuildRequested.load(std::memory_order_relaxed)) { + std::cout << "AS rebuild requested: counts increased (built instances=" << lastASBuiltInstanceCount + << ", ready instances=" << readyRenderableCount + << ", built meshes=" << lastASBuiltBLASCount + << ", ready meshes=" << readyUniqueMeshCount << ")\n"; + RequestAccelerationStructureBuild("counts increased"); + } + + // Post-load repair: if loading is done and the current TLAS instance count is far below readiness, + // force a one-time rebuild even when frozen so we include the whole scene. + if (!IsLoading() && !asBuildRequested.load(std::memory_order_relaxed)) { + const size_t targetInstances = readyRenderableCount; + if (targetInstances > 0 && lastASBuiltInstanceCount < static_cast(static_cast(targetInstances) * 0.95)) { + asDevOverrideAllowRebuild = true; // allow rebuild even if frozen + std::cout << "AS rebuild requested: post-load full build (built instances=" << lastASBuiltInstanceCount + << ", ready instances=" << targetInstances << ")\n"; + RequestAccelerationStructureBuild("post-load full build"); + } + } + } + + // If in Ray Query static-only mode and TLAS not yet built post-load, request a one-time build now. + // (Does not require a readiness scan.) 
+ if (rayQueryEnabled&& accelerationStructureEnabled && currentRenderMode + == + RenderMode::RayQuery&& IsRayQueryStaticOnly() && + !IsLoading() && + !*tlasStructure.handle && !asBuildRequested.load(std::memory_order_relaxed) + ) { + RequestAccelerationStructureBuild("static-only initial build"); + } + + // Check if acceleration structure build was requested (e.g., after scene loading or counts grew) + // Build at this safe frame point to avoid threading issues + watchdogProgressLabel.store("Render: AS build request check", std::memory_order_relaxed); + if (asBuildRequested.load(std::memory_order_acquire)) { + watchdogProgressLabel.store("Render: AS build request handling", std::memory_order_relaxed); + + // Defer TLAS/BLAS build while the scene loader is still active to avoid partial builds. + // IMPORTANT: Do NOT use IsLoading() here; IsLoading() also includes the post-load + // "finalizing" stage, and deferring on that would deadlock the AS build forever. + if (IsSceneLoaderActive()) { + // Keep the request flag set; we'll build once the loader (and critical textures) finish. + } else if (asFrozen && !asDevOverrideAllowRebuild && !IsLoading()) { + // Ignore rebuilds while frozen to avoid wiping TLAS during animation playback + std::cout << "AS rebuild request ignored (frozen). 
Reason: " << lastASBuildRequestReason << "\n"; + asBuildRequested.store(false, std::memory_order_release); + asBuildRequestStartNs.store(0, std::memory_order_relaxed); + watchdogSuppressed.store(false, std::memory_order_relaxed); + } else { + // Gate initial build until readiness is high enough to represent the full scene + size_t totalRenderableEntities = 0; + size_t readyRenderableCount = 0; + size_t readyUniqueMeshCount = 0; + size_t missingMeshResources = 0; + size_t pendingUploadsCount = 0; + size_t nullBuffersCount = 0; + size_t zeroIndicesCount = 0; { + auto lastKick = std::chrono::steady_clock::now(); + auto kickWatchdog = [&]() { + auto now = std::chrono::steady_clock::now(); + if (now - lastKick > std::chrono::milliseconds(200)) { + lastFrameUpdateTime.store(now, std::memory_order_relaxed); + lastKick = now; + } + }; + std::map meshToBLASProbe; + for (Entity* e : entities) { + kickWatchdog(); + if (!e || !e->IsActive()) + continue; + // In Ray Query static-only mode, ignore dynamic/animated entities for totals/readiness + if (IsRayQueryStaticOnly()) { + const std::string& nm = e->GetName(); + if (nm.find("_AnimNode_") != std::string::npos) + continue; + if (!nm.empty() && nm.rfind("Ball_", 0) == 0) + continue; + } + auto meshComp = e->GetComponent(); + if (!meshComp) + continue; + totalRenderableEntities++; + try { + auto it = meshResources.find(meshComp); + if (it == meshResources.end()) { + missingMeshResources++; + continue; + } + const auto& res = it->second; + // STRICT readiness here too: uploads finished + if (res.vertexBufferSizeBytes != 0 || res.indexBufferSizeBytes != 0) { + pendingUploadsCount++; + continue; + } + if (!*res.vertexBuffer || !*res.indexBuffer) { + nullBuffersCount++; + continue; + } + if (res.indexCount == 0) { + zeroIndicesCount++; + continue; + } + } catch (...) 
{ + continue; + } + readyRenderableCount++; + if (meshToBLASProbe.find(meshComp) == meshToBLASProbe.end()) { + meshToBLASProbe[meshComp] = static_cast(meshToBLASProbe.size()); + } + } + readyUniqueMeshCount = meshToBLASProbe.size(); + } + const double readiness = (totalRenderableEntities > 0) ? static_cast(readyRenderableCount) / static_cast(totalRenderableEntities) : 0.0; + const double buildThreshold = 0.95; // prefer building when ~full scene is ready + + // Bounded deferral: avoid getting stuck forever waiting for perfect readiness. + // After a short timeout from the original request, build with the best available data. + const uint64_t reqNs = asBuildRequestStartNs.load(std::memory_order_relaxed); + const uint64_t nowNs = std::chrono::steady_clock::now().time_since_epoch().count(); + const double maxDeferralSeconds = 15.0; + const bool deferralTimedOut = (reqNs != 0) && (nowNs > reqNs) && + (static_cast(nowNs - reqNs) / 1'000'000'000.0) >= maxDeferralSeconds; + + if (readiness < buildThreshold && !asDevOverrideAllowRebuild && !deferralTimedOut) { + // Intentionally no stdout spam here (Windows consoles are slow and there's no user-facing benefit). + // Keep the request flag set; try again next frame + } else { + if (deferralTimedOut && readiness < buildThreshold && !asDevOverrideAllowRebuild) { + std::cout << "AS build forced after " << maxDeferralSeconds + << "s deferral (readiness " << readyRenderableCount << "/" << totalRenderableEntities + << ", uniqueMeshesReady=" << readyUniqueMeshCount << ")\n"; + } + struct WatchdogSuppressGuard { + std::atomic& flag; + explicit WatchdogSuppressGuard(std::atomic& f) : flag(f) { + flag.store(true, std::memory_order_relaxed); + } + ~WatchdogSuppressGuard() { + flag.store(false, std::memory_order_relaxed); + } + } watchdogGuard(watchdogSuppressed); + + // Ensure previous GPU work is complete BEFORE building AS. 
+ // + // Wait for all *other* frame-in-flight fences to signal using a finite timeout loop + // and kick the watchdog while we wait. + // Do NOT include `currentFrame` here because its fence was reset at frame start + // and will not signal until we submit the current frame. + { + std::vector fencesToWait; + if (inFlightFences.size() > 1) { + fencesToWait.reserve(inFlightFences.size() - 1); + } + for (uint32_t i = 0; i < static_cast(inFlightFences.size()); ++i) { + if (i == currentFrame) + continue; + if (!!*inFlightFences[i]) { + fencesToWait.push_back(*inFlightFences[i]); + } + } + if (!fencesToWait.empty()) { + vk::Result result = waitForFencesSafe(fencesToWait, VK_TRUE); + if (result != vk::Result::eSuccess) { + std::cerr << "Error: Failed to wait for fences before acceleration structure build: " << vk::to_string(result) << std::endl; + } + } + } + + watchdogProgressLabel.store("Render: buildAccelerationStructures", std::memory_order_relaxed); + if (buildAccelerationStructures(entities)) { + watchdogProgressLabel.store("Render: after buildAccelerationStructures", std::memory_order_relaxed); + asBuildRequested.store(false, std::memory_order_release); + asBuildRequestStartNs.store(0, std::memory_order_relaxed); + // AS build request resolved; restore normal watchdog sensitivity. + watchdogSuppressed.store(false, std::memory_order_relaxed); + // Transition the loading UI to a finalizing phase (descriptor cold-init, etc.). + if (IsLoading()) { + SetLoadingPhase(LoadingPhase::Finalizing); + SetLoadingPhaseProgress(0.0f); + } + + // The TLAS handle can transition from null -> valid (or change on rebuild). + // Ensure raster PBR descriptor sets (set 0, binding 11 `tlas`) are rewritten after an AS build + // so subsequent Raster draws never see an unwritten/stale acceleration-structure descriptor. 
+ for (auto& kv : entityResources) { + kv.second.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + for (Entity* e : entities) { + MarkEntityDescriptorsDirty(e); + } + + // Freeze only when the built AS covers essentially the full set of renderable entities. + // NOTE: `lastASBuiltInstanceCount` is an ENTITY count; TLAS instance count (instancing) is tracked separately. + if (asFreezeAfterFullBuild) { + const double threshold = 0.95; + if (totalRenderableEntities > 0 && + static_cast(lastASBuiltInstanceCount) >= threshold * static_cast(totalRenderableEntities)) { + asFrozen = true; + } + } + + // One concise TLAS summary with consistent units. + if (!!*tlasStructure.handle) { + if (IsRayQueryStaticOnly()) { + std::cout << "TLAS ready (static-only): tlasInstances=" << lastASBuiltTlasInstanceCount + << ", entities=" << lastASBuiltInstanceCount + << ", BLAS=" << lastASBuiltBLASCount + << ", addr=0x" << std::hex << tlasStructure.deviceAddress << std::dec << std::endl; + } else { + std::cout << "TLAS ready: tlasInstances=" << lastASBuiltTlasInstanceCount + << ", entities=" << lastASBuiltInstanceCount + << ", BLAS=" << lastASBuiltBLASCount + << ", addr=0x" << std::hex << tlasStructure.deviceAddress << std::dec << std::endl; + } + } + } else { + if (!accelerationStructureEnabled || !rayQueryEnabled) { + // Permanent failure due to lack of support; do not retry. + asBuildRequested.store(false, std::memory_order_release); + asBuildRequestStartNs.store(0, std::memory_order_relaxed); + watchdogSuppressed.store(false, std::memory_order_relaxed); + } else { + // If nothing is ready yet (e.g., mesh uploads still pending), don't spam logs. 
+ if (readyRenderableCount > 0 || readyUniqueMeshCount > 0) { + std::cout << "Failed to build acceleration structures, will retry next frame" << std::endl; + } + } + } + // Reset dev override after one use + asDevOverrideAllowRebuild = false; + } + } + } + + // Safe point: the previous work referencing this frame's descriptor sets is complete. + // Apply any deferred descriptor set updates for entities whose textures finished streaming. + watchdogProgressLabel.store("Render: ProcessDirtyDescriptorsForFrame", std::memory_order_relaxed); + ProcessDirtyDescriptorsForFrame(currentFrame); + watchdogProgressLabel.store("Render: after ProcessDirtyDescriptorsForFrame", std::memory_order_relaxed); + + // --- 1. PREPARATION PASS --- + // Gather active entities with mesh resources, perform per-frame descriptor initialization, + // and execute culling. This single pass replaces multiple redundant scans and reduces map lookups. + std::vector opaqueJobs; + std::vector transparentJobs; + opaqueJobs.reserve(entities.size()); + + { + watchdogProgressLabel.store("Render: preparation pass", std::memory_order_relaxed); + + // Prepare frustum once per frame for culling + FrustumPlanes frustum{}; + const bool doCulling = enableFrustumCulling && camera; + if (doCulling && camera) { + glm::mat4 proj = camera->GetProjectionMatrix(); + proj[1][1] *= -1.0f; + const glm::mat4 vp = proj * camera->GetViewMatrix(); + frustum = extractFrustumPlanes(vp); + } + lastCullingVisibleCount = 0; + lastCullingCulledCount = 0; + + uint32_t entityProcessCount = 0; + for (Entity* entity : entities) { + if (!entity || !entity->IsActive()) + continue; + auto meshComponent = entity->GetComponent(); + if (!meshComponent) + continue; + + auto entityIt = entityResources.find(entity); + if (entityIt == entityResources.end()) + continue; + + auto meshIt = meshResources.find(meshComponent); + if (meshIt == meshResources.end()) + continue; + + EntityResources& entityRes = entityIt->second; + MeshResources& meshRes = 
meshIt->second; + + // Ensure material cache is valid once per frame + ensureEntityMaterialCache(entity, entityRes); + + // --- Per-frame Descriptor Cold-Init (Integrated) --- + if (entityRes.basicDescriptorSets.empty() || entityRes.pbrDescriptorSets.empty()) { + std::string texPath = meshComponent->GetBaseColorTexturePath(); + if (texPath.empty()) texPath = meshComponent->GetTexturePath(); + if (entityRes.basicDescriptorSets.empty()) createDescriptorSets(entity, entityRes, texPath, false); + if (entityRes.pbrDescriptorSets.empty()) createDescriptorSets(entity, entityRes, texPath, true); + } + + // Initialize binding 0 (UBO) for the current frame slot if not already done. + if (!entityRes.pbrUboBindingWritten[currentFrame] || !entityRes.basicUboBindingWritten[currentFrame]) { + std::string texPath = meshComponent->GetBaseColorTexturePath(); + if (texPath.empty()) texPath = meshComponent->GetTexturePath(); + if (!entityRes.pbrUboBindingWritten[currentFrame]) { + updateDescriptorSetsForFrame(entity, entityRes, texPath, true, currentFrame, false, true); + } + if (!entityRes.basicUboBindingWritten[currentFrame]) { + updateDescriptorSetsForFrame(entity, entityRes, texPath, false, currentFrame, false, true); + } + } + + // Initialize images for the current frame slot if not already done. 
+ if (!entityRes.pbrImagesWritten[currentFrame] || !entityRes.basicImagesWritten[currentFrame]) { + std::string texPath = meshComponent->GetBaseColorTexturePath(); + if (texPath.empty()) texPath = meshComponent->GetTexturePath(); + if (!entityRes.pbrImagesWritten[currentFrame]) { + updateDescriptorSetsForFrame(entity, entityRes, texPath, true, currentFrame, true, false); + entityRes.pbrImagesWritten[currentFrame] = true; + } + if (!entityRes.basicImagesWritten[currentFrame]) { + updateDescriptorSetsForFrame(entity, entityRes, texPath, false, currentFrame, true, false); + entityRes.basicImagesWritten[currentFrame] = true; + } + } + + // --- Culling & Classification --- + auto* tc = entity->GetComponent(); + bool useBlended = entityRes.cachedIsBlended; + + if (meshComponent->HasLocalAABB()) { + const glm::mat4 model = tc ? tc->GetModelMatrix() : glm::mat4(1.0f); + glm::vec3 wmin, wmax; + transformAABB(model, meshComponent->GetLocalAABBMin(), meshComponent->GetLocalAABBMax(), wmin, wmax); + + // 1. Frustum Culling + if (doCulling && !aabbIntersectsFrustum(wmin, wmax, frustum)) { + lastCullingCulledCount++; + continue; + } + + // 2. Distance-based LOD + if (enableDistanceLOD && camera) { + glm::vec3 camPos = camera->GetPosition(); + bool cameraInside = (camPos.x >= wmin.x && camPos.x <= wmax.x && + camPos.y >= wmin.y && camPos.y <= wmax.y && + camPos.z >= wmin.z && camPos.z <= wmax.z); + if (!cameraInside) { + float dx = std::max({0.0f, wmin.x - camPos.x, camPos.x - wmax.x}); + float dy = std::max({0.0f, wmin.y - camPos.y, camPos.y - wmax.y}); + float dz = std::max({0.0f, wmin.z - camPos.z, camPos.z - wmax.z}); + float dist = std::sqrt(dx * dx + dy * dy + dz * dz); + float z_eff = std::max(0.1f, dist); + float fov = glm::radians(camera->GetFieldOfView()); + float radius = glm::length(0.5f * (wmax - wmin)); + float pixelDiameter = (radius * 2.0f * static_cast(swapChainExtent.height)) / (z_eff * 2.0f * std::tan(fov * 0.5f)); + float threshold = useBlended ? 
lodPixelThresholdTransparent : lodPixelThresholdOpaque; + if (pixelDiameter < threshold) { + lastCullingCulledCount++; + continue; + } + } + } + } + + lastCullingVisibleCount++; + bool isAlphaMasked = false; + if (entityRes.materialCacheValid) { + isAlphaMasked = (entityRes.cachedMaterialProps.alphaMask > 0.5f); + } + + // Update UBO for visible entity once per frame (shared across all main passes) + updateUniformBuffer(currentFrame, entity, &entityRes, camera, tc); + + RenderJob job{entity, &entityRes, &meshRes, meshComponent, tc, isAlphaMasked}; + if (useBlended) { + transparentJobs.push_back(job); + } else { + opaqueJobs.push_back(job); + } + + // Update watchdog periodically + if (++entityProcessCount % 100 == 0) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + } + } + watchdogProgressLabel.store("Render: after preparation pass", std::memory_order_relaxed); + } + + // If the scene loader has finished and there are no remaining blocking tasks, + // hide the fullscreen loading overlay. + if (IsLoading() && GetLoadingPhase() == LoadingPhase::Finalizing) { + const bool loaderDone = !loadingFlag.load(std::memory_order_relaxed); + const bool criticalDone = (criticalJobsOutstanding.load(std::memory_order_relaxed) == 0u); + const bool noASPending = !asBuildRequested.load(std::memory_order_relaxed); + const bool noPreallocPending = !pendingEntityPreallocQueued.load(std::memory_order_relaxed); + const bool noDirtyEntities = descriptorDirtyEntities.empty(); + const bool noDeferredDescOps = !descriptorRefreshPending.load(std::memory_order_relaxed); + if (loaderDone && criticalDone && noASPending && noPreallocPending && noDirtyEntities && noDeferredDescOps) { + MarkInitialLoadComplete(); + } + } + + // Safe point: flush any descriptor updates that were deferred while a command buffer + // was recording in a prior frame. Only apply ops for the current frame to avoid + // update-after-bind on pending frames. 
+ if (descriptorRefreshPending.load(std::memory_order_relaxed)) { + watchdogProgressLabel.store("Render: flush deferred descriptor ops", std::memory_order_relaxed); + std::vector ops; { + std::lock_guard lk(pendingDescMutex); + ops.swap(pendingDescOps); + descriptorRefreshPending.store(false, std::memory_order_relaxed); + } + uint32_t opCount = 0; + for (auto& op : ops) { + // Kick watchdog periodically during potentially heavy descriptor update bursts + if ((++opCount % 50u) == 0u) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + } + + if (op.frameIndex == currentFrame) { + // Now not recording; safe to apply updates for this frame + updateDescriptorSetsForFrame(op.entity, op.texPath, op.usePBR, op.frameIndex, op.imagesOnly); + } else { + // Keep other frame ops queued for next frame’s safe point + std::lock_guard lk(pendingDescMutex); + pendingDescOps.push_back(op); + descriptorRefreshPending.store(true, std::memory_order_relaxed); + } + } + watchdogProgressLabel.store("Render: after deferred descriptor ops", std::memory_order_relaxed); + } + + // Safe point: handle any pending reflection resource (re)creation and per-frame descriptor refreshes + if (reflectionResourcesDirty) { + if (enablePlanarReflections) { + uint32_t rw = std::max(1u, static_cast(static_cast(swapChainExtent.width) * reflectionResolutionScale)); + uint32_t rh = std::max(1u, static_cast(static_cast(swapChainExtent.height) * reflectionResolutionScale)); + createReflectionResources(rw, rh); + } else { + destroyReflectionResources(); + } + reflectionResourcesDirty = false; + } + + // Reflection descriptor binding refresh is handled elsewhere; avoid redundant per-frame mass updates here. + // Pick the VP associated with the previous frame's reflection texture for sampling in the main pass + if (enablePlanarReflections && !reflectionVPs.empty()) { + uint32_t prev = (currentFrame > 0) ? 
(currentFrame - 1) : (static_cast(reflectionVPs.size()) - 1); + sampleReflectionVP = reflectionVPs[prev]; + } + + // This function updates bindings 6/7/8 (storage buffers) which don't have UPDATE_AFTER_BIND. + // Updating these every frame causes "updated without UPDATE_AFTER_BIND" errors with MAX_FRAMES_IN_FLIGHT > 1. + // These bindings are already initialized in createDescriptorSets and updated when buffers change. + // Binding 10 (reflection map) has UPDATE_AFTER_BIND and can be updated separately if needed. + // refreshPBRForwardPlusBindingsForFrame(currentFrame); + + // Acquire next swapchain image + // acquireNextImage returns imageIndex (which swapchain image is available). + // Use currentFrame to select an imageAvailableSemaphore for acquire. + // Use imageIndex to select renderFinishedSemaphore for present (ties semaphore to the specific image). + const uint32_t acquireSemaphoreIndex = currentFrame % static_cast(imageAvailableSemaphores.size()); + + uint32_t imageIndex; + vk::Result acquireResultCode = vk::Result::eSuccess; + // Helper overloads to normalize acquireNextImage return across Vulkan-Hpp versions + auto extractAcquire = [](auto const& ret, vk::Result& code, uint32_t& idx) { + using RetT = std::decay_t; + if constexpr (std::is_same_v>) { + code = ret.result; + idx = ret.value; + } else { + // Assume older std::pair + code = ret.first; + idx = ret.second; + } + }; + try { + watchdogProgressLabel.store("Render: acquireNextImage", std::memory_order_relaxed); + auto acquireRet = swapChain.acquireNextImage(UINT64_MAX, *imageAvailableSemaphores[acquireSemaphoreIndex]); + // Vulkan-Hpp changed the return type of acquireNextImage for RAII swapchain across versions. + // Support both vk::ResultValue (newer) and std::pair (older). 
+ extractAcquire(acquireRet, acquireResultCode, imageIndex); + } catch (const vk::OutOfDateKHRError&) { + watchdogProgressLabel.store("Render: acquireNextImage out-of-date", std::memory_order_relaxed); + // Swapchain is out of date (e.g., window resized) before we could + // query the result. Trigger recreation and exit this frame cleanly. + framebufferResized.store(true, std::memory_order_relaxed); + if (imguiSystem) + ImGui::EndFrame(); + // IMPORTANT: We already reset the in-flight fence at the start of the frame. + // Because we're exiting early (no submit), signal it via an empty submit so + // swapchain recreation won't hang waiting for an unsignaled fence. + { + vk::SubmitInfo2 emptySubmit2{}; + std::lock_guard lock(queueMutex); + graphicsQueue.submit2(emptySubmit2, *inFlightFences[currentFrame]); + } + recreateSwapChain(); + return; + } + + // imageIndex already populated above + watchdogProgressLabel.store("Render: acquired swapchain image", std::memory_order_relaxed); + + if (acquireResultCode == vk::Result::eSuboptimalKHR || framebufferResized.load(std::memory_order_relaxed)) { + framebufferResized.store(false, std::memory_order_relaxed); + if (imguiSystem) + ImGui::EndFrame(); + // Fence was reset earlier; ensure it is signaled before we bail out + // to avoid a deadlock in swapchain recreation. + { + vk::SubmitInfo2 emptySubmit2{}; + std::lock_guard lock(queueMutex); + graphicsQueue.submit2(emptySubmit2, *inFlightFences[currentFrame]); + } + recreateSwapChain(); + return; + } + if (acquireResultCode != vk::Result::eSuccess) { + throw std::runtime_error("Failed to acquire swap chain image"); + } + + if (framebufferResized.load(std::memory_order_relaxed)) { + // Signal the fence via empty submit since no real work will be submitted + // this frame, preventing a wait on an unsignaled fence during resize. 
+ { + vk::SubmitInfo2 emptySubmit2{}; + std::lock_guard lock(queueMutex); + graphicsQueue.submit2(emptySubmit2, *inFlightFences[currentFrame]); + } + recreateSwapChain(); + return; + } + + // Perform any descriptor updates that must not happen during command buffer recording + if (useForwardPlus) { + uint32_t tilesX_pre = (swapChainExtent.width + forwardPlusTileSizeX - 1) / forwardPlusTileSizeX; + uint32_t tilesY_pre = (swapChainExtent.height + forwardPlusTileSizeY - 1) / forwardPlusTileSizeY; + // Only update current frame's descriptors to avoid touching in-flight frames + createOrResizeForwardPlusBuffers(tilesX_pre, tilesY_pre, forwardPlusSlicesZ, /*updateOnlyCurrentFrame=*/true); + // After (re)creating Forward+ buffers, bindings 7/8 will be refreshed as needed. + } + + // Ensure light buffers are sufficiently large before recording to avoid resizing while in use + { + // Reserve capacity based on emissive lights only (punctual lights disabled for now) + size_t desiredLightCapacity = 0; + if (!staticLights.empty()) { + size_t emissiveCount = 0; + for (const auto& L : staticLights) { + if (L.type == ExtractedLight::Type::Emissive) { + ++emissiveCount; + if (emissiveCount >= MAX_ACTIVE_LIGHTS) + break; + } + } + desiredLightCapacity = emissiveCount; + } + if (desiredLightCapacity > 0) { + createOrResizeLightStorageBuffers(desiredLightCapacity); + // Ensure compute (binding 0) sees the current frame's lights buffer + refreshForwardPlusComputeLightsBindingForFrame(currentFrame); + // Bindings 6/7/8 for PBR are refreshed only when buffers change (handled in resize path). 
+ } + } + + // Safe point: Update ray query descriptor sets if ray query mode is active + // This MUST happen before command buffer recording starts to avoid "descriptor updated without UPDATE_AFTER_BIND" errors + if (currentRenderMode == RenderMode::RayQuery && rayQueryEnabled && accelerationStructureEnabled) { + if (!!*tlasStructure.handle) { + watchdogProgressLabel.store("Render: updateRayQueryDescriptorSets", std::memory_order_relaxed); + updateRayQueryDescriptorSets(currentFrame, entities); + watchdogProgressLabel.store("Render: after updateRayQueryDescriptorSets", std::memory_order_relaxed); + } + } + + // Refit TLAS if needed (either for Ray Query mode or for Raster shadows) + const bool needTLAS = (currentRenderMode == RenderMode::RayQuery || enableRasterRayQueryShadows) && accelerationStructureEnabled; + if (needTLAS && !!*tlasStructure.handle) { + if (!IsRayQueryStaticOnly()) { + watchdogProgressLabel.store("Render: refitTopLevelAS", std::memory_order_relaxed); + refitTopLevelAS(entities, camera); + } + } + + commandBuffers[currentFrame].reset(); + // Begin command buffer recording for this frame + commandBuffers[currentFrame].begin(vk::CommandBufferBeginInfo()); + isRecordingCmd.store(true, std::memory_order_relaxed); + if (framebufferResized.load(std::memory_order_relaxed)) { + commandBuffers[currentFrame].end(); + recreateSwapChain(); + return; + } + + // Ray query rendering mode dispatch + if (currentRenderMode == RenderMode::RayQuery && rayQueryEnabled && accelerationStructureEnabled) { + // Check if TLAS handle is valid (dereference RAII handle) + if (!*tlasStructure.handle) { + // TLAS not built yet. + // During loading, allow the raster path (and the progress overlay) to render normally + // instead of presenting a diagnostic magenta frame. + if (!IsLoading()) { + // Present a diagnostic frame from the ray-query path to avoid accidentally showing + // rasterized content in RayQuery mode. 
+ // Transition swapchain image from PRESENT to TRANSFER_DST + vk::ImageMemoryBarrier2 swapchainBarrier{}; + swapchainBarrier.srcStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe; + swapchainBarrier.srcAccessMask = vk::AccessFlagBits2::eNone; + swapchainBarrier.dstStageMask = vk::PipelineStageFlagBits2::eTransfer; + swapchainBarrier.dstAccessMask = vk::AccessFlagBits2::eTransferWrite; + swapchainBarrier.oldLayout = (imageIndex < swapChainImageLayouts.size()) ? swapChainImageLayouts[imageIndex] : vk::ImageLayout::eUndefined; + swapchainBarrier.newLayout = vk::ImageLayout::eTransferDstOptimal; + swapchainBarrier.image = swapChainImages[imageIndex]; + swapchainBarrier.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor; + swapchainBarrier.subresourceRange.levelCount = 1; + swapchainBarrier.subresourceRange.layerCount = 1; + + vk::DependencyInfo depInfoSwap{}; + depInfoSwap.imageMemoryBarrierCount = 1; + depInfoSwap.pImageMemoryBarriers = &swapchainBarrier; + commandBuffers[currentFrame].pipelineBarrier2(depInfoSwap); + if (imageIndex < swapChainImageLayouts.size()) + swapChainImageLayouts[imageIndex] = swapchainBarrier.newLayout; + + // Clear to a distinct magenta diagnostic color + vk::ClearColorValue clearColor{std::array < float, 4 >{1.0f, 0.0f, 1.0f, 1.0f}}; + vk::ImageSubresourceRange clearRange{vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}; + commandBuffers[currentFrame].clearColorImage(swapChainImages[imageIndex], vk::ImageLayout::eTransferDstOptimal, clearColor, clearRange); + + // Transition back to PRESENT + swapchainBarrier.srcStageMask = vk::PipelineStageFlagBits2::eTransfer; + swapchainBarrier.srcAccessMask = vk::AccessFlagBits2::eTransferWrite; + swapchainBarrier.dstStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe; + swapchainBarrier.dstAccessMask = vk::AccessFlagBits2::eNone; + swapchainBarrier.oldLayout = vk::ImageLayout::eTransferDstOptimal; + swapchainBarrier.newLayout = vk::ImageLayout::ePresentSrcKHR; + 
commandBuffers[currentFrame].pipelineBarrier2(depInfoSwap); + if (imageIndex < swapChainImageLayouts.size()) + swapChainImageLayouts[imageIndex] = swapchainBarrier.newLayout; + + rayQueryRenderedThisFrame = true; // Skip raster; ensure we are looking at RQ path only + } + } else { + // TLAS is valid and descriptor sets were already updated at safe point + // Proceed with ray query rendering + // Bind ray query compute pipeline + commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eCompute, *rayQueryPipeline); + + // Bind descriptor set + commandBuffers[currentFrame].bindDescriptorSets( + vk::PipelineBindPoint::eCompute, + *rayQueryPipelineLayout, + 0, + *rayQueryDescriptorSets[currentFrame], + nullptr); + + // This dedicated UBO is separate from entity UBOs and uses a Ray Query-specific layout. + if (rayQueryUniformBuffersMapped.size() > currentFrame && rayQueryUniformBuffersMapped[currentFrame]) { + RayQueryUniformBufferObject ubo{}; + ubo.model = glm::mat4(1.0f); // Identity - not used for ray query + + // Force view matrix update to reflect current camera position + // (the dirty flag isn't automatically set when camera position changes) + camera->ForceViewMatrixUpdate(); + + // Get camera matrices + glm::mat4 camView = camera->GetViewMatrix(); + ubo.view = camView; + ubo.proj = camera->GetProjectionMatrix(); + ubo.proj[1][1] *= -1; // Flip Y for Vulkan + ubo.camPos = glm::vec4(camera->GetPosition(), 1.0f); + // Clamp to sane ranges to avoid black output (exposure=0 → 1-exp(0)=0) + ubo.exposure = std::clamp(exposure, 0.2f, 4.0f); + ubo.gamma = std::clamp(gamma, 1.6f, 2.6f); + // Match raster convention: ambient scale factor for simple IBL/ambient term. + // (Raster defaults to ~1.0 in the main pass; keep Ray Query consistent.) + ubo.scaleIBLAmbient = 1.0f; + // Provide the per-frame light count so the ray query shader can iterate lights. 
+ ubo.lightCount = static_cast(lastFrameLightCount); + ubo.screenDimensions = glm::vec2(swapChainExtent.width, swapChainExtent.height); + ubo.enableRayQueryReflections = enableRayQueryReflections ? 1 : 0; + ubo.enableRayQueryTransparency = enableRayQueryTransparency ? 1 : 0; + // Max secondary bounces (reflection/refraction). Stored in the padding slot to avoid UBO layout churn. + // Shader clamps this value. + ubo._pad0 = rayQueryMaxBounces; + // Thick-glass toggles and tuning + ubo.enableThickGlass = enableThickGlass ? 1 : 0; + ubo.thicknessClamp = thickGlassThicknessClamp; + ubo.absorptionScale = thickGlassAbsorptionScale; + // Ray Query hard shadows (see `shaders/ray_query.slang`) + ubo._pad1 = enableRayQueryShadows ? 1 : 0; + ubo.shadowSampleCount = std::clamp(rayQueryShadowSampleCount, 1, 32); + ubo.shadowSoftness = std::clamp(rayQueryShadowSoftness, 0.0f, 1.0f); + ubo.reflectionIntensity = reflectionIntensity; + // Provide geometry info count for shader-side bounds checking (per-instance) + ubo.geometryInfoCount = static_cast(tlasInstanceCount); + // Provide material buffer count for shader-side bounds checking + ubo.materialCount = static_cast(materialCountCPU); + + // Copy to mapped memory + std::memcpy(rayQueryUniformBuffersMapped[currentFrame], &ubo, sizeof(RayQueryUniformBufferObject)); + } else { + // Keep concise error for visibility + std::cerr << "Ray Query UBO not mapped for frame " << currentFrame << "\n"; + } + + // Dispatch compute shader (8x8 workgroups as defined in shader) + uint32_t workgroupsX = (swapChainExtent.width + 7) / 8; + uint32_t workgroupsY = (swapChainExtent.height + 7) / 8; + commandBuffers[currentFrame].dispatch(workgroupsX, workgroupsY, 1); + + // Barrier: wait for compute shader to finish writing to output image, + // then make it readable by fragment shader for sampling in composite pass + vk::ImageMemoryBarrier2 rqToSample{}; + rqToSample.srcStageMask = vk::PipelineStageFlagBits2::eComputeShader; + rqToSample.srcAccessMask 
= vk::AccessFlagBits2::eShaderWrite; + rqToSample.dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader; + rqToSample.dstAccessMask = vk::AccessFlagBits2::eShaderRead; + rqToSample.oldLayout = vk::ImageLayout::eGeneral; + rqToSample.newLayout = vk::ImageLayout::eShaderReadOnlyOptimal; + rqToSample.image = *rayQueryOutputImage; + rqToSample.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor; + rqToSample.subresourceRange.levelCount = 1; + rqToSample.subresourceRange.layerCount = 1; + + vk::DependencyInfo depRQToSample{}; + depRQToSample.imageMemoryBarrierCount = 1; + depRQToSample.pImageMemoryBarriers = &rqToSample; + commandBuffers[currentFrame].pipelineBarrier2(depRQToSample); + + // Composite fullscreen: sample rayQueryOutputImage to the swapchain using the composite pipeline + // Transition swapchain image to COLOR_ATTACHMENT_OPTIMAL + vk::ImageMemoryBarrier2 swapchainToColor{}; + swapchainToColor.srcStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe; + swapchainToColor.srcAccessMask = vk::AccessFlagBits2::eNone; + swapchainToColor.dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput; + swapchainToColor.dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead; + swapchainToColor.oldLayout = (imageIndex < swapChainImageLayouts.size()) ? 
swapChainImageLayouts[imageIndex] : vk::ImageLayout::eUndefined; + swapchainToColor.newLayout = vk::ImageLayout::eColorAttachmentOptimal; + swapchainToColor.image = swapChainImages[imageIndex]; + swapchainToColor.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor; + swapchainToColor.subresourceRange.levelCount = 1; + swapchainToColor.subresourceRange.layerCount = 1; + vk::DependencyInfo depSwapToColor{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &swapchainToColor}; + commandBuffers[currentFrame].pipelineBarrier2(depSwapToColor); + if (imageIndex < swapChainImageLayouts.size()) + swapChainImageLayouts[imageIndex] = swapchainToColor.newLayout; + + // Begin dynamic rendering for composite (no depth) + colorAttachments[0].imageView = *swapChainImageViews[imageIndex]; + colorAttachments[0].loadOp = vk::AttachmentLoadOp::eClear; + depthAttachment.loadOp = vk::AttachmentLoadOp::eDontCare; + renderingInfo.renderArea = vk::Rect2D({0, 0}, swapChainExtent); + auto savedDepthPtr2 = renderingInfo.pDepthAttachment; + renderingInfo.pDepthAttachment = nullptr; + commandBuffers[currentFrame].beginRendering(renderingInfo); + + if (!!*compositePipeline) { + commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, *compositePipeline); + } + vk::Viewport vp(0.0f, 0.0f, static_cast(swapChainExtent.width), static_cast(swapChainExtent.height), 0.0f, 1.0f); + vk::Rect2D sc({0, 0}, swapChainExtent); + commandBuffers[currentFrame].setViewport(0, vp); + commandBuffers[currentFrame].setScissor(0, sc); + + // Bind the RQ composite descriptor set (samples rayQueryOutputImage) + if (!rqCompositeDescriptorSets.empty()) { + commandBuffers[currentFrame].bindDescriptorSets( + vk::PipelineBindPoint::eGraphics, + *compositePipelineLayout, + 0, + {*rqCompositeDescriptorSets[currentFrame]}, + {}); + } + + // Push exposure/gamma and sRGB flag + struct CompositePush { + float exposure; + float gamma; + int outputIsSRGB; + float _pad; + } pc2{}; + pc2.exposure = 
std::clamp(this->exposure, 0.2f, 4.0f); + pc2.gamma = this->gamma; + pc2.outputIsSRGB = (swapChainImageFormat == vk::Format::eR8G8B8A8Srgb || swapChainImageFormat == vk::Format::eB8G8R8A8Srgb) ? 1 : 0; + commandBuffers[currentFrame].pushConstants(*compositePipelineLayout, vk::ShaderStageFlagBits::eFragment, 0, pc2); + + commandBuffers[currentFrame].draw(3, 1, 0, 0); + commandBuffers[currentFrame].endRendering(); + renderingInfo.pDepthAttachment = savedDepthPtr2; + + // Transition swapchain back to PRESENT and RQ image back to GENERAL for next frame + vk::ImageMemoryBarrier2 swapchainToPresent{}; + swapchainToPresent.srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput; + swapchainToPresent.srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite; + swapchainToPresent.dstStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe; + swapchainToPresent.dstAccessMask = vk::AccessFlagBits2::eNone; + swapchainToPresent.oldLayout = vk::ImageLayout::eColorAttachmentOptimal; + swapchainToPresent.newLayout = vk::ImageLayout::ePresentSrcKHR; + swapchainToPresent.image = swapChainImages[imageIndex]; + swapchainToPresent.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor; + swapchainToPresent.subresourceRange.levelCount = 1; + swapchainToPresent.subresourceRange.layerCount = 1; + + vk::ImageMemoryBarrier2 rqBackToGeneral{}; + rqBackToGeneral.srcStageMask = vk::PipelineStageFlagBits2::eFragmentShader; + rqBackToGeneral.srcAccessMask = vk::AccessFlagBits2::eShaderRead; + rqBackToGeneral.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader; + rqBackToGeneral.dstAccessMask = vk::AccessFlagBits2::eShaderWrite; + rqBackToGeneral.oldLayout = vk::ImageLayout::eShaderReadOnlyOptimal; + rqBackToGeneral.newLayout = vk::ImageLayout::eGeneral; + rqBackToGeneral.image = *rayQueryOutputImage; + rqBackToGeneral.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor; + rqBackToGeneral.subresourceRange.levelCount = 1; + rqBackToGeneral.subresourceRange.layerCount 
= 1; + + std::array barriers{swapchainToPresent, rqBackToGeneral}; + vk::DependencyInfo depEnd{.imageMemoryBarrierCount = static_cast(barriers.size()), .pImageMemoryBarriers = barriers.data()}; + commandBuffers[currentFrame].pipelineBarrier2(depEnd); + if (imageIndex < swapChainImageLayouts.size()) + swapChainImageLayouts[imageIndex] = swapchainToPresent.newLayout; + + // Ray query rendering complete - set flag to skip rasterization code path + rayQueryRenderedThisFrame = true; + } + } + + // Process texture streaming uploads (see Renderer::ProcessPendingTextureJobs) + + vk::raii::Pipeline* currentPipeline = nullptr; + vk::raii::PipelineLayout* currentLayout = nullptr; + + // Incrementally process pending texture uploads on the main thread so that + // all Vulkan submits happen from a single place while worker threads only + // handle CPU-side decoding. While the loading screen is up, prioritize + // critical textures so the first rendered frame looks mostly correct. + if (IsLoading()) { + // Larger budget while loading screen is visible so we don't stall + // streaming of near-field baseColor textures. + ProcessPendingTextureJobs(/*maxJobs=*/16, /*includeCritical=*/true, /*includeNonCritical=*/false); + } else { + // After loading screen disappears, we want the scene to remain + // responsive (~20 fps) while textures stream in. Limit the number + // of non-critical uploads per frame so we don't tank frame time. + static uint32_t streamingFrameCounter = 0; + streamingFrameCounter++; + // Ray Query needs textures visible quickly; process more streaming work when in Ray Query mode. + if (currentRenderMode == RenderMode::RayQuery) { + // Aggressively drain both critical and non-critical queues each frame for faster bring-up. + ProcessPendingTextureJobs(/*maxJobs=*/32, /*includeCritical=*/true, /*includeNonCritical=*/true); + } else { + // Raster path: keep previous throttling to avoid stalls. 
+ if ((streamingFrameCounter % 3) == 0) { + ProcessPendingTextureJobs(/*maxJobs=*/1, /*includeCritical=*/false, /*includeNonCritical=*/true); + } + } + } + + // Renderer UI - available for both ray query and rasterization modes. + // Hide UI during loading; the progress overlay is handled by ImGuiSystem::NewFrame(). + if (imguiSystem && !imguiSystem->IsFrameRendered() && !IsLoading()) { + if (ImGui::Begin("Renderer")) { + // Declare variables that need to persist across conditional blocks + bool prevFwdPlus = useForwardPlus; + + // === RENDERING MODE SELECTION (TOP) === + ImGui::Text("Rendering Mode:"); + if (rayQueryEnabled && accelerationStructureEnabled) { + const char* modeNames[] = {"Rasterization", "Ray Query"}; + int currentMode = (currentRenderMode == RenderMode::RayQuery) ? 1 : 0; + if (ImGui::Combo("Mode", ¤tMode, modeNames, 2)) { + RenderMode newMode = (currentMode == 1) ? RenderMode::RayQuery : RenderMode::Rasterization; + if (newMode != currentRenderMode) { + currentRenderMode = newMode; + std::cout << "Switched to " << modeNames[currentMode] << " mode\n"; + + // Request acceleration structure build when switching to ray query mode + if (currentRenderMode == RenderMode::RayQuery) { + std::cout << "Requesting acceleration structure build...\n"; + RequestAccelerationStructureBuild(); + } + + // Switching modes can change which pipelines are bound and whether ray-query-dependent + // descriptor bindings (e.g., PBR binding 11 `tlas`) become statically used. + // Mark entity descriptor sets dirty so the next safe point refreshes bindings for this frame. 
+ for (auto& kv : entityResources) { + kv.second.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + for (Entity* e : entities) { + MarkEntityDescriptorsDirty(e); + } + } + } + } else { + ImGui::TextColored(ImVec4(0.7f, 0.7f, 0.7f, 1.0f), "Rasterization only (ray query not supported)"); + } + + // === RASTERIZATION-SPECIFIC OPTIONS === + if (currentRenderMode == RenderMode::Rasterization) { + ImGui::Separator(); + ImGui::Text("Rasterization Options:"); + + // Lighting Controls - BRDF/PBR is now the default lighting model + bool useBasicLighting = imguiSystem && !imguiSystem->IsPBREnabled(); + if (ImGui::Checkbox("Use Basic Lighting (Phong)", &useBasicLighting)) { + imguiSystem->SetPBREnabled(!useBasicLighting); + std::cout << "Lighting mode: " << (!useBasicLighting ? "BRDF/PBR (default)" : "Basic Phong") << std::endl; + } + + if (!useBasicLighting) { + ImGui::Text("Status: BRDF/PBR pipeline active (default)"); + ImGui::Text("All models rendered with physically-based lighting"); + } else { + ImGui::Text("Status: Basic Phong pipeline active"); + ImGui::Text("All models rendered with basic Phong shading"); + } + + ImGui::Checkbox("Forward+ (tiled light culling)", &useForwardPlus); + if (useForwardPlus && !prevFwdPlus) { + // Lazily create Forward+ resources if enabled at runtime + if (!*forwardPlusPipeline || !*forwardPlusDescriptorSetLayout || forwardPlusPerFrame.empty()) { + createForwardPlusPipelinesAndResources(); + } + if (!*depthPrepassPipeline) { + createDepthPrepassPipeline(); + } + } + + // Raster shadows via ray queries (experimental) + if (rayQueryEnabled && accelerationStructureEnabled) { + ImGui::Checkbox("RayQuery shadows (raster)", &enableRasterRayQueryShadows); + } else { + ImGui::TextDisabled("RayQuery shadows (raster) (requires ray query + AS)"); + } + + // Planar reflections controls + ImGui::Spacing(); + /* + if (ImGui::Checkbox("Planar reflections (experimental)", &enablePlanarReflections)) { + // Defer actual 
(re)creation/destruction to the next safe point at frame start + reflectionResourcesDirty = true; + } + */ + enablePlanarReflections = false; + float scaleBefore = reflectionResolutionScale; + if (ImGui::SliderFloat("Reflection resolution scale", &reflectionResolutionScale, 0.25f, 1.0f, "%.2f")) { + reflectionResolutionScale = std::clamp(reflectionResolutionScale, 0.25f, 1.0f); + if (enablePlanarReflections&& std::abs(scaleBefore - reflectionResolutionScale) + > + 1e-3f + ) { + reflectionResourcesDirty = true; + } + } + if (enablePlanarReflections && !reflections.empty()) { + auto& rt = reflections[currentFrame]; + if (rt.width > 0) { + ImGui::Text("Reflection RT: %ux%u", rt.width, rt.height); + } + } + } + + // === RAY QUERY-SPECIFIC OPTIONS === + if (currentRenderMode == RenderMode::RayQuery && rayQueryEnabled && accelerationStructureEnabled) { + ImGui::Separator(); + ImGui::Text("Ray Query Status:"); + + // Show acceleration structure status + if (!!*tlasStructure.handle) { + ImGui::TextColored(ImVec4(0.0f, 1.0f, 0.0f, 1.0f), "Acceleration Structures: Built (%zu meshes)", blasStructures.size()); + } else { + ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f), "Acceleration Structures: Not built"); + } + + ImGui::Spacing(); + ImGui::Text("Ray Query Features:"); + ImGui::Checkbox("Enable Hard Shadows", &enableRayQueryShadows); + if (enableRayQueryShadows) { + ImGui::SliderInt("Shadow samples", &rayQueryShadowSampleCount, 1, 32); + ImGui::SliderFloat("Shadow softness (fraction of range)", &rayQueryShadowSoftness, 0.0f, 0.2f, "%.3f"); + } + ImGui::Checkbox("Enable Reflections", &enableRayQueryReflections); + ImGui::Checkbox("Enable Transparency/Refraction", &enableRayQueryTransparency); + ImGui::SliderInt("Max secondary bounces", &rayQueryMaxBounces, 0, 10); + // Thick-glass realism controls + ImGui::Separator(); + ImGui::Text("Thick Glass"); + ImGui::Checkbox("Enable Thick Glass", &enableThickGlass); + ImGui::SliderFloat("Thickness Clamp (m)", 
&thickGlassThicknessClamp, 0.0f, 0.5f, "%.3f"); + ImGui::SliderFloat("Absorption Scale", &thickGlassAbsorptionScale, 0.0f, 4.0f, "%.2f"); + } + + // === SHARED OPTIONS (BOTH MODES) === + ImGui::Separator(); + ImGui::Text("Culling & LOD:"); + if (ImGui::Checkbox("Frustum culling", &enableFrustumCulling)) { + // no-op, takes effect immediately + } + if (ImGui::Checkbox("Distance LOD (projected-size skip)", &enableDistanceLOD)) { + } + ImGui::SliderFloat("LOD threshold opaque (px)", &lodPixelThresholdOpaque, 0.5f, 8.0f, "%.1f"); + ImGui::SliderFloat("LOD threshold transparent (px)", &lodPixelThresholdTransparent, 0.5f, 12.0f, "%.1f"); + // Anisotropy control (recreate samplers on change) + { + float deviceMaxAniso = physicalDevice.getProperties().limits.maxSamplerAnisotropy; + if (ImGui::SliderFloat("Sampler max anisotropy", &samplerMaxAnisotropy, 1.0f, deviceMaxAniso, "%.1f")) { + // Recreate samplers for all textures to apply new anisotropy + std::unique_lock texLock(textureResourcesMutex); + for (auto& kv : textureResources) { + createTextureSampler(kv.second); + } + // Default texture + createTextureSampler(defaultTextureResources); + } + } + if (lastCullingVisibleCount + lastCullingCulledCount > 0) { + ImGui::Text("Culling: visible=%u, culled=%u", lastCullingVisibleCount, lastCullingCulledCount); + } + + // Basic tone mapping controls + ImGui::Separator(); + ImGui::Text("Tone Mapping & Tuning:"); + ImGui::SliderFloat("Reflection intensity", &reflectionIntensity, 0.0f, 2.0f, "%.2f"); + ImGui::SliderFloat("Exposure", &exposure, 0.1f, 4.0f, "%.2f"); + ImGui::SliderFloat("Gamma", &gamma, 1.6f, 2.6f, "%.2f"); + } + ImGui::End(); + } + + // Rasterization rendering: only execute if ray query did not render this frame. 
+ if (!rayQueryRenderedThisFrame) { + // Optional: render planar reflections first + /* + if (enablePlanarReflections) { + glm::vec4 planeWS(0.0f, 1.0f, 0.0f, 0.0f); + renderReflectionPass(commandBuffers[currentFrame], planeWS, camera, opaqueJobs); + } + */ + + // Sort transparent entities back-to-front for correct blending of nested glass/liquids + if (!transparentJobs.empty()) { + glm::vec3 camPos = camera ? camera->GetPosition() : glm::vec3(0.0f); + std::ranges::sort(transparentJobs, + [camPos](const RenderJob& a, const RenderJob& b) { + glm::vec3 pa = a.transformComp ? a.transformComp->GetPosition() : glm::vec3(0.0f); + glm::vec3 pb = b.transformComp ? b.transformComp->GetPosition() : glm::vec3(0.0f); + float da2 = glm::length2(pa - camPos); + float db2 = glm::length2(pb - camPos); + if (da2 != db2) return da2 > db2; + if (a.entityRes->cachedIsLiquid != b.entityRes->cachedIsLiquid) return a.entityRes->cachedIsLiquid; + return a.entity < b.entity; + }); + } + + + // Track whether we executed a depth pre-pass this frame (used to choose depth load op and pipeline state) + bool didOpaqueDepthPrepass = false; + + // Optional Forward+ depth pre-pass for opaque geometry + if (useForwardPlus) { + if (!opaqueJobs.empty()) { + // Transition depth image for attachment write (Sync2) + vk::ImageMemoryBarrier2 depthBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, + .srcAccessMask = {}, + .dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests, + .dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eDepthAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *depthImage, + .subresourceRange = {vk::ImageAspectFlagBits::eDepth, 0, 1, 0, 1} + }; + vk::DependencyInfo depInfoDepth{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &depthBarrier2}; + 
commandBuffers[currentFrame].pipelineBarrier2(depInfoDepth); + + // Depth-only rendering + vk::RenderingAttachmentInfo depthOnlyAttachment{.imageView = *depthImageView, .imageLayout = vk::ImageLayout::eDepthAttachmentOptimal, .loadOp = vk::AttachmentLoadOp::eClear, .storeOp = vk::AttachmentStoreOp::eStore, .clearValue = vk::ClearDepthStencilValue{1.0f, 0}}; + vk::RenderingInfo depthOnlyInfo{.renderArea = vk::Rect2D({0, 0}, swapChainExtent), .layerCount = 1, .colorAttachmentCount = 0, .pColorAttachments = nullptr, .pDepthAttachment = &depthOnlyAttachment}; + commandBuffers[currentFrame].beginRendering(depthOnlyInfo); + vk::Viewport viewport(0.0f, 0.0f, static_cast(swapChainExtent.width), static_cast(swapChainExtent.height), 0.0f, 1.0f); + commandBuffers[currentFrame].setViewport(0, viewport); + vk::Rect2D scissor({0, 0}, swapChainExtent); + commandBuffers[currentFrame].setScissor(0, scissor); + + // Bind depth pre-pass pipeline + if (!!*depthPrepassPipeline) { + commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, *depthPrepassPipeline); + } + + for (const auto& job : opaqueJobs) { + if (job.isAlphaMasked) continue; + + // Bind geometry + std::array buffers = {*job.meshRes->vertexBuffer, *job.entityRes->instanceBuffer}; + std::array offsets = {0, 0}; + commandBuffers[currentFrame].bindVertexBuffers(0, buffers, offsets); + commandBuffers[currentFrame].bindIndexBuffer(*job.meshRes->indexBuffer, 0, vk::IndexType::eUint32); + + // Bind descriptor set (PBR set 0) + commandBuffers[currentFrame].bindDescriptorSets(vk::PipelineBindPoint::eGraphics, + *pbrPipelineLayout, + 0, + *job.entityRes->pbrDescriptorSets[currentFrame], + nullptr); + + // Issue draw + uint32_t instanceCount = std::max(1u, static_cast(job.meshComp->GetInstanceCount())); + commandBuffers[currentFrame].drawIndexed(job.meshRes->indexCount, instanceCount, 0, 0, 0); + } + + commandBuffers[currentFrame].endRendering(); + + // Barrier to ensure depth is visible for subsequent passes 
(Sync2) + vk::ImageMemoryBarrier2 depthToRead2{ + .srcStageMask = vk::PipelineStageFlagBits2::eLateFragmentTests, + .srcAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests, + .dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentRead, + .oldLayout = vk::ImageLayout::eDepthAttachmentOptimal, + .newLayout = vk::ImageLayout::eDepthAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *depthImage, + .subresourceRange = {vk::ImageAspectFlagBits::eDepth, 0, 1, 0, 1} + }; + vk::DependencyInfo depInfoDepthToRead{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &depthToRead2}; + commandBuffers[currentFrame].pipelineBarrier2(depInfoDepthToRead); + + didOpaqueDepthPrepass = true; + } + + // Forward+ compute culling based on current camera and screen tiles + uint32_t tilesX = (swapChainExtent.width + forwardPlusTileSizeX - 1) / forwardPlusTileSizeX; + uint32_t tilesY = (swapChainExtent.height + forwardPlusTileSizeY - 1) / forwardPlusTileSizeY; + + // Lights already extracted at frame start - use lastFrameLightCount for Forward+ params + glm::mat4 view = camera->GetViewMatrix(); + glm::mat4 proj = camera->GetProjectionMatrix(); + proj[1][1] *= -1.0f; + float nearZ = camera->GetNearPlane(); + float farZ = camera->GetFarPlane(); + updateForwardPlusParams(currentFrame, view, proj, lastFrameLightCount, tilesX, tilesY, forwardPlusSlicesZ, nearZ, farZ); + // As a last guard before dispatch, make sure compute binding 0 is valid for this frame + refreshForwardPlusComputeLightsBindingForFrame(currentFrame); + + dispatchForwardPlus(commandBuffers[currentFrame], tilesX, tilesY, forwardPlusSlicesZ); + } + + // PASS 1: RENDER OPAQUE OBJECTS TO OFF-SCREEN TEXTURE + // Transition off-screen color to attachment write (Sync2). 
On first use after creation or after switching + // from a mode that never produced this image, the layout may still be UNDEFINED. + vk::ImageLayout oscOldLayout = vk::ImageLayout::eUndefined; + vk::PipelineStageFlags2 oscSrcStage = vk::PipelineStageFlagBits2::eTopOfPipe; + vk::AccessFlags2 oscSrcAccess = vk::AccessFlagBits2::eNone; + if (currentFrame < opaqueSceneColorImageLayouts.size()) { + oscOldLayout = opaqueSceneColorImageLayouts[currentFrame]; + if (oscOldLayout == vk::ImageLayout::eShaderReadOnlyOptimal) { + oscSrcStage = vk::PipelineStageFlagBits2::eFragmentShader; + oscSrcAccess = vk::AccessFlagBits2::eShaderRead; + } else if (oscOldLayout == vk::ImageLayout::eColorAttachmentOptimal) { + oscSrcStage = vk::PipelineStageFlagBits2::eColorAttachmentOutput; + oscSrcAccess = vk::AccessFlagBits2::eColorAttachmentWrite; + } else { + oscOldLayout = vk::ImageLayout::eUndefined; + oscSrcStage = vk::PipelineStageFlagBits2::eTopOfPipe; + oscSrcAccess = vk::AccessFlagBits2::eNone; + } + } + vk::ImageMemoryBarrier2 oscToColor2{ + .srcStageMask = oscSrcStage, + .srcAccessMask = oscSrcAccess, + .dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead, + .oldLayout = oscOldLayout, + .newLayout = vk::ImageLayout::eColorAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *opaqueSceneColorImages[currentFrame], + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depOscToColor{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &oscToColor2}; + commandBuffers[currentFrame].pipelineBarrier2(depOscToColor); + if (currentFrame < opaqueSceneColorImageLayouts.size()) { + opaqueSceneColorImageLayouts[currentFrame] = vk::ImageLayout::eColorAttachmentOptimal; + } + // PASS 1: OFF-SCREEN COLOR (Opaque) + // Clear the off-screen target at the start of 
opaque rendering to a neutral black background + vk::RenderingAttachmentInfo colorAttachment{.imageView = *opaqueSceneColorImageViews[currentFrame], .imageLayout = vk::ImageLayout::eColorAttachmentOptimal, .loadOp = vk::AttachmentLoadOp::eClear, .storeOp = vk::AttachmentStoreOp::eStore, .clearValue = vk::ClearColorValue(std::array < float, 4 >{0.0f, 0.0f, 0.0f, 1.0f})}; + depthAttachment.imageView = *depthImageView; + depthAttachment.loadOp = (didOpaqueDepthPrepass) ? vk::AttachmentLoadOp::eLoad : vk::AttachmentLoadOp::eClear; + vk::RenderingInfo passInfo{.renderArea = vk::Rect2D({0, 0}, swapChainExtent), .layerCount = 1, .colorAttachmentCount = 1, .pColorAttachments = &colorAttachment, .pDepthAttachment = &depthAttachment}; + commandBuffers[currentFrame].beginRendering(passInfo); + vk::Viewport viewport(0.0f, 0.0f, static_cast(swapChainExtent.width), static_cast(swapChainExtent.height), 0.0f, 1.0f); + commandBuffers[currentFrame].setViewport(0, viewport); + vk::Rect2D scissor({0, 0}, swapChainExtent); + commandBuffers[currentFrame].setScissor(0, scissor); { + uint32_t opaqueDrawsThisPass = 0; + for (const auto& job : opaqueJobs) { + bool useBasic = (imguiSystem && !imguiSystem->IsPBREnabled()); + vk::raii::Pipeline* selectedPipeline = nullptr; + vk::raii::PipelineLayout* selectedLayout = nullptr; + if (useBasic) { + selectedPipeline = &graphicsPipeline; + selectedLayout = &pipelineLayout; + } else { + // If masked, we need depth writes with alpha test; otherwise, after-prepass read-only is fine. + if (job.isAlphaMasked) { + selectedPipeline = &pbrGraphicsPipeline; // writes depth, compare Less + } else { + selectedPipeline = didOpaqueDepthPrepass && !!*pbrPrepassGraphicsPipeline ? 
&pbrPrepassGraphicsPipeline : &pbrGraphicsPipeline; + } + selectedLayout = &pbrPipelineLayout; + } + if (currentPipeline != selectedPipeline) { + commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, **selectedPipeline); + currentPipeline = selectedPipeline; + currentLayout = selectedLayout; + } + + std::array buffers = {*job.meshRes->vertexBuffer, *job.entityRes->instanceBuffer}; + std::array offsets = {0, 0}; + commandBuffers[currentFrame].bindVertexBuffers(0, buffers, offsets); + commandBuffers[currentFrame].bindIndexBuffer(*job.meshRes->indexBuffer, 0, vk::IndexType::eUint32); + + auto* descSetsPtr = useBasic ? &job.entityRes->basicDescriptorSets : &job.entityRes->pbrDescriptorSets; + if (descSetsPtr->empty() || currentFrame >= descSetsPtr->size()) { + continue; + } + + if (useBasic) { + commandBuffers[currentFrame].bindDescriptorSets( + vk::PipelineBindPoint::eGraphics, + **selectedLayout, + 0, + {*(*descSetsPtr)[currentFrame]}, + {}); + } else { + vk::DescriptorSet set1Opaque = (transparentDescriptorSets.empty() || IsLoading()) + ? 
*transparentFallbackDescriptorSets[currentFrame] + : *transparentDescriptorSets[currentFrame]; + commandBuffers[currentFrame].bindDescriptorSets( + vk::PipelineBindPoint::eGraphics, + **selectedLayout, + 0, + {*(*descSetsPtr)[currentFrame], set1Opaque}, + {}); + + commandBuffers[currentFrame].pushConstants(**selectedLayout, vk::ShaderStageFlagBits::eFragment, 0, {job.entityRes->cachedMaterialProps}); + } + uint32_t instanceCount = std::max(1u, static_cast(job.meshComp->GetInstanceCount())); + commandBuffers[currentFrame].drawIndexed(job.meshRes->indexCount, instanceCount, 0, 0, 0); + ++opaqueDrawsThisPass; + } + } + commandBuffers[currentFrame].endRendering(); + // PASS 1b: PRESENT – composite path + { + // Transition off-screen to SHADER_READ for sampling (Sync2) + vk::ImageMemoryBarrier2 opaqueToSample2{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eColorAttachmentOptimal, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *opaqueSceneColorImages[currentFrame], + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depOpaqueToSample{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &opaqueToSample2}; + commandBuffers[currentFrame].pipelineBarrier2(depOpaqueToSample); + if (currentFrame < opaqueSceneColorImageLayouts.size()) { + opaqueSceneColorImageLayouts[currentFrame] = vk::ImageLayout::eShaderReadOnlyOptimal; + } + + // Make the swapchain image ready for color attachment output and clear it (Sync2) + vk::ImageMemoryBarrier2 swapchainToColor2{ + .srcStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe, + .srcAccessMask = vk::AccessFlagBits2::eNone, + .dstStageMask = 
vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eColorAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = swapChainImages[imageIndex], + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depSwapchainToColor{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &swapchainToColor2}; + commandBuffers[currentFrame].pipelineBarrier2(depSwapchainToColor); + + // Begin rendering to swapchain for composite + colorAttachments[0].imageView = *swapChainImageViews[imageIndex]; + colorAttachments[0].loadOp = vk::AttachmentLoadOp::eClear; // clear before composing base layer (full-screen composite overwrites all pixels) + depthAttachment.loadOp = vk::AttachmentLoadOp::eDontCare; // no depth for composite + renderingInfo.renderArea = vk::Rect2D({0, 0}, swapChainExtent); + // IMPORTANT: Composite pass does not use a depth attachment. Avoid binding it to satisfy dynamic rendering VUIDs. + auto savedDepthPtr = renderingInfo.pDepthAttachment; // save to restore later + renderingInfo.pDepthAttachment = nullptr; + commandBuffers[currentFrame].beginRendering(renderingInfo); + + // Bind composite pipeline + if (!!*compositePipeline) { + commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, *compositePipeline); + } + vk::Viewport vp(0.0f, 0.0f, static_cast(swapChainExtent.width), static_cast(swapChainExtent.height), 0.0f, 1.0f); + commandBuffers[currentFrame].setViewport(0, vp); + vk::Rect2D sc({0, 0}, swapChainExtent); + commandBuffers[currentFrame].setScissor(0, sc); + + // Bind descriptor set 0 for the composite. During loading, force fallback to avoid sampling uninitialized off-screen color. 
+ vk::DescriptorSet setComposite = (transparentDescriptorSets.empty() || IsLoading()) + ? *transparentFallbackDescriptorSets[currentFrame] + : *transparentDescriptorSets[currentFrame]; + commandBuffers[currentFrame].bindDescriptorSets( + vk::PipelineBindPoint::eGraphics, + *compositePipelineLayout, + 0, + {setComposite}, + {}); + + // Push exposure/gamma and sRGB flag + struct CompositePush { + float exposure; + float gamma; + int outputIsSRGB; + float _pad; + } pc{}; + pc.exposure = std::clamp(this->exposure, 0.2f, 4.0f); + pc.gamma = this->gamma; + pc.outputIsSRGB = (swapChainImageFormat == vk::Format::eR8G8B8A8Srgb || swapChainImageFormat == vk::Format::eB8G8R8A8Srgb) ? 1 : 0; + commandBuffers[currentFrame].pushConstants(*compositePipelineLayout, vk::ShaderStageFlagBits::eFragment, 0, pc); + + // Draw fullscreen triangle + commandBuffers[currentFrame].draw(3, 1, 0, 0); + + commandBuffers[currentFrame].endRendering(); + // Restore depth attachment pointer for subsequent passes + renderingInfo.pDepthAttachment = savedDepthPtr; + } + // PASS 2: RENDER TRANSPARENT OBJECTS TO THE SWAPCHAIN + { + // Ensure depth attachment is bound again for the transparent pass + renderingInfo.pDepthAttachment = &depthAttachment; + colorAttachments[0].imageView = *swapChainImageViews[imageIndex]; + colorAttachments[0].loadOp = vk::AttachmentLoadOp::eLoad; + depthAttachment.loadOp = vk::AttachmentLoadOp::eLoad; + renderingInfo.renderArea = vk::Rect2D({0, 0}, swapChainExtent); + commandBuffers[currentFrame].beginRendering(renderingInfo); + commandBuffers[currentFrame].setViewport(0, viewport); + commandBuffers[currentFrame].setScissor(0, scissor); + + if (!transparentJobs.empty()) { + currentLayout = &pbrTransparentPipelineLayout; + vk::raii::Pipeline* activeTransparentPipeline = nullptr; + + for (const auto& job : transparentJobs) { + vk::raii::Pipeline* desiredPipeline = job.entityRes->cachedIsGlass ? 
&glassGraphicsPipeline : &pbrBlendGraphicsPipeline; + if (desiredPipeline != activeTransparentPipeline) { + commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, **desiredPipeline); + activeTransparentPipeline = desiredPipeline; + } + + std::array buffers = {*job.meshRes->vertexBuffer, *job.entityRes->instanceBuffer}; + std::array offsets = {0, 0}; + commandBuffers[currentFrame].bindVertexBuffers(0, buffers, offsets); + commandBuffers[currentFrame].bindIndexBuffer(*job.meshRes->indexBuffer, 0, vk::IndexType::eUint32); + + vk::DescriptorSet set1 = (transparentDescriptorSets.empty() || IsLoading()) + ? *transparentFallbackDescriptorSets[currentFrame] + : *transparentDescriptorSets[currentFrame]; + commandBuffers[currentFrame].bindDescriptorSets( + vk::PipelineBindPoint::eGraphics, + **currentLayout, + 0, + {*job.entityRes->pbrDescriptorSets[currentFrame], set1}, + {}); + + MaterialProperties pushConstants = job.entityRes->cachedMaterialProps; + if (job.entityRes->cachedIsLiquid) { + pushConstants.transmissionFactor = 0.0f; + } + commandBuffers[currentFrame].pushConstants < MaterialProperties > (**currentLayout, vk::ShaderStageFlagBits::eFragment, 0, { + pushConstants + } + ) + ; + uint32_t instanceCountT = std::max(1u, static_cast(job.meshComp->GetInstanceCount())); + commandBuffers[currentFrame].drawIndexed(job.meshRes->indexCount, instanceCountT, 0, 0, 0); + } + } + // End transparent rendering pass before any layout transitions (even if no transparent draws) + commandBuffers[currentFrame].endRendering(); + } { + // Screenshot and final present transition are handled in rasterization path only + // Ray query path handles these separately + + // Final layout transition for present (rasterization path only) + { + vk::ImageMemoryBarrier2 presentBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eNone, + 
.dstAccessMask = {}, + .oldLayout = vk::ImageLayout::eColorAttachmentOptimal, + .newLayout = vk::ImageLayout::ePresentSrcKHR, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = swapChainImages[imageIndex], + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depToPresentFinal{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &presentBarrier2}; + commandBuffers[currentFrame].pipelineBarrier2(depToPresentFinal); + if (imageIndex < swapChainImageLayouts.size()) + swapChainImageLayouts[imageIndex] = presentBarrier2.newLayout; + } + } + } // skip rasterization when ray query has rendered + + // Render ImGui UI overlay AFTER rasterization/ray query (must always execute regardless of render mode) + // ImGui expects Render() to be called every frame after NewFrame() - skipping it causes hangs + if (imguiSystem && !imguiSystem->IsFrameRendered()) { + // When ray query renders, swapchain is in PRESENT layout with valid content. + // When rasterization renders, swapchain is also in PRESENT layout with valid content. + // Transition to COLOR_ATTACHMENT with loadOp=eLoad to preserve existing pixels for ImGui overlay. + vk::ImageMemoryBarrier2 presentToColor{ + .srcStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe, + .srcAccessMask = vk::AccessFlagBits2::eNone, + .dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead, + .oldLayout = (imageIndex < swapChainImageLayouts.size()) ? 
swapChainImageLayouts[imageIndex] : vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eColorAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = swapChainImages[imageIndex], + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depInfo{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &presentToColor}; + commandBuffers[currentFrame].pipelineBarrier2(depInfo); + if (imageIndex < swapChainImageLayouts.size()) + swapChainImageLayouts[imageIndex] = presentToColor.newLayout; + + // Begin a dedicated render pass for ImGui (UI overlay) + vk::RenderingAttachmentInfo imguiColorAttachment{ + .imageView = *swapChainImageViews[imageIndex], + .imageLayout = vk::ImageLayout::eColorAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eLoad, // Load existing content + .storeOp = vk::AttachmentStoreOp::eStore + }; + vk::RenderingInfo imguiRenderingInfo{ + .renderArea = vk::Rect2D({0, 0}, swapChainExtent), + .layerCount = 1, + .colorAttachmentCount = 1, + .pColorAttachments = &imguiColorAttachment, + .pDepthAttachment = nullptr + }; + commandBuffers[currentFrame].beginRendering(imguiRenderingInfo); + + imguiSystem->Render(commandBuffers[currentFrame], currentFrame); + + commandBuffers[currentFrame].endRendering(); + + // Transition swapchain back to PRESENT layout after ImGui renders + vk::ImageMemoryBarrier2 colorToPresent{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe, + .dstAccessMask = vk::AccessFlagBits2::eNone, + .oldLayout = vk::ImageLayout::eColorAttachmentOptimal, + .newLayout = vk::ImageLayout::ePresentSrcKHR, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = swapChainImages[imageIndex], + .subresourceRange = 
{vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depInfoBack{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &colorToPresent}; + commandBuffers[currentFrame].pipelineBarrier2(depInfoBack); + if (imageIndex < swapChainImageLayouts.size()) + swapChainImageLayouts[imageIndex] = colorToPresent.newLayout; + } + + commandBuffers[currentFrame].end(); + isRecordingCmd.store(false, std::memory_order_relaxed); + + // Submit and present (Synchronization 2) + uint64_t uploadsValueToWait = uploadTimelineLastSubmitted.load(std::memory_order_relaxed); + + // Use acquireSemaphoreIndex for imageAvailable semaphore (same as we used in acquireNextImage) + // Use imageIndex for renderFinished semaphore (matches the image being presented) + + std::array waitInfos = { + vk::SemaphoreSubmitInfo{ + .semaphore = *imageAvailableSemaphores[acquireSemaphoreIndex], + .value = 0, + .stageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .deviceIndex = 0 + }, + vk::SemaphoreSubmitInfo{ + .semaphore = *uploadsTimeline, + .value = uploadsValueToWait, + .stageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .deviceIndex = 0 + } + }; + + vk::CommandBufferSubmitInfo cmdInfo{.commandBuffer = *commandBuffers[currentFrame], .deviceMask = 0}; + vk::SemaphoreSubmitInfo signalInfo{.semaphore = *renderFinishedSemaphores[imageIndex], .value = 0, .stageMask = vk::PipelineStageFlagBits2::eAllGraphics, .deviceIndex = 0}; + vk::SubmitInfo2 submit2{ + .waitSemaphoreInfoCount = static_cast(waitInfos.size()), + .pWaitSemaphoreInfos = waitInfos.data(), + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &cmdInfo, + .signalSemaphoreInfoCount = 1, + .pSignalSemaphoreInfos = &signalInfo + }; + + if (framebufferResized.load(std::memory_order_relaxed)) { + vk::SubmitInfo2 emptySubmit2{}; { + std::lock_guard lock(queueMutex); + graphicsQueue.submit2(emptySubmit2, *inFlightFences[currentFrame]); + } + recreateSwapChain(); + return; + } + + // Update watchdog BEFORE queue 
submit because submit can block waiting for GPU + // This proves frame CPU work is complete even if GPU queue is busy + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); { + std::lock_guard lock(queueMutex); + graphicsQueue.submit2(submit2, *inFlightFences[currentFrame]); + } + + vk::PresentInfoKHR presentInfo{.waitSemaphoreCount = 1, .pWaitSemaphores = &*renderFinishedSemaphores[imageIndex], .swapchainCount = 1, .pSwapchains = &*swapChain, .pImageIndices = &imageIndex}; + vk::Result presentResult = vk::Result::eSuccess; + try { + std::lock_guard lock(queueMutex); + presentResult = presentQueue.presentKHR(presentInfo); + } catch (const vk::OutOfDateKHRError&) { + framebufferResized.store(true, std::memory_order_relaxed); + } + if (presentResult == vk::Result::eSuboptimalKHR || framebufferResized.load(std::memory_order_relaxed)) { + framebufferResized.store(false, std::memory_order_relaxed); + recreateSwapChain(); + } else if (presentResult != vk::Result::eSuccess) { + throw std::runtime_error("Failed to present swap chain image"); + } + + currentFrame = (currentFrame + 1) % MAX_FRAMES_IN_FLIGHT; +} + +// Public toggle APIs for planar reflections (keyboard/UI) +void Renderer::SetPlanarReflectionsEnabled(bool enabled) { + // Flip mode and mark resources dirty so RTs are created/destroyed at the next safe point + enablePlanarReflections = enabled; + reflectionResourcesDirty = true; +} + +void Renderer::TogglePlanarReflections() { + SetPlanarReflectionsEnabled(!enablePlanarReflections); +} diff --git a/attachments/openxr_engine/renderer_resources.cpp b/attachments/openxr_engine/renderer_resources.cpp new file mode 100644 index 00000000..6e5fed66 --- /dev/null +++ b/attachments/openxr_engine/renderer_resources.cpp @@ -0,0 +1,4094 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "mesh_component.h" +#include "model_loader.h" +#include "renderer.h" +#include "transform_component.h" +#include +#include +#include +#include +#include +#include +#include +#include + +// stb_image dependency removed; all GLTF textures are uploaded via memory path from ModelLoader. + +// KTX2 support +#include +#include + +// This file contains resource-related methods from the Renderer class + +// Define shared default PBR texture identifiers (static constants) +const std::string Renderer::SHARED_DEFAULT_ALBEDO_ID = "__shared_default_albedo__"; +const std::string Renderer::SHARED_DEFAULT_NORMAL_ID = "__shared_default_normal__"; +const std::string Renderer::SHARED_DEFAULT_METALLIC_ROUGHNESS_ID = "__shared_default_metallic_roughness__"; +const std::string Renderer::SHARED_DEFAULT_OCCLUSION_ID = "__shared_default_occlusion__"; +const std::string Renderer::SHARED_DEFAULT_EMISSIVE_ID = "__shared_default_emissive__"; +const std::string Renderer::SHARED_BRIGHT_RED_ID = "__shared_bright_red__"; + +// Create depth resources +bool Renderer::createDepthResources() { + try { + // Find depth format + vk::Format depthFormat = findDepthFormat(); + uint32_t layers = xrMode ? 
2 : 1; + + // Use non-pooled createImage for depth as memoryPool doesn't support layers yet + auto [image, memory] = createImage( + swapChainExtent.width, + swapChainExtent.height, + depthFormat, + vk::ImageTiling::eOptimal, + vk::ImageUsageFlagBits::eDepthStencilAttachment, + vk::MemoryPropertyFlagBits::eDeviceLocal, + layers); + + depthImage = std::move(image); + depthImageMemory = std::move(memory); + + // Create depth image view (with layerCount=layers) + depthImageView = createImageView(depthImage, depthFormat, vk::ImageAspectFlagBits::eDepth, 1, layers); + + // Transition depth image layout + transitionImageLayout( + *depthImage, + depthFormat, + vk::ImageLayout::eUndefined, + vk::ImageLayout::eDepthStencilAttachmentOptimal, + 1, + layers); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create depth resources: " << e.what() << std::endl; + return false; + } +} + +// Helper: coerce an sRGB/UNORM variant of a given VkFormat while preserving block type where possible +static vk::Format CoerceFormatSRGB(vk::Format fmt, bool wantSRGB) { + switch (fmt) { + case vk::Format::eR8G8B8A8Unorm: + return wantSRGB ? vk::Format::eR8G8B8A8Srgb : vk::Format::eR8G8B8A8Unorm; + case vk::Format::eR8G8B8A8Srgb: + return wantSRGB ? vk::Format::eR8G8B8A8Srgb : vk::Format::eR8G8B8A8Unorm; + + case vk::Format::eBc1RgbUnormBlock: + return wantSRGB ? vk::Format::eBc1RgbSrgbBlock : vk::Format::eBc1RgbUnormBlock; + case vk::Format::eBc1RgbSrgbBlock: + return wantSRGB ? vk::Format::eBc1RgbSrgbBlock : vk::Format::eBc1RgbUnormBlock; + case vk::Format::eBc1RgbaUnormBlock: + return wantSRGB ? vk::Format::eBc1RgbaSrgbBlock : vk::Format::eBc1RgbaUnormBlock; + case vk::Format::eBc1RgbaSrgbBlock: + return wantSRGB ? vk::Format::eBc1RgbaSrgbBlock : vk::Format::eBc1RgbaUnormBlock; + + case vk::Format::eBc2UnormBlock: + return wantSRGB ? vk::Format::eBc2SrgbBlock : vk::Format::eBc2UnormBlock; + case vk::Format::eBc2SrgbBlock: + return wantSRGB ? 
vk::Format::eBc2SrgbBlock : vk::Format::eBc2UnormBlock; + + case vk::Format::eBc3UnormBlock: + return wantSRGB ? vk::Format::eBc3SrgbBlock : vk::Format::eBc3UnormBlock; + case vk::Format::eBc3SrgbBlock: + return wantSRGB ? vk::Format::eBc3SrgbBlock : vk::Format::eBc3UnormBlock; + + case vk::Format::eBc7UnormBlock: + return wantSRGB ? vk::Format::eBc7SrgbBlock : vk::Format::eBc7UnormBlock; + case vk::Format::eBc7SrgbBlock: + return wantSRGB ? vk::Format::eBc7SrgbBlock : vk::Format::eBc7UnormBlock; + + default: + return fmt; + } +} + +// Create texture image +bool Renderer::createTextureImage(const std::string& texturePath_, TextureResources& resources) { + try { + ensureThreadLocalVulkanInit(); + const std::string textureId = ResolveTextureId(texturePath_); + // Check if texture already exists + { + std::shared_lock texLock(textureResourcesMutex); + auto it = textureResources.find(textureId); + if (it != textureResources.end()) { + // Texture already loaded and cached; leave cache intact and return success + return true; + } + } + + // Resolve on-disk path (may differ from logical ID) + std::string resolvedPath = textureId; + + // Ensure command pool is initialized before any GPU work + if (!*commandPool) { + std::cerr << "createTextureImage: commandPool not initialized yet for '" << textureId << "'" << std::endl; + return false; + } + + // Per-texture de-duplication (serialize loads of the same texture ID only) + { + std::unique_lock lk(textureLoadStateMutex); + while (texturesLoading.contains(textureId)) { + textureLoadStateCv.wait(lk); + } + } + // Double-check cache after the wait + { + std::shared_lock texLock(textureResourcesMutex); + auto it2 = textureResources.find(textureId); + if (it2 != textureResources.end()) { + return true; + } + } + // Mark as loading and ensure we notify on all exit paths + { + std::lock_guard lk(textureLoadStateMutex); + texturesLoading.insert(textureId); + } + auto _loadingGuard = std::unique_ptr>(reinterpret_cast(1), + [this, 
textureId](void*) { + std::lock_guard lk(textureLoadStateMutex); + texturesLoading.erase(textureId); + textureLoadStateCv.notify_all(); + }); + + // Check if this is a KTX2 file + bool isKtx2 = resolvedPath.ends_with(".ktx2"); + + // If it's a KTX2 texture but the path doesn't exist, try common fallback filename variants + if (isKtx2) { + std::filesystem::path origPath(resolvedPath); + if (!std::filesystem::exists(origPath)) { + std::string fname = origPath.filename().string(); + std::string dir = origPath.parent_path().string(); + auto tryCandidate = [&](const std::string& candidateName) -> bool { + std::filesystem::path cand = std::filesystem::path(dir) / candidateName; + if (std::filesystem::exists(cand)) { + std::cout << "Resolved missing texture '" << resolvedPath << "' to existing file '" << cand.string() << "'" << std::endl; + resolvedPath = cand.string(); + return true; + } + return false; + }; + // Known suffix variants near the end of filename before extension + // Examples: *_c.ktx2, *_d.ktx2, *_cm.ktx2, *_diffuse.ktx2, *_basecolor.ktx2, *_albedo.ktx2 + std::vector suffixes = {"_c", "_d", "_cm", "_diffuse", "_basecolor", "_albedo"}; + // If filename matches one known suffix, try others + for (const auto& s : suffixes) { + std::string key = s + ".ktx2"; + if (fname.ends_with(key)) { + std::string prefix = fname.substr(0, fname.size() - key.size()); + for (const auto& alt : suffixes) { + if (alt == s) + continue; + std::string candName = prefix + alt + ".ktx2"; + if (tryCandidate(candName)) { + isKtx2 = true; + break; + } + } + break; // Only replace last suffix occurrence + } + } + } + } + + int texWidth, texHeight, texChannels; + unsigned char* pixels = nullptr; + ktxTexture2* ktxTex = nullptr; + vk::DeviceSize imageSize; + + // Track KTX2 transcoding state across the function scope (BasisU only) + bool wasTranscoded = false; + // Track KTX2 header-provided VkFormat (0 == VK_FORMAT_UNDEFINED) + uint32_t headerVkFormatRaw = 0; + + uint32_t mipLevels = 1; 
+ std::vector copyRegions; + + if (isKtx2) { + // Load KTX2 file + KTX_error_code result = ktxTexture2_CreateFromNamedFile(resolvedPath.c_str(), + KTX_TEXTURE_CREATE_LOAD_IMAGE_DATA_BIT, + &ktxTex); + if (result != KTX_SUCCESS) { + // Retry with sibling suffix variants if file exists but cannot be parsed/opened + std::filesystem::path origPath(resolvedPath); + std::string fname = origPath.filename().string(); + std::string dir = origPath.parent_path().string(); + auto tryLoad = [&](const std::string& candidateName) -> bool { + std::filesystem::path cand = std::filesystem::path(dir) / candidateName; + if (std::filesystem::exists(cand)) { + std::string candStr = cand.string(); + std::cout << "Retrying KTX2 load with sibling candidate '" << candStr << "' for original '" << resolvedPath << "'" << std::endl; + result = ktxTexture2_CreateFromNamedFile(candStr.c_str(), KTX_TEXTURE_CREATE_LOAD_IMAGE_DATA_BIT, &ktxTex); + if (result == KTX_SUCCESS) { + resolvedPath = candStr; // Use the successfully opened candidate + return true; + } + } + return false; + }; + // Known suffix variants near the end of filename before extension + std::vector suffixes = {"_c", "_d", "_cm", "_diffuse", "_basecolor", "_albedo"}; + for (const auto& s : suffixes) { + std::string key = s + ".ktx2"; + if (fname.ends_with(key)) { + std::string prefix = fname.substr(0, fname.size() - key.size()); + bool loaded = false; + for (const auto& alt : suffixes) { + if (alt == s) + continue; + std::string candName = prefix + alt + ".ktx2"; + if (tryLoad(candName)) { + loaded = true; + break; + } + } + if (loaded) + break; + } + } + } + + // Bail out if we still failed to load + if (result != KTX_SUCCESS || ktxTex == nullptr) { + std::cerr << "Failed to load KTX2 texture: " << resolvedPath << " (error: " << result << ")" << std::endl; + return false; + } + + // Read header-provided vkFormat (if already GPU-compressed/transcoded offline) + headerVkFormatRaw = static_cast(ktxTex->vkFormat); + + // Check if the 
texture needs BasisU transcoding; prefer GPU-compressed targets to save VRAM + wasTranscoded = ktxTexture2_NeedsTranscoding(ktxTex); + if (wasTranscoded) { + // Select a compressed target supported by the device (prefer BC7 RGBA, then BC3 RGBA, then BC1 RGB) + auto supportsFormat = [&](vk::Format f) { + auto props = physicalDevice.getFormatProperties(f); + return static_cast(props.optimalTilingFeatures & vk::FormatFeatureFlagBits::eSampledImage); + }; + bool wantSrgb = (Renderer::determineTextureFormat(resolvedPath) == vk::Format::eR8G8B8A8Srgb); + KTX_error_code tcErr = KTX_SUCCESS; + if (supportsFormat(vk::Format::eBc7UnormBlock) || supportsFormat(vk::Format::eBc7SrgbBlock)) { + tcErr = ktxTexture2_TranscodeBasis(ktxTex, KTX_TTF_BC7_RGBA, 0); + } else if (supportsFormat(vk::Format::eBc3UnormBlock) || supportsFormat(vk::Format::eBc3SrgbBlock)) { + tcErr = ktxTexture2_TranscodeBasis(ktxTex, KTX_TTF_BC3_RGBA, 0); + } else if (supportsFormat(vk::Format::eBc1RgbUnormBlock) || supportsFormat(vk::Format::eBc1RgbSrgbBlock)) { + tcErr = ktxTexture2_TranscodeBasis(ktxTex, KTX_TTF_BC1_RGB, 0); + } else { + // Fallback to RGBA32 if no BC formats are supported + tcErr = ktxTexture2_TranscodeBasis(ktxTex, KTX_TTF_RGBA32, 0); + } + if (tcErr != KTX_SUCCESS) { + std::cerr << "Failed to transcode KTX2 BasisU texture: " << resolvedPath << " (error: " << tcErr << ")" << std::endl; + ktxTexture_Destroy(reinterpret_cast(ktxTex)); + return false; + } + } + + texWidth = ktxTex->baseWidth; + texHeight = ktxTex->baseHeight; + texChannels = 4; // logical channels; compressed size handled by libktx + + // Use all levels present in the KTX container + mipLevels = std::max(1u, ktxTex->numLevels); + + // Total data size across all mip levels + imageSize = ktxTexture_GetDataSize(reinterpret_cast(ktxTex)); + + // Build copy regions for every mip level in the file + copyRegions.clear(); + copyRegions.reserve(mipLevels); + for (uint32_t level = 0; level < mipLevels; ++level) { + ktx_size_t 
levelOffset = 0; + ktxTexture_GetImageOffset(reinterpret_cast(ktxTex), level, 0, 0, &levelOffset); + uint32_t w = std::max(1u, static_cast(texWidth) >> level); + uint32_t h = std::max(1u, static_cast(texHeight) >> level); + copyRegions.push_back({ + .bufferOffset = static_cast(levelOffset), + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = { + .aspectMask = vk::ImageAspectFlagBits::eColor, + .mipLevel = level, + .baseArrayLayer = 0, + .layerCount = 1 + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {w, h, 1} + }); + } + } else { + // Non-KTX texture loading via file path is disabled to simplify pipeline. + std::cerr << "Unsupported non-KTX2 texture path: " << textureId << std::endl; + return false; + } + + // Create staging buffer + auto [stagingBuffer, stagingBufferMemory] = createBuffer( + imageSize, + vk::BufferUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + // Copy pixel data to staging buffer + void* data = stagingBufferMemory.mapMemory(0, imageSize); + + if (isKtx2) { + // Copy entire KTX2 image data blob (all mip levels) + const uint8_t* allData = ktxTexture_GetData(reinterpret_cast(ktxTex)); + const ktx_size_t dataSz = ktxTexture_GetDataSize(reinterpret_cast(ktxTex)); + memcpy(data, allData, static_cast(dataSz)); + } else { + // Copy regular image data + memcpy(data, pixels, static_cast(imageSize)); + } + + stagingBufferMemory.unmapMemory(); + + // Determine appropriate texture format + vk::Format textureFormat; + const bool wantSRGB = (Renderer::determineTextureFormat(textureId) == vk::Format::eR8G8B8A8Srgb); + bool alphaMaskedHint = false; + if (isKtx2) { + // If the KTX2 provided a valid VkFormat and we did NOT transcode, respect its block type + // but coerce the sRGB/UNORM variant based on texture usage (baseColor vs data maps) + if (!wasTranscoded) { + VkFormat headerFmt = static_cast(headerVkFormatRaw); + if (headerFmt != VK_FORMAT_UNDEFINED) { + 
textureFormat = CoerceFormatSRGB(static_cast(headerFmt), wantSRGB); + } else { + textureFormat = wantSRGB ? vk::Format::eR8G8B8A8Srgb : vk::Format::eR8G8B8A8Unorm; + } + // Can't easily scan alpha in compressed formats here; leave hint at default false + } else { + // We transcoded; choose a Vulkan format matching the transcode target (we requested BC7/BC3/BC1 or RGBA32 fallback) + // There is no direct query from KTX for chosen VkFormat after transcoding, so infer by capabilities using our preference order. + bool wantSRGB2 = wantSRGB; + if (!!physicalDevice.getFormatProperties(vk::Format::eBc7UnormBlock).optimalTilingFeatures || !!physicalDevice.getFormatProperties(vk::Format::eBc7SrgbBlock).optimalTilingFeatures) { + textureFormat = wantSRGB2 ? vk::Format::eBc7SrgbBlock : vk::Format::eBc7UnormBlock; + } else if (!!physicalDevice.getFormatProperties(vk::Format::eBc3UnormBlock).optimalTilingFeatures || !!physicalDevice.getFormatProperties(vk::Format::eBc3SrgbBlock).optimalTilingFeatures) { + textureFormat = wantSRGB2 ? vk::Format::eBc3SrgbBlock : vk::Format::eBc3UnormBlock; + } else if (!!physicalDevice.getFormatProperties(vk::Format::eBc1RgbUnormBlock).optimalTilingFeatures || !!physicalDevice.getFormatProperties(vk::Format::eBc1RgbSrgbBlock).optimalTilingFeatures) { + textureFormat = wantSRGB2 ? vk::Format::eBc1RgbSrgbBlock : vk::Format::eBc1RgbUnormBlock; + } else { + // Fallback to uncompressed RGBA + textureFormat = wantSRGB2 ? 
vk::Format::eR8G8B8A8Srgb : vk::Format::eR8G8B8A8Unorm; + // We have CPU-visible RGBA data; detect alpha for masked hint + ktx_size_t offsetScan = 0; + ktxTexture_GetImageOffset(reinterpret_cast(ktxTex), 0, 0, 0, &offsetScan); + const uint8_t* rgba = ktxTexture_GetData(reinterpret_cast(ktxTex)) + offsetScan; + size_t pixelCount = static_cast(texWidth) * static_cast(texHeight); + for (size_t i = 0; i < pixelCount; ++i) { + if (rgba[i * 4 + 3] < 250) { + alphaMaskedHint = true; + break; + } + } + } + } + } else { + textureFormat = wantSRGB ? vk::Format::eR8G8B8A8Srgb : vk::Format::eR8G8B8A8Unorm; + } + + // Now that we're done reading libktx data, destroy the KTX texture to avoid leaks + if (isKtx2 && ktxTex) { + ktxTexture_Destroy(reinterpret_cast(ktxTex)); + ktxTex = nullptr; + } + + // Create texture image using memory pool + bool differentFamilies = queueFamilyIndices.graphicsFamily.value() != queueFamilyIndices.transferFamily.value(); + std::vector families; + if (differentFamilies) { + families = {queueFamilyIndices.graphicsFamily.value(), queueFamilyIndices.transferFamily.value()}; + } + // KTX2 mip levels are set above (line 306); mipLevels already reflects what the file contains + // KTX2 files come with pre-generated mips, so we don't need TRANSFER_SRC for blit generation + vk::ImageUsageFlags usageFlags = vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled; + + // Create image with OOM fallback: retry with mipLevels=1 and reduced usage if needed + try { + auto [textureImg, textureImgAllocation] = createImagePooled( + texWidth, + texHeight, + textureFormat, + vk::ImageTiling::eOptimal, + usageFlags, + vk::MemoryPropertyFlagBits::eDeviceLocal, + mipLevels, + 1, + differentFamilies ? 
vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive, + families); + resources.textureImage = std::move(textureImg); + resources.textureImageAllocation = std::move(textureImgAllocation); + } catch (const std::exception& e) { + std::cerr << "Image allocation failed (" << resolvedPath << "): " << e.what() << ". Retrying with mipLevels=1..." << std::endl; + // Retry with a single mip level and no TRANSFER_SRC usage to reduce memory pressure + mipLevels = 1; + usageFlags &= ~vk::ImageUsageFlagBits::eTransferSrc; + auto [textureImg2, textureImgAllocation2] = createImagePooled( + texWidth, + texHeight, + textureFormat, + vk::ImageTiling::eOptimal, + usageFlags, + vk::MemoryPropertyFlagBits::eDeviceLocal, + mipLevels, + 1, + differentFamilies ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive, + families); + resources.textureImage = std::move(textureImg2); + resources.textureImageAllocation = std::move(textureImgAllocation2); + } + + // GPU upload for this texture (copies all regions provided) + uploadImageFromStaging(*stagingBuffer, *resources.textureImage, textureFormat, copyRegions, mipLevels, imageSize); + + // KTX2 files provide their own mip levels; no runtime generation needed + // Store the format and mipLevels for createTextureImageView + resources.format = textureFormat; + resources.mipLevels = mipLevels; + resources.alphaMaskedHint = alphaMaskedHint; + + // Create texture image view + if (!createTextureImageView(resources)) { + return false; + } + + // Create texture sampler + if (!createTextureSampler(resources)) { + return false; + } + + // Add to texture resources map (guarded) + { + std::unique_lock texLock(textureResourcesMutex); + textureResources[textureId] = std::move(resources); + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create texture image: " << e.what() << std::endl; + return false; + } +} + +// Create texture image view +bool Renderer::createTextureImageView(TextureResources& resources) { + 
try { + ensureThreadLocalVulkanInit(); + resources.textureImageView = createImageView( + resources.textureImage, + resources.format, + // Use the stored format instead of hardcoded sRGB + vk::ImageAspectFlagBits::eColor, + resources.mipLevels // Use the stored mipLevels + ); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create texture image view: " << e.what() << std::endl; + return false; + } +} + +// Create shared default PBR textures (to avoid creating hundreds of identical textures) +bool Renderer::createSharedDefaultPBRTextures() { + try { + unsigned char translucentPixel[4] = {128, 128, 128, 128}; // 50% alpha (128/255) + if (!LoadTextureFromMemory(SHARED_DEFAULT_ALBEDO_ID, translucentPixel, 1, 1, 4)) { + std::cerr << "Failed to create shared default albedo texture" << std::endl; + return false; + } + + // Create shared default normal texture (flat normal) + unsigned char normalPixel[4] = {128, 128, 255, 255}; // (0.5, 0.5, 1.0, 1.0) in 0-255 range + if (!LoadTextureFromMemory(SHARED_DEFAULT_NORMAL_ID, normalPixel, 1, 1, 4)) { + std::cerr << "Failed to create shared default normal texture" << std::endl; + return false; + } + + // Create shared metallic-roughness texture (non-metallic, fully rough) + unsigned char metallicRoughnessPixel[4] = {0, 255, 0, 255}; // (unused, roughness=1.0, metallic=0.0, alpha=1.0) + if (!LoadTextureFromMemory(SHARED_DEFAULT_METALLIC_ROUGHNESS_ID, metallicRoughnessPixel, 1, 1, 4)) { + std::cerr << "Failed to create shared default metallic-roughness texture" << std::endl; + return false; + } + + // Create shared default occlusion texture (white - no occlusion) + unsigned char occlusionPixel[4] = {255, 255, 255, 255}; + if (!LoadTextureFromMemory(SHARED_DEFAULT_OCCLUSION_ID, occlusionPixel, 1, 1, 4)) { + std::cerr << "Failed to create shared default occlusion texture" << std::endl; + return false; + } + + // Create shared default emissive texture (black - no emission) + unsigned char emissivePixel[4] 
= {0, 0, 0, 255}; + if (!LoadTextureFromMemory(SHARED_DEFAULT_EMISSIVE_ID, emissivePixel, 1, 1, 4)) { + std::cerr << "Failed to create shared default emissive texture" << std::endl; + return false; + } + + // Create shared bright red texture for ball visibility + unsigned char brightRedPixel[4] = {255, 0, 0, 255}; // Bright red (R=255, G=0, B=0, A=255) + if (!LoadTextureFromMemory(SHARED_BRIGHT_RED_ID, brightRedPixel, 1, 1, 4)) { + std::cerr << "Failed to create shared bright red texture" << std::endl; + return false; + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create shared default PBR textures: " << e.what() << std::endl; + return false; + } +} + +// Create default texture resources (1x1 white texture) +bool Renderer::createDefaultTextureResources() { + try { + // Create a 1x1 white texture + const uint32_t width = 1; + const uint32_t height = 1; + const uint32_t pixelSize = 4; // RGBA + const std::vector pixels = {255, 255, 255, 255}; // White pixel (RGBA) + + // Create staging buffer + vk::DeviceSize imageSize = width * height * pixelSize; + auto [stagingBuffer, stagingBufferMemory] = createBuffer( + imageSize, + vk::BufferUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + // Copy pixel data to staging buffer + void* data = stagingBufferMemory.mapMemory(0, imageSize); + memcpy(data, pixels.data(), static_cast(imageSize)); + stagingBufferMemory.unmapMemory(); + + // Create texture image using memory pool + auto [textureImg, textureImgAllocation] = createImagePooled( + width, + height, + vk::Format::eR8G8B8A8Srgb, + vk::ImageTiling::eOptimal, + vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled, + vk::MemoryPropertyFlagBits::eDeviceLocal, + 1, + 1); + + defaultTextureResources.textureImage = std::move(textureImg); + defaultTextureResources.textureImageAllocation = std::move(textureImgAllocation); + + // Transition image layout for 
copy + transitionImageLayout( + *defaultTextureResources.textureImage, + vk::Format::eR8G8B8A8Srgb, + vk::ImageLayout::eUndefined, + vk::ImageLayout::eTransferDstOptimal); + + // Copy buffer to image + vk::BufferImageCopy region{ + .bufferOffset = 0, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = { + .aspectMask = vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1 + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {width, height, 1} + }; + copyBufferToImage( + *stagingBuffer, + *defaultTextureResources.textureImage, + width, + height, + region); + + // Transition image layout for shader access + transitionImageLayout( + *defaultTextureResources.textureImage, + vk::Format::eR8G8B8A8Srgb, + vk::ImageLayout::eTransferDstOptimal, + vk::ImageLayout::eShaderReadOnlyOptimal); + + // Create texture image view + defaultTextureResources.textureImageView = createImageView( + defaultTextureResources.textureImage, + vk::Format::eR8G8B8A8Srgb, + vk::ImageAspectFlagBits::eColor); + + // Create texture sampler + return createTextureSampler(defaultTextureResources); + } catch (const std::exception& e) { + std::cerr << "Failed to create default texture resources: " << e.what() << std::endl; + return false; + } +} + +// Create texture sampler +bool Renderer::createTextureSampler(TextureResources& resources) { + try { + ensureThreadLocalVulkanInit(); + // Get physical device properties + vk::PhysicalDeviceProperties properties = physicalDevice.getProperties(); + + // Create sampler with mipmapping + anisotropy (clamped to device limit) + float deviceMaxAniso = properties.limits.maxSamplerAnisotropy; + float desiredAniso = std::clamp(samplerMaxAnisotropy, 1.0f, deviceMaxAniso); + float maxLod = resources.mipLevels > 1 ? 
static_cast(resources.mipLevels - 1) : 0.0f; + vk::SamplerCreateInfo samplerInfo{ + .magFilter = vk::Filter::eLinear, + .minFilter = vk::Filter::eLinear, + .mipmapMode = vk::SamplerMipmapMode::eLinear, + .addressModeU = vk::SamplerAddressMode::eRepeat, + .addressModeV = vk::SamplerAddressMode::eRepeat, + .addressModeW = vk::SamplerAddressMode::eRepeat, + .mipLodBias = 0.0f, + .anisotropyEnable = desiredAniso > 1.0f ? VK_TRUE : VK_FALSE, + .maxAnisotropy = desiredAniso, + .compareEnable = VK_FALSE, + .compareOp = vk::CompareOp::eAlways, + .minLod = 0.0f, + .maxLod = maxLod, + .borderColor = vk::BorderColor::eIntOpaqueBlack, + .unnormalizedCoordinates = VK_FALSE + }; + + resources.textureSampler = vk::raii::Sampler(device, samplerInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create texture sampler: " << e.what() << std::endl; + return false; + } +} + +// Load texture from file (public wrapper for createTextureImage) +bool Renderer::LoadTexture(const std::string& texturePath) { + ensureThreadLocalVulkanInit(); + if (texturePath.empty()) { + std::cerr << "LoadTexture: Empty texture path provided" << std::endl; + return false; + } + + // Resolve aliases (canonical ID -> actual key) + const std::string resolvedId = ResolveTextureId(texturePath); + + // Check if texture is already loaded + { + std::shared_lock texLock(textureResourcesMutex); + auto it = textureResources.find(resolvedId); + if (it != textureResources.end()) { + // Texture already loaded + return true; + } + } + + // Create temporary texture resources (unused output; cache will be populated internally) + TextureResources tempResources; + + // Use existing createTextureImage method (it inserts into textureResources on success) if it's a KTX2 path; otherwise fall back to memory path below + bool success = false; + if (resolvedId.ends_with(".ktx2")) { + success = createTextureImage(resolvedId, tempResources); + if (success) + return true; + // Fall through to raw-memory 
path if KTX load failed + } + + if (!success) { + std::cerr << "Failed to load texture: " << texturePath << std::endl; + } + + return success; +} + +// Determine appropriate texture format based on texture type +vk::Format Renderer::determineTextureFormat(const std::string& textureId) { + // Determine sRGB vs Linear in a case-insensitive way + std::string idLower = textureId; + std::ranges::transform(idLower, idLower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); + + // BaseColor/Albedo/Diffuse & SpecGloss RGB should be sRGB for proper gamma correction + if (idLower.find("basecolor") != std::string::npos || + idLower.find("base_color") != std::string::npos || + idLower.find("albedo") != std::string::npos || + idLower.find("diffuse") != std::string::npos || + idLower.find("specgloss") != std::string::npos || + idLower.find("specularglossiness") != std::string::npos || + textureId == Renderer::SHARED_DEFAULT_ALBEDO_ID) { + return vk::Format::eR8G8B8A8Srgb; + } + + // Emissive is color data and should be sampled in sRGB + if (idLower.find("emissive") != std::string::npos || + textureId == Renderer::SHARED_DEFAULT_EMISSIVE_ID) { + return vk::Format::eR8G8B8A8Srgb; + } + + // Shared bright red (ball) is a color texture; ensure sRGB for vivid appearance + if (textureId == Renderer::SHARED_BRIGHT_RED_ID) { + return vk::Format::eR8G8B8A8Srgb; + } + + // All other PBR textures (normal, metallic-roughness, occlusion) should be linear + // because they contain non-color data that shouldn't be gamma corrected + return vk::Format::eR8G8B8A8Unorm; +} + +// Load texture from raw image data in memory +bool Renderer::LoadTextureFromMemory(const std::string& textureId, + const unsigned char* imageData, + int width, + int height, + int channels) { + ensureThreadLocalVulkanInit(); + const std::string resolvedId = ResolveTextureId(textureId); + std::cout << "[LoadTextureFromMemory] start id=" << textureId << " -> resolved=" << resolvedId << " size=" << width << 
"x" << height << " ch=" << channels << std::endl; + if (resolvedId.empty() || !imageData || width <= 0 || height <= 0 || channels <= 0) { + std::cerr << "LoadTextureFromMemory: Invalid parameters" << std::endl; + return false; + } + + // Check if texture is already loaded + { + std::shared_lock texLock(textureResourcesMutex); + auto it = textureResources.find(resolvedId); + if (it != textureResources.end()) { + // Texture already loaded + return true; + } + } + + // Per-texture de-duplication (serialize loads of the same texture ID only) + { + std::unique_lock lk(textureLoadStateMutex); + while (texturesLoading.contains(resolvedId)) { + textureLoadStateCv.wait(lk); + } + } + // Double-check cache after the wait + { + std::shared_lock texLock(textureResourcesMutex); + auto it2 = textureResources.find(resolvedId); + if (it2 != textureResources.end()) { + return true; + } + } + // Mark as loading and ensure we notify on all exit paths + { + std::lock_guard lk(textureLoadStateMutex); + texturesLoading.insert(resolvedId); + } + auto _loadingGuard = std::unique_ptr>(reinterpret_cast(1), + [this, resolvedId](void*) { + std::lock_guard lk(textureLoadStateMutex); + texturesLoading.erase(resolvedId); + textureLoadStateCv.notify_all(); + }); + + try { + TextureResources resources; + + // Calculate image size (ensure 4 channels for RGBA) + int targetChannels = 4; // Always use RGBA for consistency + vk::DeviceSize imageSize = width * height * targetChannels; + + // Create a staging buffer + auto [stagingBuffer, stagingBufferMemory] = createBuffer( + imageSize, + vk::BufferUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + // Copy and convert pixel data to staging buffer + void* data = stagingBufferMemory.mapMemory(0, imageSize); + auto* stagingData = static_cast(data); + + if (channels == 4) { + // Already RGBA, direct copy + memcpy(stagingData, imageData, imageSize); + } else if (channels == 3) { + // RGB 
to RGBA conversion + for (int i = 0; i < width * height; ++i) { + stagingData[i * 4 + 0] = imageData[i * 3 + 0]; // R + stagingData[i * 4 + 1] = imageData[i * 3 + 1]; // G + stagingData[i * 4 + 2] = imageData[i * 3 + 2]; // B + stagingData[i * 4 + 3] = 255; // A + } + } else if (channels == 2) { + // Grayscale + Alpha to RGBA conversion + for (int i = 0; i < width * height; ++i) { + stagingData[i * 4 + 0] = imageData[i * 2 + 0]; // R (grayscale) + stagingData[i * 4 + 1] = imageData[i * 2 + 0]; // G (grayscale) + stagingData[i * 4 + 2] = imageData[i * 2 + 0]; // B (grayscale) + stagingData[i * 4 + 3] = imageData[i * 2 + 1]; // A (alpha) + } + } else if (channels == 1) { + // Grayscale to RGBA conversion + for (int i = 0; i < width * height; ++i) { + stagingData[i * 4 + 0] = imageData[i]; // R + stagingData[i * 4 + 1] = imageData[i]; // G + stagingData[i * 4 + 2] = imageData[i]; // B + stagingData[i * 4 + 3] = 255; // A + } + } else { + std::cerr << "LoadTextureFromMemory: Unsupported channel count: " << channels << std::endl; + stagingBufferMemory.unmapMemory(); + return false; + } + + // Analyze alpha to set alphaMaskedHint (treat as masked if any pixel alpha < ~1.0) + bool alphaMaskedHint = false; + for (int i = 0, n = width * height; i < n; ++i) { + if (stagingData[i * 4 + 3] < 250) { + alphaMaskedHint = true; + break; + } + } + + stagingBufferMemory.unmapMemory(); + + // Determine the appropriate texture format based on the texture type + vk::Format textureFormat = determineTextureFormat(textureId); + + // Create texture image using memory pool (with optional mipmap generation) + bool differentFamilies = queueFamilyIndices.graphicsFamily.value() != queueFamilyIndices.transferFamily.value(); + std::vector families; + if (differentFamilies) { + families = {queueFamilyIndices.graphicsFamily.value(), queueFamilyIndices.transferFamily.value()}; + } + // Decide mip count and usage for memory textures; cap to reduce VRAM pressure + uint32_t mipLevels = 1; + if (width > 
1 && height > 1) { + uint32_t full = static_cast(std::floor(std::log2(std::max(width, height)))) + 1; + mipLevels = std::max(1u, std::min(full, maxAutoGeneratedMipLevels)); + } + vk::ImageUsageFlags usageFlags = vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled; + if (mipLevels > 1) + usageFlags |= vk::ImageUsageFlagBits::eTransferSrc; + + // OOM-resilient allocation + try { + auto [textureImg, textureImgAllocation] = createImagePooled( + width, + height, + textureFormat, + vk::ImageTiling::eOptimal, + usageFlags, + vk::MemoryPropertyFlagBits::eDeviceLocal, + mipLevels, + 1, + differentFamilies ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive, + families); + + resources.textureImage = std::move(textureImg); + resources.textureImageAllocation = std::move(textureImgAllocation); + } catch (const std::exception& e) { + std::cerr << "Image allocation failed (memory texture): " << e.what() << ". Retrying with mipLevels=1..." << std::endl; + mipLevels = 1; + usageFlags &= ~vk::ImageUsageFlagBits::eTransferSrc; + auto [textureImg, textureImgAllocation] = createImagePooled( + width, + height, + textureFormat, + vk::ImageTiling::eOptimal, + usageFlags, + vk::MemoryPropertyFlagBits::eDeviceLocal, + mipLevels, + 1, + differentFamilies ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive, + families); + resources.textureImage = std::move(textureImg); + resources.textureImageAllocation = std::move(textureImgAllocation); + } + + // GPU upload. Copy buffer to image in a single submit. 
+ vk::BufferImageCopy region{ + .bufferOffset = 0, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = { + .aspectMask = vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1 + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {static_cast(width), static_cast(height), 1} + }; + uploadImageFromStaging(*stagingBuffer, *resources.textureImage, textureFormat, region, mipLevels, imageSize); + + // Generate mip chain if requested and format is uncompressed RGBA + if (mipLevels > 1 && (textureFormat == vk::Format::eR8G8B8A8Srgb || textureFormat == vk::Format::eR8G8B8A8Unorm)) { + generateMipmaps(*resources.textureImage, textureFormat, width, height, mipLevels); + } + + // Store the format for createTextureImageView + resources.format = textureFormat; + resources.mipLevels = mipLevels; + resources.alphaMaskedHint = alphaMaskedHint; + + // Use resolvedId as the cache key to avoid duplicates + const std::string& cacheId = resolvedId; + + // Create texture image view + resources.textureImageView = createImageView( + resources.textureImage, + textureFormat, + vk::ImageAspectFlagBits::eColor, + mipLevels); + + // Create texture sampler + if (!createTextureSampler(resources)) { + return false; + } + + // Add to texture resources map (guarded) + { + std::unique_lock texLock(textureResourcesMutex); + textureResources[cacheId] = std::move(resources); + } + + std::cout << "Successfully loaded texture from memory: " << cacheId + << " (" << width << "x" << height << ", " << channels << " channels)" << std::endl; + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to load texture from memory: " << e.what() << std::endl; + return false; + } +} + +// Create mesh resources +bool Renderer::createMeshResources(MeshComponent* meshComponent, bool deferUpload) { + ensureThreadLocalVulkanInit(); + try { + // If resources already exist, no need to recreate them. 
        // Fast path: resources for this mesh already exist in the cache.
        auto it = meshResources.find(meshComponent);
        if (it != meshResources.end()) {
            // If we previously created this mesh with deferred uploads, but the caller now
            // wants an immediate/ready mesh (e.g., during loading or before AS build),
            // flush the pending staging copies right here.
            if (!deferUpload) {
                MeshResources& res = it->second;
                // A positive *SizeBytes together with live staging AND device buffers marks
                // a deferred upload that has not been flushed yet.
                if ((res.vertexBufferSizeBytes > 0 && !!*res.stagingVertexBuffer && !!*res.vertexBuffer) ||
                    (res.indexBufferSizeBytes > 0 && !!*res.stagingIndexBuffer && !!*res.indexBuffer)) {
                    if (res.vertexBufferSizeBytes > 0 && !!*res.stagingVertexBuffer && !!*res.vertexBuffer) {
                        copyBuffer(res.stagingVertexBuffer, res.vertexBuffer, res.vertexBufferSizeBytes);
                        // Drop the staging buffer/memory and zero the size so this pending
                        // upload cannot be flushed twice.
                        res.stagingVertexBuffer = vk::raii::Buffer(nullptr);
                        res.stagingVertexBufferMemory = vk::raii::DeviceMemory(nullptr);
                        res.vertexBufferSizeBytes = 0;
                    }
                    if (res.indexBufferSizeBytes > 0 && !!*res.stagingIndexBuffer && !!*res.indexBuffer) {
                        copyBuffer(res.stagingIndexBuffer, res.indexBuffer, res.indexBufferSizeBytes);
                        res.stagingIndexBuffer = vk::raii::Buffer(nullptr);
                        res.stagingIndexBufferMemory = vk::raii::DeviceMemory(nullptr);
                        res.indexBufferSizeBytes = 0;
                    }
                }
            }
            return true;
        }

        // Get mesh data
        const auto& vertices = meshComponent->GetVertices();
        const auto& indices = meshComponent->GetIndices();

        if (vertices.empty() || indices.empty()) {
            std::cerr << "Mesh has no vertices or indices" << std::endl;
            return false;
        }

        // --- 1. Create and fill per-mesh staging buffers on the host ---
        vk::DeviceSize vertexBufferSize = sizeof(vertices[0]) * vertices.size();
        auto [stagingVertexBuffer, stagingVertexBufferMemory] = createBuffer(
            vertexBufferSize,
            vk::BufferUsageFlagBits::eTransferSrc,
            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);

        void* vertexData = stagingVertexBufferMemory.mapMemory(0, vertexBufferSize);
        // NOTE(review): static_cast is missing its template argument (likely
        // static_cast<size_t>) — appears lost in transcription; restore from VCS.
        std::memcpy(vertexData, vertices.data(), static_cast(vertexBufferSize));
        stagingVertexBufferMemory.unmapMemory();

        vk::DeviceSize indexBufferSize = sizeof(indices[0]) * indices.size();
        auto [stagingIndexBuffer, stagingIndexBufferMemory] = createBuffer(
            indexBufferSize,
            vk::BufferUsageFlagBits::eTransferSrc,
            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);

        void* indexData = stagingIndexBufferMemory.mapMemory(0, indexBufferSize);
        // NOTE(review): same transcription issue as above (static_cast<size_t>?).
        std::memcpy(indexData, indices.data(), static_cast(indexBufferSize));
        stagingIndexBufferMemory.unmapMemory();

        // --- 2. Create device-local vertex and index buffers via the memory pool ---
        // Add ray tracing flags: eShaderDeviceAddress for vkGetBufferDeviceAddress and
        // eAccelerationStructureBuildInputReadOnlyKHR for acceleration structure building
        auto [vertexBuffer, vertexBufferAllocation] = createBufferPooled(
            vertexBufferSize,
            vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eVertexBuffer |
            vk::BufferUsageFlagBits::eShaderDeviceAddress | vk::BufferUsageFlagBits::eAccelerationStructureBuildInputReadOnlyKHR,
            vk::MemoryPropertyFlagBits::eDeviceLocal);

        auto [indexBuffer, indexBufferAllocation] = createBufferPooled(
            indexBufferSize,
            vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eIndexBuffer |
            vk::BufferUsageFlagBits::eShaderDeviceAddress | vk::BufferUsageFlagBits::eAccelerationStructureBuildInputReadOnlyKHR,
            vk::MemoryPropertyFlagBits::eDeviceLocal);

        // --- 3. Either copy now (legacy path) or defer copies for batched submission ---
        MeshResources resources;
        resources.vertexBuffer = std::move(vertexBuffer);
        resources.vertexBufferAllocation = std::move(vertexBufferAllocation);
        resources.indexBuffer = std::move(indexBuffer);
        resources.indexBufferAllocation = std::move(indexBufferAllocation);
        // NOTE(review): static_cast template argument lost in transcription (likely
        // static_cast<uint32_t>).
        resources.indexCount = static_cast(indices.size());

        if (deferUpload) {
            // Keep staging buffers alive and record their sizes; copies will be
            // performed later by preAllocateEntityResourcesBatch().
            resources.stagingVertexBuffer = std::move(stagingVertexBuffer);
            resources.stagingVertexBufferMemory = std::move(stagingVertexBufferMemory);
            resources.vertexBufferSizeBytes = vertexBufferSize;

            resources.stagingIndexBuffer = std::move(stagingIndexBuffer);
            resources.stagingIndexBufferMemory = std::move(stagingIndexBufferMemory);
            resources.indexBufferSizeBytes = indexBufferSize;
        } else {
            // Immediate upload path used by preAllocateEntityResources() and other
            // small-object callers. This preserves existing behaviour.
            copyBuffer(stagingVertexBuffer, resources.vertexBuffer, vertexBufferSize);
            copyBuffer(stagingIndexBuffer, resources.indexBuffer, indexBufferSize);
            // staging* buffers are RAII objects and will be destroyed on scope exit.
        }

        // Add to mesh resources map
        meshResources[meshComponent] = std::move(resources);

        return true;
    } catch (const std::exception& e) {
        std::cerr << "Failed to create mesh resources: " << e.what() << std::endl;
        return false;
    }
}

// Create uniform buffers
// Creates (or reuses) the per-entity EntityResources: one host-visible/coherent
// uniform buffer per frame in flight, per-frame descriptor-write tracking flags,
// and — when the entity has a mesh component — a host-visible instance buffer.
// Returns false if any allocation throws.
bool Renderer::createUniformBuffers(Entity* entity) {
    ensureThreadLocalVulkanInit();
    try {
        // Kick watchdog periodically during heavy buffer creation (if called from a long loop)
        // NOTE(review): function-local static counter is unsynchronized — presumably
        // this is only called from one loading thread at a time; confirm.
        static uint32_t bufferWatchdogCounter = 0;
        if (++bufferWatchdogCounter % 50 == 0) {
            lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
        }

        // Check if entity resources already exist
        auto it = entityResources.find(entity);
        if (it != entityResources.end()) {
            return true;
        }

        // Create entity resources
        EntityResources resources;

        // Create uniform buffers using memory pool
        vk::DeviceSize bufferSize = sizeof(UniformBufferObject);
        uint32_t numUbos = MAX_FRAMES_IN_FLIGHT; // Multiview uses one UBO per frame for both eyes
        for (size_t i = 0; i < numUbos; i++) {
            auto [buffer, bufferAllocation] = createBufferPooled(
                bufferSize,
                vk::BufferUsageFlagBits::eUniformBuffer,
                vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);

            // Use the memory pool's mapped pointer if available
            void* mappedMemory = bufferAllocation->mappedPtr;
            if (!mappedMemory) {
                // Warn but continue: a null mapped pointer is still stored below.
                std::cerr << "Warning: Uniform buffer allocation is not mapped" << std::endl;
            }

            resources.uniformBuffers.emplace_back(std::move(buffer));
            resources.uniformBufferAllocations.emplace_back(std::move(bufferAllocation));
            resources.uniformBuffersMapped.emplace_back(mappedMemory);
        }

        // Initialize descriptor initialization tracking flags to MAX_FRAMES_IN_FLIGHT
        resources.pbrUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
        resources.basicUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
        resources.pbrImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
        resources.basicImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
        resources.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false);

        // Create instance buffer for all entities (shaders always expect instance data)
        // NOTE(review): GetComponent() is missing its template argument (likely
        // GetComponent<MeshComponent>()) — appears lost in transcription.
        auto* meshComponent = entity->GetComponent();
        if (meshComponent) {
            // NOTE(review): std::vector element type lost in transcription (likely
            // std::vector<InstanceData>).
            std::vector instanceData;

            if (meshComponent->GetInstanceCount() > 0) {
                // Use existing instance data from GLTF loading (whether 1 or many instances)
                instanceData = meshComponent->GetInstances();
            } else {
                // Create single instance data using IDENTITY matrix to avoid double-transform with UBO.model
                InstanceData singleInstance;
                singleInstance.setModelMatrix(glm::mat4(1.0f));
                instanceData = {singleInstance};
            }

            vk::DeviceSize instanceBufferSize = sizeof(InstanceData) * instanceData.size();

            auto [instanceBuffer, instanceBufferAllocation] = createBufferPooled(
                instanceBufferSize,
                vk::BufferUsageFlagBits::eVertexBuffer,
                vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);

            // Copy instance data to buffer
            void* instanceMappedMemory = instanceBufferAllocation->mappedPtr;
            if (instanceMappedMemory) {
                std::memcpy(instanceMappedMemory, instanceData.data(), instanceBufferSize);
            } else {
                std::cerr << "Warning: Instance buffer allocation is not mapped" << std::endl;
            }

            resources.instanceBuffer = std::move(instanceBuffer);
            resources.instanceBufferAllocation = std::move(instanceBufferAllocation);
            resources.instanceBufferMapped = instanceMappedMemory;
        }

        // Add to entity resources map
        entityResources[entity] = std::move(resources);

        return true;
    } catch (const std::exception& e) {
        std::cerr << "Failed to create uniform buffers: " << e.what() << std::endl;
        return false;
    }
}

// Create descriptor pool
bool Renderer::createDescriptorPool() {
    try {
        // Calculate pool sizes for all Bistro materials plus additional entities
        // The Bistro model creates many more entities than
        // initially expected
        // Each entity needs descriptor sets for both basic and PBR pipelines
        // PBR pipeline needs 7 descriptors per set (1 UBO + 5 PBR textures + 1 shadow map array with 16 shadow maps)
        // Basic pipeline needs 2 descriptors per set (1 UBO + 1 texture)
        const uint32_t maxEntities = 20000; // Increased to 20k entities to handle large scenes like Bistro reliably
        const uint32_t maxDescriptorSets = MAX_FRAMES_IN_FLIGHT * maxEntities * 2; // 2 pipeline types per entity

        // Calculate descriptor counts
        // UBO descriptors: 1 per descriptor set
        const uint32_t uboDescriptors = maxDescriptorSets;
        // Texture descriptors: Basic pipeline uses 1, PBR uses 21 (5 PBR textures + 16 shadow maps)
        // Allocate for worst case: all entities using PBR (21 texture descriptors each)
        const uint32_t textureDescriptors = MAX_FRAMES_IN_FLIGHT * maxEntities * 21;
        // Storage buffer descriptors: PBR pipeline uses multiple storage buffers per descriptor set.
        // Storage buffers used per PBR descriptor set:
        //  - Binding 6: light storage buffer
        //  - Binding 7: Forward+ tile headers buffer
        //  - Binding 8: Forward+ tile indices buffer
        //  - Binding 9: Fragment debug output buffer (optional)
        //  - Binding 12: Ray-query geometry info buffer (for raster ray-query shadows)
        //  - Binding 13: Ray-query material buffer (for raster ray-query shadows)
        const uint32_t storageBufferDescriptors = MAX_FRAMES_IN_FLIGHT * maxEntities * 6u;

        // Acceleration structure descriptors: Ray query needs 1 TLAS descriptor per frame
        const uint32_t accelerationStructureDescriptors = MAX_FRAMES_IN_FLIGHT;

        // Storage image descriptors: Ray query needs 1 output image descriptor per frame
        const uint32_t storageImageDescriptors = MAX_FRAMES_IN_FLIGHT;

        // Reserve extra combined image sampler capacity for Ray Query binding 6 (baseColor texture array)
        const uint32_t rqTexDescriptors = MAX_FRAMES_IN_FLIGHT * RQ_MAX_TEX;
        // NOTE(review): std::array template arguments appear lost in transcription
        // (likely std::array<vk::DescriptorPoolSize, 5>); CTAD may also cover this.
        std::array poolSizes = {
            vk::DescriptorPoolSize{
                .type = vk::DescriptorType::eUniformBuffer,
                .descriptorCount = uboDescriptors
            },
            vk::DescriptorPoolSize{
                .type = vk::DescriptorType::eCombinedImageSampler,
                .descriptorCount = textureDescriptors + rqTexDescriptors
            },
            vk::DescriptorPoolSize{
                .type = vk::DescriptorType::eStorageBuffer,
                .descriptorCount = storageBufferDescriptors
            },
            vk::DescriptorPoolSize{
                .type = vk::DescriptorType::eAccelerationStructureKHR,
                .descriptorCount = accelerationStructureDescriptors
            },
            vk::DescriptorPoolSize{
                .type = vk::DescriptorType::eStorageImage,
                .descriptorCount = storageImageDescriptors
            }
        };

        // Create descriptor pool
        // eFreeDescriptorSet allows per-entity sets to be freed individually;
        // eUpdateAfterBind is added only when descriptor indexing was enabled.
        vk::DescriptorPoolCreateFlags poolFlags = vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet;
        if (descriptorIndexingEnabled) {
            poolFlags |= vk::DescriptorPoolCreateFlagBits::eUpdateAfterBind;
        }
        vk::DescriptorPoolCreateInfo poolInfo{
            .flags = poolFlags,
            .maxSets = maxDescriptorSets,
            // NOTE(review): static_cast template argument lost in transcription
            // (likely static_cast<uint32_t>).
            .poolSizeCount = static_cast(poolSizes.size()),
            .pPoolSizes = poolSizes.data()
        };

        descriptorPool = vk::raii::DescriptorPool(device, poolInfo);
        return true;
    } catch (const std::exception& e) {
        std::cerr << "Failed to create descriptor pool: " << e.what() << std::endl;
        return false;
    }
}

// Create descriptor sets
// Convenience overload: looks up the entity's EntityResources and forwards to
// the main overload below. Returns false when the entity has no resources yet.
bool Renderer::createDescriptorSets(Entity* entity, const std::string& texturePath, bool usePBR) {
    auto entityIt = entityResources.find(entity);
    if (entityIt == entityResources.end())
        return false;
    return createDescriptorSets(entity, entityIt->second, texturePath, usePBR);
}

// Allocates (once) and writes the basic or PBR descriptor sets for an entity.
// Only the current frame's set is (re)written on each call.
bool Renderer::createDescriptorSets(Entity* entity, EntityResources& res, const std::string& texturePath, bool usePBR) {
    // Kick watchdog periodically during heavy descriptor creation (if called from a long loop)
    // NOTE(review): unsynchronized function-local static, as in createUniformBuffers.
    static uint32_t descWatchdogCounter = 0;
    if (++descWatchdogCounter % 50 == 0) {
        lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
    }

    // Resolve alias before taking
    // the shared lock to avoid nested shared_lock on the same mutex
    const std::string resolvedTexturePath = ResolveTextureId(texturePath);
    try {
        vk::DescriptorSetLayout selectedLayout = usePBR ? *pbrDescriptorSetLayout : *descriptorSetLayout;
        uint32_t numSets = MAX_FRAMES_IN_FLIGHT; // Multiview uses one set per frame for both eyes
        // NOTE(review): std::vector element type lost in transcription (likely
        // std::vector<vk::DescriptorSetLayout>); CTAD may also cover this.
        std::vector layouts(numSets, selectedLayout);
        vk::DescriptorSetAllocateInfo allocInfo{.descriptorPool = *descriptorPool, .descriptorSetCount = numSets, .pSetLayouts = layouts.data()};

        auto& targetDescriptorSets = usePBR ? res.pbrDescriptorSets : res.basicDescriptorSets;
        if (targetDescriptorSets.empty()) {
            std::lock_guard lk(descriptorMutex);
            // Allocate into a temporary owning container, then move the individual RAII sets into our vector.
            // (Avoid assigning `vk::raii::DescriptorSets` directly into `std::vector`.)
            auto sets = vk::raii::DescriptorSets(device, allocInfo);
            targetDescriptorSets.clear();
            targetDescriptorSets.reserve(sets.size());
            for (auto& s : sets) {
                targetDescriptorSets.emplace_back(std::move(s));
            }
        }

        // Checking validity prevents SIGSEGV crash when Vulkan tries to access invalid handles.
        if (targetDescriptorSets.empty() || targetDescriptorSets.size() < numSets) {
            std::cerr << "ERROR: Descriptor set allocation failed for entity " << entity->GetName()
                      << " (usePBR=" << usePBR << "). Descriptor pool may be exhausted." << std::endl;
            return false;
        }

        // Only initialize the current frame's descriptor set
        // In multiview, both eyes share the same descriptor set for the current frame
        {
            // NOTE(review): static_cast template argument lost in transcription
            // (likely static_cast<size_t>).
            size_t i = static_cast(currentFrame);
            // Validate descriptor set handle before dereferencing to prevent crash
            vk::DescriptorSet handleCheck = *targetDescriptorSets[i];
            if (handleCheck == vk::DescriptorSet{}) {
                std::cerr << "ERROR: Invalid descriptor set handle for entity " << entity->GetName()
                          << " frame " << i << " (usePBR=" << usePBR << ")" << std::endl;
                return false;
            }
            vk::DescriptorBufferInfo bufferInfo{.buffer = *res.uniformBuffers[i], .range = sizeof(UniformBufferObject)};

            if (usePBR) {
                // Build descriptor writes dynamically to avoid writing unused bindings
                // NOTE(review): container element types lost in transcription (likely
                // std::vector<vk::WriteDescriptorSet> and
                // std::array<vk::DescriptorImageInfo, 5>).
                std::vector descriptorWrites;
                std::array imageInfos;
                // Keep additional descriptor infos alive until updateDescriptorSets completes.
                // (descriptorWrites stores raw pointers into these stack locals.)
                vk::DescriptorImageInfo reflInfo;
                vk::WriteDescriptorSetAccelerationStructureKHR tlasInfo{};
                vk::AccelerationStructureKHR tlasHandleValue{};
                vk::DescriptorBufferInfo lightBufferInfo;
                vk::DescriptorBufferInfo headersInfo;
                vk::DescriptorBufferInfo indicesInfo;

                // Binding 0: per-frame UBO.
                descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo});

                // NOTE(review): GetComponent() template argument lost in transcription.
                auto meshComponent = entity->GetComponent();
                std::vector pbrTexturePaths;
                {
                    // Each PBR slot falls back to a shared default texture ID when the
                    // mesh does not provide one; baseColor additionally falls back to
                    // the legacy single-texture path.
                    const std::string legacyPath = (meshComponent ? meshComponent->GetTexturePath() : std::string());
                    const std::string baseColorPath = (meshComponent && !meshComponent->GetBaseColorTexturePath().empty()) ? meshComponent->GetBaseColorTexturePath() : (!legacyPath.empty() ? legacyPath : SHARED_DEFAULT_ALBEDO_ID);
                    const std::string mrPath = (meshComponent && !meshComponent->GetMetallicRoughnessTexturePath().empty()) ? meshComponent->GetMetallicRoughnessTexturePath() : SHARED_DEFAULT_METALLIC_ROUGHNESS_ID;
                    const std::string normalPath = (meshComponent && !meshComponent->GetNormalTexturePath().empty()) ? meshComponent->GetNormalTexturePath() : SHARED_DEFAULT_NORMAL_ID;
                    const std::string occlusionPath = (meshComponent && !meshComponent->GetOcclusionTexturePath().empty()) ? meshComponent->GetOcclusionTexturePath() : SHARED_DEFAULT_OCCLUSION_ID;
                    const std::string emissivePath = (meshComponent && !meshComponent->GetEmissiveTexturePath().empty()) ? meshComponent->GetEmissiveTexturePath() : SHARED_DEFAULT_EMISSIVE_ID;

                    pbrTexturePaths = {baseColorPath, mrPath, normalPath, occlusionPath, emissivePath};
                }

                // Bindings 1..5: the five PBR textures, falling back to the default
                // texture when the resolved ID is not in the cache.
                for (int j = 0; j < 5; j++) {
                    const auto resolvedBindingPath = ResolveTextureId(pbrTexturePaths[j]);
                    vk::Sampler samplerHandle{};
                    vk::ImageView viewHandle{};
                    {
                        // Copy raw handles out under the shared lock, then release it
                        // before recording the write.
                        std::shared_lock lock(textureResourcesMutex);
                        auto textureIt = textureResources.find(resolvedBindingPath);
                        TextureResources* texRes = (textureIt != textureResources.end()) ?
                            &textureIt->second : &defaultTextureResources;
                        samplerHandle = *texRes->textureSampler;
                        viewHandle = *texRes->textureImageView;
                    }
                    imageInfos[j] = {.sampler = samplerHandle, .imageView = viewHandle, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal};
                    // NOTE(review): static_cast template argument lost in transcription
                    // (likely static_cast<uint32_t>).
                    descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = static_cast(j + 1), .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfos[j]});
                }

                // Binding 6: light storage buffer for the current frame.
                lightBufferInfo = vk::DescriptorBufferInfo{.buffer = *lightStorageBuffers[i % MAX_FRAMES_IN_FLIGHT].buffer, .range = VK_WHOLE_SIZE};
                descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = 6, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &lightBufferInfo});

                // Ensure Forward+ per-frame array exists
                if (forwardPlusPerFrame.empty()) {
                    forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT);
                }

                // Ensure tile headers buffer exists (binding 7) - create minimal dummy if needed
                if ((i % MAX_FRAMES_IN_FLIGHT) < forwardPlusPerFrame.size()) {
                    auto& f = forwardPlusPerFrame[i % MAX_FRAMES_IN_FLIGHT];
                    if (!*f.tileHeaders) {
                        vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Single TileHeader {offset, count, pad0, pad1}
                        auto [buf, alloc] = createBufferPooled(minSize,
                            vk::BufferUsageFlagBits::eStorageBuffer,
                            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
                        f.tileHeaders = std::move(buf);
                        f.tileHeadersAlloc = std::move(alloc);
                        if (!!f.tileHeadersAlloc && f.tileHeadersAlloc->mappedPtr) {
                            std::memset(f.tileHeadersAlloc->mappedPtr, 0, minSize);
                        }
                    }
                    headersInfo = vk::DescriptorBufferInfo{.buffer = *f.tileHeaders, .offset = 0, .range = VK_WHOLE_SIZE};
                }

                // Ensure tile light indices buffer exists (binding 8) - create minimal dummy if needed
                if ((i % MAX_FRAMES_IN_FLIGHT) < forwardPlusPerFrame.size()) {
                    auto& f = forwardPlusPerFrame[i % MAX_FRAMES_IN_FLIGHT];
                    if (!*f.tileLightIndices) {
                        vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Minimal array of 4 uints
                        auto [buf, alloc] = createBufferPooled(minSize,
                            vk::BufferUsageFlagBits::eStorageBuffer,
                            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
                        f.tileLightIndices = std::move(buf);
                        f.tileLightIndicesAlloc = std::move(alloc);
                        if (!!f.tileLightIndicesAlloc && f.tileLightIndicesAlloc->mappedPtr) {
                            std::memset(f.tileLightIndicesAlloc->mappedPtr, 0, minSize);
                        }
                    }
                    indicesInfo = vk::DescriptorBufferInfo{.buffer = *f.tileLightIndices, .offset = 0, .range = VK_WHOLE_SIZE};
                }

                // Now both headersInfo and indicesInfo have valid buffers (never nullptr)
                descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = 7, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &headersInfo});
                descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = 8, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &indicesInfo});

                // Binding 10: reflection sampler (planar reflections)
                // Always bind a safe fallback (default texture) so the descriptor is valid.
                reflInfo = vk::DescriptorImageInfo{
                    .sampler = *defaultTextureResources.textureSampler,
                    .imageView = *defaultTextureResources.textureImageView,
                    .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal
                };
                descriptorWrites.push_back({
                    .dstSet = *targetDescriptorSets[i],
                    .dstBinding = 10,
                    .descriptorCount = 1,
                    .descriptorType = vk::DescriptorType::eCombinedImageSampler,
                    .pImageInfo = &reflInfo
                });

                // Binding 11: TLAS (ray-query shadows in raster fragment shader)
                // The PBR pipeline layout always declares this binding; it must be written before any draw.
                // Bind the current TLAS when AS is enabled.
                if (accelerationStructureEnabled) {
                    vk::AccelerationStructureKHR h = *tlasStructure.handle;
                    if (!!h)
                        tlasHandleValue = h;
                }
                // NOTE(review): when acceleration structures are disabled,
                // tlasHandleValue stays null; writing a null AS handle is only valid
                // with the nullDescriptor feature (VK_EXT_robustness2) — confirm it is
                // enabled on the device.
                tlasInfo.accelerationStructureCount = 1;
                tlasInfo.pAccelerationStructures = &tlasHandleValue;
                vk::WriteDescriptorSet tlasWrite{};
                tlasWrite.dstSet = *targetDescriptorSets[i];
                tlasWrite.dstBinding = 11;
                tlasWrite.dstArrayElement = 0;
                tlasWrite.descriptorCount = 1;
                tlasWrite.descriptorType = vk::DescriptorType::eAccelerationStructureKHR;
                tlasWrite.pNext = &tlasInfo;
                descriptorWrites.push_back(tlasWrite);
                {
                    // Serialize vkUpdateDescriptorSets with other descriptor writers.
                    std::lock_guard lk(descriptorMutex);
                    device.updateDescriptorSets(descriptorWrites, {});
                }
            } else {
                // Basic Pipeline
                // ... (this part remains the same)
                vk::Sampler samplerHandle{};
                vk::ImageView viewHandle{};
                {
                    std::shared_lock lock(textureResourcesMutex);
                    auto textureIt = textureResources.find(resolvedTexturePath);
                    TextureResources* texRes = (textureIt != textureResources.end()) ? &textureIt->second : &defaultTextureResources;
                    samplerHandle = *texRes->textureSampler;
                    viewHandle = *texRes->textureImageView;
                }
                vk::DescriptorImageInfo imageInfo{.sampler = samplerHandle, .imageView = viewHandle, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal};
                // NOTE(review): std::array template arguments lost in transcription
                // (likely std::array<vk::WriteDescriptorSet, 2>).
                std::array descriptorWrites = {
                    vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[i], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo},
                    vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[i], .dstBinding = 1, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfo}
                };
                {
                    std::lock_guard lk(descriptorMutex);
                    device.updateDescriptorSets(descriptorWrites, {});
                }
            }
        }
        return true;
    } catch (const std::exception& e) {
        std::cerr << "Failed to create descriptor sets for " << entity->GetName() << ": " << e.what() << std::endl;
        return false;
    }
}

// Pre-allocate all Vulkan resources for an
// entity during scene loading
// Creates mesh buffers, uniform buffers, and BOTH basic and PBR descriptor sets
// for one entity. Uses the immediate (non-deferred) mesh upload path.
bool Renderer::preAllocateEntityResources(Entity* entity) {
    try {
        // Get the mesh component
        // NOTE(review): GetComponent() template argument lost in transcription
        // (likely GetComponent<MeshComponent>()).
        auto meshComponent = entity->GetComponent();
        if (!meshComponent) {
            std::cerr << "Entity " << entity->GetName() << " has no mesh component" << std::endl;
            return false;
        }

        // Ensure local AABB is available for debug/probes
        meshComponent->RecomputeLocalAABB();

        // 1. Create mesh resources (vertex/index buffers)
        if (!createMeshResources(meshComponent)) {
            std::cerr << "Failed to create mesh resources for entity: " << entity->GetName() << std::endl;
            return false;
        }

        // 2. Create uniform buffers
        if (!createUniformBuffers(entity)) {
            std::cerr << "Failed to create uniform buffers for entity: " << entity->GetName() << std::endl;
            return false;
        }

        // Initialize per-frame UBO and image binding write flags
        // (reset even when createUniformBuffers reused existing resources).
        {
            auto it = entityResources.find(entity);
            if (it != entityResources.end()) {
                it->second.pbrUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
                it->second.basicUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
                it->second.pbrImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
                it->second.basicImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
                it->second.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
            }
        }

        // 3. Pre-allocate BOTH basic and PBR descriptor sets
        std::string texturePath = meshComponent->GetTexturePath();
        // Fallback: if legacy texturePath is empty, use PBR baseColor texture
        if (texturePath.empty()) {
            const std::string& baseColor = meshComponent->GetBaseColorTexturePath();
            if (!baseColor.empty()) {
                texturePath = baseColor;
            }
        }

        // Create basic descriptor sets
        if (!createDescriptorSets(entity, texturePath, false)) {
            std::cerr << "Failed to create basic descriptor sets for entity: " << entity->GetName() << std::endl;
            return false;
        }

        // Create PBR descriptor sets
        if (!createDescriptorSets(entity, texturePath, true)) {
            std::cerr << "Failed to create PBR descriptor sets for entity: " << entity->GetName() << std::endl;
            return false;
        }
        return true;
    } catch (const std::exception& e) {
        std::cerr << "Failed to pre-allocate resources for entity " << entity->GetName() << ": " << e.what() << std::endl;
        return false;
    }
}

// Pre-allocate Vulkan resources for a batch of entities, batching mesh uploads
// NOTE(review): std::vector element type lost in transcription (likely
// const std::vector<Entity*>&).
bool Renderer::preAllocateEntityResourcesBatch(const std::vector& entities) {
    watchdogProgressLabel.store("Batch: ensureThreadLocalVulkanInit", std::memory_order_relaxed);
    watchdogProgressIndex.store(0, std::memory_order_relaxed);
    ensureThreadLocalVulkanInit();
    try {
        // --- 1. For all entities, create mesh resources with deferred uploads ---
        // Then, during initial loading (and while an AS build is pending), flush the queued
        // uploads immediately in a single batched submit (much faster than per-mesh submits).
        watchdogProgressLabel.store("Batch: createMeshResources loop", std::memory_order_relaxed);
        // NOTE(review): std::vector element type lost in transcription (likely
        // std::vector<MeshComponent*>).
        std::vector meshesNeedingUpload;
        meshesNeedingUpload.reserve(entities.size());
        const bool flushUploadsNow = IsLoading() || asBuildRequested.load(std::memory_order_relaxed);

        uint32_t processedMeshes = 0;
        uint32_t meshLoopIndex = 0;
        for (Entity* entity : entities) {
            watchdogProgressIndex.store(meshLoopIndex++, std::memory_order_relaxed);

            if (!entity) {
                continue;
            }

            // Kick watchdog periodically during heavy mesh resource creation
            if (++processedMeshes % 10 == 0) {
                lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
            }

            // NOTE(review): GetComponent() template argument lost in transcription.
            auto meshComponent = entity->GetComponent();
            if (!meshComponent) {
                continue;
            }

            // Ensure local AABB is available for debug/probes
            watchdogProgressLabel.store("Batch: RecomputeLocalAABB", std::memory_order_relaxed);
            meshComponent->RecomputeLocalAABB();

            watchdogProgressLabel.store("Batch: createMeshResources", std::memory_order_relaxed);
            if (!createMeshResources(meshComponent, /*deferUpload=*/true)) {
                std::cerr << "Failed to create mesh resources for entity (batch): "
                          << entity->GetName() << std::endl;
                return false;
            }

            auto it = meshResources.find(meshComponent);
            if (it == meshResources.end()) {
                continue;
            }
            MeshResources& res = it->second;

            // Only schedule meshes that still have staged data pending upload
            if (res.vertexBufferSizeBytes > 0 || res.indexBufferSizeBytes > 0) {
                meshesNeedingUpload.push_back(meshComponent);
            }
        }

        // --- 2. Defer all GPU copies to the render thread safe point ---
        if (!meshesNeedingUpload.empty())
        {
            watchdogProgressLabel.store("Batch: EnqueueMeshUploads", std::memory_order_relaxed);
            EnqueueMeshUploads(meshesNeedingUpload);
            if (flushUploadsNow) {
                watchdogProgressLabel.store("Batch: Flush mesh uploads now", std::memory_order_relaxed);
                ProcessPendingMeshUploads();
            }
        }

        // --- 3. Create uniform buffers and descriptor sets per entity ---
        watchdogProgressLabel.store("Batch: per-entity resources loop", std::memory_order_relaxed);
        uint32_t processedResources = 0;
        uint32_t resourceLoopIndex = 0;
        for (Entity* entity : entities) {
            watchdogProgressIndex.store(resourceLoopIndex++, std::memory_order_relaxed);

            if (!entity) {
                continue;
            }

            // Kick watchdog periodically during heavy resource creation
            if (++processedResources % 10 == 0) {
                lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
            }

            auto meshComponent = entity->GetComponent();
            if (!meshComponent) {
                continue;
            }

            watchdogProgressLabel.store("Batch: createUniformBuffers", std::memory_order_relaxed);
            if (!createUniformBuffers(entity)) {
                std::cerr << "Failed to create uniform buffers for entity (batch): "
                          << entity->GetName() << std::endl;
                return false;
            }

            std::string texturePath = meshComponent->GetTexturePath();
            // Fallback: if legacy texturePath is empty, use PBR baseColor texture
            if (texturePath.empty()) {
                const std::string& baseColor = meshComponent->GetBaseColorTexturePath();
                if (!baseColor.empty()) {
                    texturePath = baseColor;
                }
            }

            watchdogProgressLabel.store("Batch: createDescriptorSets (basic)", std::memory_order_relaxed);
            if (!createDescriptorSets(entity, texturePath, false)) {
                std::cerr << "Failed to create basic descriptor sets for entity (batch): "
                          << entity->GetName() << std::endl;
                return false;
            }

            watchdogProgressLabel.store("Batch: createDescriptorSets (pbr)", std::memory_order_relaxed);
            if (!createDescriptorSets(entity, texturePath, true)) {
                std::cerr << "Failed to create PBR descriptor sets for entity (batch): "
                          << entity->GetName() << std::endl;
                return false;
            }
        }

        return true;
    } catch (const std::exception& e) {
        std::cerr << "Failed to batch pre-allocate resources for entities: " << e.what() << std::endl;
        return false;
    }
}

// Enqueue a set
of meshes to upload on the render thread (safe point) +void Renderer::EnqueueMeshUploads(const std::vector& meshes) { + if (meshes.empty()) + return; + std::lock_guard lk(pendingMeshUploadsMutex); + // Avoid duplicates by using a temporary set of current entries + for (MeshComponent* m : meshes) { + if (!m) + continue; + pendingMeshUploads.push_back(m); + } +} + +void Renderer::EnqueueEntityPreallocationBatch(const std::vector& entities) { + if (entities.empty()) + return; { + std::lock_guard lk(pendingEntityPreallocMutex); + for (Entity* e : entities) { + if (!e) + continue; + pendingEntityPrealloc.push_back(e); + } + } + pendingEntityPreallocQueued.store(true, std::memory_order_relaxed); +} + +void Renderer::EnqueueInstanceBufferRecreation(Entity* entity) { + if (!entity) + return; { + std::lock_guard lk(pendingEntityPreallocMutex); + pendingInstanceBufferRecreations.push_back(entity); + } + pendingEntityPreallocQueued.store(true, std::memory_order_relaxed); +} + +void Renderer::ProcessPendingEntityPreallocations() { + if (!pendingEntityPreallocQueued.load(std::memory_order_relaxed)) + return; + + watchdogProgressLabel.store("Prealloc: drain queues", std::memory_order_relaxed); + + std::vector toPreallocate; + std::vector toRecreateInstances; { + std::lock_guard lk(pendingEntityPreallocMutex); + if (pendingEntityPrealloc.empty() && pendingInstanceBufferRecreations.empty()) { + pendingEntityPreallocQueued.store(false, std::memory_order_relaxed); + return; + } + toPreallocate.swap(pendingEntityPrealloc); + toRecreateInstances.swap(pendingInstanceBufferRecreations); + pendingEntityPreallocQueued.store(false, std::memory_order_relaxed); + } + + // De-dup preallocations + watchdogProgressLabel.store("Prealloc: dedup", std::memory_order_relaxed); + std::sort(toPreallocate.begin(), toPreallocate.end()); + toPreallocate.erase(std::unique(toPreallocate.begin(), toPreallocate.end()), toPreallocate.end()); + + std::vector batch; + batch.reserve(toPreallocate.size()); + for 
(Entity* e : toPreallocate) { + if (!e || !e->IsActive()) + continue; + if (!e->GetComponent()) + continue; + batch.push_back(e); + } + + if (!batch.empty()) { + watchdogProgressLabel.store("Prealloc: preAllocateEntityResourcesBatch", std::memory_order_relaxed); + if (!preAllocateEntityResourcesBatch(batch)) { + std::cerr << "Warning: batch entity GPU preallocation failed; will retry" << std::endl; + } + } + + // Process instance buffer recreations. + // Wait for GPU idle ONCE before processing the batch to safely destroy old buffers. + if (!toRecreateInstances.empty()) { + watchdogProgressLabel.store("Prealloc: wait other inFlightFences (before recreateInstanceBuffer)", std::memory_order_relaxed); + // IMPORTANT: We are called from the render thread at the frame-start safe point, + // *after* `inFlightFences[currentFrame]` was waited and then reset. + // Waiting on the current frame fence here would deadlock forever because it won't be + // signaled until we submit the current frame (which can't happen while we're blocked). 
+ std::vector fencesToWait; + if (inFlightFences.size() > 1) { + fencesToWait.reserve(inFlightFences.size() - 1); + } + for (uint32_t i = 0; i < static_cast(inFlightFences.size()); ++i) { + if (i == currentFrame) + continue; + if (!!*inFlightFences[i] && *inFlightFences[i] != vk::Fence{}) { + fencesToWait.push_back(*inFlightFences[i]); + } + } + if (!fencesToWait.empty()) { + (void) waitForFencesSafe(fencesToWait, VK_TRUE); + } + watchdogProgressLabel.store("Prealloc: recreateInstanceBuffer loop", std::memory_order_relaxed); + uint32_t processed = 0; + for (Entity* e : toRecreateInstances) { + if (!e || !e->IsActive()) + continue; + + // Kick watchdog periodically during heavy batch processing + if (++processed % 10 == 0) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + } + + if (!recreateInstanceBuffer(e)) { + std::cerr << "Warning: failed to recreate instance buffer for entity " << e->GetName() << std::endl; + } + } + } + + watchdogProgressLabel.store("Prealloc: done", std::memory_order_relaxed); +} + +// Execute pending mesh uploads on the render thread after the per-frame fence wait +void Renderer::ProcessPendingMeshUploads() { + // 0. Retire completed async upload batches (if timeline semaphore is available) + if (!!*uploadsTimeline && *uploadsTimeline != vk::Semaphore{}) { + uint64_t completedValue = 0; + try { + // vk::raii::Device doesn't expose getSemaphoreCounterValue in all Vulkan-Hpp versions; + // use the underlying vk::Device handle. + completedValue = (*device).getSemaphoreCounterValue(*uploadsTimeline); + } catch (...) 
{ + completedValue = 0; + } + + bool anyCompleted = false; + while (true) { + InFlightMeshUploadBatch completedBatch; { + std::lock_guard lk(inFlightMeshUploadsMutex); + if (inFlightMeshUploads.empty()) + break; + if (inFlightMeshUploads.front().signalValue == 0 || inFlightMeshUploads.front().signalValue > completedValue) + break; + completedBatch = std::move(inFlightMeshUploads.front()); + inFlightMeshUploads.pop_front(); + } + + // Clear staging once copies are complete + for (auto* meshComponent : completedBatch.meshes) { + auto it = meshResources.find(meshComponent); + if (it == meshResources.end()) + continue; + MeshResources& res = it->second; + res.stagingVertexBuffer = vk::raii::Buffer(nullptr); + res.stagingVertexBufferMemory = vk::raii::DeviceMemory(nullptr); + res.vertexBufferSizeBytes = 0; + res.stagingIndexBuffer = vk::raii::Buffer(nullptr); + res.stagingIndexBufferMemory = vk::raii::DeviceMemory(nullptr); + res.indexBufferSizeBytes = 0; + } + + anyCompleted = true; + } + + if (anyCompleted) { + // Now that more meshes are READY (uploads finished), request a TLAS rebuild so + // non‑instanced and previously missing meshes are included in the acceleration structure. 
+ asDevOverrideAllowRebuild = true; // allow rebuild even if frozen + RequestAccelerationStructureBuild("uploads completed"); + } + } + + // Grab the list atomically + std::vector toProcess; { + std::lock_guard lk(pendingMeshUploadsMutex); + if (pendingMeshUploads.empty()) + return; + toProcess.swap(pendingMeshUploads); + } + + // Build a quick lookup of meshes already in flight so we don't submit duplicate copies + std::unordered_set inFlightMeshes; { + std::lock_guard lk(inFlightMeshUploadsMutex); + for (const auto& b : inFlightMeshUploads) { + for (auto* m : b.meshes) { + inFlightMeshes.insert(m); + } + } + } + + // Filter to meshes that still have staged data + std::vector needsCopy; + needsCopy.reserve(toProcess.size()); + for (auto* meshComponent : toProcess) { + if (inFlightMeshes.find(meshComponent) != inFlightMeshes.end()) + continue; + auto it = meshResources.find(meshComponent); + if (it == meshResources.end()) + continue; + const MeshResources& res = it->second; + if (res.vertexBufferSizeBytes > 0 || res.indexBufferSizeBytes > 0) { + needsCopy.push_back(meshComponent); + } + } + if (needsCopy.empty()) + return; + + // Record copies on GRAPHICS queue to avoid cross-queue hazards while stabilizing + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value() + }; + + // Prefer async submission via the uploads timeline semaphore to avoid blocking the render thread. + // However, during initial loading (and when an AS build is pending), we want mesh uploads to + // complete promptly so readiness can increase and the AS can be built within the target budget. 
+ const bool forceSynchronous = IsLoading() || asBuildRequested.load(std::memory_order_relaxed); + const bool canSignalTimeline = (!!*uploadsTimeline && *uploadsTimeline != vk::Semaphore{}) && !forceSynchronous; + if (canSignalTimeline) { + auto tempPool = std::make_unique(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = **tempPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + auto cbs = std::make_unique(device, allocInfo); + vk::raii::CommandBuffer& cb = (*cbs)[0]; + cb.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}); + + for (auto* meshComponent : needsCopy) { + auto it = meshResources.find(meshComponent); + if (it == meshResources.end()) + continue; + MeshResources& res = it->second; + if (res.vertexBufferSizeBytes > 0 && !!*res.stagingVertexBuffer && !!*res.vertexBuffer) { + vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = res.vertexBufferSizeBytes}; + cb.copyBuffer(*res.stagingVertexBuffer, *res.vertexBuffer, region); + } + if (res.indexBufferSizeBytes > 0 && !!*res.stagingIndexBuffer && !!*res.indexBuffer) { + vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = res.indexBufferSizeBytes}; + cb.copyBuffer(*res.stagingIndexBuffer, *res.indexBuffer, region); + } + } + + cb.end(); + + uint64_t signalValue = 0; { + std::lock_guard lock(queueMutex); + vk::SubmitInfo submitInfo{}; + vk::TimelineSemaphoreSubmitInfo timelineInfo{}; // keep alive through submit + signalValue = uploadTimelineLastSubmitted.fetch_add(1, std::memory_order_relaxed) + 1; + timelineInfo.signalSemaphoreValueCount = 1; + timelineInfo.pSignalSemaphoreValues = &signalValue; + submitInfo.pNext = &timelineInfo; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &*cb; + submitInfo.signalSemaphoreCount = 1; + submitInfo.pSignalSemaphores = &*uploadsTimeline; + graphicsQueue.submit(submitInfo, vk::Fence{}); + } + + InFlightMeshUploadBatch batch; + batch.signalValue = signalValue; + 
batch.meshes = std::move(needsCopy); + batch.commandPool = std::move(tempPool); + batch.commandBuffers = std::move(cbs); { + std::lock_guard lk(inFlightMeshUploadsMutex); + inFlightMeshUploads.push_back(std::move(batch)); + } + } else { + // Fallback: submit and wait on the GRAPHICS queue (single-threaded via queueMutex) + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = *tempPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + vk::raii::CommandBuffers cbs(device, allocInfo); + vk::raii::CommandBuffer& cb = cbs[0]; + cb.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}); + + for (auto* meshComponent : needsCopy) { + auto it = meshResources.find(meshComponent); + if (it == meshResources.end()) + continue; + MeshResources& res = it->second; + if (res.vertexBufferSizeBytes > 0 && !!*res.stagingVertexBuffer && !!*res.vertexBuffer) { + vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = res.vertexBufferSizeBytes}; + cb.copyBuffer(*res.stagingVertexBuffer, *res.vertexBuffer, region); + } + if (res.indexBufferSizeBytes > 0 && !!*res.stagingIndexBuffer && !!*res.indexBuffer) { + vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = res.indexBufferSizeBytes}; + cb.copyBuffer(*res.stagingIndexBuffer, *res.indexBuffer, region); + } + } + + cb.end(); + + vk::SubmitInfo submitInfo{.commandBufferCount = 1, .pCommandBuffers = &*cb}; + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); { + std::lock_guard lock(queueMutex); + graphicsQueue.submit(submitInfo, *fence); + } + (void) waitForFencesSafe(*fence, VK_TRUE); + + for (auto* meshComponent : needsCopy) { + auto it = meshResources.find(meshComponent); + if (it == meshResources.end()) + continue; + MeshResources& res = it->second; + res.stagingVertexBuffer.clear(); + res.stagingVertexBufferMemory.clear(); + res.vertexBufferSizeBytes = 0; + res.stagingIndexBuffer.clear(); + 
res.stagingIndexBufferMemory.clear(); + res.indexBufferSizeBytes = 0; + } + + asDevOverrideAllowRebuild = true; + RequestAccelerationStructureBuild("uploads completed"); + } +} + +// Recreate instance buffer for an entity (e.g., after clearing instances for animation) +bool Renderer::recreateInstanceBuffer(Entity* entity) { + ensureThreadLocalVulkanInit(); + try { + // Find the entity in entityResources + auto it = entityResources.find(entity); + if (it == entityResources.end()) { + std::cerr << "Entity " << entity->GetName() << " not found in entityResources" << std::endl; + return false; + } + + EntityResources& resources = it->second; + + // Create a single instance with identity matrix + InstanceData singleInstance; + singleInstance.setModelMatrix(glm::mat4(1.0f)); + std::vector instanceData = {singleInstance}; + + vk::DeviceSize instanceBufferSize = sizeof(InstanceData) * instanceData.size(); + + // Create new instance buffer using memory pool + auto [instanceBuffer, instanceBufferAllocation] = createBufferPooled( + instanceBufferSize, + vk::BufferUsageFlagBits::eVertexBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + // Copy instance data to buffer + void* instanceMappedMemory = instanceBufferAllocation->mappedPtr; + if (instanceMappedMemory) { + std::memcpy(instanceMappedMemory, instanceData.data(), instanceBufferSize); + } else { + std::cerr << "Warning: Instance buffer allocation is not mapped" << std::endl; + } + + // Replace the old instance buffer with the new one. + // Note: Caller must ensure GPU is idle before this method is called to safely destroy the old buffer. 
+ resources.instanceBuffer = std::move(instanceBuffer); + resources.instanceBufferAllocation = std::move(instanceBufferAllocation); + resources.instanceBufferMapped = instanceMappedMemory; + + std::cout << "[Animation] Recreated instance buffer for entity '" << entity->GetName() + << "' with single identity instance" << std::endl; + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to recreate instance buffer for entity " << entity->GetName() + << ": " << e.what() << std::endl; + return false; + } +} + +// Create buffer using memory pool for efficient allocation +std::pair> Renderer::createBufferPooled( + vk::DeviceSize size, + vk::BufferUsageFlags usage, + vk::MemoryPropertyFlags properties) { + try { + if (!memoryPool) { + throw std::runtime_error("Memory pool not initialized"); + } + + // Use memory pool for allocation + auto [buffer, allocation] = memoryPool->createBuffer(size, usage, properties); + + return {std::move(buffer), std::move(allocation)}; + } catch (const std::exception& e) { + std::cerr << "Failed to create buffer with memory pool: " << e.what() << std::endl; + throw; + } +} + +// Legacy createBuffer function - now strictly enforces memory pool usage +std::pair Renderer::createBuffer( + vk::DeviceSize size, + vk::BufferUsageFlags usage, + vk::MemoryPropertyFlags properties) { + // This function should only be used for temporary staging buffers during resource creation + // All persistent resources should use createBufferPooled directly + + if (!memoryPool) { + throw std::runtime_error("Memory pool not available - cannot create buffer"); + } + + // Only allow direct allocation for staging buffers (temporary, host-visible) + if (!(properties & vk::MemoryPropertyFlagBits::eHostVisible)) { + std::cerr << "ERROR: Legacy createBuffer should only be used for staging buffers!" 
<< std::endl; + throw std::runtime_error("Legacy createBuffer used for non-staging buffer"); + } + + try { + vk::BufferCreateInfo bufferInfo{ + .size = size, + .usage = usage, + .sharingMode = vk::SharingMode::eExclusive + }; + + vk::raii::Buffer buffer(device, bufferInfo); + + // Allocate memory directly for staging buffers only + vk::MemoryRequirements memRequirements = buffer.getMemoryRequirements(); + + // Align allocation size to nonCoherentAtomSize (64 bytes) to prevent validation errors + // VUID-VkMappedMemoryRange-size-01390 requires memory flush sizes to be multiples of nonCoherentAtomSize + const vk::DeviceSize nonCoherentAtomSize = 64; // Typical value, should query from device properties + vk::DeviceSize alignedSize = ((memRequirements.size + nonCoherentAtomSize - 1) / nonCoherentAtomSize) * nonCoherentAtomSize; + + vk::MemoryAllocateInfo allocInfo{ + .allocationSize = alignedSize, + .memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties) + }; + + vk::raii::DeviceMemory bufferMemory(device, allocInfo); + + // Bind memory to buffer + buffer.bindMemory(*bufferMemory, 0); + + return {std::move(buffer), std::move(bufferMemory)}; + } catch (const std::exception& e) { + std::cerr << "Failed to create staging buffer: " << e.what() << std::endl; + throw; + } +} + +void Renderer::createTransparentDescriptorSets() { + // We need one descriptor set per frame in flight for this resource + std::vector layouts(MAX_FRAMES_IN_FLIGHT, *transparentDescriptorSetLayout); + vk::DescriptorSetAllocateInfo allocInfo{ + .descriptorPool = *descriptorPool, + .descriptorSetCount = static_cast(MAX_FRAMES_IN_FLIGHT), + .pSetLayouts = layouts.data() + }; { + // Serialize allocation vs other descriptor ops + std::lock_guard lk(descriptorMutex); + transparentDescriptorSets = vk::raii::DescriptorSets(device, allocInfo); + } + + // Update each descriptor set to point to the per-frame off-screen opaque color image + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) 
{ + vk::DescriptorImageInfo imageInfo{ + .sampler = *opaqueSceneColorSampler, + .imageView = *opaqueSceneColorImageViews[i], + .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal + }; + + vk::WriteDescriptorSet descriptorWrite{ + .dstSet = *transparentDescriptorSets[i], + .dstBinding = 0, // Binding 0 in Set 1 + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .pImageInfo = &imageInfo + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrite, nullptr); + } + } +} + +void Renderer::createTransparentFallbackDescriptorSets() { + // Allocate one descriptor set per frame in flight using the same layout (single combined image sampler at binding 0) + std::vector layouts(MAX_FRAMES_IN_FLIGHT, *transparentDescriptorSetLayout); + vk::DescriptorSetAllocateInfo allocInfo{ + .descriptorPool = *descriptorPool, + .descriptorSetCount = static_cast(MAX_FRAMES_IN_FLIGHT), + .pSetLayouts = layouts.data() + }; { + std::lock_guard lk(descriptorMutex); + transparentFallbackDescriptorSets = vk::raii::DescriptorSets(device, allocInfo); + } + + // Point each set to the default texture, which is guaranteed to be in SHADER_READ_ONLY_OPTIMAL when used in the opaque pass + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { + vk::DescriptorImageInfo imageInfo{ + .sampler = *defaultTextureResources.textureSampler, + .imageView = *defaultTextureResources.textureImageView, + .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal + }; + + vk::WriteDescriptorSet descriptorWrite{ + .dstSet = *transparentFallbackDescriptorSets[i], + .dstBinding = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .pImageInfo = &imageInfo + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrite, nullptr); + } + } +} + +bool Renderer::createOpaqueSceneColorResources() { + try { + opaqueSceneColorImages.clear(); + opaqueSceneColorImageAllocations.clear(); + 
opaqueSceneColorImageViews.clear(); + opaqueSceneColorImageLayouts.clear(); + + opaqueSceneColorImages.reserve(MAX_FRAMES_IN_FLIGHT); + opaqueSceneColorImageAllocations.reserve(MAX_FRAMES_IN_FLIGHT); + opaqueSceneColorImageViews.reserve(MAX_FRAMES_IN_FLIGHT); + opaqueSceneColorImageLayouts.reserve(MAX_FRAMES_IN_FLIGHT); + + for (uint32_t i = 0; i < MAX_FRAMES_IN_FLIGHT; ++i) { + auto [image, allocation] = createImagePooled( + swapChainExtent.width, + swapChainExtent.height, + swapChainImageFormat, + // Use the same format as the swapchain + vk::ImageTiling::eOptimal, + vk::ImageUsageFlagBits::eColorAttachment | vk::ImageUsageFlagBits::eSampled | vk::ImageUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eDeviceLocal, + 1, + 1); + + opaqueSceneColorImages.push_back(std::move(image)); + opaqueSceneColorImageAllocations.push_back(std::move(allocation)); + opaqueSceneColorImageViews.push_back(createImageView(opaqueSceneColorImages.back(), swapChainImageFormat, vk::ImageAspectFlagBits::eColor)); + opaqueSceneColorImageLayouts.push_back(vk::ImageLayout::eUndefined); + } + + // Create (or recreate) the sampler (shared across frames) + vk::SamplerCreateInfo samplerInfo{ + .magFilter = vk::Filter::eLinear, + .minFilter = vk::Filter::eLinear, + .addressModeU = vk::SamplerAddressMode::eClampToEdge, + .addressModeV = vk::SamplerAddressMode::eClampToEdge, + .addressModeW = vk::SamplerAddressMode::eClampToEdge, + }; + opaqueSceneColorSampler = vk::raii::Sampler(device, samplerInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create opaque scene color resources: " << e.what() << std::endl; + return false; + } +} + +// Copy buffer +void Renderer::copyBuffer(vk::raii::Buffer& srcBuffer, vk::raii::Buffer& dstBuffer, vk::DeviceSize size) { + ensureThreadLocalVulkanInit(); + try { + // Create a temporary transient command pool and command buffer to isolate per-thread usage (transfer family) + vk::CommandPoolCreateInfo poolInfo{ + .flags = 
vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndices.transferFamily.value() + }; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = *tempPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + + vk::raii::CommandBuffers commandBuffers(device, allocInfo); + vk::raii::CommandBuffer& commandBuffer = commandBuffers[0]; + + // Begin command buffer + vk::CommandBufferBeginInfo beginInfo{ + .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit + }; + + commandBuffer.begin(beginInfo); + + // Copy buffer + vk::BufferCopy copyRegion{ + .srcOffset = 0, + .dstOffset = 0, + .size = size + }; + + commandBuffer.copyBuffer(*srcBuffer, *dstBuffer, copyRegion); + + // End command buffer + commandBuffer.end(); + + // Submit command buffer + vk::SubmitInfo submitInfo{ + .commandBufferCount = 1, + .pCommandBuffers = &*commandBuffer + }; + + // Use mutex to ensure thread-safe access to transfer queue + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); { + std::lock_guard lock(queueMutex); + transferQueue.submit(submitInfo, *fence); + } + (void) waitForFencesSafe(*fence, VK_TRUE); + } catch (const std::exception& e) { + std::cerr << "Failed to copy buffer: " << e.what() << std::endl; + throw; + } +} + +// Create image +std::pair Renderer::createImage( + uint32_t width, + uint32_t height, + vk::Format format, + vk::ImageTiling tiling, + vk::ImageUsageFlags usage, + vk::MemoryPropertyFlags properties, + uint32_t arrayLayers) { + try { + // Create image + vk::ImageCreateInfo imageInfo{ + .imageType = vk::ImageType::e2D, + .format = format, + .extent = {width, height, 1}, + .mipLevels = 1, + .arrayLayers = arrayLayers, + .samples = vk::SampleCountFlagBits::e1, + .tiling = tiling, + .usage = usage, + .sharingMode = vk::SharingMode::eExclusive, + .initialLayout = vk::ImageLayout::eUndefined + }; + + vk::raii::Image 
image(device, imageInfo); + + // Allocate memory + vk::MemoryRequirements memRequirements = image.getMemoryRequirements(); + vk::MemoryAllocateInfo allocInfo{ + .allocationSize = memRequirements.size, + .memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties) + }; + + vk::raii::DeviceMemory imageMemory(device, allocInfo); + + // Bind memory to image + image.bindMemory(*imageMemory, 0); + + return {std::move(image), std::move(imageMemory)}; + } catch (const std::exception& e) { + std::cerr << "Failed to create image: " << e.what() << std::endl; + throw; + } +} + +// Create image using memory pool for efficient allocation +std::pair> Renderer::createImagePooled( + uint32_t width, + uint32_t height, + vk::Format format, + vk::ImageTiling tiling, + vk::ImageUsageFlags usage, + vk::MemoryPropertyFlags properties, + uint32_t mipLevels, + uint32_t arrayLayers, + vk::SharingMode sharingMode, + const std::vector& queueFamilies) { + try { + if (!memoryPool) { + throw std::runtime_error("Memory pool not initialized"); + } + + // Use memory pool for allocation (mipmap support limited by memory pool API) + auto [image, allocation] = memoryPool->createImage(width, + height, + format, + tiling, + usage, + properties, + mipLevels, + sharingMode, + queueFamilies, + arrayLayers); + + return {std::move(image), std::move(allocation)}; + } catch (const std::exception& e) { + std::cerr << "Failed to create image with memory pool: " << e.what() << std::endl; + throw; + } +} + +// Create an image view +vk::raii::ImageView Renderer::createImageView(vk::Image image, vk::Format format, vk::ImageAspectFlags aspectFlags, uint32_t mipLevels, uint32_t layerCount) { + try { + ensureThreadLocalVulkanInit(); + // Create image view + vk::ImageViewCreateInfo viewInfo{ + .image = image, + .viewType = (layerCount > 1) ? 
vk::ImageViewType::e2DArray : vk::ImageViewType::e2D, + .format = format, + .subresourceRange = { + .aspectMask = aspectFlags, + .baseMipLevel = 0, + .levelCount = mipLevels, + .baseArrayLayer = 0, + .layerCount = layerCount + } + }; + + return {device, viewInfo}; + } catch (const std::exception& e) { + std::cerr << "Failed to create image view: " << e.what() << std::endl; + throw; + } +} + +// Transition image layout +void Renderer::transitionImageLayout(vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout, uint32_t mipLevels) { + transitionImageLayout(image, format, oldLayout, newLayout, mipLevels, 1); +} + +void Renderer::transitionImageLayout(vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout, uint32_t mipLevels, uint32_t layerCount) { + ensureThreadLocalVulkanInit(); + try { + // Create a temporary transient command pool and command buffer to isolate per-thread usage + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value() + }; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = *tempPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + + vk::raii::CommandBuffers commandBuffers(device, allocInfo); + vk::raii::CommandBuffer& commandBuffer = commandBuffers[0]; + + // Begin command buffer + vk::CommandBufferBeginInfo beginInfo{ + .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit + }; + + commandBuffer.begin(beginInfo); + + transitionImageLayout(*commandBuffer, image, format, oldLayout, newLayout, mipLevels, layerCount); + + // End command buffer + commandBuffer.end(); + + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + bool canSignalTimeline = !!*uploadsTimeline; + uint64_t signalValue = 0; { + std::lock_guard lock(queueMutex); + 
vk::SubmitInfo submitInfo{}; + vk::TimelineSemaphoreSubmitInfo timelineInfo{}; // keep alive through submit + if (canSignalTimeline) { + signalValue = uploadTimelineLastSubmitted.fetch_add(1, std::memory_order_relaxed) + 1; + timelineInfo.signalSemaphoreValueCount = 1; + timelineInfo.pSignalSemaphoreValues = &signalValue; + submitInfo.pNext = &timelineInfo; + submitInfo.signalSemaphoreCount = 1; + submitInfo.pSignalSemaphores = &*uploadsTimeline; + } + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &*commandBuffer; + graphicsQueue.submit(submitInfo, *fence); + } + (void) waitForFencesSafe(*fence, VK_TRUE); + } catch (const std::exception& e) { + std::cerr << "Failed to transition image layout: " << e.what() << std::endl; + throw; + } +} + +// Copy buffer to image +void Renderer::transitionImageLayout(vk::CommandBuffer cmd, vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout, uint32_t mipLevels, uint32_t layerCount) { + // Create an image barrier (Sync2) + vk::ImageMemoryBarrier2 barrier2{ + .oldLayout = oldLayout, + .newLayout = newLayout, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = { + .aspectMask = format == vk::Format::eD32Sfloat || format == vk::Format::eD32SfloatS8Uint || format == vk::Format::eD24UnormS8Uint ? 
vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor, + .baseMipLevel = 0, + .levelCount = mipLevels, + .baseArrayLayer = 0, + .layerCount = layerCount + } + }; + + // Set stage and access masks based on layouts + if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eTransferDstOptimal) { + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe; + barrier2.srcAccessMask = vk::AccessFlagBits2::eNone; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eTransfer; + barrier2.dstAccessMask = vk::AccessFlagBits2::eTransferWrite; + } else if (oldLayout == vk::ImageLayout::eTransferDstOptimal && newLayout == vk::ImageLayout::eShaderReadOnlyOptimal) { + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTransfer; + barrier2.srcAccessMask = vk::AccessFlagBits2::eTransferWrite; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader; + barrier2.dstAccessMask = vk::AccessFlagBits2::eShaderRead; + } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eDepthStencilAttachmentOptimal) { + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe; + barrier2.srcAccessMask = vk::AccessFlagBits2::eNone; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests; + barrier2.dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentRead | vk::AccessFlagBits2::eDepthStencilAttachmentWrite; + } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eDepthStencilReadOnlyOptimal) { + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe; + barrier2.srcAccessMask = vk::AccessFlagBits2::eNone; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests; + barrier2.dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentRead; + } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eGeneral) { + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe; + barrier2.srcAccessMask = 
vk::AccessFlagBits2::eNone; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader; + barrier2.dstAccessMask = vk::AccessFlagBits2::eShaderWrite | vk::AccessFlagBits2::eShaderRead; + } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eShaderReadOnlyOptimal) { + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe; + barrier2.srcAccessMask = vk::AccessFlagBits2::eNone; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader; + barrier2.dstAccessMask = vk::AccessFlagBits2::eShaderRead; + } else if (oldLayout == vk::ImageLayout::eColorAttachmentOptimal && newLayout == vk::ImageLayout::eTransferSrcOptimal) { + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput; + barrier2.srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eTransfer; + barrier2.dstAccessMask = vk::AccessFlagBits2::eTransferRead; + } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eColorAttachmentOptimal) { + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe; + barrier2.srcAccessMask = vk::AccessFlagBits2::eNone; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput; + barrier2.dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite; + } else { + throw std::invalid_argument("Unsupported layout transition!"); + } + + vk::DependencyInfo depInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .imageMemoryBarrierCount = 1, + .pImageMemoryBarriers = &barrier2 + }; + cmd.pipelineBarrier2(depInfo); +} + +void Renderer::copyBufferToImage(vk::Buffer buffer, vk::Image image, uint32_t width, uint32_t height, vk::ArrayProxy regions) { + ensureThreadLocalVulkanInit(); + try { + // Create a temporary transient command pool for the GRAPHICS queue to avoid cross-queue races + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eTransient | 
vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value() + }; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = *tempPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + + vk::raii::CommandBuffers commandBuffers(device, allocInfo); + vk::raii::CommandBuffer& commandBuffer = commandBuffers[0]; + + // Begin command buffer + vk::CommandBufferBeginInfo beginInfo{ + .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit + }; + + commandBuffer.begin(beginInfo); + + // Copy buffer to image using provided regions + commandBuffer.copyBufferToImage( + buffer, + image, + vk::ImageLayout::eTransferDstOptimal, + regions); + std::cout << "[copyBufferToImage] recorded copy img=" << (void *) image << std::endl; + + // End command buffer + commandBuffer.end(); + + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + bool canSignalTimeline = !!*uploadsTimeline; + uint64_t signalValue = 0; { + std::lock_guard lock(queueMutex); + vk::SubmitInfo submitInfo{}; + vk::TimelineSemaphoreSubmitInfo timelineInfo{}; // keep alive through submit + if (canSignalTimeline) { + signalValue = uploadTimelineLastSubmitted.fetch_add(1, std::memory_order_relaxed) + 1; + timelineInfo.signalSemaphoreValueCount = 1; + timelineInfo.pSignalSemaphoreValues = &signalValue; + submitInfo.pNext = &timelineInfo; + submitInfo.signalSemaphoreCount = 1; + submitInfo.pSignalSemaphores = &*uploadsTimeline; + } + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &*commandBuffer; + graphicsQueue.submit(submitInfo, *fence); + } + (void) waitForFencesSafe(*fence, VK_TRUE); + } catch (const std::exception& e) { + std::cerr << "Failed to copy buffer to image: " << e.what() << std::endl; + throw; + } +} + +// Create or resize light storage buffers to accommodate the given number of lights +bool Renderer::createOrResizeLightStorageBuffers(size_t lightCount) 
{
    try {
        // Ensure we have storage buffers for each frame in flight
        if (lightStorageBuffers.size() != MAX_FRAMES_IN_FLIGHT) {
            lightStorageBuffers.resize(MAX_FRAMES_IN_FLIGHT);
        }

        // Check if we need to resize buffers: grow only when any per-frame
        // buffer's capacity is below the requested light count.
        bool needsResize = false;
        for (auto& buffer : lightStorageBuffers) {
            if (buffer.capacity < lightCount) {
                needsResize = true;
                break;
            }
        }

        if (!needsResize) {
            return true; // Buffers are already large enough
        }

        // Calculate new capacity (with some headroom for growth)
        // NOTE(review): <size_t> template argument restored from mangled source;
        // both std::max operands must have the same type as lightCount.
        size_t newCapacity = std::max(lightCount * 2, static_cast<size_t>(64));
        vk::DeviceSize bufferSize = sizeof(LightData) * newCapacity;

        // Wait for device to be idle before destroying old buffers to prevent validation errors.
        // External synchronization required (VVL): serialize against queue submits/present.
        WaitIdle();

        // Create new buffers for each frame
        for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; ++i) {
            auto& buffer = lightStorageBuffers[i];

            // Clean up old buffer if it exists (now safe after waitIdle)
            if (!!buffer.allocation) {
                buffer.buffer = vk::raii::Buffer(nullptr);
                buffer.allocation.reset();
                buffer.mapped = nullptr;
            }

            // Create new storage buffer (host-visible + coherent so the CPU can
            // write light data directly through the persistent mapping)
            auto [newBuffer, newAllocation] = createBufferPooled(
                bufferSize,
                vk::BufferUsageFlagBits::eStorageBuffer,
                vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);

            // Get the mapped pointer from the allocation
            void* mapped = newAllocation->mappedPtr;

            // Store the new buffer; size is reset, capacity reflects the new allocation
            buffer.buffer = std::move(newBuffer);
            buffer.allocation = std::move(newAllocation);
            buffer.mapped = mapped;
            buffer.capacity = newCapacity;
            buffer.size = 0;
        }

        // Update all existing descriptor sets to reference the new light storage buffers
        updateAllDescriptorSetsWithNewLightBuffers();

        // Also refresh Forward+ compute descriptor sets (binding 0) so compute reads valid buffers
        try {
            if (!forwardPlusPerFrame.empty()) {
                for (size_t i =
0; i < forwardPlusPerFrame.size() && i < lightStorageBuffers.size(); ++i) { + if (!*forwardPlusPerFrame[i].computeSet) + continue; + if (!*lightStorageBuffers[i].buffer) + continue; + vk::DescriptorBufferInfo lightsInfo{.buffer = *lightStorageBuffers[i].buffer, .offset = 0, .range = VK_WHOLE_SIZE}; + vk::WriteDescriptorSet write{ + .dstSet = *forwardPlusPerFrame[i].computeSet, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &lightsInfo + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(write, {}); + } + } + } + } catch (const std::exception& e) { + std::cerr << "Failed to update Forward+ compute descriptors after light buffer resize: " << e.what() << std::endl; + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create or resize light storage buffers: " << e.what() << std::endl; + return false; + } +} + +// Update all existing descriptor sets with new light storage buffer references +void Renderer::updateAllDescriptorSetsWithNewLightBuffers(bool allFrames) { + try { + if (!descriptorSetsValid.load(std::memory_order_relaxed)) + return; + if (isRecordingCmd.load(std::memory_order_relaxed)) + return; + // Iterate through all entity resources and update their PBR descriptor sets + for (auto& kv : entityResources) { + auto& resources = kv.second; + // Only update PBR descriptor sets (they have light buffer bindings) + if (!resources.pbrDescriptorSets.empty()) { + size_t beginFrame = allFrames ? 0 : static_cast(currentFrame); + size_t endFrame = allFrames ? 
                    resources.pbrDescriptorSets.size() : (beginFrame + 1);
                for (size_t i = beginFrame; i < endFrame && i < resources.pbrDescriptorSets.size() && i < lightStorageBuffers.size(); ++i) {
                    // Skip if this set looks invalid/uninitialized
                    if (!(*resources.pbrDescriptorSets[i]))
                        continue;
                    if (i < lightStorageBuffers.size() && !!*lightStorageBuffers[i].buffer) {
                        // Create descriptor write for light storage buffer (binding 6)
                        vk::DescriptorBufferInfo lightBufferInfo{
                            .buffer = *lightStorageBuffers[i].buffer,
                            .offset = 0,
                            .range = VK_WHOLE_SIZE
                        };

                        vk::WriteDescriptorSet descriptorWrite{
                            .dstSet = *resources.pbrDescriptorSets[i],
                            .dstBinding = 6,
                            .dstArrayElement = 0,
                            .descriptorCount = 1,
                            .descriptorType = vk::DescriptorType::eStorageBuffer,
                            .pBufferInfo = &lightBufferInfo
                        };

                        // Update the descriptor set (descriptorMutex serializes
                        // updateDescriptorSets against other threads)
                        {
                            std::lock_guard lk(descriptorMutex);
                            device.updateDescriptorSets(descriptorWrite, {});
                        }
                    }
                }
            }
        }
    } catch (const std::exception& e) {
        std::cerr << "Failed to update descriptor sets with new light buffers: " << e.what() << std::endl;
    }
}

// Refresh only current frame's PBR descriptor bindings used by Forward+
// Safe to call after waiting on inFlightFences[currentFrame] and before command recording.
// Rebinds the Forward+-related bindings (6..13) of every entity's PBR
// descriptor set for one frame in flight. Creates minimal zero-filled dummy
// buffers for the lights / tile-header / tile-index SSBOs when the real ones
// do not exist yet, so every layout-required binding is always valid.
// Bails out early when descriptor sets are invalid or command recording is
// in progress.
void Renderer::refreshPBRForwardPlusBindingsForFrame(uint32_t frameIndex) {
    try {
        if (frameIndex >= MAX_FRAMES_IN_FLIGHT)
            return;
        if (!descriptorSetsValid.load(std::memory_order_relaxed))
            return;
        if (isRecordingCmd.load(std::memory_order_relaxed))
            return;

        // Resolve current frame Forward+ buffers
        vk::Buffer headersBuf{};
        vk::Buffer indicesBuf{};
        if (frameIndex < forwardPlusPerFrame.size()) {
            auto& f = forwardPlusPerFrame[frameIndex];
            if (!!*f.tileHeaders)
                headersBuf = *f.tileHeaders;
            if (!!*f.tileLightIndices)
                indicesBuf = *f.tileLightIndices;
        }

        // Resolve current frame lights buffer
        vk::Buffer lightsBuf{};
        if (frameIndex < lightStorageBuffers.size() && !!*lightStorageBuffers[frameIndex].buffer) {
            lightsBuf = *lightStorageBuffers[frameIndex].buffer;
        }

        // Ensure lights buffer exists (binding 6) - create minimal dummy if needed
        if (!lightsBuf) {
            // Lazily create a minimal lights buffer (single LightData element) for use when Forward+ is disabled
            if (lightStorageBuffers.empty()) {
                lightStorageBuffers.resize(MAX_FRAMES_IN_FLIGHT);
            }
            if (frameIndex < lightStorageBuffers.size() && !*lightStorageBuffers[frameIndex].buffer) {
                vk::DeviceSize minSize = sizeof(LightData); // Single light element
                auto [buf, alloc] = createBufferPooled(minSize,
                    vk::BufferUsageFlagBits::eStorageBuffer,
                    vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
                lightStorageBuffers[frameIndex].buffer = std::move(buf);
                lightStorageBuffers[frameIndex].allocation = std::move(alloc);
                lightStorageBuffers[frameIndex].mapped = lightStorageBuffers[frameIndex].allocation->mappedPtr;
                lightStorageBuffers[frameIndex].capacity = 1;
                lightStorageBuffers[frameIndex].size = 0;
                // Zero-initialize to prevent garbage data
                if (!!lightStorageBuffers[frameIndex].mapped) {
                    std::memset(lightStorageBuffers[frameIndex].mapped, 0, minSize);
                }
            }
            if (frameIndex < lightStorageBuffers.size() &&
                !!*lightStorageBuffers[frameIndex].buffer) {
                lightsBuf = *lightStorageBuffers[frameIndex].buffer;
            }
        }

        // Ensure tile headers buffer exists (binding 7) - create minimal dummy if needed
        if (!headersBuf) {
            if (forwardPlusPerFrame.empty()) {
                forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT);
            }
            if (frameIndex < forwardPlusPerFrame.size()) {
                auto& f = forwardPlusPerFrame[frameIndex];
                if (!*f.tileHeaders) {
                    vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Single TileHeader {offset, count, pad0, pad1}
                    auto [buf, alloc] = createBufferPooled(minSize,
                        vk::BufferUsageFlagBits::eStorageBuffer,
                        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
                    f.tileHeaders = std::move(buf);
                    f.tileHeadersAlloc = std::move(alloc);
                    if (!!f.tileHeadersAlloc && f.tileHeadersAlloc->mappedPtr) {
                        std::memset(f.tileHeadersAlloc->mappedPtr, 0, minSize);
                    }
                }
                if (!!*f.tileHeaders)
                    headersBuf = *f.tileHeaders;
            }
        }

        // Ensure tile light indices buffer exists (binding 8) - create minimal dummy if needed
        if (!indicesBuf) {
            if (forwardPlusPerFrame.empty()) {
                forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT);
            }
            if (frameIndex < forwardPlusPerFrame.size()) {
                auto& f = forwardPlusPerFrame[frameIndex];
                if (!*f.tileLightIndices) {
                    vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Minimal array of 4 uints
                    auto [buf, alloc] = createBufferPooled(minSize,
                        vk::BufferUsageFlagBits::eStorageBuffer,
                        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
                    f.tileLightIndices = std::move(buf);
                    f.tileLightIndicesAlloc = std::move(alloc);
                    if (!!f.tileLightIndicesAlloc && f.tileLightIndicesAlloc->mappedPtr) {
                        std::memset(f.tileLightIndicesAlloc->mappedPtr, 0, minSize);
                    }
                }
                if (!!*f.tileLightIndices)
                    indicesBuf = *f.tileLightIndices;
            }
        }

        // NOTE(review): element type restored from mangled source; the entity
        // loop below pushes vk::WriteDescriptorSet values into this vector.
        std::vector<vk::WriteDescriptorSet> writes;
        vk::DescriptorBufferInfo lightsInfo{};
        vk::DescriptorBufferInfo headersInfo{};
        vk::DescriptorBufferInfo indicesInfo{};
        vk::DescriptorBufferInfo geoInfoInfo{};
        vk::DescriptorBufferInfo matInfoInfo{};
        vk::DescriptorBufferInfo fragDbgInfo{};

        // At this point, all three critical buffers (lights, headers, indices) should exist (real or dummy)
        if (!!lightsBuf) {
            lightsInfo = vk::DescriptorBufferInfo{.buffer = lightsBuf, .offset = 0, .range = VK_WHOLE_SIZE};
        }
        // Current frame fragment debug buffer (reuse compute debugOut) - this one is optional
        if (frameIndex < forwardPlusPerFrame.size()) {
            auto& fpf = forwardPlusPerFrame[frameIndex];
            if (!!*fpf.debugOut) {
                fragDbgInfo = vk::DescriptorBufferInfo{.buffer = *fpf.debugOut, .offset = 0, .range = VK_WHOLE_SIZE};
            }
        }
        if (!!headersBuf) {
            headersInfo = vk::DescriptorBufferInfo{.buffer = headersBuf, .offset = 0, .range = VK_WHOLE_SIZE};
        }
        if (!!indicesBuf) {
            indicesInfo = vk::DescriptorBufferInfo{.buffer = indicesBuf, .offset = 0, .range = VK_WHOLE_SIZE};
        }

        // Binding 10: reflection sampler — always bind fallback texture while reflection pass is disabled
        // The reflection rendering pass is currently disabled (commented out in renderer_rendering.cpp
        // lines 1194-1203), so we must not bind any reflection RTs that may exist but contain stale data.
        // When reflection rendering is re-enabled, restore the conditional logic to bind previous frame's RT.
        vk::DescriptorImageInfo reflInfo{};
        reflInfo = vk::DescriptorImageInfo{.sampler = *defaultTextureResources.textureSampler, .imageView = *defaultTextureResources.textureImageView, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal};

        // Binding 11: TLAS (for raster ray-query shadows)
        // Raster PBR shaders can statically declare/use `tlas` even when ray-query mode is disabled,
        // so the descriptor must be written whenever acceleration structures are enabled.
        vk::AccelerationStructureKHR tlasHandleValue = accelerationStructureEnabled ?
            *tlasStructure.handle : vk::AccelerationStructureKHR{};
        vk::WriteDescriptorSetAccelerationStructureKHR tlasInfo{};
        tlasInfo.accelerationStructureCount = 1;
        tlasInfo.pAccelerationStructures = &tlasHandleValue;

        for (auto& kv : entityResources) {
            auto& res = kv.second;
            if (res.pbrDescriptorSets.empty() || frameIndex >= res.pbrDescriptorSets.size())
                continue;

            // This prevents "Invalid VkDescriptorSet Object" errors when sets have been freed/invalidated
            if (!(*res.pbrDescriptorSets[frameIndex])) {
                std::cerr << "Warning: Invalid descriptor set handle for entity at frame " << frameIndex << ", skipping" << std::endl;
                continue;
            }

            // Binding 6: lights SSBO - ALWAYS bind (required by layout)
            if (!!lightsBuf) {
                writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 6, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &lightsInfo});
            }
            // Binding 7: tile headers - ALWAYS bind (required by layout)
            if (!!headersBuf) {
                writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 7, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &headersInfo});
            }
            // Binding 8: tile indices - ALWAYS bind (required by layout)
            if (!!indicesBuf) {
                writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 8, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &indicesInfo});
            }
            // Binding 9: fragment debug output buffer (optional - only bind if exists)
            if (!!fragDbgInfo.buffer) {
                writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 9, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &fragDbgInfo});
            }
            // Binding 10: reflection sampler - ALWAYS bind (required by layout)
            writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 10, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &reflInfo});

            // Binding 11: TLAS - ALWAYS bind (required by layout when ray query/AS is enabled)
            // If TLAS is not built yet, the handle will be null; the shader must not trace when disabled.
            vk::WriteDescriptorSet tlasWrite{};
            tlasWrite.dstSet = *res.pbrDescriptorSets[frameIndex];
            tlasWrite.dstBinding = 11;
            tlasWrite.dstArrayElement = 0;
            tlasWrite.descriptorCount = 1;
            tlasWrite.descriptorType = vk::DescriptorType::eAccelerationStructureKHR;
            tlasWrite.pNext = &tlasInfo;
            writes.push_back(tlasWrite);

            // Binding 12/13: Ray-query geometry/material buffers for material-aware raster shadow queries.
            // Always bind something valid; shader guards on `ubo.geometryInfoCount/materialCount`.
            vk::Buffer fallbackBuf = headersBuf ? headersBuf : indicesBuf;
            vk::Buffer geoBuf = (!!*geometryInfoBuffer) ? *geometryInfoBuffer : fallbackBuf;
            vk::Buffer matBuf = (!!*materialBuffer) ?
*materialBuffer : fallbackBuf; + geoInfoInfo = vk::DescriptorBufferInfo{.buffer = geoBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + matInfoInfo = vk::DescriptorBufferInfo{.buffer = matBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 12, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &geoInfoInfo}); + writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 13, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &matInfoInfo}); + } + + if (!writes.empty()) { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(writes, {}); + } + } catch (const std::exception& e) { + std::cerr << "Failed to refresh PBR Forward+ bindings for frame " << frameIndex << ": " << e.what() << std::endl; + } +} + +// Update the light storage buffer with current light data +bool Renderer::updateLightStorageBuffer(uint32_t frameIndex, const std::vector& lights, CameraComponent* camera) { + try { + // Ensure buffers are large enough and properly initialized + if (!createOrResizeLightStorageBuffers(lights.size())) { + return false; + } + + // Now check frame index after buffers are properly initialized + if (frameIndex >= lightStorageBuffers.size()) { + std::cerr << "Invalid frame index for light storage buffer update: " << frameIndex + << " >= " << lightStorageBuffers.size() << std::endl; + return false; + } + + auto& buffer = lightStorageBuffers[frameIndex]; + if (!buffer.mapped) { + std::cerr << "Light storage buffer not mapped" << std::endl; + return false; + } + + // Convert ExtractedLight data to LightData format + auto* lightData = static_cast(buffer.mapped); + for (size_t i = 0; i < lights.size(); ++i) { + const auto& light = lights[i]; + + // For directional lights, store direction in position field (they don't need position) + // For other lights, store 
            // position
            if (light.type == ExtractedLight::Type::Directional) {
                lightData[i].position = glm::vec4(light.direction, 0.0f); // w=0 indicates direction
            } else {
                lightData[i].position = glm::vec4(light.position, 1.0f); // w=1 indicates position
            }

            // Premultiply color by intensity so the shader reads a single radiance value.
            lightData[i].color = glm::vec4(light.color * light.intensity, 1.0f);
            lightData[i].direction = glm::vec4(light.direction, 0.0f);

            // Calculate light space matrix for shadow mapping
            glm::mat4 lightProjection, lightView;
            if (light.type == ExtractedLight::Type::Directional) {
                float orthoSize = 50.0f;
                glm::vec3 shadowCamPos = light.position;
                glm::vec3 lightDir = glm::normalize(light.direction);
                if (camera) {
                    // Center shadow map on camera frustum: back the shadow camera
                    // off 50 units along the light direction from the camera position.
                    glm::vec3 camPos = camera->GetPosition();
                    shadowCamPos = camPos - lightDir * 50.0f;
                }
                lightProjection = glm::ortho(-orthoSize, orthoSize, -orthoSize, orthoSize, 0.1f, 200.0f);

                // Robust up vector to avoid LookAt singularities with vertical lights
                glm::vec3 up = (std::abs(lightDir.y) > 0.99f) ?
glm::vec3(0.0f, 0.0f, 1.0f) : glm::vec3(0.0f, 1.0f, 0.0f); + lightView = glm::lookAt(shadowCamPos, shadowCamPos + lightDir, up); + } else { + lightProjection = glm::perspective(glm::radians(90.0f), 1.0f, 0.1f, light.range); + lightView = glm::lookAt(light.position, light.position + light.direction, glm::vec3(0.0f, 1.0f, 0.0f)); + } + lightData[i].lightSpaceMatrix = lightProjection * lightView; + + // Set light type + switch (light.type) { + case ExtractedLight::Type::Point: + lightData[i].lightType = 0; + break; + case ExtractedLight::Type::Directional: + lightData[i].lightType = 1; + break; + case ExtractedLight::Type::Spot: + lightData[i].lightType = 2; + break; + case ExtractedLight::Type::Emissive: + lightData[i].lightType = 3; + break; + } + + // Set other light properties + lightData[i].range = light.range; + lightData[i].innerConeAngle = light.innerConeAngle; + lightData[i].outerConeAngle = light.outerConeAngle; + } + + // Update buffer size + buffer.size = lights.size(); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to update light storage buffer: " << e.what() << std::endl; + return false; + } +} + +// Asynchronous texture loading implementations using ThreadPool +std::future Renderer::LoadTextureAsync(const std::string& texturePath, bool critical) { + if (texturePath.empty()) { + return std::async(std::launch::deferred, [] { return false; }); + } + // Schedule a CPU-light job that enqueues a pending GPU upload to be + // processed later on the main thread. This avoids submitting Vulkan + // command buffers from worker threads, which can confuse GPU-assisted + // validation. + textureTasksScheduled.fetch_add(1, std::memory_order_relaxed); + uploadJobsTotal.fetch_add(1, std::memory_order_relaxed); + auto task = [this, texturePath, critical]() { + PendingTextureJob job; + job.type = PendingTextureJob::Type::FromFile; + job.priority = critical ? 
PendingTextureJob::Priority::Critical : PendingTextureJob::Priority::NonCritical; + job.idOrPath = texturePath; { + std::lock_guard lk(pendingTextureJobsMutex); + pendingTextureJobs.emplace_back(std::move(job)); + } + pendingTextureCv.notify_one(); + if (critical) { + criticalJobsOutstanding.fetch_add(1, std::memory_order_relaxed); + } + textureTasksCompleted.fetch_add(1, std::memory_order_relaxed); + return true; + }; + + std::shared_lock lock(threadPoolMutex); + if (!threadPool) { + return std::async(std::launch::async, task); + } + return threadPool->enqueue(task); +} + +std::future Renderer::LoadTextureFromMemoryAsync(const std::string& textureId, + const unsigned char* imageData, + int width, + int height, + int channels, + bool critical) { + if (!imageData || textureId.empty() || width <= 0 || height <= 0 || channels <= 0) { + return std::async(std::launch::deferred, [] { return false; }); + } + // Copy the source bytes so the caller can free/modify their buffer immediately + size_t srcSize = static_cast(width) * static_cast(height) * static_cast(channels); + std::vector dataCopy(srcSize); + std::memcpy(dataCopy.data(), imageData, srcSize); + + textureTasksScheduled.fetch_add(1, std::memory_order_relaxed); + uploadJobsTotal.fetch_add(1, std::memory_order_relaxed); + auto task = [this, textureId, data = std::move(dataCopy), width, height, channels, critical]() mutable { + PendingTextureJob job; + job.type = PendingTextureJob::Type::FromMemory; + job.priority = critical ? 
PendingTextureJob::Priority::Critical : PendingTextureJob::Priority::NonCritical; + job.idOrPath = textureId; { + std::lock_guard lk(pendingTextureJobsMutex); + pendingTextureJobs.emplace_back(std::move(job)); + } + pendingTextureCv.notify_one(); + if (critical) { + criticalJobsOutstanding.fetch_add(1, std::memory_order_relaxed); + } + textureTasksCompleted.fetch_add(1, std::memory_order_relaxed); + return true; + }; + + std::shared_lock lock(threadPoolMutex); + if (!threadPool) { + return std::async(std::launch::async, std::move(task)); + } + return threadPool->enqueue(std::move(task)); +} + +void Renderer::WaitForAllTextureTasks() { + // Simple blocking wait: spin until all scheduled texture tasks have completed. + // This is only intended for use during initial scene loading where a short + // stall is acceptable to ensure descriptor sets see all real textures. + for (;;) { + uint32_t scheduled = textureTasksScheduled.load(std::memory_order_relaxed); + uint32_t completed = textureTasksCompleted.load(std::memory_order_relaxed); + if (scheduled == 0 || completed >= scheduled) { + break; + } + // Sleep briefly to yield CPU while background texture jobs finish + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } +} + +// Start background worker threads that drain pending texture jobs and perform GPU uploads +void Renderer::StartUploadsWorker(size_t workerCount) { + stopUploadsWorker.store(false, std::memory_order_relaxed); + if (workerCount == 0) { + unsigned int hw = std::thread::hardware_concurrency(); + // Heuristic: at least 2 workers, at most 4, and not exceeding half of HW threads + unsigned int target = std::max(2u, std::min(4u, hw > 0 ? 
hw / 2 : 2u)); + workerCount = static_cast(target); + } + uploadsWorkerThreads.reserve(workerCount); + for (size_t t = 0; t < workerCount; ++t) { + uploadsWorkerThreads.emplace_back([this]() { + ensureThreadLocalVulkanInit(); + while (!stopUploadsWorker.load(std::memory_order_relaxed)) { + // Wait for work or stop signal + { + std::unique_lock lk(pendingTextureJobsMutex); + pendingTextureCv.wait(lk, + [this]() { + return stopUploadsWorker.load(std::memory_order_relaxed) || !pendingTextureJobs.empty(); + }); + } + if (stopUploadsWorker.load(std::memory_order_relaxed)) + break; + + // Drain a batch of jobs + std::vector batch; { + std::lock_guard lk(pendingTextureJobsMutex); + const size_t maxBatch = 16; // simple batch size to limit command overhead + const size_t take = std::min(maxBatch, pendingTextureJobs.size()); + batch.reserve(take); + for (size_t i = 0; i < take; ++i) { + batch.emplace_back(std::move(pendingTextureJobs.back())); + pendingTextureJobs.pop_back(); + } + } + + // Process critical jobs first + auto isCritical = [](const PendingTextureJob& j) { return j.priority == PendingTextureJob::Priority::Critical; }; + std::stable_sort(batch.begin(), + batch.end(), + [&](const PendingTextureJob& a, const PendingTextureJob& b) { + return isCritical(a) && !isCritical(b); + }); + + // Try to batch FromMemory jobs together for a single transfer submit + std::vector memJobs; + for (auto& j : batch) + if (j.type == PendingTextureJob::Type::FromMemory) + memJobs.push_back(std::move(j)); + // Remove moved jobs from batch + batch.erase(std::remove_if(batch.begin(), batch.end(), [](const PendingTextureJob& j) { return j.type == PendingTextureJob::Type::FromMemory; }), batch.end()); + + if (!memJobs.empty()) { + try { + // Process batched memory uploads with a single submit + // Fallback to per-job if batching fails for any reason + auto processSingle = [&](const PendingTextureJob& job) { + (void) LoadTextureFromMemory(job.idOrPath, + job.data.data(), + job.width, + 
job.height, + job.channels); + OnTextureUploaded(job.idOrPath); + if (job.priority == PendingTextureJob::Priority::Critical) { + criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed); + } + uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed); + }; + + // Build staging buffers and images without submitting yet + struct Item { + std::string id; + vk::raii::Buffer staging; + std::unique_ptr stagingAlloc; + std::vector tmp; + uint32_t w, h; + vk::Format format; + std::vector regions; + uint32_t mipLevels; + vk::raii::Image image; + std::unique_ptr imageAlloc; + }; + std::vector items; + items.reserve(memJobs.size()); + + for (auto& job : memJobs) { + try { + // Create staging buffer and copy data + const vk::DeviceSize imgSize = static_cast(job.width * job.height * 4); + auto [stagingBuf, stagingAlloc] = createBufferPooled(imgSize, vk::BufferUsageFlagBits::eTransferSrc, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + void* mapped = stagingAlloc->mappedPtr; + // Convert to RGBA if not already + std::vector rgba; + rgba.resize(static_cast(imgSize)); + const uint8_t* src = job.data.data(); + if (job.channels == 4) { + std::memcpy(rgba.data(), src, static_cast(imgSize)); + } else if (job.channels == 3) { + for (int y = 0; y < job.height; ++y) { + for (int x = 0; x < job.width; ++x) { + size_t si = (y * job.width + x) * 3; + size_t di = (y * job.width + x) * 4; + rgba[di + 0] = src[si + 0]; + rgba[di + 1] = src[si + 1]; + rgba[di + 2] = src[si + 2]; + rgba[di + 3] = 255; + } + } + } else if (job.channels == 1) { + for (int i = 0, n = job.width * job.height; i < n; ++i) { + uint8_t v = src[i]; + size_t di = i * 4; + rgba[di + 0] = v; + rgba[di + 1] = v; + rgba[di + 2] = v; + rgba[di + 3] = 255; + } + } else { + // unsupported layout, fallback to single path which will handle it + processSingle(job); + continue; + } + std::memcpy(mapped, rgba.data(), static_cast(imgSize)); + // Persistent mapping via memory pool; no 
explicit unmap needed here + + // Create image (concurrent sharing if needed) + bool differentFamilies = queueFamilyIndices.graphicsFamily.value() != queueFamilyIndices.transferFamily.value(); + std::vector families; + if (differentFamilies) + families = {queueFamilyIndices.graphicsFamily.value(), queueFamilyIndices.transferFamily.value()}; + vk::Format texFormat = determineTextureFormat(job.idOrPath); + auto [image, imageAlloc] = createImagePooled(job.width, job.height, texFormat, vk::ImageTiling::eOptimal, vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled, vk::MemoryPropertyFlagBits::eDeviceLocal, 1, 1, differentFamilies ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive, families); + + // Prepare one region + std::vector regions{ + vk::BufferImageCopy{ + .bufferOffset = 0, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = {.aspectMask = vk::ImageAspectFlagBits::eColor, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1}, + .imageOffset = {0, 0, 0}, + .imageExtent = {static_cast(job.width), static_cast(job.height), 1} + } + }; + + items.push_back(Item{job.idOrPath, std::move(stagingBuf), std::move(stagingAlloc), std::move(rgba), static_cast(job.width), static_cast(job.height), texFormat, std::move(regions), 1, std::move(image), std::move(imageAlloc)}); + } catch (const std::exception& e) { + std::cerr << "Batch prepare failed for '" << job.idOrPath << "': " << e.what() << ". Falling back to single." 
<< std::endl; + processSingle(job); + continue; + } + } + + if (!items.empty()) { + // Record a single command buffer for all items + vk::CommandPoolCreateInfo poolInfo{.flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, .queueFamilyIndex = queueFamilyIndices.transferFamily.value()}; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{.commandPool = *tempPool, .level = vk::CommandBufferLevel::ePrimary, .commandBufferCount = 1}; + vk::raii::CommandBuffers cbs(device, allocInfo); + vk::raii::CommandBuffer& cb = cbs[0]; + cb.begin(vk::CommandBufferBeginInfo{.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}); + + for (auto& it : items) { + // Transition undefined->transfer dst (Sync2) + vk::ImageMemoryBarrier2 toDst2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, + .srcAccessMask = vk::AccessFlagBits2::eNone, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eTransferDstOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *it.image, + .subresourceRange = {.aspectMask = vk::ImageAspectFlagBits::eColor, .baseMipLevel = 0, .levelCount = 1, .baseArrayLayer = 0, .layerCount = 1} + }; + vk::DependencyInfo depToDst{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &toDst2}; + cb.pipelineBarrier2(depToDst); + + cb.copyBufferToImage(*it.staging, *it.image, vk::ImageLayout::eTransferDstOptimal, it.regions); + + // Transition to shader-read (Sync2) + vk::ImageMemoryBarrier2 toShader2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eTransferDstOptimal, 
+ .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *it.image, + .subresourceRange = {.aspectMask = vk::ImageAspectFlagBits::eColor, .baseMipLevel = 0, .levelCount = 1, .baseArrayLayer = 0, .layerCount = 1} + }; + vk::DependencyInfo depToShader{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &toShader2}; + cb.pipelineBarrier2(depToShader); + } + + cb.end(); + + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + uint64_t signalValue = 0; + bool canSignal = !!*uploadsTimeline; { + std::lock_guard lock(queueMutex); + vk::SubmitInfo submit{}; + vk::TimelineSemaphoreSubmitInfo timelineInfo{}; // keep alive through submit + if (canSignal) { + signalValue = uploadTimelineLastSubmitted.fetch_add(1, std::memory_order_relaxed) + 1; + timelineInfo.signalSemaphoreValueCount = 1; + timelineInfo.pSignalSemaphoreValues = &signalValue; + submit.pNext = &timelineInfo; + submit.signalSemaphoreCount = 1; + submit.pSignalSemaphores = &*uploadsTimeline; + } + submit.commandBufferCount = 1; + submit.pCommandBuffers = &*cb; + transferQueue.submit(submit, *fence); + } + (void) waitForFencesSafe(*fence, VK_TRUE); + + // Perf accounting for the batch + uint64_t batchBytes = 0; + for (auto& it : items) + batchBytes += static_cast(it.w) * it.h * 4ull; + bytesUploadedTotal.fetch_add(batchBytes, std::memory_order_relaxed); + uploadCount.fetch_add(static_cast(items.size()), std::memory_order_relaxed); + + // Finalize resources and notify + for (auto& it : items) { + // Store in textureResources + TextureResources res; + res.textureImage = std::move(it.image); + res.textureImageAllocation = std::move(it.imageAlloc); + res.format = it.format; + res.mipLevels = it.mipLevels; + res.alphaMaskedHint = false; // heuristic omitted in batch + // Create sampler/view + createTextureSampler(res); + res.textureImageView = createImageView(res.textureImage, res.format, 
vk::ImageAspectFlagBits::eColor, res.mipLevels); { + std::unique_lock lk(textureResourcesMutex); + textureResources[it.id] = std::move(res); + } + OnTextureUploaded(it.id); + // Update counters + uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed); + } + // Decrement outstanding critical jobs if any + for (auto& job : memJobs) + if (job.priority == PendingTextureJob::Priority::Critical) + criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed); + } + } catch (const std::exception& e) { + std::cerr << "UploadsWorker: batch processing failed: " << e.what() << std::endl; + // Fallback: per-job processing + for (auto& job : memJobs) { + try { + (void) LoadTextureFromMemory(job.idOrPath, + job.data.data(), + job.width, + job.height, + job.channels); + OnTextureUploaded(job.idOrPath); + if (job.priority == PendingTextureJob::Priority::Critical) { + criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed); + } + uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed); + } catch (...) 
{ + } + } + } + } + + // Process remaining non-memory jobs individually + for (auto& job : batch) { + try { + if (job.type == PendingTextureJob::Type::FromFile) { + (void) LoadTexture(job.idOrPath); + OnTextureUploaded(job.idOrPath); + if (job.priority == PendingTextureJob::Priority::Critical) { + criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed); + } + uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed); + } + } catch (const std::exception& e) { + std::cerr << "UploadsWorker: failed to process job for '" << job.idOrPath << "': " << e.what() << std::endl; + } + } + } + }); + } +} + +void Renderer::StopUploadsWorker() { + stopUploadsWorker.store(true, std::memory_order_relaxed); + pendingTextureCv.notify_all(); + for (auto& th : uploadsWorkerThreads) { + if (th.joinable()) + th.join(); + } + uploadsWorkerThreads.clear(); +} + +void Renderer::RegisterTextureUser(const std::string& textureId, Entity* entity) { + if (textureId.empty() || !entity) + return; + + // Always register under the canonical resolved ID so that lookups from + // descriptor creation and upload completion (which also use + // ResolveTextureId) are consistent. + std::string canonicalId = ResolveTextureId(textureId); + if (canonicalId.empty()) { + canonicalId = textureId; + } + + std::lock_guard lk(textureUsersMutex); + textureToEntities[canonicalId].push_back(entity); +} + +void Renderer::OnTextureUploaded(const std::string& textureId) { + // Resolve alias to canonical ID used for tracking and descriptor + // creation. RegisterTextureUser also stores under this canonical ID. 
+ std::string canonicalId = ResolveTextureId(textureId); + if (canonicalId.empty()) { + canonicalId = textureId; + } + + std::vector users; { + std::lock_guard lk(textureUsersMutex); + auto it = textureToEntities.find(canonicalId); + if (it == textureToEntities.end()) { + return; + } + users = it->second; + } + + // Always defer descriptor updates to the safe point at the start of Render() + // (after the in-flight fence for the current frame has been signaled). + // This avoids UPDATE_AFTER_BIND violations and mid-recording invalidation. + // If descriptor indexing / UPDATE_AFTER_BIND is enabled, we still prefer + // this safer path for consistency across devices. + for (Entity* entity : users) { + if (!entity) + continue; + MarkEntityDescriptorsDirty(entity); + } + + // Ray Query uses a global texture table (binding 6) that may reference this texture. + // Mark the ray query descriptor sets dirty for all frames so the render-thread safe point + // can refresh the table when the texture becomes available. + if (rayQueryEnabled && accelerationStructureEnabled) { + const uint32_t allFramesMask = (MAX_FRAMES_IN_FLIGHT >= 32u) ? 0xFFFFFFFFu : ((1u << MAX_FRAMES_IN_FLIGHT) - 1u); + rayQueryDescriptorsDirtyMask.fetch_or(allFramesMask, std::memory_order_relaxed); + } +} + +void Renderer::MarkEntityDescriptorsDirty(Entity* entity) { + if (!entity) + return; + // Mark this entity as needing refresh for *all* frames-in-flight. + // Each frame will refresh its own descriptor sets at its safe point. + const uint32_t allFramesMask = (MAX_FRAMES_IN_FLIGHT >= 32u) ? 
0xFFFFFFFFu : ((1u << MAX_FRAMES_IN_FLIGHT) - 1u); + std::lock_guard lk(dirtyEntitiesMutex); + auto& mask = descriptorDirtyEntities[entity]; + mask |= allFramesMask; +} + +bool Renderer::updateDescriptorSetsForFrame(Entity* entity, + const std::string& texturePath, + bool usePBR, + uint32_t frameIndex, + bool imagesOnly, + bool uboOnly) { + auto entityIt = entityResources.find(entity); + if (entityIt == entityResources.end()) + return false; + return updateDescriptorSetsForFrame(entity, entityIt->second, texturePath, usePBR, frameIndex, imagesOnly, uboOnly); +} + +bool Renderer::updateDescriptorSetsForFrame(Entity* entity, + EntityResources& res, + const std::string& texturePath, + bool usePBR, + uint32_t frameIndex, + bool imagesOnly, + bool uboOnly) { + if (!entity) + return false; + if (!descriptorSetsValid.load(std::memory_order_relaxed)) { + // Descriptor sets are being recreated; skip updates for now + return false; + } + // Defer descriptor writes if the command buffer is currently being recorded. + if (isRecordingCmd.load(std::memory_order_relaxed)) { + std::lock_guard qlk(pendingDescMutex); + pendingDescOps.push_back(PendingDescOp{entity, texturePath, usePBR, frameIndex, imagesOnly}); + descriptorRefreshPending.store(true, std::memory_order_relaxed); + return true; + } + // IMPORTANT: Do NOT hold `textureResourcesMutex` across this function. + // We may call `ResolveTextureId()` (which also locks it), and `std::shared_mutex` is not recursive. + + // Ensure we have a valid UBO for this frame before attempting descriptor writes + if (frameIndex >= res.uniformBuffers.size() || + frameIndex >= res.uniformBuffersMapped.size() || + *res.uniformBuffers[frameIndex] == vk::Buffer{}) { + // Missing UBO for this frame; skip to avoid writing invalid descriptors + return false; + } + + vk::DescriptorSetLayout selectedLayout = usePBR ? 
*pbrDescriptorSetLayout : *descriptorSetLayout; + // Ensure descriptor sets exist for this entity + std::vector layouts(MAX_FRAMES_IN_FLIGHT, selectedLayout); + vk::DescriptorSetAllocateInfo allocInfo{.descriptorPool = *descriptorPool, .descriptorSetCount = MAX_FRAMES_IN_FLIGHT, .pSetLayouts = layouts.data()}; + auto& targetDescriptorSets = usePBR ? res.pbrDescriptorSets : res.basicDescriptorSets; + bool newlyAllocated = false; + if (targetDescriptorSets.empty()) { + std::lock_guard lk(descriptorMutex); + targetDescriptorSets = vk::raii::DescriptorSets(device, allocInfo); + newlyAllocated = true; + } + if (frameIndex >= targetDescriptorSets.size()) + return false; + + vk::DescriptorBufferInfo bufferInfo{.buffer = *res.uniformBuffers[frameIndex], .range = sizeof(UniformBufferObject)}; + + // Ensure per-pipeline UBO init tracking is sized + if (res.pbrUboBindingWritten.size() != MAX_FRAMES_IN_FLIGHT) { + res.pbrUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + if (res.basicUboBindingWritten.size() != MAX_FRAMES_IN_FLIGHT) { + res.basicUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + if (res.pbrFixedBindingsWritten.size() != MAX_FRAMES_IN_FLIGHT) { + res.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + if (res.pbrImagesWritten.size() != MAX_FRAMES_IN_FLIGHT) { + res.pbrImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + if (res.basicImagesWritten.size() != MAX_FRAMES_IN_FLIGHT) { + res.basicImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + + if (usePBR) { + // We'll fill descriptor writes. Binding 0 (UBO) is written only when explicitly requested (uboOnly) + // or when doing a full update (imagesOnly == false). For imagesOnly updates we must NOT touch UBO + // to avoid update-after-bind hazards. + std::vector writes; + std::array imageInfos; + // Helper: ensure required PBR layout bindings (7/8/10/11) are written at least once per frame. 
+ // IMPORTANT: descriptor infos must remain alive until `updateDescriptorSets` is called. + vk::DescriptorBufferInfo headersInfo{}; + vk::DescriptorBufferInfo indicesInfo{}; + vk::DescriptorBufferInfo geoInfoInfo{}; + vk::DescriptorBufferInfo matInfoInfo{}; + vk::DescriptorImageInfo reflInfo{}; + vk::AccelerationStructureKHR tlasHandleValue{}; + vk::WriteDescriptorSetAccelerationStructureKHR tlasInfo{}; + vk::WriteDescriptorSet tlasWrite{}; + const bool needFixedWrites = !res.pbrFixedBindingsWritten[frameIndex]; + auto appendPbrFixedWrites = [&](std::vector& dstWrites) { + if (!needFixedWrites) + return; + + // Binding 7/8: Forward+ tile buffers (must be valid even when Forward+ is disabled) + if (forwardPlusPerFrame.empty()) { + forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT); + } + vk::Buffer headersBuf{}; + vk::Buffer indicesBuf{}; + if (frameIndex < forwardPlusPerFrame.size()) { + auto& f = forwardPlusPerFrame[frameIndex]; + if (!!*f.tileHeaders) + headersBuf = *f.tileHeaders; + if (!!*f.tileLightIndices) + indicesBuf = *f.tileLightIndices; + if (!headersBuf) { + vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Single TileHeader + auto [buf, alloc] = createBufferPooled(minSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + f.tileHeaders = std::move(buf); + f.tileHeadersAlloc = std::move(alloc); + if (!!f.tileHeadersAlloc && f.tileHeadersAlloc->mappedPtr) { + std::memset(f.tileHeadersAlloc->mappedPtr, 0, minSize); + } + headersBuf = *f.tileHeaders; + } + if (!indicesBuf) { + vk::DeviceSize minSize = sizeof(uint32_t) * 4; + auto [buf, alloc] = createBufferPooled(minSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + f.tileLightIndices = std::move(buf); + f.tileLightIndicesAlloc = std::move(alloc); + if (!!f.tileLightIndicesAlloc && f.tileLightIndicesAlloc->mappedPtr) { + 
std::memset(f.tileLightIndicesAlloc->mappedPtr, 0, minSize); + } + indicesBuf = *f.tileLightIndices; + } + } + headersInfo = vk::DescriptorBufferInfo{.buffer = headersBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + indicesInfo = vk::DescriptorBufferInfo{.buffer = indicesBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 7, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &headersInfo}); + dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 8, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &indicesInfo}); + + // Binding 10: reflection sampler (always bind safe fallback) + reflInfo = vk::DescriptorImageInfo{ + .sampler = *defaultTextureResources.textureSampler, + .imageView = *defaultTextureResources.textureImageView, + .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal + }; + dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 10, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &reflInfo}); + + // Binding 11: TLAS (ray-query shadows in raster PBR fragment shader) + tlasHandleValue = accelerationStructureEnabled ? *tlasStructure.handle : vk::AccelerationStructureKHR{}; + tlasInfo.accelerationStructureCount = 1; + tlasInfo.pAccelerationStructures = &tlasHandleValue; + tlasWrite.dstSet = *targetDescriptorSets[frameIndex]; + tlasWrite.dstBinding = 11; + tlasWrite.dstArrayElement = 0; + tlasWrite.descriptorCount = 1; + tlasWrite.descriptorType = vk::DescriptorType::eAccelerationStructureKHR; + tlasWrite.pNext = &tlasInfo; + dstWrites.push_back(tlasWrite); + + // Binding 12/13: Ray-query geometry/material buffers for material-aware raster shadow queries. + // Always bind something valid; shader guards on `ubo.geometryInfoCount/materialCount`. + vk::Buffer fallbackBuf = headersBuf ? 
headersBuf : indicesBuf; + vk::Buffer geoBuf = (!!*geometryInfoBuffer) ? *geometryInfoBuffer : fallbackBuf; + vk::Buffer matBuf = (!!*materialBuffer) ? *materialBuffer : fallbackBuf; + geoInfoInfo = vk::DescriptorBufferInfo{.buffer = geoBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + matInfoInfo = vk::DescriptorBufferInfo{.buffer = matBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 12, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &geoInfoInfo}); + dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 13, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &matInfoInfo}); + }; + + // Optionally write only the UBO (binding 0) — used at safe point to initialize per-frame sets once + if (uboOnly) { + // Avoid re-writing if we already initialized this frame's UBO binding + if (!res.pbrUboBindingWritten[frameIndex]) { + writes.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo}); + } + appendPbrFixedWrites(writes); + if (!writes.empty()) { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(writes, {}); + if (!res.pbrUboBindingWritten[frameIndex]) { + res.pbrUboBindingWritten[frameIndex] = true; + } + if (needFixedWrites) { + res.pbrFixedBindingsWritten[frameIndex] = true; + } + } + return true; + } + + // For full updates (imagesOnly == false), include UBO write; for imagesOnly, skip it + if (!imagesOnly) { + writes.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo}); + } + + auto meshComponent = entity->GetComponent(); + // Determine PBR texture paths in the same manner as createDescriptorSets + std::string legacyPath = 
(meshComponent ? meshComponent->GetTexturePath() : std::string()); + const std::string baseColorPath = (meshComponent && !meshComponent->GetBaseColorTexturePath().empty()) ? meshComponent->GetBaseColorTexturePath() : (!legacyPath.empty() ? legacyPath : SHARED_DEFAULT_ALBEDO_ID); + const std::string mrPath = (meshComponent && !meshComponent->GetMetallicRoughnessTexturePath().empty()) ? meshComponent->GetMetallicRoughnessTexturePath() : SHARED_DEFAULT_METALLIC_ROUGHNESS_ID; + const std::string normalPath = (meshComponent && !meshComponent->GetNormalTexturePath().empty()) ? meshComponent->GetNormalTexturePath() : SHARED_DEFAULT_NORMAL_ID; + const std::string occlusionPath = (meshComponent && !meshComponent->GetOcclusionTexturePath().empty()) ? meshComponent->GetOcclusionTexturePath() : SHARED_DEFAULT_OCCLUSION_ID; + const std::string emissivePath = (meshComponent && !meshComponent->GetEmissiveTexturePath().empty()) ? meshComponent->GetEmissiveTexturePath() : SHARED_DEFAULT_EMISSIVE_ID; + std::array pbrTexturePaths = {baseColorPath, mrPath, normalPath, occlusionPath, emissivePath}; + + for (int j = 0; j < 5; ++j) { + const std::string resolvedBindingPath = ResolveTextureId(pbrTexturePaths[j]); + vk::Sampler samplerHandle{}; + vk::ImageView viewHandle{}; { + std::shared_lock lock(textureResourcesMutex); + auto textureIt = textureResources.find(resolvedBindingPath); + TextureResources* texRes = (textureIt != textureResources.end()) ? 
&textureIt->second : &defaultTextureResources; + samplerHandle = *texRes->textureSampler; + viewHandle = *texRes->textureImageView; + } + imageInfos[j] = {.sampler = samplerHandle, .imageView = viewHandle, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal}; + writes.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = static_cast(j + 1), .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfos[j]}); + } + // Ensure Forward+ light buffer (binding 6) is written for the current frame when available. + // Do this even on imagesOnly updates so set 0 is fully valid for PBR shading. + if (frameIndex < lightStorageBuffers.size() && !!*lightStorageBuffers[frameIndex].buffer) { + vk::DescriptorBufferInfo lightBufferInfo{.buffer = *lightStorageBuffers[frameIndex].buffer, .range = VK_WHOLE_SIZE}; + writes.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 6, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &lightBufferInfo}); + } + appendPbrFixedWrites(writes); { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(writes, {}); + } + if (needFixedWrites) { + res.pbrFixedBindingsWritten[frameIndex] = true; + } + if (!imagesOnly) { + res.pbrUboBindingWritten[frameIndex] = true; + } + } else { + const std::string resolvedTexturePath = ResolveTextureId(texturePath); + vk::Sampler samplerHandle{}; + vk::ImageView viewHandle{}; { + std::shared_lock lock(textureResourcesMutex); + auto textureIt = textureResources.find(resolvedTexturePath); + TextureResources* texRes = (textureIt != textureResources.end()) ? 
&textureIt->second : &defaultTextureResources; + samplerHandle = *texRes->textureSampler; + viewHandle = *texRes->textureImageView; + } + vk::DescriptorImageInfo imageInfo{.sampler = samplerHandle, .imageView = viewHandle, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal}; + if (imagesOnly && !newlyAllocated) { + std::array descriptorWrites = { + vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 1, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfo} + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrites, {}); + } + } else { + // If uboOnly is requested for basic pipeline, only write binding 0 + if (uboOnly) { + if (!res.basicUboBindingWritten[frameIndex]) { + std::array descriptorWrites = { + vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo} + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrites, {}); + } + res.basicUboBindingWritten[frameIndex] = true; + } + return true; + } + std::array descriptorWrites = { + vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo}, + vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 1, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfo} + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrites, {}); + } + res.basicUboBindingWritten[frameIndex] = true; + } + } + return true; +} + +void Renderer::ProcessDirtyDescriptorsForFrame(uint32_t frameIndex) { + if (frameIndex >= 32u) + return; + const uint32_t frameBit = (1u << frameIndex); + + std::vector toProcess; { + 
std::lock_guard lk(dirtyEntitiesMutex); + if (descriptorDirtyEntities.empty()) + return; + toProcess.reserve(descriptorDirtyEntities.size()); + for (auto& [e, mask] : descriptorDirtyEntities) { + if (!!e && (mask & frameBit)) { + toProcess.push_back(e); + } + } + } + + uint32_t processed = 0; + for (Entity* entity : toProcess) { + if (!entity) + continue; + + // Kick watchdog periodically during heavy descriptor processing + if (++processed % 10 == 0) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + } + + auto meshComponent = entity->GetComponent(); + if (!meshComponent) + continue; + // Resolve a texture path to pass for the basic pipeline + std::string basicTexPath = meshComponent->GetTexturePath(); + if (basicTexPath.empty()) + basicTexPath = meshComponent->GetBaseColorTexturePath(); + // Update strategy: + // - Only update the current frame here at the safe point. + // Other frames will be updated at their own safe points to avoid UPDATE_AFTER_BIND violations. + auto entityIt = entityResources.find(entity); + if (entityIt != entityResources.end()) { + updateDescriptorSetsForFrame(entity, entityIt->second, basicTexPath, false, frameIndex, /*imagesOnly=*/true); + updateDescriptorSetsForFrame(entity, entityIt->second, basicTexPath, true, frameIndex, /*imagesOnly=*/true); + } + // Do not touch descriptors for other frames while their command buffers may be pending. + } + + // Clear the processed bit; keep entities dirty until all frames have been refreshed. 
+ { + std::lock_guard lk(dirtyEntitiesMutex); + for (Entity* entity : toProcess) { + auto it = descriptorDirtyEntities.find(entity); + if (it == descriptorDirtyEntities.end()) + continue; + it->second &= ~frameBit; + if (it->second == 0u) { + descriptorDirtyEntities.erase(it); + } + } + } +} + +void Renderer::ProcessPendingTextureJobs(uint32_t maxJobs, + bool includeCritical, + bool includeNonCritical) { + // If the background uploads worker is running, it will handle draining + // texture jobs. Keep this function as a safe no-op for render-thread code + // paths that still call it. + if (!uploadsWorkerThreads.empty() && !stopUploadsWorker.load(std::memory_order_relaxed)) { + return; + } + // Drain the pending job list under lock into a local vector, then + // perform a bounded number of texture loads (including Vulkan work) + // on this thread. This must be called from the main/render thread. + std::vector jobs; { + std::lock_guard lk(pendingTextureJobsMutex); + if (pendingTextureJobs.empty()) { + return; + } + jobs.swap(pendingTextureJobs); + } + + std::vector remaining; + remaining.reserve(jobs.size()); + + uint32_t processed = 0; + uint32_t watchdogCounter = 0; + for (auto& job : jobs) { + // Kick watchdog periodically during heavy texture processing + if (++watchdogCounter % 10 == 0) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + } + + const bool isCritical = (job.priority == PendingTextureJob::Priority::Critical); + if (processed < maxJobs && + ((isCritical && includeCritical) || (!isCritical && includeNonCritical))) { + switch (job.type) { + case PendingTextureJob::Type::FromFile: + // LoadTexture will resolve aliases and perform full GPU upload + LoadTexture(job.idOrPath); + break; + case PendingTextureJob::Type::FromMemory: + // LoadTextureFromMemory will create GPU resources for this ID + LoadTextureFromMemory(job.idOrPath, + job.data.data(), + job.width, + job.height, + job.channels); + break; + } + // 
Refresh descriptors for entities that use this texture so + // streaming uploads become visible in the scene. + OnTextureUploaded(job.idOrPath); + if (isCritical) { + criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed); + } + uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed); + ++processed; + } else { + remaining.emplace_back(std::move(job)); + } + } + + if (!remaining.empty()) { + std::lock_guard lk(pendingTextureJobsMutex); + // Append remaining jobs back to the pending queue + pendingTextureJobs.insert(pendingTextureJobs.end(), + std::make_move_iterator(remaining.begin()), + std::make_move_iterator(remaining.end())); + } +} + +// Record both layout transitions and the copy in a single submission with a fence +void Renderer::uploadImageFromStaging(vk::Buffer staging, + vk::Image image, + vk::Format format, + vk::ArrayProxy regions, + uint32_t mipLevels, + vk::DeviceSize stagedBytes) { + ensureThreadLocalVulkanInit(); + try { + // Start perf window on first upload + if (uploadWindowStartNs.load(std::memory_order_relaxed) == 0) { + auto now = std::chrono::steady_clock::now().time_since_epoch(); + uint64_t nowNs = static_cast(std::chrono::duration_cast(now).count()); + uploadWindowStartNs.store(nowNs, std::memory_order_relaxed); + } + auto t0 = std::chrono::steady_clock::now(); + + // Use a temporary transient command pool for the TRANSFER queue family + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndices.transferFamily.value() + }; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = *tempPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + vk::raii::CommandBuffers cbs(device, allocInfo); + vk::raii::CommandBuffer& cb = cbs[0]; + + vk::CommandBufferBeginInfo beginInfo{.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}; 
+ cb.begin(beginInfo); + + // Barrier: Undefined -> TransferDstOptimal (all mip levels that will be copied) (Sync2) + vk::ImageMemoryBarrier2 toTransfer2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, + .srcAccessMask = vk::AccessFlagBits2::eNone, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eTransferDstOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = { + .aspectMask = (format == vk::Format::eD32Sfloat || format == vk::Format::eD32SfloatS8Uint || format == vk::Format::eD24UnormS8Uint) ? vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor, + .baseMipLevel = 0, + .levelCount = mipLevels, + .baseArrayLayer = 0, + .layerCount = 1 + } + }; + vk::DependencyInfo depToTransfer{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &toTransfer2}; + cb.pipelineBarrier2(depToTransfer); + // Copy + cb.copyBufferToImage(staging, image, vk::ImageLayout::eTransferDstOptimal, regions); + // After copy, if we'll generate mips, keep level 0 in TRANSFER_SRC and leave others in TRANSFER_DST. + // Else transition ALL levels to SHADER_READ_ONLY. 
(Sync2) + const bool willGenerateMips = (mipLevels > 1 && regions.size() == 1); + if (willGenerateMips) { + vk::ImageMemoryBarrier2 postCopy2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eNone, + .oldLayout = vk::ImageLayout::eTransferDstOptimal, + .newLayout = vk::ImageLayout::eTransferSrcOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = { + .aspectMask = (format == vk::Format::eD32Sfloat || format == vk::Format::eD32SfloatS8Uint || format == vk::Format::eD24UnormS8Uint) ? vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1 + } + }; + vk::DependencyInfo depPostCopy{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &postCopy2}; + cb.pipelineBarrier2(depPostCopy); + } else { + vk::ImageMemoryBarrier2 allToSample{ + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eNone, + .oldLayout = vk::ImageLayout::eTransferDstOptimal, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = { + .aspectMask = (format == vk::Format::eD32Sfloat || format == vk::Format::eD32SfloatS8Uint || format == vk::Format::eD24UnormS8Uint) ? 
vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor, + .baseMipLevel = 0, + .levelCount = mipLevels, + .baseArrayLayer = 0, + .layerCount = 1 + } + }; + vk::DependencyInfo depAllToSample{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &allToSample}; + cb.pipelineBarrier2(depAllToSample); + } + cb.end(); + + // Submit once on the TRANSFER queue; signal uploads timeline if available + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + bool canSignalTimeline = !!*uploadsTimeline; + uint64_t signalValue = 0; { + std::lock_guard lock(queueMutex); + vk::SubmitInfo submit{}; + vk::TimelineSemaphoreSubmitInfo timelineInfo{}; // keep alive through submit + if (canSignalTimeline) { + signalValue = uploadTimelineLastSubmitted.fetch_add(1, std::memory_order_relaxed) + 1; + timelineInfo.signalSemaphoreValueCount = 1; + timelineInfo.pSignalSemaphoreValues = &signalValue; + submit.pNext = &timelineInfo; + submit.signalSemaphoreCount = 1; + submit.pSignalSemaphores = &*uploadsTimeline; + } + submit.commandBufferCount = 1; + submit.pCommandBuffers = &*cb; + + transferQueue.submit(submit, *fence); + } + (void) waitForFencesSafe(*fence, VK_TRUE); + + // Perf accounting + auto t1 = std::chrono::steady_clock::now(); + auto ns = std::chrono::duration_cast(t1 - t0).count(); + totalUploadNs.fetch_add(static_cast(ns), std::memory_order_relaxed); + uploadCount.fetch_add(1, std::memory_order_relaxed); + if (stagedBytes > 0) { + bytesUploadedTotal.fetch_add(static_cast(stagedBytes), std::memory_order_relaxed); + } + } catch (const std::exception& e) { + std::cerr << "uploadImageFromStaging failed: " << e.what() << std::endl; + throw; + } +} + +// Generate full mip chain with linear blits (RGBA formats). Assumes level 0 is in TRANSFER_SRC_OPTIMAL. 
+void Renderer::generateMipmaps(vk::Image image, + vk::Format format, + int32_t texWidth, + int32_t texHeight, + uint32_t mipLevels) { + ensureThreadLocalVulkanInit(); + // Verify format supports linear blit + auto props = physicalDevice.getFormatProperties(format); + if ((props.optimalTilingFeatures & vk::FormatFeatureFlagBits::eSampledImageFilterLinear) == vk::FormatFeatureFlags{}) { + return; // no linear filter support; skip + } + + vk::CommandPoolCreateInfo poolInfo{.flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value()}; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{.commandPool = *tempPool, .level = vk::CommandBufferLevel::ePrimary, .commandBufferCount = 1}; + vk::raii::CommandBuffers cbs(device, allocInfo); + vk::raii::CommandBuffer& cb = cbs[0]; + cb.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}); + + int32_t mipW = texWidth; + int32_t mipH = texHeight; + for (uint32_t i = 1; i < mipLevels; ++i) { + // Transition level i to TRANSFER_DST (Sync2) + vk::ImageMemoryBarrier2 toDst2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, .srcAccessMask = vk::AccessFlagBits2::eNone, .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, .oldLayout = vk::ImageLayout::eUndefined, .newLayout = vk::ImageLayout::eTransferDstOptimal, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .image = image, + .subresourceRange = {vk::ImageAspectFlagBits::eColor, i, 1, 0, 1} + }; + vk::DependencyInfo depToDst{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &toDst2}; + cb.pipelineBarrier2(depToDst); + + // Blit from i-1 to i + vk::ImageBlit blit{}; + blit.srcSubresource.aspectMask = vk::ImageAspectFlagBits::eColor; + blit.srcSubresource.mipLevel 
= i - 1; + blit.srcSubresource.baseArrayLayer = 0; + blit.srcSubresource.layerCount = 1; + blit.srcOffsets[0] = vk::Offset3D{0, 0, 0}; + blit.srcOffsets[1] = vk::Offset3D{mipW, mipH, 1}; + blit.dstSubresource.aspectMask = vk::ImageAspectFlagBits::eColor; + blit.dstSubresource.mipLevel = i; + blit.dstSubresource.baseArrayLayer = 0; + blit.dstSubresource.layerCount = 1; + blit.dstOffsets[0] = vk::Offset3D{0, 0, 0}; + blit.dstOffsets[1] = vk::Offset3D{std::max(1, mipW / 2), std::max(1, mipH / 2), 1}; + cb.blitImage(image, vk::ImageLayout::eTransferSrcOptimal, image, vk::ImageLayout::eTransferDstOptimal, blit, vk::Filter::eLinear); + + // Transition previous level to SHADER_READ_ONLY (Sync2) + vk::ImageMemoryBarrier2 prevToRead2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, .srcAccessMask = vk::AccessFlagBits2::eTransferRead, .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, .dstAccessMask = vk::AccessFlagBits2::eShaderRead, .oldLayout = vk::ImageLayout::eTransferSrcOptimal, .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .image = image, .subresourceRange = {vk::ImageAspectFlagBits::eColor, i - 1, 1, 0, 1} + }; + vk::DependencyInfo depPrevToRead{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &prevToRead2}; + cb.pipelineBarrier2(depPrevToRead); + + mipW = std::max(1, mipW / 2); + mipH = std::max(1, mipH / 2); + } + // Transition last level to SHADER_READ_ONLY (Sync2) + vk::ImageMemoryBarrier2 lastToRead2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, .dstAccessMask = vk::AccessFlagBits2::eShaderRead, .oldLayout = vk::ImageLayout::eTransferDstOptimal, .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + 
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .image = image, .subresourceRange = {vk::ImageAspectFlagBits::eColor, mipLevels - 1, 1, 0, 1} + }; + vk::DependencyInfo depLastToRead{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &lastToRead2}; + cb.pipelineBarrier2(depLastToRead); + + cb.end(); + + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + bool canSignalTimeline = !!*uploadsTimeline; + uint64_t signalValue = 0; { + std::lock_guard lock(queueMutex); + vk::SubmitInfo submit{}; + vk::TimelineSemaphoreSubmitInfo timelineInfo{}; // keep alive through submit + if (canSignalTimeline) { + signalValue = uploadTimelineLastSubmitted.fetch_add(1, std::memory_order_relaxed) + 1; + timelineInfo.signalSemaphoreValueCount = 1; + timelineInfo.pSignalSemaphoreValues = &signalValue; + submit.pNext = &timelineInfo; + submit.signalSemaphoreCount = 1; + submit.pSignalSemaphores = &*uploadsTimeline; + } + submit.commandBufferCount = 1; + submit.pCommandBuffers = &*cb; + graphicsQueue.submit(submit, *fence); + } + (void) waitForFencesSafe(*fence, VK_TRUE); +} diff --git a/attachments/openxr_engine/shaders/common_types.slang b/attachments/openxr_engine/shaders/common_types.slang new file mode 100644 index 00000000..c0638a04 --- /dev/null +++ b/attachments/openxr_engine/shaders/common_types.slang @@ -0,0 +1,205 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Common types and structures shared between rasterization and ray query shaders +// This module contains data structures that match the CPU-side definitions + +// Light data structure for storage buffer +struct LightData { + [[vk::offset(0)]] float4 position; + [[vk::offset(16)]] float4 color; + [[vk::offset(32)]] column_major float4x4 lightSpaceMatrix; + [[vk::offset(96)]] float4 direction; + [[vk::offset(112)]] int lightType; + [[vk::offset(116)]] float range; + [[vk::offset(120)]] float innerConeAngle; + [[vk::offset(124)]] float outerConeAngle; +}; + +// Uniform buffer object +struct UniformBufferObject { + float4x4 model; + float4x4 viewProjections[4]; // Matches Chapter 8 tutorial + float4x4 views[4]; + float4x4 projs[4]; + float4 camPoses[4]; + float exposure; + float gamma; + float prefilteredCubeMipLevels; + float scaleIBLAmbient; + int lightCount; + int padding0; + float padding1; + float padding2; + float2 screenDimensions; + float nearZ; + float farZ; + float slicesZ; + float _pad3; + // Planar reflections + float4x4 reflectionVP; // projection * mirroredView + int reflectionEnabled; // 1 when sampling reflection in main pass + int reflectionPass; // 1 during reflection render pass + float2 _reflectPad0; + float4 clipPlaneWS; // world-space plane ax+by+cz+d=0 + // Controls + float reflectionIntensity; // scales reflection mix in glass + int enableRayQueryReflections; // 1 to enable reflections in ray query mode + int enableRayQueryTransparency; // 1 to enable transparency/refraction in ray query mode + float _padReflect[1]; + // Ray-query specific: number of entries in geometryInfoBuffer (per-instance) + int geometryInfoCount; + // Keep CPU/GPU layout identical to C++ (renderer.h) + int _padGeo0; + int _padGeo1; + int _padGeo2; + float4 _rqReservedWorldPos; + // Ray-query specific: number of materials in materialBuffer (for bounds) + int 
materialCount; + int _padMat0; + int _padMat1; + int _padMat2; +}; + +// Push constants for material properties +struct PushConstants { + float4 baseColorFactor; + float metallicFactor; + float roughnessFactor; + int baseColorTextureSet; + int physicalDescriptorTextureSet; + int normalTextureSet; + int occlusionTextureSet; + int emissiveTextureSet; + float alphaMask; + float alphaMaskCutoff; + float3 emissiveFactor; + float emissiveStrength; + float transmissionFactor; + int useSpecGlossWorkflow; + float glossinessFactor; + float3 specularFactor; + float ior; + bool hasEmissiveStrengthExt; +}; + +// Forward+ per-tile header +struct TileHeader { + uint offset; + uint count; + uint pad0; + uint pad1; +}; + +// Ray-query shared buffers (also reused by raster PBR when doing ray-query shadows) +// Layout must match the CPU-side structs in `renderer.h` and the definitions in `ray_query.slang`. +struct GeometryInfo { + uint64_t vertexBufferAddress; + uint64_t indexBufferAddress; + uint vertexCount; + uint materialIndex; + uint indexCount; // number of indices in the index buffer + uint _pad0; + // Instance -> world normal transform (3 columns; xyz used, w unused) + float4 normalMatrix0; + float4 normalMatrix1; + float4 normalMatrix2; +}; + +struct MaterialData { + float3 albedo; + float metallic; + float3 emissive; + float roughness; + float ao; + float ior; + float emissiveStrength; + float alpha; + float transmissionFactor; + float alphaCutoff; + int alphaMode; // 0=OPAQUE, 1=MASK, 2=BLEND (matches glTF) + uint isGlass; + uint isLiquid; + + // Thick-glass parameters (RQ-only) + float3 absorptionColor; // Color after traveling absorptionDistance in the medium (1=none) + float absorptionDistance; // Distance at which absorptionColor applies (meters) + uint thinWalled; // 1 = thin surface (no thickness), 0 = thick volume + + // Raster parity: texture-set flags (-1 = no texture; 0 = sample from texture table) + int baseColorTextureSet; + int physicalDescriptorTextureSet; + 
int normalTextureSet; + int occlusionTextureSet; + int emissiveTextureSet; + + // Ray Query texture table indices (binding 6) + int baseColorTexIndex; + int normalTexIndex; + int physicalTexIndex; + int occlusionTexIndex; + int emissiveTexIndex; + + // Specular-glossiness workflow support + int useSpecGlossWorkflow; + float glossinessFactor; + float3 specularFactor; + int hasEmissiveStrengthExt; + uint _padMat[3]; +}; + +// Constants +static const float PI = 3.14159265359; + +// Matrix inverse utility (4x4 only) +float4x4 inverse(float4x4 m) { + float n11 = m[0][0], n12 = m[1][0], n13 = m[2][0], n14 = m[3][0]; + float n21 = m[0][1], n22 = m[1][1], n23 = m[2][1], n24 = m[3][1]; + float n31 = m[0][2], n32 = m[1][2], n33 = m[2][2], n34 = m[3][2]; + float n41 = m[0][3], n42 = m[1][3], n43 = m[2][3], n44 = m[3][3]; + + float t11 = n23 * n34 * n42 - n24 * n33 * n42 + n24 * n32 * n43 - n22 * n34 * n43 - n23 * n32 * n44 + n22 * n33 * n44; + float t12 = n14 * n33 * n42 - n13 * n34 * n42 - n14 * n32 * n43 + n12 * n34 * n43 + n13 * n32 * n44 - n12 * n33 * n44; + float t13 = n13 * n24 * n42 - n14 * n23 * n42 + n14 * n22 * n43 - n12 * n24 * n43 - n13 * n22 * n44 + n12 * n23 * n44; + float t14 = n14 * n23 * n32 - n13 * n24 * n32 - n14 * n22 * n33 + n12 * n24 * n33 + n13 * n22 * n34 - n12 * n23 * n34; + + float det = n11 * t11 + n21 * t12 + n31 * t13 + n41 * t14; + float idet = 1.0 / det; + + float4x4 ret; + ret[0][0] = t11 * idet; + ret[0][1] = (n24 * n33 * n41 - n23 * n34 * n41 - n24 * n31 * n43 + n21 * n34 * n43 + n23 * n31 * n44 - n21 * n33 * n44) * idet; + ret[0][2] = (n22 * n34 * n41 - n24 * n32 * n41 + n24 * n31 * n42 - n21 * n34 * n42 - n22 * n31 * n44 + n21 * n32 * n44) * idet; + ret[0][3] = (n23 * n32 * n41 - n22 * n33 * n41 - n23 * n31 * n42 + n21 * n33 * n42 + n22 * n31 * n43 - n21 * n32 * n43) * idet; + + ret[1][0] = t12 * idet; + ret[1][1] = (n13 * n34 * n41 - n14 * n33 * n41 + n14 * n31 * n43 - n11 * n34 * n43 - n13 * n31 * n44 + n11 * n33 * n44) * idet; + 
ret[1][2] = (n14 * n32 * n41 - n12 * n34 * n41 - n14 * n31 * n42 + n11 * n34 * n42 + n12 * n31 * n44 - n11 * n32 * n44) * idet; + ret[1][3] = (n12 * n33 * n41 - n13 * n32 * n41 + n13 * n31 * n42 - n11 * n33 * n42 - n12 * n31 * n43 + n11 * n32 * n43) * idet; + + ret[2][0] = t13 * idet; + ret[2][1] = (n14 * n23 * n41 - n13 * n24 * n41 - n14 * n21 * n43 + n11 * n24 * n43 + n13 * n21 * n44 - n11 * n23 * n44) * idet; + ret[2][2] = (n12 * n24 * n41 - n14 * n22 * n41 + n14 * n21 * n42 - n11 * n24 * n42 - n12 * n21 * n44 + n11 * n22 * n44) * idet; + ret[2][3] = (n13 * n22 * n41 - n12 * n23 * n41 - n13 * n21 * n42 + n11 * n23 * n42 + n12 * n21 * n43 - n11 * n22 * n43) * idet; + + ret[3][0] = t14 * idet; + ret[3][1] = (n13 * n24 * n31 - n14 * n23 * n31 + n14 * n21 * n33 - n11 * n24 * n33 - n13 * n21 * n34 + n11 * n23 * n34) * idet; + ret[3][2] = (n14 * n22 * n31 - n12 * n24 * n31 - n14 * n21 * n32 + n11 * n24 * n32 + n12 * n21 * n34 - n11 * n22 * n34) * idet; + ret[3][3] = (n12 * n23 * n31 - n13 * n22 * n31 + n13 * n21 * n32 - n11 * n23 * n32 - n12 * n21 * n33 + n11 * n22 * n33) * idet; + + return ret; +} diff --git a/attachments/openxr_engine/shaders/composite.slang b/attachments/openxr_engine/shaders/composite.slang new file mode 100644 index 00000000..01f32133 --- /dev/null +++ b/attachments/openxr_engine/shaders/composite.slang @@ -0,0 +1,74 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Fullscreen composite pass: samples the off-screen opaque color and writes to swapchain + +import tonemapping_utils; + +struct VSOut { + float4 Position : SV_POSITION; + float2 UV : TEXCOORD0; +}; + +// Export entrypoint for vertex stage +[shader("vertex")] VSOut VSMain(uint vid : SV_VertexID) +{ + // Fullscreen triangle (no vertex buffer) + float2 pos = float2( (vid == 2) ? 3.0 : -1.0, + (vid == 1) ? 3.0 : -1.0 ); + float2 uv = float2( (vid == 2) ? 2.0 : 0.0, + (vid == 1) ? 2.0 : 0.0 ); + VSOut o; + o.Position = float4(pos, 0.0, 1.0); + o.UV = uv; + return o; +} + +// Set 0, binding 0: combined image sampler for the off-screen scene color +[[vk::binding(0, 0)]] Sampler2D sceneColor; + +struct Push { + float exposure; + float gamma; + int outputIsSRGB; // 1 when the color attachment is SRGB; 0 otherwise + float _pad; // pad to 16 bytes for push constant layout +}; +[[vk::push_constant]] Push pushConsts; + +// Export entrypoint for fragment stage +[shader("fragment")] float4 PSMain(VSOut i) : SV_TARGET +{ + float4 c = sceneColor.Sample(i.UV); + float3 color = c.rgb; + + // Apply exposure and filmic tonemapping + color *= pushConsts.exposure; + + // Uncharted2 / Hable filmic tonemap, canonical form + float3 t = Hable_Filmic_Tonemapping::Uncharted2Tonemap(color); + float3 w = Hable_Filmic_Tonemapping::Uncharted2Tonemap(float3(1,1,1) * Hable_Filmic_Tonemapping::W); + color = t / max(w, float3(1e-6, 1e-6, 1e-6)); + + // If the attachment is NOT SRGB, encode gamma here. When it is SRGB, + // the hardware will encode at store so we keep color in linear space. 
+ if (pushConsts.outputIsSRGB == 0) { + color = pow(max(color, 0.0), float3(1.0 / pushConsts.gamma)); + } else { + color = saturate(color); + } + return float4(color, 1.0); +} diff --git a/attachments/openxr_engine/shaders/forward_plus_cull.slang b/attachments/openxr_engine/shaders/forward_plus_cull.slang new file mode 100644 index 00000000..86eb1e5e --- /dev/null +++ b/attachments/openxr_engine/shaders/forward_plus_cull.slang @@ -0,0 +1,136 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +// Forward+ tiled light culling (2D tiles) + +// Use the shared `LightData`/`TileHeader` definitions +import common_types; + +// Params packed by the engine (see updateForwardPlusParams) +struct FPParams { + column_major float4x4 view; + column_major float4x4 proj; + float4 screenTile; // x=width,y=height,z=tileX,w=tileY + uint4 counts; // x=lightCount,y=maxPerTile,z=tilesX,w=tilesY + float4 zParams; // x=nearZ, y=farZ, z=slicesZ, w=0 +}; + +[[vk::binding(0, 0)]] StructuredBuffer lightsRO; +[[vk::binding(1, 0)]] RWStructuredBuffer tileHeadersRW; +[[vk::binding(2, 0)]] RWStructuredBuffer tileLightIndicesRW; +[[vk::binding(3, 0)]] ConstantBuffer params; + +// NOTE: This implementation performs a conservative 2D geometric test per tile: +// it projects each point light to screen-space and computes an approximate screen-space +// radius from its world-space range using the projection matrix. A light is included in +// a tile if the circle intersects the tile rectangle. Depth/cluster slicing can be added later. + +[numthreads(1, 1, 1)] +void main(uint3 DTid : SV_DispatchThreadID) +{ + uint tilesX = params.counts.z; + uint tilesY = params.counts.w; + uint slicesZ = (uint)params.zParams.z; + uint maxPerTile = params.counts.y; + uint lightCount = params.counts.x; + + uint cx = min(DTid.x, (tilesX > 0) ? tilesX - 1 : 0); + uint cy = min(DTid.y, (tilesY > 0) ? tilesY - 1 : 0); + uint cz = (slicesZ > 0) ? min(DTid.z, slicesZ - 1) : 0; + + uint tileId = (cz * tilesY + cy) * tilesX + cx; + + // Screen and tile metrics + float2 screenSize = params.screenTile.xy; + float2 tileSize = params.screenTile.zw; // (tileXSize, tileYSize) + float2 tileMin = float2(cx, cy) * tileSize; + float2 tileMax = tileMin + tileSize; + + uint base = tileId * maxPerTile; + uint count = 0; + + // Precompute projection scaling terms to estimate screen-space radius + // For a perspective matrix, proj[0][0] and proj[1][1] scale x/y by f/z. 
+ float projXX = params.proj[0][0]; + float projYY = params.proj[1][1]; + + // Log-sliced depth range for this cluster (positive distances) + float nearZ = max(params.zParams.x, 1e-3); + float farZ = max(params.zParams.y, nearZ + 1e-3); + float fcz0 = (slicesZ > 0) ? (float(cz) / float(slicesZ)) : 0.0; + float fcz1 = (slicesZ > 0) ? (float(cz + 1) / float(slicesZ)) : 1.0; + float sliceNear = exp(lerp(log(nearZ), log(farZ), fcz0)); + float sliceFar = exp(lerp(log(nearZ), log(farZ), fcz1)); + + // Iterate over all lights and append those intersecting this tile + [loop] + for (uint li = 0; li < lightCount; ++li) + { + if (count >= maxPerTile) { break; } + + LightData L = lightsRO[li]; + + // Only point and spot lights have finite range spheres; treat directional as global (include all tiles/slices) + bool isDirectional = (L.lightType == 1); + bool includeAll = isDirectional; + + float2 centerPx = float2(0.0, 0.0); + float radiusPx = 1e9; // huge for directional + bool zOverlap = true; + + if (!includeAll) + { + // Transform light center to view space + float4 posVS = mul(params.view, float4(L.position.xyz, 1.0)); + + // Use positive depth distance + float z = max(1e-3, abs(posVS.z)); + + // Z overlap test with this slice + float zMin = max(0.0, z - L.range); + float zMax = z + L.range; + zOverlap = (zMax >= sliceNear) && (zMin <= sliceFar); + + // Project to clip then NDC + float4 clip = mul(params.proj, float4(posVS.xyz, 1.0)); + float invW = (clip.w != 0.0) ? 
rcp(clip.w) : 0.0; + float2 ndc = clip.xy * invW; // [-1,1] + centerPx = (ndc * 0.5 + 0.5) * screenSize; // pixels + + // Approximate screen-space radius from world radius (range) + float rx = abs(L.range * projXX / z) * (screenSize.x * 0.5); + float ry = abs(L.range * projYY / z) * (screenSize.y * 0.5); + radiusPx = max(rx, ry); + } + + // Circle vs axis-aligned rectangle overlap test (conservative) + float2 closest = clamp(centerPx, tileMin, tileMax); + float2 d = closest - centerPx; + float dist2 = dot(d, d); + if (zOverlap && dist2 <= radiusPx * radiusPx) + { + tileLightIndicesRW[base + count] = li; + count++; + } + } + + // Write header + TileHeader hdr; + hdr.offset = base; + hdr.count = count; + hdr.pad0 = 0; hdr.pad1 = 0; + tileHeadersRW[tileId] = hdr; +} diff --git a/attachments/openxr_engine/shaders/hrtf.slang b/attachments/openxr_engine/shaders/hrtf.slang new file mode 100644 index 00000000..3ebd59fc --- /dev/null +++ b/attachments/openxr_engine/shaders/hrtf.slang @@ -0,0 +1,262 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +// Compute shader for HRTF (Head-Related Transfer Function) audio processing +// This shader processes audio data to create 3D spatial audio effects + +// Input/output buffer bindings +[[vk::binding(0, 0)]] RWStructuredBuffer inputAudioBuffer; // Raw audio samples +[[vk::binding(1, 0)]] RWStructuredBuffer outputAudioBuffer; // Processed audio samples +[[vk::binding(2, 0)]] StructuredBuffer hrtfData; // HRTF impulse responses +[[vk::binding(3, 0)]] ConstantBuffer params; // HRTF parameters + +// Parameters for HRTF processing - MUST match CPU GPUHRTFParams structure exactly +struct HRTFParams { + float4 listenerPosition; // Position of the listener (float[4] on CPU) - 16 bytes + float4 listenerForward; // Forward direction of the listener (float[4] on CPU) - 16 bytes + float4 listenerUp; // Up direction of the listener (float[4] on CPU) - 16 bytes + float4 sourcePosition; // Position of the sound source (float[4] on CPU) - 16 bytes + float sampleCount; // Number of samples to process (4 bytes) - offset 64 + float3 padding1; // Padding to align to 16-byte boundary (12 bytes) - offset 68 + uint inputChannels; // Number of input channels (4 bytes) - offset 80 + uint outputChannels; // Number of output channels (4 bytes) - offset 84 + uint hrtfSize; // Size of each HRTF impulse response (4 bytes) - offset 88 + uint numHrtfPositions; // Number of HRTF positions (4 bytes) - offset 92 + float distanceAttenuation; // Distance attenuation factor (4 bytes) - offset 96 + float dopplerFactor; // Doppler effect factor (4 bytes) - offset 100 + float reverbMix; // Reverb mix factor (4 bytes) - offset 104 + float padding2; // Padding to complete 16-byte alignment (4 bytes) - offset 108 +}; + +// Helper function to calculate the index of the closest HRTF in the dataset +uint FindClosestHRTF(float azimuth, float elevation) { + // This is a simplified implementation + // In a real implementation, this would find the closest HRTF in the dataset + // based on the azimuth and 
elevation angles + + // Normalize azimuth to [0, 360) degrees + azimuth = fmod(azimuth + 360.0, 360.0); + + // Clamp elevation to [-90, 90] degrees + elevation = clamp(elevation, -90.0, 90.0); + + // Calculate indices based on a typical HRTF dataset layout + // Assuming 10-degree resolution in azimuth and 15-degree in elevation + uint azimuthIndex = uint(round(azimuth / 10.0)) % 36; + uint elevationIndex = uint(round((elevation + 90.0) / 15.0)) % 13; + + // Calculate the final index + return elevationIndex * 36 + azimuthIndex; +} + +// Helper function to calculate azimuth and elevation angles +void CalculateAngles(float3 sourceDir, float3 listenerForward, float3 listenerUp, out float azimuth, out float elevation) { + // Simplified angle calculation - directly use source direction + // Calculate azimuth (horizontal angle) - angle around Y axis + azimuth = atan2(sourceDir.x, -sourceDir.z) * 57.2957795; // Convert to degrees, negate z for correct orientation + + // Calculate elevation (vertical angle) - angle from horizontal plane + float horizontalLength = sqrt(sourceDir.x * sourceDir.x + sourceDir.z * sourceDir.z); + elevation = atan2(sourceDir.y, horizontalLength) * 57.2957795; // Convert to degrees +} + +// Main compute shader function +[shader("compute")] +[numthreads(64, 1, 1)] +void main(uint3 dispatchThreadID : SV_DispatchThreadID) { + uint index = dispatchThreadID.x; + + // Check if the thread is within bounds + if (index >= uint(params.sampleCount)) { + return; + } + + // STAGE 1: HRTF DATA ACCESS WITH SAFETY VALIDATION + // Start with working basic panning and add HRTF data access + + // Get input sample for this thread + float inputSample = inputAudioBuffer[index]; + + // STAGE 1: Test HRTF data buffer access with ultra-safe bounds checking + bool hrtfDataValid = false; + float testHrtfSample = 0.0f; + + // Ultra-safe HRTF data access test + if (params.hrtfSize > 0 && params.numHrtfPositions > 0) { + // Test access to first HRTF sample with multiple safety 
checks + uint testHrtfIndex = 0; // Start with first sample + uint maxHrtfBufferSize = params.numHrtfPositions * params.hrtfSize * 2; // 2 channels + + if (testHrtfIndex < maxHrtfBufferSize && testHrtfIndex < 500000) { // Additional hardcoded safety limit + testHrtfSample = hrtfData[testHrtfIndex]; + hrtfDataValid = true; + } + } + + // STAGE 2: 3D DIRECTION CALCULATION AND ANGLE COMPUTATION + // Calculate 3D direction from listener to source + float3 sourceDir = params.sourcePosition.xyz - params.listenerPosition.xyz; + float distance = length(sourceDir); + + // Handle edge case where listener and source are at same position + if (distance < 0.001) { + sourceDir = float3(0.0, 0.0, -1.0); // Default to front direction + distance = 1.0; + } else { + sourceDir = normalize(sourceDir); + } + + // Calculate azimuth and elevation angles using the helper function + float azimuth, elevation; + CalculateAngles(sourceDir, params.listenerForward.xyz, params.listenerUp.xyz, azimuth, elevation); + + + // ENHANCED SPATIAL PROCESSING: Use 3D angles for better panning + float leftGain = 1.0; + float rightGain = 1.0; + + // Convert azimuth to left/right panning (-180 to +180 degrees) + // Positive azimuth = right side, negative = left side + if (azimuth > 0.0) { + // Source is to the right, reduce left channel based on angle + float rightness = min(1.0, azimuth / 90.0); // Normalize to 0-1 for 0-90 degrees + leftGain = max(0.2, 1.0 - rightness * 0.8); // Reduce left by up to 80% + rightGain = 1.0; + } else if (azimuth < 0.0) { + // Source is to the left, reduce right channel based on angle + float leftness = min(1.0, -azimuth / 90.0); // Normalize to 0-1 for 0-90 degrees + leftGain = 1.0; + rightGain = max(0.2, 1.0 - leftness * 0.8); // Reduce right by up to 80% + } + + // Apply distance attenuation (closer sources are louder) + float distanceAttenuation = 1.0 / max(1.0, distance * 0.5); // Gentle distance falloff + leftGain *= distanceAttenuation; + rightGain *= 
distanceAttenuation; + + // STAGE 3: HRTF INDEX LOOKUP WITH BOUNDS CHECKING + // Find the closest HRTF in the dataset based on calculated angles + uint hrtfIndex = FindClosestHRTF(azimuth, elevation); + + // Ultra-safe bounds checking for HRTF index + bool hrtfIndexValid = false; + if (hrtfIndex < params.numHrtfPositions && params.numHrtfPositions > 0) { + hrtfIndexValid = true; + } + + // ENHANCED HRTF DATA ACCESS: Use calculated index instead of just first sample + float hrtfLeftSample = 0.0f; + float hrtfRightSample = 0.0f; + bool hrtfSamplesValid = false; + + if (hrtfIndexValid && hrtfDataValid) { + // Calculate HRTF buffer offsets for left and right channels + // HRTF data layout: [position0_left_samples][position0_right_samples][position1_left_samples]... + uint leftChannelOffset = hrtfIndex * params.hrtfSize * 2; // 2 channels per position + uint rightChannelOffset = leftChannelOffset + params.hrtfSize; + + // Ultra-safe bounds checking for HRTF sample access + uint maxHrtfBufferSize = params.numHrtfPositions * params.hrtfSize * 2; + if (leftChannelOffset < maxHrtfBufferSize && rightChannelOffset < maxHrtfBufferSize && + leftChannelOffset < 500000 && rightChannelOffset < 500000) { // Additional hardcoded safety + + // Access first sample of each channel's impulse response for this position + hrtfLeftSample = hrtfData[leftChannelOffset]; + hrtfRightSample = hrtfData[rightChannelOffset]; + hrtfSamplesValid = true; + } + } + + // STAGE 4: HRTF CONVOLUTION LOOP WITH ULTRA-SAFE MEMORY ACCESS + float leftConvolution = 0.0f; + float rightConvolution = 0.0f; + uint convolutionSamples = 0; + + if (hrtfIndexValid && hrtfDataValid && params.hrtfSize > 0) { + // Calculate base offsets for this HRTF position + uint leftChannelBase = hrtfIndex * params.hrtfSize * 2; + uint rightChannelBase = leftChannelBase + params.hrtfSize; + uint maxHrtfBufferSize = params.numHrtfPositions * params.hrtfSize * 2; + + // Limit convolution size for safety and performance + uint 
safeHrtfSize = min(params.hrtfSize, 32u); // Limit to 32 samples for safety + + // HRTF Convolution loop with ultra-safe bounds checking + for (uint i = 0; i < safeHrtfSize; i++) { + // Check if we can access the input audio sample + if (index >= i) { + uint inputIndex = index - i; + + // Ultra-safe input buffer bounds check + if (inputIndex < uint(params.sampleCount) && inputIndex < 1024) { + float audioSample = inputAudioBuffer[inputIndex]; + + // Calculate HRTF sample indices with bounds checking + uint leftHrtfIndex = leftChannelBase + i; + uint rightHrtfIndex = rightChannelBase + i; + + // Ultra-safe HRTF buffer bounds check + if (leftHrtfIndex < maxHrtfBufferSize && rightHrtfIndex < maxHrtfBufferSize && + leftHrtfIndex < 500000 && rightHrtfIndex < 500000) { + + float leftHrtfSample = hrtfData[leftHrtfIndex]; + float rightHrtfSample = hrtfData[rightHrtfIndex]; + + // Apply convolution + leftConvolution += audioSample * leftHrtfSample; + rightConvolution += audioSample * rightHrtfSample; + convolutionSamples++; + } + } + } + } + + } + + // STAGE 4: Apply convolution results with distance attenuation + if (convolutionSamples > 0) { + // Use convolution results instead of simple gain modification + leftGain = leftConvolution * distanceAttenuation; + rightGain = rightConvolution * distanceAttenuation; + } + + + // STAGE 5: COMPLETE HRTF PROCESSING - FINAL OUTPUT WITH OPTIMIZATION + // Write to both output channels with full HRTF processing + for (uint channel = 0; channel < 2; channel++) { // Hardcode to 2 channels for safety + uint outputIndex = index * 2 + channel; + + // Ultra-safe bounds check with hardcoded limits + if (outputIndex < 1024 * 2 && outputIndex < 2048) { + float finalSample = 0.0f; + + if (convolutionSamples > 0) { + // STAGE 5: Use full HRTF convolution results + finalSample = (channel == 0) ? 
leftGain : rightGain; + + // Apply output normalization to prevent clipping + finalSample = clamp(finalSample, -1.0f, 1.0f); + } else { + // Fallback: Enhanced spatial panning + float channelGain = (channel == 0) ? leftGain : rightGain; + finalSample = inputSample * channelGain; + } + + outputAudioBuffer[outputIndex] = finalSample; + } + } + +} diff --git a/attachments/openxr_engine/shaders/imgui.slang b/attachments/openxr_engine/shaders/imgui.slang new file mode 100644 index 00000000..3d10f7cc --- /dev/null +++ b/attachments/openxr_engine/shaders/imgui.slang @@ -0,0 +1,66 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+// Combined vertex and fragment shader for ImGui rendering
+
+// Input from vertex buffer
+struct VSInput {
+    float2 Position : POSITION;  // 2D position in ImGui display coordinates
+    float2 UV : TEXCOORD0;
+    float4 Color : COLOR0;       // per-vertex tint (multiplied with the texture sample below)
+};
+
+// Output from vertex shader / Input to fragment shader
+struct VSOutput {
+    float4 Position : SV_POSITION;
+    float2 UV : TEXCOORD0;
+    float4 Color : COLOR0;
+};
+
+// Push constants for transformation
+struct PushConstants {
+    float2 Scale;      // NOTE(review): presumably 2/display_size — confirm against the CPU-side ImGui backend
+    float2 Translate;  // NOTE(review): presumably -1 - pos*Scale offset into clip space — confirm
+};
+
+// Bindings
+[[vk::push_constant]] PushConstants pushConstants;
+[[vk::binding(0, 0)]] Sampler2D fontTexture;  // set 0, binding 0: ImGui font atlas (combined image sampler)
+
+// Vertex shader entry point
+[[shader("vertex")]]
+VSOutput VSMain(VSInput input)
+{
+    VSOutput output;
+
+    // Transform position: affine map from ImGui display coords to clip space (z=0, w=1)
+    output.Position = float4(input.Position * pushConstants.Scale + pushConstants.Translate, 0.0, 1.0);
+
+    // Pass UV and color to fragment shader
+    output.UV = input.UV;
+    output.Color = input.Color;
+
+    return output;
+}
+
+// Fragment shader entry point
+[[shader("fragment")]]
+float4 PSMain(VSOutput input) : SV_TARGET
+{
+    // Sample font texture and multiply by color (alpha comes from both factors)
+    float4 color = input.Color * fontTexture.Sample(input.UV);
+    return color;
+}
diff --git a/attachments/openxr_engine/shaders/lighting.slang b/attachments/openxr_engine/shaders/lighting.slang
new file mode 100644
index 00000000..5d673cc8
--- /dev/null
+++ b/attachments/openxr_engine/shaders/lighting.slang
@@ -0,0 +1,116 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +// Combined vertex and fragment shader for basic/legacy lighting +// This shader implements the Phong lighting model as a fallback when BRDF/PBR is disabled +// Note: BRDF/PBR is now the default lighting model - this is used only when explicitly requested + +// Input from vertex buffer +struct VSInput { + float3 Position : POSITION; + float3 Normal : NORMAL; + float2 TexCoord : TEXCOORD0; + float4 Tangent : TANGENT; // Added to match vertex layout (unused in basic lighting) +}; + +// Output from vertex shader / Input to fragment shader +struct VSOutput { + float4 Position : SV_POSITION; + float3 WorldPos : POSITION; + float3 Normal : NORMAL; + float2 TexCoord : TEXCOORD0; + float4 Tangent : TANGENT; // Pass through tangent (unused in basic lighting) +}; + +// Uniform buffer for transformation matrices and light information +struct UniformBufferObject { + float4x4 model; + float4x4 view; + float4x4 proj; + float4 lightPos; + float4 lightColor; + float4 viewPos; +}; + +// Push constants for material properties +struct PushConstants { + float4 ambientColor; + float4 diffuseColor; + float4 specularColor; + float shininess; +}; + +// Bindings +[[vk::binding(0, 0)]] ConstantBuffer ubo; +[[vk::binding(1, 0)]] Sampler2D texSampler; + +// Push constants +[[vk::push_constant]] PushConstants material; + +// Vertex shader entry point +[[shader("vertex")]] +VSOutput VSMain(VSInput input) +{ + VSOutput output; + + // Transform position to clip space + float4 worldPos = mul(ubo.model, float4(input.Position, 1.0)); + output.Position = mul(ubo.proj, mul(ubo.view, worldPos)); + + // Pass world position to fragment shader + output.WorldPos = worldPos.xyz; + + // Transform normal to world space + output.Normal = normalize(mul((float3x3)ubo.model, input.Normal)); + + // Pass texture coordinates + output.TexCoord = input.TexCoord; + + // Pass tangent (unused in basic lighting 
but required for vertex layout compatibility) + output.Tangent = input.Tangent; + + return output; +} + +// Fragment shader entry point +[[shader("fragment")]] +float4 PSMain(VSOutput input) : SV_TARGET +{ + // Sample texture + float4 texColor = texSampler.Sample(input.TexCoord); + + // Normalize vectors + float3 normal = normalize(input.Normal); + float3 lightDir = normalize(ubo.lightPos.xyz - input.WorldPos); + float3 viewDir = normalize(ubo.viewPos.xyz - input.WorldPos); + float3 reflectDir = reflect(-lightDir, normal); + + // Ambient + float3 ambient = material.ambientColor.rgb * ubo.lightColor.rgb; + + // Diffuse + float diff = max(dot(normal, lightDir), 0.0); + float3 diffuse = diff * material.diffuseColor.rgb * ubo.lightColor.rgb; + + // Specular + float spec = pow(max(dot(viewDir, reflectDir), 0.0), material.shininess); + float3 specular = spec * material.specularColor.rgb * ubo.lightColor.rgb; + + // Combine components + float3 result = (ambient + diffuse + specular) * texColor.rgb; + + return float4(result, texColor.a); +} diff --git a/attachments/openxr_engine/shaders/lighting_utils.slang b/attachments/openxr_engine/shaders/lighting_utils.slang new file mode 100644 index 00000000..1ecb3fae --- /dev/null +++ b/attachments/openxr_engine/shaders/lighting_utils.slang @@ -0,0 +1,119 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +// Lighting utilities for evaluating lights and accumulating BRDF contributions +// Shared between rasterization and ray query shaders + +import common_types; +import pbr_utils; + +// Result of evaluating a single light +struct LightEvaluation { + float3 L; // Direction to light (normalized) + float3 radiance; // Incident radiance + float NdotL; // Clamped N·L for BRDF evaluation + bool valid; // True if light contributes +}; + +// Evaluate a single light at a world position +// Returns light direction, radiance, and N·L +LightEvaluation evaluateLight(LightData light, float3 worldPos, float3 N) { + LightEvaluation result; + result.valid = false; + + if (light.lightType == 1) { + // Directional light + result.L = normalize(-light.position.xyz); + result.radiance = light.color.rgb; + result.valid = true; + } else { + // Point/spot/emissive light: position.xyz is light position in world space + float3 toLight = light.position.xyz - worldPos; + float d = length(toLight); + result.L = (d > 1e-5) ? 
toLight / d : float3(0, 0, 1); + + if (light.lightType == 3) { + // Emissive light: soft falloff using range as characteristic radius + float r = max(light.range, 0.001); + float att = 1.0 / (1.0 + (d / r) * (d / r)); + result.radiance = light.color.rgb * att; + result.valid = true; + } else if (light.lightType == 0 || light.lightType == 2) { + // Point or spot light: inverse square falloff with range windowing + float attenuation = 1.0 / max(d * d, 0.0001); + + // GLTF style range attenuation + if (light.range > 0.0) { + attenuation *= pow(saturate(1.0 - pow(d / light.range, 4.0)), 2.0); + } + + result.radiance = light.color.rgb * attenuation; + + if (light.lightType == 2) { + // Spot light cone attenuation + float3 D = normalize(light.direction.xyz); + float cd = dot(D, -result.L); + float cosInner = cos(light.innerConeAngle); + float cosOuter = cos(light.outerConeAngle); + float spotAttenuation = saturate((cd - cosOuter) / max(cosInner - cosOuter, 0.0001)); + spotAttenuation *= spotAttenuation; + result.radiance *= spotAttenuation; + } + result.valid = true; + } + } + + if (result.valid) { + // For emissive lights, treat lighting as two-sided to avoid self-occlusion + float rawDot = dot(N, result.L); + result.NdotL = (light.lightType == 3) ? 
abs(rawDot) : max(rawDot, 0.0); + result.valid = (result.NdotL > 0.0); + } + + return result; +} + +// Accumulate lighting contribution from a single light using GGX BRDF +// Adds diffuse and specular contributions to the provided accumulators +void accumulateLighting( + LightEvaluation lightEval, + float3 N, + float3 V, + float3 albedo, + float metallic, + float roughness, + float3 F0, + inout float3 diffuseLighting, + inout float3 specularLighting) +{ + if (!lightEval.valid) return; + + float3 H = normalize(V + lightEval.L); + float NdotV = max(dot(N, V), 0.0); + float NdotH = max(dot(N, H), 0.0); + float HdotV = max(dot(H, V), 0.0); + + // GGX microfacet BRDF + float D = DistributionGGX(NdotH, roughness); + float G = GeometrySmith(NdotV, lightEval.NdotL, roughness); + float3 F = FresnelSchlick(HdotV, F0); + + float3 spec = (D * G * F) / max(4.0 * NdotV * lightEval.NdotL, 0.0001); + float3 kD = (1.0 - F) * (1.0 - metallic); + + specularLighting += spec * lightEval.radiance * lightEval.NdotL; + diffuseLighting += (kD * albedo / PI) * lightEval.radiance * lightEval.NdotL; +} diff --git a/attachments/openxr_engine/shaders/pbr.slang b/attachments/openxr_engine/shaders/pbr.slang new file mode 100644 index 00000000..83c705e2 --- /dev/null +++ b/attachments/openxr_engine/shaders/pbr.slang @@ -0,0 +1,625 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +// Import shared utility modules +import common_types; +import pbr_utils; +import lighting_utils; +import tonemapping_utils; + +// Input from vertex buffer +struct VSInput { + [[vk::location(0)]] float3 Position; + [[vk::location(1)]] float3 Normal; + [[vk::location(2)]] float2 UV; + [[vk::location(3)]] float4 Tangent; + + // Per-instance data. Model matrix is a true 4x4, while the normal + // matrix is provided as three float4 columns that match the CPU + // layout (glm::mat3x4: 3 columns of vec4, xyz used, w unused). + [[vk::location(4)]] column_major float4x4 InstanceModelMatrix; // binding 1 (uses 4 locations) + [[vk::location(8)]] float4 InstanceNormal0; // normal matrix column 0 + [[vk::location(9)]] float4 InstanceNormal1; // normal matrix column 1 + [[vk::location(10)]] float4 InstanceNormal2; // normal matrix column 2 +}; + +// Output from vertex shader / Input to fragment shader +struct VSOutput { + float4 Position : SV_POSITION; + float3 WorldPos; + float3 Normal : NORMAL; + float3 GeometricNormal : NORMAL1; + float2 UV : TEXCOORD0; + float4 Tangent : TANGENT; + uint ViewID : SV_ViewID; +}; + +[[vk::binding(0, 1)]] Sampler2D opaqueSceneColor; + +// Bindings +[[vk::binding(0, 0)]] ConstantBuffer ubo; +[[vk::binding(1, 0)]] Sampler2D baseColorMap; +[[vk::binding(2, 0)]] Sampler2D metallicRoughnessMap; +[[vk::binding(3, 0)]] Sampler2D normalMap; +[[vk::binding(4, 0)]] Sampler2D occlusionMap; +[[vk::binding(5, 0)]] Sampler2D emissiveMap; +[[vk::binding(6, 0)]] StructuredBuffer lightBuffer; +// Forward+ per-tile light lists (same set 0 to keep pipeline layouts compact) +[[vk::binding(7, 0)]] StructuredBuffer tileHeaders; +[[vk::binding(8, 0)]] StructuredBuffer tileLightIndices; +// Planar reflection sampler (bound only when reflections are enabled) +[[vk::binding(10, 0)]] Sampler2D reflectionMap; + +// Raster ray-query shadows: TLAS +[[vk::binding(11, 0)]] RaytracingAccelerationStructure tlas; + +// Ray-query shared buffers (used for material-aware 
raster shadow queries) +[[vk::binding(12, 0)]] StructuredBuffer geometryInfoBuffer; +[[vk::binding(13, 0)]] StructuredBuffer materialBuffer; + +[[vk::push_constant]] PushConstants material; + +static const float RASTER_SHADOW_EPS = 0.001; + +// Hard shadow query for raster fragment shading. +// NOTE: We intentionally treat NON_OPAQUE candidates as non-occluding here. +// To make glass/transmissive surfaces not block light, those instances should +// be flagged as FORCE_NO_OPAQUE in the TLAS build. +bool traceShadowOccluded(float3 origin, float3 direction, float tMin, float tMax) +{ + RayDesc ray; + ray.Origin = origin; + ray.Direction = direction; + ray.TMin = tMin; + ray.TMax = tMax; + + RayQuery q; + // Match TLAS instance masking convention from `renderer_ray_query.cpp`: + // 0x01 = regular scene geometry, 0x02 = environment/sky. + // For raster shadows, ignore the environment to avoid global false occlusion. + uint mask = 0x01; + q.TraceRayInline( + tlas, + RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH, + mask, + ray + ); + + int iter = 0; + while (q.Proceed() && iter < 64) + { + iter++; + // No special acceptance rules here; accept-first-hit will commit on the first opaque blocker. + } + return (q.CommittedStatus() == COMMITTED_TRIANGLE_HIT); +} + +// Vertex shader entry point +[[shader("vertex")]] +VSOutput VSMain(VSInput input, uint viewID : SV_ViewID) +{ + VSOutput output; + output.ViewID = viewID; + float4x4 instanceModelMatrix = input.InstanceModelMatrix; + float4 worldPos = mul(ubo.model, mul(instanceModelMatrix, float4(input.Position, 1.0))); + output.Position = mul(ubo.viewProjections[viewID], worldPos); + output.WorldPos = worldPos.xyz; + + // Transform normals correctly: first by the per-instance normal matrix, + // then by the entity model 3x3 (avoid double-applying instance transform). 
+ float3x3 instNormal = float3x3(input.InstanceNormal0.xyz, input.InstanceNormal1.xyz, input.InstanceNormal2.xyz); + float3x3 model3x3 = (float3x3)ubo.model; + float3 worldNormal = normalize(mul(model3x3, mul(instNormal, input.Normal))); + output.Normal = worldNormal; + + // Geometric normal (pre-normal-map) uses the same transform path. + output.GeometricNormal = worldNormal; + + // Transform tangent similarly (approximate with same normal transform path). + float3 worldTangent = normalize(mul(model3x3, mul(instNormal, input.Tangent.xyz))); + output.UV = input.UV; + output.Tangent = float4(worldTangent, input.Tangent.w); + return output; +} + +// Fragment shader entry point for generic PBR materials +[[shader("fragment")]] +float4 PSMain(VSOutput input) : SV_TARGET +{ + // --- 1. Material Properties --- + float2 uv = float2(input.UV.x, 1.0 - input.UV.y); + float4 baseColor = (material.baseColorTextureSet < 0) ? material.baseColorFactor : baseColorMap.Sample(uv) * material.baseColorFactor; + float4 mrOrSpecGloss = (material.physicalDescriptorTextureSet < 0) ? 
float4(1.0, 1.0, 1.0, 1.0) : metallicRoughnessMap.Sample(uv); + float metallic = 0.0, roughness = 1.0; + float3 F0, albedo; + + if (material.useSpecGlossWorkflow != 0) { + float3 specColorSG = mrOrSpecGloss.rgb * material.specularFactor; + float gloss = clamp(mrOrSpecGloss.a * material.glossinessFactor, 0.0, 1.0); + roughness = clamp(1.0 - gloss, 0.0, 1.0); + F0 = specColorSG; + albedo = baseColor.rgb * (1.0 - max(F0.r, max(F0.g, F0.b))); + } else { + // glTF metallic-roughness texture packs metallic in B, roughness in G (linear space) + float metallicTex = mrOrSpecGloss.b; + float roughnessTex = mrOrSpecGloss.g; + metallic = clamp(metallicTex * material.metallicFactor, 0.0, 1.0); + roughness = clamp(roughnessTex * material.roughnessFactor, 0.0, 1.0); + F0 = lerp(float3(0.04, 0.04, 0.04), baseColor.rgb, metallic); + albedo = baseColor.rgb * (1.0 - metallic); + } + + float ao = (material.occlusionTextureSet < 0) ? 1.0 : occlusionMap.Sample(uv).r; + + // Emissive: default to constant white when no emissive texture so authored emissiveFactor works per glTF spec. + // If a texture is present but factor is zero, assume (1,1,1) to preserve emissive textures by default. + float3 emissiveTex = (material.emissiveTextureSet < 0) ? float3(1.0, 1.0, 1.0) : emissiveMap.Sample(uv).rgb; + float3 emissiveFactor = material.emissiveFactor; + float3 emissive = emissiveTex * emissiveFactor; + if (material.hasEmissiveStrengthExt) + emissive *= material.emissiveStrength; + + if (material.alphaMask > 0.5 && baseColor.a < material.alphaMaskCutoff) { discard; } + + // --- 2. Normal Calculation --- + float3 N = normalize(input.Normal); + if (material.normalTextureSet >= 0) { + float3 tangentNormal = normalMap.Sample(uv).xyz * 2.0 - 1.0; + float3 T = normalize(input.Tangent.xyz); + // We flip the V coordinate for all textures (uv.y -> 1-uv.y). In + // tangent space, this corresponds to inverting the bitangent. 
+ // glTF's tangent.w encodes the sign of the bitangent relative to + // cross(N,T) in the *unflipped* UV space, so we must negate it here + // to keep the normal map oriented correctly after the V flip. + float handedness = -input.Tangent.w; + float3 B = normalize(cross(N, T)) * handedness; + float3x3 TBN = float3x3(T, B, N); + N = normalize(mul(TBN, tangentNormal)); + } + + // Geometric (non-normal-mapped) normal for large-scale effects like Fresnel, + // refraction and screen-space reflections. This stays stable across pixels + // on a flat pane and helps avoid flickering when rotating the camera. + float3 G = normalize(input.GeometricNormal); + + // --- 3. Opaque Lighting Calculation --- + float3 V = normalize(ubo.camPoses[input.ViewID].xyz - input.WorldPos); + + float3 diffuseLighting = float3(0.0, 0.0, 0.0); + float3 specularLighting = float3(0.0, 0.0, 0.0); + + // Forward+: compute tile id and iterate culled light list + const uint TILE = 16u; // must match engine configuration + uint tilesX = (uint(ubo.screenDimensions.x) + TILE - 1u) / TILE; + uint tilesY = (uint(ubo.screenDimensions.y) + TILE - 1u) / TILE; + + // SV_POSITION in the fragment stage is in window coordinates. Use robust integer index. + uint px = (uint)max(0.0, input.Position.x); + uint py = (uint)max(0.0, input.Position.y); + uint tileX = (tilesX > 0u) ? min(px / TILE, tilesX - 1u) : 0u; + uint tileY = (tilesY > 0u) ? 
min(py / TILE, tilesY - 1u) : 0u; + uint totalTiles = max(tilesX * tilesY, 1u); + + // Clustered Z slice index from view-space depth (positive distance) + float dVS = abs(mul(ubo.views[input.ViewID], float4(input.WorldPos, 1.0)).z); + float lnN = log(max(ubo.nearZ, 1e-4)); + float lnF = log(max(ubo.farZ, lnN + 1e-4)); + float denom = max(lnF - lnN, 1e-6); + float slices = max(ubo.slicesZ, 1.0); + float lambda = saturate((log(max(dVS, 1e-4)) - lnN) / denom); + uint slice = (uint)clamp(floor(lambda * slices), 0.0, slices - 1.0); + + uint tileId = (slice * tilesY + tileY) * tilesX + tileX; + + // CPU toggles Forward+ via ubo.padding1 (0 = disabled, non-zero = enabled) + bool useForwardPlus = (ubo.padding1 != 0.0); + + uint base = 0u; + uint count = 0u; + if (useForwardPlus && tileId < totalTiles * (uint)slices) { + TileHeader th = tileHeaders[tileId]; + base = th.offset; + count = th.count; + } + + bool forceGlobal = false; + + // Accumulate per-light diffuse and specular terms using GGX microfacet BRDF. + if (useForwardPlus && !forceGlobal && count > 0) { + // Use Forward+ culled list + for (uint li = 0u; li < count; ++li) { + uint lightIndex = tileLightIndices[base + li]; + LightData light = lightBuffer[lightIndex]; + float3 L, radiance; + float distToLight = 10000.0; + if (light.lightType == 1) { + // Directional + L = normalize(-light.position.xyz); + radiance = light.color.rgb; + } else { + // Point/spot/emissive: position.xyz is light position in world space + float3 toLight = light.position.xyz - input.WorldPos; + float d = length(toLight); + L = (d > 1e-5) ? 
toLight / d : float3(0,0,1); + distToLight = d; + + float attenuation = 1.0; + if (light.lightType == 3) { + // Emissive: soft falloff using range as a characteristic radius + float r = max(light.range, 0.001); + attenuation = 1.0 / (1.0 + (d / r) * (d / r)); + } else { + attenuation = 1.0 / max(d * d, 0.0001); + // GLTF style range attenuation + if (light.range > 0.0) { + attenuation *= pow(saturate(1.0 - pow(d / light.range, 4.0)), 2.0); + } + } + radiance = light.color.rgb * attenuation; + + if (light.lightType == 2) { + // Spot light cone attenuation + float3 D = normalize(light.direction.xyz); + float cd = dot(D, -L); + float cosInner = cos(light.innerConeAngle); + float cosOuter = cos(light.outerConeAngle); + float spotAttenuation = saturate((cd - cosOuter) / max(cosInner - cosOuter, 0.0001)); + spotAttenuation *= spotAttenuation; + radiance *= spotAttenuation; + } + } + // For emissive lights, treat lighting as two-sided to avoid glass/self-occlusion issues + float rawDot = dot(N, L); + float NdotL = (light.lightType == 3) ? abs(rawDot) : max(rawDot, 0.0); + + if (NdotL > 0.0) { + float visibility = 1.0; + // Raster ray-query shadows are expensive. In Bistro, most lights are emissive + // and casting per-light shadows drops FPS drastically. Shadow only the directional + // light (sun) for now. + if (ubo.padding2 != 0.0 && light.lightType == 1) { + float tMaxShadow = (light.lightType == 1) ? 10000.0 : max(distToLight - RASTER_SHADOW_EPS, RASTER_SHADOW_EPS); + float3 shadowOrigin = input.WorldPos + N * RASTER_SHADOW_EPS; + bool occluded = traceShadowOccluded(shadowOrigin, L, RASTER_SHADOW_EPS, tMaxShadow); + visibility = occluded ? 
0.0 : 1.0; + } + + float3 H = normalize(V + L); + float NdotV = max(dot(N, V), 0.0); + float NdotH = max(dot(N, H), 0.0); + float HdotV = max(dot(H, V), 0.0); + float D = DistributionGGX(NdotH, roughness); + float G = GeometrySmith(NdotV, NdotL, roughness); + float3 F = FresnelSchlick(HdotV, F0); + float3 spec = (D * G * F) / max(4.0 * NdotV * NdotL, 0.0001); + float3 kD = (1.0 - F) * (1.0 - metallic); + specularLighting += spec * radiance * NdotL * visibility; + diffuseLighting += (kD * albedo / PI) * radiance * NdotL * visibility; + } + } + } + // Global light loop (fallback or forced debug) + // Fallback when Forward+ list is empty but lights exist and not in single-tile mode, + // OR always when forceGlobal flag is enabled. + // If Forward+ is disabled, always use global lights. + // If Forward+ is enabled but lists are empty (e.g., before first dispatch), fall back to global. + if (forceGlobal || !useForwardPlus || (count == 0 && ubo.lightCount > 0)) { + // Fallback path when Forward+ is disabled or lists are not populated yet + for (uint li = 0u; li < (uint)ubo.lightCount; ++li) { + LightData light = lightBuffer[li]; + float3 L, radiance; + float distToLight = 10000.0; + if (light.lightType == 1) { + L = normalize(-light.position.xyz); + radiance = light.color.rgb; + } else { + float3 toLight = light.position.xyz - input.WorldPos; + float d = length(toLight); + L = (d > 1e-5) ? 
toLight / d : float3(0,0,1); + distToLight = d; + + float attenuation = 1.0; + if (light.lightType == 3) { + float r = max(light.range, 0.001); + attenuation = 1.0 / (1.0 + (d / r) * (d / r)); + } else { + attenuation = 1.0 / max(d * d, 0.0001); + // GLTF style range attenuation + if (light.range > 0.0) { + attenuation *= pow(saturate(1.0 - pow(d / light.range, 4.0)), 2.0); + } + } + radiance = light.color.rgb * attenuation; + + if (light.lightType == 2) { + // Spot light cone attenuation + float3 D = normalize(light.direction.xyz); + float cd = dot(D, -L); + float cosInner = cos(light.innerConeAngle); + float cosOuter = cos(light.outerConeAngle); + float spotAttenuation = saturate((cd - cosOuter) / max(cosInner - cosOuter, 0.0001)); + spotAttenuation *= spotAttenuation; + radiance *= spotAttenuation; + } + } + float NdotL = (light.lightType == 3) ? abs(dot(N, L)) : max(dot(N, L), 0.0); + if (NdotL > 0.0) { + float visibility = 1.0; + if (ubo.padding2 != 0.0 && light.lightType == 1) { + float tMaxShadow = (light.lightType == 1) ? 10000.0 : max(distToLight - RASTER_SHADOW_EPS, RASTER_SHADOW_EPS); + float3 shadowOrigin = input.WorldPos + N * RASTER_SHADOW_EPS; + bool occluded = traceShadowOccluded(shadowOrigin, L, RASTER_SHADOW_EPS, tMaxShadow); + visibility = occluded ? 0.0 : 1.0; + } + + float3 H = normalize(V + L); + float NdotV = max(dot(N, V), 0.0); + float NdotH = max(dot(N, H), 0.0); + float HdotV = max(dot(H, V), 0.0); + float D = DistributionGGX(NdotH, roughness); + float G = GeometrySmith(NdotV, NdotL, roughness); + float3 F = FresnelSchlick(HdotV, F0); + float3 spec = (D * G * F) / max(4.0 * NdotV * NdotL, 0.0001); + float3 kD = (1.0 - F) * (1.0 - metallic); + specularLighting += spec * radiance * NdotL * visibility; + diffuseLighting += (kD * albedo / PI) * radiance * NdotL * visibility; + } + } + } + + float3 ambient = albedo * ao * (0.1 * ubo.scaleIBLAmbient); + float3 opaqueLit = diffuseLighting + specularLighting + ambient + emissive; + + // --- 4. 
Final Color Assembly (opaque only; transmission handled in GlassPSMain) --- + float3 color = opaqueLit; + float alphaOut = baseColor.a; + + // Clip-plane discard during reflection render pass (to remove behind-plane geometry) + if (ubo.reflectionPass == 1) { + float side = dot(ubo.clipPlaneWS, float4(input.WorldPos, 1.0)); + if (side > 0.0) discard; // discard geometry on the positive side of the plane + } + + // Note: reflections are only applied in glass path (GlassPSMain). No planar reflection + // sampling here to avoid banding/aliasing and ensure user-requested behavior. + + // --- 5. Post-Processing --- + // Output linear color for intermediate buffers (composite pass will tonemap) + return float4(color, alphaOut); +} + +// Fragment shader entry point specialized for architectural glass. +// Shares the same inputs and bindings as PSMain, but uses a much simpler +// and more stable shading model: primarily refraction of the opaque scene +// with a small ambient/emissive surface term. Direct diffuse/specular +// lighting and screen-space reflections are omitted to avoid global +// bright/dark flashes across large glass surfaces. +[[shader("fragment")]] +float4 GlassPSMain(VSOutput input) : SV_TARGET +{ + // --- 1. Material / texture sampling (minimal subset) --- + float2 uv = float2(input.UV.x, 1.0 - input.UV.y); + + float4 baseColor = (material.baseColorTextureSet < 0) + ? material.baseColorFactor + : baseColorMap.Sample(uv) * material.baseColorFactor; + + // Emissive (same logic as PSMain) + float3 emissiveTex = (material.emissiveTextureSet < 0) + ? 
float3(1.0, 1.0, 1.0) + : emissiveMap.Sample(uv).rgb; + float3 emissiveFactor = material.emissiveFactor; + float3 emissive = emissiveTex * emissiveFactor; + if (material.hasEmissiveStrengthExt) + emissive *= material.emissiveStrength; + + // Alpha mask discard as in PSMain + if (material.alphaMask > 0.5 && baseColor.a < material.alphaMaskCutoff) { + discard; + } + + // Geometric normal for view-angle dependence and refraction + float3 G = normalize(input.GeometricNormal); + float3 V = normalize(ubo.camPoses[input.ViewID].xyz - input.WorldPos); + + // Base albedo used for transmission tint + float3 albedo = baseColor.rgb; + + // Ambient is intentionally disabled for the glass path. + // Even small ambient terms can make large glass surfaces look "filled in" + // (frosted/opaque) rather than primarily showing the background through refraction. + + // Transmission factor from push constants. + // Some assets flag “glass” via engine-side heuristics but may not author + // `KHR_materials_transmission`. Since this shader is only used for glass, + // derive a robust effective transmission so glass never goes black. + float T_auth = clamp(material.transmissionFactor, 0.0, 1.0); + float opacity = clamp(baseColor.a, 0.0, 1.0); + float T_fromAlpha = 1.0 - opacity; + float T_eff = max(T_auth, T_fromAlpha); + if (T_eff < 0.01) { + // Default to mostly transmissive for glass when no explicit transmission/alpha is authored. + T_eff = 0.90; + } + + float3 color; + float alphaOut = baseColor.a; + + if (T_eff > 0.0) { + // Transmission/background sample (refraction approximation): sample the opaque scene behind glass. + float2 uvR = input.Position.xy / ubo.screenDimensions; + uvR = clamp(uvR, float2(0.0, 0.0), float2(1.0, 1.0)); + float3 bg = opaqueSceneColor.Sample(uvR).rgb; + // Tint the background by albedo to approximate colored glass. 
+ bg *= lerp(float3(1.0, 1.0, 1.0), max(albedo, 0.6), 0.8); + + // Planar reflection sample (optional) + float3 refl = bg; + if (ubo.reflectionEnabled == 1) { + float4 pr = mul(ubo.reflectionVP, float4(input.WorldPos, 1.0)); + float2 uvP = pr.xy / max(pr.w, 1e-5); + uvP = uvP * 0.5 + 0.5; + if (uvP.x >= 0.0 && uvP.x <= 1.0 && uvP.y >= 0.0 && uvP.y <= 1.0) { + refl = reflectionMap.Sample(uvP).rgb; + } + } + + // Stylized, stable glass: Use a tinted + // glass body + rim highlight, then add planar reflection contribution. + + // Use symmetric |N·V| so that front/back views of thin glass walls + // behave consistently (important when looking down into glasses). + float NdotV = abs(dot(G, V)); + + // Base clear color from albedo, slightly dimmed so glass does not + // appear self-emissive. + float3 clearColor = albedo * 0.6; + + // Rim term stronger at grazing angles (1 - NdotV)^3, but keep it subtle + float edge = pow(1.0 - NdotV, 3.0); + float3 rimColor = lerp(clearColor, float3(1.0, 1.0, 1.0), 0.25); + + // Surface term: keep subtle so glass does not appear frosted. + float3 surfaceBase = emissive; + float3 surfaceTerm = surfaceBase * (1.0 - T_eff) * 0.12; + + // Base surface appearance (slight body + rim) and transmitted background. + float3 glassBody = clearColor * 0.08; + float3 rim = rimColor * (edge * 0.25); + float3 surface = glassBody + rim + surfaceTerm; + + // Primary transmission mix: this is what makes interior lighting visible through windows. + color = lerp(surface, bg, T_eff); + + // Restore Fresnel-blended mixing with boosted visibility for debugging/tuning. 
+ float3 F_view2 = FresnelSchlick(NdotV, float3(0.06, 0.06, 0.06)); + float F_avg2 = (F_view2.r + F_view2.g + F_view2.b) / 3.0; + float reflStrength = saturate(0.20 + (1.5 * F_avg2) * (1.0 - material.roughnessFactor)); + // Scale by user-controlled intensity + reflStrength *= max(0.0, ubo.reflectionIntensity); + color = lerp(color, refl, reflStrength); + + // Fresnel influences alpha (how opaque the glass appears), not color here. + // We already used F to modulate reflection strength above. + + // Opacity model for architectural glass: mostly transparent at + // normal incidence, with a gentle Fresnel-driven increase in + // opacity toward grazing angles. TransmissionFactor controls how + // much of the underlying scene shows through overall. + + // Since we are sampling the background (opaqueSceneColor) and mixing it in the shader, + // we should output an alpha of 1.0 to ensure our mixed color is shown correctly + // in the swapchain, avoiding "double blending" with the hardware blender. + alphaOut = 1.0; + } else { + // Non-transmissive fallback: just ambient + emissive. + color = emissive; + } + + // Simple Forward+ lighting for glass (additive), using per-tile lists. + // This is a pragmatic lighting contribution so emissive bulbs can light glass-covered pixels. + // It does not model full transmission; it simply adds local diffuse+spec highlights. + { + const uint TILE = 16u; + uint tilesX = (uint(ubo.screenDimensions.x) + TILE - 1u) / TILE; + uint tilesY = (uint(ubo.screenDimensions.y) + TILE - 1u) / TILE; + uint px = (uint)max(0.0, input.Position.x); + uint py = (uint)max(0.0, input.Position.y); + uint tileX = (tilesX > 0u) ? min(px / TILE, tilesX - 1u) : 0u; + uint tileY = (tilesY > 0u) ? 
min(py / TILE, tilesY - 1u) : 0u; + uint totalTiles = max(tilesX * tilesY, 1u); + uint tileId = tileY * tilesX + tileX; + uint base = 0u; + uint count = 0u; + if (tileId < totalTiles) { + TileHeader th = tileHeaders[tileId]; + base = th.offset; + count = th.count; + } + if (count > 0u) { + float3 Ng = normalize(input.GeometricNormal); + float3 Vv = normalize(ubo.camPoses[input.ViewID].xyz - input.WorldPos); + // Use a neutral albedo to avoid darkening glass; weight specular more + float3 alb = float3(0.6, 0.6, 0.6); + float rough = 0.49; + float metal = 0.0; + for (uint li = 0u; li < count; ++li) { + uint lightIndex = tileLightIndices[base + li]; + LightData light = lightBuffer[lightIndex]; + float3 L, radiance; + float distToLight = 10000.0; + if (light.lightType == 1) { + L = normalize(-light.position.xyz); + radiance = light.color.rgb; + } else { + float3 toLight = light.position.xyz - input.WorldPos; + float d = length(toLight); + L = (d > 1e-5) ? toLight / d : float3(0,0,1); + distToLight = d; + if (light.lightType == 3) { + float r = max(light.range, 0.001); + float att = 1.0 / (1.0 + (d / r) * (d / r)); + radiance = light.color.rgb * att; + } else { + radiance = light.color.rgb / max(d * d, 0.0001); + } + } + float rawDot = dot(Ng, L); + float NdotL = (light.lightType == 3) ? abs(rawDot) : max(rawDot, 0.0); + if (NdotL > 0.0) { + float visibility = 1.0; + if (ubo.padding2 != 0.0) { + float tMaxShadow = (light.lightType == 1) ? 10000.0 : max(distToLight - RASTER_SHADOW_EPS, RASTER_SHADOW_EPS); + float3 shadowOrigin = input.WorldPos + Ng * RASTER_SHADOW_EPS; + bool occluded = traceShadowOccluded(shadowOrigin, L, RASTER_SHADOW_EPS, tMaxShadow); + visibility = occluded ? 
0.0 : 1.0; + } + + float3 H = normalize(Vv + L); + float NdotV = max(dot(Ng, Vv), 0.0); + float NdotH = max(dot(Ng, H), 0.0); + float HdotV = max(dot(H, Vv), 0.0); + float D = DistributionGGX(NdotH, rough); + float G = GeometrySmith(NdotV, NdotL, rough); + float3 F = FresnelSchlick(HdotV, lerp(float3(0.04,0.04,0.04), alb, metal)); + float3 spec = (D * G * F) / max(4.0 * NdotV * NdotL, 0.0001); + float3 kD = (1.0 - F) * (1.0 - metal); + // Add a modest contribution to the glass color + color += ((kD * alb / PI) * radiance * NdotL * 0.6 + spec * radiance * NdotL * 0.8) * visibility; + } + } + } + } + + + // --- 3. Post-processing (same as PSMain) --- + color *= ubo.exposure; + + // Uncharted2 / Hable filmic tonemap. Use the canonical form without + // the extra 1.2 pre-scale so that midtones and shadows are not + // over-compressed relative to highlights. + float3 t = Hable_Filmic_Tonemapping::Uncharted2Tonemap(color); + float3 w = Hable_Filmic_Tonemapping::Uncharted2Tonemap(float3(1,1,1) * Hable_Filmic_Tonemapping::W); + color = t / max(w, float3(1e-6, 1e-6, 1e-6)); + + if (ubo.padding0 == 0) { + color = pow(saturate(color), float3(1.0 / ubo.gamma)); + } else { + color = saturate(color); + } + + return float4(color, alphaOut); +} diff --git a/attachments/openxr_engine/shaders/pbr_utils.slang b/attachments/openxr_engine/shaders/pbr_utils.slang new file mode 100644 index 00000000..59846a20 --- /dev/null +++ b/attachments/openxr_engine/shaders/pbr_utils.slang @@ -0,0 +1,55 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// PBR utility functions for physically-based rendering +// Shared between rasterization and ray query shaders + +import common_types; + +// GGX/Trowbridge-Reitz normal distribution function +// Describes the distribution of microfacet normals +float DistributionGGX(float NdotH, float roughness) { + float a = roughness * roughness; + float a2 = a * a; + float NdotH2 = NdotH * NdotH; + float nom = a2; + float denom = (NdotH2 * (a2 - 1.0) + 1.0); + denom = PI * denom * denom; + return nom / max(denom, 0.000001); +} + +// Smith's geometry function with Schlick-GGX approximation +// Describes the self-shadowing of microfacets +float GeometrySmith(float NdotV, float NdotL, float roughness) { + float r = roughness + 1.0; + float k = (r * r) / 8.0; + float ggx1 = NdotV / (NdotV * (1.0 - k) + k); + float ggx2 = NdotL / (NdotL * (1.0 - k) + k); + return ggx1 * ggx2; +} + +// Fresnel-Schlick approximation +// Describes the ratio of reflected vs refracted light +float3 FresnelSchlick(float cosTheta, float3 F0) { + return F0 + (1.0 - F0) * pow(saturate(1.0 - cosTheta), 5.0); +} + +// Fresnel for dielectric materials (given IOR) +float3 Fresnel_Dielectric(float cosTheta, float ior) { + float r0 = (1.0 - ior) / (1.0 + ior); + float3 F0 = float3(r0 * r0); + return F0 + (1.0 - F0) * pow(saturate(1.0 - cosTheta), 5.0); +} diff --git a/attachments/openxr_engine/shaders/physics.slang b/attachments/openxr_engine/shaders/physics.slang new file mode 100644 index 00000000..ce8c9e34 --- /dev/null +++ b/attachments/openxr_engine/shaders/physics.slang @@ -0,0 +1,460 
/* Copyright (c) 2025 Holochip Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 the "License";
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// Compute shader for physics simulation
// This shader processes rigid body physics data to simulate physical interactions

// Physics data structure
struct PhysicsData {
    float4 position;        // xyz = position, w = inverse mass (0 => static / infinite mass)
    float4 rotation;        // quaternion (xyz = vector part, w = scalar part)
    float4 linearVelocity;  // xyz = velocity, w = restitution
    float4 angularVelocity; // xyz = angular velocity, w = friction
    float4 force;           // xyz = accumulated force, w = is kinematic (0 or 1)
    float4 torque;          // xyz = torque, w = use gravity (0 or 1)
    float4 colliderData;    // type-specific data (x = sphere radius, xyz = box half extents, w = collider type)
    float4 colliderData2;   // additional collider data (xyz = local collider offset)
};

// Collision data structure
struct CollisionData {
    uint bodyA;
    uint bodyB;
    float4 contactNormal; // xyz = normal pointing from A to B, w = penetration depth
    float4 contactPoint;  // xyz = contact point, w = unused
};

// Parameters for the physics simulation.
// Declared BEFORE the ConstantBuffer binding that uses it (use-before-declaration fix).
struct PhysicsParams {
    float deltaTime;     // Time step - 4 bytes
    uint numBodies;      // Number of rigid bodies - 4 bytes
    uint maxCollisions;  // Maximum number of collisions - 4 bytes
    float padding;       // Explicit padding to align gravity to 16-byte boundary - 4 bytes
    float4 gravity;      // Gravity vector (xyz) + padding (w) - 16 bytes
    // Total: 32 bytes (aligned to 16-byte boundaries for std140 layout)
};

// Input/output buffer bindings.
// Element types restored from usage: physicsBuffer[i] yields PhysicsData,
// collisionBuffer[i] = CollisionData, pairBuffer[i] = uint2, counterBuffer holds uints.
[[vk::binding(0, 0)]] RWStructuredBuffer<PhysicsData> physicsBuffer;    // Physics data
[[vk::binding(1, 0)]] RWStructuredBuffer<CollisionData> collisionBuffer; // Collision data
[[vk::binding(2, 0)]] RWStructuredBuffer<uint2> pairBuffer;             // Potential collision pairs
[[vk::binding(3, 0)]] RWStructuredBuffer<uint> counterBuffer;           // [0] = pair count, [1] = collision count
[[vk::binding(4, 0)]] ConstantBuffer<PhysicsParams> params;

// Hamilton product of two quaternions (q1 * q2), xyz = vector part, w = scalar.
float4 quatMul(float4 q1, float4 q2) {
    return float4(
        q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y,
        q1.w * q2.y - q1.x * q2.z + q1.y * q2.w + q1.z * q2.x,
        q1.w * q2.z + q1.x * q2.y - q1.y * q2.x + q1.z * q2.w,
        q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z
    );
}

// Normalize a quaternion; degenerate (near-zero) input collapses to identity.
float4 quatNormalize(float4 q) {
    float len = length(q);
    if (len > 0.0001) {
        return q / len;
    }
    return float4(0, 0, 0, 1);
}

// Integration shader - semi-implicit Euler update of velocities then positions.
[shader("compute")]
[numthreads(64, 1, 1)]
void IntegrateCS(uint3 dispatchThreadID : SV_DispatchThreadID) {
    uint index = dispatchThreadID.x;

    // Guard against threads beyond the body count.
    if (index >= params.numBodies) {
        return;
    }

    PhysicsData body = physicsBuffer[index];

    // Kinematic bodies are driven externally; never integrated here.
    if (body.force.w > 0.5) {
        return;
    }

    // Apply gravity if enabled.
    // BUGFIX: gravity is an acceleration, independent of mass. The original code
    // computed force = g * position.w (position.w is INVERSE mass, see ResolveCS
    // impulse math), which after the force integration below yielded
    // dv = g * invMass^2 * dt — heavy bodies fell slower. Apply g*dt directly.
    // Bodies with invMass == 0 (static) are excluded.
    if (body.torque.w > 0.5 && body.position.w > 0.0) {
        body.linearVelocity.xyz += params.gravity.xyz * params.deltaTime;
    }

    // Integrate external forces (a = F * invMass).
    body.linearVelocity.xyz += body.force.xyz * body.position.w * params.deltaTime;
    body.angularVelocity.xyz += body.torque.xyz * params.deltaTime; // Simplified, should use inertia tensor

    // Apply damping (per-step multiplicative decay).
    const float linearDamping = 0.01;
    const float angularDamping = 0.01;
    body.linearVelocity.xyz *= (1.0 - linearDamping);
    body.angularVelocity.xyz *= (1.0 - angularDamping);

    // Integrate velocities into position.
    body.position.xyz += body.linearVelocity.xyz * params.deltaTime;

    // Update rotation: dq/dt = 0.5 * omega_quat * q, then renormalize.
    float4 angularVelocityQuat = float4(body.angularVelocity.xyz * 0.5, 0.0);
    float4 rotationDelta = quatMul(angularVelocityQuat, body.rotation);
    body.rotation = quatNormalize(body.rotation + rotationDelta * params.deltaTime);

    physicsBuffer[index] = body;
}

// Compute a world-space AABB for a body (rotation is ignored — see comments).
void computeAABB(PhysicsData body, out float3 aabbMin, out float3 aabbMax) {
    // Default to a small AABB around the body position.
    aabbMin = body.position.xyz - float3(0.1, 0.1, 0.1);
    aabbMax = body.position.xyz + float3(0.1, 0.1, 0.1);

    int colliderType = int(body.colliderData.w);

    if (colliderType == 0) { // Sphere
        float radius = body.colliderData.x;
        float3 center = body.position.xyz + body.colliderData2.xyz;
        aabbMin = center - float3(radius, radius, radius);
        aabbMax = center + float3(radius, radius, radius);
    }
    else if (colliderType == 1) { // Box
        float3 halfExtents = body.colliderData.xyz;
        float3 center = body.position.xyz + body.colliderData2.xyz;
        // This is simplified - should account for rotation
        aabbMin = center - halfExtents;
        aabbMax = center + halfExtents;
    }
    else if (colliderType == 2) { // Mesh (represented as large bounding box)
        float3 halfExtents = body.colliderData.xyz;
        float3 center = body.position.xyz + body.colliderData2.xyz;
        // This is simplified - should account for rotation
        aabbMin = center - halfExtents;
        aabbMax = center + halfExtents;
    }
}

// Strict AABB overlap test (touching faces do not count as overlap).
bool aabbOverlap(float3 minA, float3 maxA, float3 minB, float3 maxB) {
    return all(minA < maxB) && all(minB < maxA);
}

// Broad phase collision detection - identifies potential collision pairs.
// One thread per unordered pair (i, j), i < j, mapped from the linear index.
[shader("compute")]
[numthreads(64, 1, 1)]
void BroadPhaseCS(uint3 dispatchThreadID : SV_DispatchThreadID) {
    uint index = dispatchThreadID.x;

    // Total number of unordered pairs (0 when numBodies <= 1).
    uint numPairs = (params.numBodies * (params.numBodies - 1)) / 2;
    if (index >= numPairs) {
        return;
    }

    // Convert linear index to pair indices (i, j) with i < j using pure
    // integer arithmetic (avoids floating-point precision issues).
    uint remaining = index;
    uint currentRow = 0;
    while (remaining >= (params.numBodies - 1 - currentRow)) {
        remaining -= (params.numBodies - 1 - currentRow);
        currentRow++;
    }
    uint i = currentRow;
    uint j = i + 1 + remaining;

    PhysicsData bodyA = physicsBuffer[i];
    PhysicsData bodyB = physicsBuffer[j];

    // Two kinematic bodies never generate contacts.
    if (bodyA.force.w > 0.5 && bodyB.force.w > 0.5) {
        return;
    }

    // Negative collider type means "no collider".
    if (bodyA.colliderData.w < 0 || bodyB.colliderData.w < 0) {
        return;
    }

    // Early culling: narrow phase only handles sphere-involved contacts,
    // so require at least one sphere (shape 0) in the pair.
    int shapeA = int(bodyA.colliderData.w);
    int shapeB = int(bodyB.colliderData.w);
    if (!(shapeA == 0 || shapeB == 0)) {
        return;
    }

    float3 minA, maxA, minB, maxB;
    computeAABB(bodyA, minA, maxA);
    computeAABB(bodyB, minB, maxB);

    // Expand sphere AABBs by per-step motion to catch fast-moving spheres.
    if (shapeA == 0) {
        float3 expandA = abs(bodyA.linearVelocity.xyz) * params.deltaTime;
        minA -= expandA; maxA += expandA;
    }
    if (shapeB == 0) {
        float3 expandB = abs(bodyB.linearVelocity.xyz) * params.deltaTime;
        minB -= expandB; maxB += expandB;
    }

    if (aabbOverlap(minA, maxA, minB, maxB)) {
        // Reserve a slot atomically; counter may exceed maxCollisions, the
        // consumers clamp against it.
        uint pairIndex;
        InterlockedAdd(counterBuffer[0], 1, pairIndex);
        if (pairIndex < params.maxCollisions) {
            pairBuffer[pairIndex] = uint2(i, j);
        }
    }
}

// Narrow phase collision detection - detailed collision detection for potential pairs.
// Handles sphere-sphere and sphere-mesh (box proxy) contacts only.
[shader("compute")]
[numthreads(64, 1, 1)]
void NarrowPhaseCS(uint3 dispatchThreadID : SV_DispatchThreadID) {
    uint index = dispatchThreadID.x;

    uint numPairs = counterBuffer[0];
    if (index >= numPairs || index >= params.maxCollisions) {
        return;
    }

    uint2 pair = pairBuffer[index];
    uint bodyIndexA = pair.x;
    uint bodyIndexB = pair.y;

    PhysicsData bodyA = physicsBuffer[bodyIndexA];
    PhysicsData bodyB = physicsBuffer[bodyIndexB];

    int shapeA = int(bodyA.colliderData.w);
    int shapeB = int(bodyB.colliderData.w);

    // Handle sphere-sphere collisions
    if (shapeA == 0 && shapeB == 0) {
        float radiusA = bodyA.colliderData.x;
        float radiusB = bodyB.colliderData.x;

        float3 posA = bodyA.position.xyz + bodyA.colliderData2.xyz;
        float3 posB = bodyB.position.xyz + bodyB.colliderData2.xyz;

        float3 direction = posB - posA;
        float distance = length(direction);
        float minDistance = radiusA + radiusB;

        if (distance < minDistance) {
            uint collisionIndex;
            InterlockedAdd(counterBuffer[1], 1, collisionIndex);

            if (collisionIndex < params.maxCollisions) {
                // Normal from A to B; guard against coincident centers.
                float3 normal = direction / max(distance, 0.0001);

                CollisionData collision;
                collision.bodyA = bodyIndexA;
                collision.bodyB = bodyIndexB;
                collision.contactNormal = float4(normal, minDistance - distance); // w = penetration depth
                collision.contactPoint = float4(posA + normal * radiusA, 0);
                collisionBuffer[collisionIndex] = collision;
            }
        }
    }
    // Handle sphere-geometry collisions (sphere vs mesh represented as box)
    else if ((shapeA == 0 && shapeB == 2) || (shapeA == 2 && shapeB == 0)) {
        // Determine which is sphere and which is geometry
        PhysicsData sphere = (shapeA == 0) ? bodyA : bodyB;
        PhysicsData geometry = (shapeA == 0) ? bodyB : bodyA;
        uint sphereIndex = (shapeA == 0) ? bodyIndexA : bodyIndexB;
        uint geometryIndex = (shapeA == 0) ? bodyIndexB : bodyIndexA;

        float sphereRadius = sphere.colliderData.x;
        float3 spherePos = sphere.position.xyz + sphere.colliderData2.xyz;
        float3 geometryPos = geometry.position.xyz + geometry.colliderData2.xyz;
        float3 geometryHalfExtents = geometry.colliderData.xyz;

        // Closest point on the (axis-aligned) box to the sphere center.
        float3 closestPoint = clamp(spherePos, geometryPos - geometryHalfExtents, geometryPos + geometryHalfExtents);
        float3 direction = spherePos - closestPoint;
        float distance = length(direction);

        if (distance < sphereRadius) {
            // Collision detected (overlap)
            uint collisionIndex;
            InterlockedAdd(counterBuffer[1], 1, collisionIndex);

            if (collisionIndex < params.maxCollisions) {
                // Normal points from sphere (A) to geometry (B).
                float3 normal = (distance > 0.0001) ? (-direction / distance) : float3(0, 1, 0);
                float penetration = sphereRadius - distance;

                CollisionData collision;
                collision.bodyA = sphereIndex;
                collision.bodyB = geometryIndex;
                collision.contactNormal = float4(normal, penetration);
                collision.contactPoint = float4(closestPoint, 0);
                collisionBuffer[collisionIndex] = collision;
            }
        } else {
            // Swept test (CCD-lite): segment from previous position to current
            // against the box expanded by the sphere radius (slab method).
            float3 prevPos = spherePos - sphere.linearVelocity.xyz * params.deltaTime;
            float3 dir = spherePos - prevPos;
            float dirLen = length(dir);
            if (dirLen > 1e-6) {
                float3 bbMin = geometryPos - (geometryHalfExtents + sphereRadius);
                float3 bbMax = geometryPos + (geometryHalfExtents + sphereRadius);

                // NOTE(review): raw division can produce inf/NaN when a dir
                // component is 0; min/max below tolerate inf as the original did.
                float3 t0 = (bbMin - prevPos) / dir;
                float3 t1 = (bbMax - prevPos) / dir;
                float3 tmin3 = min(t0, t1);
                float3 tmax3 = max(t0, t1);
                float tEnter = max(tmin3.x, max(tmin3.y, tmin3.z));
                float tExit = min(tmax3.x, min(tmax3.y, tmax3.z));

                if (tEnter >= 0.0 && tEnter <= 1.0 && tEnter <= tExit) {
                    // BUGFIX: pick the entry axis by equality with tEnter.
                    // The original test `tEnter >= tmin3.x && tEnter >= tmin3.y && tEnter >= tmin3.z`
                    // is always true (tEnter IS the max of tmin3), so the normal
                    // was always the X axis regardless of the actual entry face.
                    float3 normal;
                    if (tEnter == tmin3.x) {
                        normal = float3((dir.x > 0.0) ? 1.0 : -1.0, 0.0, 0.0);
                    } else if (tEnter == tmin3.y) {
                        normal = float3(0.0, (dir.y > 0.0) ? 1.0 : -1.0, 0.0);
                    } else {
                        normal = float3(0.0, 0.0, (dir.z > 0.0) ? 1.0 : -1.0);
                    }

                    float3 hitPoint = prevPos + dir * tEnter;

                    uint collisionIndex;
                    InterlockedAdd(counterBuffer[1], 1, collisionIndex);
                    if (collisionIndex < params.maxCollisions) {
                        CollisionData collision;
                        collision.bodyA = sphereIndex;
                        collision.bodyB = geometryIndex;
                        // Tiny penetration to trigger resolution without large positional correction
                        collision.contactNormal = float4(normalize(normal), 0.0);
                        collision.contactPoint = float4(hitPoint, 0.0);
                        collisionBuffer[collisionIndex] = collision;
                    }
                }
            }
        }
    }
}

// Collision resolution - impulse-based response plus Baumgarte-style positional
// correction. NOTE(review): multiple threads may write the same body when it
// appears in several contacts in one dispatch; last write wins (as originally).
[shader("compute")]
[numthreads(64, 1, 1)]
void ResolveCS(uint3 dispatchThreadID : SV_DispatchThreadID) {
    uint index = dispatchThreadID.x;

    uint numCollisions = counterBuffer[1];
    if (index >= numCollisions || index >= params.maxCollisions) {
        return;
    }

    CollisionData collision = collisionBuffer[index];

    PhysicsData bodyA = physicsBuffer[collision.bodyA];
    PhysicsData bodyB = physicsBuffer[collision.bodyB];

    // Two kinematic bodies: nothing to resolve.
    if (bodyA.force.w > 0.5 && bodyB.force.w > 0.5) {
        return;
    }

    // Robustness: two static bodies (invMass == 0) would divide by zero below.
    float invMassSum = bodyA.position.w + bodyB.position.w;
    if (invMassSum <= 0.0) {
        return;
    }

    // Relative velocity along the contact normal (A -> B).
    float3 relativeVelocity = bodyB.linearVelocity.xyz - bodyA.linearVelocity.xyz;
    float velocityAlongNormal = dot(relativeVelocity, collision.contactNormal.xyz);

    // Don't resolve if velocities are separating
    if (velocityAlongNormal > 0) {
        return;
    }

    // Restitution (bounciness) of the pair = min of both bodies'.
    float restitution = min(bodyA.linearVelocity.w, bodyB.linearVelocity.w);

    // Impulse scalar: j = -(1 + e) * vRel·n / (invMassA + invMassB)
    float j = -(1.0 + restitution) * velocityAlongNormal / invMassSum;
    float3 impulse = collision.contactNormal.xyz * j;

    // Positional correction to prevent sinking (Baumgarte with slop).
    const float percent = 0.2; // usually 20% to 80%
    const float slop = 0.01;   // small penetration allowed
    float3 correction = max(collision.contactNormal.w - slop, 0.0) * percent * collision.contactNormal.xyz / invMassSum;

    // Single write per body (the original wrote each body twice).
    if (bodyA.force.w < 0.5) { // not kinematic
        bodyA.linearVelocity.xyz -= impulse * bodyA.position.w;
        bodyA.position.xyz -= correction * bodyA.position.w;
        physicsBuffer[collision.bodyA] = bodyA;
    }

    if (bodyB.force.w < 0.5) { // not kinematic
        bodyB.linearVelocity.xyz += impulse * bodyB.position.w;
        bodyB.position.xyz += correction * bodyB.position.w;
        physicsBuffer[collision.bodyB] = bodyB;
    }
}
diff --git a/attachments/openxr_engine/shaders/ray_query.slang b/attachments/openxr_engine/shaders/ray_query.slang
new file mode 100644
index 00000000..ad09434c
--- /dev/null
+++ b/attachments/openxr_engine/shaders/ray_query.slang
/* Copyright (c) 2025 Holochip Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 the "License";
 * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// Ray query compute shader for ray-traced rendering +// Uses Slang's ray query extension as an alternative to rasterization + +import common_types; +import pbr_utils; +import lighting_utils; +import tonemapping_utils; + +// C++ Vertex structure layout (tightly packed, 48 bytes total): +// - position: vec3 at offset 0 (12 bytes) +// - normal: vec3 at offset 12 (12 bytes) +// - texCoord: vec2 at offset 24 (8 bytes) +// - tangent: vec4 at offset 32 (16 bytes) +// We'll read normals directly as floats to avoid PhysicalStorageBuffer alignment issues + +// Ray Query uses a dedicated uniform layout to avoid CPU↔shader drift. +// IMPORTANT: This must match `RayQueryUniformBufferObject` in `renderer.h`. 
struct RayQueryUniforms {
    [[vk::offset(0)]]   float4x4 model;
    [[vk::offset(64)]]  float4x4 view;
    [[vk::offset(128)]] float4x4 proj;
    [[vk::offset(192)]] float4 camPos;

    [[vk::offset(208)]] float exposure;
    [[vk::offset(212)]] float gamma;
    [[vk::offset(216)]] float scaleIBLAmbient;
    [[vk::offset(220)]] int lightCount;
    [[vk::offset(224)]] int enableRayQueryReflections;
    [[vk::offset(228)]] int enableRayQueryTransparency;

    [[vk::offset(232)]] float2 screenDimensions;
    [[vk::offset(240)]] int geometryInfoCount;
    [[vk::offset(244)]] int materialCount;
    // NOTE(review): despite the name, _pad0 is read by shadeWithSecondaryRays
    // as the max secondary-bounce count — confirm against renderer.h.
    [[vk::offset(248)]] int _pad0;
    [[vk::offset(252)]] int enableThickGlass;   // 0/1 toggle for thick-glass attenuation
    [[vk::offset(256)]] float thicknessClamp;   // max thickness in meters (safety clamp)
    [[vk::offset(260)]] float absorptionScale;  // scales sigma_a (1=as-is)
    // Ray Query shadows: 0/1 enable (wired from C++ as `enableRayQueryShadows`)
    [[vk::offset(264)]] int _pad1;
    // Ray Query soft shadows (area-light approximation)
    [[vk::offset(268)]] int shadowSampleCount;     // 1 = hard shadows (single shadow ray)
    [[vk::offset(272)]] float shadowSoftness;      // 0 = hard; otherwise scales effective light radius (see shader)
    [[vk::offset(276)]] float reflectionIntensity; // user control
    [[vk::offset(280)]] float _padShadow1;
    [[vk::offset(284)]] float _padShadow2;
};

// Compute shader descriptor bindings.
// Element types restored from usage in this file: geometryInfoBuffer[i] yields
// GeometryInfo, materialBuffer[i] yields MaterialData, outputImage stores float4.
[[vk::binding(0, 0)]] ConstantBuffer<RayQueryUniforms> ubo;
[[vk::binding(1, 0)]] RaytracingAccelerationStructure tlas; // Top-level acceleration structure
[[vk::binding(2, 0)]] RWTexture2D<float4> outputImage;      // Output render target
// NOTE(review): element type not visible in this chunk — assumed `Light` from
// lighting_utils; confirm against the C++ descriptor layout.
[[vk::binding(3, 0)]] StructuredBuffer<Light> lightBuffer;
[[vk::binding(4, 0)]] StructuredBuffer<GeometryInfo> geometryInfoBuffer; // Geometry vertex/index addresses
[[vk::binding(5, 0)]] StructuredBuffer<MaterialData> materialBuffer;     // Material properties
// Fixed-size Ray Query texture table (combined image samplers)
// Must match Renderer::RQ_MAX_TEX in C++ (currently 2048)
static const uint RQ_MAX_TEX = 2048;
[[vk::binding(6, 0)]] Sampler2D baseColorTex[RQ_MAX_TEX];

// (No debug buffer in production layout)
// (No debug toggles in production)
// NOTE: Ray Query debugPrintf/printf diagnostics are intentionally removed.
// Keep Ray Query VVL-clean and log-noise-free by default.

// Deterministic pseudo-random debug color derived from a material index
// (golden-ratio-style fractional hashing).
float3 materialDebugColor(uint materialIndex) {
    float u = float(materialIndex + 1);
    float3 h = frac(float3(0.06711056, 0.00583715, 0.2065) * u);
    return frac(h * 2.6180339);
}

// Result of a primary/secondary ray trace with resolved material data.
struct HitInfo {
    bool hit;
    float t;
    float3 worldPos;
    float3 normal;
    float3 baseColor;   // base color (factor * texture), not lit
    float3 color;
    float3 F0;
    float roughness;
    float metallic;
    float transmission; // Added for glass/transparency support
    float ior;
    bool isGlass;       // Material is glass (should always have reflections/transparency)
    bool isLiquid;
    bool thinWalled;    // true = thin surface (no thickness), false = thick volume
    int alphaMode;      // 0=OPAQUE, 1=MASK, 2=BLEND
    float opacity;      // 0..1, derived from baseColor alpha (factor * texture)
    uint instanceId;    // Committed instance ID (for debug coloring)
    float2 uv;          // Interpolated UV at the hit (glTF V flip already applied)
    uint texIndex;      // Resolved baseColor texture index used for sampling (for stats/debug)
    uint materialIndex; // Resolved/clamped material index used
};

// Self-intersection bias for secondary ray origins (world units).
static const float RQ_RAY_EPS = 0.002;

// Thomas Wang 32-bit integer hash — seeds/advances the per-ray RNG stream.
uint rqHash(uint v)
{
    v = (v ^ 61u) ^ (v >> 16u);
    v *= 9u;
    v = v ^ (v >> 4u);
    v *= 0x27d4eb2du;
    v = v ^ (v >> 15u);
    return v;
}

// Advance the RNG state and map the low 24 bits to [0, 1).
float rqRand01(inout uint state)
{
    state = rqHash(state);
    // 24-bit mantissa-ish
    return float(state & 0x00FFFFFFu) * (1.0 / 16777216.0);
}

// Uniformly sample a point on the unit disk (r = sqrt(u1), theta = 2*pi*u2).
float2 rqSampleDisk(inout uint state)
{
    float u1 = rqRand01(state);
    float u2 = rqRand01(state);
    float r = sqrt(max(u1, 0.0));
    float a = 6.28318530718 * u2;
    return float2(cos(a), sin(a)) * r;
}
// Simple procedural sky: vertical gradient from dark horizon to lighter zenith.
float3 skyColor(float3 dir) {
    float t = saturate(0.5 * (dir.y + 1.0));
    float3 a = float3(0.06, 0.06, 0.09);
    float3 b = float3(0.20, 0.25, 0.32);
    return lerp(a, b, t);
}

// Heuristic explicit-LOD for compute sampling (no implicit derivatives available).
// LOD grows with camera distance and roughness.
float computeTextureLOD(float3 worldPos, float roughness) {
    // Approximate screen-space footprint from view distance and modulate by roughness
    float d = length(ubo.camPos.xyz - worldPos);
    // Tune 0.25 scale empirically for this scene; avoids over-sharp minification
    float lod = log2(max(d * 0.25, 1.0));
    lod *= lerp(0.6, 1.4, saturate(roughness));
    return max(lod, 0.0);
}

// Shade a primary hit with secondary rays: reflection chains, alpha-BLEND
// compositing, and transmission/refraction (with optional thick-glass absorption).
// `hit.color` is assumed to be the already-lit surface color; this routine mixes
// in reflected/transmitted radiance. Returns final linear radiance for the pixel.
float3 shadeWithSecondaryRays(float3 rayOrigin, float3 rayDir, HitInfo hit, inout uint rngState) {
    float3 base = max(hit.color, 0.0);
    // NOTE(review): bounce budget is wired through the `_pad0` uniform — presumably
    // repurposed padding; confirm against RayQueryUniformBufferObject in renderer.h.
    int maxBounces = clamp(ubo._pad0, 0, 10);

    float3 N = hit.normal;
    float3 V = normalize(-rayDir);
    float NdotV = abs(dot(N, V));

    // Fresnel for reflection weighting; dielectric form for transmissive/glass.
    float3 F = FresnelSchlick(NdotV, hit.F0);
    if (hit.transmission > 0.01 || hit.isGlass) {
        F = Fresnel_Dielectric(NdotV, hit.ior);
    }
    float Fr = saturate((F.r + F.g + F.b) * (1.0 / 3.0));

    // Transmission gate: BLEND alpha contributes (1 - opacity); authored
    // KHR_materials_transmission contributes `transmission`.
    float opacity = clamp(hit.opacity, 0.0, 1.0);
    float blendTransmission = (hit.alphaMode == 2) ? (1.0 - opacity) : 0.0;
    float physicalTransmission = clamp(hit.transmission, 0.0, 1.0);

    // Many scenes tag architectural glass via a material hint (`hit.isGlass`) even when
    // `KHR_materials_transmission` is not authored. Provide a robust fallback so glass
    // does not turn black and so interior lighting remains visible through windows.
    float glassTransmission = 0.0;
    if (hit.isGlass) {
        // Only apply the strong "glass fallback" when the surface actually looks like a
        // pane (coverage/opacity < 1); avoids pushing opaque surfaces heuristically
        // tagged as glass into the refraction path (view-dependent banding).
        glassTransmission = max(physicalTransmission, (1.0 - opacity));
        if (glassTransmission < 0.01 && opacity < 0.98) {
            glassTransmission = 0.90;
        }
    }

    // Alpha BLEND (decals/foliage) should work regardless of the "transparency/refraction"
    // toggle — that toggle is for true transmission/glass, not basic alpha compositing.
    float T = blendTransmission;
    if (ubo.enableRayQueryTransparency != 0 && !hit.isLiquid) {
        T = max(T, max(physicalTransmission, glassTransmission));
    }

    // --- Reflection ray (chained mirror bounces up to maxBounces) ---
    float3 reflCol = float3(0.0, 0.0, 0.0);
    bool doRefl = (ubo.enableRayQueryReflections != 0) && (Fr > 1e-4);
    if (doRefl) {
        float3 ro = hit.worldPos + hit.normal * RQ_RAY_EPS;
        float3 rd = normalize(reflect(rayDir, N));

        // `last` holds the radiance of the final bounce (miss => sky).
        float3 last = skyColor(rd);
        for (int b = 0; b < maxBounces; ++b) {
            HitInfo rh = traceRay(ro, rd, RQ_RAY_EPS, 10000.0, rngState);
            last = rh.hit ? max(rh.color, 0.0) : skyColor(rd);
            if (!rh.hit) {
                break;
            }
            // Next bounce
            float3 Nr = normalize(rh.normal);
            rd = normalize(reflect(rd, Nr));
            ro = rh.worldPos + Nr * RQ_RAY_EPS;
        }
        reflCol = last * ubo.reflectionIntensity;
    }

    // --- Alpha BLEND coverage compositing (iterative; no recursion in SPIR-V) ---
    // Composite this layer over whatever lies behind it along the same ray.
    if (hit.alphaMode == 2 && physicalTransmission <= 1e-4 && !hit.isGlass) {
        float3 accum = float3(0.0, 0.0, 0.0);
        float trans = 1.0; // remaining transmittance of the layers composited so far
        HitInfo cur = hit;

        // Track the last layer to detect coplanar self-hits.
        uint lastInst = cur.instanceId;
        uint lastMat = cur.materialIndex;
        uint lastPrim = 0u; // unused — kept for parity with original

        [loop]
        for (int layer = 0; layer < 8; ++layer) {
            float a = clamp(cur.opacity, 0.0, 1.0);
            accum += max(cur.color, 0.0) * (a * trans);
            trans *= (1.0 - a);
            if (trans < 1e-3) {
                break;
            }

            // Step behind the current BLEND layer. Coplanar decals can self-hit even
            // with a small bias, which would repeatedly composite the same layer.
            float3 startPos = cur.worldPos + cur.normal * (4.0 * RQ_RAY_EPS) + rayDir * (4.0 * RQ_RAY_EPS);
            HitInfo next = traceRay(startPos, rayDir, RQ_RAY_EPS, 10000.0, rngState);

            // If we re-hit the same instance/material, push further along the ray and retry.
            [loop]
            for (int retry = 0; retry < 4 && next.hit; ++retry) {
                if (next.instanceId != lastInst || next.materialIndex != lastMat) {
                    break;
                }
                startPos += rayDir * (16.0 * RQ_RAY_EPS);
                next = traceRay(startPos, rayDir, RQ_RAY_EPS, 10000.0, rngState);
            }
            if (!next.hit) {
                accum += skyColor(rayDir) * trans;
                break;
            }

            // Stop stacking when we reach a non-BLEND surface (or true transmission/glass).
            if (next.alphaMode != 2 || next.isGlass || next.transmission > 0.01) {
                accum += max(next.color, 0.0) * trans;
                break;
            }

            lastInst = next.instanceId;
            lastMat = next.materialIndex;
            cur = next;
        }
        return accum;
    }

    // --- Transmission ray (iterative loop to support multiple layers) ---
    float3 thruCol = skyColor(rayDir);
    bool doThru = (T > 1e-4);
    if (doThru) {
        float3 curRd = rayDir;
        float3 curRo = hit.worldPos;
        float3 accumTint = float3(1.0); // multiplicative tint/absorption accumulated through layers
        HitInfo curHit = hit;

        for (int bounce = 0; bounce < maxBounces + 1; ++bounce) {
            // Recompute the effective transmission of the current layer
            // (same policy as the gate above).
            float curPhysT = clamp(curHit.transmission, 0.0, 1.0);
            float curOpacity = clamp(curHit.opacity, 0.0, 1.0);
            float curGlassT = curHit.isGlass ? max(curPhysT, 1.0 - curOpacity) : 0.0;
            if (curHit.isGlass && curGlassT < 0.01 && curOpacity < 0.98) curGlassT = 0.9;
            float curT = max(curPhysT, max((curHit.alphaMode == 2 ? 1.0 - curOpacity : 0.0), curGlassT));

            if (curT < 1e-4) {
                // Opaque layer terminates the chain.
                thruCol = accumTint * curHit.color;
                break;
            }

            float3 nextRd = curRd;
            bool refracts = (curPhysT > 1e-4 || (curHit.isGlass && curOpacity < 0.98));
            if (refracts) {
                float3 Nn = curHit.normal;
                float eta = 1.0 / max(curHit.ior, 1.0);
                // Entering vs exiting decided by ray direction vs geometric normal.
                if (dot(curRd, curHit.normal) > 0.0) {
                    Nn = -curHit.normal;
                    eta = max(curHit.ior, 1.0);
                }
                float3 refrDir;
                if (refract(curRd, Nn, eta, refrDir)) {
                    nextRd = normalize(refrDir);
                } else {
                    // Total internal reflection
                    thruCol = float3(0, 0, 0);
                    break;
                }
            }

            // Accumulate tint for refractive surfaces (floored so tint never goes black).
            if (refracts) {
                float3 tint = max(clamp(curHit.baseColor, 0.0, 1.0), float3(0.5, 0.5, 0.5));
                accumTint *= tint;
            }

            // Trace from just inside the entry point to reduce self-hits.
            float3 startPos = curHit.worldPos + nextRd * (4.0 * RQ_RAY_EPS);
            // For coplanar BLEND decals, stepping only along the ray direction can still
            // re-hit the same decal layer due to precision; push along the surface normal
            // as well so the next trace lands on the underlying wall.
            if (curHit.alphaMode == 2) {
                startPos = curHit.worldPos + curHit.normal * (4.0 * RQ_RAY_EPS) + nextRd * (4.0 * RQ_RAY_EPS);
            }

            // Volumetric absorption for THICK glass (skip for thin-walled):
            // find the exit point on the same surface and apply Beer-Lambert falloff.
            if (ubo.enableThickGlass != 0 && !curHit.thinWalled && refracts) {
                HitInfo exitHit = traceRay(startPos, nextRd, (4.0 * RQ_RAY_EPS), 10000.0, rngState);
                bool haveExitSameSurface = exitHit.hit &&
                    (exitHit.instanceId == curHit.instanceId) &&
                    (exitHit.materialIndex == curHit.materialIndex);
                if (haveExitSameSurface) {
                    float thickness = min(distance(exitHit.worldPos, curHit.worldPos), max(0.0, ubo.thicknessClamp));
                    if (thickness > 1e-6) {
                        // sigma_a derived from authored absorption color/distance.
                        uint mi = min(curHit.materialIndex, (uint)max(0, ubo.materialCount-1));
                        MaterialData m = materialBuffer[mi];
                        float3 C = saturate(m.absorptionColor);
                        float D = max(m.absorptionDistance, 1e-4);
                        float3 sigma_a = -log(max(C, 1e-3)) / D;
                        sigma_a *= max(ubo.absorptionScale, 0.0);
                        accumTint *= saturate(exp(-sigma_a * thickness));
                    }
                    startPos = exitHit.worldPos + nextRd * (4.0 * RQ_RAY_EPS);
                }
            }

            // Trace the scene beyond this layer
            HitInfo nextHit = traceRay(startPos, nextRd, RQ_RAY_EPS, 10000.0, rngState);
            if (!nextHit.hit) {
                thruCol = accumTint * skyColor(nextRd);
                break;
            }

            // Assume opaque for now; check if we should continue looping
            thruCol = accumTint * nextHit.color;

            float nPhysT = clamp(nextHit.transmission, 0.0, 1.0);
            float nOpacity = clamp(nextHit.opacity, 0.0, 1.0);
            float nGlassT = nextHit.isGlass ? max(nPhysT, 1.0 - nOpacity) : 0.0;
            if (nextHit.isGlass && nGlassT < 0.01) nGlassT = 0.9;
            float nT = max(nPhysT, max((nextHit.alphaMode == 2 ? 1.0 - nOpacity : 0.0), nGlassT));

            // Stop if the next hit is opaque or we reached max bounces
            if (nT < 1e-4 || bounce == maxBounces) {
                break;
            }

            // Refractive radiance compensation (mitigate amplification on entry).
            if (refracts) {
                bool entering = (dot(curRd, curHit.normal) < 0.0);
                float eta_ratio = entering ? (1.0 / max(curHit.ior, 1.0)) : max(curHit.ior, 1.0);
                if (entering) {
                    float invEta2 = 1.0 / max(eta_ratio * eta_ratio, 1e-4);
                    accumTint *= clamp(invEta2, 0.0, 1.0);
                }
            }

            // Move to the next transmissive layer
            curHit = nextHit;
            curRd = nextRd;
        }
    }

    if (T > 1e-4) {
        // Transmissive/glass: energy-conserving mix using Fresnel reflectance Fr and
        // transmission factor T (authored/heuristic). Transmission weight Ft = (1-Fr)*T.
        float Fr_s = saturate(Fr);
        float Ft = saturate((1.0 - Fr_s) * T);
        float sum = Fr_s + Ft;
        if (sum > 1.0) {
            // Normalize to avoid accidental gain (should rarely trigger)
            Fr_s /= sum;
            Ft /= sum;
        }

        float3 mixed = Fr_s * reflCol + Ft * thruCol;

        // Glass/transmission path: return the energy-conserving mixture directly,
        // plus direct surface highlights (specular + emissive) from the glass surface.
        return mixed + base;
    }

    // Opaque: add a controlled reflection contribution (avoids double-counting too much)
    float reflWeight = doRefl ? (Fr * (1.0 - clamp(hit.roughness, 0.0, 1.0))) : 0.0;
    return lerp(base, reflCol, reflWeight);
}

// Interpolate the triangle's UV at the given barycentrics by reading the raw
// vertex/index buffers via buffer-device-address pointers. Out-of-range inputs
// return (0, 0). Applies the glTF V flip.
float2 computeTriangleUV(uint instIndex, uint primitiveIndex, float2 bary) {
    float3 barycentrics = float3(1.0 - bary.x - bary.y, bary.x, bary.y);
    if (instIndex >= uint(max(0, ubo.geometryInfoCount))) return float2(0.0, 0.0);
    GeometryInfo geoInfo = geometryInfoBuffer[instIndex];
    if (geoInfo.vertexBufferAddress == 0 || geoInfo.indexBufferAddress == 0) return float2(0.0, 0.0);

    uint triCount = geoInfo.indexCount / 3u;
    if (primitiveIndex >= triCount) return float2(0.0, 0.0);

    uint* indexBuffer = (uint*)geoInfo.indexBufferAddress;
    float* vertexBuffer = (float*)geoInfo.vertexBufferAddress;
    uint idxBase = primitiveIndex * 3u;
    uint i0 = indexBuffer[idxBase + 0u];
    uint i1 = indexBuffer[idxBase + 1u];
    uint i2 = indexBuffer[idxBase + 2u];
    if (i0 >= geoInfo.vertexCount || i1 >= geoInfo.vertexCount || i2 >= geoInfo.vertexCount) return float2(0.0, 0.0);

    // 12 floats per vertex = 48-byte Vertex (pos3 + normal3 + uv2 + tangent4);
    // UV lives at float offsets 6..7.
    uint vertexStride = 12; // floats
    float2 uv0 = float2(vertexBuffer[i0 * vertexStride + 6], vertexBuffer[i0 * vertexStride + 7]);
    float2 uv1 = float2(vertexBuffer[i1 * vertexStride + 6], vertexBuffer[i1 * vertexStride + 7]);
    float2 uv2 = float2(vertexBuffer[i2 * vertexStride + 6], vertexBuffer[i2 * vertexStride + 7]);
    float2 uv = uv0 * barycentrics.x + uv1 * barycentrics.y + uv2 * barycentrics.z;
    uv.y = 1.0 - uv.y; // flip V for glTF
    return uv;
}

float computeBaseColorAlpha(MaterialData material, uint instIndex, uint primitiveIndex, float2 bary) {
    // In the Ray Query pipeline, `baseColorTexIndex` is always valid (real texture or a default slot).
    // Some assets rely on alpha in the baseColor texture even when CPU-side `baseColorTextureSet`
    // can be incorrect/stale. For alpha-sensitive materials, sample via the texture-table index.
+ float alpha = material.alpha; + bool alphaSensitive = (material.alphaMode != 0) || (material.alpha < 0.999); + if (material.baseColorTextureSet >= 0 || alphaSensitive) { + float2 uv = computeTriangleUV(instIndex, primitiveIndex, bary); + uint tiBase = (uint)min(max(material.baseColorTexIndex, 0), int(RQ_MAX_TEX - 1)); + float4 baseColor = baseColorTex[NonUniformResourceIndex(tiBase)].SampleLevel(uv, 0.0); + baseColor *= float4(material.albedo, material.alpha); + alpha = baseColor.a; + } + return alpha; +} + +// Shadow query: returns true when there is an occluder between origin and tMax along direction. +// - MASK materials are alpha-tested against alphaCutoff. +// - BLEND materials are treated as non-occluding for shadows. +bool traceShadowOccluded(float3 origin, float3 direction, float tMin, float tMax) { + RayDesc ray; + ray.Origin = origin; + ray.Direction = direction; + ray.TMin = tMin; + ray.TMax = tMax; + + RayQuery q; + uint mask = 0xFF; + // Force non-opaque so we can decide in-shader whether a candidate occludes. + // This is required so transmissive / glass materials do not fully block shadow rays. + q.TraceRayInline( + tlas, + RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH | RAY_FLAG_FORCE_NON_OPAQUE, + mask, + ray + ); + + int maxIterations = 1000; + int iteration = 0; + while (q.Proceed() && iteration < maxIterations) { + iteration++; + + if (q.CandidateType() == CANDIDATE_NON_OPAQUE_TRIANGLE) { + uint instIndex = q.CandidateInstanceID(); + if (instIndex < uint(max(0, ubo.geometryInfoCount))) { + GeometryInfo geoInfoC = geometryInfoBuffer[instIndex]; + uint materialIndexC = 0u; + if (ubo.materialCount > 0) { + materialIndexC = min(geoInfoC.materialIndex, (uint)(ubo.materialCount - 1)); + } + MaterialData matC = materialBuffer[materialIndexC]; + + bool accept = true; + + // Treat transmissive surfaces as non-occluding for shadows. + // NOTE: This includes "opaque-but-glass" materials (alphaMode==OPAQUE with isGlass hint). 
+ bool transmissive = (matC.isGlass != 0) || (matC.transmissionFactor > 0.01); + if (transmissive || matC.alphaMode == 2) { + accept = false; + } else if (matC.alphaMode == 1) { + float alpha = computeBaseColorAlpha(matC, instIndex, q.CandidatePrimitiveIndex(), q.CandidateTriangleBarycentrics()); + accept = (alpha >= matC.alphaCutoff); + } + + if (accept) { + q.CommitNonOpaqueTriangleHit(); + break; + } + } + } + } + + return (q.CommittedStatus() == COMMITTED_TRIANGLE_HIT); +} + +// Calculate refraction direction using Snell's law +// Returns true if refraction occurs, false if total internal reflection +bool refract(float3 I, float3 N, float eta, out float3 refracted) { + float cosi = -dot(N, I); + float cost2 = 1.0 - eta * eta * (1.0 - cosi * cosi); + if (cost2 > 0.0) { + refracted = eta * I + (eta * cosi - sqrt(cost2)) * N; + return true; + } + refracted = float3(0, 0, 0); // Initialize even on failure + return false; // Total internal reflection +} + +// Perform ray query and return hit information with proper vertex normals and material properties +HitInfo traceRay(float3 origin, float3 direction, float tMin, float tMax, inout uint rngState) { + HitInfo result; + result.hit = false; + result.t = tMax; + result.uv = float2(0.0, 0.0); + result.texIndex = 0u; + result.materialIndex = 0u; + result.baseColor = float3(1.0, 1.0, 1.0); + result.color = float3(0.0, 0.0, 0.0); + result.alphaMode = 0; + result.opacity = 1.0; + + // Create ray query object + RayDesc ray; + ray.Origin = origin; + ray.Direction = direction; + ray.TMin = tMin; + ray.TMax = tMax; + + // Initialize ray query. + // We do NOT force opaque because we need to alpha-test MASK materials in-shader + // (RayQuery inline traversal has no any-hit shader). 
+ RayQuery q; + uint primaryMask = 0xFF; + q.TraceRayInline( + tlas, + RAY_FLAG_NONE, + primaryMask, + ray + ); + + // Process ray query - loop until Proceed() returns false + // For opaque geometry, this finds the closest hit automatically + // Add safety limit to prevent infinite loops + int maxIterations = 1000; + int iteration = 0; + while (q.Proceed() && iteration < maxIterations) { + iteration++; + + // Alpha-mask handling: emulate any-hit by inspecting candidate triangle alpha + // and committing only when the candidate passes alpha test. + // Slang/HLSL ray query candidate type for triangle intersections. + if (q.CandidateType() == CANDIDATE_NON_OPAQUE_TRIANGLE) { + uint instIndex = q.CandidateInstanceID(); + if (instIndex < uint(max(0, ubo.geometryInfoCount))) { + GeometryInfo geoInfoC = geometryInfoBuffer[instIndex]; + uint materialIndexC = 0u; + if (ubo.materialCount > 0) { + materialIndexC = min(geoInfoC.materialIndex, (uint)(ubo.materialCount - 1)); + } + MaterialData matC = materialBuffer[materialIndexC]; + + float alpha = computeBaseColorAlpha(matC, instIndex, q.CandidatePrimitiveIndex(), q.CandidateTriangleBarycentrics()); + + bool accept = true; + if (matC.alphaMode == 1) { + // MASK: classic cutout + accept = (alpha >= matC.alphaCutoff); + } else if (matC.alphaMode == 2) { + // BLEND: do not commit fully-transparent pixels. + // Committing alpha==0 hits can make opaque geometry behind decals disappear. + // Use a small threshold to avoid precision noise. + accept = (alpha > 0.01); + } + + if (accept) { + q.CommitNonOpaqueTriangleHit(); + } + } + } + } + + // Check if we hit anything + if (q.CommittedStatus() == COMMITTED_TRIANGLE_HIT) { + result.hit = true; + result.t = q.CommittedRayT(); + result.worldPos = origin + direction * result.t; + // Use CommittedInstanceID() which returns the instance custom index we set on CPU + // (our per-instance sequential index that matches geometryInfoBuffer order). 
+ uint instIndex = q.CommittedInstanceID(); + result.instanceId = instIndex; + + // Get barycentric coordinates + float2 bary = q.CommittedTriangleBarycentrics(); + float3 barycentrics = float3(1.0 - bary.x - bary.y, bary.x, bary.y); + + // PROPER GEOMETRY DATA FETCHING WITH SAFETY CHECKS + // Per-instance geometry info index equals the instance custom index we assigned on CPU. + uint blasIndex = instIndex; + uint primitiveIndex = q.CommittedPrimitiveIndex(); + + // Validate instance index is in bounds of geometry info buffer + if (blasIndex >= uint(max(0, ubo.geometryInfoCount))) { + // Invalid BLAS index, use default values + result.normal = float3(0, 1, 0); + result.baseColor = float3(0.8, 0.8, 0.8); + result.color = float3(0.8, 0.8, 0.8); + result.metallic = 0.0; + result.roughness = 0.5; + result.transmission = 0.0; + result.isGlass = false; + result.alphaMode = 0; + result.opacity = 1.0; + result.thinWalled = true; + return result; + } + + // Get geometry info for this BLAS (unique mesh) + GeometryInfo geoInfo = geometryInfoBuffer[blasIndex]; + + // Fetch material first so that even if geometry fetch fails we can still show a material color + // Material property fetch with bounds clamp + // Clamp material index to valid range to prevent out-of-bounds access + uint materialIndex = 0u; + if (ubo.materialCount > 0) { + materialIndex = min(geoInfo.materialIndex, (uint) (ubo.materialCount - 1)); + } + result.materialIndex = materialIndex; + + MaterialData material = materialBuffer[materialIndex]; + + // Validate buffer addresses are non-zero (geometry may still be streaming) + if (geoInfo.vertexBufferAddress == 0 || geoInfo.indexBufferAddress == 0) { + // Geometry not ready: show a stable material-derived color so the frame isn't flat gray + result.normal = float3(0, 1, 0); + float3 albedoBase = float3(material.albedo); + result.baseColor = albedoBase; + float u0 = float(materialIndex + 1); + float3 h0 = frac(float3(0.06711056, 0.00583715, 0.2065) * u0); + float3 
hashColor0 = frac(h0 * 2.6180339); + result.color = saturate(0.7 * hashColor0 + 0.3 * albedoBase); + result.metallic = material.metallic; + result.roughness = material.roughness; + result.transmission = material.transmissionFactor; + result.isGlass = (material.isGlass != 0); + result.alphaMode = material.alphaMode; + result.opacity = clamp(material.alpha, 0.0, 1.0); + result.thinWalled = (material.thinWalled != 0); + return result; + } + + // Cast device addresses to typed pointers for index buffer + uint* indexBuffer = (uint*)geoInfo.indexBufferAddress; + float* vertexBuffer = (float*)geoInfo.vertexBufferAddress; + + // Validate primitive index is within range of available triangles + uint triCount = geoInfo.indexCount / 3u; + if (primitiveIndex >= triCount) { + // Out of bounds primitive; show material-derived color + result.normal = float3(0, 1, 0); + float3 albedoBase = float3(material.albedo); + result.baseColor = albedoBase; + float u1 = float(materialIndex + 1); + float3 h1 = frac(float3(0.06711056, 0.00583715, 0.2065) * u1); + float3 hashColor1 = frac(h1 * 2.6180339); + result.color = saturate(0.7 * hashColor1 + 0.3 * albedoBase); + result.metallic = material.metallic; + result.roughness = material.roughness; + result.transmission = material.transmissionFactor; + result.isGlass = (material.isGlass != 0); + result.alphaMode = material.alphaMode; + result.opacity = clamp(material.alpha, 0.0, 1.0); + result.thinWalled = (material.thinWalled != 0); + return result; + } + + // Fetch triangle indices + uint idxBase = primitiveIndex * 3u; + uint i0 = indexBuffer[idxBase + 0u]; + uint i1 = indexBuffer[idxBase + 1u]; + uint i2 = indexBuffer[idxBase + 2u]; + + if (i0 >= geoInfo.vertexCount || i1 >= geoInfo.vertexCount || i2 >= geoInfo.vertexCount) { + // Out of bounds vertex indices - still present material-derived color + result.normal = float3(0, 1, 0); + float3 albedoBase = float3(material.albedo); + result.baseColor = albedoBase; + float u2 = float(materialIndex 
+ 1); + float3 h2 = frac(float3(0.06711056, 0.00583715, 0.2065) * u2); + float3 hashColor2 = frac(h2 * 2.6180339); + result.color = saturate(0.7 * hashColor2 + 0.3 * albedoBase); + result.metallic = material.metallic; + result.roughness = material.roughness; + result.transmission = material.transmissionFactor; + result.isGlass = (material.isGlass != 0); + result.alphaMode = material.alphaMode; + result.opacity = clamp(material.alpha, 0.0, 1.0); + result.thinWalled = (material.thinWalled != 0); + return result; + } + + // Vertex layout: pos(3) + normal(3) + texCoord(2) + tangent(4) = 12 floats per vertex + uint vertexStride = 12; // floats per vertex + + // Read object-space normals directly (offset 3 floats for position, then 3 floats for normal) + float3 n0 = float3(vertexBuffer[i0 * vertexStride + 3], + vertexBuffer[i0 * vertexStride + 4], + vertexBuffer[i0 * vertexStride + 5]); + float3 n1 = float3(vertexBuffer[i1 * vertexStride + 3], + vertexBuffer[i1 * vertexStride + 4], + vertexBuffer[i1 * vertexStride + 5]); + float3 n2 = float3(vertexBuffer[i2 * vertexStride + 3], + vertexBuffer[i2 * vertexStride + 4], + vertexBuffer[i2 * vertexStride + 5]); + + // Interpolate normal using barycentric coordinates + float3 interpolatedNormal = n0 * barycentrics.x + + n1 * barycentrics.y + + n2 * barycentrics.z; + + // Transform to world space using per-instance normal matrix + float3x3 nrmMat = float3x3(geoInfo.normalMatrix0.xyz, geoInfo.normalMatrix1.xyz, geoInfo.normalMatrix2.xyz); + float3 N = normalize(mul(nrmMat, interpolatedNormal)); + result.normal = N; + + // Read UVs and sample baseColor texture if available + float2 uv0 = float2(vertexBuffer[i0 * vertexStride + 6], vertexBuffer[i0 * vertexStride + 7]); + float2 uv1 = float2(vertexBuffer[i1 * vertexStride + 6], vertexBuffer[i1 * vertexStride + 7]); + float2 uv2 = float2(vertexBuffer[i2 * vertexStride + 6], vertexBuffer[i2 * vertexStride + 7]); + float2 uv = uv0 * barycentrics.x + uv1 * barycentrics.y + uv2 * 
barycentrics.z; + uv.y = 1.0 - uv.y; // flip V for glTF + result.uv = uv; + + // --- PBR texture sampling (explicit LOD 0 for compute) --- + float2 uvSample = uv; + float4 baseColor = float4(material.albedo, material.alpha); + float lodHint = computeTextureLOD(result.worldPos, material.roughness); + // For alpha-sensitive materials, prefer sampling by texture-table index even if the + // texture-set flag is missing. The texture table always provides a valid descriptor. + bool alphaSensitive = (material.alphaMode != 0) || (material.alpha < 0.999); + // For alpha-sensitive materials (decals/foliage), avoid noisy/unstable LOD selection in compute. + // Sampling baseColor at LOD 0 stabilizes alpha and prevents speckled transparency. + float baseColorLod = alphaSensitive ? 0.0 : lodHint; + if (material.baseColorTextureSet >= 0 || alphaSensitive) { + uint tiBase = (uint)min(max(material.baseColorTexIndex, 0), int(RQ_MAX_TEX - 1)); + baseColor = baseColorTex[NonUniformResourceIndex(tiBase)].SampleLevel(uvSample, baseColorLod); + baseColor *= float4(material.albedo, material.alpha); + result.texIndex = tiBase; + } else { + result.texIndex = 0u; + } + + result.baseColor = saturate(baseColor.rgb); + + float opacity = clamp(baseColor.a, 0.0, 1.0); + result.alphaMode = material.alphaMode; + result.opacity = opacity; + + // Physical descriptor: metallic-roughness (default) or spec-gloss + float4 mrOrSpecGloss = float4(1.0, 1.0, 1.0, 1.0); + if (material.physicalDescriptorTextureSet >= 0) { + uint tiPhys = (uint)min(max(material.physicalTexIndex, 0), int(RQ_MAX_TEX - 1)); + mrOrSpecGloss = baseColorTex[NonUniformResourceIndex(tiPhys)].SampleLevel(uvSample, lodHint); + } + + float metallic = 0.0; + float roughness = 1.0; + float3 F0 = float3(0.04, 0.04, 0.04); + float3 albedo = baseColor.rgb; + + if (material.useSpecGlossWorkflow != 0) { + float3 specColorSG = mrOrSpecGloss.rgb * material.specularFactor; + float gloss = clamp(mrOrSpecGloss.a * material.glossinessFactor, 0.0, 
1.0); + roughness = clamp(1.0 - gloss, 0.0, 1.0); + F0 = specColorSG; + float maxF0 = max(F0.r, max(F0.g, F0.b)); + albedo = baseColor.rgb * (1.0 - maxF0); + } else { + float metallicTex = mrOrSpecGloss.b; + float roughnessTex = mrOrSpecGloss.g; + metallic = clamp(metallicTex * material.metallic, 0.0, 1.0); + roughness = clamp(roughnessTex * material.roughness, 0.0, 1.0); + F0 = lerp(float3(0.04, 0.04, 0.04), baseColor.rgb, metallic); + albedo = baseColor.rgb * (1.0 - metallic); + } + + // Ambient occlusion + float ao = material.ao; + if (material.occlusionTextureSet >= 0) { + uint tiAO = (uint)min(max(material.occlusionTexIndex, 0), int(RQ_MAX_TEX - 1)); + ao *= baseColorTex[NonUniformResourceIndex(tiAO)].SampleLevel(uvSample, lodHint).r; + } + + // Emissive + float3 emissiveTex = float3(1.0, 1.0, 1.0); + if (material.emissiveTextureSet >= 0) { + uint tiE = (uint)min(max(material.emissiveTexIndex, 0), int(RQ_MAX_TEX - 1)); + emissiveTex = baseColorTex[NonUniformResourceIndex(tiE)].SampleLevel(uvSample, lodHint).rgb; + } + float3 emissive = emissiveTex * material.emissive; + if (material.hasEmissiveStrengthExt != 0) { + emissive *= material.emissiveStrength; + } + + // Store F0 so the caller can compute Fresnel for reflection/transmission. 
+ result.F0 = F0; + result.ior = max(material.ior, 1.0); + result.isLiquid = (material.isLiquid != 0); + + // Normal mapping (tangent space) + if (material.normalTextureSet >= 0) { + uint tiN = (uint)min(max(material.normalTexIndex, 0), int(RQ_MAX_TEX - 1)); + float3 tangentNormal = baseColorTex[NonUniformResourceIndex(tiN)].SampleLevel(uvSample, lodHint).xyz * 2.0 - 1.0; + + // Read and interpolate tangent (object-space) from vertex buffer + float4 t0 = float4(vertexBuffer[i0 * vertexStride + 8], + vertexBuffer[i0 * vertexStride + 9], + vertexBuffer[i0 * vertexStride + 10], + vertexBuffer[i0 * vertexStride + 11]); + float4 t1 = float4(vertexBuffer[i1 * vertexStride + 8], + vertexBuffer[i1 * vertexStride + 9], + vertexBuffer[i1 * vertexStride + 10], + vertexBuffer[i1 * vertexStride + 11]); + float4 t2 = float4(vertexBuffer[i2 * vertexStride + 8], + vertexBuffer[i2 * vertexStride + 9], + vertexBuffer[i2 * vertexStride + 10], + vertexBuffer[i2 * vertexStride + 11]); + float4 tan4 = t0 * barycentrics.x + t1 * barycentrics.y + t2 * barycentrics.z; + float3 T = normalize(mul(nrmMat, tan4.xyz)); + + // We flip V for glTF (uv.y = 1-uv.y). In tangent space this inverts bitangent. + // glTF tangent.w encodes bitangent sign in unflipped UV space, so negate it. 
+ float handedness = -tan4.w; + float3 B = normalize(cross(N, T)) * handedness; + float3x3 TBN = float3x3(T, B, N); + N = normalize(mul(TBN, tangentNormal)); + result.normal = N; + } + + // --- Direct lighting (GGX) --- + float3 V = normalize(-direction); + N = result.normal; + // Flip normal for backfaces so lighting works inside single-sided rooms + if (dot(N, V) < 0.0) { + N = -N; + result.normal = N; + } + + // Effective transmission for diffuse scaling (to avoid over-brightening transmissive surfaces) + float T_diff = clamp(material.transmissionFactor, 0.0, 1.0); + if (material.isGlass != 0) T_diff = max(T_diff, 0.9); + if (material.alphaMode == 2) T_diff = max(T_diff, 1.0 - clamp(baseColor.a, 0.0, 1.0)); + + float3 diffuseLighting = float3(0.0, 0.0, 0.0); + float3 specularLighting = float3(0.0, 0.0, 0.0); + + const bool shadowsEnabled = (ubo._pad1 != 0); + int lc = max(ubo.lightCount, 0); + for (int li = 0; li < lc; ++li) { + LightData light = lightBuffer[li]; + + // Determine whether to use multi-sample soft shadows for this light. + // Directional shadows stay hard for now. 
+ int samples = 1; + float softness = max(ubo.shadowSoftness, 0.0); + int reqSamples = max(ubo.shadowSampleCount, 1); + if (shadowsEnabled && reqSamples > 1 && softness > 0.0 && light.lightType != 1) { + samples = min(reqSamples, 32); + } + + // Build a stable-ish RNG for this light + uint lightRng = rqHash(rngState ^ (uint(li) * 747796405u) ^ rqHash(asuint(result.worldPos.x) + 31u * asuint(result.worldPos.y))); + + float3 diffAcc = float3(0.0, 0.0, 0.0); + float3 specAcc = float3(0.0, 0.0, 0.0); + + for (int si = 0; si < samples; ++si) { + float3 L; + float3 radiance; + float tMaxShadow = 10000.0; + + if (light.lightType == 1) { + // Directional + L = normalize(-light.position.xyz); + radiance = light.color.rgb; + } else { + // Point/spot/emissive + float3 lightPos = light.position.xyz; + + float3 toCenter = lightPos - result.worldPos; + float dCenter = length(toCenter); + float3 Lcenter = (dCenter > 1e-5) ? (toCenter / dCenter) : float3(0, 0, 1); + + // Effective area-light radius (in meters) as a function of range. + // `shadowSoftness` is authored as a fraction of `light.range`. + float lightRadius = softness * max(light.range, 0.0); + lightRadius = clamp(lightRadius, 0.0, 2.0); + + float3 samplePos = lightPos; + if (samples > 1 && lightRadius > 0.0) { + float3 up = (abs(Lcenter.y) < 0.999) ? float3(0, 1, 0) : float3(1, 0, 0); + float3 T = normalize(cross(up, Lcenter)); + float3 B = cross(Lcenter, T); + float2 d = rqSampleDisk(lightRng); + samplePos = lightPos + (T * d.x + B * d.y) * lightRadius; + } + + float3 toLight = samplePos - result.worldPos; + float d = length(toLight); + L = (d > 1e-5) ? 
(toLight / d) : float3(0, 0, 1); + tMaxShadow = max(d - RQ_RAY_EPS, RQ_RAY_EPS); + + float attenuation = 1.0; + if (light.lightType == 3) { + float r = max(light.range, 0.001); + attenuation = 1.0 / (1.0 + (d / r) * (d / r)); + } else { + attenuation = 1.0 / max(d * d, 0.0001); + // GLTF style range attenuation + if (light.range > 0.0) { + attenuation *= pow(saturate(1.0 - pow(d / light.range, 4.0)), 2.0); + } + } + radiance = light.color.rgb * attenuation; + + if (light.lightType == 2) { + // Spot light cone attenuation + float3 D = normalize(light.direction.xyz); + float cd = dot(D, -L); + float cosInner = cos(light.innerConeAngle); + float cosOuter = cos(light.outerConeAngle); + float spotAttenuation = saturate((cd - cosOuter) / max(cosInner - cosOuter, 0.0001)); + spotAttenuation *= spotAttenuation; + radiance *= spotAttenuation; + } + } + + float rawDot = dot(N, L); + float NdotL = (light.lightType == 3) ? abs(rawDot) : max(rawDot, 0.0); + if (NdotL <= 0.0) { + continue; + } + + float visibility = 1.0; + if (shadowsEnabled) { + // Coplanar decals (alpha BLEND) can self-occlude against the wall behind them + // when starting shadow rays along the surface normal. Bias along the light + // direction for BLEND to keep decal lighting stable under shadows. + float3 shadowOrigin = (result.alphaMode == 2) + ? (result.worldPos + L * (4.0 * RQ_RAY_EPS)) + : (result.worldPos + N * RQ_RAY_EPS); + bool occluded = traceShadowOccluded(shadowOrigin, L, RQ_RAY_EPS, tMaxShadow); + visibility = occluded ? 
0.0 : 1.0; + } + + float3 H = normalize(V + L); + float NdotV = max(dot(N, V), 0.0); + float NdotH = max(dot(N, H), 0.0); + float HdotV = max(dot(H, V), 0.0); + float D = DistributionGGX(NdotH, roughness); + float G = GeometrySmith(NdotV, NdotL, roughness); + float3 F = FresnelSchlick(HdotV, F0); + float3 spec = (D * G * F) / max(4.0 * NdotV * NdotL, 0.0001); + float3 kD = (1.0 - F) * (1.0 - metallic) * (1.0 - T_diff); + + specAcc += spec * radiance * NdotL * visibility; + diffAcc += (kD * albedo / PI) * radiance * NdotL * visibility; + } + + float inv = (samples > 1) ? (1.0 / float(samples)) : 1.0; + specularLighting += specAcc * inv; + diffuseLighting += diffAcc * inv; + } + + // Avoid ambient "fill" on true glass/transmission; it can make them look opaque/frosted. + // BUT keep ambient for alpha-BLEND decals/foliage so they don't become unnaturally dark overlays. + bool treatAsTransmissive = (material.isGlass != 0) || (material.transmissionFactor > 0.01); + float3 ambient = treatAsTransmissive ? float3(0.0, 0.0, 0.0) : (albedo * (0.1 * ubo.scaleIBLAmbient) * ao); + float3 color = ambient + diffuseLighting + specularLighting + emissive; + + result.color = color; + result.metallic = metallic; + result.roughness = roughness; + result.transmission = material.transmissionFactor; + result.isGlass = (material.isGlass != 0); + result.alphaMode = material.alphaMode; + // Keep texture-derived alpha/opacity (baseColor factor * baseColor texture). 
+ result.opacity = clamp(baseColor.a, 0.0, 1.0); + result.thinWalled = (material.thinWalled != 0); + } + + return result; +} + +// Compute shader entry point +[[shader("compute")]] +[numthreads(8, 8, 1)] +void main(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + uint2 pixelCoord = dispatchThreadID.xy; + uint2 imageDim = uint2(ubo.screenDimensions); + + // Bounds check + if (pixelCoord.x >= imageDim.x || pixelCoord.y >= imageDim.y) { + return; + } + + // Generate primary ray + float2 uv = (float2(pixelCoord) + 0.5) / float2(imageDim); + float2 ndc = uv * 2.0 - 1.0; + + // Compute ray direction using inverse view-projection + // Unproject a point on the near plane (z=0 in Vulkan NDC) to get direction + float4x4 invProj = inverse(ubo.proj); + float4x4 invView = inverse(ubo.view); + + // Unproject near plane point (z=0) and far plane point (z=1) to get ray direction + // Near plane in clip space: z=0, w=1 + float4 nearClip = float4(ndc, 0.0, 1.0); + float4 farClip = float4(ndc, 1.0, 1.0); + + // Transform to view space + float4 nearView = mul(invProj, nearClip); + float4 farView = mul(invProj, farClip); + nearView /= nearView.w; + farView /= farView.w; + + // Transform to world space + float3 nearWorld = mul(invView, nearView).xyz; + float3 farWorld = mul(invView, farView).xyz; + + // Primary ray from camera position through the pixel + float3 rayOrigin = ubo.camPos.xyz; + float3 rayDir = normalize(farWorld - nearWorld); + + uint rngState = rqHash(pixelCoord.x + 4099u * pixelCoord.y + 131071u); + HitInfo hit = traceRay(rayOrigin, rayDir, 0.0001, 10000.0, rngState); + + if (hit.hit) { + float3 c = shadeWithSecondaryRays(rayOrigin, rayDir, hit, rngState); + // Output linear HDR-ish color; composite pass will apply exposure/gamma. 
+ outputImage[pixelCoord] = float4(c, 1.0); + + } else { + // Sky/background color + outputImage[pixelCoord] = float4(skyColor(rayDir), 1.0); + } +} diff --git a/attachments/openxr_engine/shaders/texturedMesh.slang b/attachments/openxr_engine/shaders/texturedMesh.slang new file mode 100644 index 00000000..7c259573 --- /dev/null +++ b/attachments/openxr_engine/shaders/texturedMesh.slang @@ -0,0 +1,113 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// Combined vertex and fragment shader for textured mesh rendering +// This shader provides basic textured rendering with simple lighting + +// Input from vertex buffer +struct VSInput { + [[vk::location(0)]] float3 Position; + [[vk::location(1)]] float3 Normal; + [[vk::location(2)]] float2 TexCoord; + [[vk::location(3)]] float4 Tangent; + + // Per-instance data. Model matrix is a true 4x4, while the normal + // matrix is provided as three float4 columns that match the CPU + // layout (glm::mat3x4: 3 columns of vec4, xyz used, w unused). 
+    [[vk::location(4)]] column_major float4x4 InstanceModelMatrix; // binding 1 (consumes 4 locations)
+    [[vk::location(8)]] float4 InstanceNormal0;  // normal matrix column 0
+    [[vk::location(9)]] float4 InstanceNormal1;  // normal matrix column 1
+    [[vk::location(10)]] float4 InstanceNormal2; // normal matrix column 2
+};
+
+// Output from vertex shader / Input to fragment shader
+struct VSOutput {
+    float4 Position : SV_POSITION;
+    float3 WorldPos;
+    float3 Normal : NORMAL;
+    float2 TexCoord : TEXCOORD0;
+    float4 Tangent : TANGENT; // Pass through tangent to satisfy validation layer
+    uint ViewID : SV_ViewID;
+};
+
+// Uniform buffer (matched to renderer.h)
+struct UniformBufferObject {
+    float4x4 model;
+    float4x4 viewProjections[4];
+};
+
+// Bindings
+[[vk::binding(0, 0)]] ConstantBuffer<UniformBufferObject> ubo;
+[[vk::binding(1, 0)]] Sampler2D texSampler;
+
+// Vertex shader entry point
+[[shader("vertex")]]
+VSOutput VSMain(VSInput input, uint viewID : SV_ViewID)
+{
+    VSOutput output;
+    output.ViewID = viewID;
+
+    // Use instance matrices directly (CPU uploads column-major model
+    // matrix and three float4 normal-matrix columns in attributes
+    // 4..10)
+    float4x4 instanceModelMatrix = input.InstanceModelMatrix;
+
+    // Transform position to world space: entity model * instance model
+    float4 worldPos = mul(ubo.model, mul(instanceModelMatrix, float4(input.Position, 1.0)));
+
+    // Final clip space position
+    output.Position = mul(ubo.viewProjections[viewID], worldPos);
+
+    // Pass world position and transformed normal to fragment shader
+    // (apply entity model to normals too). Reconstruct the 3x3 normal
+    // matrix from the three uploaded columns and apply it in column
+    // form to avoid any row/column layout ambiguity.
+ float3x3 model3x3 = (float3x3)ubo.model; + output.WorldPos = worldPos.xyz; + + float3 instNormal = + input.InstanceNormal0.xyz * input.Normal.x + + input.InstanceNormal1.xyz * input.Normal.y + + input.InstanceNormal2.xyz * input.Normal.z; + + output.Normal = normalize(mul(model3x3, instNormal)); + output.TexCoord = input.TexCoord; + output.Tangent = input.Tangent; // Pass through tangent (unused in basic rendering) + + return output; +} + +// Fragment shader entry point +[[shader("fragment")]] +float4 PSMain(VSOutput input) : SV_TARGET +{ + // Sample the texture with flipped V coordinate (glTF UV origin vs Vulkan) + float2 uv = float2(input.TexCoord.x, 1.0 - input.TexCoord.y); + float4 texColor = texSampler.Sample(uv); + + // Simple directional lighting + float3 lightDir = normalize(float3(0.5, 1.0, 0.3)); // Fixed light direction + float3 normal = normalize(input.Normal); + float lightIntensity = max(dot(normal, lightDir), 0.2); // Minimum ambient of 0.2 + + // If texture is nearly white, use a default color to avoid washed-out look + float whiteness = (texColor.r + texColor.g + texColor.b) / 3.0; + float4 finalColor = (whiteness > 0.95) + ? float4(float3(0.8, 0.8, 0.8) * lightIntensity, 1.0) + : float4(texColor.rgb * lightIntensity, texColor.a); + + return finalColor; +} diff --git a/attachments/openxr_engine/shaders/tonemapping_utils.slang b/attachments/openxr_engine/shaders/tonemapping_utils.slang new file mode 100644 index 00000000..33646b79 --- /dev/null +++ b/attachments/openxr_engine/shaders/tonemapping_utils.slang @@ -0,0 +1,34 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Tonemapping utilities for HDR to LDR conversion
+// Shared between rasterization and ray query shaders
+
+// Hable/Uncharted 2 filmic tonemapping operator
+// Provides a cinematic look with good highlight rolloff
+namespace Hable_Filmic_Tonemapping {
+    static const float A = 0.15;
+    static const float B = 0.50;
+    static const float C = 0.10;
+    static const float D = 0.20;
+    static const float E = 0.02;
+    static const float F = 0.30;
+    static const float W = 11.2;
+
+    float3 Uncharted2Tonemap(float3 x) {
+        return ((x * (A * x + C * B) + D * E) / (x * (A * x + B) + D * F)) - E / F;
+    }
+}
diff --git a/attachments/openxr_engine/xr_context.cpp b/attachments/openxr_engine/xr_context.cpp
new file mode 100644
index 00000000..33242703
--- /dev/null
+++ b/attachments/openxr_engine/xr_context.cpp
@@ -0,0 +1,581 @@
+#include "xr_context.h"
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+#if defined(PLATFORM_ANDROID)
+#include <android/log.h>
+#include <android_native_app_glue.h>
+#include <jni.h>
+#endif
+
+XrContext::XrContext() :
+    instance(XR_NULL_HANDLE),
+    systemId(XR_NULL_SYSTEM_ID),
+    session(XR_NULL_HANDLE),
+    appSpace(XR_NULL_HANDLE),
+    actionSet(XR_NULL_HANDLE),
+    gazeSpace(XR_NULL_HANDLE)
+{}
+XrContext::~XrContext() { cleanup(); }
+
+bool XrContext::checkRuntimeAvailable() {
+    uint32_t extCount = 0;
+    if (xrEnumerateInstanceExtensionProperties(nullptr, 0, &extCount, nullptr) != XR_SUCCESS) {
+        return false;
+    }
+    std::vector<XrExtensionProperties> extensions(extCount, {XR_TYPE_EXTENSION_PROPERTIES});
+    if (xrEnumerateInstanceExtensionProperties(nullptr, extCount, &extCount, extensions.data()) != XR_SUCCESS)
{
+        return false;
+    }
+    for (const auto& ext : extensions) {
+        if (std::strcmp(ext.extensionName, XR_KHR_VULKAN_ENABLE2_EXTENSION_NAME) == 0) {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool XrContext::isExtensionEnabled(const char* extName) const {
+    for (const auto& ext : enabledExtensions) {
+        if (ext == extName) return true;
+    }
+    return false;
+}
+
+bool XrContext::createInstance(const std::string& appName) {
+    std::cout << "XrContext: Creating OpenXR instance for " << appName << std::endl;
+
+    uint32_t extCount = 0;
+    xrEnumerateInstanceExtensionProperties(nullptr, 0, &extCount, nullptr);
+    std::vector<XrExtensionProperties> availableExtensions(extCount, {XR_TYPE_EXTENSION_PROPERTIES});
+    xrEnumerateInstanceExtensionProperties(nullptr, extCount, &extCount, availableExtensions.data());
+
+    auto checkExt = [&](const char* name) {
+        for (const auto& ext : availableExtensions) {
+            if (std::strcmp(ext.extensionName, name) == 0) return true;
+        }
+        return false;
+    };
+
+    std::vector<const char*> extensions = { XR_KHR_VULKAN_ENABLE2_EXTENSION_NAME };
+    const char* optionalExtensions[] = {
+        XR_MSFT_SCENE_UNDERSTANDING_EXTENSION_NAME,
+        XR_EXT_EYE_GAZE_INTERACTION_EXTENSION_NAME,
+        XR_MSFT_HAND_INTERACTION_EXTENSION_NAME
+    };
+
+    for (auto ext : optionalExtensions) {
+        if (checkExt(ext)) {
+            extensions.push_back(ext);
+            enabledExtensions.push_back(ext);
+        }
+    }
+
+#if defined(PLATFORM_ANDROID)
+    if (checkExt(XR_KHR_ANDROID_CREATE_INSTANCE_EXTENSION_NAME)) {
+        extensions.push_back(XR_KHR_ANDROID_CREATE_INSTANCE_EXTENSION_NAME);
+        enabledExtensions.push_back(XR_KHR_ANDROID_CREATE_INSTANCE_EXTENSION_NAME);
+    }
+#endif
+
+    XrInstanceCreateInfo instanceCreateInfo{XR_TYPE_INSTANCE_CREATE_INFO};
+    std::strncpy(instanceCreateInfo.applicationInfo.applicationName, appName.c_str(), XR_MAX_APPLICATION_NAME_SIZE);
+    instanceCreateInfo.applicationInfo.apiVersion = XR_CURRENT_API_VERSION;
+    instanceCreateInfo.enabledExtensionCount = static_cast<uint32_t>(extensions.size());
+    instanceCreateInfo.enabledExtensionNames =
extensions.data(); + +#if defined(PLATFORM_ANDROID) + XrInstanceCreateInfoAndroidKHR androidCreateInfo{XR_TYPE_INSTANCE_CREATE_INFO_ANDROID_KHR}; + if (isExtensionEnabled(XR_KHR_ANDROID_CREATE_INSTANCE_EXTENSION_NAME)) { + if (!androidApp) { + std::cerr << "XrContext: androidApp not set" << std::endl; + return false; + } + androidCreateInfo.applicationVM = androidApp->activity->vm; + androidCreateInfo.applicationActivity = androidApp->activity->clazz; + instanceCreateInfo.next = &androidCreateInfo; + } +#endif + + if (xrCreateInstance(&instanceCreateInfo, &this->instance) != XR_SUCCESS) { + return false; + } + + // Load Vulkan extension functions + xrGetInstanceProcAddr(instance, "xrGetVulkanInstanceExtensionsKHR", (PFN_xrVoidFunction*)&pfnGetVulkanInstanceExtensionsKHR); + xrGetInstanceProcAddr(instance, "xrGetVulkanDeviceExtensionsKHR", (PFN_xrVoidFunction*)&pfnGetVulkanDeviceExtensionsKHR); + xrGetInstanceProcAddr(instance, "xrGetVulkanGraphicsRequirementsKHR", (PFN_xrVoidFunction*)&pfnGetVulkanGraphicsRequirementsKHR); + xrGetInstanceProcAddr(instance, "xrGetVulkanGraphicsRequirements2KHR", (PFN_xrVoidFunction*)&pfnGetVulkanGraphicsRequirements2KHR); + xrGetInstanceProcAddr(instance, "xrGetVulkanGraphicsDeviceKHR", (PFN_xrVoidFunction*)&pfnGetVulkanGraphicsDeviceKHR); + xrGetInstanceProcAddr(instance, "xrGetVulkanGraphicsDevice2KHR", (PFN_xrVoidFunction*)&pfnGetVulkanGraphicsDevice2KHR); + + XrSystemGetInfo systemGetInfo{XR_TYPE_SYSTEM_GET_INFO}; + systemGetInfo.formFactor = XR_FORM_FACTOR_HEAD_MOUNTED_DISPLAY; + if (xrGetSystem(this->instance, &systemGetInfo, &this->systemId) != XR_SUCCESS) { + return false; + } + + return true; +} + +bool XrContext::createSession(vk::PhysicalDevice physicalDevice, vk::Device device, uint32_t queueFamilyIndex, uint32_t queueIndex) { + std::cout << "XrContext: Creating session" << std::endl; + + XrGraphicsBindingVulkanKHR graphicsBinding{XR_TYPE_GRAPHICS_BINDING_VULKAN_KHR}; + graphicsBinding.instance = 
(VkInstance)this->vkInstance; + graphicsBinding.physicalDevice = (VkPhysicalDevice)physicalDevice; + graphicsBinding.device = (VkDevice)device; + graphicsBinding.queueFamilyIndex = queueFamilyIndex; + graphicsBinding.queueIndex = queueIndex; + + XrSessionCreateInfo sessionCreateInfo{XR_TYPE_SESSION_CREATE_INFO}; + sessionCreateInfo.next = &graphicsBinding; + sessionCreateInfo.systemId = this->systemId; + if (xrCreateSession(this->instance, &sessionCreateInfo, &this->session) != XR_SUCCESS) { + return false; + } + + XrReferenceSpaceCreateInfo spaceCreateInfo{XR_TYPE_REFERENCE_SPACE_CREATE_INFO}; + spaceCreateInfo.referenceSpaceType = XR_REFERENCE_SPACE_TYPE_STAGE; + spaceCreateInfo.poseInReferenceSpace = {{0,0,0,1}, {0,0,0}}; + if (xrCreateReferenceSpace(this->session, &spaceCreateInfo, &this->appSpace) != XR_SUCCESS) { + return false; + } + + this->views.resize(2, {XR_TYPE_VIEW}); + for (uint32_t i = 0; i < 2; ++i) { + this->views[i].pose = {{0,0,0,1}, {0,0,0}}; + this->views[i].fov = {-1, 1, 1, -1}; + } + + XrActionSetCreateInfo actionSetInfo{XR_TYPE_ACTION_SET_CREATE_INFO}; + std::strncpy(actionSetInfo.actionSetName, "main", XR_MAX_ACTION_SET_NAME_SIZE); + std::strncpy(actionSetInfo.localizedActionSetName, "Main Actions", XR_MAX_LOCALIZED_ACTION_SET_NAME_SIZE); + xrCreateActionSet(this->instance, &actionSetInfo, &this->actionSet); + + auto createAction = [&](const std::string& name, const std::string& localizedName, XrActionType type) { + XrActionCreateInfo actionInfo{XR_TYPE_ACTION_CREATE_INFO}; + actionInfo.actionType = type; + std::strncpy(actionInfo.actionName, name.c_str(), XR_MAX_ACTION_NAME_SIZE); + std::strncpy(actionInfo.localizedActionName, localizedName.c_str(), XR_MAX_LOCALIZED_ACTION_NAME_SIZE); + XrAction action; + xrCreateAction(this->actionSet, &actionInfo, &action); + this->actions[name] = action; + this->actionTypes[name] = type; + }; + + createAction("trigger_left", "Left Trigger", XR_ACTION_TYPE_BOOLEAN_INPUT); + createAction("trigger_right", 
"Right Trigger", XR_ACTION_TYPE_BOOLEAN_INPUT); + createAction("pose_left", "Left Hand Pose", XR_ACTION_TYPE_POSE_INPUT); + createAction("pose_right", "Right Hand Pose", XR_ACTION_TYPE_POSE_INPUT); + createAction("grab_left", "Left Grab", XR_ACTION_TYPE_FLOAT_INPUT); + createAction("grab_right", "Right Grab", XR_ACTION_TYPE_FLOAT_INPUT); + createAction("Grab", "Grab", XR_ACTION_TYPE_BOOLEAN_INPUT); + createAction("GrabPose", "Grab Pose", XR_ACTION_TYPE_POSE_INPUT); + createAction("menu", "Menu Button", XR_ACTION_TYPE_BOOLEAN_INPUT); + + XrPath khrSimplePath; + xrStringToPath(this->instance, "/interaction_profiles/khr/simple_controller", &khrSimplePath); + std::vector bindings; + auto addBinding = [&](const std::string& act, const char* path) { + XrPath p; + xrStringToPath(this->instance, path, &p); + bindings.push_back({actions[act], p}); + }; + addBinding("trigger_left", "/user/hand/left/input/select/click"); + addBinding("trigger_right", "/user/hand/right/input/select/click"); + addBinding("pose_left", "/user/hand/left/input/grip/pose"); + addBinding("pose_right", "/user/hand/right/input/grip/pose"); + addBinding("Grab", "/user/hand/right/input/select/click"); + addBinding("GrabPose", "/user/hand/right/input/grip/pose"); + addBinding("menu", "/user/hand/left/input/menu/click"); + + XrInteractionProfileSuggestedBinding suggestedBindings{XR_TYPE_INTERACTION_PROFILE_SUGGESTED_BINDING}; + suggestedBindings.interactionProfile = khrSimplePath; + suggestedBindings.suggestedBindings = bindings.data(); + suggestedBindings.countSuggestedBindings = (uint32_t)bindings.size(); + xrSuggestInteractionProfileBindings(this->instance, &suggestedBindings); + + XrSessionActionSetsAttachInfo attachInfo{XR_TYPE_SESSION_ACTION_SETS_ATTACH_INFO}; + attachInfo.countActionSets = 1; + attachInfo.actionSets = &this->actionSet; + xrAttachSessionActionSets(this->session, &attachInfo); + + for (const auto& actionName : {"pose_left", "pose_right", "GrabPose"}) { + XrActionSpaceCreateInfo 
actionSpaceInfo{XR_TYPE_ACTION_SPACE_CREATE_INFO}; + actionSpaceInfo.action = actions[actionName]; + actionSpaceInfo.poseInActionSpace = {{0,0,0,1}, {0,0,0}}; + XrSpace space; + xrCreateActionSpace(this->session, &actionSpaceInfo, &space); + this->actionSpaces[actionName] = space; + } + + if (isExtensionEnabled(XR_EXT_EYE_GAZE_INTERACTION_EXTENSION_NAME)) { + XrReferenceSpaceCreateInfo gazeSpaceInfo{XR_TYPE_REFERENCE_SPACE_CREATE_INFO}; + gazeSpaceInfo.referenceSpaceType = (XrReferenceSpaceType)1000031008; // XR_REFERENCE_SPACE_TYPE_EYE_GAZE_EXT + gazeSpaceInfo.poseInReferenceSpace = {{0,0,0,1}, {0,0,0}}; + xrCreateReferenceSpace(this->session, &gazeSpaceInfo, &this->gazeSpace); + } + + return true; +} + +void XrContext::cleanup() { + for (auto& swapchain : swapchains) { + xrDestroySwapchain(swapchain.handle); + } + swapchains.clear(); + + if (gazeSpace != XR_NULL_HANDLE) { + xrDestroySpace(gazeSpace); + gazeSpace = XR_NULL_HANDLE; + } + + for (auto& pair : actionSpaces) { + xrDestroySpace(pair.second); + } + actionSpaces.clear(); + + for (auto& pair : actions) { + xrDestroyAction(pair.second); + } + actions.clear(); + + if (actionSet != XR_NULL_HANDLE) { + xrDestroyActionSet(actionSet); + actionSet = XR_NULL_HANDLE; + } + + if (appSpace != XR_NULL_HANDLE) { + xrDestroySpace(appSpace); + appSpace = XR_NULL_HANDLE; + } + + if (session != XR_NULL_HANDLE) { + xrDestroySession(session); + session = XR_NULL_HANDLE; + } + + if (instance != XR_NULL_HANDLE) { + xrDestroyInstance(instance); + instance = XR_NULL_HANDLE; + } +} + +std::vector XrContext::getVulkanInstanceExtensions() { + if (instance == XR_NULL_HANDLE) return { "XR_KHR_vulkan_enable2" }; + uint32_t size = 0; + if (!pfnGetVulkanInstanceExtensionsKHR) return {}; + pfnGetVulkanInstanceExtensionsKHR(instance, systemId, 0, &size, nullptr); + std::vector buffer(size); + pfnGetVulkanInstanceExtensionsKHR(instance, systemId, size, &size, buffer.data()); + + static std::vector extStrings; + extStrings.clear(); + 
std::string extensions(buffer.data()); + std::istringstream iss(extensions); + std::string ext; + while (iss >> ext) extStrings.push_back(ext); + + static std::vector extPtrs; + extPtrs.clear(); + for (const auto& s : extStrings) extPtrs.push_back(s.c_str()); + return extPtrs; +} + +std::vector XrContext::getVulkanDeviceExtensions(vk::PhysicalDevice physicalDevice) { + if (instance == XR_NULL_HANDLE) return { "VK_KHR_external_memory", "VK_KHR_external_semaphore" }; + uint32_t size = 0; + if (!pfnGetVulkanDeviceExtensionsKHR) return {}; + pfnGetVulkanDeviceExtensionsKHR(instance, systemId, 0, &size, nullptr); + std::vector buffer(size); + pfnGetVulkanDeviceExtensionsKHR(instance, systemId, size, &size, buffer.data()); + + static std::vector devExtStrings; + devExtStrings.clear(); + std::string extensions(buffer.data()); + std::istringstream iss(extensions); + std::string ext; + while (iss >> ext) devExtStrings.push_back(ext); + + static std::vector devExtPtrs; + devExtPtrs.clear(); + for (const auto& s : devExtStrings) devExtPtrs.push_back(s.c_str()); + return devExtPtrs; +} + +const uint8_t* XrContext::getRequiredLUID() { + if (!luidValid && vkInstance && instance != XR_NULL_HANDLE && systemId != XR_NULL_SYSTEM_ID) { + // Step 1: Call graphics requirements as mandated by spec before getting graphics device + XrGraphicsRequirementsVulkanKHR graphicsRequirements{XR_TYPE_GRAPHICS_REQUIREMENTS_VULKAN_KHR}; + if (pfnGetVulkanGraphicsRequirements2KHR) { + pfnGetVulkanGraphicsRequirements2KHR(instance, systemId, &graphicsRequirements); + } else if (pfnGetVulkanGraphicsRequirementsKHR) { + pfnGetVulkanGraphicsRequirementsKHR(instance, systemId, &graphicsRequirements); + } + + // Step 2: Get the physical device from OpenXR + VkPhysicalDevice vkPhysicalDevice = VK_NULL_HANDLE; + XrResult result = XR_ERROR_FUNCTION_UNSUPPORTED; + + if (pfnGetVulkanGraphicsDevice2KHR) { + XrVulkanGraphicsDeviceGetInfoKHR getInfo{XR_TYPE_VULKAN_GRAPHICS_DEVICE_GET_INFO_KHR}; + getInfo.systemId 
= systemId; + getInfo.vulkanInstance = (VkInstance)vkInstance; + result = pfnGetVulkanGraphicsDevice2KHR(instance, &getInfo, &vkPhysicalDevice); + } else if (pfnGetVulkanGraphicsDeviceKHR) { + result = pfnGetVulkanGraphicsDeviceKHR(instance, systemId, (VkInstance)vkInstance, &vkPhysicalDevice); + } + + if (result == XR_SUCCESS && vkPhysicalDevice != VK_NULL_HANDLE) { + // Step 3: Extract LUID from the physical device + VkPhysicalDeviceIDProperties idProps{VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES}; + idProps.pNext = nullptr; + VkPhysicalDeviceProperties2 props2{VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2}; + props2.pNext = &idProps; + + auto pfnGetPhysicalDeviceProperties2 = (PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr((VkInstance)vkInstance, "vkGetPhysicalDeviceProperties2"); + if (pfnGetPhysicalDeviceProperties2) { + pfnGetPhysicalDeviceProperties2(vkPhysicalDevice, &props2); + if (idProps.deviceLUIDValid) { + std::memcpy(requiredLuid, idProps.deviceLUID, VK_LUID_SIZE); + luidValid = true; + std::cout << "XrContext: Required LUID found and stored." << std::endl; + } else { + std::cout << "XrContext: Physical device LUID is not valid." << std::endl; + } + } else { + std::cerr << "XrContext: Failed to load vkGetPhysicalDeviceProperties2" << std::endl; + } + } else { + std::cerr << "XrContext: Failed to get Vulkan graphics device from OpenXR (XrResult=" << result << ")" << std::endl; + } + } + return luidValid ? 
requiredLuid : nullptr; +} + +vk::Extent2D XrContext::getRecommendedExtent() const { + uint32_t count = 0; + xrEnumerateViewConfigurationViews(instance, systemId, XR_VIEW_CONFIGURATION_TYPE_PRIMARY_STEREO, 0, &count, nullptr); + std::vector vcv(count, {XR_TYPE_VIEW_CONFIGURATION_VIEW}); + xrEnumerateViewConfigurationViews(instance, systemId, XR_VIEW_CONFIGURATION_TYPE_PRIMARY_STEREO, count, &count, vcv.data()); + if (vcv.empty()) return {1024, 1024}; + vk::Extent2D ext{vcv[0].recommendedImageRectWidth, vcv[0].recommendedImageRectHeight}; + return ext; +} + +void XrContext::createSwapchains(vk::Device device, vk::Format format, vk::Extent2D extent) { + this->format = format; + this->extent = extent; + + XrSwapchainCreateInfo ci{XR_TYPE_SWAPCHAIN_CREATE_INFO}; + ci.arraySize = 2; + ci.format = (int64_t)format; + ci.width = extent.width; + ci.height = extent.height; + ci.mipCount = 1; + ci.faceCount = 1; + ci.sampleCount = 1; + ci.usageFlags = XR_SWAPCHAIN_USAGE_COLOR_ATTACHMENT_BIT; + + XrSwapchain handle; + xrCreateSwapchain(this->session, &ci, &handle); + + uint32_t count = 0; + xrEnumerateSwapchainImages(handle, 0, &count, nullptr); + std::vector images(count, {XR_TYPE_SWAPCHAIN_IMAGE_VULKAN_KHR}); + xrEnumerateSwapchainImages(handle, count, &count, (XrSwapchainImageBaseHeader*)images.data()); + + SwapchainData data; + data.handle = handle; + data.images = std::move(images); + this->swapchains.push_back(std::move(data)); +} + +std::vector XrContext::enumerateSwapchainImages() { + std::vector vkImages; + if (!swapchains.empty()) { + for (const auto& img : swapchains[0].images) { + vkImages.push_back(vk::Image(img.image)); + } + } + return vkImages; +} + +void XrContext::waitSwapchainImage() { + if (swapchains.empty()) return; + XrSwapchainImageWaitInfo wi{XR_TYPE_SWAPCHAIN_IMAGE_WAIT_INFO}; + wi.timeout = XR_INFINITE_DURATION; + xrWaitSwapchainImage(swapchains[0].handle, &wi); +} + +uint32_t XrContext::acquireSwapchainImage() { + if (swapchains.empty()) return 0; + 
XrSwapchainImageAcquireInfo ai{XR_TYPE_SWAPCHAIN_IMAGE_ACQUIRE_INFO}; + uint32_t index = 0; + xrAcquireSwapchainImage(swapchains[0].handle, &ai, &index); + return index; +} + +void XrContext::releaseSwapchainImage() { + if (swapchains.empty()) return; + XrSwapchainImageReleaseInfo ri{XR_TYPE_SWAPCHAIN_IMAGE_RELEASE_INFO}; + xrReleaseSwapchainImage(swapchains[0].handle, &ri); +} + +XrFrameState XrContext::waitFrame() { + XrFrameWaitInfo wi{XR_TYPE_FRAME_WAIT_INFO}; + this->frameState = {XR_TYPE_FRAME_STATE}; + xrWaitFrame(this->session, &wi, &this->frameState); + return this->frameState; +} + +void XrContext::beginFrame() { + XrFrameBeginInfo bi{XR_TYPE_FRAME_BEGIN_INFO}; + xrBeginFrame(this->session, &bi); +} + +void XrContext::endFrame(const std::array, 2>& eyeViews) { + XrFrameEndInfo ei{XR_TYPE_FRAME_END_INFO}; + ei.displayTime = this->frameState.predictedDisplayTime; + ei.environmentBlendMode = XR_ENVIRONMENT_BLEND_MODE_OPAQUE; + + XrCompositionLayerProjection layer{XR_TYPE_COMPOSITION_LAYER_PROJECTION}; + layer.space = this->appSpace; + + static XrCompositionLayerProjectionView projectionViews[2]; + for (uint32_t i = 0; i < 2; ++i) { + projectionViews[i] = {XR_TYPE_COMPOSITION_LAYER_PROJECTION_VIEW}; + projectionViews[i].pose = this->views[i].pose; + projectionViews[i].fov = this->views[i].fov; + projectionViews[i].subImage.swapchain = this->swapchains[0].handle; + projectionViews[i].subImage.imageRect = {{0, 0}, {(int32_t)extent.width, (int32_t)extent.height}}; + projectionViews[i].subImage.imageArrayIndex = i; + } + + layer.viewCount = 2; + layer.views = projectionViews; + + static const XrCompositionLayerBaseHeader* layers[1]; + uint32_t layerCount = 0; + if (this->frameState.shouldRender) { + layers[layerCount++] = (XrCompositionLayerBaseHeader*)&layer; + } + + ei.layerCount = layerCount; + ei.layers = layers; + + xrEndFrame(this->session, &ei); +} + +void XrContext::locateViews(XrTime predictedTime) { + XrViewLocateInfo li{XR_TYPE_VIEW_LOCATE_INFO}; + 
li.viewConfigurationType = XR_VIEW_CONFIGURATION_TYPE_PRIMARY_STEREO; + li.displayTime = predictedTime; + li.space = this->appSpace; + + XrViewState vs{XR_TYPE_VIEW_STATE}; + uint32_t count = 0; + xrLocateViews(this->session, &li, &vs, (uint32_t)this->views.size(), &count, this->views.data()); +} + +vk::Viewport XrContext::getViewport(uint32_t eye) const { + return vk::Viewport(0, 0, (float)extent.width, (float)extent.height, 0.0f, 1.0f); +} + +vk::Rect2D XrContext::getScissor(uint32_t eye) const { + return vk::Rect2D({0, 0}, extent); +} + +glm::mat4 XrContext::getProjectionMatrix(uint32_t eye) const { + if (eye >= views.size()) return glm::mat4(1.0f); + const auto& fov = views[eye].fov; + float nearZ = 0.1f, farZ = 100.0f; + float tanLeft = std::tan(fov.angleLeft), tanRight = std::tan(fov.angleRight); + float tanDown = std::tan(fov.angleDown), tanUp = std::tan(fov.angleUp); + float tanWidth = tanRight - tanLeft, tanHeight = tanUp - tanDown; + glm::mat4 projection = glm::mat4(0.0f); + projection[0][0] = 2.0f / tanWidth; + projection[1][1] = 2.0f / tanHeight; + projection[2][0] = (tanRight + tanLeft) / tanWidth; + projection[2][1] = (tanUp + tanDown) / tanHeight; + projection[2][2] = -farZ / (farZ - nearZ); + projection[2][3] = -1.0f; + projection[3][2] = -(farZ * nearZ) / (farZ - nearZ); + return projection; +} + +glm::mat4 XrContext::getViewMatrix(uint32_t eye) const { + if (eye >= views.size()) return glm::mat4(1.0f); + return glm::inverse(xrPoseToMatrix(views[eye].pose)); +} + +glm::vec3 XrContext::getEyePosition(uint32_t eye) const { + if (eye >= views.size()) return glm::vec3(0.0f); + return glm::vec3(views[eye].pose.position.x, views[eye].pose.position.y, views[eye].pose.position.z); +} + +void XrContext::pollActions() { + if (session == XR_NULL_HANDLE || actionSet == XR_NULL_HANDLE) return; + XrActionsSyncInfo si{XR_TYPE_ACTIONS_SYNC_INFO}; + static XrActiveActionSet as; + as.actionSet = actionSet; + as.subactionPath = XR_NULL_PATH; + si.activeActionSets = 
&as; + si.countActiveActionSets = 1; + xrSyncActions(session, &si); +} + +bool XrContext::isActionActive(const std::string& name) const { + if (session == XR_NULL_HANDLE || actions.find(name) == actions.end()) return false; + XrAction action = actions.at(name); + XrActionType type = actionTypes.at(name); + XrActionStateGetInfo gi{XR_TYPE_ACTION_STATE_GET_INFO}; + gi.action = action; + if (type == XR_ACTION_TYPE_BOOLEAN_INPUT) { + XrActionStateBoolean st{XR_TYPE_ACTION_STATE_BOOLEAN}; + xrGetActionStateBoolean(session, &gi, &st); + if (st.isActive) return st.currentState; + } else if (type == XR_ACTION_TYPE_FLOAT_INPUT) { + XrActionStateFloat st{XR_TYPE_ACTION_STATE_FLOAT}; + xrGetActionStateFloat(session, &gi, &st); + if (st.isActive) return st.currentState > 0.1f; + } + return false; +} + +XrPosef XrContext::getActionPose(const std::string& name) const { + if (session == XR_NULL_HANDLE || actionSpaces.find(name) == actionSpaces.end()) { + return {{0,0,0,1}, {0,0,0}}; + } + XrAction action = actions.at(name); + XrSpace space = actionSpaces.at(name); + XrSpaceLocation loc{XR_TYPE_SPACE_LOCATION}; + xrLocateSpace(space, appSpace, frameState.predictedDisplayTime, &loc); + if ((loc.locationFlags & XR_SPACE_LOCATION_ORIENTATION_VALID_BIT) && (loc.locationFlags & XR_SPACE_LOCATION_POSITION_VALID_BIT)) { + return loc.pose; + } + return {{0,0,0,1}, {0,0,0}}; +} + +std::vector XrContext::getLatestSpatialMeshes() { + return {}; +} + +glm::vec2 XrContext::getGazeNDC() const { + if (gazeSpace == XR_NULL_HANDLE || views.empty()) return glm::vec2(0.5f, 0.5f); + XrSpaceLocation loc{XR_TYPE_SPACE_LOCATION}; + xrLocateSpace(gazeSpace, appSpace, frameState.predictedDisplayTime, &loc); + if (!(loc.locationFlags & XR_SPACE_LOCATION_ORIENTATION_VALID_BIT)) return glm::vec2(0.5f, 0.5f); + glm::mat4 gazeMat = xrPoseToMatrix(loc.pose); + glm::vec3 gazeOrigin = glm::vec3(gazeMat[3]); + glm::vec3 gazeDir = -glm::vec3(gazeMat[2]); + glm::mat4 viewProj = getProjectionMatrix(0) * 
getViewMatrix(0); + glm::vec4 projected = viewProj * glm::vec4(gazeOrigin + gazeDir, 1.0f); + if (projected.w == 0.0f) return glm::vec2(0.5f, 0.5f); + glm::vec3 ndc = glm::vec3(projected) / projected.w; + return glm::vec2(ndc.x * 0.5f + 0.5f, ndc.y * 0.5f + 0.5f); +} diff --git a/attachments/openxr_engine/xr_context.h b/attachments/openxr_engine/xr_context.h new file mode 100644 index 00000000..4d744ab5 --- /dev/null +++ b/attachments/openxr_engine/xr_context.h @@ -0,0 +1,139 @@ +#pragma once + +#include +#include +#define XR_USE_PLATFORM_XLIB +#define XR_USE_GRAPHICS_API_VULKAN +#include +#include +#include +#include +#include +#include +#include +#include + +// Helper structure for spatial meshes (Chapter 16) +struct XrSpatialMesh { + XrUuidMSFT meshGuid; + std::vector vertices; + std::vector indices; + glm::mat4 transform; +}; + +class XrContext { +public: + XrContext(); + ~XrContext(); + + bool createInstance(const std::string& appName); + void setVulkanInstance(vk::Instance instance) { vkInstance = instance; } + bool createSession(vk::PhysicalDevice physicalDevice, vk::Device device, uint32_t queueFamilyIndex, uint32_t queueIndex); + void cleanup(); + +#if defined(PLATFORM_ANDROID) + void setAndroidApp(struct android_app* app) { androidApp = app; } +#endif + + // Core Handshake (Chapter 2) + std::vector getVulkanInstanceExtensions(); + std::vector getVulkanDeviceExtensions(vk::PhysicalDevice physicalDevice); + const uint8_t* getRequiredLUID(); + + // Swapchain Management (Chapter 3 & 8) + vk::Extent2D getRecommendedExtent() const; + void createSwapchains(vk::Device device, vk::Format format, vk::Extent2D extent); + std::vector enumerateSwapchainImages(); // Returns images with 2 layers for multiview + vk::Extent2D getSwapchainExtent() const { return extent; } + vk::Format getSwapchainFormat() const { return format; } + + void waitSwapchainImage(); + uint32_t acquireSwapchainImage(); + void releaseSwapchainImage(); + + // Frame Lifecycle (Chapter 5) + 
XrFrameState waitFrame(); + void beginFrame(); + void endFrame(const std::array, 2>& eyeViews); + + // View & Projection (Chapter 4 & 11) + void locateViews(XrTime predictedTime); + std::vector getLatestViews() const { return views; } + std::array getLatestViewPoses() const { return {views[0].pose, views[1].pose}; } + vk::Viewport getViewport(uint32_t eye) const; + vk::Rect2D getScissor(uint32_t eye) const; + glm::mat4 getProjectionMatrix(uint32_t eye) const; + glm::mat4 getViewMatrix(uint32_t eye) const; + glm::vec3 getEyePosition(uint32_t eye) const; + + // Input Actions (Chapter 7) + void pollActions(); + bool isActionActive(const std::string& name) const; + XrPosef getActionPose(const std::string& name) const; + + // Scene Understanding (Chapter 16) + std::vector getLatestSpatialMeshes(); + + // ML & Occlusion (Chapter 17 & 18) + glm::vec2 getGazeNDC() const; + XrReferenceSpaceType getReferenceSpace() const { return referenceSpaceType; } + + bool isExtensionEnabled(const char* extName) const; + static bool checkRuntimeAvailable(); + +private: + PFN_xrGetVulkanInstanceExtensionsKHR pfnGetVulkanInstanceExtensionsKHR = nullptr; + PFN_xrGetVulkanDeviceExtensionsKHR pfnGetVulkanDeviceExtensionsKHR = nullptr; + PFN_xrGetVulkanGraphicsRequirementsKHR pfnGetVulkanGraphicsRequirementsKHR = nullptr; + PFN_xrGetVulkanGraphicsRequirements2KHR pfnGetVulkanGraphicsRequirements2KHR = nullptr; + PFN_xrGetVulkanGraphicsDeviceKHR pfnGetVulkanGraphicsDeviceKHR = nullptr; + PFN_xrGetVulkanGraphicsDevice2KHR pfnGetVulkanGraphicsDevice2KHR = nullptr; + + XrInstance instance; + vk::Instance vkInstance; + XrSystemId systemId; + XrSession session; + XrSpace appSpace; + XrReferenceSpaceType referenceSpaceType = XR_REFERENCE_SPACE_TYPE_STAGE; + + uint8_t requiredLuid[VK_LUID_SIZE] = {0}; + bool luidValid = false; + +#if defined(PLATFORM_ANDROID) + struct android_app* androidApp = nullptr; +#endif + + vk::Format format; + vk::Extent2D extent; + + struct SwapchainData { + XrSwapchain 
handle; + std::vector images; + }; + std::vector swapchains; + + XrFrameState frameState; + std::vector views; + + // Action system members + XrActionSet actionSet; + std::map actions; + std::map actionTypes; + std::map actionSpaces; + + // Gaze interaction member + XrSpace gazeSpace; + + // Scene understanding member + // XrSceneObserverMSFT sceneObserver; + + std::vector enabledExtensions; +}; + +// Common Helper: Convert XrPosef to glm::mat4 +inline glm::mat4 xrPoseToMatrix(const XrPosef& pose) { + glm::quat q(pose.orientation.w, pose.orientation.x, pose.orientation.y, pose.orientation.z); + glm::mat4 m = glm::mat4_cast(q); + m[3] = glm::vec4(pose.position.x, pose.position.y, pose.position.z, 1.0f); + return m; +} diff --git a/en/Building_a_Simple_Engine/Tooling/index.adoc b/en/Building_a_Simple_Engine/Tooling/index.adoc index 78375263..cd18ccdf 100644 --- a/en/Building_a_Simple_Engine/Tooling/index.adoc +++ b/en/Building_a_Simple_Engine/Tooling/index.adoc @@ -12,4 +12,4 @@ This chapter covers essential tooling and techniques for developing, debugging, * xref:Building_a_Simple_Engine/Tooling/06_packaging_and_distribution.adoc[Packaging and Distribution] * xref:Building_a_Simple_Engine/Tooling/07_conclusion.adoc[Conclusion] -xref:Subsystems/06_conclusion.adoc[Previous: Subsystems Conclusion] | xref:../index.adoc[Back to Building a Simple Engine] +xref:Building_a_Simple_Engine/Subsystems/06_conclusion.adoc[Previous: Subsystems Conclusion] | xref:Building_a_Simple_Engine/introduction.adoc[Back to Building a Simple Engine] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/01_introduction.adoc new file mode 100644 index 00000000..80a6add4 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/01_introduction.adoc @@ -0,0 +1,26 @@ +:pp: {plus}{plus} += The OpenXR-Vulkan 1.4 Handshake: Introduction + +Before we can render a single 
pixel to an XR headset, we must first establish a "handshake" between our application, the OpenXR runtime, and the Vulkan graphics driver. This is not as simple as creating a standard Vulkan instance and passing it to a library; it requires a coordinated dance of extension negotiation and hardware verification. + +In this chapter, we are going to look at the three pillars of a successful spatial handshake: + +1. **System Integration**: How to extend our engine's `VulkanContext` to support the mandatory OpenXR extensions, specifically `XR_KHR_vulkan_enable2`. +2. **Hardware Alignment**: Utilizing the Locally Unique Identifier (**LUID**) to ensure that both OpenXR and Vulkan are talking to the exact same physical GPU. This is critical for cross-process memory visibility and performance. +3. **Vulkan 1.4 Requirements**: Activating the modern features—like Timeline Semaphores, Dynamic Rendering, and Synchronization 2—that allow our spatial pipeline to operate with minimal latency. + +== The Concept of the Handshake + +In a standard desktop application, your engine is the boss. It creates the instance, chooses the device, and owns the swapchain. In OpenXR, the relationship is more of a partnership. The **XR Runtime** (the software that drives the headset, like SteamVR or the Oculus service) needs to know exactly how your Vulkan instance is configured so it can safely inject its own compositor layers into your rendering stream. + +If you don't perform this handshake correctly, you might find that you can't initialize the XR session, or worse, you'll experience massive performance drops as the hardware is forced to copy images between different GPU memory contexts. + +== Vulkan 1.4 Benefits in an OpenXR Application + +* **Low-latency synchronization** between the CPU simulation and the GPU compositor. +* **Single-pass rendering** via multiview. +* **Direct-to-display submission** without the overhead of legacy render pass state. 
+ +By the end of this chapter, you will have modified your engine's initialization code to be fully spatial-aware, laying the groundwork for the predictive frame loop and runtime-owned swapchains that follow. + +xref:OpenXR_Vulkan_Spatial_Computing/introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/02_system_integration.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/02_system_integration.adoc b/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/02_system_integration.adoc new file mode 100644 index 00000000..725b6ec3 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/02_system_integration.adoc @@ -0,0 +1,81 @@ +:pp: {plus}{plus} += System Integration + +In a standard desktop application, our engine is typically responsible for creating a `VkInstance` and selecting a `VkPhysicalDevice` based on its own internal logic. When we move to OpenXR, we are no longer the sole decision-maker. We must first negotiate with the XR runtime to ensure our graphics context is compatible with the headset's compositor. + +The bridge between OpenXR and Vulkan is the extension **XR_KHR_vulkan_enable2**. This extension provides the mechanism for OpenXR to specify exactly which instance and device extensions are required, as well as any specific feature flags that must be enabled for the spatial pipeline to function. + +== The Architectural Boundary: Why a Handshake? + +To understand why this negotiation is necessary, we must first understand the architectural boundary between our engine and the **OpenXR Runtime**. Unlike a standard windowing system, an XR runtime (like SteamVR, Meta Link, or Monado) often operates as a separate, high-priority system service or process. + +1. 
**Process Isolation**: The runtime manages the physical hardware (cameras, IMUs, and displays) and performs critical tasks like **Asynchronous Reprojection**—warping the final image if our engine misses its frame deadline. +2. **VRAM Sharing**: Because the runtime and our engine are separate processes, they must share access to the GPU's memory. This isn't just a simple copy; it is a "Zero-Copy" handoff where we both see the same physical memory addresses. +3. **Synchronization**: We need a way to tell the runtime "I am done rendering this frame" and for the runtime to tell us "I am done using this image, you can have it back." + +The handshake is our way of telling the OS and the XR runtime: "We are both speaking the same dialect of Vulkan, using the same extensions, and looking at the same physical silicon." + +== Querying XR Graphics Requirements + +Before we can even create our Vulkan instance, we must initialize our OpenXR instance and query the graphics requirements for the specific system (the headset) we are targeting. This is performed using the `xrGetVulkanGraphicsRequirements2KHR` function. + +This call populates an `XrGraphicsRequirementsVulkanKHR` structure, which contains two critical pieces of information: + +1. **minApiVersionSupported**: The minimum Vulkan version the runtime supports. +2. **maxApiVersionSupported**: The maximum Vulkan version the runtime has been tested with. + +Since we are targeting Vulkan 1.4, we must verify that our chosen version falls within this range. Most modern runtimes are rapidly updating to support the latest specifications, ensuring we have access to the advanced synchronization tools we need. + +== Extending the Engine Context + +Our engine's initialization logic needs to be modified to accept these external requirements. Instead of hardcoding a static list of instance extensions, we perform a coordinated handshake: + +1. 
**Query OpenXR**: Ask the runtime for its mandatory instance extensions using `xrGetVulkanInstanceExtensionsKHR`. +2. **Merge and Initialize**: Combine these OpenXR requirements with our engine's own mandatory extensions (such as `VK_EXT_debug_utils`) and create the instance. + +[IMPORTANT] +==== +**The String Parsing Pitfall**: `xrGetVulkanInstanceExtensionsKHR` returns the required extensions as a single, space-separated string (e.g., `"VK_KHR_external_memory_capabilities VK_KHR_get_physical_device_properties2"`). You must manually parse this string and split it into individual `const char*` entries before passing them to `vk::InstanceCreateInfo`. +==== + +The same principle applies to the physical device. The XR runtime may require specific device extensions, such as `VK_KHR_external_memory` or color space extensions, to safely share images between our application and the compositor process. + +[source,cpp] +---- +// Example of merging requirements in our Engine's initialization +std::vector instanceExtensions = engineDefaults.getInstanceExtensions(); + +// 1. Query the length of the extension string +uint32_t xrExtensionCount = 0; +xrGetVulkanInstanceExtensionsKHR(xrInstance, systemId, 0, &xrExtensionCount, nullptr); + +// 2. Retrieve the space-separated string +std::string xrExtensionString(xrExtensionCount, '\0'); +xrGetVulkanInstanceExtensionsKHR(xrInstance, systemId, xrExtensionCount, &xrExtensionCount, xrExtensionString.data()); + +// 3. 
Parse the string into individual extension names +std::stringstream ss(xrExtensionString); +std::string extension; +while (ss >> extension) { + // Check if we already have it, then add it + if (std::find(instanceExtensions.begin(), instanceExtensions.end(), extension) == instanceExtensions.end()) { + instanceExtensions.push_back(strdup(extension.c_str())); + } +} + +vk::InstanceCreateInfo createInfo({}, &applicationInfo, + static_cast<uint32_t>(layers.size()), layers.data(), + static_cast<uint32_t>(instanceExtensions.size()), instanceExtensions.data()); + +vk::raii::Instance instance(context, createInfo); +---- + +== Why the Handshake Matters: Zero-Copy Efficiency + +The ultimate goal of this handshake is **Zero-Copy Efficiency**. If we selected a GPU that the XR runtime couldn't talk to, the operating system would be forced to copy our final rendered frames through system memory to reach the headset's display. This would destroy our frame budget and introduce "judder"—the visual stuttering that causes motion sickness in XR. + +By following the `XR_KHR_vulkan_enable2` protocol, we guarantee that our frames stay on the GPU at all times, moving from our render pipeline to the headset's display with the absolute minimum possible latency. + +In the next section, we will look at **Hardware Alignment**, where we ensure that the `VkPhysicalDevice` we select is the exact same one the headset is physically connected to. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/03_hardware_alignment_luid.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/03_hardware_alignment_luid.adoc b/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/03_hardware_alignment_luid.adoc new file mode 100644 index 00000000..a954b864 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/03_hardware_alignment_luid.adoc @@ -0,0 +1,63 @@ +:pp: {plus}{plus} += Hardware Alignment (LUID) + +In many modern computing environments, especially high-end gaming desktops, it is common to have multiple GPUs—such as an integrated GPU on the processor and a dedicated high-performance card. For spatial computing, it is absolutely critical that both our application and the OpenXR runtime are using the exact same physical hardware. + +If our engine renders a frame on GPU A, but the headset is physically connected to GPU B, the operating system would be forced to copy that image across the **PCIe** (Peripheral Component Interconnect Express) bus between the two cards. This "cross-talk" introduces massive latency and can easily break the tight timing requirements needed for a comfortable XR experience. To prevent this, we use the **Locally Unique Identifier (LUID)**. + +== The "PCIe Tax": Why Alignment is Mandatory + +To understand why we care about the LUID, we must consider the cost of data movement. The PCIe bus is the highway between your CPU, system RAM, and your GPUs. While it is very fast, it is still orders of magnitude slower than the internal memory bus of a modern GPU (VRAM). + +* **Internal VRAM Copy**: Moving an image within the same GPU is almost instantaneous, often happening at speeds of 500 GB/s to 1 TB/s. 
+* **Cross-GPU Copy**: Moving an image from GPU A to GPU B requires sending the data over the PCIe bus to system RAM, and then back down to GPU B. This adds several milliseconds of latency. + +In XR, every millisecond counts. If your alignment is wrong, you might lose 3-5ms just moving the pixels to the display. That is nearly half of your entire frame budget for a 90Hz or 120Hz headset! + +== What is an LUID? + +An **LUID** is a 64-bit value guaranteed to be unique on the local machine until the next reboot. Unlike a **UUID** (Universally Unique Identifier), which is persistent across machines and reboots, the LUID is a transient hardware handle provided by the operating system (specifically Windows via **DXGI**, the DirectX Graphics Infrastructure, though Vulkan provides a cross-platform way to access it). + +In the context of the OpenXR-Vulkan handshake, the LUID serves as the hardware "fingerprint" of the GPU. OpenXR tells us: "I am currently talking to the GPU with this specific LUID," and we must search through our available `VkPhysicalDevice` handles until we find the one that matches. + +== Querying the XR Device + +Once we have initialized our `xr::Instance`, we can query the specific `VkPhysicalDevice` handle that the runtime expects us to use. We do this by calling `xrGetVulkanGraphicsDevice2KHR`. + +[source,cpp] +---- +XrVulkanGraphicsDeviceGetInfoKHR getInfo{XR_TYPE_VULKAN_GRAPHICS_DEVICE_GET_INFO_KHR}; +getInfo.systemId = systemId; +getInfo.vulkanInstance = *instance; + +VkPhysicalDevice xrRequiredDevice; +xrGetVulkanGraphicsDevice2KHR(xrInstance, &getInfo, &xrRequiredDevice); +---- + +While we can get the raw handle directly from the runtime, it is often safer and more robust to match based on the LUID, especially when our engine's architecture abstracts physical device selection or when multiple Vulkan instances are present. Matching via LUID ensures that we are selecting the device based on hardware identity rather than just a transient API handle. + +== Matching the LUID in Vulkan + +To find the LUID of a Vulkan physical device, we query the `VkPhysicalDeviceIDProperties` structure. 
This structure contains the `deviceLUID` and a boolean `deviceLUIDValid`. In the `vulkan-hpp` RAII world, we can retrieve this by chaining structures in a `getProperties2` call. + +[source,cpp] +---- +// Iterating through physical devices to find a match +for (const auto& physicalDevice : instance.enumeratePhysicalDevices()) { + auto props2 = physicalDevice.getProperties2<vk::PhysicalDeviceProperties2, vk::PhysicalDeviceIDProperties>(); + const auto& idProps = props2.get<vk::PhysicalDeviceIDProperties>(); + + if (idProps.deviceLUIDValid) { + // Compare the 8-byte LUID against the target from OpenXR + if (std::memcmp(idProps.deviceLUID, targetLUID, VK_LUID_SIZE) == 0) { + return physicalDevice; + } + } +} +---- + +== Cross-Process Memory Visibility + +The LUID isn't just for selection; it is the foundation of **Cross-Process Memory Visibility**. Because the XR runtime usually lives in a separate process from our game engine, the images we render must be shared across process boundaries. + +Vulkan 1.4 makes this easier by standardizing external memory handles, but these handles only work if the memory was allocated on the same physical silicon. By aligning our hardware selection via the LUID, we guarantee that the "Wait-Acquire-Release" cycle can happen entirely on-device, without expensive CPU-side synchronization or system memory copies. + +In the final part of our handshake, we will ensure that our selected device is configured with the mandatory **Vulkan 1.4 features** required for a modern spatial pipeline. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/02_system_integration.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/04_vulkan_1_4_feature_requirements.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/04_vulkan_1_4_feature_requirements.adoc b/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/04_vulkan_1_4_feature_requirements.adoc new file mode 100644 index 00000000..137de6b9 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/04_vulkan_1_4_feature_requirements.adoc @@ -0,0 +1,72 @@ +:pp: {plus}{plus} += Vulkan 1.4 Feature Requirements + +The final step in our spatial handshake is ensuring that our `VkDevice` is initialized with the correct set of features. While OpenXR can technically run on older Vulkan versions, the **Vulkan 1.4** specification is the "gold standard" for modern spatial computing. It codifies several previously optional extensions into the core API, providing a guaranteed baseline of high-performance tools. + +In this section, we'll look at the three mandatory "pillars" of Vulkan 1.4 that our spatial pipeline depends on. + +== 1. Timeline Semaphores: The Async Heartbeat + +In a standard application, we use binary semaphores to sync the GPU. A binary semaphore is like a single-use "go" signal. In XR, where we have complex dependencies between the CPU simulation, the GPU render, and the XR compositor (which might be in a different process), binary semaphores are too rigid. + +**Timeline Semaphores** provide a 64-bit monotonically increasing value. Think of them as a "Global Clock" for your GPU work. This allows us to express complex "happens-before" relationships: + +* **Pacing**: We can signal that "Frame 500 is ready for composition" by simply reaching value `500` on a single timeline. +* **Late Latching**: We can tell the GPU to "Wait until value 501 is signaled by the CPU" before starting the final matrix update. 
+* **Cross-Process**: Because they can be exported to OS handles, timeline semaphores allow our engine and the XR runtime to stay perfectly in sync without expensive CPU read-backs. + +== 2. Synchronization 2: Mastering Queue Ownership + +The original Vulkan synchronization API was notoriously verbose and error-prone. **Synchronization 2** (`VK_KHR_synchronization2`) simplifies this by combining pipeline stages and access masks into a more unified structure (`VkDependencyInfo`). + +In XR, this is critical for handling **Queue Ownership Transfers**. Because the XR runtime owns the swapchain images, we must perform a "Release" operation on our queue when we're done rendering, and the runtime performs an "Acquire" on its side. Synchronization 2 makes these hand-offs explicit and readable, significantly reducing the risk of "Race Conditions" between our engine and the headset compositor. + +== 3. Dynamic Rendering: Ultimate Flexibility + +If you've followed our engine-building journey, you're already intimately familiar with **Dynamic Rendering**. In the context of spatial computing, this isn't just a convenient refactoring—it's an absolute necessity. + +* **Asymmetric Views**: Most headsets use "Canted Displays" where the left and right eye frustums are not parallel. This often requires different viewport and scissor settings for each eye. +* **Dynamic Resolution**: To maintain a steady 90Hz, the engine might need to drop the resolution of peripheral views instantly. +* **No Rigid State**: By using `vkCmdBeginRendering` directly on our XR swapchain images, we avoid the heavy overhead and rigid state of legacy `VkRenderPass` and `VkFramebuffer` objects. + +== Enabling Vulkan 1.4 Features in RAII + +Vulkan 1.4 simplifies feature enablement. Instead of deep, nested `pNext` chains, we can use the unified `VkPhysicalDeviceVulkan14Features` (and its predecessors for 1.3 and 1.2) to toggle the core requirements. 
+ +[source,cpp] +---- +// Enabling mandatory features for our spatial pipeline +vk::PhysicalDeviceVulkan13Features features13; +features13.dynamicRendering = VK_TRUE; +features13.synchronization2 = VK_TRUE; + +vk::PhysicalDeviceVulkan12Features features12; +features12.timelineSemaphore = VK_TRUE; + +// Chain them together +features13.pNext = &features12; + +vk::DeviceCreateInfo createInfo({}, + static_cast<uint32_t>(queueCreateInfos.size()), queueCreateInfos.data(), + 0, nullptr, // Layers (deprecated) + static_cast<uint32_t>(deviceExtensions.size()), deviceExtensions.data(), + nullptr); // pEnabledFeatures (use pNext instead) + +// Ensure we are explicitly asking for Vulkan 1.4 core features if needed +// Note: In Vulkan 1.4, many of these are enabled by default if the version is supported, +// but explicit enablement is still best practice for engine portability. +createInfo.pNext = &features13; + +vk::raii::Device device(physicalDevice, createInfo); +---- + +== Summary of the Handshake + +We have now successfully: +1. Extended our `VulkanContext` to negotiate extensions with OpenXR. +2. Aligned our `VkPhysicalDevice` selection using the hardware LUID. +3. Enabled the modern Vulkan 1.4 features required for low-latency spatial rendering. + +With the handshake complete, we are ready to tackle the most significant architectural change in an XR engine: moving from engine-owned swapchains to **Runtime-Owned Swapchains**. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/03_hardware_alignment_luid.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/05_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/05_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/05_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..5ccb94af --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/05_incorporating_into_the_engine.adoc @@ -0,0 +1,177 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +In this chapter, we've explored the fundamental handshake between OpenXR and Vulkan 1.4. To bring these concepts into our `simple_game_engine`, we must evolve our core initialization logic in `renderer_core.cpp` and `xr_context.cpp`. An example of this implementation can be found in link:../../../attachments/openxr_engine/xr_context.cpp[xr_context.cpp] and link:../../../attachments/openxr_engine/renderer_core.cpp[renderer_core.cpp]. + +== Testing for XR Mode + +Before we change the engine, we need a way to detect if we should even try to use OpenXR. In `main.cpp`, within the `main` function, we can add a check before initializing the engine: + +[source,cpp] +---- +// main.cpp +int main(int, char *[]) { + // ... initial setup ... + + Engine engine; + + // Test if we are able to run in XR mode + bool useXR = false; + if (XrContext::checkRuntimeAvailable()) { + std::cout << "OpenXR Runtime detected. Attempting spatial initialization..." << std::endl; + useXR = true; + } + + if (!engine.Initialize("Simple Engine", WINDOW_WIDTH, WINDOW_HEIGHT, ENABLE_VALIDATION_LAYERS, useXR)) { + throw std::runtime_error("Failed to initialize engine"); + } + + // ... 
+} +---- + +== Modifying the Renderer Initialization + +In `renderer_core.cpp`, we need to update `Renderer::Initialize` and its helper methods to handle the OpenXR handshake. + +== Supporting Code: The XrContext Helper + +Throughout this tutorial, we will use a helper class named `XrContext` (or `xrContext` as an instance) to wrap the OpenXR API. This keeps our engine code clean and focused on spatial logic. Before modifying the renderer, ensure your engine has a basic structure for this helper. + +[source,cpp] +---- +// xr_context.h +class XrContext { +public: + // Core Handshake (Chapter 2) + std::vector getVulkanInstanceExtensions(); + const uint8_t* getRequiredLUID(); + + // Swapchain Management (Chapter 3) + void createSwapchains(vk::Device device, vk::Format format, vk::Extent2D extent); + std::vector enumerateSwapchainImages(); + vk::Extent2D getSwapchainExtent(); + vk::Format getSwapchainFormat(); + void waitSwapchainImage(); + uint32_t acquireSwapchainImage(); + void releaseSwapchainImage(); + + // Frame Lifecycle (Chapter 5) + XrFrameState waitFrame(); + void beginFrame(); + void endFrame(const std::array, 2>& eyeViews); + + // View & Projection (Chapter 4 & 11) + void locateViews(XrTime predictedTime); + std::vector getLatestViews(); + std::array getLatestViewPoses(); // Added for Late Latching + vk::Viewport getViewport(uint32_t eye); + vk::Rect2D getScissor(uint32_t eye); + glm::mat4 getProjectionMatrix(uint32_t eye); + glm::mat4 getViewMatrix(uint32_t eye); + glm::vec3 getEyePosition(uint32_t eye); // Added for UBO updates + + // Input Actions (Chapter 7) + void pollActions(); + bool isActionActive(const std::string& name); + XrPosef getActionPose(const std::string& name); + + // Scene Understanding (Chapter 16) + std::vector getLatestSpatialMeshes(); + + // ML & Occlusion (Chapter 17 & 18) + glm::vec2 getGazeNDC(); + XrReferenceSpaceType getReferenceSpace(); +}; + +// Common Helper: Convert XrPosef to glm::mat4 +inline glm::mat4 xrPoseToMatrix(const 
XrPosef& pose) { + glm::quat q(pose.orientation.w, pose.orientation.x, pose.orientation.y, pose.orientation.z); + glm::mat4 m = glm::mat4_cast(q); + m[3] = glm::vec4(pose.position.x, pose.position.y, pose.position.z, 1.0f); + return m; +} +---- + +=== 1. Instance Creation with XR Extensions + +In `Renderer::createInstance`, we must append the extensions required by OpenXR. Instead of only relying on GLFW, we query the XR runtime: + +[source,cpp] +---- +// renderer_core.cpp +bool Renderer::createInstance(const std::string& appName, bool enableValidationLayers) { + // ... + std::vector<const char*> extensions; + + // Standard GLFW extensions + #if defined(PLATFORM_DESKTOP) + uint32_t glfwExtensionCount = 0; + const char** glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); + extensions.insert(extensions.end(), glfwExtensions, glfwExtensions + glfwExtensionCount); + #endif + + // NEW: Add OpenXR mandatory extensions + if (xrMode) { + auto xrExtensions = xrContext.getVulkanInstanceExtensions(); + extensions.insert(extensions.end(), xrExtensions.begin(), xrExtensions.end()); + } + + // ... create vk::raii::Instance ... +} +---- + +=== 2. Hardware Alignment (LUID Matching) + +In `Renderer::pickPhysicalDevice`, we must ensure we select the GPU that OpenXR is actually using. We do this by matching the LUID: + +[source,cpp] +---- +// renderer_core.cpp +bool Renderer::pickPhysicalDevice() { + // ... enumerate devices ... + + for (auto& _device : devices) { + if (xrMode) { + // Match the LUID provided by OpenXR + auto props2 = _device.getProperties2<vk::PhysicalDeviceProperties2, vk::PhysicalDeviceIDProperties>(); + const auto& idProps = props2.get<vk::PhysicalDeviceIDProperties>(); + + const uint8_t* requiredLuid = xrContext.getRequiredLUID(); + if (requiredLuid && std::memcmp(idProps.deviceLUID, requiredLuid, VK_LUID_SIZE) != 0) { + continue; // Not the right GPU for XR! + } + } + // ... rest of suitability scoring ... + } +} +---- + +=== 3. 
Enabling Vulkan 1.3/1.4 Features + +In `Renderer::createLogicalDevice`, we must explicitly enable the Vulkan 1.3 features required for spatial computing, particularly **Dynamic Rendering** and **Synchronization 2**: + +[source,cpp] +---- +// renderer_core.cpp +bool Renderer::createLogicalDevice(bool enableValidationLayers) { + // ... + // Enable Vulkan 1.3 features + vk::PhysicalDeviceVulkan13Features vulkan13Features; + vulkan13Features.dynamicRendering = vk::True; + vulkan13Features.synchronization2 = vk::True; + + // Enable Multiview (if in XR mode) + vk::PhysicalDeviceMultiviewFeatures multiviewFeatures; + multiviewFeatures.multiview = xrMode ? vk::True : vk::False; + multiviewFeatures.pNext = &vulkan13Features; + + // ... create vk::raii::Device with multiviewFeatures in pNext ... +} +---- + +== Why These Changes? + +By referencing the specific locations in `renderer_core.cpp`, we can see that OpenXR isn't just an "add-on"—it's a **collaborator** in the Vulkan lifecycle. Without the LUID matching in `pickPhysicalDevice`, our engine might initialize on a discrete GPU while the VR headset is tethered to an integrated one, leading to a complete failure to display. Similarly, the extension negotiation ensures that the XR compositor and our engine speak the same "dialect" of Vulkan. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/04_vulkan_1_4_feature_requirements.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/01_introduction.adoc new file mode 100644 index 00000000..bc1cec9d --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/01_introduction.adoc @@ -0,0 +1,26 @@ +:pp: {plus}{plus} += Runtime-Owned Swapchains & Image Wrapping: Introduction + +In a standard Vulkan application, the engine is in total control of the swapchain. We create the `VkSwapchainKHR`, we decide the image count and format, and we own the memory. In spatial computing, this model is turned on its head. + +To achieve the near-zero latency required for a comfortable XR experience, the **XR Runtime** (the compositor) must own the final set of images. This allows the runtime to perform **Asynchronous Reprojection**—a late-stage warping of your image to match the user's head pose at the exact moment the pixels are strobed onto the display—without needing to copy your data into its own memory space. + +In this chapter, we will explore the architectural shift from engine-owned to runtime-owned resources: + +1. **External Image Negotiation**: How to request swapchain images from OpenXR and handle the formats provided by the runtime. +2. **RAII Resource Integration**: Wrapping raw `VkImage` handles provided by the runtime into our engine's `vk::raii::Image` and `vk::raii::ImageView` abstractions. +3. **Memory Ownership Lifecycle**: Mastering the **Wait-Acquire-Release** cycle, which replaces the standard `vkAcquireNextImageKHR` loop. + +== The "Wait-Acquire-Release" Rhythm + +The most significant change is how we access our render targets. 
In a desktop application, we ask the swapchain for an image index and render to it. In OpenXR, we must follow a strict three-step protocol: + +* **Wait**: Block until the runtime is ready to give us a swapchain image. +* **Acquire**: Formally take ownership of a specific image index. +* **Release**: Hand the image back to the compositor once our command buffer is submitted. + +This rhythm ensures that we never render into an image that is currently being displayed or warped by the compositor, preventing tearing and ensuring the highest possible visual stability. + +By the end of this chapter, your engine will be able to render its spatial views directly into the headset's compositor buffers, using our existing RAII wrappers while respecting the runtime's ownership. + +xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/05_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/02_external_image_negotiation.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/02_external_image_negotiation.adoc b/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/02_external_image_negotiation.adoc new file mode 100644 index 00000000..3a4082dc --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/02_external_image_negotiation.adoc @@ -0,0 +1,74 @@ +:pp: {plus}{plus} += External Image Negotiation + +In a standard Vulkan application, we use `vkCreateSwapchainKHR` to create a surface-bound swapchain. In OpenXR, we replace this entire mechanism with the `XrSwapchain`. Instead of the operating system's windowing system (like Win32 or X11) managing the buffers, the **XR Runtime** acts as our swapchain provider. + +This negotiation starts by asking the runtime what image formats it supports and choosing the one that best fits our engine's internal pipeline. 
+ +== The Inversion of Control: The Swapchain Provider + +To understand this negotiation, you must shift your mindset. In a desktop app, *you* create the swapchain and the OS *consumes* it. In OpenXR, the **Runtime creates the images** and *lends* them to you. + +1. **Ownership**: The runtime allocates the VRAM and manages the memory layout. +2. **Compatibility**: The runtime knows exactly which bit-depths and color spaces its internal compositor can handle without performing expensive conversions. +3. **Efficiency**: By providing the images directly, the runtime can ensure they are allocated in a memory region that is visible to both your engine and the compositor process. + +== Enumerating Swapchain Formats + +The first step is to query the runtime for its supported `VkFormat` list using `xrEnumerateSwapchainFormats`. + +[source,cpp] +---- +uint32_t formatCount = 0; +xrEnumerateSwapchainFormats(xrSession, 0, &formatCount, nullptr); +std::vector<int64_t> formats(formatCount); +xrEnumerateSwapchainFormats(xrSession, formatCount, &formatCount, formats.data()); +---- + +It is important to note that OpenXR returns these as `int64_t` values, which we must cast to `VkFormat` (or `vk::Format` in our RAII engine). Usually, we want a high-dynamic-range (**HDR**) format like `VK_FORMAT_R16G16B16A16_SFLOAT` or a standard **SRGB** format like `VK_FORMAT_R8G8B8A8_SRGB`. + +[TIP] +==== +**Linear vs SRGB**: If your engine performs its own gamma correction in a post-process pass, you should prefer a linear format. However, if you are outputting directly to the headset, using an `_SRGB` format allows the hardware to perform the conversion for free during the final write. +==== + +== Creating the XrSwapchain + +Once we have chosen a format, we fill out an `XrSwapchainCreateInfo` structure. This structure is similar to `VkSwapchainCreateInfoKHR`, but it also includes XR-specific fields like **sampleCount** (for multisampling) and **faceCount** (for cubemap views). 
+ +[source,cpp] +---- +XrSwapchainCreateInfo createInfo{XR_TYPE_SWAPCHAIN_CREATE_INFO}; +createInfo.usageFlags = XR_SWAPCHAIN_USAGE_COLOR_ATTACHMENT_BIT | XR_SWAPCHAIN_USAGE_SAMPLED_BIT; +createInfo.format = chosenVulkanFormat; +createInfo.sampleCount = 1; +createInfo.width = viewWidth; +createInfo.height = viewHeight; +createInfo.faceCount = 1; +createInfo.arraySize = 1; // 1 for standard mono/stereo, more for multiview +createInfo.mipCount = 1; + +XrSwapchain xrSwapchain; +xrCreateSwapchain(xrSession, &createInfo, &xrSwapchain); +---- + +== Retrieving the Raw Images + +After creating the `XrSwapchain`, we need to get the actual `VkImage` handles so we can wrap them in our engine's abstractions. We use `xrEnumerateSwapchainImages` to retrieve an array of `XrSwapchainImageVulkanKHR` structures. + +[source,cpp] +---- +uint32_t imageCount = 0; +xrEnumerateSwapchainImages(xrSwapchain, 0, &imageCount, nullptr); +std::vector<XrSwapchainImageVulkanKHR> xrImages(imageCount, {XR_TYPE_SWAPCHAIN_IMAGE_VULKAN_KHR}); +xrEnumerateSwapchainImages(xrSwapchain, imageCount, &imageCount, (XrSwapchainImageBaseHeader*)xrImages.data()); +---- + +Each `XrSwapchainImageVulkanKHR` contains a `VkImage` handle. It is these handles that we will feed into our engine's RAII system. + +== Why "Zero-Copy" Matters: The Compositor Pipeline + +By negotiating the images this way, we ensure a **Zero-Copy** hand-off. When we finish rendering, we don't copy our frame to a separate buffer. Instead, we simply signal to the runtime that we are done with its image. The runtime then takes that exact same memory and uses it for the final compositor pass. + +This efficiency is the difference between a high-performance 90Hz experience and a laggy, uncomfortable one. In the next section, we'll see how to wrap these raw Vulkan handles into the engine's `vk::raii` abstractions. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/03_raii_resource_integration.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/03_raii_resource_integration.adoc b/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/03_raii_resource_integration.adoc new file mode 100644 index 00000000..4201d67d --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/03_raii_resource_integration.adoc @@ -0,0 +1,75 @@ +:pp: {plus}{plus} += RAII Resource Integration + +Once we have our raw `VkImage` handles from OpenXR, we need to integrate them into our engine's Resource Acquisition Is Initialization (**RAII**) framework. Our engine uses the `vk::raii` namespace, which provides high-level C++ wrappers that automatically manage the lifetime of Vulkan objects. + +However, there is a catch: `vk::raii` objects typically assume they own the resource and will attempt to destroy it when they go out of scope. Because the XR runtime owns these swapchain images, we must ensure that our engine does not accidentally call `vkDestroyImage` on them. + +== The Ownership Contract: "Lent, Not Given" + +When we receive a `VkImage` from `xrEnumerateSwapchainImages`, we are entering into a legal contract with the XR runtime. + +* **We can**: Bind the image as a color attachment, transition its layout, and write to it with our shaders. +* **We cannot**: Destroy the image, change its allocation flags, or attempt to free its memory. + +If our engine attempts to call `vkDestroyImage` (which a standard `vk::raii::Image` would do in its destructor), we will likely crash the entire XR system—including the headset runtime and possibly our own graphics drivers. + +== Non-Owning Image Wrappers + +The simplest way to handle this in `vulkan-hpp` is to treat the `VkImage` handles as non-owning `vk::Image` objects. 
While we lose the automatic cleanup of the image itself, we still gain the benefit of the type-safe C++ API. + +We then create our own `vk::raii::ImageView` for each image. Unlike the image itself, the **Image View** is something our application creates and owns, so it can (and should) be managed by our RAII system. + +[source,cpp] +---- +// Wrapping the raw handles into our engine's per-frame data +struct XrFrameBuffer { + vk::Image image; // Non-owning handle + vk::raii::ImageView imageView = nullptr; // RAII owned by us +}; + +std::vector<XrFrameBuffer> framebuffers; +for (const auto& xrImg : xrImages) { + XrFrameBuffer fb; + fb.image = vk::Image(xrImg.image); + + // Note: If using multiview, the view type might be e2DArray instead of e2D + vk::ImageViewCreateInfo viewInfo({}, fb.image, vk::ImageViewType::e2D, + format, {}, {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}); + + fb.imageView = vk::raii::ImageView(device, viewInfo); + framebuffers.push_back(std::move(fb)); +} +---- + +== Integrating with Dynamic Rendering + +Because we are using **Dynamic Rendering** (introduced in Vulkan 1.3 and core in 1.4), we don't need to wrap these images into `VkFramebuffer` objects. Instead, we can directly reference the `vk::raii::ImageView` in our `vk::RenderingAttachmentInfo`. + +[source,cpp] +---- +vk::RenderingAttachmentInfo colorAttachment{}; +colorAttachment.imageView = *framebuffers[imageIndex].imageView; +colorAttachment.imageLayout = vk::ImageLayout::eColorAttachmentOptimal; +colorAttachment.loadOp = vk::AttachmentLoadOp::eClear; +colorAttachment.storeOp = vk::AttachmentStoreOp::eStore; +colorAttachment.clearValue = vk::ClearColorValue(0.0f, 0.0f, 0.0f, 1.0f); + +vk::RenderingInfo renderingInfo({}, renderRect, 1, 0, 1, &colorAttachment); +commandBuffer.beginRendering(renderingInfo); +---- + +== Handling the Lifetime: Hybrid Destruction + +In our engine architecture, we keep these `XrFrameBuffer` objects alive for the duration of the `XrSwapchain`'s existence. 
When the user closes the application or switches environments, we destroy the `XrSwapchain` via `xrDestroySwapchain`. + +This leads to a **Hybrid Destruction** flow: +1. **Manual Destruction**: `xrDestroySwapchain` signals to the runtime that it can reclaim its VRAM. +2. **RAII Destruction**: Our `XrFrameBuffer` structures go out of scope, causing their `vk::raii::ImageView` members to automatically call `vkDestroyImageView`. +3. **Safety**: The non-owning `vk::Image` handles disappear without any Vulkan call, fulfilling our contract with the runtime. + +This hybrid approach—using raw handles for external resources and RAII for internal ones—is a common pattern when interfacing Vulkan with external runtimes like OpenXR, CUDA, or video decoders. + +In the next section, we will look at the **Memory Ownership Lifecycle**, where we master the delicate dance of waiting, acquiring, and releasing these images during our frame loop. + +xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/02_external_image_negotiation.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/04_memory_ownership_lifecycle.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/04_memory_ownership_lifecycle.adoc b/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/04_memory_ownership_lifecycle.adoc new file mode 100644 index 00000000..3f254f32 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/04_memory_ownership_lifecycle.adoc @@ -0,0 +1,87 @@ +:pp: {plus}{plus} += Memory Ownership Lifecycle + +In a standard desktop Vulkan application, we use `vkAcquireNextImageKHR` to get an image index from the swapchain. When we are done, we call `vkQueuePresentKHR`. In OpenXR, this is replaced by a more explicit three-step lifecycle: **Wait**, **Acquire**, and **Release**. 
+ +This lifecycle is designed to allow the XR runtime to manage high-frequency tasks like asynchronous reprojection while keeping our engine's rendering pipeline efficient. + +== The Three-Step Dance: Syncing with the Compositor + +Unlike a desktop window, which can handle images whenever the GPU is ready, an XR headset is a rigid, time-sensitive display. The compositor needs to know exactly when it can "take" an image to show it to the user. + +1. **Wait**: "Is there an image available for me to start drawing into?" +2. **Acquire**: "Give me the index of that image and lock it for my use." +3. **Release**: "I'm done drawing; you can have it back now." + +This dance ensures that the engine and the compositor never attempt to access the same memory at the same time, preventing the "tearing" and "stuttering" that can occur if synchronization fails. + +NOTE: Although we discuss *Wait* first, the OpenXR specification requires `xrAcquireSwapchainImage` to be called *before* `xrWaitSwapchainImage`: the wait applies to the oldest image that has been acquired but not yet waited on. + +== 1. Wait (`xrWaitSwapchainImage`) + +The `xrWaitSwapchainImage` call is a blocking operation (with an optional timeout) that waits until the compositor has finished reading the oldest acquired image, so the application can safely begin writing to it. + +[source,cpp] +---- +XrSwapchainImageWaitInfo waitInfo{XR_TYPE_SWAPCHAIN_IMAGE_WAIT_INFO}; +waitInfo.timeout = XR_INFINITE_DURATION; +xrWaitSwapchainImage(xrSwapchain, &waitInfo); +---- + +**Why wait?** If our engine is rendering faster than the headset's refresh rate (e.g., we are rendering at 144 FPS but the headset is 90Hz), `xrWaitSwapchainImage` will block our engine thread. This acts as the "Pacing" mechanism for our entire engine loop, ensuring we don't waste GPU power on frames that will never be shown. + +== 2. Acquire (`xrAcquireSwapchainImage`) + +We call `xrAcquireSwapchainImage` (before the wait described above) to identify which specific image index in our swapchain we should render into. 
+ +[source,cpp] +---- +uint32_t imageIndex; +XrSwapchainImageAcquireInfo acquireInfo{XR_TYPE_SWAPCHAIN_IMAGE_ACQUIRE_INFO}; +xrAcquireSwapchainImage(xrSwapchain, &acquireInfo, &imageIndex); +---- + +After this call, our application formally "owns" the image at `imageIndex`; once the matching `xrWaitSwapchainImage` call returns, we also have the guarantee that the compositor is no longer reading from this specific piece of VRAM. + +== 3. Synchronization and Layout Transitions + +Even though OpenXR manages the ownership, we are still responsible for Vulkan-side synchronization. When an image is returned to us, it might be in a state used by the XR compositor (usually `VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL` or `VK_IMAGE_LAYOUT_PRESENT_SRC_KHR`). + +We must use a **Pipeline Barrier** (ideally via **Synchronization 2** in Vulkan 1.4) to transition it to `VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL` before we can use it in our dynamic rendering pass. + +[source,cpp] +---- +vk::ImageMemoryBarrier2 barrier{}; +barrier.oldLayout = vk::ImageLayout::eUndefined; // Safe to use undefined if we don't care about previous contents +barrier.newLayout = vk::ImageLayout::eColorAttachmentOptimal; +barrier.image = framebuffers[imageIndex].image; +barrier.subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}; +barrier.srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput; +barrier.dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput; +barrier.dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite; + +vk::DependencyInfo depInfo({}, 0, nullptr, 0, nullptr, 1, &barrier); +commandBuffer.pipelineBarrier2(depInfo); +---- + +== 4. Release (`xrReleaseSwapchainImage`) + +Finally, once our command buffer has been submitted to the GPU (but not necessarily finished executing), we call `xrReleaseSwapchainImage`. This tells the XR runtime: "I am finished writing to this image; you may take it back for composition." 
+ +[source,cpp] +---- +XrSwapchainImageReleaseInfo releaseInfo{XR_TYPE_SWAPCHAIN_IMAGE_RELEASE_INFO}; +xrReleaseSwapchainImage(xrSwapchain, &releaseInfo); +---- + +**Crucial Detail**: Calling `Release` does not mean the GPU has finished rendering. It means the *CPU* is done recording commands. The runtime will use the synchronization primitives we enabled during the handshake (like timeline semaphores) to wait for the GPU to actually finish the work before it displays the image. + +== Summary of Resource Management + +We have now seen how to: + +1. Negotiate image formats and sizes with the XR runtime. +2. Wrap raw runtime-owned images into our RAII engine structures. +3. Manage the lifecycle of image ownership during our frame loop. + +With our resources properly managed, we can now move on to the actual rendering process. In the next chapter, we will see how to use **Dynamic Rendering** to render our spatial views directly into these images with maximum efficiency. + +xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/03_raii_resource_integration.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/05_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/05_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/05_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..0c35f706 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/05_incorporating_into_the_engine.adoc @@ -0,0 +1,72 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +Transitioning from a desktop-controlled swapchain to one owned by the OpenXR runtime requires us to intercept the engine's resource creation and frame acquisition logic in `renderer_rendering.cpp`. 
An example of this implementation can be found in link:../../../attachments/openxr_engine/xr_context.cpp[xr_context.cpp] and link:../../../attachments/openxr_engine/renderer_resources.cpp[renderer_resources.cpp]. + +== Adjusting Swapchain Creation + +In `renderer_rendering.cpp`, we need to modify `Renderer::createSwapChain` to bypass the standard Vulkan surface-based swapchain when in XR mode. + +[source,cpp] +---- +// renderer_rendering.cpp +bool Renderer::createSwapChain() { + if (xrMode) { + // NEW: Negotiate with OpenXR instead of the windowing system + vk::Extent2D xrExtent = xrContext.getRecommendedExtent(); + xrContext.createSwapchains(*device, vk::Format::eB8G8R8A8Srgb, xrExtent); + + // Use a single swapchain with 2 layers for multiview + eyeSwapchainImages[0] = xrContext.enumerateSwapchainImages(); + eyeSwapchainImages[1].clear(); // Not used in multiview mode + + swapChainExtent = xrContext.getSwapchainExtent(); + swapChainImageFormat = xrContext.getSwapchainFormat(); + + return createImageViews(); + } + + // ... original desktop vk::raii::SwapchainKHR logic ... +} +---- + +== Handling the Frame Lifecycle + +The "Wait-Acquire-Release" rhythm happens every frame. We must update `Renderer::Render` to synchronize with the XR compositor instead of the desktop window. + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::Render(...) { + // ... + uint32_t imageIndex; + + if (xrMode) { + // 1. ACQUIRE: Get the index of the next available XR image from the runtime + imageIndex = xrContext.acquireSwapchainImage(); + + // 2. WAIT: Block until the image is safe to be written to + xrContext.waitSwapchainImage(); + } else { + // Standard desktop acquisition + auto result = swapChain.acquireNextImage(UINT64_MAX, *imageAvailableSemaphores[currentFrame]); + imageIndex = result.second; + } + + // ... record and submit command buffers ... + + if (xrMode) { + // 3. 
RELEASE: Hand the image back to the compositor for display + xrContext.releaseSwapchainImage(); + } else { + // Standard desktop present + presentQueue.presentKHR(presentInfo); + } +} +---- + +== Why These Changes? + +By injecting these calls into `createSwapChain` and `Render`, we maintain the engine's high-level abstractions (like `vk::raii::Image`) while relinquishing control of the **lifecycle** to the XR runtime. The `xrContext` serves as a bridge, ensuring that the engine never tries to draw into an image that the compositor is currently reading to project onto the headset's lenses. + +xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/04_memory_ownership_lifecycle.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/01_introduction.adoc new file mode 100644 index 00000000..58a9c599 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/01_introduction.adoc @@ -0,0 +1,16 @@ +:pp: {plus}{plus} += Dynamic Rendering for Spatial Views + +In our journey to integrate OpenXR with Vulkan 1.4, we have established a robust handshake and learned how to manage images that are actually owned by the XR runtime. However, having the images is only half the battle. We now need to render our stereo views into them. In our previous engine building series, we adopted **Dynamic Rendering** (via `VK_KHR_dynamic_rendering`) as our primary rendering path. Now, we'll see why that decision was so critical for spatial computing. + +== Why Leverage Dynamic Rendering for XR? + +For spatial computing, the dynamic rendering system we've already built offers three major advantages that a legacy render pass pipeline would struggle with: + +1. 
**Reduced Submission Latency**: Since we've already bypassed the need for complex render pass and framebuffer objects, we can more quickly translate our view matrices (which we'll predict in the next chapter) into command buffer submissions. +2. **Adaptive Viewports**: XR headsets often use asymmetric projections and non-parallel views. Our dynamic rendering path makes it trivial to reconfigure viewport and scissor states for each eye without needing to match them to a predefined static structure. +3. **Multiview Integration**: When we move to `VK_KHR_multiview` for single-pass stereo rendering, our existing dynamic rendering logic integrates seamlessly, allowing us to specify a `viewMask` directly in our `vk::RenderingInfo`. + +In this chapter, we will apply our engine's rendering logic to the unique requirements of spatial displays. We'll start by looking at how to bind our XR swapchain images directly to our rendering session and then move on to managing the complex viewports required by modern headsets. + +xref:OpenXR_Vulkan_Spatial_Computing/03_Runtime_Owned_Swapchains/05_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/02_rendering_to_spatial_swapchains.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/02_rendering_to_spatial_swapchains.adoc b/en/OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/02_rendering_to_spatial_swapchains.adoc new file mode 100644 index 00000000..ac51a811 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/02_rendering_to_spatial_swapchains.adoc @@ -0,0 +1,62 @@ +:pp: {plus}{plus} += Rendering into Spatial Swapchains + +The beauty of the dynamic rendering system we've already built is that it doesn't care where an image comes from. Whether it's a standard swapchain or a set of images provided by an XR runtime, the process of beginning a rendering session remains consistent and lightweight. 
+ +== Binding XR Images: The Runtime Handover + +In our previous chapter, we learned how to wrap `XrSwapchainImageVulkanKHR` handles into `vk::raii::Image` and `vk::raii::ImageView`. With dynamic rendering, we take these image views and provide them directly to the `vk::RenderingInfo` structure at the start of each frame. + +This represents a **Decoupling of the Graphics Pipeline**. Our engine no longer owns the final output buffer; it simply "paints" into whatever view the XR runtime provides. + +[source,cpp] +---- +// Define the color attachment for our XR view +vk::RenderingAttachmentInfo colorAttachment{}; +colorAttachment.imageView = *xrImageView; // Our RAII-wrapped image view from the XR runtime +colorAttachment.imageLayout = vk::ImageLayout::eColorAttachmentOptimal; +colorAttachment.loadOp = vk::AttachmentLoadOp::eClear; +colorAttachment.storeOp = vk::AttachmentStoreOp::eStore; +colorAttachment.clearValue = vk::ClearValue{vk::ClearColorValue{std::array{0.0f, 0.0f, 0.0f, 1.0f}}}; + +// Configure the rendering session +vk::RenderingInfo renderingInfo{}; +renderingInfo.renderArea = vk::Rect2D{{0, 0}, {xrWidth, xrHeight}}; +renderingInfo.layerCount = 1; // 1 for standard mono/stereo, more if using multiview arrays +renderingInfo.colorAttachmentCount = 1; +renderingInfo.pColorAttachments = &colorAttachment; + +// Begin dynamic rendering session +commandBuffer.beginRendering(renderingInfo); + +// ... Recording draw commands as usual ... + +commandBuffer.endRendering(); +---- + +== Handling XR Pipeline State: The Power of `pNext` + +As we've seen in our engine construction, our `vk::raii::Pipeline` doesn't require a static `vk::RenderPass` object. Instead, we use `vk::PipelineRenderingCreateInfo` in the `pNext` chain of our graphics pipeline. + +For XR, this is particularly powerful because we can quickly swap between different swapchain formats (like HDR10 or standard RGBA) simply by updating this info structure during pipeline creation. 
This allows our engine to support **Variable Quality Levels** or different headset requirements without recompiling shaders or rebuilding complex render pass objects. + +[source,cpp] +---- +vk::PipelineRenderingCreateInfo pipelineRenderingInfo{}; +pipelineRenderingInfo.colorAttachmentCount = 1; +pipelineRenderingInfo.pColorAttachmentFormats = &xrColorFormat; +pipelineRenderingInfo.depthAttachmentFormat = vk::Format::eD32Sfloat; // If our XR view has depth + +// Link to our graphics pipeline create info +vk::GraphicsPipelineCreateInfo pipelineInfo{}; +pipelineInfo.pNext = &pipelineRenderingInfo; +// ... Standard pipeline configuration ... +---- + +== Decoupling the Frame Loop: Separation of Concerns + +By using dynamic rendering, we've successfully decoupled our engine's internal rendering logic from the physical output. Our `VulkanContext` remains focused on managing pipeline state and recording commands, while the OpenXR loop manages the "render area" and provides the target images. + +This **Separation of Concerns** is what allows our engine to scale seamlessly. Whether we are rendering to a 4K desktop monitor or a dual-eye VR headset, the core command recording logic remains the same. The only thing that changes is *where* the image view points and *how* the projection matrices are calculated. + +xref:OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/03_stereo_viewport_scissor.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/03_stereo_viewport_scissor.adoc b/en/OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/03_stereo_viewport_scissor.adoc new file mode 100644 index 00000000..3118d1ba --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/03_stereo_viewport_scissor.adoc @@ -0,0 +1,57 @@ +:pp: {plus}{plus} += Stereo Viewport & Scissor Management + +When rendering for XR, we aren't just drawing to a single screen. 
We are drawing to multiple **Views**, each with its own perspective and projection. The most common case is **Stereo** (two views), but headsets with extreme fields of view or foveal insets can require many more. To manage this in a dynamic rendering context, we must master **Viewports** and **Scissors**. + +== The Architecture of the Eye: Asymmetric Projections + +To understand why viewport and scissor management is so critical in XR, we must look at the human eye. Unlike standard 2D displays, where the viewer is typically centered and looking straight at the screen, XR lenses are often offset. + +1. **Optical Center Offset**: The center of the lens (where the image is clearest) might not be the center of the display panel. +2. **Field of View (FOV)**: Headsets often have larger outer FOVs than inner FOVs to provide a wider sense of immersion. +3. **Result**: This leads to **Asymmetric Projections**. The "frustum" is skewed. In our engine, this means we cannot simply render one eye and "flip" it for the other. Each eye must be treated as a unique camera with its own viewport and scissor constraints. + +[source,cpp] +---- +// Configure the viewport for a specific eye view +vk::Viewport eyeViewport{}; +eyeViewport.x = 0.0f; +eyeViewport.y = 0.0f; +eyeViewport.width = static_cast<float>(viewWidth); +eyeViewport.height = static_cast<float>(viewHeight); +eyeViewport.minDepth = 0.0f; +eyeViewport.maxDepth = 1.0f; + +// Configure the scissor to match the viewport +vk::Rect2D eyeScissor{}; +eyeScissor.offset = vk::Offset2D{0, 0}; +eyeScissor.extent = vk::Extent2D{viewWidth, viewHeight}; + +// Dynamic state update in our command buffer +commandBuffer.setViewport(0, eyeViewport); +commandBuffer.setScissor(0, eyeScissor); +---- + +== Single-Texture vs. Array-Texture Swapchains + +Depending on the XR runtime, we might receive one large "atlas" texture containing both eyes, or an **Array Texture** where each eye has its own layer. 
+ +* **Atlasing**: We use the `x` and `y` offsets of the `vk::Viewport` and `vk::Rect2D` to draw the eyes side-by-side. +* **Array Textures**: We keep the viewport at `(0,0)` but we change the `imageView` layer in our `vk::RenderingAttachmentInfo`. + +Dynamic rendering makes it easy to handle both cases by simply adjusting the attachment info. This flexibility allows our engine to support different headset architectures (like the Pimax wide-FOV headsets vs. the Meta Quest) without rewriting the render loop. + +== Optimal View Management: The Pacing Pattern + +To minimize state transitions and optimize performance, our engine follows a specific pattern for each eye view: + +1. **Acquire the Index**: Get the swapchain index from OpenXR for the current frame. +2. **Bind the View**: Point the `vk::RenderingAttachmentInfo` to the correct eye layer or atlas region. +3. **Apply Viewport/Scissor**: Tell the GPU the exact bounds of the current eye's frustum. +4. **Issue Draw Calls**: Render the scene using the predicted view and projection matrices. + +By treating viewports and scissors as dynamic state, we avoid the heavy "Pipeline Stall" that would occur if we had to switch entire pipeline objects between eyes. This keeps our GPU utilization high and our latency low. + +With our rendering pipeline now fully adapted to the unique requirements of spatial views, we are ready to tackle the timing and pacing logic that makes XR truly immersive. In the next chapter, we will build our **Predictive Frame Loop**. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/02_rendering_to_spatial_swapchains.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..de163dc0 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/04_incorporating_into_the_engine.adoc @@ -0,0 +1,109 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +In this chapter, we've transitioned to direct spatial rendering by leveraging the engine's existing dynamic rendering path in `renderer_rendering.cpp`. This bypasses legacy state and allows for eye-specific viewport management. An example of this implementation can be found in link:../../../attachments/openxr_engine/renderer_rendering.cpp[renderer_rendering.cpp] and link:../../../attachments/openxr_engine/renderer_pipelines.cpp[renderer_pipelines.cpp]. + +== Direct Rendering into Spatial Swapchains + +In `renderer_rendering.cpp`, our `Render` function already uses `vk::RenderingInfo`. To support OpenXR, we must ensure we point it at the correct runtime image: + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::Render(...) { + // ... Wait for runtime to be ready ... + uint32_t imageIndex = xrContext.acquireSwapchainImage(); + xrContext.waitSwapchainImage(); + + for (uint32_t eye = 0; eye < 2; ++eye) { + // 1. 
Point rendering at the acquired XR image view + // Our engine uses a single multiview swapchain at index [0] + vk::RenderingAttachmentInfo colorAttachment{ + .imageView = *eyeSwapchainImageViews[0][imageIndex], + .imageLayout = vk::ImageLayout::eColorAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eClear, + .storeOp = vk::AttachmentStoreOp::eStore, + .clearValue = vk::ClearValue{vk::ClearColorValue{std::array{0.1f, 0.1f, 0.1f, 1.0f}}} + }; + + vk::RenderingInfo passInfo{ + .renderArea = vk::Rect2D({0, 0}, swapChainExtent), + .layerCount = 1, + .viewMask = (1u << eye), // Render specifically to this eye's layer + .colorAttachmentCount = 1, + .pColorAttachments = &colorAttachment + }; + + commandBuffer.beginRendering(passInfo); + + // Issue draws for this eye + RenderEye(commandBuffer, eye); + + commandBuffer.endRendering(); + } + + // 2. Hand the image back to OpenXR + xrContext.releaseSwapchainImage(); +} +---- + +== Managing Stereo Viewports + +For headsets, we must render from two different viewpoints. In our engine's `Render` loop, we apply view-specific viewports and scissors: + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::Render(...) { + // ... inside the render loop ... + + for (uint32_t eye = 0; eye < 2; ++eye) { + // Retrieve asymmetric projection and eye-specific viewport from OpenXR + // xrContext provides these based on xrLocateViews + auto viewport = xrContext.getViewport(eye); + auto scissor = xrContext.getScissor(eye); + + commandBuffer.setViewport(0, viewport); + commandBuffer.setScissor(0, scissor); + + // Update the eye-specific View and Projection matrices in our UBO + // Note: Shaders don't need to change yet! We are rendering each eye sequentially. + updateUniformBuffer(currentFrame, eye); + + // Draw the scene for this eye + // ... + } +} +---- + +== The updateUniformBuffer Helper + +Since our engine is currently rendering eyes **sequentially** (one after another), we can reuse our existing shaders. 
We simply need to update the view and projection matrices in our `UniformBufferObject` for each eye pass. + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::updateUniformBuffer(uint32_t currentFrame, uint32_t eye, const XrPosef* overridePose) { + // 1. Retrieve the predicted View and Projection from OpenXR + // If overridePose is null, we query the latest from the context + glm::mat4 viewMatrix = overridePose ? xrPoseToMatrix(*overridePose) : xrContext.getViewMatrix(eye); + glm::mat4 projMatrix = xrContext.getProjectionMatrix(eye); + + // 2. Update the UBO for the current frame and eye + UniformBufferObject ubo{}; + ubo.view = viewMatrix; + ubo.proj = projMatrix; + ubo.camPos = overridePose ? glm::vec3(overridePose->position.x, overridePose->position.y, overridePose->position.z) + : xrContext.getEyePosition(eye); + + // 3. Directly update the mapped memory + // uniformBuffersMapped is indexed by frame and eye + memcpy(uniformBuffersMapped[currentFrame][eye], &ubo, sizeof(UniformBufferObject)); +} +---- + +== Why These Changes? + +By binding the XR image directly into `vk::RenderingInfo` and updating viewports/scissors per-eye, we eliminate the need for an intermediate "Blit" pass. This ensures that every GPU cycle is spent on spatial content, not memory copies. At this stage, our shaders remain unchanged because we are essentially just "drawing the world twice" from different perspectives. In later chapters, we'll see how **Multiview** can collapse these two passes into one. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/03_stereo_viewport_scissor.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/01_introduction.adoc new file mode 100644 index 00000000..45296508 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/01_introduction.adoc @@ -0,0 +1,26 @@ +:pp: {plus}{plus} += The Predictive Frame Loop and Timing Math + +In a traditional desktop application, the engine's main loop is often governed by simple vertical synchronization (**VSync**). We render as fast as possible, and the monitor displays the result when its scanline allows. However, in spatial computing, this "render and display" model is insufficient. If the user's head moves even slightly between the time we start rendering and the time the photons hit their retinas, the scene will appear to swim or lag, causing immediate discomfort or motion sickness. + +To solve this, OpenXR uses a **Predictive Frame Loop**. Instead of rendering the *current* state of the world, we render what the world *will* look like when the display actually strobes. + +== The Rhythm of Spatial Computing + +The OpenXR frame loop is a conversation between our engine and the XR runtime. It's not just about when to draw; it's about predicting the future. This conversation is built on three pillars: + +1. **Strict Pacing**: The runtime dictates the exact heartbeat of the application via `xrWaitFrame`. This is not just a frame limit; it's a synchronization point that aligns our engine with the display's actual refresh cycle. +2. **Predicted Display Time**: Every frame we process comes with a `predictedDisplayTime`. This is the most important number in our engine. It tells us exactly when the user will see the frame we are currently building. +3. 
**Simulation Alignment**: We must use this predicted time to advance our physics, animations, and—most crucially—our head tracking. We don't ask where the head *is*; we ask where the head *will be* at `predictedDisplayTime`. + +== Why Timing Math Matters + +In this chapter, we will transition our engine's main loop from its legacy `glfwWindowShouldClose` logic to the OpenXR frame lifecycle. We will learn how to: + +- Synchronize our CPU and GPU work with the XR runtime's heartbeat. +- Utilize predicted timing data to calculate accurate view and projection matrices. +- Understand the "Swim" effect and how predictive math eliminates it. + +By the end of this chapter, your engine will no longer be "reacting" to user movement; it will be "anticipating" it, providing the smooth, low-latency experience that defines high-quality spatial computing. + +xref:OpenXR_Vulkan_Spatial_Computing/04_Dynamic_Rendering/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/02_xr_lifecycle.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/02_xr_lifecycle.adoc b/en/OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/02_xr_lifecycle.adoc new file mode 100644 index 00000000..37201efd --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/02_xr_lifecycle.adoc @@ -0,0 +1,71 @@ +:pp: {plus}{plus} += The XR Lifecycle + +Transitioning our engine's main loop to the OpenXR lifecycle involves three primary functions: `xrWaitFrame`, `xrBeginFrame`, and `xrEndFrame`. These functions act as the "heartbeat" of our spatial application, replacing the standard unthrottled loop or legacy `glfwPollEvents` based pacing. + +== The Temporal Paradox: Rendering the Future + +To understand the XR lifecycle, you must first accept a fundamental truth: **You are always rendering a future that hasn't happened yet.** + +In a desktop game, you render what is happening "now." 
In XR, because of the time it takes to record commands, submit them to the GPU, and actually light up the pixels on the display, "now" is already in the past by the time the user sees it. If we rendered the "current" head pose, the world would appear to "swim" or lag behind the user's head movements. + +The XR lifecycle is designed to solve this by providing a **Predicted Display Time**. + +== 1. `xrWaitFrame`: The Pacing Heartbeat + +The `xrWaitFrame` function is our engine's synchronization point with the XR runtime. When we call this, the runtime may block our execution until it is the optimal time for us to begin processing the next frame. + +[source,cpp] +---- +XrFrameWaitInfo frameWaitInfo{XR_TYPE_FRAME_WAIT_INFO}; +XrFrameState frameState{XR_TYPE_FRAME_STATE}; +xrWaitFrame(xrSession, &frameWaitInfo, &frameState); +---- + +* **Throttling**: If your engine is too fast, `xrWaitFrame` will block to keep you synced with the headset's refresh rate (e.g., 90Hz). +* **The Prediction**: The `frameState` object returned by this call contains the `predictedDisplayTime`. This is the exact nanosecond when the pixels we are about to render will actually be shown on the headset's display. +* **Should We Render?**: `frameState.shouldRender` tells us if the runtime actually wants a frame. If the headset is off or the user is in a system menu, we might still "wait" but skip the "render" work to save power. + +== 2. `xrBeginFrame`: Opening the Window + +Once the wait is satisfied, we call `xrBeginFrame`. This signals the start of our frame's GPU work. + +[source,cpp] +---- +XrFrameBeginInfo frameBeginInfo{XR_TYPE_FRAME_BEGIN_INFO}; +xrBeginFrame(xrSession, &frameBeginInfo); +---- + +This is our engine's cue to start recording command buffers. It is important to keep the time between `xrWaitFrame` and `xrBeginFrame` as short as possible, as this is when we perform our simulation updates and view predictions. + +== 3. 
`xrEndFrame`: Submitting to the Compositor + +The final step is `xrEndFrame`. This is the counterpart to `vkQueuePresentKHR`. + +[source,cpp] +---- +XrFrameEndInfo frameEndInfo{XR_TYPE_FRAME_END_INFO}; +frameEndInfo.displayTime = frameState.predictedDisplayTime; +frameEndInfo.environmentBlendMode = XR_ENVIRONMENT_BLEND_MODE_OPAQUE; // Required field +frameEndInfo.layerCount = 1; +frameEndInfo.pLayers = &layerPtr; +xrEndFrame(xrSession, &frameEndInfo); +---- + +Instead of just "presenting" a single image, `xrEndFrame` takes an array of **Composition Layers**. + +* **Layers**: These are separate planes of imagery (like the 3D scene, a 2D HUD, or a curved menu) that the runtime's compositor will stack together at the very last microsecond using its own high-priority GPU pass. +* **Pacing**: `xrEndFrame` tells the runtime: "My GPU work is queued. Use the synchronization we agreed upon in the handshake to know when it's safe to show this frame." + +== The Engine Loop Summary + +Our engine's main loop now follows this precise rhythm: + +1. **Poll Events**: Check for system messages (like "Headset Connected"). +2. **Wait**: Sync with the runtime heartbeat and get the future `displayTime`. +3. **Begin**: Start the frame work. +4. **Render**: Use the future time to predict poses, record Vulkan commands, and submit them. +5. **End**: Hand the finished (or queued) layers back to the runtime for display. + +By following this rhythm, we ensure our engine is perfectly synchronized with the headset's hardware, providing the smooth and stable experience required for spatial computing. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/03_display_time_prediction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/03_display_time_prediction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/03_display_time_prediction.adoc new file mode 100644 index 00000000..04b1feb4 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/03_display_time_prediction.adoc @@ -0,0 +1,56 @@ +:pp: {plus}{plus} += Display Time Prediction + +The key to a stable and immersive spatial experience is aligning our engine's internal state with the future point in time when the user will actually see the frame. This is where `predictedDisplayTime` from `XrFrameState` comes into play. We must use this timestamp for two critical tasks: **Locating the User's Head (the View)** and **Calculating our Simulation (Physics and Animation)**. + +== 1. Locating the User (`xrLocateViews`) + +When we render our views, we need to know exactly where the user's eyes will be. If we use the "current" head pose, the frame will appear to lag because the head will have moved further by the time the frame is displayed. This is known as **Motion-to-Photon Latency**. + +We use `xrLocateViews` to ask the runtime to predict the user's pose for the future `displayTime`. 
+ +[source,cpp] +---- +XrViewLocateInfo viewLocateInfo{XR_TYPE_VIEW_LOCATE_INFO}; +viewLocateInfo.viewConfigurationType = XR_VIEW_CONFIGURATION_TYPE_PRIMARY_STEREO; +viewLocateInfo.displayTime = frameState.predictedDisplayTime; +viewLocateInfo.space = xrAppSpace; + +XrViewState viewState{XR_TYPE_VIEW_STATE}; +uint32_t viewCountOutput; +std::vector views(viewCount, {XR_TYPE_VIEW}); +xrLocateViews(xrSession, &viewLocateInfo, &viewState, viewCount, &viewCountOutput, views.data()); +---- + +The resulting `XrView` array contains the predicted **Pose** (position and orientation) and **FOV** (Field of View) for each eye. Note that FOV can change! Some headsets use dynamic lenses that shift based on the user's focus, so we must update our projection matrices *every frame*. + +== 2. Simulation Alignment: Syncing Physics + +It's not just the camera that must be predicted; our entire world simulation must be aligned with `predictedDisplayTime`. This includes physics, animations, and particle systems. + +If we use a different delta time for our simulation (like a fixed 60Hz update) than for our view prediction (which is tied to the 90Hz display), the result will be a subtle but nauseating disconnect. Objects in the world will appear to "jitter" relative to the user's head movement because they are being updated on a different temporal schedule. + +In our engine, we should calculate our simulation's delta time based on the difference between the `predictedDisplayTime` of the current frame and the previous one. + +[source,cpp] +---- +double deltaTime = static_cast(currentPredictedTime - lastPredictedTime) / 1e9; // Convert nanoseconds to seconds +engine.update(deltaTime); +---- + +== The Danger of Clock Drift + +A common mistake in XR development is to use `std::chrono::now()` or `glfwGetTime()` to drive simulation. These clocks are decoupled from the XR runtime's heartbeat. + +* **System Clock**: Measures real-world wall time. 
+* **XR Clock**: Measures the display strobes of the headset hardware. + +Even if they start at the same time, they will eventually **Drift**. After 30 minutes of gameplay, your physics engine might be 5ms ahead or behind the display's actual refresh cycle. This drift is what causes "random" stutters and hitches that are impossible to debug if you don't use the `predictedDisplayTime` consistently. + +By using only the `predictedDisplayTime` provided by the XR runtime, we ensure that every element of our spatial scene—from the smallest particle to the user's own viewpoint—is perfectly synchronized with the hardware's display strobes. + +== Next Steps: Minimizing Latency + +Now that our frame loop is correctly paced and our simulation is aligned with the future, we have established the foundation for a high-quality spatial experience. However, there is still a small window of latency between the end of our command buffer recording and the absolute final moment before submission. In the next chapter, we will explore **Late Latching** and how **Timeline Semaphores** can help us shave off those final microseconds of "swim." + +xref:OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/02_xr_lifecycle.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..4b18c9cd --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/04_incorporating_into_the_engine.adoc @@ -0,0 +1,112 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +In this chapter, we've transitioned from a reactive desktop loop to a predictive, runtime-driven spatial heartbeat. To implement this in our `simple_game_engine`, we must modify the main loop in `engine.cpp`. 
An example of this implementation can be found in link:../../../attachments/openxr_engine/engine.cpp[engine.cpp] and link:../../../attachments/openxr_engine/xr_context.cpp[xr_context.cpp]. + +== Pacing the Engine Heartbeat + +In `engine.cpp`, we need to update `Engine::Run` to use the OpenXR frame lifecycle. This ensures our simulation and rendering are perfectly aligned with the headset's display. + +[source,cpp] +---- +// engine.cpp +void Engine::Run() { + // ... + while (running) { + if (renderer->IsXrMode()) { + auto& xrContext = renderer->GetXrContext(); + + // 1. WAIT: OpenXR blocks here until it's time to work on the next frame + XrFrameState frameState = xrContext.waitFrame(); + + // 2. BEGIN: Signal that we are starting CPU work for this frame + xrContext.beginFrame(); + + // Update simulation using predicted display time + Update(frameState.predictedDisplayTime); + + // 3. RENDER: Record and submit drawing commands + Render(frameState.predictedDisplayTime); + + // 4. END: Submit the finished frame to the compositor + // We pass our swapchain image views to the compositor for display + xrContext.endFrame(renderer->GetXrImageViews()); + } else { + // Standard desktop loop + deltaTimeMs = CalculateDeltaTimeMs(); + Update(deltaTimeMs); + Render(); + } + } +} +---- + +== Aligning the Camera + +In our `Update` logic, we must ensure the `activeCamera` is updated with the predicted poses from the XR runtime. 
+ +[source,cpp] +---- +// camera_component.cpp +// Helper: Build asymmetric projection from FOV tangents +inline glm::mat4 getAsymmetricProjection(const XrFovf& fov, float nearZ, float farZ) { + float tanLeft = std::tan(fov.angleLeft); + float tanRight = std::tan(fov.angleRight); + float tanUp = std::tan(fov.angleUp); + float tanDown = std::tan(fov.angleDown); + + float tanWidth = tanRight - tanLeft; + float tanHeight = tanUp - tanDown; + + glm::mat4 projection(0.0f); + projection[0][0] = 2.0f / tanWidth; + projection[1][1] = 2.0f / tanHeight; + projection[2][0] = (tanRight + tanLeft) / tanWidth; + projection[2][1] = (tanUp + tanDown) / tanHeight; + projection[2][2] = -farZ / (farZ - nearZ); + projection[2][3] = -1.0f; + projection[3][2] = -(farZ * nearZ) / (farZ - nearZ); + + return projection; +} + +void CameraComponent::SetStereoViews(const XrView& left, const XrView& right) { + // 1. Convert OpenXR poses to 4x4 matrices + // xrPoseToMatrix is a common helper that builds a GLM matrix from a quaternion/vector + eyeViewMatrices[0] = xrPoseToMatrix(left.pose); + eyeViewMatrices[1] = xrPoseToMatrix(right.pose); + + // 2. Build asymmetric projection matrices from FOV tangents + // These are provided per-eye by the XR runtime + eyeProjectionMatrices[0] = getAsymmetricProjection(left.fov, nearZ, farZ); + eyeProjectionMatrices[1] = getAsymmetricProjection(right.fov, nearZ, farZ); +} +---- + +In `engine.cpp`, we call this within our `Update` loop: + +[source,cpp] +---- +// engine.cpp +void Engine::Update(XrTime predictedTime) { + if (renderer->IsXrMode()) { + auto& xrContext = renderer->GetXrContext(); + + // Retrieve the predicted views (poses and FOVs) from OpenXR + xrContext.locateViews(predictedTime); + auto views = xrContext.getLatestViews(); + + // Update the camera with the spatial data + if (activeCamera && views.size() >= 2) { + activeCamera->SetStereoViews(views[0], views[1]); + } + } + // ... rest of physics and entity updates ... 
+} +---- + +== Why These Changes? + +By moving `xrWaitFrame` into our main loop, we transform the engine from an "as-fast-as-possible" renderer into a "just-in-time" spatial engine. This eliminates the judder caused by simulation/display mismatch and ensures that when the user moves their head, the virtual world moves in perfect, predicted synchrony. + +xref:OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/03_display_time_prediction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/01_introduction.adoc new file mode 100644 index 00000000..23d91e95 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/01_introduction.adoc @@ -0,0 +1,18 @@ +:pp: {plus}{plus} += Late Latching and Timeline Semaphores + +In spatial computing, the gap between "I think my head is here" and "the pixel is actually drawn on the display" is known as the **Motion-to-Photon (M2P)** latency. Even with the predictive frame loop we explored in the previous chapter, there is still a window of time where the user's head can move after we've started our simulation but before the GPU has finished rendering. If we use stale head poses, the virtual world will feel like it's "swimming" or trailing behind the user's actual movement. + +This is where **Late Latching** comes in. Instead of gathering our head pose once at the start of the frame and using it for everything, we want to "latch" onto the most up-to-date pose as late as humanly possible—ideally, right before the GPU begins executing the draw calls that depend on it. + +To achieve this in Vulkan 1.4 without causing massive CPU stalls, we leverage **Timeline Semaphores**. Traditionally, synchronizing the CPU and GPU required "heavy" operations like `vkDeviceWaitIdle` or binary semaphores that are difficult to manage across complex frames. 
Timeline semaphores allow us to create a 64-bit monotonically increasing counter that both the CPU and GPU can wait on or signal. + +By gating our Uniform Buffer Object (UBO) or Push Constant updates behind a timeline semaphore value, we can ensure that the GPU only proceeds with a draw call once the CPU has finished writing the absolute latest view and projection matrices. This minimizes the prediction error and keeps the virtual world anchored firmly to the user's reality. + +In this chapter, we will look at how to: + +1. Orchestrate the CPU/GPU handshake using timeline semaphores. +2. Update our shader data at the "last microsecond" before submission. +3. Integrate these late-breaking updates into the engine's command buffer flow. + +xref:OpenXR_Vulkan_Spatial_Computing/05_Predictive_Frame_Loop/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/02_last_second_update.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/02_last_second_update.adoc b/en/OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/02_last_second_update.adoc new file mode 100644 index 00000000..25751597 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/02_last_second_update.adoc @@ -0,0 +1,52 @@ +:pp: {plus}{plus} += The Last-Second Update + +To implement late latching, we need a mechanism that allows the GPU to sit in a "ready state" for as long as possible without stalling the entire hardware pipeline. In Vulkan 1.4, the combination of **Timeline Semaphores** and **Synchronization 2** provides exactly what we need. + +== Gating the Pipeline: The Wait-Before-Submit Pattern + +Think of a timeline semaphore as a **Digital Gate**. In a standard loop, the CPU prepares everything, then sends it to the GPU. In a "Late Latching" loop, we change the order of operations: + +1. **CPU Records Commands**: We record our entire command buffer, including all the draw calls for our spatial views. 
However, we use a placeholder matrix or just "stale" data from the start of the frame. +2. **CPU Submits Work**: We submit the command buffer to the GPU using `vkQueueSubmit2`. +3. **GPU Waits**: Crucially, we tell the GPU to **WAIT** at the very beginning of the pipeline (usually the Vertex Shader stage) until a specific timeline semaphore reaches a target value. +4. **CPU Polls**: While the GPU is sitting at the gate, the CPU is free to keep polling OpenXR for the absolute latest head pose. +5. **The Signal**: Only at the last possible microsecond, the CPU updates the matrix in memory and **Signals** the semaphore. The GPU gate opens, and the draw calls begin with the freshest possible data. + +== Stage Mask Selection: Where to Wait? + +A critical performance decision is *where* the GPU should wait. In `vk::SemaphoreSubmitInfo`, we specify a `stageMask`. + +* **Top of Pipe**: If we wait here, the GPU does nothing. +* **Vertex Shader**: If we wait here, the GPU can still perform "Top of Pipe" work like command parsing or index fetching. It only stalls when it needs the actual vertex data. +* **Fragment Shader**: This is usually too late for late latching, as the geometry has already been rasterized based on the (potentially stale) vertex positions. + +By waiting at the **Vertex Shader** stage, we maximize hardware utilization while still ensuring that our projection and view matrices are as fresh as the hardware allows. 
+ +[source,cpp] +---- +// Configure the timeline semaphore wait for the GPU +vk::TimelineSemaphoreSubmitInfo timelineSubmitInfo; +uint64_t waitValue = currentFrameCount + 1; +timelineSubmitInfo.setWaitSemaphoreValues(waitValue); + +vk::SubmitInfo2 submitInfo; +vk::SemaphoreSubmitInfo waitSemaphoreInfo; +waitSemaphoreInfo.setSemaphore(*lateLatchingSemaphore); +waitSemaphoreInfo.setValue(waitValue); +waitSemaphoreInfo.setStageMask(vk::PipelineStageFlagBits2::eVertexShader); + +submitInfo.setWaitSemaphoreInfos(waitSemaphoreInfo); +submitInfo.setCommandBufferInfos(commandBufferInfo); + +// The GPU will now wait at the vertex shader stage until the CPU signals waitValue +queue.submit2(submitInfo); +---- + +== Why This Kills "Swim" + +"Swim" is the nauseating effect where virtual objects seem to float or slide slightly when you move your head. It happens because of the **Latency Gap**—the time between when your engine sampled the head pose and when the pixels were actually shown. + +By using the wait-before-submit pattern, we reduce that gap from ~10-15ms down to ~1-2ms. The virtual world feels "anchored" to the physical space because the view matrices being used by the GPU were updated only microseconds before the first vertex was processed. + +xref:OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/03_implementation.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/03_implementation.adoc b/en/OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/03_implementation.adoc new file mode 100644 index 00000000..bbd41854 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/03_implementation.adoc @@ -0,0 +1,63 @@ +:pp: {plus}{plus} += Implementation + +The magic of late latching happens in the final moments before the CPU gives the GPU the "go ahead." 
In our engine's main loop, we've already submitted our command buffers using the `vk::SubmitInfo2` structure with the appropriate timeline semaphore wait. Now, we just need to update the data and signal the semaphore. + +== The "Last-Second" View Prediction + +To get the absolute latest head pose, we call `xrLocateSpace` using the `predictedDisplayTime` we calculated in the previous chapter. + +[source,cpp] +---- +// 1. Get the absolute latest head pose from OpenXR +XrSpaceLocation spaceLocation{XR_TYPE_SPACE_LOCATION}; +XrResult result = xrLocateSpace(viewSpace, baseSpace, predictedDisplayTime, &spaceLocation); + +if (XR_UNQUALIFIED_SUCCESS(result) && (spaceLocation.locationFlags & XR_SPACE_LOCATION_ORIENTATION_VALID_BIT)) { + // 2. Convert XrSpaceLocation to our engine's internal View/Projection matrices + auto latestViewMatrix = convertXrPoseToMatrix(spaceLocation.pose); + + // 3. Directly update the Host-Visible UBO buffer (already persistently mapped) + std::memcpy(mappedUboPointer, &latestViewMatrix, sizeof(latestViewMatrix)); + + // 4. Flush the memory range if it's not coherent + // This ensures the GPU can see our update immediately + vk::MappedMemoryRange flushRange; + flushRange.setMemory(*uboBufferMemory); + flushRange.setOffset(0); + flushRange.setSize(sizeof(latestViewMatrix)); + device.flushMappedMemoryRanges(flushRange); +} + +// 5. Signal the timeline semaphore to value X, unblocking the GPU +vk::SemaphoreSignalInfo signalInfo; +signalInfo.setSemaphore(*lateLatchingSemaphore); +signalInfo.setValue(waitValue); +device.signalSemaphore(signalInfo); +---- + +== Persistent Mapping: The Efficiency Requirement + +Wait, there’s an important architectural detail here: **Persistent Mapping**. For late latching to work effectively, we cannot afford the overhead of calling `vkMapMemory` and `vkUnmapMemory` every frame. 
+ +* **The Concept**: We map the Vulkan buffer once during initialization and keep the CPU-side pointer (`mappedUboPointer`) available for the lifetime of the application. +* **The Why**: Mapping memory is an expensive system call that often involves kernel-mode transitions. In our late latching loop, we have less than a millisecond to perform the update. We need the raw performance of a direct `memcpy`. + +== The Coherency Question: `vkFlushMappedMemoryRanges` + +Not all GPU memory is "Coherent." On some hardware (especially discrete desktop GPUs), the CPU and GPU have separate caches. Even if we `memcpy` the data, the GPU might still see the "stale" version in its cache. + +If our memory heap doesn't have the `VK_MEMORY_PROPERTY_HOST_COHERENT_BIT` flag, we must call `vkFlushMappedMemoryRanges`. This forces the CPU cache to write the data out to physical VRAM where the GPU can see it. In our late latching implementation, this flush is the final step before we open the digital gate. + +== Limitations of Late Latching + +This technique is most effective for "fast-moving" data like the head pose or hand tracking data. However, it’s not suitable for updating entire scene hierarchies or large amounts of vertex data. + +* **Data Size**: Keep your late-latched updates small (usually just a few matrices). +* **Timing**: If the update takes too long, the GPU will idle while waiting for the semaphore, which can reduce your overall framerate. + +By focusing purely on the view and projection matrices, we keep the overhead minimal and the visual latency as low as the hardware allows. + +With late latching implemented, our virtual environment is now as anchored as possible. In the next chapter, we will shift our focus to **Action Spaces**—how we turn user input (button clicks and hand movements) into meaningful interactions within this spatially-aligned world. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/02_last_second_update.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..871ce25f --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/04_incorporating_into_the_engine.adoc @@ -0,0 +1,60 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +Late Latching is a powerful optimization that minimizes "swim" by updating view and projection matrices at the last possible moment before GPU submission. To implement this in our `simple_game_engine`, we modify the `Render` loop in `renderer_rendering.cpp`. An example of this implementation can be found in link:../../../attachments/openxr_engine/renderer_rendering.cpp[renderer_rendering.cpp]. + +== Implementing the Late Update + +In `renderer_rendering.cpp`, we need to delay the final UBO update until just before the `vkCmdDraw` calls. This involves using **Timeline Semaphores** to gate our command buffer submission until the last-second matrices are ready. + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::Render(...) { + // ... + if (xrMode) { + // Retrieve the ABSOLUTE LATEST view poses from OpenXR + // Note: This happens AFTER we've already waited on the frame fence + auto latestPoses = xrContext.getLatestViewPoses(); + + // Directly update the mapped UBO memory for each eye + // Since we use persistent mapping (vk::MemoryPropertyFlagBits::eHostVisible), + // we can write directly to the GPU-resident buffers. 
+ for (uint32_t eye = 0; eye < 2; ++eye) { + updateUniformBuffer(currentFrame, eye, &latestPoses[eye]); + } + + // Use Timeline Semaphores to signal that the late update is finished + vk::TimelineSemaphoreSubmitInfo timelineInfo{ + .signalSemaphoreValueCount = 1, + .pSignalSemaphoreValues = ¤tFrameValue + }; + + // Gate the graphics queue submission + // The GPU will wait for 'currentFrameValue' before executing + graphicsQueue.submit(submitInfo); + } +} +---- + +== Persistent Mapping for Low Latency + +In `Renderer::createUniformBuffers`, we ensure our buffers are mapped once and held open for the lifetime of the renderer, avoiding the overhead of per-frame mapping. + +[source,cpp] +---- +// renderer_rendering.cpp +bool Renderer::createUniformBuffers(Entity* entity) { + // ... create vk::raii::Buffer ... + + // NEW: Persistently map the buffer for last-second updates + // entityRes.uniformBuffersMapped is a void* array for each frame-in-flight + entityRes.uniformBuffersMapped[i] = entityRes.uniformBuffersAllocation[i]->map(0, VK_WHOLE_SIZE); +} +---- + +== Why These Changes? + +By moving the `updateUniformBuffer` call to the very end of the `Render` function (just before submission), we ensure that the GPU receives the most accurate head-tracking data possible. This reduces the "perceived latency" (Motion-to-Photon) from the time we recorded the frame to the time it's actually scanned out to the headset's display. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/03_implementation.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/01_introduction.adoc new file mode 100644 index 00000000..40334529 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/01_introduction.adoc @@ -0,0 +1,17 @@ +:pp: {plus}{plus} += Action Spaces and Input-to-Pipeline Flow + +In a traditional PC or mobile game, we often think about input in terms of raw physical devices: "Was the A button pressed?" or "What is the mouse delta?" However, in spatial computing, this model breaks down. The diversity of VR and AR controllers—ranging from standard gamepads and 6DoF hand controllers to hand-tracking and eye-tracking—means that our engine should never care about the *specific* button a user is pressing. + +Instead, OpenXR uses an **Action-Based Input System**. This abstraction layer allows us to define what we want the user to *do* (like "Grab", "Teleport", or "Menu") and then map those actions to various hardware profiles later. This ensures that our engine is future-proof and compatible with all current and future devices. + +But in spatial computing, input isn't just about buttons; it's about **Pose Actions**. When a user moves their hand, we don't just get a button event; we get an entire spatial transformation—a position and an orientation in 3D space. These actions are tracked within **Spaces**, which we've briefly touched on before. + +In this chapter, we will explore: +1. How to define and map OpenXR Actions to physical controller profiles. +2. The concept of **Action Spaces**, which bridge the gap between input and the rendering pipeline. +3. How to convert raw XrSpace poses into 4x4 transform matrices that our Vulkan shaders can use for interaction. 
+ +By the end of this section, we will have a robust system for handling both digital input (button clicks) and spatial input (hand positions) in a way that is clean, modular, and performant. + +xref:OpenXR_Vulkan_Spatial_Computing/06_Late_Latching/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/02_openxr_action_system.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/02_openxr_action_system.adoc b/en/OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/02_openxr_action_system.adoc new file mode 100644 index 00000000..87e02de8 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/02_openxr_action_system.adoc @@ -0,0 +1,80 @@ +:pp: {plus}{plus} += The OpenXR Action System + +The OpenXR input system is built around three core concepts: **Action Sets**, **Actions**, and **Interaction Profiles**. By separating our engine's logic from the physical hardware, we ensure that our application remains robust and adaptable across a vast ecosystem of controllers. + +== The Hardware Abstraction Layer: Thinking in Actions + +In a legacy input system, you might write code like `if (buttonA.isPressed()) { jump(); }`. This is a fragile approach in XR, where one user might have an Oculus Touch controller (with an 'A' button) and another might have Valve Index controllers (with a pressure-sensitive trackpad). + +OpenXR uses an **Action-Based** model. Instead of looking for buttons, your engine defines what the user can *do*: + +1. **Define the Action**: "I need an action called 'Jump'." +2. **Assign the Type**: Is it a boolean (on/off), a float (analog trigger), or a pose (hand position)? +3. **Suggest the Binding**: "On an Oculus controller, map 'Jump' to the 'A' button. On a Vive controller, map it to the 'Trackpad Click'." 
+ +The XR runtime handles the "Translation" at the last possible second, giving your engine a clean, unified stream of "Jump" events regardless of the physical hardware. + +== 1. Action Sets: Contextual Input + +An **Action Set** is a container for related actions. This allows you to enable or disable groups of inputs based on what the user is doing. + +* **Gameplay Set**: Contains "Teleport", "Grab", and "Shoot". +* **Menu Set**: Contains "Select" and "Back". + +[source,cpp] +---- +// 1. Create an Action Set +XrActionSetCreateInfo actionSetInfo{XR_TYPE_ACTION_SET_CREATE_INFO}; +std::strcpy(actionSetInfo.actionSetName, "gameplay"); +std::strcpy(actionSetInfo.localizedActionSetName, "Main Gameplay"); +actionSetInfo.priority = 0; + +XrActionSet gameplayActionSet; +xrCreateActionSet(instance, &actionSetInfo, &gameplayActionSet); +---- + +[TIP] +==== +**Localized Names**: The `localizedActionSetName` is important! The XR runtime may show this to the user in a "Controller Binding" menu, allowing them to remap your actions to their own preferences. +==== + +== 2. Actions: Defining Intent + +Within an action set, we define our individual **Actions**. + +[source,cpp] +---- +// 2. Create a Pose Action for hand tracking +XrActionCreateInfo actionInfo{XR_TYPE_ACTION_CREATE_INFO}; +actionInfo.actionType = XR_ACTION_TYPE_POSE_INPUT; +std::strcpy(actionInfo.actionName, "hand_pose"); +std::strcpy(actionInfo.localizedActionName, "Hand Pose"); +actionInfo.countSubactionPaths = 0; + +XrAction handPoseAction; +xrCreateAction(gameplayActionSet, &actionInfo, &handPoseAction); +---- + +== 3. Interaction Profiles: Suggesting the Map + +Finally, we suggest how these actions should map to real hardware. We don't "force" a mapping; we provide a "Suggested Binding" for each known controller type. + +[source,cpp] +---- +// 3. 
Map the action to a specific controller path
std::vector<XrActionSuggestedBinding> bindings;
// We use standardized OpenXR paths like /user/hand/right/input/grip/pose.
// stringToXrPath is a small helper wrapping xrStringToPath that returns the XrPath directly.
bindings.push_back({handPoseAction, stringToXrPath(instance, "/user/hand/right/input/grip/pose")});

XrInteractionProfileSuggestedBinding profileBinding{XR_TYPE_INTERACTION_PROFILE_SUGGESTED_BINDING};
profileBinding.interactionProfile = stringToXrPath(instance, "/interaction_profiles/oculus/touch_controller");
profileBinding.suggestedBindings = bindings.data();
profileBinding.countSuggestedBindings = bindings.size();

xrSuggestInteractionProfileBindings(instance, &profileBinding);
This is used for holding objects like a sword or a steering wheel. +2. **Aim Pose**: The location and orientation of a ray extending from the controller's front. This is used for pointing, laser-pointers, and UI interaction. + +[source,cpp] +---- +// 1. Create a Space for our Hand Pose Action +XrActionSpaceCreateInfo actionSpaceInfo{XR_TYPE_ACTION_SPACE_CREATE_INFO}; +actionSpaceInfo.action = handPoseAction; +// This allows us to apply an offset (e.g., to align the virtual hand with the physical controller) +actionSpaceInfo.poseInActionSpace = identityPose; + +XrSpace handSpace; +xrCreateActionSpace(session, &actionSpaceInfo, &handSpace); +---- + +== Space Manifolds: From Quaternions to Matrices + +Every frame, we use `xrLocateSpace` to find where this hand space is located. The runtime provides this as an `XrSpaceLocation`, which contains a position (vector) and an orientation (quaternion). + +However, our Vulkan shaders typically expect a **4x4 Matrix** for spatial transformations. This conversion is the "Space Manifold"—mapping the 7D tracking data (3 position + 4 quaternion) into the 16D matrix format used by the GPU's fixed-function vertex hardware. + +[source,cpp] +---- +// 2. Locate the hand space relative to the world +XrSpaceLocation location{XR_TYPE_SPACE_LOCATION}; +xrLocateSpace(handSpace, worldSpace, predictedDisplayTime, &location); + +if (location.locationFlags & XR_SPACE_LOCATION_ORIENTATION_VALID_BIT) { + // 3. Convert XrPosef to a 4x4 transformation matrix + // This matrix represents the manifold of the controller's path through the virtual world + vk::mat4 handTransform = xrPoseToMatrix(location.pose); + + // Pass this matrix to our engine's scene graph or directly to a shader + pushConstants.modelMatrix = handTransform; +} +---- + +== Why We Use "App Space" + +In spatial computing, the **App Space** (or Stage Space) is the origin of our virtual world. 
By locating the `handSpace` relative to the `worldSpace`, we ensure that if the user walks around their room, the virtual hand stays perfectly aligned with their physical hand relative to the virtual environment. + +This transformation process effectively turns a physical controller into a **Space Manifold**—a virtual object that moves through our world's coordinate system. Whether the user is pointing a laser, picking up a sword, or simply waving their hand, the engine sees it all as a single, consistent matrix. + +In the next chapter, we will look at how to optimize the rendering of these multiple views using **Slang for Spatial Shaders**. + +xref:OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/02_openxr_action_system.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..d2ca2b85 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/04_incorporating_into_the_engine.adoc @@ -0,0 +1,46 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +OpenXR's Action System requires us to move away from polling raw buttons (like `glfwGetMouseButton`) and toward an abstract, intent-based input system in `engine.cpp`. An example of this implementation can be found in link:../../../attachments/openxr_engine/xr_context.cpp[xr_context.cpp] and link:../../../attachments/openxr_engine/engine.cpp[engine.cpp]. + +== Mapping Actions in the Engine + +In `engine.cpp`, we update our `Update` logic to process OpenXR actions. Instead of checking for a specific mouse button to "ThrowBall", we check for a "Grab" action. 
+ +[source,cpp] +---- +// engine.cpp +void Engine::Update(XrTime predictedTime) { + if (renderer->IsXrMode()) { + auto& xrContext = renderer->GetXrContext(); + + // 1. Poll OpenXR action states (Grab, Teleport, etc.) + xrContext.pollActions(); + + if (xrContext.isActionActive("Grab")) { + // 2. Retrieve the spatial pose of the controller (Action Space) + XrPosef handPose = xrContext.getActionPose("GrabPose"); + + // 3. Convert the XR pose into our engine's 4x4 matrix + glm::mat4 handMatrix = xrPoseToMatrix(handPose); + + // Spawn the ball at the controller's exact spatial location + ThrowBallFromHand(handMatrix); + } + } else { + // Standard desktop mouse input + if (platform->GetMouseButton(1)) { + ThrowBall(mouseX, mouseY); + } + } + + // Standard physics/entity update + Update(deltaTimeMs); +} +---- + +== Why These Changes? + +By decoupling input from physical hardware (like a specific controller's trigger), we make our engine cross-platform out of the box. Whether the user is using an Oculus Touch controller or a Valve Index "knuckle," the engine only sees the "Grab" action and the associated spatial pose, allowing us to focus on the physics of the interaction rather than the nuances of different hardware drivers. + +xref:OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/03_space_manifolds.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/01_introduction.adoc new file mode 100644 index 00000000..8a9b859a --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/01_introduction.adoc @@ -0,0 +1,23 @@ +:pp: {plus}{plus} += Slang for Spatial Shaders: Single-Pass Stereo and Beyond + +Welcome to one of the most exciting parts of our spatial computing journey! 
Up to this point, we've focused on the "Vulkan-to-OpenXR" plumbing—how to get images, how to time our frames, and how to track user input. Now, we're shifting our focus into the **Shaders**. In spatial computing, we aren't just rendering to a single flat screen; we're rendering for two eyes, and potentially even more views for foveated or wide-FOV headsets. + +The traditional way to handle stereo rendering was to simply draw the entire scene twice—once for the left eye and once for the right. While this is conceptually simple, it's incredibly inefficient. The CPU has to submit twice as many draw calls, and the GPU has to process the same geometry twice, often with nearly identical vertex data. This is where **Single-Pass Stereo** comes in, and more specifically, the **N-View** mindset. + +[source,slang] +---- +// The "N-View" mindset in Slang +void main(uint viewID : SV_ViewID) { + // Shaders that know which eye they are rendering for + float4 eyePosition = loadEyePosition(viewID); +} +---- + +In this chapter, we're going to explore how **Slang** makes authoring these multi-view shaders much more natural and productive than traditional GLSL or HLSL. We'll start by looking at **Native Multiview** (`VK_KHR_multiview`), a core Vulkan feature that allows the GPU to broadcast a single draw call to multiple layers of an image array, each representing a different view. + +We'll define some key concepts like **SIMD** (Single Instruction, Multiple Data) and how it relates to our "N-View" architecture, and we'll see how Slang's modern syntax allows us to write shaders that are clean, readable, and highly optimized for spatial hardware. Whether you're targeting a simple mobile VR headset or a high-end desktop AR system, the principles of efficient multi-view shader design remain the same. 
+ +In the following sections, we'll dive deep into the implementation of **Native Multiview** and then see how to architect our **Slang** shaders to take full advantage of this hardware-level optimization. + +xref:OpenXR_Vulkan_Spatial_Computing/07_Action_Spaces_Input/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/02_native_multiview.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/02_native_multiview.adoc b/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/02_native_multiview.adoc new file mode 100644 index 00000000..b8471e82 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/02_native_multiview.adoc @@ -0,0 +1,66 @@ +:pp: {plus}{plus} += Native Multiview + +The core technology behind **Single-Pass Stereo** is **Native Multiview** (`VK_KHR_multiview`). This feature allows a single draw call to be broadcast to multiple layers of an image array. For spatial computing, where we almost always have at least two views (left and right eye), this is a massive performance win. + +== The CPU Overhead Win: Why We Hate Loops + +To understand the value of multiview, look at the alternative: **The Multi-Pass Loop**. + +1. **Loop**: For each eye (0 to 1): +2. **Bind**: Update the view/projection UBO for that eye. +3. **Draw**: Re-issue all 2,000 draw calls for the scene. + +In a complex scene, this means the CPU is doing twice the work (4,000 draw calls total). This often makes the application **CPU-Bound**, meaning the GPU is sitting idle while waiting for the CPU to finish its second loop. + +**Multiview kills the loop.** You issue the 2,000 draw calls *once*, and the GPU's fixed-function hardware handles the duplication for the second eye. The CPU overhead is cut in half, and the GPU can stay saturated. + +== View Masks and the Broadcast Mechanism + +In a standard Vulkan pipeline, a draw call targets a specific attachment. 
With multiview enabled (part of the Vulkan 1.4 core), you specify a **View Mask** in your `vk::RenderingInfo`. + +A view mask is a bitmask where each set bit represents a layer in the output attachment array. + +* **Mask 0b01 (1)**: Renders only to Layer 0. +* **Mask 0b11 (3)**: Renders to both Layer 0 (Left Eye) and Layer 1 (Right Eye). +* **Mask 0b1111 (15)**: Renders to four layers (used for Quad-Views). + +[source,cpp] +---- +// 1. Enabling Multiview in Vulkan 1.4 Dynamic Rendering +vk::RenderingInfo renderingInfo; +renderingInfo.viewMask = 0b11; // Broadcast to bit 0 and bit 1 +---- + +== The Hardware Implementation: SV_ViewID + +When multiview is active, the GPU executes the vertex shader for each bit set in the mask. Within the shader, you access the current view index using the `SV_ViewID` semantic. + +[source,slang] +---- +// 2. Using the View ID in a Slang Shader +struct VertexInput { + float3 position : POSITION; +}; + +struct VertexOutput { + float4 position : SV_Position; + uint viewID : SV_ViewID; +}; + +VertexOutput main(VertexInput input, uint viewID : SV_ViewID) { + VertexOutput output; + // Load the correct eye matrix using the viewID + // The GPU automatically runs this shader once per viewID (0 and 1) + float4x4 mvp = eyeMatrices[viewID]; + output.position = mul(mvp, float4(input.position, 1.0)); + output.viewID = viewID; + return output; +} +---- + +It's important to understand that while the vertex work is duplicated, the **Primitive Assembly** and **Rasterization** stages can often be optimized by the hardware. Because the two eyes are looking at the same geometry from slightly different angles, the GPU can reuse vertex cache data, making single-pass stereo much more efficient than two separate passes. + +In the next section, we'll see how to architect our **Slang** shaders to manage this "N-View" logic efficiently. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/03_slang_architecture.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/03_slang_architecture.adoc b/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/03_slang_architecture.adoc new file mode 100644 index 00000000..85053863 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/03_slang_architecture.adoc @@ -0,0 +1,61 @@ +:pp: {plus}{plus} += Slang Architecture for N-View Shaders + +Authoring multiview-aware shaders is a unique challenge. While the GPU's hardware handles the duplication of draw commands, it's our job as shader developers to ensure that we're accessing our eye-specific data—like view and projection matrices—in a way that is both efficient and organized. This is where the **Slang** language really shines. + +== The N-View Mindset: Agnostic Shaders + +In a traditional shader, you might pass a single `viewMatrix`. In an XR-aware engine, we must adopt an **N-View Mindset**. We write our shaders once, assuming they might be used for mono, stereo, or even "Quad-Views" (4 eyes). + +Slang's powerful type system and **StructuredBuffers** allow us to pass an array of view data that the shader can index into dynamically based on which eye is currently being processed. + +[source,slang] +---- +// 1. Defining a View Structure in Slang +struct SpatialView { + float4x4 viewProjection; + float4x4 inverseView; + float4 eyePosition; +}; + +// 2. 
Accessing N-Views in a Structured Buffer +// This buffer contains data for all active views (e.g., Left, Right, Inset Left, Inset Right) +StructuredBuffer<SpatialView> eyeViews; + +struct VertexInput { + float3 position : POSITION; +}; + +struct VertexOutput { + float4 position : SV_Position; + uint viewID : SV_ViewID; +}; + +VertexOutput main(VertexInput input, uint viewID : SV_ViewID) { + VertexOutput output; + + // Slang makes it easy to access the current eye view data + // The compiler and hardware ensure that 'viewID' matches the current broadcast layer + SpatialView currentView = eyeViews[viewID]; + + output.position = mul(currentView.viewProjection, float4(input.position, 1.0)); + output.viewID = viewID; + return output; +} +---- + +== Propagation to the Fragment Shader + +The `SV_ViewID` isn't just for the vertex shader. You can propagate it down to the **Fragment Shader** to perform eye-specific logic. + +Why would you do this? Consider **Specular Highlights**. Since each eye is at a slightly different physical location, the reflections on a shiny surface should be slightly different for each eye. By using the `eyePosition` from our `SpatialView` structure (indexed by `viewID`), we can calculate the correct view-dependent lighting for each eye in a single pass. + +== Modular Spatial Shaders + +Slang's support for **Generics** and **Interfaces** means we can write shaders that are completely agnostic of the number of views. We can define our spatial logic once and then let the compiler handle the underlying details of the multiview integration. + +This significantly reduces the chances of errors. For example, if you add a third view for a "Spectator Camera" or a "Rear View Mirror," you don't need to change your shader code—you just update your view mask and the size of your `eyeViews` buffer. + +In the next chapter, we'll see how to push this "N-View" mindset even further by looking at **Quad-Views** and **Foveated Rendering**. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/02_native_multiview.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..10c7d8ac --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/04_incorporating_into_the_engine.adoc @@ -0,0 +1,76 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +Adopting Multiview in our `simple_game_engine` allows us to render both eyes in a single pass, significantly reducing CPU draw call overhead. An example of this implementation can be found in link:../../../attachments/openxr_engine/shaders/pbr.slang[pbr.slang] and link:../../../attachments/openxr_engine/renderer_pipelines.cpp[renderer_pipelines.cpp]. + +== Updating Shader Loading + +In `renderer_resources.cpp`, we need to update our pipeline creation to enable multiview. We do this by adding `vk::PipelineRenderingCreateInfo` with the `viewMask` set. + +[source,cpp] +---- +// renderer_resources.cpp +bool Renderer::createGraphicsPipeline() { + // ... + vk::PipelineRenderingCreateInfo renderingInfo{ + .viewMask = xrMode ? 0x3 : 0x0, // 0x3 enables views 0 and 1 + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainImageFormat + }; + + pipelineInfo.pNext = &renderingInfo; + // ... +} +---- + +== Adapting Slang Shaders + +Our Slang shaders (like `pbr.slang`) must be updated to handle the `SV_ViewID` semantic. This allows the shader to distinguish between the left and right eye. 
+ +First, we update our `common_types.slang` to include an array of matrices in our `UniformBufferObject`: + +[source,slang] +---- +// common_types.slang +struct UniformBufferObject { + // NEW: Replace the single viewProjection matrix with an array + // This supports Stereo (N=2), Quad-Views (N=4), and beyond + float4x4 viewProjections[4]; + float4x4 model; +}; +---- + +Next, in `pbr.slang`: + +[source,slang] +---- +// pbr.slang +struct VS_Input { + float3 position : POSITION; + // ... +}; + +struct VS_Output { + float4 position : SV_Position; + uint viewID : SV_ViewID; // NEW: The hardware-provided view index + // ... +}; + +[shader("vertex")] +VS_Output vertexMain(VS_Input input, uint viewID : SV_ViewID) { + VS_Output output; + + // Select the correct ViewProjection matrix based on the eye index (0 to 3) + float4x4 vp = ubo.viewProjections[viewID]; + output.position = mul(vp, mul(pushConstants.model, float4(input.position, 1.0))); + output.viewID = viewID; + + return output; +} +---- + +== Why These Changes? + +By enabling the `viewMask` and using `SV_ViewID`, we shift the burden of stereo rendering from the CPU to the GPU. Instead of the engine looping twice and submitting two sets of commands, it submits one set, and the hardware's **Instance Data Step** or **Geometry Replication** handles the expansion to two views. This is the most efficient way to achieve high-performance spatial rendering in Vulkan 1.4. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/03_slang_architecture.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..10c7d8ac --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/04_incorporating_into_the_engine.adoc @@ -0,0 +1,76 @@ +:pp: {plus}{plus} += Incorporating into the Engine +
+Adopting Multiview in our `simple_game_engine` allows us to render both eyes in a single pass, significantly reducing CPU draw call overhead. An example of this implementation can be found in link:../../../attachments/openxr_engine/shaders/pbr.slang[pbr.slang] and link:../../../attachments/openxr_engine/renderer_pipelines.cpp[renderer_pipelines.cpp]. + +== Updating Shader Loading + +In `renderer_resources.cpp`, we need to update our pipeline creation to enable multiview. We do this by adding `vk::PipelineRenderingCreateInfo` with the `viewMask` set. + +[source,cpp] +---- +// renderer_resources.cpp +bool Renderer::createGraphicsPipeline() { + // ... + vk::PipelineRenderingCreateInfo renderingInfo{ + .viewMask = xrMode ? 0x3 : 0x0, // 0x3 enables views 0 and 1 + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainImageFormat + }; + + pipelineInfo.pNext = &renderingInfo; + // ... +} +---- + +== Adapting Slang Shaders + +Our Slang shaders (like `pbr.slang`) must be updated to handle the `SV_ViewID` semantic. This allows the shader to distinguish between the left and right eye. 
Whether you're targeting high-end PC VR or the latest mobile spatial systems, understanding how to manage multi-layer composition is a vital skill for any advanced spatial developer. + +In the following sections, we'll dive into the details of architecting these primary and inset view layers, and we'll see how to efficiently submit them to the OpenXR compositor for the final image. + +xref:OpenXR_Vulkan_Spatial_Computing/08_Slang_Spatial_Shaders/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/02_primary_stereo_with_insets.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/02_primary_stereo_with_insets.adoc b/en/OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/02_primary_stereo_with_insets.adoc new file mode 100644 index 00000000..820ea84a --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/02_primary_stereo_with_insets.adoc @@ -0,0 +1,60 @@ +:pp: {plus}{plus} += Primary Stereo with Insets + +In a **Quad-View** architecture, each eye is actually seeing two overlapping images. The **Primary View** is a wide, lower-resolution image that covers the user's entire field-of-view (FOV). The **Inset View** is a smaller, high-resolution image that is centered around the user's gaze. + +== The Pixel Density Problem + +To understand why we use quad-views, we must consider the hardware limits of modern displays. A typical VR headset might have a 4K resolution per eye spread over a 110-degree FOV. + +* **Result**: This gives a density of roughly 20-25 **PPD** (Pixels Per Degree). +* **The Goal**: Human "Retina" resolution is closer to 60 PPD. + +We cannot simply double the resolution of the entire display—the GPU couldn't handle it. Instead, we use **Foveated Rendering**. We render a high-density "Inset" (60 PPD) only where the eye is looking, and a lower-density "Primary" (20 PPD) for the periphery. + +== Fixed vs. Dynamic Foveation + +1. 
**Fixed Foveation (FFR)**: The high-detail inset is locked to the center of the lens. This is effective because XR lenses are naturally blurrier at the edges anyway. +2. **Dynamic Foveation (DFR)**: The high-detail inset moves in real-time based on **Eye Tracking** data. This is the "Holy Grail" of XR performance, as the user always sees perfect detail regardless of where they look. + +[source,cpp] +---- +// 1. Defining the View Config for Quad-Views (N=4) +// Indices 0,1 are the wide base views. +// Indices 2,3 are the high-detail foveal insets. +XrViewConfigurationView configViews[4]; +---- + +== Rendering the Quad-View in Slang + +From a rendering perspective, our engine manages four distinct view matrices. The inset views (2 and 3) have a much smaller FOV, which effectively "zooms in" on the scene without changing the camera position. + +[source,slang] +---- +// 2. Slang Shader Handling N=4 Views +struct QuadViewData { + float4x4 viewProjection; + float4 viewportScale; // Scale factor for foveal inset +}; + +StructuredBuffer allViews; // 4 Views total + +struct VertexOutput { + float4 position : SV_Position; + uint viewID : SV_ViewID; +}; + +VertexOutput main(uint viewID : SV_ViewID, float3 pos : POSITION) { + VertexOutput output; + QuadViewData current = allViews[viewID]; + + // The GPU broadcasts the same geometry to 4 different viewports/layers + output.position = mul(current.viewProjection, float4(pos, 1.0)); + output.viewID = viewID; + return output; +} +---- + +Crucially, the **Inset View** isn't just a zoomed-in version of the base view. It's a high-density image that captures more detail from the 3D scene. We are effectively focusing the GPU's rasterization power on a smaller subset of the world. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/03_multi_layer_composition.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/03_multi_layer_composition.adoc b/en/OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/03_multi_layer_composition.adoc new file mode 100644 index 00000000..57baf5fe --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/03_multi_layer_composition.adoc @@ -0,0 +1,49 @@ +:pp: {plus}{plus} += Multi-Layer Composition + +Rendering a **Quad-View** is only half the battle. Once we have our four views (base eyes and high-detail insets), we need to get them to the **OpenXR Compositor** for the final display. This is where **Multi-Layer Composition** comes in. + +== The Layer Stack: Modular Imagery + +In OpenXR, you don't just submit a single image. You submit an array of **Composition Layers**. Think of the compositor as a "Super-GPU" that performs a final, high-priority pass to stack these images together before they hit the display. + +1. **Base Layer**: Your standard 3D world (usually indices 0 and 1 of your quad-view). +2. **Inset Layer**: The high-detail foveal data (indices 2 and 3). +3. **Overlay Layer**: A flat HUD or menu that stays perfectly sharp, regardless of the 3D scene's resolution. + +[source,cpp] +---- +// 1. Defining Composition Layers in OpenXR +XrCompositionLayerProjection baseLayer{XR_TYPE_COMPOSITION_LAYER_PROJECTION}; +XrCompositionLayerProjection insetLayer{XR_TYPE_COMPOSITION_LAYER_PROJECTION}; + +// 2. Submitting both layers to xrEndFrame +// The order matters: the last layer in the list is drawn on top. 
+std::vector<XrCompositionLayerBaseHeader*> layers; +layers.push_back(reinterpret_cast<XrCompositionLayerBaseHeader*>(&baseLayer)); +layers.push_back(reinterpret_cast<XrCompositionLayerBaseHeader*>(&insetLayer)); + +XrFrameEndInfo frameEndInfo{XR_TYPE_FRAME_END_INFO}; +frameEndInfo.layerCount = layers.size(); +frameEndInfo.layers = layers.data(); +xrEndFrame(session, &frameEndInfo); +---- + +== Masking and Blending + +How does the compositor know where to show the high-detail inset? Each layer specifies a **Projection View**. For quad-views, the inset layer's views will have a smaller FOV but the same camera pose. + +The compositor uses the FOV tangents to mathematically "place" the high-detail image over the low-detail one. It then performs a **Soft Edge Blend** to ensure the transition between the 20 PPD and 60 PPD regions is invisible to the user. + +== Resource Synchronization: Multi-Swapchain Pacing + +From an engine perspective, this means we are now managing **Multiple Swapchains** per frame. + +* **Pacing**: We must call `xrWaitSwapchainImage` and `xrAcquireSwapchainImage` for *each* swapchain (Base and Inset). +* **Vulkan Sync**: Our command buffer recording must ensure that all four views are rendered and transitioned before we call `xrReleaseSwapchainImage`. + +This multi-layer approach is incredibly flexible. Each layer can even have its own resolution and bit-depth, allowing the compositor to optimize the final image based on the specific needs of each part of the scene. + +In the next chapter, we'll see how to push our performance even further by looking at **Variable Rate Shading** (VRS). 
+ +xref:OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/02_primary_stereo_with_insets.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..64f26606 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/04_incorporating_into_the_engine.adoc @@ -0,0 +1,67 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +Supporting Quad-Views (N=4) in our `simple_game_engine` requires us to move from a single-layer swapchain to a multi-layer composition in `renderer_rendering.cpp`. *This is currently left as an exercise for the reader to implement.* + +== Multi-Layer Swapchain Composition + +In `renderer_rendering.cpp`, we update `Render` to submit multiple projection layers to OpenXR. Each eye will have a peripheral (wide FOV) and a foveal (narrow, high-res) layer. + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::Render(...) { + // ... 
+ if (xrMode) { + // Define two layers for the XR compositor: + // Layer 0: Peripheral (Wide FOV) + // Layer 1: Foveal (Narrow FOV, High-Res) + std::vector layers; + + for (uint32_t eye = 0; eye < 2; ++eye) { + // Retrieve views from OpenXR (N=4) + // xrContext provides these based on xrLocateViews + auto const& views = xrContext.getQuadViews(); + + // Populate peripheral layer + layers[0].views[eye].pose = views[0 + eye].pose; + layers[0].views[eye].fov = views[0 + eye].fov; + layers[0].views[eye].subImage.swapchain = *xrPeripheralSwapchain; + + // Populate foveal layer (the "inset") + layers[1].views[eye].pose = views[2 + eye].pose; + layers[1].views[eye].fov = views[2 + eye].fov; + layers[1].views[eye].subImage.swapchain = *xrFovealSwapchain; + } + + // Submit multi-layer composition to OpenXR + xrContext.submitLayers(layers); + } +} +---- + +== Handling High-Fidelity Insets + +In our `createGraphicsPipeline`, we ensure our `viewMask` is expanded if we are rendering all 4 views in a single pass. + +[source,cpp] +---- +// renderer_resources.cpp +bool Renderer::createGraphicsPipeline() { + // ... + vk::PipelineRenderingCreateInfo renderingInfo{ + .viewMask = xrMode ? 0xF : 0x0, // 0xF enables views 0, 1, 2, 3 + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainImageFormat + }; + + pipelineInfo.pNext = &renderingInfo; + // ... +} +---- + +== Why These Changes? + +By moving to a multi-layer composition, we enable "Foveal Rendering" where the engine renders the center of the user's view at a much higher resolution than the periphery. Because our engine's `Render` loop is already dynamic, it can seamlessly switch between these views and layers, ensuring that the user always experiences the highest fidelity where they are actually looking. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/03_multi_layer_composition.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/01_introduction.adoc new file mode 100644 index 00000000..a6bf34e8 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/01_introduction.adoc @@ -0,0 +1,16 @@ +:pp: {plus}{plus} += Variable Rate Shading (VRS) for Peripheral Optimization + +The core challenge of high-fidelity spatial computing is the sheer number of pixels we need to push. With headsets moving toward 4K-per-eye resolutions at 90Hz or 120Hz, simply rendering the entire scene at full resolution is often computationally prohibitive. However, human vision provides us with a unique optimization opportunity: **Foveated Rendering**. + +Our eyes only perceive high detail in a very small central area called the **fovea**. As we move toward the periphery of our vision, our ability to resolve fine detail drops off significantly. In this chapter, we will explore how to leverage **Variable Rate Shading (VRS)**—a core feature of Vulkan 1.4—to intelligently reduce fragment processing in these peripheral areas without sacrificing perceived quality. + +Variable Rate Shading allows us to decouple the shading rate from the pixel rate. Instead of running a fragment shader once for every pixel, we can tell the hardware to run it once for a group of pixels (e.g., a 2x2 or 4x4 tile). This "coarse shading" significantly reduces the **ALU** (Arithmetic Logic Unit) load on the GPU, which is often the primary bottleneck in complex spatial shaders. + +We will focus on two primary strategies: +1. **Static Peripheral Optimization**: Reducing shading rates at the edges of the lens where optical distortion and chromatic aberration already obscure detail. +2. 
**Dynamic Gaze-Driven Shading**: Using eye-tracking telemetry to center the high-resolution region wherever the user is currently looking. + +By the end of this chapter, you will understand how to integrate the **VK_KHR_fragment_shading_rate** extension (now part of the Vulkan 1.4 core) into your spatial pipeline and how to manage shading rate maps that update in real-time. + +xref:OpenXR_Vulkan_Spatial_Computing/09_Quad_Views_Foveated/03_multi_layer_composition.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/02_fragment_density_control.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/02_fragment_density_control.adoc b/en/OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/02_fragment_density_control.adoc new file mode 100644 index 00000000..655784e7 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/02_fragment_density_control.adoc @@ -0,0 +1,44 @@ +:pp: {plus}{plus} += Fragment Density Control and Vulkan 1.4 VRS + +In legacy pipelines, reducing shading quality often meant rendering to a lower-resolution buffer and upscaling, which introduced blurring and aliasing artifacts. With Vulkan 1.4, we have a much more elegant solution: **Variable Rate Shading (VRS)**. + +== The Concept: Decoupling Rasterization from Shading + +To understand VRS, you must distinguish between two parts of the graphics pipeline: + +1. **Rasterization**: The process of determining which pixels are covered by a triangle. +2. **Shading**: The process of running a fragment shader to determine the color of those pixels. + +In a standard pipeline, they are 1:1. For every pixel covered, you run one fragment shader. With **VRS**, you can run one fragment shader for a *group* of pixels (e.g., a 2x2 or 4x4 block) while still keeping the high-resolution edges and depth data from the rasterizer. 
+ +This allows us to maintain **Geometric Sharpness** (no aliasing on object edges) while significantly reducing the **ALU** (Arithmetic Logic Unit) cost of complex lighting and materials in areas where detail isn't needed. + +== Shading Rate Images: The Density Map + +In a spatial pipeline, we use a **Shading Rate Image**. This is a low-resolution attachment (usually 1/16th the size of the main view) where each "pixel" represents a tile of the screen. + +* **Value 0x0**: 1x1 shading (Full quality). +* **Value 0x5**: 2x2 shading (1/4th the cost). +* **Value 0xA**: 4x4 shading (1/16th the cost). + +[source,cpp] +---- +// Configuring the shading rate state for a spatial pipeline +vk::PipelineFragmentShadingRateStateCreateInfoKHR shadingRateState{}; +shadingRateState.fragmentSize = {1, 1}; // Default fallback +// We "Replace" the pipeline rate with the rate from our density image +shadingRateState.combinerOps[0] = vk::FragmentShadingRateCombinerOpKHR::eReplace; +shadingRateState.combinerOps[1] = vk::FragmentShadingRateCombinerOpKHR::eKeep; + +vk::GraphicsPipelineCreateInfo pipelineInfo{}; +pipelineInfo.pNext = &shadingRateState; +---- + +== Fixed Vignette Shading + +Because XR lenses are naturally blurrier and more distorted at the edges, we can pre-calculate a "Vignette" map. The center of the lens (the "Sweet Spot") is set to 1x1, while the outer rings are set to 2x2 or 4x4. Since the user rarely looks at the extreme edges of the lens, this is essentially "Free Performance." + +When combined with **Multiview**, a single shading rate map can be applied to both eyes simultaneously, saving massive amounts of GPU time without affecting the user's immersion. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/03_gaze_driven_logic.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/03_gaze_driven_logic.adoc b/en/OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/03_gaze_driven_logic.adoc new file mode 100644 index 00000000..1f37abb5 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/03_gaze_driven_logic.adoc @@ -0,0 +1,43 @@ +:pp: {plus}{plus} += Gaze-Driven Logic and Eye-Tracking Telemetry + +While static peripheral optimization is a great baseline, the real power of foveated rendering comes when we align the high-detail region with the user's actual gaze. This is known as **Dynamic Foveated Rendering (DFR)**. To achieve this, we ingest eye-tracking data from the OpenXR runtime via the **XR_EXT_eye_gaze_interaction** extension. + +== The Biological Exploit: Saccadic Suppression + +To understand why DFR is so effective, we must look at how the human brain processes visual information. Our eyes are constantly making tiny, rapid movements called **Saccades**. During a saccade, the brain actually "blanks out" visual input for a fraction of a second—a phenomenon called **Saccadic Suppression**. + +DFR exploits this. By using high-speed eye tracking (usually 120Hz or higher), our engine can shift the high-detail foveal region to the user's new gaze point during the saccade itself. The user never "catches" the low-resolution periphery because their brain is literally ignoring visual input while the eye is moving. + +== Implementing Gaze-Driven VRS + +The process involves three steps: + +1. **Ingestion**: Querying the `XrEyeGaze` pose to find where the user is looking in 3D space. +2. **Projection**: Projecting that 3D gaze vector onto the 2D plane of the viewport to find the "foveal center" in UV coordinates. +3. 
**Update**: Regenerating or shifting the **Shading Rate Image** so that the 1x1 shading region follows that center. + +[source,cpp] +---- +// Updating the shading rate image based on projected gaze coordinates +void updateShadingRateMap(vk::raii::CommandBuffer& cmd, const glm::vec2& gazeCenter) { + // We use a simple compute shader to fill a small R8_UINT texture. + // The shader calculates distance from gazeCenter and assigns: + // 0x0 (1x1 shading), 0x4 (2x1), 0x5 (2x2), etc. + cmd.bindPipeline(vk::PipelineBindPoint::eCompute, *gazeVrsPipeline); + cmd.pushConstants(*pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, gazeCenter); + cmd.dispatch(shadingRateWidth / 8, shadingRateHeight / 8, 1); +} +---- + +== Latency: The Immersion Killer + +Human eyes move incredibly fast. If your eye-tracking telemetry or your shading rate update has too much latency, the user will see the blurriness at the edge of their vision before the high-detail patch "catches up." + +This is why **Late Latching** (which we covered in Chapter 6) is equally important here. We want to sample the eye gaze at the absolute last microsecond before we signal the timeline semaphore to start the GPU work. By minimizing the latency between "where the eye is" and "where the pixels are sharp," we maintain the illusion of a perfectly high-resolution world. + +When implemented correctly, DFR allows you to push the boundaries of visual fidelity, enabling effects like heavy path-tracing or complex volumetrics that would be impossible to run at full resolution across the entire screen. + +In the next chapter, we will shift from software-driven optimization to hardware-driven geometry by looking at **Canted Displays and Asymmetric Frustums**. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/02_fragment_density_control.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..ebec77e0 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/04_incorporating_into_the_engine.adoc @@ -0,0 +1,82 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +Integrating **Variable Rate Shading (VRS)** into our `simple_game_engine` allows us to unlock a massive performance boost by being smarter about how we process fragments in `renderer_resources.cpp`. *This is currently left as an exercise for the reader to implement.* + +== Enabling VRS in the Backend + +In `renderer_core.cpp`, we must enable the VRS features during device creation. + +[source,cpp] +---- +// renderer_core.cpp +bool Renderer::createLogicalDevice(bool enableValidationLayers) { + // ... + vk::PhysicalDeviceVulkan13Features vulkan13Features; + vulkan13Features.dynamicRendering = vk::True; + + // NEW: Enable Fragment Shading Rate (VRS) + vk::PhysicalDeviceFragmentShadingRateFeaturesKHR vrsFeatures; + vrsFeatures.attachmentFragmentShadingRate = vk::True; + vrsFeatures.pipelineFragmentShadingRate = vk::True; + + vulkan13Features.pNext = &vrsFeatures; + // ... +} +---- + +== Dynamic Gaze-Driven VRS + +In `renderer_rendering.cpp`, we update our `Render` loop to bind a shading rate image that highlights the user's current gaze. + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::Render(...) { + // ... 
+ if (xrMode) { + // Query current gaze from OpenXR (Eye Tracking) + glm::vec2 gazeNDC = xrContext.getGazeNDC(); + + // Update the shading rate image (a low-res 8-bit image) + // High density (1x1) at gazeNDC, low density (4x4) at periphery + updateShadingRateImage(gazeNDC); + + // Bind the shading rate attachment to our dynamic rendering info + vk::RenderingFragmentShadingRateAttachmentInfoKHR vrsAttachment{ + .imageView = *vrsImageView, + .imageLayout = vk::ImageLayout::eFragmentShadingRateAttachmentOptimalKHR, + .shadingRateAttachmentTexelSize = vk::Extent2D(16, 16) + }; + + passInfo.pNext = &vrsAttachment; + } + + commandBuffer.beginRendering(passInfo); + // ... +} + +void Renderer::updateShadingRateImage(glm::vec2 gazeNDC) { + // 1. Map NDC (-1 to 1) to our shading rate image texels + // The image is low-res (e.g., 1/16th of the main display) + uint32_t gazeX = static_cast<uint32_t>((gazeNDC.x * 0.5f + 0.5f) * vrsWidth); + uint32_t gazeY = static_cast<uint32_t>((gazeNDC.y * 0.5f + 0.5f) * vrsHeight); + + // 2. Fill the image: 1x1 (full rate) at gaze, 4x4 (low rate) at periphery + // We use a simple distance-based falloff + for (uint32_t y = 0; y < vrsHeight; ++y) { + for (uint32_t x = 0; x < vrsWidth; ++x) { + float dist = glm::distance(glm::vec2(x, y), glm::vec2(gazeX, gazeY)); + vrsData[y * vrsWidth + x] = (dist < foveaRadius) ? 0x0 : 0xA; // 0x0=1x1, 0xA=4x4 + } + } + // 3. Upload to GPU + vrsBuffer.upload(vrsData); +} +---- + +== Why These Changes? + +By integrating eye-tracking data directly into our Vulkan 1.4 rendering pass, we enable **Dynamic Foveated Rendering (DFR)**. This allows the engine to save up to 60% of fragment processing costs in complex PBR scenes by only rendering the tiny area the user is actually looking at in full detail. Because our engine uses dynamic rendering, adding the `vrsAttachment` to our `passInfo` is a simple `pNext` extension away. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/03_gaze_driven_logic.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/01_introduction.adoc new file mode 100644 index 00000000..3185c492 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/01_introduction.adoc @@ -0,0 +1,15 @@ +:pp: {plus}{plus} += Canted Displays and Asymmetric Frustums + +In a standard desktop renderer, we typically assume that the two virtual cameras (the "eyes") are parallel to each other. This is known as a **Parallel Projection** model. However, many modern high-end headsets—such as the Pimax or Valve Index—utilize a **Canted Display** architecture. In these devices, the physical screens and lenses are tilted outward to provide a wider **Field of View (FOV)**. + +This mechanical canting introduces a significant layer of complexity to our rendering pipeline. We can no longer assume that the view matrices are simple lateral offsets of each other. Instead, each eye has its own unique orientation and an **Asymmetric Frustum**—where the focal point is not in the center of the image. + +In this chapter, we will explore how to handle these non-parallel architectures by: +1. **Decomposing the XR Pose**: Understanding how to use `XrView` data to build proper 4x4 matrices that account for the canting angle. +2. **Calculating Asymmetric Frustums**: Using the OpenXR `fov` tangents to build projection matrices where the "center" is shifted to align with the optical axis. +3. **Optimizing with Viewport Swizzling**: Utilizing the **VK_NV_viewport_swizzle** extension (or its Vulkan 1.4 equivalents) to optimize how these non-rectilinear projections are rasterized. 
+ +By mastering these concepts, your engine will be compatible with the entire spectrum of spatial hardware, from mobile VR headsets with parallel screens to ultra-wide FOV enthusiast hardware. + +xref:OpenXR_Vulkan_Spatial_Computing/10_Variable_Rate_Shading/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/02_non_parallel_projections.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/02_non_parallel_projections.adoc b/en/OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/02_non_parallel_projections.adoc new file mode 100644 index 00000000..040bbbd3 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/02_non_parallel_projections.adoc @@ -0,0 +1,52 @@ +:pp: {plus}{plus} += Calculating Asymmetric Frustums for Wide-FOV Lenses + +When you query `xrLocateViews`, the OpenXR runtime returns two pieces of information for each eye: a `pose` (position and orientation) and a `fov` (field of view). While many developers are tempted to simply use a fixed FOV and an eye-offset, this approach will fail on headsets with canted lenses. + +== The Tangent Frustum: Mapping Angles to Pixels + +To understand why OpenXR uses tangents, we must look at how a camera sees the world. A standard field of view is just an angle. But GPUs work with **Clip Space**—a box from -1 to 1. To get from a 3D world to a 2D box, we need the tangents of the FOV angles. + +The `fov` provided by OpenXR is expressed as four angles in radians—`angleLeft`, `angleRight`, `angleUp`, and `angleDown`—whose tangents we take when building the projection matrix. + +* **Symmetric (Desktop)**: Left = -Right, Up = -Down. The optical center is in the middle. +* **Asymmetric (XR)**: Left != -Right. The optical center is shifted to align with the physical lens. 
+ +== Building the Matrix + +To build a proper 4x4 projection matrix from these tangents using `vulkan-hpp`, we use the following logic: + +[source,cpp] +---- +// Constructing an asymmetric projection matrix from OpenXR FOV tangents +glm::mat4 createProjectionMatrix(const XrFovf& fov, float zNear, float zFar) { + const float tanLeft = tan(fov.angleLeft); + const float tanRight = tan(fov.angleRight); + const float tanUp = tan(fov.angleUp); + const float tanDown = tan(fov.angleDown); + + const float tanWidth = tanRight - tanLeft; + const float tanHeight = tanUp - tanDown; + + glm::mat4 projection(0.0f); + projection[0][0] = 2.0f / tanWidth; + projection[1][1] = 2.0f / tanHeight; + + // The "Shift" components: These align the frustum with the lens optical center + projection[2][0] = (tanRight + tanLeft) / tanWidth; + projection[2][1] = (tanUp + tanDown) / tanHeight; + + projection[2][2] = zFar / (zNear - zFar); + projection[2][3] = -1.0f; + projection[3][2] = -(zFar * zNear) / (zFar - zNear); + return projection; +} +---- + +== Canted Displays: The Rotation Problem + +On headsets like the Valve Index or Pimax, the display panels themselves are rotated outward (canted). This means your **View Matrices** (the eye's position and rotation) are also different for each eye. + +When you use **Multiview**, you must push an array of projection matrices and an array of view matrices to your Slang shaders. If you try to use a single "stereo" matrix with a fixed offset, the world will appear warped and uncomfortable because the GPU is drawing a rectilinear world for a non-rectilinear display. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/03_viewport_swizzling.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/03_viewport_swizzling.adoc b/en/OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/03_viewport_swizzling.adoc new file mode 100644 index 00000000..fa0f1bd8 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/03_viewport_swizzling.adoc @@ -0,0 +1,40 @@ +:pp: {plus}{plus} += Optimizing Projections with Viewport Swizzling + +In some wide-FOV headsets, the physical display panel may be mounted at an angle that does not align with the standard Cartesian coordinate system. This is where the **VK_NV_viewport_swizzle** extension (or the modern Vulkan 1.4 equivalents) becomes invaluable. + +== The Swizzle Stage: Re-routing the Pipeline + +To understand viewport swizzling, you must look at the hardware path of a pixel. After the vertex shader finishes, the GPU performs **Perspective Division** and **Viewport Transformation**. Usually, this is fixed: X goes to X, Y goes to Y. + +With **Viewport Swizzling**, you can re-route these components *at the hardware level*. + +* **Why?**: Some headsets use rotated display panels to fit more pixels into a specific FOV. +* **Shader Problem**: Flipping X and Y in a shader is easy, but it happens *before* rasterization, which can break depth testing and hardware culling. +* **Hardware Solution**: Swizzling happens *after* the shader but *before* the pixel is drawn. It allows the hardware to handle rotated or inverted axes with zero performance penalty. 
+ +[source,cpp] +---- +// Configuring a viewport swizzle for non-standard display layouts +vk::PipelineViewportSwizzleStateCreateInfoNV swizzleState{}; +vk::ViewportSwizzleNV leftEyeSwizzle{ + vk::ViewportCoordinateSwizzleNV::ePositiveX, + vk::ViewportCoordinateSwizzleNV::eNegativeY, // Invert vertical axis for hardware-level flip + vk::ViewportCoordinateSwizzleNV::ePositiveZ, + vk::ViewportCoordinateSwizzleNV::ePositiveW +}; +swizzleState.pViewportSwizzles = &leftEyeSwizzle; + +vk::GraphicsPipelineCreateInfo pipelineInfo{}; +pipelineInfo.pNext = &swizzleState; +---- + +== Device Specificity and Portability + +While viewport swizzling is a powerful tool for low-level hardware optimization, it is important to remember that it is often **vendor-specific** (like the `NV` suffix suggests). + +In a modern spatial engine, you should treat swizzling as a "Hardware Driver" feature. Your engine's core logic should remain rectilinear, but your `VulkanContext` can apply these swizzles based on the headset's requirements discovered during the handshake. This keeps your game code clean while still hitting the absolute maximum performance on exotic wide-FOV hardware. + +In the next chapter, we will look at how to scale these spatial techniques to even larger environments using **CAVE Architecture and Multi-GPU Synchronization**. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/02_non_parallel_projections.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..b2b647b0 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/04_incorporating_into_the_engine.adoc @@ -0,0 +1,62 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +The introduction of **Canted Displays** and **Asymmetric Frustums** requires us to move away from the "parallel eyes" assumption in our `simple_game_engine`'s camera system. An example of this implementation can be found in link:../../../attachments/openxr_engine/camera_component.cpp[camera_component.cpp]. + +== Evolving the Camera System + +In `camera.cpp`, we must update how we calculate the projection matrix to handle the asymmetric FOVs provided by OpenXR. 
+ +[source,cpp] +---- +// camera.cpp +glm::mat4 Camera::getAsymmetricProjection(const XrFovf& fov, float nearZ, float farZ) { + float tanLeft = std::tan(fov.angleLeft); + float tanRight = std::tan(fov.angleRight); + float tanUp = std::tan(fov.angleUp); + float tanDown = std::tan(fov.angleDown); + + float tanWidth = tanRight - tanLeft; + float tanHeight = tanUp - tanDown; + + // Build the asymmetric projection matrix + glm::mat4 projection(0.0f); + projection[0][0] = 2.0f / tanWidth; + projection[1][1] = 2.0f / tanHeight; + projection[2][0] = (tanRight + tanLeft) / tanWidth; + projection[2][1] = (tanUp + tanDown) / tanHeight; + projection[2][2] = -farZ / (farZ - nearZ); + projection[2][3] = -1.0f; + projection[3][2] = -(farZ * nearZ) / (farZ - nearZ); + + return projection; +} +---- + +== Handling Canted View Matrices + +In `engine.cpp`, when we update the camera, we must ensure we use the full 6-DOF pose for each eye, not just a simple X-offset. + +[source,cpp] +---- +// engine.cpp +void Engine::Update(XrTime predictedTime) { + // ... + auto [leftView, rightView] = xrContext.locateViews(predictedTime); + + // Instead of a single offset, we pass the full view matrices + // which may include rotation (canting) + activeCamera->setEyeViewMatrix(0, xrPoseToMatrix(leftView.pose)); + activeCamera->setEyeViewMatrix(1, xrPoseToMatrix(rightView.pose)); + + // Set the asymmetric projections we calculated above + activeCamera->setEyeProjectionMatrix(0, getAsymmetricProjection(leftView.fov, ...)); + activeCamera->setEyeProjectionMatrix(1, getAsymmetricProjection(rightView.fov, ...)); +} +---- + +== Why These Changes? + +By supporting canted views and asymmetric frustums, our engine becomes compatible with high-end "enthusiast" headsets. Proper projection matrix calculation is the difference between a world that looks "right" and one that feels slightly distorted or "warped" as you move your head. 
Because we've already integrated the OpenXR handshake, our `xrContext` can provide the exact FOV tangents and poses required for these calculations. + +xref:OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/03_viewport_swizzling.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/01_introduction.adoc new file mode 100644 index 00000000..c9956a5f --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/01_introduction.adoc @@ -0,0 +1,28 @@ +:pp: {plus}{plus} += CAVE Architecture and Multi-GPU Synchronization: Introduction + +While most of our focus so far has been on head-mounted displays (HMDs), spatial computing also encompasses large-scale environments like **CAVEs** (Cave Automatic Virtual Environments). A CAVE is a room where the walls, floor, and sometimes the ceiling are high-resolution projection surfaces. The user typically wears lightweight shutter glasses and is tracked by a system of cameras, creating a shared, immersive 3D space. + +Architecting an engine for a CAVE is fundamentally different from a single-GPU headset. To drive six or more 4K projectors at 120Hz, we often need a **Networked Cluster** of powerful workstations, each equipped with one or more high-end GPUs. This introduces a massive synchronization challenge: every frame must be displayed on every screen at the exact same microsecond to avoid "tearing" at the corners of the room. + +In this chapter, we will dive into the engineering required for room-scale spatial technology: + +1. **Projector-Based Spatial Tech**: Understanding the geometry of a CAVE and how to architect an engine to handle multiple, non-standard projection surfaces. +2. 
**Hardware Synchronization**: Utilizing the **VK_NV_present_barrier** extension and **Swap Groups** to ensure frame-perfect synchronization across networked cluster nodes. +3. **Distributed Rendering**: Designing a data-parallel architecture where each node in the cluster renders its own slice of the 3D world while remaining in lock-step with the master simulation. + +== The Challenge of Room-Scale Spatial Tech + +In an HMD, the screens are physically attached to your head. In a CAVE, the screens are the walls. This means that as you move around the room, the **Asymmetric Frustums** for each projection surface must be recalculated in real-time based on your tracked eye position. + +Furthermore, because the images are being stitched together from multiple projectors, we must handle **Geometric Correction** and **Edge Blending**—concepts we will explore in the next chapter—to ensure the virtual world appears seamless as it wraps around the physical corners of the room. + +== Synchronizing the Cluster + +The "holy grail" of CAVE engineering is **Genlock** (Generator Lock) and **Swaplock**. Genlock ensures that the video signal from every GPU is sent to the projectors at the exact same time. Swaplock ensures that the command to swap the front and back buffers happens simultaneously across the entire cluster. + +Without these hardware-level guarantees, the "immersion" of a CAVE is quickly broken by jitter and visual discontinuities. We will look at how Vulkan 1.4 features and specialized hardware extensions allow us to achieve this level of precision. + +By the end of this chapter, you will understand how to scale your spatial engine from a single-user headset to a multi-node, room-scale immersive environment. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/11_Canted_Displays/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/02_projector_based_spatial_tech.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/02_projector_based_spatial_tech.adoc b/en/OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/02_projector_based_spatial_tech.adoc new file mode 100644 index 00000000..c9933c33 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/02_projector_based_spatial_tech.adoc @@ -0,0 +1,47 @@ +:pp: {plus}{plus} += Architecting for Projector-Based Spatial Environments + +When we transition from a headset to a **CAVE** (Cave Automatic Virtual Environment), the way we define our "cameras" changes fundamentally. In a headset, the cameras are relative to the user's head. In a CAVE, the "cameras" are fixed physical surfaces—the walls—and the user moves *within* them. + +== The Observer Paradox: One Perspective, Many Screens + +To understand CAVE rendering, you must accept a paradox: **The 3D scene is only correct for one person.** + +Because the walls are fixed, if the user moves to the left, the image on the front wall must be skewed to maintain the illusion of depth from their new perspective. This is known as **Off-Axis Projection**. + +1. **Fixed Display**: The projection surface (the wall) never moves. +2. **Moving Eye**: The user's eye (the viewpoint) moves constantly. +3. **Result**: The "Pyramid of Vision" (the frustum) is highly asymmetric and changes its skew every frame based on the user's physical location in the room. + +== Calculating the Off-Axis Frustum + +To calculate this, we treat the physical wall as the "Near Plane" of our camera. We calculate the frustum that passes from the tracked eye through the four corners of the wall. 
+ +[source,cpp] +---- +// A simple representation of a physical projection wall in our engine +struct ProjectionWall { + glm::vec3 bottomLeft; + glm::vec3 bottomRight; + glm::vec3 topLeft; + // We calculate the normal and plane equation from these points +}; + +// The resulting Projection Matrix is built by measuring the distance +// from the eye to the wall's corners in the wall's local coordinate system. +---- + +== The Distributed Rendering Model: Leader and Follower + +Driving a 5-sided CAVE at 4K resolution requires immense GPU power. Most systems use a **Cluster** of computers. + +* **Leader Node**: Runs the main simulation, physics, and input processing. It then broadcasts the "State of the World" (object positions, user pose, system time) to the followers. +* **Follower Nodes**: These nodes perform **Deterministic Simulation**. They receive the packet, update their internal world state to match the leader exactly, and then render only the specific wall or projector they are responsible for. + +== Determinism is Mandatory + +In a cluster, every computer must agree on the state of the world. If a particle system uses a random seed that differs between nodes, one wall will show a particle in a different place than the adjacent wall, breaking the immersion. Every "Random" or "Time-based" element must be synchronized across the network. + +To the user, this cluster behaves as a single, massive GPU. However, this only works if every node renders and displays the exact same frame at the exact same time. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/03_hardware_sync.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/03_hardware_sync.adoc b/en/OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/03_hardware_sync.adoc new file mode 100644 index 00000000..a9d4c3f7 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/03_hardware_sync.adoc @@ -0,0 +1,51 @@ +:pp: {plus}{plus} += Hardware Synchronization: Swap Groups and Barriers + +In a networked cluster driving a **CAVE**, "close enough" is not good enough for synchronization. If Node A (the left wall) swaps its buffers even one millisecond before Node B (the front wall), the user will see a distracting horizontal line or a "misaligned" object across the corner where the two walls meet. This is known as **Cluster Tearing**. + +== Swap Groups: Intra-Machine Sync + +A **Swap Group** synchronizes multiple GPUs or multiple displays connected to a *single* workstation. + +When one GPU in the group is ready to present, the hardware holds the "Swap" signal until all other GPUs in that same machine are also ready. This ensures that the two outputs of a single dual-GPU workstation always update their pixels at the exact same refresh cycle. + +== Swap Barriers: Inter-Machine Sync + +A **Swap Barrier** is the network extension of a swap group. This works across *multiple* workstations. + +1. **Hardware Connection**: A physical cable (often a BNC or RJ45 sync cable) connects the GPUs across the cluster. +2. **The Handshake**: When Node A finishes its frame, it sends a "Ready" signal over the cable. +3. **The Wait**: Node A blocks its execution. +4. **The Release**: Only when *every* node in the cluster has sent its "Ready" signal does the hardware release the barrier, allowing every GPU to swap their buffers simultaneously. 
+ +== Implementing the Barrier in Vulkan + +We use the **VK_NV_present_barrier** extension to join these hardware groups. + +[source,cpp] +---- +// Checking for Present Barrier support and joining a group +auto features2 = physicalDevice.getFeatures2<vk::PhysicalDeviceFeatures2, vk::PhysicalDevicePresentBarrierFeaturesNV>(); +const auto& barrierFeatures = features2.get<vk::PhysicalDevicePresentBarrierFeaturesNV>(); + +if (barrierFeatures.presentBarrier) { + // When creating the swapchain, we opt into the present barrier + // This barrier is managed by the physical sync cable between machines + vk::SwapchainPresentBarrierCreateInfoNV barrierInfo{}; + barrierInfo.presentBarrierEnable = vk::True; + + vk::SwapchainCreateInfoKHR createInfo{}; + createInfo.pNext = &barrierInfo; + // ... +} +---- + +== Genlock vs. Swaplock + +* **Genlock (Generator Lock)**: Syncs the *beginning* of the video signal (the start of the scanline). This prevents "rolling" artifacts. +* **Swaplock (Swap Barrier)**: Syncs the *moment* the front and back buffers are swapped. This prevents "stale frame" artifacts. + +By combining Genlock (signal timing) with Swap Barriers (frame timing), we achieve the "Gold Standard" of spatial synchronization: **Frame-Perfect Cluster Rendering**. The entire room updates as if it were a single piece of digital paper. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/02_projector_based_spatial_tech.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..c5822077 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/04_incorporating_into_the_engine.adoc @@ -0,0 +1,50 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +Scaling our `simple_game_engine` to support **CAVE Architecture** and **Multi-GPU Synchronization** requires a shift to hardware-level barriers in `renderer_rendering.cpp`. *This is currently left as an exercise for the reader to implement.* + +== Joining the Swap Barrier + +In `renderer_rendering.cpp`, we must update our `createSwapChain` logic to support the `VK_NV_present_barrier` extension for synchronized cluster rendering. + +[source,cpp] +---- +// renderer_rendering.cpp +bool Renderer::createSwapChain() { + // ... + // All nodes in the CAVE must join the same hardware-managed barrier + vk::SwapchainPresentBarrierCreateInfoNV barrierInfo{ + .presentBarrierEnable = vk::True + }; + + if (hasPresentBarrierSupport) { + createInfo.pNext = &barrierInfo; + } + + // Create the synchronized swapchain + swapChain = vk::raii::SwapchainKHR(device, createInfo); + // ... +} +---- + +== Synchronized Present Call + +In our `Render` function, the `presentKHR` call now becomes a synchronized event. The GPU will automatically block until all other nodes in the swap group are ready to flip their buffers. + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::Render(...) { + // ... record and submit ... 
+ + // This call now blocks at the hardware level if a Swap Barrier is active + // ensuring no tearing or "seams" between CAVE walls + presentQueue.presentKHR(presentInfo); +} +---- + +== Why These Changes? + +Hardware synchronization is the only way to ensure that the corners of a CAVE room look seamless. Without the `VK_NV_present_barrier`, the "seam" between projectors will always be visible as a line of tearing or stutter. By integrating this into our `createSwapChain` and `Render` logic, we transform the engine into a professional-grade simulation platform capable of driving multi-projector environments. + +xref:OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/03_hardware_sync.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/01_introduction.adoc new file mode 100644 index 00000000..d15acd19 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/01_introduction.adoc @@ -0,0 +1,28 @@ +:pp: {plus}{plus} += Warp and Blend using Vulkan Compute: Introduction + +In an ideal world, every projection surface in a **CAVE** would be perfectly flat, every projector would be perfectly aligned, and the walls would have zero thickness. In the real world, walls have slight curves, projectors are mounted at awkward angles, and corners are never truly 90 degrees. + +To solve these physical imperfections, we use **Warp and Blend** techniques. **Warping** is the process of geometrically distorting the rendered image so that it looks correct when projected onto a non-flat surface. **Blending** (or Edge Blending) is the process of smoothing the brightness in areas where multiple projectors overlap, preventing "hot spots" of double-brightness. 
+ +In this chapter, we will explore how to use the power of **Vulkan Compute** to handle these post-processing tasks with maximum efficiency: + +1. **Geometric Correction**: Using compute shaders to map our 3D rendered scenes onto curved or irregular physical surfaces via a lookup table (**LUT**). +2. **Edge Blending**: Implementing alpha-blending logic in compute to seamlessly join multiple projector outputs. +3. **Lens Distortion Correction**: Utilizing Slang to author high-performance warping shaders that can be shared across both CAVE systems and HMDs. + +== Why Use Compute? + +While warping could be done in a traditional fragment shader by rendering a screen-aligned quad, **Vulkan Compute** offers several advantages for this specific task: + +* **Atomic Precision**: Compute shaders allow us to use atomic operations if we need to build complex histograms for auto-calibration. +* **Shared Memory (LDS)**: We can use **groupshared** memory to perform high-quality filtering (like bicubic interpolation) during the warp, which is significantly higher quality than standard hardware bilinear filtering. +* **Asynchronous Execution**: As we learned in the *Advanced Vulkan Compute* tutorial, we can run these warping kernels on an **Async Compute Queue** while the next frame is already being rendered on the graphics queue. + +== The Calibration Pipeline + +Warping is only as good as the data driving it. Most professional installations use a camera-based calibration system that generates a **Warp Map**—a high-precision texture where each pixel contains the (u, v) coordinates of where that pixel should actually "land" on the physical wall. + +By the end of this chapter, you will be able to take a standard rectilinear render and "warp" it into any physical shape required by the installation, ensuring that the virtual world remains perfectly undistorted to the user's eyes. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/12_CAVE_Architecture/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/02_geometric_correction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/02_geometric_correction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/02_geometric_correction.adoc new file mode 100644 index 00000000..e2833abc --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/02_geometric_correction.adoc @@ -0,0 +1,61 @@ +:pp: {plus}{plus} += Geometric Correction: Mapping 3D Scenes to Physical Surfaces + +The core of **Geometric Correction** is a coordinate transformation. We start with a perfectly rectangular image rendered by our engine (the "Source"), and we need to transform it into a "Warped" image that looks correct when projected onto a physical surface like a curved screen or a corner. + +== The Concept: Reverse Lookup + +To understand warping, you must think in reverse. Instead of asking "Where does this source pixel go?", we ask **"For this pixel on my wall, where should I look in my rendered frame?"** + +1. **Rectilinear Source**: A standard 3D render (straight lines, 90-degree corners). +2. **Physical Destination**: A curved screen or a corner where two walls meet. +3. **The Warp Map**: A Look-Up Table (LUT) that maps every destination pixel to a $(u, v)$ coordinate in the source render. + +Treating correction as a reverse lookup is essential because it ensures that every pixel on the output display is filled exactly once, preventing gaps or "holes" in the final image. + +== Implementing the Warp Map + +The "brain" of this operation is the **Warp Map**. This is typically a `vk::raii::Image` containing floating-point data. 
+ +[source,slang] +---- +// A simple compute-based warping kernel in Slang +RWTexture2D outputTarget; // The final image sent to the projector +Texture2D sourceRender; // The rectilinear frame we rendered +Texture2D warpMap; // The LUT containing (u, v) coordinates + +[numthreads(16, 16, 1)] +void main(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + float2 outputDim; + outputTarget.GetDimensions(outputDim.x, outputDim.y); + + if (dispatchThreadID.x >= outputDim.x || dispatchThreadID.y >= outputDim.y) + return; + + // 1. Look up the source coordinate from the Warp Map + // This tells us: "To draw pixel (x,y) on the wall, look at (u,v) in the render" + float2 sourceUV = warpMap.Load(int3(dispatchThreadID.xy, 0)); + + // 2. Sample the source image using high-quality interpolation + // We use Bilinear or Bicubic filtering to prevent "aliasing" + SamplerState linearSampler; + float4 color = sourceRender.SampleLevel(linearSampler, sourceUV, 0); + + // 3. Write the warped pixel to the output + outputTarget[dispatchThreadID.xy] = color; +} +---- + +== Luminance Summation and Edge Blending + +In a multi-projector setup, the edges of the images overlap. + +* **The Problem**: In the overlap region, you have light from *two* projectors hitting the same spot. This area will be twice as bright as the rest of the wall, creating a distracting "hot spot." +* **The Solution**: **Luminance Edge Blending**. We include an alpha channel in our Warp Map that defines a smooth gradient (1.0 to 0.0) in the overlap region. By multiplying the final color by this alpha, we "fade out" one projector as the other "fades in," maintaining uniform brightness across the entire wall. + +== Precision: Bicubic vs Sinc Filtering + +Because warping involves sampling a texture at non-integer coordinates, the quality of your filtering is paramount. While `SamplerState` with linear filtering is fast, it can lead to "blurring" in areas of high warp. 
High-end spatial engines often implement custom **Bicubic** or **Lanczos** filters in the compute shader to preserve sharpness during the coordinate transformation.
+
+xref:OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/03_post_process_warping.adoc[Next]
diff --git a/en/OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/03_post_process_warping.adoc b/en/OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/03_post_process_warping.adoc
new file mode 100644
index 00000000..2b8b06cd
--- /dev/null
+++ b/en/OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/03_post_process_warping.adoc
@@ -0,0 +1,56 @@
+:pp: {plus}{plus}
+= Post-Process Warping and Lens Distortion
+
+While the previous section focused on external physical surfaces like CAVE walls, **Warping** is also a fundamental part of head-mounted displays (HMDs). In a headset, the lenses themselves introduce significant **Pin-Cushion Distortion**. To counteract this, the engine must apply a **Barrel Distortion** post-process so that the final image looks rectilinear to the user.
+
+== The Physics of the Lens: Why Warp?
+
+To understand HMD warping, you must look at the glass. VR lenses are designed to take a tiny display and magnify it to cover your entire field of view. This magnification is not uniform.
+
+1. **Pin-Cushion Distortion**: The lens physically stretches the image more at the corners than in the center.
+2. **The Solution**: We pre-distort the image in the opposite direction (**Barrel Distortion**). By "squeezing" the corners in our shader, the lens's physical stretch pulls them back into their correct rectilinear positions.
+
+== Mathematical Models of Distortion
+
+Most lens distortion can be modeled using a radial polynomial (like the **Brown-Conrady** model). This model uses coefficients (latexmath:[k_1, k_2, k_3]) to shift the latexmath:[(u, v)] coordinates based on their distance from the optical center.
+ +[source,slang] +---- +// Applying a simple radial distortion model in Slang +float2 ApplyDistortion(float2 uv, float2 k) +{ + float2 center = float2(0.5, 0.5); + float2 d = uv - center; + float r2 = dot(d, d); + + // Barrel distortion: k[0] and k[1] are typically positive + // This shifts pixels towards the center based on their distance + float2 distortedD = d * (1.0 + k.x * r2 + k.y * r2 * r2); + + return center + distortedD; +} +---- + +== Chromatic Aberration: The Rainbow Effect + +Lenses also refract different wavelengths of light at different angles. This causes Red, Green, and Blue light to hit the user's eye at slightly different positions, creating "color fringing" or **Chromatic Aberration**. + +To fix this, we apply a slightly different warp to each color channel. In our Vulkan compute shader, this is as simple as performing three samples instead of one: + +[source,slang] +---- +float4 color; +// We sample the source image three times with different distortion coefficients +color.r = sourceRender.SampleLevel(s, ApplyDistortion(uv, k_red), 0).r; +color.g = sourceRender.SampleLevel(s, ApplyDistortion(uv, k_green), 0).g; +color.b = sourceRender.SampleLevel(s, ApplyDistortion(uv, k_blue), 0).b; +color.a = 1.0; +---- + +== Rendering 1.2x: The Oversampling Tax + +Because warping "squeezes" the image, some areas of your render will be scaled down while others are scaled up. To prevent the center of your view from looking blurry after the warp, you must render your 3D scene at a **Higher Resolution** than the physical display (typically 1.2x to 1.4x the panel resolution). This ensures that even after the distortion is applied, you still have 1:1 pixel mapping in the most critical areas. + +In the next chapter, we will move from pixels to 4D functions by exploring **LightField Theory**. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/02_geometric_correction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..71645434 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/04_incorporating_into_the_engine.adoc @@ -0,0 +1,74 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +The addition of **Warp and Blend** capabilities allows our `simple_game_engine` to handle geometric distortion and multi-projector blending in `renderer_rendering.cpp`. *This is currently left as an exercise for the reader to implement.* + +== Enhancing the Post-Processing Chain + +In `renderer_rendering.cpp`, we introduce a compute-based "Warp" pass that runs after the main scene rendering is complete. + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::Render(...) { + // ... Main Scene Pass (Raster) ... 
+ + // NEW: Post-Process Warp Pass (Compute) + // This kernel maps our rectilinear render target to a warped swapchain image + auto& cmd = commandBuffers[currentFrame]; + + cmd.bindPipeline(vk::PipelineBindPoint::eCompute, *warpPipeline); + + // Bind the Rectilinear Input (Descriptor Set 0) + cmd.bindDescriptorSets(vk::PipelineBindPoint::eCompute, *warpPipelineLayout, 0, *rectilinearSets[currentFrame], nullptr); + + // Bind the Warp Map LUT (Descriptor Set 1) + cmd.bindDescriptorSets(vk::PipelineBindPoint::eCompute, *warpPipelineLayout, 1, *warpMapSets, nullptr); + + // Dispatch the warp compute kernel + uint32_t groupCountX = (swapChainExtent.width + 15) / 16; + uint32_t groupCountY = (swapChainExtent.height + 15) / 16; + cmd.dispatch(groupCountX, groupCountY, 1); +} +---- + +== Slang Warp Shader Integration + +In our `warp.slang` shader, we use the Warp Map LUT to calculate the distorted texture coordinates. + +[source,slang] +---- +// warp.slang +Texture2D inputTexture : register(t0); +Texture2D warpMap : register(t1); // The LUT +RWTexture2D outputSwapchain : register(u0); + +[shader("compute")] +[numthreads(16, 16, 1)] +void computeMain(uint3 threadID : SV_DispatchThreadID) { + // 1. Sample the Warp Map to get the distorted coordinate + float2 distortedUV = warpMap.SampleLevel(sampler, float2(threadID.xy) / screenRes, 0); + + // 2. Sample the main scene render target + float3 color = inputTexture.SampleLevel(sampler, distortedUV, 0); + + // 3. Apply Edge Blending (alpha ramp) if needed + color *= calculateEdgeBlend(threadID.xy); + + outputSwapchain[threadID.xy] = float4(color, 1.0); +} + +// Edge Blending Helper: Multi-projector luminosity matching +float3 calculateEdgeBlend(uint2 pos) { + // Simple 10% overlap ramp for the left-edge of a projector + float overlapWidth = screenRes.x * 0.1f; + float alpha = clamp(float(pos.x) / overlapWidth, 0.0, 1.0); + return float3(alpha, alpha, alpha); +} +---- + +== Why These Changes? 
+ +Proper warping and blending are what make a CAVE room feel like a unified environment. By handling distortion in the engine's compute-based post-processing chain, we can support a wide variety of hardware—from curved screens to custom headsets—without changing our core rasterization logic. Because our engine already has a modular `Renderer`, adding this compute pass is a natural extension of our existing frame loop. + +xref:OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/03_post_process_warping.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/01_introduction.adoc new file mode 100644 index 00000000..6c8a6fe5 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/01_introduction.adoc @@ -0,0 +1,12 @@ +:pp: {plus}{plus} += Storing the Plenoptic Function + +To truly understand the future of spatial computing, we have to move beyond the idea of a simple "camera" looking at a "scene." In traditional rendering, we calculate the color of a pixel based on a single ray from the eye through a point on the image plane. But what if we could capture *all* the light flowing through a volume of space? This is the core of **Plenoptic** theory—from the Latin *plenus* (full) and the Greek *optikos* (of or relating to sight). + +At its most fundamental level, the **Plenoptic Function** is a seven-dimensional (7D) description of every possible light ray in the universe. It describes the intensity of light at any position latexmath:[x, y, z], from any direction latexmath:[\theta, \phi], at any wavelength latexmath:[\lambda], at any point in time latexmath:[t]. While this is a beautiful mathematical construct, rendering or storing a 7D function is computationally impossible for real-time systems. 
+
+In this chapter, we are going to explore how we can simplify this monster into a manageable four-dimensional (4D) representation that we can actually store in Vulkan buffers. By assuming that light travels in straight lines through free space (the "Free Space Assumption"), and focusing on a static moment in time with a fixed set of colors, we can reduce our 7D function into a **4D LightField**.
+
+This 4D LightField is the "Holy Grail" for high-end spatial displays. It allows us to render holographic images where your eyes can naturally focus at different depths (solving the Vergence-Accommodation Conflict, or **VAC**) and see different perspectives as you move your head, all without the need for expensive per-eye re-rendering. We will look at how to define these rays using the classic "Two-Plane Parametrization"—the **ST plane** (spatial position) and the **UV plane** (directional orientation)—and how to map this mathematical grid into a high-performance memory layout for the engine.
+
+xref:OpenXR_Vulkan_Spatial_Computing/13_Warp_and_Blend/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/02_4d_lightfield_representation.adoc[Next]
diff --git a/en/OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/02_4d_lightfield_representation.adoc b/en/OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/02_4d_lightfield_representation.adoc
new file mode 100644
index 00000000..36e380ff
--- /dev/null
+++ b/en/OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/02_4d_lightfield_representation.adoc
@@ -0,0 +1,65 @@
+:pp: {plus}{plus}
+= 4D LightField Representation
+
+When we talk about representing a lightfield in the engine, we need a mathematical structure that maps a 4D light ray into a linear memory address.
+
+== The 7D Plenoptic Function: The Starting Point
+
+To understand lightfields, we must first look at the **Plenoptic Function** (from the Latin *plenus* meaning full and the Greek *optikos* meaning vision).
This is a 7D function latexmath:[P(x, y, z, \theta, \phi, \lambda, t)] that describes every photon in the universe: + +* latexmath:[(x, y, z)]: Position in space. +* latexmath:[(\theta, \phi)]: Direction of travel. +* latexmath:[\lambda]: Wavelength (color). +* latexmath:[t]: Time. + +== Reduction to 4D: The Light Box + +Storing 7 dimensions is impossible. To make this practical for a GPU, we make three assumptions: + +1. **Static Scene**: We ignore time (latexmath:[t]). +2. **RGB Color**: We reduce wavelength (latexmath:[\lambda]) to three channels. +3. **Empty Space**: We assume light doesn't change color as it travels through air. This means we only need to know where a ray enters and leaves a "Light Box." + +This leaves us with **4 Dimensions**: two for the entry point (**ST plane**) and two for the entry direction (**UV plane**). This is the **Two-Plane Parametrization**. + +== Mapping to Vulkan Memory + +In Vulkan, we represent this 4D structure as a massive `vk::raii::Buffer`. Because the dataset is so large, we use `RWStructuredBuffer` in Slang to handle the indexing. + +[source,cpp] +---- +// Initializing a 4D LightField buffer using RAII +vk::BufferCreateInfo bufferInfo{ + .size = spatialRes * directionalRes * sizeof(glm::vec4), + .usage = vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eShaderDeviceAddress, + .sharingMode = vk::SharingMode::eExclusive +}; + +vk::raii::Buffer lightFieldBuffer(device, bufferInfo); +---- + +== Indexing the 4th Dimension + +In our Slang shaders, we treat this linear memory as a 4D structure. Slang's ability to handle custom indexing makes it easy to calculate the 1D index from our 4D coordinates. 
+ +[source,slang] +---- +struct LightFieldRay { + float4 color; +}; + +RWStructuredBuffer lightFieldData; + +// Mapping 4D coordinates to a linear index +// st: entry point on the spatial plane +// uv: entry point on the directional plane +uint getLinearIndex(uint2 st, uint2 uv, uint2 stRes, uint2 uvRes) { + return st.x + st.y * stRes.x + + uv.x * (stRes.x * stRes.y) + + uv.y * (stRes.x * stRes.y * uvRes.x); +} +---- + +By using this representation, the engine can treat a lightfield as a simple array of "light fragments," making it compatible with existing spatial pipelines while providing the foundation for complex plenoptic synthesis. + +xref:OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/03_high_density_view_arrays.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/03_high_density_view_arrays.adoc b/en/OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/03_high_density_view_arrays.adoc new file mode 100644 index 00000000..02593106 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/03_high_density_view_arrays.adoc @@ -0,0 +1,61 @@ +:pp: {plus}{plus} += High-Density View Arrays + +Managing a 4D lightfield poses a significant memory challenge. Even at modest resolutions, the total number of color samples can quickly reach gigabytes. + +== The Gigabyte Problem + +Consider a modest lightfield: +* **Spatial**: latexmath:[1024 \times 1024] entry points. +* **Directional**: latexmath:[32 \times 32] angles. +* **Data**: latexmath:[1024 \times 1024 \times 32 \times 32 \approx 1 \text{ billion rays}]. + +At 16 bytes per ray (RGBA FP32), that is **16 GB of VRAM**. This exceeds the capacity of many consumer GPUs. To make this work, we must optimize both storage and access. + +== FP16 and Quantization: Bandwidth Savings + +The first step is reducing the precision of each ray. 
Because lightfields often exhibit high directional redundancy (adjacent rays are very similar), we can compress the color data. + +In Slang, we can use `half4` instead of `float4`. This immediately halves the VRAM footprint to **8 GB**. + +== Buffer Device Address (BDA): Pointer-Based Access + +With such large datasets, the overhead of standard Vulkan Descriptor Sets can become a bottleneck. By using **Buffer Device Address**, we can pass a raw 64-bit pointer to our lightfield data directly into our shaders. + +[source,cpp] +---- +// Using BDA to get a raw pointer to a high-density buffer +vk::BufferDeviceAddressInfo addressInfo{ + .buffer = *lightFieldBuffer +}; + +uint64_t bufferPointer = device.getBufferAddress(addressInfo); +---- + +This is particularly useful for **N-View** systems where we might be accessing dozens of different view layers in a single pass. We can simply pass the pointer once and calculate offsets mathematically. + +== Cache Locality: Spatial-Directional Tiling + +A standard 4D array layout is terrible for GPU caches. If the shader samples a ray and then its neighbor, the neighbor might be megabytes away in linear memory, causing a "Cache Miss." + +To fix this, we implement **4D Tiling**. We reorganize the data so that a latexmath:[4 \times 4 \times 4 \times 4] block of spatial and directional rays are stored contiguously. This ensures that when a Compute Unit (CU) fetches one sample, the neighboring rays are already in the L1/L2 cache. 
+ +[source,slang] +---- +// Tiled 4D indexing example in Slang +uint getTiledIndex(uint2 st, uint2 uv) { + uint2 tileSize = uint2(8, 8); + uint2 tileCoord = st / tileSize; + uint2 localCoord = st % tileSize; + + // Reorganizing memory to keep spatial neighbors together + return (tileCoord.x + tileCoord.y * stResTiles) * (tileSize.x * tileSize.y) + + (localCoord.x + localCoord.y * tileSize.x); +} +---- + +By combining data compression, raw pointer flexibility via BDA, and tiled memory layouts, we can manage massive lightfield datasets that would otherwise be impossible to store. + +In the next chapter, we will see how to turn these rays back into images using **Plenoptic Synthesis**. + +xref:OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/02_4d_lightfield_representation.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..51ed0f0c --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/04_incorporating_into_the_engine.adoc @@ -0,0 +1,61 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +The introduction of **LightField Theory** and plenoptic data structures requires our `simple_game_engine` to handle high-density memory arrays using **Buffer Device Address (BDA)** in `renderer_core.cpp`. *This is currently left as an exercise for the reader to implement.* + +== Enabling BDA in the Backend + +In `renderer_core.cpp`, we must enable the BDA feature during logical device creation. This allows us to treat GPU buffers as raw 64-bit pointers in our Slang shaders. + +[source,cpp] +---- +// renderer_core.cpp +bool Renderer::createLogicalDevice(bool enableValidationLayers) { + // ... 
+    vk::PhysicalDeviceVulkan12Features vulkan12Features;
+    vulkan12Features.bufferDeviceAddress = vk::True;
+
+    // NEW: Also enable Vulkan 1.3 features for easier memory management
+    vk::PhysicalDeviceVulkan13Features vulkan13Features;
+    vulkan13Features.dynamicRendering = vk::True;
+    vulkan13Features.pNext = &vulkan12Features;
+    // ...
+}
+----
+
+== Mapping 4D LightField Buffers
+
+In `renderer_resources.cpp`, we create a high-density buffer for our ST/UV planes. Instead of a standard descriptor set, we retrieve the raw device address.
+
+[source,cpp]
+----
+// renderer_resources.cpp
+// Supporting Struct: Push Constants for BDA
+struct LightFieldPushConstants {
+    uint64_t lightFieldDataAddress;
+    float4 uvScaling;
+    // ...
+};
+
+void Renderer::createLightFieldBuffer(size_t totalBytes) {
+    // 1. Create the buffer with BDA usage
+    vk::BufferCreateInfo createInfo{
+        .size = totalBytes,
+        .usage = vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eShaderDeviceAddress
+    };
+    lightFieldBuffer = vk::raii::Buffer(device, createInfo);
+
+    // 2. Retrieve the raw 64-bit address
+    vk::BufferDeviceAddressInfo addressInfo{ .buffer = *lightFieldBuffer };
+    uint64_t gpuPointer = device.getBufferAddress(addressInfo);
+
+    // 3. Pass this pointer to our Slang shader via Push Constants
+    pushConstants.lightFieldDataAddress = gpuPointer;
+}
+----
+
+== Why These Changes?
+
+By moving from traditional descriptor sets to BDA for lightfield data, we avoid the overhead of binding thousands of individual view textures. Our engine can now manage massive plenoptic datasets (often multi-gigabyte) as a single contiguous block of memory. This ensures maximum cache locality during the synthesis pass, which is critical for maintaining spatial immersion.
+ +xref:OpenXR_Vulkan_Spatial_Computing/14_LightField_Theory/03_high_density_view_arrays.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/01_introduction.adoc new file mode 100644 index 00000000..ae7b7058 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/01_introduction.adoc @@ -0,0 +1,12 @@ +:pp: {plus}{plus} += Generating Views from Sparse Data + +In the previous chapter, we looked at how to represent and store a 4D lightfield in Vulkan. But a static buffer of rays is only useful if we can reconstruct a perspective from it for the viewer's current eye position. This process—generating a new image from a discrete set of samples—is what we call **Plenoptic Synthesis**. + +The challenge of synthesis is that we will rarely have a direct ray that exactly matches the one we need for a given pixel. Instead, we have to perform high-speed interpolation between our sparse samples. This is essentially a specialized form of **Image-Based Rendering (IBR)**, where we are warping and blending existing data rather than rasterizing 3D geometry from scratch. + +In this chapter, we will explore the two primary paths for plenoptic synthesis: the **Raster path** and the **Ray Traced path**. The raster path uses specialized fragment shaders to quickly blend samples together, making it ideal for low-power mobile headsets. The ray traced path, on the other hand, utilizes Vulkan's hardware acceleration to trace primary rays directly through virtual microlens arrays, providing much higher fidelity and more complex optical effects. + +We'll look at how the engine coordinates these two paths using compute and fragment shaders and how to use Slang's productivity features to author complex synthesis logic without getting lost in the 4D math. 
+
+xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/02_synthesis_shaders.adoc[Next]
diff --git a/en/OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/02_synthesis_shaders.adoc b/en/OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/02_synthesis_shaders.adoc
new file mode 100644
index 00000000..2a3e955a
--- /dev/null
+++ b/en/OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/02_synthesis_shaders.adoc
@@ -0,0 +1,50 @@
+:pp: {plus}{plus}
+= Specialized Fragment Shaders for Synthesis
+
+To turn a 4D lightfield back into a 2D image, we use a process called **Image-Based Rendering (IBR)**. Instead of drawing triangles, we treat our 4D buffer as a dense "cloud" of light rays and selectively pull the ones that would pass through our virtual camera lens.
+
+== The Concept: Reconstructing the Wavefront
+
+To understand synthesis, imagine your virtual camera is looking into the "Light Box" we defined in the previous chapter.
+
+1. **Ray Tracing**: For every pixel on your screen, you calculate a ray starting at your eye and passing through that pixel.
+2. **Intersection**: You find where that ray hits the **ST plane** (position) and the **UV plane** (direction) of your lightfield.
+3. **Synthesis**: You look up the color of the ray at those coordinates in your 4D buffer.
+
+== Quadrilinear Interpolation: The 16-Sample Lookup
+
+Because our lightfield is stored as a discrete grid, the ray we calculate will rarely hit an exact sample point. We must interpolate. In 4D, this requires **Quadrilinear Interpolation**.
+
+* **Bilinear on ST**: Interpolate between the 4 nearest spatial neighbors.
+* **Bilinear on UV**: Interpolate between the 4 nearest directional neighbors.
+* **Result**: To get a single smooth pixel, you must sample **16 different rays** (4 spatial latexmath:[\times] 4 directional).
+
+[source,slang]
+----
+// Quadrilinear interpolation logic in Slang
+float4 sampleLightField(float2 st, float2 uv) {
+    // 1. Calculate floor and fractional parts for both planes
+    uint2 st0 = uint2(floor(st));
+    uint2 uv0 = uint2(floor(uv));
+    float2 stF = frac(st);
+    float2 uvF = frac(uv);
+
+    // 2. Perform ST-plane interpolation at the surrounding UV coordinates
+    // fetch() uses the linear index calculation we learned in the last chapter
+    float4 c00 = lerp(lerp(fetch(st0, uv0), fetch(st0 + uint2(1, 0), uv0), stF.x),
+                      lerp(fetch(st0 + uint2(0, 1), uv0), fetch(st0 + uint2(1, 1), uv0), stF.x),
+                      stF.y);
+    // ... repeat for the other 3 UV neighbors (c10, c01, c11) ...
+
+    // 3. Perform the final UV-plane interpolation
+    return lerp(lerp(c00, c10, uvF.x), lerp(c01, c11, uvF.x), uvF.y);
+}
+----
+
+== Performance: Coalescing the Rays
+
+Because we are doing 16 lookups per pixel, we must be extremely mindful of **SIMT Coalescing**. If we are not careful with our memory layout, these 16 lookups will scatter across different DRAM pages, causing a "Memory Wall" where the GPU sits idle while waiting for data.
+
+By using the **4D Tiling** we implemented in Chapter 14, we ensure that these 16 samples are geographically close in physical VRAM. This allows the GPU to satisfy all 16 requests with just one or two cache line fetches, keeping our plenoptic synthesis running at high frame rates.
+ +xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/03_ray_traced_synthesis.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/03_ray_traced_synthesis.adoc b/en/OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/03_ray_traced_synthesis.adoc new file mode 100644 index 00000000..bb83e57c --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/03_ray_traced_synthesis.adoc @@ -0,0 +1,49 @@ +:pp: {plus}{plus} += Ray Traced Synthesis and Microlens Arrays + +While the raster path for plenoptic synthesis is fast, it's often an approximation. For high-fidelity holographic displays, we want to simulate the exact physical optics of a display—particularly how rays travel through a **Microlens Array** (MLA). This is where **Vulkan Ray Tracing** (VRT) becomes the engine's primary tool for synthesis. + +== Physical Optics: The Microlens Array (MLA) + +To understand ray-traced synthesis, you must understand how lightfield cameras work. They use a **Microlens Array**—a grid of thousands of tiny lenses placed in front of the sensor. Each microlens captures a small "patch" of the scene from multiple angles. + +In our ray tracing pipeline, we reverse this process: + +1. **Primary Ray**: We trace a ray from the virtual eye to the screen. +2. **Microlens Intersection**: We simulate the ray passing through a virtual microlens. +3. **Refraction**: We calculate how the lens bends the ray based on its physical properties (focal length, curvature). +4. **LightField Lookup**: The refracted ray now points to a specific 4D coordinate in our plenoptic buffer. + +== Implementing the Virtual Lens in Slang + +Using the `RaytracingAccelerationStructure` in Vulkan, we can simulate complex lenses that a raster shader cannot. 
+ +[source,slang] +---- +// RayGen shader in Slang for plenoptic synthesis +[shader("raygeneration")] +void rayGenMain() { + uint2 pixelID = DispatchRaysIndex().xy; + float2 uv = float2(pixelID) / float2(DispatchRaysDimensions().xy); + + // 1. Trace a ray through the virtual microlens optics + // This function calculates the refraction based on the MLA geometry + RayDesc ray = generateRayThroughLens(uv); + + // 2. Intersect the ray with our 4D LightField "box" (the ST/UV planes) + float4 color = intersectLightField(ray, lightFieldData); + + // 3. Write the final result to the OpenXR swapchain image + outputImage[pixelID] = color; +} +---- + +== Why Ray Trace? Physical Fidelity + +Ray tracing allows us to simulate **Vergence-Accommodation Conflict (VAC)** solutions. By adjusting the virtual focal length of our microlens array in real-time, we can change the "Plane of Focus" of the holographic image. This allows the user's eyes to naturally focus on near or far virtual objects, eliminating the eye strain common in standard fixed-focus headsets. + +Furthermore, ray tracing allows us to handle **Non-Uniform LightFields**. If our 4D data is stored in a sparse tree (like the Octrees we covered in the Compute tutorial), ray tracing can traverse those structures much more efficiently than a series of nested loops in a fragment shader. + +By combining the 4D theory of LightFields with the physical accuracy of Vulkan Ray Tracing, we create a spatial rendering pipeline that can handle the most advanced holographic displays on the market. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/02_synthesis_shaders.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..1a04f40c --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/04_incorporating_into_the_engine.adoc @@ -0,0 +1,75 @@ +:pp: {plus}{plus} += Incorporating into the Engine + +Integrating **Plenoptic Synthesis** into our `simple_game_engine` requires a flexible rendering architecture that can switch between traditional rasterization and advanced ray tracing in `renderer_rendering.cpp`. *This is currently left as an exercise for the reader to implement.* + +== Implementing the Raster Synthesis Path + +In `renderer_rendering.cpp`, we treat raster synthesis as a full-screen fragment pass. We sample the lightfield buffer using the quadrilinear interpolation logic we built. + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::RenderSynthesisRaster(...) { + // ... + cmd.bindPipeline(vk::PipelineBindPoint::eGraphics, *rasterSynthesisPipeline); + + // Bind the LightField buffer (using BDA address from push constants) + cmd.pushConstants(*pipelineLayout, + vk::ShaderStageFlagBits::eFragment, 0, synthesisPushConstants); + + // Draw full-screen triangle to trigger fragment synthesis + cmd.draw(3, 1, 0, 0); +} +---- + +== Implementing the Ray Traced Path + +If the engine detects `VK_KHR_ray_tracing_pipeline` support, we can use the more accurate Ray Traced synthesis path. + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::RenderSynthesisRayTraced(...) { + // ... 
+ cmd.bindPipeline(vk::PipelineBindPoint::eRayTracingKHR, *rtSynthesisPipeline); + + // Dispatch rays through our virtual microlens array + // One ray per pixel of the high-res spatial display + cmd.traceRaysKHR(sbtRayGen, sbtMiss, sbtHit, sbtCallable, + swapChainExtent.width, swapChainExtent.height, 1); +} +---- + +== Sharing Logic via Slang Modules + +We use Slang to ensure that the core interpolation logic is identical across both the raster and ray traced paths. + +[source,slang] +---- +// synthesis_utils.slang +// Common logic for both paths +float3 samplePlenopticFunction(uint64_t bufferPtr, float4 ray) { + // Shared quadrilinear interpolation math + // ... +} + +// fragment_synthesis.slang +[shader("pixel")] +float4 fragmentMain(V2P input) : SV_Target { + return float4(samplePlenopticFunction(pushConstants.ptr, input.ray), 1.0); +} + +// raygen_synthesis.slang +[shader("raygeneration")] +void raygenMain() { + float4 ray = calculateRayFromMicrolens(DispatchRaysIndex().xy); + outputImage[DispatchRaysIndex().xy] = samplePlenopticFunction(pushConstants.ptr, ray); +} +---- + +== Why These Changes? + +By providing dual paths in our `Renderer`, we allow the engine to scale from low-power mobile AR glasses (using the raster path) to high-end holographic workstation displays (using ray traced paths). Because our engine already has a modular `Render` structure and utilizes Slang, sharing the complex 4D interpolation logic between these paths ensures mathematical consistency and visual fidelity. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/03_ray_traced_synthesis.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/01_introduction.adoc new file mode 100644 index 00000000..5d8343a9 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/01_introduction.adoc @@ -0,0 +1,10 @@ +:pp: {plus}{plus} += Scene Understanding: Data Ingestion and Semantic Labels + +In previous chapters, we focused on rendering virtual objects into the real world. But true spatial computing is a two-way street. To create convincing interactions—like virtual balls bouncing off your actual coffee table or virtual characters hiding behind your real couch—our engine needs to "see" and "understand" the physical environment. + +This is the domain of **Scene Understanding**. In this chapter, we will explore how OpenXR runtimes use sensors like **LiDAR** (Light Detection and Ranging) and depth cameras to build a dynamic 3D model of the user's room. We'll learn how to ingest this data as **Spatial Meshes**, how to use **Semantic Labels** to distinguish a "Floor" from a "Human," and how to achieve ultra-low latency via **Zero-Copy Hand-off** directly into our Vulkan compute pipeline. + +By the end of this chapter, you'll understand how to turn the messy physical world into structured geometry that our engine can treat just like any other 3D asset. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/15_Plenoptic_Synthesis/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/02_environmental_ingestion.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/02_environmental_ingestion.adoc b/en/OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/02_environmental_ingestion.adoc new file mode 100644 index 00000000..79f4a194 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/02_environmental_ingestion.adoc @@ -0,0 +1,43 @@ +:pp: {plus}{plus} += Environmental Ingestion: Scanning the Digital Twin + +To make our virtual objects interact with the physical world, our engine must first capture that world's geometry. This process is known as **Environmental Ingestion** or **Scene Understanding**. Using OpenXR extensions like `XR_MSFT_scene_understanding` or the cross-vendor **Scene Mesh** functionality, we can bridge the gap between photons and triangles. + +== The Concept: The Digital Twin + +Scene understanding turns the messy physical world into a structured **Digital Twin**. The XR runtime uses the headset's cameras and depth sensors (like **LiDAR**) to build this twin in real-time. This isn't a static model; it is a dynamic, evolving representation of the user's environment. + +1. **Spatial Meshing**: The runtime generates a 3D triangle mesh of all visible surfaces in the room. This mesh is updated as the user moves and looks around. +2. **Semantic Labels**: The runtime categorizes parts of that mesh. Instead of just "triangles," the engine knows: "This is a Floor," "This is a Table," or "This is a Human." +3. **Plane Detection**: For simpler logic, the runtime identifies large flat surfaces, which are much cheaper to process than full 3D meshes for tasks like placing a virtual lamp. + +== Accessing Meshes in Vulkan + +When we request a scene mesh from OpenXR, we aren't just getting a one-time blob of data. 
We are subscribing to a stream of updates. In our engine, we handle this by creating dynamic `vk::raii::Buffer` objects that we update whenever the runtime signals that the environment has changed.
+
+Crucially, because this data is generated by the headset's sensors, we often receive it in specialized formats. OpenXR allows us to query these buffers directly, but we must be careful with our synchronization—using **Timeline Semaphores** to ensure the GPU doesn't try to render a spatial mesh while the XR runtime is still updating its vertices.
+
+[source,cpp]
+----
+// Querying the scene mesh from OpenXR with vulkan-hpp
+XrSceneMeshBuffersMSFT meshBuffers{XR_TYPE_SCENE_MESH_BUFFERS_MSFT};
+meshBuffers.vertexCapacityInput = maxVertices;
+meshBuffers.vertexBuffer = reinterpret_cast<XrVector3f*>(*vertexBufferMemory);
+meshBuffers.indexCapacityInput = maxIndices;
+meshBuffers.indexBuffer = reinterpret_cast<uint32_t*>(*indexBufferMemory);
+
+// The call that populates our Vulkan-mapped memory
+xrGetSceneMeshBuffersMSFT(sceneHandle, meshId, &meshBuffers);
+----
+
+== Semantic Labels: Meaningful Interaction
+
+Semantic labels are the "Intelligence Layer" of scene understanding. Instead of just seeing geometry, our engine can ask: "Give me all surfaces labeled as **FLOOR**." This allows us to implement high-level spatial logic:
+
+* **Physics Interaction**: Virtual objects should bounce off tables but pass through "Air" (regions the sensor hasn't scanned yet).
+* **AI Navigation**: Characters should only walk on surfaces labeled as "Floor."
+* **Contextual Occlusion**: If a mesh is labeled as "Wall," we can prioritize it for occlusion tests, ensuring that virtual windows don't accidentally render "inside" your physical walls.
+
+By ingesting these meshes directly into our Vulkan buffers, we enable real-time interaction between our shaders and the user's physical room. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/03_zero_copy_hand_off.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/03_zero_copy_hand_off.adoc b/en/OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/03_zero_copy_hand_off.adoc new file mode 100644 index 00000000..d4e4c0a2 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/03_zero_copy_hand_off.adoc @@ -0,0 +1,45 @@ +:pp: {plus}{plus} += Zero-Copy Hand-off for Sensor Data + +To achieve real-time scene understanding, we cannot afford to process high-resolution sensor data on the CPU. Moving depth maps or raw camera feeds from the headset's hardware into system RAM, processing them with the CPU, and then sending them back to the GPU is simply too slow for Mixed Reality (**MR**). We need a **Zero-Copy Hand-off**. + +== The Concept: Direct Memory Access (DMA) + +In a zero-copy hand-off, the OpenXR runtime captures sensor data directly into a GPU buffer that our Vulkan engine can see. This uses **DMA** (Direct Memory Access) to bypass the host processor entirely. + +1. **VRAM Allocation**: The XR runtime or the OS driver allocates a region of **VRAM** (Video RAM) for the sensor data. +2. **External Memory Handle**: OpenXR provides a handle to this memory using extensions like `VK_KHR_external_memory`. +3. **Engine Import**: Our engine imports this handle, wrapping it in a `vk::raii::Image` or `vk::raii::Buffer`. +4. **Compute Access**: Our compute shaders (authored in **Slang**) process this raw data directly in-place. + +== Implementation: Piping Data to Compute + +By bypassing the CPU, we reduce latency and free up cycles for the main game simulation. In our engine, we use this for **Sensor Fusion**—the process of combining depth data with color camera feeds to create high-fidelity, textured spatial meshes. 
+
+When importing these buffers, we must ensure our Vulkan memory types match the requirements of the external handle. This typically requires matching the **LUID** (Locally Unique Identifier) of the device that created the memory.
+
+[source,cpp]
+----
+// Importing external sensor memory into Vulkan RAII
+vk::ImportMemoryFdInfoKHR importInfo{
+    .handleType = vk::ExternalMemoryHandleTypeFlagBits::eOpaqueFd, // Linux/Android (use vk::ImportMemoryWin32HandleInfoKHR on Windows)
+    .fd = sensorMemoryHandle
+};
+
+vk::MemoryAllocateInfo allocInfo{
+    .pNext = &importInfo,
+    .allocationSize = sensorDataSize,
+    .memoryTypeIndex = findSensorCompatibleMemoryType()
+};
+
+// RAII handle automatically manages the lifetime of the imported memory
+vk::raii::DeviceMemory importedMemory(device, allocInfo);
+----
+
+== Why Zero-Copy is Critical for MR
+
+In Mixed Reality (MR), users are often viewing the real world through **Passthrough**—a digital video feed of their surroundings. If there is even a 20ms delay between the camera capturing the room and the GPU displaying it, the user will feel a nauseating disconnect between their physical movement and the visual feed. This is known as **Motion-to-Photon Latency**.
+
+Zero-copy hand-offs ensure that sensor data moves at the speed of the GPU's internal bus, keeping the latency below the threshold of human perception and maintaining the illusion that the virtual and physical worlds occupy the same space. 
+
+xref:OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/02_environmental_ingestion.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/04_incorporating_into_the_engine.adoc[Next] 
diff --git a/en/OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/04_incorporating_into_the_engine.adoc
new file mode 100644
index 00000000..39c0807b
--- /dev/null
+++ b/en/OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/04_incorporating_into_the_engine.adoc
@@ -0,0 +1,82 @@
+:pp: {plus}{plus}
+= Incorporating Scene Understanding into the Engine
+
+Integrating **Scene Understanding** allows our `simple_game_engine` to treat the physical room as dynamic geometry. We achieve this by bridging OpenXR spatial data with our engine's `Entity` and `MeshComponent` systems. An example of this implementation can be found in link:../../../attachments/openxr_engine/xr_context.cpp[xr_context.cpp].
+
+== Tracking Spatial Mesh Lifecycle
+
+In `engine.cpp`, within the `Engine::update()` loop, we add a synchronization step that queries OpenXR for updated spatial meshes. This process ensures that virtual objects interact with the room as it is scanned.
+
+We create a dedicated method, `Engine::updateSpatialMeshes()`, which iterates through the runtime's mesh stream.
+
+[source,cpp]
+----
+// engine.cpp
+// Supporting Struct: Spatial Mesh Bridge
+struct XrSpatialMesh {
+    XrUuidMSFT meshGuid;
+    std::vector<XrVector3f> vertices;
+    std::vector<uint32_t> indices;
+    glm::mat4 transform;
+};
+
+void Engine::updateSpatialMeshes() {
+    if (!xrMode) return;
+
+    // 1. Retrieve current spatial meshes from the OpenXR context
+    auto xrMeshes = xrContext.getLatestSpatialMeshes();
+
+    for (const auto& xrMesh : xrMeshes) {
+        // 2. 
Map the XR mesh ID to an Engine Entity
+        // We use an internal map: std::unordered_map<XrUuidMSFT, Entity*>
+        Entity* spatialEntity = getOrCreateSpatialEntity(xrMesh.meshGuid);
+
+        auto* meshComp = spatialEntity->getComponent<MeshComponent>();
+        auto* transform = spatialEntity->getComponent<TransformComponent>();
+
+        // 3. Update the GPU resources via our Renderer
+        // This transfers raw XrVector3f data into our vk::raii::Buffer
+        renderer->updateDynamicMesh(
+            meshComp->getBuffer(),
+            xrMesh.vertices,
+            xrMesh.indices
+        );
+
+        // 4. Update the transform to match the physical room coordinates
+        // OpenXR spatial data is already provided as a 4x4 matrix
+        transform->setLocalMatrix(xrMesh.transform);
+    }
+}
+----
+
+== Applying Semantic Intelligence
+
+In `mesh_component.h`, we extend the component to store a **Semantic Label**. This allows our physics and AI systems to query whether a specific piece of geometry represents a "Floor," "Table," or "Wall."
+
+When the engine detects a new mesh, we assign its properties in `engine.cpp`:
+
+[source,cpp]
+----
+// engine.cpp
+void Engine::configureSpatialEntity(Entity* entity, XrSceneComponentTypeMSFT type) {
+    auto* meshComp = entity->getComponent<MeshComponent>();
+
+    if (type == XR_SCENE_COMPONENT_TYPE_PLANE_MSFT) {
+        // High-level plane: Perfect for floor-based AI navigation
+        meshComp->setSemanticTag("Floor");
+        entity->addComponent(Physics::Static);
+    } else if (type == XR_SCENE_COMPONENT_TYPE_OBJECT_MSFT) {
+        // Physical object: Enable shadows and complex physics
+        meshComp->setSemanticTag("Prop");
+        entity->addComponent(Physics::Kinematic);
+    }
+}
+----
+
+== Architectural Win: Unified Pipelines
+
+By mapping spatial data into our existing `Entity` system, we don't need a "special" renderer for the real world. Our **PBR** (Physically Based Rendering) pipeline in `renderer_rendering.cpp` simply sees these spatial entities as standard geometry. They receive virtual light, cast virtual shadows, and provide depth information for our transparency passes automatically. 
+ +This "Unified Geometry" approach is the foundation of believable Mixed Reality, allowing us to treat the physical room and our virtual assets as a single, coherent scene. + +xref:OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/03_zero_copy_hand_off.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/01_introduction.adoc new file mode 100644 index 00000000..76a55ddf --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/01_introduction.adoc @@ -0,0 +1,10 @@ +:pp: {plus}{plus} += ML Inference with Cooperative Matrices + +As we've seen in the previous chapter, the sensors on a modern XR headset provide a flood of raw physical data. But capturing triangles is only half the battle. To create truly intelligent spatial applications, we need to "understand" that data in real-time. + +Is that moving cluster of vertices a user's hand? Is it a cat jumping on the sofa? To answer these questions without stalling our rendering pipeline, we turn to **ML Inference** (Machine Learning). In this chapter, we'll explore how to leverage the **Cooperative Matrices** in Vulkan 1.4 to perform high-speed neural network calculations directly on the GPU. + +By moving inference from the CPU to the GPU's specialized matrix hardware (like **Tensor Cores** or **Matrix Accelerators**), we can achieve sub-millisecond latency for tasks like hand-pose refinement, semantic segmentation, and mesh reconstruction. We'll learn how to integrate these ML kernels into our engine's compute pipeline, turning noisy sensor data into clean, interactive inputs. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/16_Scene_Understanding/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/02_on_gpu_inference.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/02_on_gpu_inference.adoc b/en/OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/02_on_gpu_inference.adoc new file mode 100644 index 00000000..0f416e0e --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/02_on_gpu_inference.adoc @@ -0,0 +1,54 @@ +:pp: {plus}{plus} += On-GPU Inference with Cooperative Matrices + +To process the flood of real-world data coming from our zero-copy sensor hand-offs, our engine needs a way to "understand" geometry and motion instantly. Standard compute shaders are excellent for general-purpose parallel tasks, but neural networks rely on heavy matrix multiplication—a task that standard **ALU** (Arithmetic Logic Unit) pipelines can find inefficient. + +To solve this, we use **Cooperative Matrices** (as detailed in our Advanced Compute series), which tap into the GPU's dedicated hardware for high-speed matrix math. + +== The Concept: Specialized ML Hardware + +In a spatial engine, we use ML (Machine Learning) for tasks like **Hand Tracking** and **Object Recognition**. These tasks involve multiplying massive matrices of weights against input sensor data to produce probabilities or coordinates. + +* **Standard Compute**: Each thread works independently on a single scalar or vector operation. To multiply a large matrix, threads must coordinate manually via shared memory, often hitting bandwidth bottlenecks. +* **Cooperative Matrices**: A whole **Subgroup** (32 or 64 threads) works as a single unit to multiply a large matrix tile in one operation. This "cooperation" allows the hardware to bypass traditional cache levels and use high-speed internal paths (like **Tensor Cores** on NVIDIA or **Matrix Accelerators** on AMD). 
+
+== Implementing Hand-Pose Refinement
+
+Consider raw hand-tracking data from a headset. It is often noisy—your virtual fingers might "jitter" as the sensor loses sight of them. We can use a small neural network to **Refine** these poses, using the previous 5 frames of movement to predict the most likely "smooth" position of each joint.
+
+By performing this inference on the GPU, we can use **GEMM** (General Matrix Multiply) operations to process all joint refinements in parallel.
+
+[source,slang]
+----
+// Using Slang's Matrix fragments for cooperative ML inference
+// Note: Requires Vulkan 1.4 with VK_KHR_cooperative_matrix
+[shader("compute")]
+[numthreads(32, 1, 1)] // Subgroup-sized for optimal matrix math
+void performHandRefinement() {
+    // 1. Load raw joint data and neural network weights into fragments
+    // These matrices represent a 16x16 tile of the larger network
+    cooperative_matrix<float, 16, 16> input;
+    cooperative_matrix<float, 16, 16> weights;
+
+    input.load(rawJointBuffer, jointOffset);
+    weights.load(weightBuffer, weightOffset);
+
+    // 2. Perform high-speed Matrix-Multiply-Accumulate (MMA)
+    // The hardware handles the subgroup-wide synchronization automatically
+    auto result = mul(input, weights);
+
+    // 3. Store the "Refined" joint positions directly back to the Action Space buffer
+    result.store(refinedPoseBuffer, poseOffset);
+}
+----
+
+== Why On-GPU? Latency and Power
+
+Performing ML inference on the GPU is critical for spatial computing for two primary reasons:
+
+1. **Latency**: If we sent sensor data to the CPU for ML processing, the finger movement would visibly lag behind the user's physical hand. This breaks the feeling of **Presence**.
+2. **Power Efficiency**: Matrix accelerators are significantly more energy-efficient than general-purpose ALUs for these specific tasks. For mobile headsets, this translates directly to longer sessions and reduced thermal throttling. 
+ +By integrating ML directly into our spatial compute pipeline, we turn raw, noisy sensor data into clean, interactive inputs with the same low-latency performance as our main rendering path. + +xref:OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/03_refining_spatial_data.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/03_refining_spatial_data.adoc b/en/OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/03_refining_spatial_data.adoc new file mode 100644 index 00000000..c3f3a067 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/03_refining_spatial_data.adoc @@ -0,0 +1,51 @@ +:pp: {plus}{plus} += Refining Spatial Data: From Messy to Clean + +The environmental meshes we ingested in Chapter 16 are often "noisy." They contain holes where the LiDAR sensor couldn't reach, jagged edges on curved surfaces, and floating artifacts caused by reflections or moving objects. To turn these into clean, interactive geometry, we use ML-driven refinement. + +== The Concept: Topology Inference + +Standard mesh smoothing algorithms (like Laplacian or Gaussian smoothing) can remove noise, but they are "dumb"—they can't fill large holes or "guess" missing geometry. **Topology Inference** uses a neural network trained on millions of real-world scans to predict the missing parts of a scene. + +1. **Denoising**: The model identifies small floating vertices or "geometry dust" that doesn't belong to any major surface and marks them for removal. +2. **Infilling**: The model predicts the curvature of a surface (like a wall behind a plant) and generates new triangles to bridge gaps in the scan. +3. **Primitive Approximation**: For optimal performance, the model can suggest replacing a complex, high-poly "Table" mesh with a clean, low-poly box primitive that has the same physical bounds. 
+ +== Implementation: ML-Aided Surface Reconstruction + +In our engine, we use a compute shader to feed the raw spatial vertices into our ML model (processed via cooperative matrices). The model outputs a **Saliency Map**—a probability grid that tells us which regions are likely solid, high-confidence surfaces and which are noise. + +We author this reconstruction logic in **Slang**, using structured buffers to manage our dynamic mesh data. + +[source,slang] +---- +// Using ML results to guide mesh reconstruction in Slang +[shader("compute")] +[numthreads(64, 1, 1)] +void reconstructSurfaces(uint3 dtid : SV_DispatchThreadID) { + // 1. Query the ML saliency map for this vertex's region + float surfaceProbability = mlSaliencyMap.Load(dtid.xy); + + if (surfaceProbability > 0.95) { + // 2. High confidence: Snap the vertex to the predicted "clean" plane + float3 refinedPos = predictCleanSurface(dtid.xyz); + updateMeshVertex(dtid.x, refinedPos); + } else if (surfaceProbability < 0.1) { + // 3. Low confidence: This is likely a sensor artifact (e.g., a mirror reflection) + // Mark the vertex as "degenerate" to prevent it from rendering + degenerateMeshVertex(dtid.x); + } +} +---- + +== Interactive Geometry: The Final Goal + +By refining the spatial data, we move from simple "Visual Passthrough" to true **Physical Interaction**. + +* **Pixel-Perfect Occlusion**: A clean, refined wall mesh allows for perfect depth-aware occlusion of virtual objects, preventing them from "leaking" through the edges of physical furniture. +* **Realistic Shadows**: We can cast virtual shadows onto a refined "Table" mesh. Because the mesh is clean, the shadows look crisp and "grounded," making virtual objects feel like they truly exist in the room. +* **Robust Physics**: A refined floor mesh prevents virtual objects from falling through small "cracks" in the raw spatial scan, ensuring a stable simulation. 
+ +By using on-GPU ML to polish our spatial data, we bridge the final gap between the messy physical world and the digital precision required for high-end XR. In the next chapter, we will see how to combine these refined meshes with our rendering to achieve **Semantic Occlusion**. + +xref:OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/02_on_gpu_inference.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..7681b1f7 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/04_incorporating_into_the_engine.adoc @@ -0,0 +1,76 @@ +:pp: {plus}{plus} += Incorporating ML Inference into the Engine + +Integrating **ML Inference** allows our `simple_game_engine` to perform high-level spatial refinement. We implement this by adding a dedicated compute pass to our `Renderer` that utilizes **Cooperative Matrices** for maximum throughput. *This is currently left as an exercise for the reader to implement.* + +== Integrating the Inference Pass + +In `renderer_rendering.cpp`, we introduce a dedicated step within our `Renderer::render()` loop. This pass must occur after we've ingested the latest sensor data but before we use that data for occlusion or physics. + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::executeSpatialInference(vk::raii::CommandBuffer& cmd, uint32_t frameIndex) { + if (!xrMode || !mlEnabled) return; + + // 1. Bind the specialized ML compute pipeline + // This pipeline is configured with VK_KHR_cooperative_matrix enabled + cmd.bindPipeline(vk::PipelineBindPoint::eCompute, *mlInferencePipeline); + + // 2. 
Bind descriptors for weights, raw sensor data, and refined output
+    cmd.bindDescriptorSets(
+        vk::PipelineBindPoint::eCompute,
+        *mlPipelineLayout,
+        0,
+        *mlDescriptorSets[frameIndex],
+        nullptr
+    );
+
+    // 3. Dispatch the ML kernel
+    // We dispatch in subgroup-sized tiles (e.g., 16x16 fragments)
+    uint32_t groupsX = (inferenceWidth + 15) / 16;
+    uint32_t groupsY = (inferenceHeight + 15) / 16;
+    cmd.dispatch(groupsX, groupsY, 1);
+
+    // 4. Memory Barrier: Ensure ML results are visible to subsequent vertex/fragment stages
+    vk::MemoryBarrier2 refinementBarrier{
+        .srcStageMask = vk::PipelineStageFlagBits2::eComputeShader,
+        .srcAccessMask = vk::AccessFlagBits2::eShaderWrite,
+        .dstStageMask = vk::PipelineStageFlagBits2::eVertexAttributeInput |
+                        vk::PipelineStageFlagBits2::eFragmentShader,
+        .dstAccessMask = vk::AccessFlagBits2::eVertexAttributeRead |
+                         vk::AccessFlagBits2::eShaderRead
+    };
+
+    vk::DependencyInfo dependencyInfo{ .memoryBarrierCount = 1, .pMemoryBarriers = &refinementBarrier };
+    cmd.pipelineBarrier2(dependencyInfo);
+}
+----
+
+== Managing ML Weights in the Engine
+
+Neural network weights are essentially large static arrays. In `resource_manager.cpp`, we treat these as a specialized asset type, loading them from disk into a high-performance `vk::raii::Buffer` with the `VK_BUFFER_USAGE_STORAGE_BUFFER_BIT`.
+
+[source,cpp]
+----
+// resource_manager.cpp
+std::shared_ptr<Buffer> ResourceManager::loadMLWeights(const std::string& path) {
+    auto rawWeights = loadBinaryFile(path);
+
+    // Create a staging buffer and then transfer to GPU-local memory
+    // for optimal cooperative matrix loading speed
+    return createGpuLocalBuffer(
+        rawWeights.size(),
+        vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eShaderDeviceAddress,
+        rawWeights.data()
+    );
+}
+----
+
+== Why These Changes? 
+ +By placing the ML inference pass directly within our `Renderer` and using **Vulkan 1.4** synchronization (Synchronization 2), we ensure that our spatial refinement happens at the absolute last microsecond before the data is needed. This minimizes the **Motion-to-Photon** gap. Furthermore, by treating weights as standard engine resources, we can hot-reload different ML models (e.g., switching from a high-performance "Mobile" model to a high-fidelity "Desktop" model) without restarting the engine. + +This integration turns our engine from a simple rasterizer into a context-aware spatial platform capable of understanding and reacting to the user's physical environment. + +xref:OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/03_refining_spatial_data.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/01_introduction.adoc new file mode 100644 index 00000000..ee064ab3 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/01_introduction.adoc @@ -0,0 +1,10 @@ +:pp: {plus}{plus} += Semantic Occlusion: Depth-Aware Real/Virtual Compositing + +In previous chapters, we learned how to ingest the physical world's geometry. But simply having that geometry isn't enough for a convincing experience. If you place a virtual robot on your floor, it should be hidden when your real-world coffee table is between you and the robot. This is the challenge of **Semantic Occlusion**. + +In this chapter, we will explore how to bridge the gap between real and virtual worlds through depth-aware occlusion and **Semantic Segmentation**. 
We'll learn how to use ML models to label reality at a pixel level—distinguishing "Human Hand" from "Table"—and how to use the Vulkan **Stencil Buffer** and **Depth-Aware Compositing** to ensure virtual objects "tuck behind" real-world ones with sub-pixel precision. + +By the end of this chapter, you'll be able to implement stable, flicker-free masking that makes virtual assets feel like they truly occupy the same physical space as the user's hands and furniture. + +xref:OpenXR_Vulkan_Spatial_Computing/17_ML_Inference_Spatial/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/02_ml_driven_segmentation.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/02_ml_driven_segmentation.adoc b/en/OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/02_ml_driven_segmentation.adoc new file mode 100644 index 00000000..7185b3b3 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/02_ml_driven_segmentation.adoc @@ -0,0 +1,51 @@ +:pp: {plus}{plus} += ML-Driven Segmentation: Understanding Reality + +In standard Augmented Reality (AR), virtual objects are often drawn directly on top of the camera feed. This is known as **Naïve Compositing**, and it frequently leads to a jarring "Ghosting Effect"—where a virtual robot appears to float in front of a real table that should logically be occluding it. To fix this, we use **Semantic Segmentation**. + +== The Concept: Pixel-Wise Understanding + +Semantic segmentation is the process of labeling every pixel in the camera feed with a category. Unlike simple object detection, which just places a box around an item, segmentation provides a high-resolution, pixel-perfect understanding of the scene. + +1. **Input**: A raw RGB camera frame captured by the headset. +2. **ML Pass**: A neural network (often a **U-Net** or **Transformer** architecture) processes the frame. +3. 
**Output**: A **Segmentation Mask** where each pixel value corresponds to a category: 0 for Background, 1 for Table, 2 for Human Hand, etc. + +By understanding *what* occupies every pixel in the real world, our engine can make intelligent decisions about *how* to blend virtual content, ensuring that digital objects respect the physical boundaries of the user's room. + +== Implementing the Occlusion Mask + +In our Vulkan engine, we treat the output of this ML pass as a specialized texture: the **Occlusion Mask**. This texture is used in our final composite shader to gate the visibility of virtual pixels. + +Authoring this in **Slang**, we can efficiently sample the mask and perform conditional logic to handle complex real-world occlusion. + +[source,slang] +---- +// A Slang fragment shader utilizing a semantic mask for occlusion +[shader("pixel")] +float4 fragmentMain(V2P input) : SV_Target { + // 1. Sample the virtual scene color + float4 virtualColor = sceneTexture.Sample(input.uv); + + // 2. Sample the ML-generated semantic mask (using integer labels) + uint label = semanticMask.Sample(input.uv).r; + + // 3. Perform Semantic Occlusion + // If the pixel is labeled as "Human Hand", we hide the virtual object + // to allow the user's physical hand to appear "in front" of the digital content. + if (label == SEMANTIC_LABEL_HAND) { + // Transparent output allows the passthrough camera feed to show through + return float4(0, 0, 0, 0); + } + + return virtualColor; +} +---- + +== The Depth Buffer of Reality + +While semantic labels tell us *what* an object is, they don't always provide its exact distance. High-end spatial engines combine these labels with raw depth data from **LiDAR**. This enables **Depth-Aware Semantic Occlusion**, where a virtual robot can hide behind a real chair but still appear in front of a real-world wall further back in the room. 
+ +By utilizing on-GPU inference, we generate these masks with minimal latency, ensuring that as the user moves their hand, the virtual world responds and "tucks behind" it instantly. + +xref:OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/03_per_pixel_masking.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/03_per_pixel_masking.adoc b/en/OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/03_per_pixel_masking.adoc new file mode 100644 index 00000000..927ee640 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/03_per_pixel_masking.adoc @@ -0,0 +1,48 @@ +:pp: {plus}{plus} += Per-Pixel Masking: Stable Compositing + +Even with a perfect semantic mask, compositing virtual objects with the real world is physically complex. The edges where a virtual asset meets a real-world object often appear "jittery" or suffer from "haloing"—a visual artifact where the background camera feed bleeds into the virtual foreground. To solve this, we use stable **Per-Pixel Masking** and high-fidelity compositing. + +== The Concept: The Compositing Equation + +In standard rendering, we use simple alpha blending to combine images. In Mixed Reality (**MR**), our "background" is the live camera feed (passthrough). To achieve realistic occlusion, we must apply our semantic mask as a **Gating Function**. + +1. **Hard Occlusion**: A binary decision (0 or 1). While simple, this leads to aliased, jagged edges that shimmer as the user's head moves. +2. **Soft Occlusion**: A smooth gradient at the boundaries. By "feathering" the mask, we can blend the virtual and real worlds naturally, hiding the slight imperfections of the ML segmentation pass. + +== Implementing Stencil-Based Masking + +In our Vulkan engine, we can use the **Stencil Buffer** to perform high-speed masking. 
This is more efficient than performing complex branching in a fragment shader, as the hardware can discard pixels before they ever reach the shading stage. + +[source,cpp] +---- +// Configuring stencil masking with vulkan-hpp +vk::StencilOpState stencilOp{ + .failOp = vk::StencilOp::eKeep, + .passOp = vk::StencilOp::eReplace, + .depthFailOp = vk::StencilOp::eKeep, + .compareOp = vk::CompareOp::eAlways, + .compareMask = 0xFF, + .writeMask = 0xFF, + .reference = 0x1 // Mark real-world occluders with 0x1 +}; + +// 1. Fill the stencil buffer using the ML segmentation mask +cmd.setStencilReference(vk::StencilFaceFlagBits::eFrontAndBack, 0x1); +cmd.draw(6, 1, 0, 0); // Full-screen quad or mesh-based occluder + +// 2. Render virtual scene where stencil is NOT 0x1 +cmd.setStencilCompareMask(vk::StencilFaceFlagBits::eFrontAndBack, 0xFF); +cmd.setStencilReference(vk::StencilFaceFlagBits::eFrontAndBack, 0x1); +// CompareOp::eNotEqual ensures we only draw on virtual pixels +---- + +== Temporal Stability: The Edge Jitter Problem + +Because ML masks are generated frame-by-frame, they can slightly flicker due to camera noise or changes in lighting. This creates **Temporal Instability** at the occlusion edges. + +To fix this, our engine implements **Temporal Reprojection**. We take the mask from the previous frame, re-project it using the current head pose (from OpenXR's predicted display time), and blend it with the new mask. This ensures that the boundary between reality and virtuality stays rock-steady, which is essential for maintaining user immersion and preventing motion sickness. + +By combining semantic understanding with stable per-pixel masking, we create an environment where virtual objects feel physically "locked" into the user's room. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/02_ml_driven_segmentation.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..beae72b5 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/04_incorporating_into_the_engine.adoc @@ -0,0 +1,60 @@ +:pp: {plus}{plus} += Incorporating Semantic Occlusion into the Engine + +Integrating **Semantic Occlusion** allows our `simple_game_engine` to realistically blend virtual assets with the physical room. We implement this by adding a masking pass to our `Renderer` and updating our final composition logic in `renderer_rendering.cpp`. *This is currently left as an exercise for the reader to implement.* + +== Creating the Occlusion Mask Pass + +In `renderer_rendering.cpp`, we add a step that uses our ML segmentation results to fill the **Stencil Buffer**. This pass must happen after we've received the latest ML mask but before we draw our virtual scene geometry. + +[source,cpp] +---- +// renderer_rendering.cpp +void Renderer::executeOcclusionMaskPass(vk::raii::CommandBuffer& cmd, uint32_t frameIndex) { + if (!xrMode || !occlusionEnabled) return; + + // 1. Prepare to fill the stencil buffer + // We bind a simple pipeline that reads the ML mask texture + cmd.bindPipeline(vk::PipelineBindPoint::eGraphics, *occlusionMaskPipeline); + + // 2. Set the stencil reference to 0x1 (meaning "Real World") + cmd.setStencilReference(vk::StencilFaceFlagBits::eFrontAndBack, 0x1); + + // 3. Draw a full-screen quad. The shader will discard fragments + // where the ML mask doesn't indicate an occluding object (like a hand). 
+ cmd.draw(3, 1, 0, 0); // Using a single large triangle for the full screen +} +---- + +== Composition in Slang + +In our `composite.slang` shader, we use the stencil buffer's output to decide how to blend the virtual scene with the passthrough camera feed. Alternatively, we can use a dedicated texture if the hardware doesn't support stencil-based discard. + +[source,slang] +---- +// composite.slang +[shader("pixel")] +float4 fragmentMain(V2P input) : SV_Target { + // 1. Sample the virtual scene and the passthrough feed + float4 virtualColor = virtualScene.Sample(input.uv); + float4 passthroughColor = passthroughTexture.Sample(input.uv); + + // 2. Sample our refined occlusion mask + float mask = occlusionMask.Sample(input.uv).r; + + // 3. Perform the Final Blend + // If mask > 0.5, the real world is occluding. We use a smoothstep + // to provide a "soft" edge, reducing jitter. + float blendFactor = smoothstep(0.4, 0.6, mask); + + return lerp(virtualColor, passthroughColor, blendFactor); +} +---- + +== Why These Changes? + +By moving occlusion from a simple "on/off" switch to a dedicated pass in our `Renderer`, we allow our virtual objects to interact with the real world on a per-pixel basis. A virtual character can now reach "behind" a real-world object. Because our engine uses a modern **Synchronization 2** architecture, we can ensure the ML mask is ready before this pass begins without stalling the entire GPU. + +This pixel-wise awareness is what transforms a "Video Overlay" into a "Spatial Experience," where the boundary between virtual and physical becomes indistinguishable to the user's eye. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/03_per_pixel_masking.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/01_introduction.adoc new file mode 100644 index 00000000..edee3c8b --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/01_introduction.adoc @@ -0,0 +1,10 @@ +:pp: {plus}{plus} += Platform Divergence: Desktop DirectMode vs. Mobile Tiled/ASW + +Spatial computing is unique because it spans two radically different hardware universes. On one end, we have the high-end **Desktop VR** PC, powered by a 400-watt GPU and a high-speed PCIe bus. On the other end, we have the **Mobile Standalone** headset, running on a mobile processor with a strict power budget of under 10 watts. + +As an engine developer, you cannot treat these platforms the same. A "standard" rendering loop that runs at 200 FPS on a desktop will likely drain a mobile battery in minutes or fail to maintain the stable 72Hz required for spatial comfort. + +In this chapter, we will explore the techniques required to master both platforms. We'll learn how to utilize **Direct Mode** and **HDR10** to saturate high-end GPUs, and how to use **Tile-Based Rendering (TBR)** and **Application SpaceWarp (ASW)** to extract every last drop of performance from mobile silicon. By the end of this chapter, you'll know how to architect your engine's backend to automatically scale its fidelity and efficiency based on the detected spatial hardware. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/18_Semantic_Occlusion/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/02_desktop_high_end.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/02_desktop_high_end.adoc b/en/OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/02_desktop_high_end.adoc new file mode 100644 index 00000000..79cbff3f --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/02_desktop_high_end.adoc @@ -0,0 +1,40 @@ +:pp: {plus}{plus} += Desktop High-End: Unleashing the GPU + +On high-end desktop platforms, our spatial engine is largely unconstrained by battery life or strict thermal limits. This allows us to focus on absolute visual fidelity, utilizing the full power of modern GPUs to drive high-resolution headsets. The goal on desktop is **Saturation**—keeping the GPU's memory bus and compute units as full as possible to deliver the most immersive experience the hardware can achieve. + +== The Concept: Direct Mode + +In a standard application, your Vulkan windows are managed by the operating system's window manager (like **DWM** on Windows). This adds a layer of latency as the OS composites your window with other desktop elements. For spatial computing, this extra latency is unacceptable. + +We use **Direct Mode**. This allows the XR runtime to take exclusive control of the display output, bypassing the OS desktop compositor entirely. + +* **Latency Win**: By removing the OS compositor, we reduce frame delivery time by 1–2 frames, which is critical for **Motion-to-Photon** stability. +* **Strobe Synchronization**: Direct mode allows for perfect "display strobing." The headset's backlight flashes in precise sync with the GPU's swap signal, eliminating motion blur during rapid head movements. + +== HDR10 and 10-bit Color: Beyond the Screen + +Desktop GPUs have the bandwidth to support **High Dynamic Range (HDR)**. 
In XR, this is essential for physical realism. A virtual sun should be thousands of times brighter than a virtual candle. Without HDR, the engine must "tone-map" these values into the same narrow range, losing the feeling of scale. + +To implement this, our `VulkanContext` negotiates a 10-bit color format—such as `VK_FORMAT_A2B10G10R10_UNORM_PACK32`—with the OpenXR runtime. This provides over a billion colors, eliminating "banding" artifacts in dark spatial environments like night scenes or caves. + +== Leveraging High PCIe Bandwidth + +Because we have a high-speed **PCIe 4.0** or **5.0** link between the CPU and GPU, we can push massive amounts of data per frame without bottlenecking. + +* **Uncompressed LightFields**: We can upload gigabytes of plenoptic data directly to the GPU without stalling the render loop. +* **Ray Tracing**: We can afford the high overhead of building **Acceleration Structures** (AS) for complex, high-poly environments every frame, enabling real-time spatial reflections and global illumination. + +[source,cpp] +---- +// Configuring HDR10 for high-end desktop XR with vulkan-hpp +vk::SwapchainCreateInfoKHR createInfo{ + .imageFormat = vk::Format::eA2B10G10R10UnormPack32, + .imageColorSpace = vk::ColorSpaceKHR::eHdr10St2084EXT, + // ... other parameters ... +}; +---- + +On the desktop, the spatial mindset is about utilizing every watt and every gigabyte of bandwidth to dissolve the boundary between the user and the virtual world. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/03_mobile_mastery.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/03_mobile_mastery.adoc b/en/OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/03_mobile_mastery.adoc new file mode 100644 index 00000000..60a6c0c3 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/03_mobile_mastery.adoc @@ -0,0 +1,32 @@ +:pp: {plus}{plus} += Mobile Mastery: Conquering the Thermal Wall + +Mobile spatial computing, on standalone headsets like the Meta Quest or Vive Focus, is an exercise in extreme efficiency. Unlike a desktop PC where we have hundreds of watts of power, a mobile headset must run its entire engine, tracking system, and display on a power budget of roughly 5–10 watts. + +On mobile, the spatial mindset is about **Efficiency**—minimizing every memory access and every pixel calculation to extract maximum performance from the silicon. + +== The Concept: Tile-Based Rendering (TBR) + +Most mobile GPUs (like those from Qualcomm, Arm, or Imagination) use a **Tile-Based Rendering** architecture. Instead of rendering the whole screen at once, the GPU splits the screen into tiny tiles (e.g., 16x16 or 32x32 pixels). It then processes each tile entirely within high-speed, low-power on-chip tile memory (**SRAM**) before writing the final result back to main memory. + +In mobile spatial computing, **Discarding is Winning**. + +1. **Load Op**: Does the tile need to be cleared (`eClear`) or loaded from VRAM (`eLoad`)? +2. **Store Op**: Does the final color need to be written back to VRAM (`eStore`) or discarded (`eDontCare`)? + +If you don't need the depth buffer for the next frame, you MUST set its store op to `eDontCare`. 
This prevents the GPU from wasting battery power moving megabytes of depth data across the bus, which is the most power-hungry operation on a mobile chip. + +== Application SpaceWarp (ASW): Synthetic Frames + +To maintain a smooth 72Hz or 90Hz experience on a mobile chip, we often use **Application SpaceWarp**. This is a hardware-accelerated technique that allows the engine to render at half-framerate (e.g., 36 FPS) while the XR runtime generates every second frame synthetically. + +* **How it works**: The engine provides **Motion Vectors** and a depth map to the runtime. These tell the compositor how each pixel moved since the last frame. +* **The Result**: The GPU only performs heavy shading work for half the frames, allowing for significantly more complex scenes and higher-fidelity spatial logic. + +== Efficient Spatial Sync + +Mobile headsets use **Unified Memory**, where the CPU and GPU share the same physical RAM. While this simplifies zero-copy hand-offs, it also means they are competing for the same **Memory Bandwidth**. + +To master mobile spatial computing, our engine must be highly surgical with its memory access. Every texture sample and every buffer write must be justified. By focusing on tile-local memory and synthetic frame generation, we can deliver high-quality spatial experiences that run for hours on a single battery charge. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/02_desktop_high_end.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..56fd08fd --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/04_incorporating_into_the_engine.adoc @@ -0,0 +1,64 @@ +:pp: {plus}{plus} += Incorporating Platform Divergence into the Engine + +Integrating **Platform Divergence** allows our `simple_game_engine` to automatically optimize its spatial pipeline for both high-end desktop and low-power mobile hardware. We achieve this by adding a configuration layer to our `Renderer` in `renderer_core.cpp`. *This is currently left as an exercise for the reader to implement.* + +== Defining Hardware Profiles + +In `renderer_core.cpp`, we introduce a method to apply rendering presets based on the detected hardware. This logic ensures that our engine scales its fidelity to match the platform's constraints. + +[source,cpp] +---- +// renderer_core.cpp +void Renderer::applySpatialPlatformPresets() { + // Note: platform is our Engine's platform abstraction + if (platform->isMobile()) { + // 1. Mobile Mastery: Optimize for Tile-Based GPUs + renderScale = 0.75f; // Lower initial resolution for stability + + // 2. Explicitly set Load/Store ops for tile memory efficiency + // Discarding depth/multisample data saves massive bandwidth + mainColorAttachment.storeOp = vk::AttachmentStoreOp::eDontCare; + mainDepthAttachment.storeOp = vk::AttachmentStoreOp::eDontCare; + + // 3. Enable Application SpaceWarp for synthetic frame generation + xrConfig.enableASW = true; + } else { + // 4. 
Desktop High-End: Maximum Fidelity + renderScale = 1.0f; + useHDR10 = true; // Use 10-bit color formats + + // Enable high-bandwidth features like Ray Query + vk::PhysicalDeviceRayQueryFeaturesKHR rayQueryFeatures{}; + // ... request features from the LUID-matched physical device ... + } +} +---- + +== Handling Direct Mode on Desktop + +For desktop headsets, we must ensure our engine's windowing system (GLFW) doesn't interfere with the headset's **Direct Mode** display. In `main.cpp`, we configure our window creation logic. + +[source,cpp] +---- +// main.cpp +void setupEngineWindow() { + if (engine.isXRMode()) { + // On Desktop, the headset is a "hidden" display. + // We create a small "Mirror Window" for the monitor + // while the main spatial rendering bypasses the OS compositor. + glfwWindowHint(GLFW_VISIBLE, GLFW_FALSE); + engine.createMirrorWindow("XR Mirror", 1280, 720); + } else { + engine.createStandardWindow("Simple Engine", 1920, 1080); + } +} +---- + +== Why These Changes? + +By implementing platform-specific presets and Direct Mode handling, we ensure that our engine remains performant regardless of whether it's running on a high-end desktop GPU or a mobile SOC. The `simple_game_engine` already utilizes a modular `Platform` abstraction, which we now leverage to swap between desktop-specific logic (like 10-bit HDR) and mobile-specific optimizations (like **Tile-Based** discarding). + +This architectural flexibility is what allows our spatial applications to scale from the highest possible fidelity to the most efficient mobile standalone performance without rewriting our core rendering logic. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/03_mobile_mastery.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/01_introduction.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/01_introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/01_introduction.adoc new file mode 100644 index 00000000..bce3fbb2 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/01_introduction.adoc @@ -0,0 +1,12 @@ +:pp: {plus}{plus} += Advanced Spatial Diagnostics & CI/CD + +In this final technical chapter, we've reached the point where our engine is feature-complete. But in the world of spatial computing, a "feature-complete" engine is useless if it doesn't maintain absolute stability and 90+ FPS across hundreds of varied headset and GPU combinations. + +Debugging a spatial engine is notoriously difficult. Because the XR runtime and its internal compositor are often "black boxes," it can be challenging to determine if a visual artifact is caused by your engine's logic, a synchronization race condition, or the runtime's own reprojection algorithm. + +In this chapter, we will explore the tools and techniques required to peel back these layers. We'll learn how to use **Spatial Debugging** tools like RenderDoc to inspect multiview buffers, how to implement **Headless CI/CD** (Continuous Integration/Continuous Deployment) using the OpenXR Simulator, and how to utilize **AI Vision Analysis** to automatically detect rendering regressions before they ever reach a physical headset. + +By the end of this chapter, you'll have the diagnostic toolkit required to ship professional-grade spatial applications that are both visually stunning and rock-steady. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/19_Platform_Divergence/04_incorporating_into_the_engine.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/02_spatial_debugging.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/02_spatial_debugging.adoc b/en/OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/02_spatial_debugging.adoc new file mode 100644 index 00000000..06902d43 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/02_spatial_debugging.adoc @@ -0,0 +1,39 @@ +:pp: {plus}{plus} += Spatial Debugging: Peering into the Black Box + +Debugging a spatial engine is fundamentally different from a standard 2D application. Because the XR runtime and its internal compositor are often black boxes, it can be hard to determine if a rendering error is in your engine, the OpenXR handshake, or the compositor's own reprojection logic. To solve this, we use **Spatial Debugging**. + +== The Concept: Object Tagging + +The **XR_EXT_debug_utils** extension allows you to "label" your OpenXR resources. This is functionally identical to Vulkan's `VK_EXT_debug_utils`. By assigning names to your **Action Sets**, **Spaces**, and **Swapchains**, you can see these names in external tools rather than just raw memory addresses. + +[source,cpp] +---- +// Tagging an OpenXR swapchain for easier debugging +XrDebugUtilsObjectNameInfoEXT nameInfo{XR_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT}; +nameInfo.objectType = XR_OBJECT_TYPE_SWAPCHAIN; +nameInfo.objectHandle = reinterpret_cast<uint64_t>(xrSwapchain); +nameInfo.objectName = "Main World Spatial Swapchain"; + +// Now, tools like RenderDoc will display this name for the resource +xrSetDebugUtilsObjectNameEXT(instance, &nameInfo); +---- + +== RenderDoc for Spatial Engines + +**RenderDoc** is the gold standard for Vulkan debugging, and it provides excellent support for modern spatial pipelines. + +1. 
**Multi-View Inspection**: You can inspect each layer of your multiview swapchain individually. If one eye is black but the other is rendering, RenderDoc can show you the state of each `SV_ViewID` invocation. +2. **Resource State Inspection**: You can verify that your `vk::RenderingInfo` has the correct `viewMask` and that your **Synchronization 2** barriers are actually waiting for the correct stages. +3. **Compositor Insights**: Some advanced headsets allow RenderDoc to capture the compositor's final warped and blended frame, letting you see exactly how your layers are being manipulated. + +== Frame Analysis: Finding the Stutter + +In spatial computing, a "Bug" is often not a crash, but a **Dropped Frame**. A single dropped frame can cause a perceptible "stutter" that breaks immersion. + +* **GPU Profiling**: Use tools like **NVIDIA Nsight Graphics** or **Radeon GPU Profiler** to see if your **Late Latching** wait is taking too long. +* **OpenXR Trace**: Most runtimes provide a tracing tool (like the Meta Quest Metrics Tool or SteamVR System Report) that shows **Compositor Pacing**. If your "App GPU Time" is low but the "Compositor Frame Missed" count is high, you likely have a synchronization race condition in your **Wait-Acquire-Release** loop. + +By instrumenting your code with debug labels and utilizing frame analysis tools, you can ensure your spatial experience remains rock-steady and comfortable for the user. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/01_introduction.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/03_automated_qa.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/03_automated_qa.adoc b/en/OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/03_automated_qa.adoc new file mode 100644 index 00000000..c502087c --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/03_automated_qa.adoc @@ -0,0 +1,36 @@ +:pp: {plus}{plus} += Automated QA: Headless CI/CD for Spatial Engines + +Testing a spatial engine is traditionally a manual, time-consuming process. Someone must put on a headset, walk around the physical room, and manually check for rendering artifacts. For professional engine development, this is a major bottleneck. We need **Automated QA** that can run in a **Headless** CI/CD (Continuous Integration / Continuous Deployment) environment. + +== The Concept: The Headless Runtime + +A **Headless Runtime** (such as the **OpenXR Simulator** or **Null Runtime**) allows our engine to run without a physical headset or even a physical GPU (using software Vulkan implementations like **SwiftShader**). + +1. **Mock Tracking Data**: The simulator provides pre-recorded or scripted tracking data (poses and controller inputs). +2. **Swapchain Capture**: The engine renders frames into the OpenXR swapchain as usual, but the simulator captures these frames directly to disk rather than displaying them. +3. **Visual Validation**: We compare these captured frames against "Gold Standard" baseline images to ensure no rendering regressions have been introduced by a code change. + +== AI Vision Analysis for Regressions + +Traditional "Pixel-Perfect" image comparison often fails in spatial computing due to subtle differences in reprojection, slight floating-point variations in view matrices, or hardware-specific MSAA patterns. 
+ +Instead, we use **AI Vision Models**. We feed the rendered frames from our CI/CD pipeline into a neural network trained to detect specific **Spatial Artifacts**: + +* **Asymmetric Frustum Errors**: Detecting if the projection matrices are causing "stretching" or "pinching" at the edges of the view. +* **Multiview Inconsistencies**: Checking if an object is missing from one eye but present in the other. +* **Temporal Mask Flicker**: Analyzing a sequence of frames to identify instability in **Semantic Occlusion** masks. + +By using AI, we can automate the "someone put on the headset" step, catching bugs within minutes of a pull request being submitted. + +== Performance Monitoring in CI + +Finally, we track **Spatial Performance Metrics** in every build to prevent performance "creep." + +* **Submission Latency**: The total time between `xrWaitFrame` and `xrEndFrame`. +* **Wait-to-Begin Gap**: The time the CPU spends waiting for the late-latching sync point. +* **Memory Footprint**: Tracking the VRAM usage of our 4D **LightField** and **Semantic Occlusion** buffers. + +By combining headless simulation, AI-driven visual analysis, and automated performance tracking, we ensure that our spatial engine remains visually stunning, stable, and efficient across every commit and every hardware target. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/02_spatial_debugging.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/04_incorporating_into_the_engine.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/04_incorporating_into_the_engine.adoc b/en/OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/04_incorporating_into_the_engine.adoc new file mode 100644 index 00000000..7610298f --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/04_incorporating_into_the_engine.adoc @@ -0,0 +1,63 @@ +:pp: {plus}{plus} += Incorporating Spatial Diagnostics into the Engine + +Integrating **Spatial Diagnostics & CI/CD** allows our `simple_game_engine` to maintain absolute stability through automated testing and deep-dive resource inspection. We achieve this by adding telemetry hooks to our `Renderer` and configuring a "Headless" mode in `engine.cpp`. *This is currently left as an exercise for the reader to implement.* + +== Adding Diagnostic Labels + +In `renderer_core.cpp`, we extend our resource creation logic to include OpenXR and Vulkan debug labels. This ensures that when we capture a frame in RenderDoc, we can see exactly what each resource represents. + +[source,cpp] +---- +// renderer_core.cpp +void Renderer::setupSpatialResourceLabels(const XrInstance& xrInstance) { + if (!xrMode || !hasDebugUtilsSupport) return; + + // 1. Tag the spatial swapchain for RenderDoc identification + XrDebugUtilsObjectNameInfoEXT nameInfo{ + .type = XR_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT, + .objectType = XR_OBJECT_TYPE_SWAPCHAIN, + .objectHandle = reinterpret_cast<uint64_t>(spatialSwapchain), + .objectName = "Spatial Swapchain" + }; + + xrSetDebugUtilsObjectNameEXT(xrInstance, &nameInfo); + + // 2. 
Add a diagnostic hook to our main command buffer + // This allows us to track the frame's progress in a GPU capture + vk::DebugUtilsLabelEXT frameLabel{ .pLabelName = "Spatial Frame Rendering" }; + commandBuffer.beginDebugUtilsLabelEXT(frameLabel); +} +---- + +== Enabling Automated CI/CD Headless Mode + +For automated testing, we configure our engine to run in a "Headless" mode within `engine.cpp`. In this state, we swap the real OpenXR runtime for a simulator and save screenshots for AI vision analysis. + +[source,cpp] +---- +// engine.cpp +void Engine::runSpatialCITest(uint32_t totalFrames) { + // 1. Configure the XR Context for the Headless Simulator + xrContext.enableSimulatorMode("Baseline_Tests.json"); + + for (uint32_t i = 0; i < totalFrames; ++i) { + // 2. Perform the standard update and render + update(fixedTimeStep); + render(); + + // 3. Capture every 100th frame for AI-driven visual validation + if (i % 100 == 0) { + renderer->captureFrameToDisk("CI_Spatial_Capture_" + std::to_string(i) + ".png"); + } + } +} +---- + +== Why These Changes? + +By building diagnostic hooks and automated testing directly into the engine, we ensure that new spatial features (like **LightField** synthesis or **Late Latching**) don't introduce visual regressions or performance regressions. The `simple_game_engine` utilizes these telemetry points in its internal **Watchdog** system, alerting developers if the **Motion-to-Photon** gap exceeds the target threshold. + +This "Spatial-First" CI/CD approach is what allows professional teams to maintain high-fidelity experiences across varied hardware targets while keeping the codebase stable and reliable. 
+ +xref:OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/03_automated_qa.adoc[Previous] | xref:OpenXR_Vulkan_Spatial_Computing/conclusion.adoc[Next] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/conclusion.adoc b/en/OpenXR_Vulkan_Spatial_Computing/conclusion.adoc new file mode 100644 index 00000000..7958de95 --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/conclusion.adoc @@ -0,0 +1,10 @@ +:pp: {plus}{plus} += Conclusion + +Congratulations on completing the **OpenXR and Vulkan 1.4 Spatial Computing** tutorial series! We've covered a vast amount of ground, from the initial handshake between the two APIs to advanced topics like LightField Theory and Plenoptic Synthesis. + +Through the "Incorporating into the Engine" sections at the end of each chapter, you've seen how to take these theoretical concepts and apply them to a real-world renderer like our `simple_game_engine`. + +Spatial computing is an ever-evolving field, and we encourage you to continue exploring, experimenting, and pushing the boundaries of what's possible with Vulkan 1.4 and OpenXR. + +xref:OpenXR_Vulkan_Spatial_Computing/20_Spatial_Diagnostics_CI_CD/04_incorporating_into_the_engine.adoc[Previous] diff --git a/en/OpenXR_Vulkan_Spatial_Computing/introduction.adoc b/en/OpenXR_Vulkan_Spatial_Computing/introduction.adoc new file mode 100644 index 00000000..8f74ef7b --- /dev/null +++ b/en/OpenXR_Vulkan_Spatial_Computing/introduction.adoc @@ -0,0 +1,57 @@ +:pp: {plus}{plus} += OpenXR and Vulkan 1.4 Spatial Computing: Introduction + +Welcome to the frontiers of spatial computing. In this tutorial series, we are going to bridge the gap between traditional 2D engine architecture and the immersive, low-latency requirements of Virtual Reality (VR), Augmented Reality (AR), and Mixed Reality (MR)—collectively known as XR. + +== Prerequisites + +Before beginning this tutorial, you should have a solid foundation in both the Vulkan API and engine architecture. 
Specifically, we assume you have completed: + +1. **xref:00_Introduction.adoc[The Core Vulkan Tutorial]**: You should be comfortable with the core concepts of Vulkan, including instance creation, physical devices, logical devices, and the basic graphics pipeline. +2. **xref:Building_a_Simple_Engine/introduction.adoc[Building a Simple Engine]**: This series builds directly upon the code and architectural patterns established in the engine tutorial. We will be using the `Simple Engine` codebase as our starting point. +3. **Modern C{pp} and RAII**: We utilize `vulkan-hpp` and RAII (Resource Acquisition Is Initialization) throughout the series. You should be familiar with these patterns to follow the C{pp} integration sections. + +== The OpenXR Ecosystem: Runtimes and Loaders + +Before we dive into the code, it is important to understand the two core components that make OpenXR work: the **Loader** and the **Runtime**. + +1. **The OpenXR Loader**: This is a small, vendor-neutral library that your application links against. Its primary responsibility is to act as a traffic cop. When your engine starts, the loader looks at the system's registry or environment variables to find the currently active XR runtime and redirects all your API calls to it. +2. **The OpenXR Runtime**: This is the heavy lifter provided by the hardware manufacturer (such as Meta, Valve, Microsoft, or the open-source Monado project). The runtime contains the actual implementation of the OpenXR spec, including the drivers for the cameras, displays, and tracking sensors. + +To follow this tutorial, you will need to have an OpenXR-compatible environment set up on your development machine. This typically involves installing the SDK and ensuring a runtime is active. 
+ +[TIP] +==== +For the most up-to-date installation instructions and to download the necessary headers and loader libraries, we highly recommend visiting the official **link:https://github.com/KhronosGroup/OpenXR-SDK-Source[Khronos OpenXR-SDK-Source]** repository and following the **link:https://www.khronos.org/registry/OpenXR/specs/1.1/html/xrspec.html[OpenXR Specification]** or the excellent **link:https://www.openxr-tutorial.com/[OpenXR Tutorial]** for platform-specific setup guides. +==== + +== Why OpenXR? + +In the early days of XR, developers had to write custom backends for every headset on the market. OpenXR changed all that by providing a cross-platform, high-performance API that allows a single application to run on devices ranging from the Meta Quest to the Valve Index and the Microsoft HoloLens. + +But OpenXR isn't just a wrapper. It is a sophisticated state machine that manages hardware poses, predictive frame timing, and specialized swapchains that are owned by the XR runtime itself. To use it effectively, we have to rethink how our engine handles its main loop, its memory, and its synchronization. + +== The Vulkan 1.4 Advantage + +Vulkan 1.4 brings several critical features to the table that are particularly powerful for spatial computing: + +* **Timeline Semaphores**: Essential for the complex, cross-process synchronization required between your engine and the XR compositor. +* **Dynamic Rendering**: Provides a flexible, lightweight rendering path for stereo views, avoiding the rigid state of legacy Render Passes. +* **Synchronization 2**: Simplifies the ownership transfers and barriers needed for low-latency late latching. +* **Maintenance Extensions**: Providing better control over memory and resource visibility across hardware boundaries. + +== What We Will Build + +Throughout this series, we will cover the entire lifecycle of an XR frame: + +1. 
**The Handshake**: Connecting OpenXR to our Vulkan context using LUID matching and mandatory extensions. +2. **Resource Management**: Wrapping runtime-owned images into our RAII-based engine abstractions. +3. **The Predictive Loop**: Mastering frame timing to ensure that what the user sees matches exactly where their head is located at the moment of display. +4. **Spatial Shaders**: Using Slang to author efficient multiview and foveated rendering shaders. +5. **Advanced Sensing**: Ingesting environmental meshes and using ML inference to refine spatial data. + +By the end of this tutorial, you won't just have a working XR application; you will have a deep understanding of the architectural patterns required for high-performance spatial computing in the modern Vulkan ecosystem. + +Let's get started with the first step: the OpenXR-Vulkan 1.4 Handshake. + +xref:OpenXR_Vulkan_Spatial_Computing/02_OpenXR_Vulkan_Handshake/01_introduction.adoc[Next]