From e99dbff93750a96f6e2c9a97a7c3f28188eb204d Mon Sep 17 00:00:00 2001 From: Roman Schaffert Date: Wed, 10 Jun 2026 14:18:40 +0800 Subject: [PATCH] Added polyline interpolation (in the new `lane_helpers` sub-package) - Added polyline interpolation - Cleaned up error handling in torch extensions - Added functionality to generate plots during docs generation Signed-off-by: Roman Schaffert --- .gitignore | 1 - docker/Dockerfile | 4 +- docs/Makefile | 5 +- docs/generate_package_docs_assets.py | 176 +++++ docs/guides/DEVELOPMENT_GUIDE.md | 59 +- docs/guides/DOCUMENTATION_SETUP_GUIDE.md | 127 +++- docs/guides/INSTALLATION_GUIDE.md | 9 +- docs/index.rst | 1 + docs/spelling_wordlist.txt | 4 + namespace_packages_config.py | 1 + .../batched_indexing_access_helpers.h | 98 +-- .../draw_heatmap/csrc/draw_heatmap_cuda.cu | 48 +- .../docs/_on_doc_generation.py | 59 ++ packages/example_package/docs/intro.rst | 15 + .../evaluation_results/simple_plot.csv | 6 + packages/example_package/pyproject.toml | 1 + .../accvlab/lane_helpers/__init__.py | 28 + .../accvlab/lane_helpers/polyline/__init__.py | 27 + .../lane_helpers/polyline/functions.py | 113 +++ .../lane_helpers/docs/_on_doc_generation.py | 91 +++ packages/lane_helpers/docs/api.rst | 9 + packages/lane_helpers/docs/example.rst | 16 + .../images/polyline_sampling_illustration.png | Bin 0 -> 31019 bytes packages/lane_helpers/docs/index.rst | 11 + packages/lane_helpers/docs/introduction.rst | 113 +++ .../lane_helpers/docu_referenced_dirs.txt | 1 + .../evaluation/_shapely_evaluation_outputs.py | 157 ++++ .../evaluation/plot_shapely_evaluation.py | 595 +++++++++++++++ .../evaluation/shapely_evaluation.py | 681 ++++++++++++++++++ .../batch_1_runtime_cpu.md | 13 + .../batch_1_runtime_cuda.md | 13 + .../batch_1_runtime_shapely.md | 13 + .../batch_1_speedup_cpu_vs_shapely.md | 13 + .../batch_1_speedup_cuda_vs_cpu.md | 13 + .../batch_1_speedup_cuda_vs_shapely.md | 13 + .../batch_64_runtime_cpu.md | 13 + .../batch_64_runtime_cuda.md | 13 + 
.../batch_64_runtime_shapely.md | 13 + .../batch_64_speedup_cpu_vs_shapely.md | 13 + .../batch_64_speedup_cuda_vs_cpu.md | 13 + .../batch_64_speedup_cuda_vs_shapely.md | 13 + packages/lane_helpers/examples/basic_usage.py | 53 ++ packages/lane_helpers/ext_impl/CMakeLists.txt | 55 ++ .../ext_impl/polyline/CMakeLists.txt | 50 ++ .../polyline/include/helper_macros.cuh | 25 + .../ext_impl/polyline/include/polyline.cuh | 178 +++++ .../polyline/include/polyline_common.cuh | 169 +++++ .../include/polyline_dtype_compat.cuh | 127 ++++ .../polyline/include/polyline_kernels.cuh | 466 ++++++++++++ .../include/polyline_shared_memory_config.cuh | 120 +++ .../ext_impl/polyline/src/polyline.cpp | 399 ++++++++++ .../ext_impl/polyline/src/polyline.cu | 270 +++++++ .../ext_impl/polyline/src/polyline_cpu.cpp | 193 +++++ packages/lane_helpers/pyproject.toml | 35 + packages/lane_helpers/setup.py | 53 ++ .../lane_helpers/tests/polyline_test_utils.py | 226 ++++++ .../test_polyline_fixed_interpolation.py | 209 ++++++ .../tests/test_polyline_lengths.py | 179 +++++ .../tests/test_polyline_validation.py | 166 +++++ .../test_polyline_var_size_interpolation.py | 314 ++++++++ 60 files changed, 5781 insertions(+), 118 deletions(-) create mode 100644 docs/generate_package_docs_assets.py create mode 100644 packages/example_package/docs/_on_doc_generation.py create mode 100644 packages/example_package/evaluation_results/simple_plot.csv create mode 100644 packages/lane_helpers/accvlab/lane_helpers/__init__.py create mode 100644 packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py create mode 100644 packages/lane_helpers/accvlab/lane_helpers/polyline/functions.py create mode 100644 packages/lane_helpers/docs/_on_doc_generation.py create mode 100644 packages/lane_helpers/docs/api.rst create mode 100644 packages/lane_helpers/docs/example.rst create mode 100644 packages/lane_helpers/docs/images/polyline_sampling_illustration.png create mode 100644 packages/lane_helpers/docs/index.rst create 
mode 100644 packages/lane_helpers/docs/introduction.rst create mode 100644 packages/lane_helpers/docu_referenced_dirs.txt create mode 100644 packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py create mode 100644 packages/lane_helpers/evaluation/plot_shapely_evaluation.py create mode 100644 packages/lane_helpers/evaluation/shapely_evaluation.py create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cpu.md create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md create mode 100644 packages/lane_helpers/examples/basic_usage.py create mode 100644 packages/lane_helpers/ext_impl/CMakeLists.txt create mode 100644 packages/lane_helpers/ext_impl/polyline/CMakeLists.txt create mode 100644 
packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh create mode 100644 packages/lane_helpers/ext_impl/polyline/include/polyline.cuh create mode 100644 packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh create mode 100644 packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh create mode 100644 packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh create mode 100644 packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh create mode 100644 packages/lane_helpers/ext_impl/polyline/src/polyline.cpp create mode 100644 packages/lane_helpers/ext_impl/polyline/src/polyline.cu create mode 100644 packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp create mode 100644 packages/lane_helpers/pyproject.toml create mode 100644 packages/lane_helpers/setup.py create mode 100644 packages/lane_helpers/tests/polyline_test_utils.py create mode 100644 packages/lane_helpers/tests/test_polyline_fixed_interpolation.py create mode 100644 packages/lane_helpers/tests/test_polyline_lengths.py create mode 100644 packages/lane_helpers/tests/test_polyline_validation.py create mode 100644 packages/lane_helpers/tests/test_polyline_var_size_interpolation.py diff --git a/.gitignore b/.gitignore index 31a607c..7a579ee 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,3 @@ __pycache__/ *.whl *.log - diff --git a/docker/Dockerfile b/docker/Dockerfile index 6ae40b1..82c44b8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -113,8 +113,8 @@ RUN pip install pandas==1.5.3 \ numba==0.59 \ pyquaternion==0.9.9 -RUN pip install nuscenes-devkit && \ - pip install shapely tqdm pillow networkx fire +RUN pip install nuscenes-devkit==1.2.0 && \ + pip install shapely==2.0.7 tqdm==4.67.3 pillow==12.2.0 networkx==3.4.2 fire==0.7.1 RUN pip install pytest pytest-timeout RUN pip install pynvml diff --git a/docs/Makefile b/docs/Makefile index 51667be..987e408 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -23,8 
+23,9 @@ help: # Generate namespace package documentation before building generate: - python3 mirror_referenced_dirs.py python3 generate_new_namespace_package_docs.py + python3 generate_package_docs_assets.py + python3 mirror_referenced_dirs.py python3 update_docs_index.py # Sync the root README into the docs tree before building @@ -41,7 +42,7 @@ clean: @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) rm -rf $(BUILDDIR)/ rm -rf api/generated/ - rm -rf ../packages/*/docs/generated/ + rm -rf ../packages/*/docs/_generated/ # Auto-build documentation (watches for changes) livehtml: sync-readme generate diff --git a/docs/generate_package_docs_assets.py b/docs/generate_package_docs_assets.py new file mode 100644 index 0000000..e205e28 --- /dev/null +++ b/docs/generate_package_docs_assets.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +from dataclasses import dataclass +import importlib.util +from pathlib import Path +import sys +from types import ModuleType +from typing import Callable + + +@dataclass(frozen=True) +class PackageDocsContext: + project_root: Path + namespace_package: str + package_name: str + package_root: Path + docs_root: Path + generated_dir: Path + + +HookFunction = Callable[[PackageDocsContext], None] +_GENERATED_ASSET_GITIGNORE = "*\n" + + +def _load_hook_module(hook_path: Path, package_name: str) -> ModuleType: + # Temporary module name for the imported hook. + module_name = f"_accvlab_docs_assets_{package_name}" + + # Import + spec = importlib.util.spec_from_file_location(module_name, hook_path) + if spec is None or spec.loader is None: + raise ImportError(f"Could not create import spec for docs asset hook: {hook_path}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + return module + + +def _get_hook_function(module: ModuleType, hook_path: Path) -> HookFunction: + hook_function = getattr(module, "generate_docs_assets", None) + if not callable(hook_function): + raise AttributeError( + f"Docs asset hook must define a callable generate_docs_assets(context): {hook_path}" + ) + return hook_function + + +def _prepare_generated_dir(context: PackageDocsContext) -> None: + """Create the package's generated docs asset directory and keep it untracked.""" + context.generated_dir.mkdir(parents=True, exist_ok=True) + (context.generated_dir / ".gitignore").write_text(_GENERATED_ASSET_GITIGNORE, encoding="utf-8") + + +def _build_context(project_root: Path, namespace_package: str) -> PackageDocsContext: + package_name = namespace_package.split(".")[-1] + package_root = project_root / "packages" / package_name + docs_root = package_root / "docs" + generated_dir = docs_root / "_generated" + ctx = PackageDocsContext( + project_root=project_root, + namespace_package=namespace_package, + package_name=package_name, + 
package_root=package_root, + docs_root=docs_root, + generated_dir=generated_dir, + ) + return ctx + + +def _generate_assets_for_package( + *, + project_root: Path, + namespace_package: str, + verbose: bool, +) -> bool: + context = _build_context(project_root, namespace_package) + hook_path = context.docs_root / "_on_doc_generation.py" + if not hook_path.exists(): + if verbose: + print(f"No docs asset hook for {context.package_name}") + return False + + if verbose: + print(f"Running docs asset hook for {context.package_name}: {hook_path}") + module = _load_hook_module(hook_path, context.package_name) + _prepare_generated_dir(context) + hook_function = _get_hook_function(module, hook_path) + hook_function(context) + print(f"Generated docs assets for {context.package_name}") + return True + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run optional package-local documentation asset generation hooks.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable verbose output.", + ) + parser.add_argument( + "--package", + dest="package_names", + action="append", + help="Package name to process, such as lane_helpers. 
Can be passed more than once.", + ) + return parser.parse_args() + + +def main() -> int: + args = _parse_args() + docs_dir = Path(__file__).resolve().parent + project_root = docs_dir.parent + sys.path.insert(0, str(project_root)) + + try: + from namespace_packages_config import NAMESPACE_PACKAGES + except ImportError as exc: + print( + f"Error: Could not import NAMESPACE_PACKAGES from namespace_packages_config.py: {exc}", + file=sys.stderr, + ) + return 1 + + package_filter = set(args.package_names or []) + namespace_packages = [ + namespace_package + for namespace_package in NAMESPACE_PACKAGES + if not package_filter or namespace_package.split(".")[-1] in package_filter + ] + if package_filter and len(namespace_packages) != len(package_filter): + found_package_names = {namespace_package.split(".")[-1] for namespace_package in namespace_packages} + missing_package_names = sorted(package_filter - found_package_names) + print(f"Error: Unknown namespace package(s): {', '.join(missing_package_names)}", file=sys.stderr) + return 1 + + hook_count = 0 + for namespace_package in namespace_packages: + package_name = namespace_package.split(".")[-1] + try: + hook_ran = _generate_assets_for_package( + project_root=project_root, + namespace_package=namespace_package, + verbose=args.verbose, + ) + except Exception as exc: + print(f"Error: docs asset generation failed for {package_name}: {exc}", file=sys.stderr) + return 1 + if hook_ran: + hook_count += 1 + + if args.verbose: + print(f"Ran {hook_count} package docs asset hook(s).") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/docs/guides/DEVELOPMENT_GUIDE.md b/docs/guides/DEVELOPMENT_GUIDE.md index ffb1f5a..08077d4 100644 --- a/docs/guides/DEVELOPMENT_GUIDE.md +++ b/docs/guides/DEVELOPMENT_GUIDE.md @@ -48,7 +48,8 @@ There are two example projects which showcase how a namespace package is structu - `packages/example_package`: Showcases a package containing PyTorch extensions built using 
`CppExtension` and `CUDAExtension` provided by PyTorch as well as an external implementation (see [External Implementations](#external-implementations) section for more details on external implementations) - as described below. + as described below. It also includes a package-local documentation asset hook that generates a simple plot + from committed CSV data under `evaluation_results/` during the docs build. - `packages/example_skbuild_package`: Showcases a package using `scikit-build` for C++/CUDA implementation (see the [Alternative: SKBuild-Based Packages](#alternative-skbuild-based-packages) section for more details on this approach). @@ -70,6 +71,8 @@ To add a new namespace package (e.g., `example_package`), you need to create: | **Setup** | `packages/example_package/setup.py` | Package build configuration | | **Project Config** | `packages/example_package/pyproject.toml` | Modern Python project configuration and authoritative dependency definition | | **Documentation include list (optional)** | `packages/example_package/docu_referenced_dirs.txt` | List additional directories referenced by the docs (besides `docs/`). See [Documentation Setup Guide](DOCUMENTATION_SETUP_GUIDE.md) for more details.| +| **Documentation asset hook (optional)** | `packages/example_package/docs/_on_doc_generation.py` | Generate package-owned docs assets such as plots from committed evaluation data. See [Documentation Setup Guide](DOCUMENTATION_SETUP_GUIDE.md#package-local-generated-assets). | +| **Evaluation results (optional)** | `packages/example_package/evaluation_results/` | Package-owned committed inputs for generating docs assets, such as data to plot. | > **ℹ️ Note**: Apart from the above, further folders/files can be included (and made use of manually or added to the > documentation) if needed. A typical use case is to include e.g. 
an `examples` directory which is: @@ -84,26 +87,29 @@ The following diagram shows the relevant project structure containing the folder ``` accvlab/ -├── packages/ # Namespace packages directory +├── packages/ # Namespace packages directory │ ├── optim_test_tools/... │ ├── batching_helpers/... -│ └── example_package/ # ← New namespace package -│ ├── accvlab/ # ← Namespace root -│ │ └── example_package/ # ← Implementation for "example_package" package +│ └── example_package/ # ← New namespace package +│ ├── accvlab/ # ← Namespace root +│ │ └── example_package/ # ← Implementation for "example_package" package │ │ ├── __init__.py -│ │ ├── csrc/ # ← C++/CUDA sources -│ │ └── include/ # ← Headers -│ ├── ext_impl/ # ← Optional: external implementation +│ │ ├── csrc/ # ← C++/CUDA sources +│ │ └── include/ # ← Headers +│ ├── ext_impl/ # ← Optional: external implementation │ │ ├── build_and_copy.sh │ │ └── ... -│ ├── tests/ # ← Tests for "example_package" package -│ ├── docs/ # ← Documentation for "example_package" package -│ ├── setup.py # ← Package build configuration -│ ├── pyproject.toml # ← Project configuration (including dependencies) -│ └── docu_referenced_dirs.txt # ← Optional: list additional directories referenced by the docs (besides `docs/`) -├── build_config/ # Shared build utilities -├── docs/ # Main documentation -└── namespace_packages_config.py # ← Namespace package needs to be listed here +│ ├── tests/ # ← Tests for "example_package" package +│ ├── evaluation_results/ # ← Optional committed inputs for generated docs assets +│ ├── docs/ # ← Documentation for "example_package" package +│ │ ├── _on_doc_generation.py # ← Optional docs asset hook +│ │ └── ... 
+│ ├── setup.py # ← Package build configuration +│ ├── pyproject.toml # ← Project configuration (including dependencies) +│ └── docu_referenced_dirs.txt # ← Optional: list additional directories referenced by the docs (besides `docs/`) +├── build_config/ # Shared build utilities +├── docs/ # Main documentation +└── namespace_packages_config.py # ← Namespace package needs to be listed here ``` Note that inside the package, there is the directory structure `accvlab/example_package`. This is where the @@ -238,6 +244,11 @@ root = "../.." Use this pattern for your own namespace package, adapting the dependency names as needed. +Use `[project.optional-dependencies].optional` for dependencies needed by tests, examples, or package-local +documentation asset hooks, but not by the core package at runtime. For example, if a docs hook generates plots +from committed data, put the plotting library in the package's optional dependencies rather than in the base +`[project].dependencies`. + > **ℹ️ Note**: The `accvlab-build-config @ file:../../build_config` build dependency is intentionally a > local path reference. From a package under `packages//`, it resolves to the repository's `build_config/` package > so isolated pip builds use the local helper package. See @@ -317,6 +328,18 @@ Most of the contained packages extend this basic structure considerably to provi documentation. Please see the [Documentation Setup Guide](DOCUMENTATION_SETUP_GUIDE.md) for more details on the documentation system and how to set it up. +If your package needs generated docs assets, add `packages//docs/_on_doc_generation.py`. The +documentation build creates `packages//docs/_generated/`, keeps it untracked, and passes that +directory to the hook. Keep user-facing `.rst`/`.md` files static and reference generated assets with relative +paths such as `_generated/.png`. The hook should generate those assets from committed inputs and +fail clearly if required inputs are missing. 
Store committed plot or evaluation inputs outside the package +`docs/` folder, for example under `packages//evaluation_results/`, so Sphinx does not discover +data tables as standalone documentation pages. + +> **⚠️ Important**: Documentation asset hooks must not run evaluations, benchmarks, or other measurement +> workflows. They should only regenerate documentation assets, such as plots, from data that is already +> available in the repository. + #### 8. Test Your Package ```bash @@ -352,6 +375,10 @@ When adding a new namespace package, ensure you have: - [ ] **Documentation**: Generated with docs scripts and customized intro - [ ] **Documentation include list (optional)**: `docu_referenced_dirs.txt` created and populated if extra folders (e.g. `examples/`) are referenced and are needed to build the documentation +- [ ] **Documentation asset hook (optional)**: `_on_doc_generation.py` added if the package needs generated + documentation assets +- [ ] **Evaluation results (optional)**: `packages//evaluation_results/` contains committed + inputs for generated docs assets if needed - [ ] **Examples (optional)**: `packages//examples/` created and referenced from docs if used - [ ] **Dependencies**: Declared runtime and optional dependencies in `pyproject.toml` - [ ] **External implementation**: (Optional) `packages//ext_impl/` for external builds diff --git a/docs/guides/DOCUMENTATION_SETUP_GUIDE.md b/docs/guides/DOCUMENTATION_SETUP_GUIDE.md index 521725f..667ce0e 100644 --- a/docs/guides/DOCUMENTATION_SETUP_GUIDE.md +++ b/docs/guides/DOCUMENTATION_SETUP_GUIDE.md @@ -9,6 +9,7 @@ The documentation system provides: - **Explicit namespace package configuration** through `namespace_packages_config.py` - **Dynamic documentation generation** for each configured namespace package +- **Optional package-local asset generation** for generated documentation assets such as plots - **Comprehensive API reference** with auto-generated content (extracted from docstrings) - 
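**Referenced directories mirroring** to access files from the individual namespace packages in the documentation by @@ -43,6 +44,10 @@ The documentation generation makes use of multiple scripts: - **Template-based**: Uses consistent templates for all namespace packages (but generated files may be modified as needed) - **Safe regeneration**: Only creates missing files if no `index.rst` is present for the namespace package +- **`generate_package_docs_assets.py`**: Runs optional package-local documentation asset hooks + - **Package-owned**: Each package can decide whether it needs generated assets and how to create them + - **Format-agnostic**: The hook can read any package-owned input files and write any output files in the output folder; + the core docs system does not prescribe a data format - **`update_docs_index.py`**: Updates main index file by including references to newly added namespace packages - **`mirror_referenced_dirs.py`**: Mirrors (symlinks by default) the `docs` directory and other needed @@ -64,27 +69,28 @@ The documentation generation makes use of multiple scripts: #### Main Documentation Directory (`docs/`) ``` docs/ -├── conf.py # Sphinx configuration using namespace_packages_config -├── index.rst # Main documentation index ├── generate_new_namespace_package_docs.py # Creates structure for new namespace packages -├── update_docs_index.py # Updates navigation and indices -├── mirror_referenced_dirs.py # Mirrors referenced directories (symlinks by default) -├── sync_root_readme_for_docs.py # Syncs project root README into docs/project_overview -├── Makefile # Build commands -├── requirements.txt # Documentation dependencies -├── project_overview/ # Synced copy of the project root README used as docs overview -├── contained_package_docs_mirror/ # Mirrored package documentation via symlinks (or copies) -│ ├── example_package/ # Example namespace package docs (representative) -│ │ ├── docs/ # Documentation files -│ │ │ ├── index.rst # Namespace package overview -│ │ │ ├── intro.rst # Introduction (manual content) -│ │ │ └── api.rst # API reference (auto-generated) -│ │ └── examples/ # Additional mirrored directory (referenced in docs) -│ └── [other_packages]/ # Other configured namespace packages -├── common/ # Shared documentation resources +├── conf.py # Sphinx configuration using namespace_packages_config +├── index.rst # Main documentation index ├── generate_new_namespace_package_docs.py # Creates structure for new namespace packages +├── generate_package_docs_assets.py # Runs optional package-local docs asset hooks +├── update_docs_index.py # Updates navigation and indices +├── mirror_referenced_dirs.py # Mirrors referenced directories (symlinks by default) +├── sync_root_readme_for_docs.py # Syncs project root README into docs/project_overview +├── Makefile # Build commands +├── requirements.txt # Documentation dependencies +├── project_overview/ # Synced copy of the project root README used as docs overview +├── contained_package_docs_mirror/ # Mirrored package documentation via symlinks (or copies) +│ ├── example_package/ # Example 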
namespace package docs (representative) -│ │ ├── docs/ # Documentation files -│ │ │ ├── index.rst # Namespace package overview -│ │ │ ├── intro.rst # Introduction (manual content) -│ │ │ └── api.rst # API reference (auto-generated) -│ │ └── examples/ # Additional mirrored directory (referenced in docs) -│ └── [other_packages]/ # Other configured namespace packages -├── common/ # Shared documentation resources +├── generate_package_docs_assets.py # Runs optional package-local docs asset hooks +├── update_docs_index.py # Updates navigation and indices +├── mirror_referenced_dirs.py # Mirrors referenced directories (symlinks by default) +├── sync_root_readme_for_docs.py # Syncs project root README into docs/project_overview +├── Makefile # Build commands +├── requirements.txt # Documentation dependencies +├── project_overview/ # Synced copy of the project root README used as docs overview +├── contained_package_docs_mirror/ # Mirrored package documentation via symlinks (or copies) +│ ├── example_package/ # Example namespace package docs (representative) +│ │ ├── docs/ # Documentation files +│ │ │ ├── index.rst # Namespace package overview +│ │ │ ├── intro.rst # Introduction (manual content) +│ │ │ └── api.rst # API reference (auto-generated) +│ │ └── examples/ # Additional mirrored directory (referenced in docs) +│ └── [other_packages]/ # Other configured namespace packages +├── common/ # Shared documentation resources ├── _static/css/ -│ └── custom.css # Custom styling -└── _build/ # Built documentation output +│ └── custom.css # Custom styling +└── _build/ # Built documentation output ``` **Notes**: @@ -103,7 +109,10 @@ packages/ ├── docs/ # Source documentation files │ ├── index.rst # Namespace package overview │ ├── intro.rst # Introduction (manual content) - │ └── api.rst # API reference (auto-generated) + │ ├── api.rst # API reference (auto-generated) + │ ├── _on_doc_generation.py # Optional package-local docs asset hook + │ └── _generated/ # Generated assets 
created at docs build time + ├── evaluation_results/ # Optional committed inputs for generated docs assets ├── docu_referenced_dirs.txt # List of additional directories to copy ├── examples/ # Example code (mirrored and referenced by docs) └── [other_dirs]/ # Other package directories @@ -112,6 +121,9 @@ packages/ **Notes**: - The `packages/example_package/` structure shows the source documentation that gets mirrored during build +- The `example_package` includes a small generated plot example: committed CSV data under + `packages/example_package/evaluation_results/` is converted into an image under + `packages/example_package/docs/_generated/` during the docs build - **⚠️ Important**: Content should be edited in the source locations (`packages//docs/`), not in the mirrored locations - In case of the `example_package`, the `examples/` directory is mirrored to maintain documentation references @@ -172,6 +184,65 @@ etc.) can still be found after the documentation is mirrored to the build locati - Only list additional directories that are referenced by your documentation. Note that the API documentation does not rely on this mirroring, but is extracted from the installed packages. +### Package-Local Generated Assets + +Packages can generate documentation assets during the docs build by adding an optional hook: + +```text +packages//docs/_on_doc_generation.py +``` + +If present, `generate_package_docs_assets.py` imports the hook and calls: + +```python +def generate_docs_assets(context): + ... +``` + +The hook receives a context with package and documentation paths, including: + +- `context.project_root` +- `context.package_root` +- `context.docs_root` +- `context.generated_dir` + +The docs asset generator creates `context.generated_dir` before calling the hook. This directory is always: + +```text +packages//docs/_generated/ +``` + +It also writes a local `.gitignore` file there so generated assets remain untracked. 
The hook should write +generated images or other generated files directly into `context.generated_dir`, or into subdirectories below +it if a package needs additional structure. + +Source documentation files remain static. For example, an `.rst` file can reference a generated image with a +normal relative path: + +```rst +.. figure:: _generated/runtime_plot.png + :alt: Runtime plot +``` + +Packages own the input data and generation logic. For example, a package can commit benchmark result tables +under `packages//evaluation_results/` and generate plots from those tables during the docs +build. If a generated asset is required by the static docs, the hook should fail with a clear error when the +required input data is missing or malformed. + +> **⚠️ Important**: Documentation asset hooks must not run evaluations, benchmarks, or other measurement +> workflows. They should only regenerate documentation assets, such as plots, from data that is already +> available in the repository. It is recommended to store results in simple formats such as .csv or .md, +> and use those as the source of truth for the plots. +> +> Keep committed plot or evaluation inputs outside the package `docs/` folder, for example under +> `packages//evaluation_results/`. This prevents Sphinx from discovering e.g. `.md` data tables as +> standalone documentation pages while keeping the inputs package-local. + +Package-specific dependencies needed only by the hook should be declared in that package's optional +dependencies in `pyproject.toml`. The default local installation path (`./scripts/install_local.sh`) installs +optional package dependencies. If you build docs after installing packages without optional dependencies, +package-local asset hooks may fail when their optional plotting or parsing dependencies are missing. 
+ ### Building Documentation Locally **Quick build using the script** (can be run from any directory, example shows running from the project @@ -208,6 +279,9 @@ make livehtml in sequence - The `html` target ensures all scripts run before building - The `livehtml` target also runs the scripts for development builds +- Package-local docs asset hooks run before package docs are mirrored, so generated assets under + `packages//docs/_generated/` are available from both the package docs source tree and the + mirrored docs tree. - When running spelling via the script, the generation scripts are executed first to ensure mirrored package docs are up to date. Spelling findings are written to `docs/_build/spelling/output.txt`. @@ -220,6 +294,9 @@ make livehtml > - It does **not** reinstall or rebuild packages for you. This means that if you change the docstrings in > the source tree of a package, you need to reinstall the package (for example via > `./scripts/install_local.sh`) and then restart `make livehtml` to see updated docstrings. +> - It does **not** rerun package-local docs asset hooks for you after startup. This means that if you change +> committed plot data or hook code, you need to restart `make livehtml` (or run `make generate`) to +> regenerate plots and other generated docs assets. ### Spell-checking @@ -456,6 +533,10 @@ the per-package runtime dependencies defined in each package's `pyproject.toml`) - Theme packages - Other documentation-specific dependencies +Package-specific docs asset dependencies belong to the corresponding package's optional dependencies. This keeps the +global documentation requirements focused on the Sphinx build itself while allowing package-owned hooks to declare their +own plotting or data-processing dependencies. 
+ ### File Descriptions #### Core Configuration Files @@ -478,6 +559,12 @@ automatically as part of the docs build; you normally do not need to run them ma - **`packages//docs/index.rst`**: Namespace package overview (source) - **`packages//docs/intro.rst`**: Manual introduction content (source) - **`packages//docs/api.rst`**: Auto-generated API reference (source) +- **`packages//docs/_on_doc_generation.py`**: Optional hook for package-local generated docs + assets +- **`packages//docs/_generated/`**: Generated documentation assets created during docs + generation and ignored by Git +- **`packages//evaluation_results/`**: Optional package-owned committed inputs for generated + docs assets, such as benchmark tables used for plots - **`packages//docu_referenced_dirs.txt`**: List of directories containing files used in the documentation in addition to `docs` (to mirror into the documentation source directory). - **`docs/contained_package_docs_mirror//docs/`**: Mirrored documentation (symlink to the diff --git a/docs/guides/INSTALLATION_GUIDE.md b/docs/guides/INSTALLATION_GUIDE.md index df0fe2e..b9fe12f 100644 --- a/docs/guides/INSTALLATION_GUIDE.md +++ b/docs/guides/INSTALLATION_GUIDE.md @@ -53,10 +53,11 @@ dependencies (needed for some tests and examples), pass the `--optional` flag ex ./scripts/package_manager.sh install -e --optional ``` -> **⚠️ Important**: Installing with optional dependencies is required if you plan to run the contained -> tests, as they rely on optional dependencies such as `pytest` (and possibly other dependencies). It may be -> also required for the contained examples, as they may use additional packages which are otherwise -> not used in the core library. +> **⚠️ Important**: Installing with optional dependencies is required for workflows that rely on packages +> outside the core library, including contained tests, contained examples, and documentation generation. 
+> Documentation generation may run package-local asset hooks, for example to regenerate plots from committed +> data, and those hooks can require plotting or data-processing packages. Tests commonly require tools such as +> `pytest` and may require further packages. The package manager script: - Automatically installs the required `accvlab_build_config` helper package (see the `build_config` directory diff --git a/docs/index.rst b/docs/index.rst index 273cc02..6492146 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -71,6 +71,7 @@ Please see the documentation of each namespace package for usage instructions (a contained_package_docs_mirror/batching_helpers/docs/index contained_package_docs_mirror/multi_tensor_copier/docs/index contained_package_docs_mirror/dali_pipeline_framework/docs/index + contained_package_docs_mirror/lane_helpers/docs/index contained_package_docs_mirror/draw_heatmap/docs/index contained_package_docs_mirror/optim_test_tools/docs/index diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index bee6c51..bda1b02 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -212,3 +212,7 @@ ABI aggregator multimodal cubin +Polyline +polyline +Polylines +polylines diff --git a/namespace_packages_config.py b/namespace_packages_config.py index a94ab18..aaf9d3d 100644 --- a/namespace_packages_config.py +++ b/namespace_packages_config.py @@ -27,6 +27,7 @@ 'accvlab.batching_helpers', 'accvlab.multi_tensor_copier', 'accvlab.dali_pipeline_framework', + 'accvlab.lane_helpers', 'accvlab.draw_heatmap', 'accvlab.optim_test_tools', # Add new namespace packages in the same way as above diff --git a/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h b/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h index 9e66681..4df9745 100644 --- a/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h +++ 
b/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h @@ -64,48 +64,48 @@ #define DISPATCH_INDEX_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_INDEX_TYPES(__VA_ARGS__)) -#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CPU(x) AT_ASSERTM(x.is_cpu(), #x " must be a CPU tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_SAME_CUDA_DEVICE(tensors_list...) \ - { \ - const std::vector tensors = {tensors_list}; \ - CHECK_CUDA(tensors[0]); \ - const auto& device = tensors[0].device(); \ - for (size_t i = 1; i < tensors.size(); ++i) { \ - AT_ASSERTM(tensors[i].device() == device, "All input tensors must be on the same device"); \ - } \ +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CPU(x) TORCH_CHECK(x.is_cpu(), #x " must be a CPU tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_SAME_CUDA_DEVICE(tensors_list...) \ + { \ + const std::vector tensors = {tensors_list}; \ + CHECK_CUDA(tensors[0]); \ + const auto& device = tensors[0].device(); \ + for (size_t i = 1; i < tensors.size(); ++i) { \ + TORCH_CHECK(tensors[i].device() == device, "All input tensors must be on the same device"); \ + } \ } -#define CHECK_SAME_DTYPE(error_msg, tensors_list...) \ - { \ - const std::vector tensors = {tensors_list}; \ - for (size_t i = 1; i < tensors.size(); ++i) { \ - AT_ASSERTM(tensors[i].scalar_type() == tensors[0].scalar_type(), error_msg); \ - } \ +#define CHECK_SAME_DTYPE(error_msg, tensors_list...) 
\ + { \ + const std::vector tensors = {tensors_list}; \ + for (size_t i = 1; i < tensors.size(); ++i) { \ + TORCH_CHECK(tensors[i].scalar_type() == tensors[0].scalar_type(), error_msg); \ + } \ } -#define CHECK_SIZE_MATCH(tensor1, tensor2) \ - { \ - /* If the tensors are empty, the actual sizes are not relevant */ \ - if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) { \ - AT_ASSERTM((tensor1).dim() == (tensor2).dim(), \ - #tensor1 " and " #tensor2 " must have the same number of dimensions"); \ - for (size_t i = 0; i < (tensor1).dim(); ++i) { \ - AT_ASSERTM((tensor1).size(i) == (tensor2).size(i), \ - #tensor1 " and " #tensor2 " must have the same size"); \ - } \ - } \ +#define CHECK_SIZE_MATCH(tensor1, tensor2) \ + { \ + /* If the tensors are empty, the actual sizes are not relevant */ \ + if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) { \ + TORCH_CHECK((tensor1).dim() == (tensor2).dim(), \ + #tensor1 " and " #tensor2 " must have the same number of dimensions"); \ + for (size_t i = 0; i < (tensor1).dim(); ++i) { \ + TORCH_CHECK((tensor1).size(i) == (tensor2).size(i), \ + #tensor1 " and " #tensor2 " must have the same size"); \ + } \ + } \ } #define CHECK_SIZE_MATCH_FIRST_DIMS(tensor1, tensor2, num_dims_to_check) \ { \ /* If the tensors are empty, the actual sizes are not relevant */ \ if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) { \ - AT_ASSERTM((tensor1).dim() >= (num_dims_to_check) && (tensor2).dim() >= (num_dims_to_check), \ - #tensor1 " and " #tensor2 " must have at least " + \ - std::to_string(num_dims_to_check) + " dimensions"); \ + TORCH_CHECK((tensor1).dim() >= (num_dims_to_check) && (tensor2).dim() >= (num_dims_to_check), \ + #tensor1 " and " #tensor2 " must have at least " + \ + std::to_string(num_dims_to_check) + " dimensions"); \ for (size_t i = 0; i < (num_dims_to_check); ++i) { \ - AT_ASSERTM( \ + TORCH_CHECK( \ (tensor1).size(i) == (tensor2).size(i), \ #tensor1 " and " #tensor2 " must have the same size in dimension " + 
std::to_string(i)); \ } \ @@ -116,35 +116,35 @@ { \ /* If the tensors are empty, the actual sizes are not relevant */ \ if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) { \ - AT_ASSERTM((tensor1).dim() == (tensor2).dim(), \ - #tensor1 " and " #tensor2 " must have the same number of dimensions"); \ + TORCH_CHECK((tensor1).dim() == (tensor2).dim(), \ + #tensor1 " and " #tensor2 " must have the same number of dimensions"); \ for (size_t i = 0; i < (tensor1).dim(); ++i) { \ if (i == (dim_to_allow_mismatch)) { \ continue; \ } \ - AT_ASSERTM( \ + TORCH_CHECK( \ (tensor1).size(i) == (tensor2).size(i), \ #tensor1 " and " #tensor2 " must have the same size in dimension " + std::to_string(i)); \ } \ } \ } -#define CHECK_NUM_DIMS(tensor, num_dims) \ - { \ - /* If the tensor is empty, the number of dimensions is not relevant */ \ - if (!((tensor).numel() == 0)) { \ - AT_ASSERTM((tensor).dim() == (num_dims), \ - #tensor " must have " + std::to_string(num_dims) + " dimensions"); \ - } \ +#define CHECK_NUM_DIMS(tensor, num_dims) \ + { \ + /* If the tensor is empty, the number of dimensions is not relevant */ \ + if (!((tensor).numel() == 0)) { \ + TORCH_CHECK((tensor).dim() == (num_dims), \ + #tensor " must have " + std::to_string(num_dims) + " dimensions"); \ + } \ } -#define CHECK_NUM_DIMS_AT_LEAST(tensor, num_dims) \ - { \ - /* If the tensor is empty, the number of dimensions is not relevant */ \ - if (!((tensor).numel() == 0)) { \ - AT_ASSERTM((tensor).dim() >= (num_dims), \ - #tensor " must have at least " + std::to_string(num_dims) + " dimensions"); \ - } \ +#define CHECK_NUM_DIMS_AT_LEAST(tensor, num_dims) \ + { \ + /* If the tensor is empty, the number of dimensions is not relevant */ \ + if (!((tensor).numel() == 0)) { \ + TORCH_CHECK((tensor).dim() >= (num_dims), \ + #tensor " must have at least " + std::to_string(num_dims) + " dimensions"); \ + } \ } static inline int64_t get_number_data_elements_per_index(const torch::Tensor& input_data, diff --git 
a/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu b/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu index ed5e9c4..85d5d38 100644 --- a/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu +++ b/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu @@ -20,8 +20,8 @@ #include #include -#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x); @@ -69,11 +69,11 @@ void draw_heatmap_launcher(at::Tensor& heatmap, const at::Tensor& centers, const CHECK_INPUT(radii); CHECK_INPUT(heatmap_idxes); - AT_ASSERTM(centers.size(0) == radii.size(0), "centers and radii must have the same size at dim0"); - AT_ASSERTM(centers.size(0) == heatmap_idxes.size(0), - "centers and heatmap_idxes must have the same size at dim0"); - AT_ASSERTM(heatmap.dim() == 3, "heatmap must be of shape [num_heatmaps, height, width]"); - AT_ASSERTM(centers.dim() == 2 && centers.size(1) == 2, "centers must be of shape [num_targets, 2]"); + TORCH_CHECK(centers.size(0) == radii.size(0), "centers and radii must have the same size at dim0"); + TORCH_CHECK(centers.size(0) == heatmap_idxes.size(0), + "centers and heatmap_idxes must have the same size at dim0"); + TORCH_CHECK(heatmap.dim() == 3, "heatmap must be of shape [num_heatmaps, height, width]"); + TORCH_CHECK(centers.dim() == 2 && centers.size(1) == 2, "centers must be of shape [num_targets, 2]"); const int num_targets = centers.size(0); const int num_heatmaps = heatmap.size(0); @@ -101,15 +101,15 @@ void draw_heatmap_batched_launcher(at::Tensor& heatmap, const at::Tensor& center const int batch_size = heatmap.size(0); const int num_targets = radii.size(1); - 
AT_ASSERTM( + TORCH_CHECK( batch_size == radii.size(0) && batch_size == centers.size(0) && batch_size == nums_targets.size(0), "batch_size (dim 0) need to be the same for all inputs"); - AT_ASSERTM(num_targets == centers.size(1), - "maximum number of targets (dim 1) need to be the same centers and radii"); - AT_ASSERTM(heatmap.dim() == 3, "heatmap must be of shape [batch_size, height, width]"); - AT_ASSERTM(centers.dim() == 3 && centers.size(2) == 2, - "centers must be of shape [batch_size, num_targets, 2]"); - AT_ASSERTM(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]"); + TORCH_CHECK(num_targets == centers.size(1), + "maximum number of targets (dim 1) need to be the same centers and radii"); + TORCH_CHECK(heatmap.dim() == 3, "heatmap must be of shape [batch_size, height, width]"); + TORCH_CHECK(centers.dim() == 3 && centers.size(2) == 2, + "centers must be of shape [batch_size, num_targets, 2]"); + TORCH_CHECK(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]"); const int height = heatmap.size(1); const int width = heatmap.size(2); @@ -138,23 +138,23 @@ void draw_heatmap_batched_classwise_launcher(at::Tensor& heatmap, const at::Tens const int batch_size = heatmap.size(0); const int num_targets = radii.size(1); - AT_ASSERTM( + TORCH_CHECK( batch_size == radii.size(0) && batch_size == centers.size(0) && batch_size == nums_targets.size(0), "batch_size (dim 0) need to be the same for all inputs"); - AT_ASSERTM(num_targets == centers.size(1), - "maximum number of targets (dim 1) need to be the same centers and radii"); - AT_ASSERTM(heatmap.dim() == 4, "heatmap must be of shape [batch_size, max_num_classes, height, width]"); - AT_ASSERTM(centers.dim() == 3 && centers.size(2) == 2, - "centers must be of shape [batch_size, num_targets, 2]"); - AT_ASSERTM(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]"); + TORCH_CHECK(num_targets == centers.size(1), + "maximum number of targets (dim 1) need to be the same centers 
and radii"); + TORCH_CHECK(heatmap.dim() == 4, "heatmap must be of shape [batch_size, max_num_classes, height, width]"); + TORCH_CHECK(centers.dim() == 3 && centers.size(2) == 2, + "centers must be of shape [batch_size, num_targets, 2]"); + TORCH_CHECK(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]"); const int height = heatmap.size(2); const int width = heatmap.size(3); const int max_num_classes = heatmap.size(1); // Validate labels tensor shape and range before launching the kernel - AT_ASSERTM(labels.dim() == 2, "labels must be of shape [batch_size, radii.size(1)]"); - AT_ASSERTM(labels.size(0) == batch_size && labels.size(1) == num_targets, - "labels shape must be [batch_size, radii.size(1)]"); + TORCH_CHECK(labels.dim() == 2, "labels must be of shape [batch_size, radii.size(1)]"); + TORCH_CHECK(labels.size(0) == batch_size && labels.size(1) == num_targets, + "labels shape must be [batch_size, radii.size(1)]"); AT_DISPATCH_FLOATING_TYPES( heatmap.scalar_type(), "draw_heatmap_cuda_batched", ([&] { draw_heatmap_batched_cuda( diff --git a/packages/example_package/docs/_on_doc_generation.py b/packages/example_package/docs/_on_doc_generation.py new file mode 100644 index 0000000..28bb331 --- /dev/null +++ b/packages/example_package/docs/_on_doc_generation.py @@ -0,0 +1,59 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import csv +from pathlib import Path +from typing import Any + +import matplotlib + +matplotlib.use("Agg") +from matplotlib import pyplot as plt + +_DATA_FILE = Path("evaluation_results") / "simple_plot.csv" +_OUTPUT_FILE = "simple_plot.png" + + +def _read_plot_data(input_file: Path) -> tuple[list[float], list[float]]: + if not input_file.exists(): + raise FileNotFoundError(f"Required example plot input data is missing: {input_file}") + + with input_file.open("r", encoding="utf-8", newline="") as csv_file: + reader = csv.DictReader(csv_file) + if reader.fieldnames != ["x", "y"]: + raise ValueError(f"Expected CSV columns 'x,y' in {input_file}") + x_values: list[float] = [] + y_values: list[float] = [] + for row in reader: + x_values.append(float(row["x"])) + y_values.append(float(row["y"])) + + if not x_values: + raise ValueError(f"Expected at least one data row in {input_file}") + return x_values, y_values + + +def generate_docs_assets(context: Any) -> None: + input_file = context.package_root / _DATA_FILE + output_file = context.generated_dir / _OUTPUT_FILE + x_values, y_values = _read_plot_data(input_file) + + figure, axis = plt.subplots(figsize=(5.0, 3.2), constrained_layout=True) + axis.plot(x_values, y_values, marker="o") + axis.set_title("Generated Example Plot") + axis.set_xlabel("x") + axis.set_ylabel("y") + axis.grid(True) + figure.savefig(output_file) + plt.close(figure) diff --git a/packages/example_package/docs/intro.rst b/packages/example_package/docs/intro.rst index 104d8ec..237fba1 100644 --- a/packages/example_package/docs/intro.rst +++ b/packages/example_package/docs/intro.rst @@ -56,6 +56,21 @@ Examples For examples, see :doc:`examples`. The example makes use of ``note-literalinclude`` to include the example code in the documentation and highlight notes in the code (comment blocks starting with ``# @NOTE``). 
+Generated Documentation Assets +------------------------------ + +This package also demonstrates package-local documentation asset generation. The docs build reads committed +data from ``evaluation_results/simple_plot.csv`` and writes the generated plot to +``docs/_generated/simple_plot.png``. The source documentation remains static and references the generated +image using a normal relative path. + +.. figure:: _generated/simple_plot.png + :alt: Simple generated plot from committed CSV data + :align: center + :width: 70% + + Example plot generated from committed CSV data during documentation generation. + .. toctree:: :maxdepth: 2 :caption: Examples diff --git a/packages/example_package/evaluation_results/simple_plot.csv b/packages/example_package/evaluation_results/simple_plot.csv new file mode 100644 index 0000000..9675110 --- /dev/null +++ b/packages/example_package/evaluation_results/simple_plot.csv @@ -0,0 +1,6 @@ +x,y +0,0 +1,1 +2,4 +3,9 +4,16 diff --git a/packages/example_package/pyproject.toml b/packages/example_package/pyproject.toml index ef14420..cbe6014 100644 --- a/packages/example_package/pyproject.toml +++ b/packages/example_package/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ [project.optional-dependencies] optional = [ + "matplotlib", "pytest", ] diff --git a/packages/lane_helpers/accvlab/lane_helpers/__init__.py b/packages/lane_helpers/accvlab/lane_helpers/__init__.py new file mode 100644 index 0000000..e5b5150 --- /dev/null +++ b/packages/lane_helpers/accvlab/lane_helpers/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from importlib.metadata import PackageNotFoundError, version + +from . import polyline + +try: + __version__ = version("accvlab.lane_helpers") +except PackageNotFoundError: + __version__ = "0.0.0" + + +__all__ = [ + "__version__", + "polyline", +] diff --git a/packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py b/packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py new file mode 100644 index 0000000..e1ec3a1 --- /dev/null +++ b/packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .functions import ( + interpolate, + interpolate_var_size_batch, + lengths, + lengths_var_size_batch, +) + +__all__ = [ + "interpolate", + "interpolate_var_size_batch", + "lengths", + "lengths_var_size_batch", +] diff --git a/packages/lane_helpers/accvlab/lane_helpers/polyline/functions.py b/packages/lane_helpers/accvlab/lane_helpers/polyline/functions.py new file mode 100644 index 0000000..417575b --- /dev/null +++ b/packages/lane_helpers/accvlab/lane_helpers/polyline/functions.py @@ -0,0 +1,113 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import torch + +from .. import _polyline_sampling + +if TYPE_CHECKING: + from accvlab.batching_helpers import RaggedBatch + + +def interpolate(points: torch.Tensor, distances: torch.Tensor, *, relative: bool = False) -> torch.Tensor: + """Interpolate batched polylines at requested distances. + + Args: + points: CPU or CUDA tensor with shape ``(batch, num_points, num_dims)``. + distances: Tensor with shape ``(batch, num_distances)`` on the same device as ``points``. + Distances below zero are clamped to the first point of the polyline. Distances beyond the + total polyline length are clamped to the last point. When ``relative=True``, this corresponds + to clamping values below ``0`` and above ``1``. 
+ relative: If ``True``, interpret ``distances`` as fractions of each polyline's total length. + If ``False``, interpret them as absolute distances from the start of each polyline. + + Returns: + Tensor with shape ``(batch, num_distances, num_dims)`` on the same device as ``points``. + """ + result = _polyline_sampling.polyline_interpolation(points, distances, relative=relative) + return result + + +def lengths(points: torch.Tensor) -> torch.Tensor: + """Compute the total length of each polyline in a fixed-size batch. + + Args: + points: CPU or CUDA tensor with shape ``(batch, num_points, num_dims)``. + + Returns: + Tensor with shape ``(batch,)`` on the same device as ``points``. + """ + result = _polyline_sampling._polyline_lengths(points) + return result + + +def interpolate_var_size_batch( + points: RaggedBatch, distances: RaggedBatch, *, relative: bool = False +) -> RaggedBatch: + """Interpolate variable-length batched polylines at requested distances. + + Args: + points: RaggedBatch-like object with tensor data on CPU or CUDA and shape + ``(batch, max_num_points, num_dims)``. + distances: RaggedBatch-like object with shape ``(batch, max_num_distances)`` and tensor data + on the same device as ``points``. Distances below zero are clamped to the first point of the + polyline. Distances beyond the total polyline length are clamped to the last point. When + ``relative=True``, this corresponds to clamping values below ``0`` and above ``1``. + relative: If ``True``, interpret ``distances`` as fractions of each polyline's total length. + If ``False``, interpret them as absolute distances from the start of each polyline. + + Returns: + RaggedBatch-like object with shape ``(batch, max_num_distances, num_dims)`` and tensor data + on the same device as ``points``. 
+ """ + assert points.num_batch_dims == 1, "points must have exactly one batch dimension" + assert distances.num_batch_dims == 1, "distances must have exactly one batch dimension" + assert ( + points.non_uniform_dim == 1 + ), "points.non_uniform_dim must be 1 for shape (batch, max_num_points, num_dims)" + assert ( + distances.non_uniform_dim == 1 + ), "distances.non_uniform_dim must be 1 for shape (batch, max_num_distances)" + + result = _polyline_sampling._polyline_interpolation_var_size_batch( + points.tensor, + distances.tensor, + points.sample_sizes, + distances.sample_sizes, + relative=relative, + ) + result_batch = distances.create_with_sample_sizes_like_self(result) + return result_batch + + +def lengths_var_size_batch(points: RaggedBatch) -> torch.Tensor: + """Compute the total length of each polyline in a variable-size batch. + + Args: + points: RaggedBatch-like object with tensor data on CPU or CUDA and shape + ``(batch, max_num_points, num_dims)``. + + Returns: + Tensor with shape ``(batch,)`` on the same device as ``points``. + """ + assert points.num_batch_dims == 1, "points must have exactly one batch dimension" + assert ( + points.non_uniform_dim == 1 + ), "points.non_uniform_dim must be 1 for shape (batch, max_num_points, num_dims)" + result = _polyline_sampling._polyline_lengths_var_size_batch(points.tensor, points.sample_sizes) + return result diff --git a/packages/lane_helpers/docs/_on_doc_generation.py b/packages/lane_helpers/docs/_on_doc_generation.py new file mode 100644 index 0000000..1d6c43d --- /dev/null +++ b/packages/lane_helpers/docs/_on_doc_generation.py @@ -0,0 +1,91 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +import sys +from typing import Any + +_RESULTS_SUBDIR = Path("evaluation_results") / "polyline_runtime_evaluation" +_GENERATED_IMAGE_SUBDIR = Path("polyline_runtime_evaluation") +_DOC_BATCH_SIZES = [1, 64] +_DOC_REQUIRED_MARKDOWN_METRICS = ( + "runtime_shapely", + "runtime_cpu", + "runtime_cuda", + "speedup_cpu_vs_shapely", + "speedup_cuda_vs_shapely", + "speedup_cuda_vs_cpu", +) +_DOC_REQUIRED_IMAGE_NAMES = tuple( + f"batch_{batch_size}_{plot_kind}_comparison.png" + for batch_size in _DOC_BATCH_SIZES + for plot_kind in ("runtime", "speedup") +) + + +def _required_markdown_paths(input_dir: Path) -> list[Path]: + return [ + input_dir / f"batch_{batch_size}_{metric_name}.md" + for batch_size in _DOC_BATCH_SIZES + for metric_name in _DOC_REQUIRED_MARKDOWN_METRICS + ] + + +def _validate_required_markdown_inputs(input_dir: Path) -> None: + if not input_dir.exists(): + raise FileNotFoundError( + "Required committed Markdown input directory is missing for lane_helpers docs asset generation: " + f"{input_dir}." 
+ ) + + missing_inputs = [path for path in _required_markdown_paths(input_dir) if not path.exists()] + if missing_inputs: + missing_list = "\n".join(f" - {path}" for path in missing_inputs) + raise FileNotFoundError( + "Missing required committed Markdown input file(s) for lane_helpers docs asset generation:\n" + f"{missing_list}" + ) + + +def _validate_required_images(output_dir: Path) -> None: + missing_outputs = [ + output_dir / image_name + for image_name in _DOC_REQUIRED_IMAGE_NAMES + if not (output_dir / image_name).exists() + ] + if missing_outputs: + missing_list = "\n".join(f" - {path}" for path in missing_outputs) + raise FileNotFoundError( + "Polyline runtime docs asset generation did not produce all images referenced by introduction.rst:\n" + f"{missing_list}" + ) + + +def generate_docs_assets(context: Any) -> None: + input_dir = context.package_root / _RESULTS_SUBDIR + output_dir = context.generated_dir / _GENERATED_IMAGE_SUBDIR + + _validate_required_markdown_inputs(input_dir) + + evaluation_dir = context.package_root / "evaluation" + sys.path.insert(0, str(evaluation_dir)) + import plot_shapely_evaluation + + plot_shapely_evaluation.plot_from_markdown_directory( + input_dir=input_dir, + output_dir=output_dir, + batch_sizes=_DOC_BATCH_SIZES, + annotate_plots=True, + ) + _validate_required_images(output_dir) diff --git a/packages/lane_helpers/docs/api.rst b/packages/lane_helpers/docs/api.rst new file mode 100644 index 0000000..072f978 --- /dev/null +++ b/packages/lane_helpers/docs/api.rst @@ -0,0 +1,9 @@ +API Reference +============= + +.. automodule:: accvlab.lane_helpers + +polyline +-------- + +.. 
automodule:: accvlab.lane_helpers.polyline diff --git a/packages/lane_helpers/docs/example.rst b/packages/lane_helpers/docs/example.rst new file mode 100644 index 0000000..4f74766 --- /dev/null +++ b/packages/lane_helpers/docs/example.rst @@ -0,0 +1,16 @@ +Example +======= + +Polyline Interpolation +---------------------- + +The example below samples a rectangle-shaped polyline at a handful of distances. + +.. important:: + + You can run the example using the script ``packages/lane_helpers/examples/basic_usage.py``. + +.. note-literalinclude:: ../examples/basic_usage.py + :language: python + :caption: packages/lane_helpers/examples/basic_usage.py + :linenos: diff --git a/packages/lane_helpers/docs/images/polyline_sampling_illustration.png b/packages/lane_helpers/docs/images/polyline_sampling_illustration.png new file mode 100644 index 0000000000000000000000000000000000000000..1caaf7819fe08ee86754fc0588ce8b08b3b31d6c GIT binary patch literal 31019 zcmYg&bzD>b_x~L!?LZ_29i_A&oytHYB}PaprF3@;i_wloQYGY~bPNy}DqSj$j!8-* z-TTeY=l%YCe}8bd#~$~ddtT>t>h(P55Pe_oE*&*HH2?r~_cY;#06<3k3K_dVLHuC7 zbngrCgUVCW+y?*@ga7_PMp}YD5`Sd#RX6iB@^JJGc;KfSa}Vt9v&H}d)OOFv}7PHL78I^~ilg)NFE$1hb1_pvQE?ahm4~$o z-_*6HApg0Acu`A4SyG$=7~#IQ)&Oa7x_Ltl)3sjEbP-IIIaps9G_v73GwrD*;u#R~rcwvA9 z_=hdK;}@uG>Pad^itM*wNjPTlBr*$WVZ1l_$fi8o9KimUc{`o&(OtZF^Q&}y2;AOT z;5tVnO=PU`UdMT(dqd^_@%EMxt%Zl;ujBi{U>hzsV^S3gjgl1wAH{llvkN+82?*Sh zPxHC0?L8p=A1*IVq+-8gbsOl<)$O06LaEJYLoas8m9~O)$38-l7VrY)5oroi8sHy3 z;%=}*sq-DU^=G@c1pr|9K@2bSEjJ5cE7EdwJ>q*rUop-2!ZOnrzk+lOoKIMJph( z>hNjnm4RJ>|GVkNOYO)oBs2CS-$t)HI6jcuL@RBUx5=Yq90-a06T(^ zGVeL}oeR!q%b@!7@oU(!%w<6QpWS1si;et_=_AyqV8qJbffHlN$@p!<{X&)hZ|5GL zrW|XZZfqLYR;QdTx9gwoUx&(-6(}tW{C{+cB#~#Qb2h0}Jk16;Q>4JzMzPjd`uQUW z@ULb*Z2abtN8CT{@#L1NwR#U`^mo4Xdny(GxB2%`U7GmT(+zIzv2@`n5?&-V{tDlQ z_V@pf7qbBh@N`Z=dkpRR-kE6&GZPkYS+vnYX{zi zYnTrARA@5$q2C*$EUaG*r|>i8)bLHz7TVnnc9}@~`93g<3u1X4rcV-KfJ3|qH~nW( zZ@2fR2g#<%e%=*|#z}Rsm)_N27jkhh`|vtA+VC38fJ|3F3LP_)>pGH_u((@S*I#0g 
zBO*^JYXi^ijHkJrzMCcSA|qb=Ev0CM5iN5#M~}|F=u-%CE;5hh7dv12o=P^+8kR@} zqAzkN-;k~h5B1f;iL>!M)r7xyi@PtwucaP;@#Aw}M#l9AxH>+>%y`Olak#5JyQH0O zT~U3Vt5*4RCBON{dkHLP0i;doO4{47;(u6w8m-CfN$*K<|GW4#Y4_3RzkjYkCG?)7 zQ1>MT?@TE}1)X`mf|Ho5V7DIj?vy9^X{&fBPe$uH~vpSI0zbvI-A|PaxBX0`7 zOxk)MCslYRs>?-Brx;%Htpm`uZDx3UMeU;uy=RjGu1@5Yy%iX(U1YO)(h%X=7 zZ1CCnw5v^~{ZXs@=qFx1!Lln8CIy!_LPZ+r%}6ZYfeN2bm36$xC!#yriAl!MdI5LRmAaPyBxpg$V=U8(WE9sIEs>3{g zaB?QkksDh*FXmRWsku5(Tf%2yttU*LMlw^w8~I;lN=f5Jgk$N$T~rKdNr1_b%?1W?gQ#`F3eFvK(9~@+^B4qYi(K)Ua&; z2d?`?Y+r+!0yYOt9tOCGv?`VMZ*Bng?)@@=eryTfGe*Y*lf0{2YP&)1XUs6Tloqi$ z`?2TqOnp_5S9aaUWho+~Dy2QtcKOYgmON0>(U|i)Ascy|GL4`u-s`km)s{ z1|D}9?hDw!wz=?^TTkvG&Tij0F`_M}0S;}p^+`JuvW$Yv&PKax!V0`tx`8cS#}H*|*2mBZpBB_ul0QH#VmRY6nj=7SARYC4J~1 zhq%7DPb;Q zgT~XBKLxeWAQ`U$V#30xWcUh5NxBr%<$@IfkL~X%c*NZ-x(oBB-AaHa+a+u0L=Uh@ zmOkZCHp@GBK(N%2dlO?;yLGzxKtngf)NgmZSl$Q>)q%Gtu`*TDkuZ@Yj--BIc`?Br zs90v-eEn@R383ulvr$3yy^@aU9c6+-P12at!56%WZO0QS-wi_(mGR1%_h*kL>OSh^ zUW19fU#UJDp&^Pk42qVYeEp3{=#96tdV9^y@4uh3vyQFj#hb-poL5b7bft{ZjhAtQ zFotzTC0E4{pSzo98I|p2<~XE@P|s@d%`~n z{az0+lv1$Kexo12^=&$3da_Dx&l-A^2)g%HMay9X?W{hspqC=WNS#OwY+1x5e2}P5 zKqv z+y&V(FUR&>qT}yVE>0Ua;KK<0?k9d*`2~HF&8DF9Tz;~u+VvxaD5X1=ZcU;C7z8Ad zJCSHmzDI>%6|u&Ncf?I5^<6;=Cif{3KSUq!vm`YsIru9$X4|KwLvYAY< zW=b4k1T}P)MU_+uC2n!zXDQBUQ#j?i@Vws{Y5gUV>hpSt_3KYMiDrF)2@Sd1g-!}&?060$E(s4y1JB8PhIEXDt9X_=e~X;m39j8;lCJV>nv zWEMwtMmVCmHy0LXO(PkAlf$ze_4xoW*dt*?_@^13PIZ(3&B6Mpnn94ZgmE?JYU7u1 z0AKJp)%a&z!kWTEYd+i;ZJBm~>n0+9{n5ff-KLS0u5DqFZy~@~&|=`JDu=byZZN&oqdk zX+FF#hxt<$wkEmM{P1MB@%Yd8u-7BSdKb}I#!?8?QYCt0M6DJ4oHkT;YquCl48F?u z&+p|U5anX|Hf`aGPfb* zFY|?4I&teXeHZIZh=(2>mA^;FDW13AU*1G_{hR|q*&@O0Ru;Q=2Ga=ByYZS|&Y78w zB9Fp*-Vwd}c|hpd=8J_m#_A%duNE+aRp`4CJVy?k2YOa*bj?9~a&aBXnDb$wO&%}2S zeA>9aBsxmCZTeD~WX%F0 zP>a6K%u|%IuGe&Ed~Vjg*aQoj`FJ=a{^31}#}!UTylbF*i?&jaHzt~K*p zD)t|nd6~5`;{l&1!xyW%b?F8Grll+8YR(?w<`@n^>R`SvE!&S)X+;X-xw!FF*s zDj{1M!oS+zdR#)+mpHFluQ&15$W#P8X-X_v5qe`4wDp8qk?V^8+&N+1*mAb>7gh_; 
z4L0)4uNqe^5yC7o+|pw&e0xKs3n`d=FuUO^Hdd1VO3&GGoS-Tu@^?Y^IIotyp3W5A zSSif`uYW6f{Yd|cH8B-B2zsRB6aQlty%>+i)%wQi4^A|uwr|`K~$F|UKHfy+FHg~b0zs4TFRXUQ(Wo_&2X&?H?5TQ*Hd#6nB^Ocw zLu{3OlGU;=lW5kK&we;Dmp!`+H|Y#m=6vDnVgw3TKdX1Y_YV9z@uMHpM3|s??4AQ! zUxjf)7D?h;nrp#E*z*l~zBWMBj+O41`hqEh!JqAvG#~5@MdC*=SKIF1!K{77&V_$G z{AL5gjt5rPIlrDyz@6p%lu%GGDZa0+;9nE~S=E~2CdyFDxb{k*mz_j|b>xfa*?C*5 zT+MH3dP6CC<(*ThHH=6iAlf<%Ch66=XdfleShmh8_zAcJHe1Hz2sZ@oE@F=!5jILe z2%^$UXDD?&L16wkd-RIAcl47qn8EiMe!W(M0b5Cecq4EIiG1H(GyeGIjd+F6GA;5!1mrO$zdww?m0cQ3ckK6{ZnWn)enY-ztr={6FZS z*RHoZ02b+d-!FC7xs_ZQ#pDw)D#?(iz!0^YKHL$~c{J4-+~7_UNR1+Sm_na>F;MZk zhvHq&R@st$x8?XN6GJH$HRWLzudlcy)p(5b)g-ZKkH3N3f!`5py#~W&lHKUYxV8kf z?cBEa_R11GsEM**TY3?gZOTM7Tx6&{Cweq9qt;A>c{5x;kk-|^-E;d{Q}x6kXo^!u zAXPVZ&Ua16V_VLy*Uj?&l!L-gdlHFcb^r7yeIp8I(&{q-vyv`*Gh?d;V^jmM>-Bl% z_u1rz-5G6cyr60m^-+H z%b(4Oa2RD7QOR3}R8{}98t?63VdR}iU5sCrQl;6tQ*y!!V~OdagKb;j*$KPmux%N_ zslN7DYj_$J@D+KFyv{FW#@)!2?Y&IqpI^ezC)c+ct55wYD~hmnJ%;=Zooe0w>uaOA z)`W4R9A9ac`aYvNLoWy1mmJ@^f_i`Ao@2ihZWuK%q4gmnBsTYPCwJV>LoIKl+=bg~ zbh&ZZ^~{7}dst@wEImL3O#T?}>9K!U-Qiz2WH~e0H{dIObVuJJB$qPHPRvm| z)xgKvM$Wil2P*#-6aMOF%qEjFw*|}A`#F-cJlkfKi?5@)IHBfOVKm4RE9>yCR)1@V zQV8`S&lU|{1PjHKtj=r?&RDBXkm`Eg>)0ikkRSD^cl2`Qf9T2UY28A2_Tk9G!1r&Cg)M}(tXG5C&D{rQ1HD_xo zHmLatY^n905?w;!JFuK2wq8tMbmUfdA%Qla+q6Zx<)YKbIIqG;VfvGpOP)?#{fdz^ zj!rGdjn$I^2&)g^!O?u=AX&@#TwZJIHy7HgZS&1OG_FF)n&MduY;8B9=eC>kQE{Xi zhf_&|qBqi5F${h%@4KCKKM6H%u`PASGdaop+(zvcRD~8BQ!PsULI-t2-#_T~a~NhJ zBS2@DZh7NH_`Id%QLN4T71J2MD~PG9uwsb*?21QjKFDz*KgcR6`<@8ASx$k+g#xpG zPL@nexO{wET;@K*pGvwovvRMWY-;$(_bXkwqBuNu&q;o&@_J_IBW=N2X3RYV%Oye+ z5lg$!xCML0wIl<@Z0quK~b{HuI6cGX)8a|SUlX`**Ptz zT@Y(*j!$rVQuisz^8JnR$eGj+l0|owZN3eK*T2fwM&P}qt8YE+G~Np|o=Kor41bIC z?-}2S%*rF*inWglckBHz|5Wq29m`o|yxQPL(&ZVw=Fft>NUrcx`e zoy4BFJkXoQJ^E|I*oPM@Me&a>cT4TR1rMUTOe4zyQxcoLy~d7=NNt2^OxKd(7CN0V zdbh{kdx4na4=K$4i7w}5^i#@7e|os@8~kO(*!if=*jYf-XR{VO)*|dp(=sd5?8zWG zdMeB6QP~XnV5^9qP26&am{j{(d~CA5eumIa-ifZu{BOcX0mjvt$hOx6->Us?z1K2@ 
z=_lrG^(8Bd;p@-4+rMC=vBprgkKovcUz>umIpSH|7@~gMao2I|p)W(=+MBuLkn4L4 z1)x}T*Lu@1-(U$FTad%LQH~_Y4&w}T{LQ=&T(kX)p6DXG9rAn1Zv{b*>cQ#cxq90;HmtE9 zN9wGqb7r{B49t zVqdib91g?U+g_@Gj*jllbhmoQlsX3$E{igSQ!l$=g%)HF56;OZ1z^d>ds&4WaW`*+ zjplIo$1|@ReJ8ZcuYaCgikRq`B$VBSh2?R-)Qx2nZu}TOkhVMAuRR;<=`iMM;u#de z8en%HDFZ}wzl+Dmce@T3PE-Nkvl1Y26w}3^{_#>2 z#0JP^M(e*{(LZ+9MkK?$AzQ**9MDV`2ljZj;LvYdKCmh=&!6We z+$PXq;=08qu!KFh{jH7Wv(QP(d^*fkk7m>)NvH+F^ayCPA!cTYsbK4Z7M$!*N;K%F z12Ov0d$NjF{$wTi#e$*u2yQ4kbHbFDAY^g_cQG$j zuh#RB+)q7^s$wHkmL_6G>#bC?%UN}Fr&K#Xp7NQL4FJUkHxdGys|84<{zz z0#~w!^vOG<4pm#oj;oz&pcSBs7=mqE;e}T7VtWteQo9ih0c#_5eEn~ha0ccq=VBqC zQP7_{>0y!I1$I&HW(Sr_Mz0@m;(J+fojEKl0yY?O4(bo3nqvx zuYm0+$jC7REMl!x7ZGL-FQaH7I|Tww2WE)*&i(tiTFi~Jh?i5}cOR;{DEB76Bo5E6rO3I+v1??n697ljC=#YKpCg?o>uLHQ zf<(9SQJ+451o~1pyF1=I;%}D=$qMeh`7@SC5byuR^3zB`{ePO={?%fv z<$UaLu0(H&IutjkDCy8gDZNQDK%&=M9ZwG(d%^L_)ZR;LD?{YSlF4+L)^aQCzN!1G z+^3IWhEYv4Uf(iE1s$bzVz?|NVKB<(cWHlozVwoe?|MH~k8pkW@kV%MXx+=z;Y#ls7oLQ$Wg)ts-xr^Lfli2#V7hparMAngZgY=Z%_NCa_ zh-s4)*3g#G^EF7n@;>us$8%jcKS1NT+1$CiJ1qpG{PmQ~4k?f4MTB8yeJn#$-{+r+@yM}H|3W&Oe7{R!npiy6-MWz|#S7VFmTy6o{wjS!e3p9ynQY(2KAWA+K74)z8$s#> zuBaT#?gvh?_g=pYjK=2qZ7WvXL+DdAtaAhDBn+i1_a4@*PjEv%QJ-oe%4wUz?5Gz! 
zRC6dlH9KQ&#VNgiWbF`U|FeSXW>MaQ{^t#;L1HZKV=0f0ny-9_;MWr@`LkZm&*Em2 zYWdZ&Sj$sx}4xZFQH}$RzdTLzj@?;RW}QVU>KCw?s%$wGIM@ zYv#;4;_HzoStO+${FTOH34$s;i;BcNZ3SKcdHgxdlPy;lK)~H8ren#0eZ~tKt=HB{ zc41W)4`eok+S&7CyJWn2kLe9@sqL7TH(xNKc|2tt0co3%32Bx1|#DM%M#dU<9csQ2vH+o{b9po}i zDr|xq3(Ah5^30jZAtqJ9a{FS~iAZuV*q&>o@lt(?+(tR`xJ0>k3IYK91rnPs%JQ@G zNQXMNryN&)?@q0aBSRr*_U7LH++^Z2;eiW1_cgp%bHmNz$1M*n^kcu?t)%tF(uNz& zj-5U8J;s(y*gKXbD%w{^qOc6VDQpR@#Z)m}GM@L&ftBFcIP#+J_R4XNQXFdVY0A^f z_0rxk{j=g787)UwPn58Zh$~qOasso0^=80TRruvOa|u&*!m6V@$Ml;sOSYj*Wlj|=asNNY`HTwwNtK0<_B zek-f-&>yP0slFsCBdY1=Y|%Ts`#MJ`S!m8}JfUfoK!dP)@Y&=0B~AD_w&a)gw~fVF+?c0JWB=z2eDx@gtbt zI8H3gJHNcE(dqmcg;00q(<;31HpV#yPBgh_f47a%LJNzmB;6c^S~gRPl_2~e*vPxP z>3We7?L0~OEP6gL-3q?(HFeF!6gRG%aK?>y3O^u2^b3Bev5~0aG;Lg&1CQriPlyEw zwbXKQ0>|V`I~r`1jP-eov0a@I2Jon^?^q@*^TYI?Oz3J)zLQxD;azr7`eNY6PUJF{ zSNpkZPYy9-pNBcDXd^rDhy;G^QO=!dv2d$0BKc{KxmV z6GTOmsrS%zU_24|;t&@x?nufI8s6PSgW&~<7@E`0<2j8E=q;5}9PJvtI z(4$?~tFPU-bzQQATa*gX5pHk(NDILP+g8J~CD7g-*)4eP7V^W!s!%@E8s$%=uU z?AAL`w`NHmur+xey~Tc94`-HPI)vKX>!4)>L6|q)mJl8WJ<{T;e%AV&Mm)?69{x=Y zc`Au%%`;t;KRgiKop_=4aEl?R5BOaN4Oj|b6o(wdS~D2`l31ww)pOGo7`9KY81NMn zS6bjlG+*uXsR!Og?t$`^iFPw>xeQm*OID=hES<JxcDv8xZdN%ktLOMY@DH|);! 
zvys|$7O#NH77(aA^Aj#h*cU_WJ!}7zcC#J7b7wu4gC+ZUmKcP_9ii20C?4T>LL*ixJ9ThR8-k%Pf_yi}6uuuX`) zI?EH8i!TgSMg^7vaS@rYi|8Xvn+itrLD|ZgG;YD}rB*9t$_HjO34l1@ufmS=SEG(a1z8DeAX_YpNonN1x>#nJ*6l-f(mu~ELf5c2C~1Z~gTFRxuI0CS4C?Td+i;PeNY^b z>7XT6r+6za9LjAlcu|T*{;tH>ViUb0Ec2k~=$3@z3GJbYsspykR&ES-l^N>{4MM=f zd0;HDU9J(GUp`G!g=hKbU4xcD(W^)T{=9?h9oYFRJbV+Fj>*dTzMObNkXzV-qHlM# zl!z!_J-I2GFMlP_Nty2biUy+jVg{9Ed~cS|Ymh5z6F)Pkd5W1iYprlfNUgr_2xQD| z7ZN16UlWXNAEmfHRc`w%a&#&&8J*Q3&nQcI=*;CtJR)Y~6@_xYx+geJ)@O=KCy7$x zgVoPIO*@#~_?DJpQ*Ah;ChM;+<@z6r(S3;f82H7)n-N_0&ItAR~X#PQHH7rP5$PB2zKANKj-2x*GOD z&_AYY3d4@uY$tXv(h9;PXwwCF%?#$>Sy2#x_bblOBk`@l^A#KJbILPb2<7cA>ay~TqUfo&&PQWn6xFL+)Mr#H1}D(DV8JMN|NVx~)*b(UR9R`h>8 zvu`HQNZH!?(cs+AbBlXRVV~C_^m=0Wf>1K8TWfer88I`|LlCaR4oEyYL~48FTg`5w zx|FdFm<5wOVFw$nwlsj2matGi(G_w0kl)3zX5?MA?W_DjZ(}9QhzG;&s(;+gXPt!@ zh<%51u(?7p7*JXKRWA=-pas`sLKB$L2eDmkVR@-nY`6nuMOV1@M>y1e&yXOLxB`|A zzF_+?_U2YO8uU0@_`Hsdz|jF_%$%mHOi-t?7B5H|k6nl|y9RSw`aZzh+Ap7f4aPF( zIMVLra@NFTH|UDC9wEeb5nBK!2$L<^aO%uINk_{3EZ<(<>_|!0pLG`wzX$I~!-$RG z&)p;LBJ#w%Y>sotfu3xe&7jva%}vOv;LID(Yn!mi1D7{+2DdC%b*i zlWO6JS5|Chm0h!fc?a9LMR!qQ#>W+qJjQ$Sh_C`q=RRVqCW6}#$4%nn72$J^uU1}q zlriI&YG4UpL6)CPDFl}kyt?;wWc4@#dUR`p;K=WO7EZUt06l6z&8O&$&@4V9rWO!= zL^^&tfaCfvS^yfvq z-nODRyy~VSl8G3aPYl1U7M%yY8ytfxmt$>^ z@t*h&taiGi*YSi`b!)f<{j>AjLvP!WU&>kY@-$bOFa!H1Cs)f{Z|>$E$yFE3mE`uM zMt9wf^dKoNU6DcTg(I4&OqXXb;LBTtABak5E39}c{snc$M!aOG277(#OmCkucC32c z9`=cH4hp6LQd$Wtf!Vk#HC$wa>n`K{fvxe?2UA6s4qBmC)|uFdfu&_*TE+j;`1`zd zUlI599kFNSM_-F=zT%+gkKT+3%Cy}19Os&_SB({Q)W09l!qBtJzZ2Y|-FQ6Y-+E=3 z=+M*ep+-kq)jQ59_bVYvBu!yFLb&LL1F$s%( zNCg8)=+LGKgemr0aU{P_KzLyJ-O7+-!p-mlImEg&k`ETI%1?mnuM)(tQ-u*=)JWdm zMC{3mDNE)iPbnWAvA%o%{R+Qj(v|@>PrU$CDO=rX`NV+HLqy=XM}$pH$n)cl6dv6@ zo+_p`w*kRe_4(VDNnjAxu9-8J?E^u1W;JM#jBBtskxYxIXb)*bFI>#VNymu%28 z+4Jr*oDAvygFQR4gu@+4isvW>9%{`|Bp)nzxu;BO^HPkwV|iEbOSKy3aY75=87Ul} znl$%+2qE2=k=7erB}i89n(S>IYgwKn=hO@I5Ej+HxiL~z7eB8N@#-5g;BkeG@*b0= zLzd;U0y!u#5>qP>0A+(d7dkScggu5vcdlKHqfMRpOb!fuIjd?;KxJRhY44XlREr$3 
zcxqv1q&vv>kywL6fko=`5WGlaadpY=_YvVp9qUN9X3peHuJEhGf-I8BlWpb;J#gKV z<&{1F;ANrE*2!xenP1E9HI_5Jl7W_dox)v4OkEBPSN!${oCr~D@HIJdQ}dt<;vq>4 zu~^)1+#|pkNn6T6?wxXnaQAwDB{o%r5$s1?uZ;K=v(YK9Dc0rs(kjJiVxYK$Xr5l; zYkB~eE?pro&&k6iI54CvTVVkZDjX#pQRC~?NE55?RpEpXXX&U@*24^xxm5hV8bhuR`%9V7ZBUQi0 zU)91@=*)jcIDw9Scx2CevLyWEKEA>3Pd90p_|;_1iu$~XzRh-c6$4;DZk4m}hp1=x z_bUPF1aPavDhkm&;|iE4e>_U1Nc0#9Sw}^k+A6vpFB>_A!Sym80w>rH1}iYxd~(^|Gattcy0Iy97ohGgj^u!JmNXb`qo| zu05Iba1|&T8pch5*-Hjg`e#S`kS+Dky>CqRRV`2UhvSUZs+WBaPDra{{pCWmCjAv8iEZpOecMfKt*uV8MXx?jt3*8{Ob&cLv!EU0 zMRa7mxs8a3Dec>S0_}FxcqrhTuHNW!y%H2>++7=5+orZMYsi*HxGSw1L)iv%b7!{fEOuj^^b~TKf~y zfZbGPb2vSzGPNSfWRiMQ=54RUXnTazN6?g{l}_=D-Sa&du8ZI{!Ks7CK9=GFFKC2M zjp2H~3(KKUxxS^EZjHr+(+}2(!OfkXL9FzvtE#PivrFLzU1jAb?BBWV2aiug&X1@q z_mHhl^mN(76H6to;dReOJihj$f;u)#*z8`CAkI?31Ud!DReDJa z6$W99#4KuVxQg%qQC-5zMEjEn#JSi6AXj3sd8h7|WVG`I^057avIXgOPu;}G+$j^T_Bys_cxA${cLAl7N$LvUMJ$cVfi-}LOv2hq$aOP)a^d>JhT+s;QR zl6`1j5W_%aZzE*0^H#Nv3Tt{=f>a%wI1&c;r1WarM=)Sr#X7c?ZoJ#W?zejT68 zU?reOsi;raVt3rz!^-4J%1O_r@{_%egwOe5`uJ`3HIji{BlYsH?Y&@`hulj1x z_WD11rjxWB!`)>Lymh{Dob4|PC(0083#%OO0lg_Uln<5yD7}HJhY&x@#Jg;E6v7Cr z#3qYiH=EA=M^N`0h8_!B{1gdSbT%LxgDJs#a%R&aFdRlA*q)^mZR>8xRo5cSS$_2|U1M*s%J3*(n7ia!%w|INR~b@@~r z>Kt7hUdOjJO3aHhawp?01jQTD?fB=43{-~2lvV_kkw2z(u3(<5G}P78$b_8|;a#2z ztV%BPBgO)Q$WE{Ql*eSMa1d#SlmZ)*gpU}XAIwqo?-=_%sapb9G@wTUo<%5HBiy3v zpa^yf1E}h&)F*qo5V(zS!(0he!94E!HXgSri45_3$caHNqKVxr>hOU}Xdn2z8qB){ z70U0a^A450kVnWyg{s06cF}IhIkteUcH-UCR=2gxV;AC_iH<&3JDq!RaFwve|P!wb1V@ODB~Q@N#xK`dDV+o!BbEA zo^q8MK3`)|l`MO17oc}A!R<6QUF=B}>>*h3w|!KnemfH)p4r^1;=l`^x3>Iw>}#nz z>4zza&1CT5P@iwYCL*MAJ8PqoY%l+|QsP1^{sk5kyc|CN9Z^|M13^?MZ6I(6bH7Tt z)*=+j?5=epqc!veB2ulVlrrFWf=MtO zis6+m&r1;d!G(_|(KB(eA0l2T)w;EjGv#*Z1}M-xzK=We8X{9k$b!kzABt`f!-Z)6 zjN0vr%ewJh>n)-)yd2$6_$v$C_^)^!cL;QD7_FD1U((L2lg{hBB$D3QzG`HEE|aR zvz(ya+VU!Na|5Hy&RQ>d4O$A(%w?MP#}u}0MA-BrcQ@&siS!9uZsR&~rW}DpAaJhF 
z`2Oz2fOjq=A5`$_esK{^pyXV)VZ?J|RD7>Q&M6?dV>LN3(<)(#o=vE1_H5}W_xJ!I*`-n%G5>1mTs7~j?ec1JtXhD2rUbt`soe>fAF%OSFTXMsEv252A&4+Sr9`}R~|RGAc~oM_qxsE%LHq6eLqHc zH`CsA9dl6KXS_?jKfuqhHqdg;O>odEHbO)Ir&!aXCqSKJa^JubB#_4Q1u;9br>KA# zE$5FK?>&JYUEFel$_)={Uf-9p=;j%{2J`4R%k4XmF(%s^HV; z?Jc!rAON(4Qu%q-jeJZ==mF3=`1aJG+}MwNYyZypZ-KVgpQHb3%U_`8f+0=>D^<7= zbt4(?2DNhPBG?Ru{UNXwP$3PTHM)q!i z$$8mA9Q~hr-u^pAyLse`0P*Nn9mz14bV1}IdRhxk$KS>WkcRbyii;cLh(VFnF6P=o z{n?RB1g|J`iyl9LnQtANJY%R$1cPa|ZtwH52+Kf;MHef}l>{R0$HsoF0DQChI5$O= zFM$rW{Lw^%@DoDnAMZjYbLLcz<|vwM6P|`=N|?jj01b!kU}Afj+K^yt&h!uj#>kB3 zKps%!Dr>^sFQVsmFpH*!+o5Sv7b-L36X%~ z?+YPRsy5?ILGEVwKWx~WSUxMs#bZW;dVEn&GxMR#_h56br$gtl=!yyz@`wS7U}aJw ztB=DeLun6FP>LXU5a1JsPZF)0W0ZW_n<`0{o-Ha6-gvgqkj(?@gyd1003(aRxxorzujUq_x=OO{1^~7X($DDkX;<1l?oj@| zVX>*zsw6!_kLr4ez4EDr-|vllqzht?<62$%%?uJ?s-J}ToDm!xf8N9s(SU+`2JH1- zn+s*yGS9RB_IFrGeRu|fJhIx~97theL<`y`MYI&(ezbUWgZyW*O@g*&%1%m$^4G|d zJFfp}u;imlwH8z-K6pbpFU}*R4Q{vkOX2&4aXDc9@cvshQ3!2*UN4uKW z!s}rm!W8H5&?Z)?fxGU~8@K<^BVNoeG|}P%@l?>DR@5z&<_i=j%_QBYUhW}kOX5*+ z!`oe4&ax(IKkCPWZQJg*_elYw)3lXx*XW4wEPS1WeFocRfU>d7*1lD3o|*bKQTY$2 zb}<5LY(Vj>#I6nL(VxLlx}^T8l>DlZ8T5$&erewHxeRIFMJf% z0Swl~7YeA)JK_!SUQjs`gpA*Wrh7+))057|i{q02&`yr3Z~|_XuCO30vA40^SWDXv z;D9l1@Id_VEGXFIHHBu z&@t`H^H;CW^xLUvc$MO8t z7`qto0|^lR;8?+nYHNcL9RxGH0gL+el$2x4NZ0my;Ek=o2a#@9e%0Twwcze_Bmv$b z&uBhvM22VCZ*F$}%ZEx&12qz*L%wm)WHnMWBL<_w2P2y0^XN-fOl;)0T=6^@vOtT- zRpdm-gC=E75h}&p+cQQ7AwI8WHtyqmxjQMU34;`AES^(30Nx?*_xFDZQ_UMuh(kWD zaEnn$Jd=lL9Cn6HJ={p;&YxaveOaX*+36D$_N3i1?qDMDawmVIoR|~LkWS}Mll0ll zKP0rSF9Arfj8sqLj*SircpQUr#nOFVVji8;Y+Ya7P+7YIuv%fI7%1PK|Jfn0`B&|W zR2@=DvA#k^?0-C=-A)}E8ig}ORkG4!{Xs{v(p#8U3pD>~GPOMsL_4UM8Lo0t3R9K{ zn62(!;Z`hqmH*{tcoR(^h$P~j|GoE87YVSi`HMKpN?w!Pu*1E+cU?gC_aMKMsy?wD z?!;K?`zrchsxdvpdp)Qs+El`QAnP?JZqfm}&u%~cBtb++Be5;v2f3l+|C&7$7x7pk zXS2WM-%6YrF6pW+=?~A+1+E6_80C&A@N6C01l1AU_juK9ylBVD)@7)O|jv_N6tTLTA{^jV-f52%k+uDdwlUz69t%(cbMTvHu zlw3=eE&A}6qymRiGYi3go$X&CW^9=Qw!l^|a6TU}ljS6b?IcO;-1l0OZMAv>a_nG+ 
z%~i?su_8g)#J0TuGGplG(W%<~AVvZa5zvtVd1WfVEOslTA~7(nhGHEaA_-U^vNLkTZ*3{}Uk$nr=Hy zdkMw}$psR~M2Sc1W7)#;x()~(_JNAp)tNfbtCyyOl;!=3?opk?f8I!;43&c(!to{T z#9l$eJ)(t50u-DPPUnnQ=`63o-qAeKMvRoM^uC}yZ)TXKAPhbVApG;+2#VJJuGA1m zj=82iLRnG{3F}hIc~kdIx0Y!|Ss1;mkHs{VFy%l`pG{qZ zND}Y8q=FoA0F&O+Cv_=y!;dsC1~uSc!vjm8%w;%*9@hKU-2QvhEoD+)Al^LY6WSc?8K86Wnn|Utj^Zot@&+GZ&{^|01&E-1J>s*fGeY}tN zd0e17lyC6h>2noZ_h<`)mZP;^_x?L)3L7E7otGUbFDQ9%P7mC<0(iy&pWQn_tTCMu z350^IKpi~O1$;;u)(*-NjWUFr8cBwp*=#AN9mFu%4=;jb2?N@jUC_aTd@Wd_kjah= zQS$rbP~b7AAEz|{_rT!w5=N4UJ=`0p(x3HyRil(;0+mqTMtgDjyYl93*9!~7434)w zA-N?H{U|*I9uez`MwhfE5o_^EAUg;yndFpqseH3%Z_DJt0ZLCubVwG+ND%S%a}!^L z?aR<6RaJfVI=k7V<0mL+`ESySC*z^?(@66Y>eJ2tYujl7I<@)pMX)ev^LpSC9*p0- zp56oS8JpJ&LbCte@_%t$EH9vF3B)J;gNf$Q`7gv?uhKa&(C-;ZyG!id!&N<+JLtFf zg#_0DHGuPW3v0V(`}@}tbCe+aDq|;L>|`n-Ts`(Yjr@i@r_1Q2MMc*189JdFqZl}X z^;Kq-t-2?e9pG-@9u8!s{zgap8v0RbRM-fe@C?;BPm1s!UEW#^j#vCwisarvf|Xyw z;l8teDKTb^yBTVfE0k%1HzXZ@iPC_VFl#iU)Pt2Eb>DJw+V^?6FYIMhkX9b~NG%31 z_>d*Rl;mfFh&CorHGrN#^u%M#8&yd^OxInmqTy2yfv#V?0_}tt$s$rlEEgd7jeY(< z1lPqY_f^VPBH(jrlVR_6L0u?G_&bDA9ga1!DpwLAiWPV-!Y0t7#vbU0q}(UUfH8vN zZM2&7^FtThT_RQ}wDpY;sycO6SQElifzuJ|Rm#7I(9!`F0?Gy9MAT<*7bd&7P-Rg0 zGq(>8LNm)~tNUqt@sRGFVj6vqBV^e#M6N2ki{?!4R(hG116k1O1FDmUKI@=OQ$WXo{qt4Itnmm#kK%e3 ziIkc9))mTrOdJDef;dP+`(1zI4yEgicf(Uno|kZlex!8&yspCfz-w>RhM~~FFf{Ld zt*PZ0`MFVacz(K0i6XQ}Skv#;6LhVCzy>8Z-JW@RY<~R5`tS-}xKM;AiQT=mSJQ42 z5dzpyOylE|H8n~0eVu{z$Dn~1ttBL8Ca>@dHLI0b^R(Href4tkKMgiE;+f|1RTkdk z(LLf~F45ivOSg%=a@?oBf~Bs5p6rHN2zYD8*+-$QJLWlthSDba6{!{-nbh0R zOrE(K66x^c$7i2lCOe$~Zl9cX`Yt!m?ov>&b44Pyy*~;Y>bY(3qB{2=g^hdDfaj-E z;HJXNihQ#YI{dJ7+4_Z&9S?}dD1l>R5p74p?-qgPAUrVkq(%IZN5G;AQIdj2f{N_r zi>WrXwJKVy343mmsC2+8Il?oHH)kUp5MnJ+)hKR>`$f9^@?eugk!o9JX|)7tiaW1N|U?t+d}%BXiXmNm`Ugz zBW7jAs!vT!L@&NHlkXcee!ajnW52v78gbrQA~GaQk!)wzlSQ$mP72xj&v^53+8@)Uc&1^*mKOcd%MnLZoK99@)a>E3SS z%ZplKO?mJO3$vLPv}~-?vEjNiC|VLR98>|}i;`nKI!i1r%>0NBMy*>$%n>Lugfkd# z3vKnNUJ@k~I-f&C9E0&Z-r&$n-povR57}Yt#c`3Q^k1j%6CtpXbg4|XEl`4qpFrF* 
zvklblYX!Fmb%=<%``e|w3N&kUX8iKX%siWl$~W|CmlywMB%S}LTA91H!kLVlFnV6I zw#yyEfJ)xz*&xu*hj_?948RhfX?Nfpl=~jGHbV|NFTg0FkaKj`UDe_BB2mwWvqeA{ z;piPVh{Sm15o2iJ_x7x!e>|z9T~8to!?I2kNB+B?&7eiuqO(pHCo{%>X!GkQML~%H zmm-YX%|M&@!m@EY<>f|owK$31`SEQqQ@4ygh(sQ%t4I|s$IyLk8q3mklo}8ib7c=O z`y>OukX=B@j#)N}9cj~JryMqyjv~KJ$a+QO?r-YfQfaY8lM_*_@?*nHyY?<=S{AqWm)1c!^n)>CQE z%o=?Ou5tY6`e)|-helEsxS*{~B^gK?APxyZYZU1HRPuIBuEE}18mZx4e?`l?!a`$x zF>(JyqKgiPrL7nthBV}ZlrS^vN$ksU$zQ^solA##ur359)r7v|v{1*} z)sfte9NEXw2bKpkO~0#UH}X$&&s@&sl8^-4jhS_W$~ccRZ5=8L^rH1+L9lv$rak1qmN#j()qEM= z*xKHn3=YITiQy%c=4aoz$!*6G)qxM%WW=q(dvsj_iHB!!-xz3LhyU@8mx+n5YoRlPUDeA(k&saBqHNF8 zQYXJI%Z_&630v5}#+MH2bf=DK3b-W@IjYiXL;Mq%# zkx>-CY%G1EcwMdO4I#CxhGTjG%@gDvXj+PtNN>z;@HM|7Ki3>265o=V$~SH`4h9x4 zpKQa5xP{6%Ta2e83Aac?^eak7>06Eln4pqSctPliD0kp=f9TVlgtIgi)XmI z_tz&)OUPQP-MPJ_QB@|t2vR6NmQ|vfBOwoty8d`}fTUSDenJ8;^^Z&8XF)fghuc{A z*6fO>E{unT(5}pY+<&rXede;WNM$s#3EbdZMZSaov8E-W1)jDeCIS_my!U!{EnfCOa6rn|UYtxN7 zNE`+A8%L*x#3bWa9(jvWc_C8Z_weZD%~dU$K~Ib5@(_%kCH>kifh`xS&)7(dAq4lgfQ`PGHb6A}4gUnC(>Qg=PW zqWz`)!CSqiW{q(+tXrl{ipzb&i%{*Mm(i$38w!-70t8}+ijEf>_*Mos+$21>wxpGQ;YqFrCsIVpu^*9rizuN z>({oMc?Z&mBVis9sHj8_WX;;@Nc58~ktkJImTsRIv{;I(8=%Z?F5Usa993yq`QWF= z(b4gd?KplBEeefx%^=+(RULwN2*&j2BgDAN+Ed1V!4#2~GsR-~IaXPW4WRy4DC5oc zfwiH9Aq)LzS*R#MOMtFn+UFdtD1}RROW*7{aRE#C)AP#1f6e9hV8}u4`1IH`Bn(Li zuJH3b8ocIr@?-@h_1x9o@T#w?+bJZraQ#yw>M z|D~)=ph;8K(wuN2jg}sE`TPoV!-J7d*~fT9s;WLcOU@aNQeAdt7k9q(a#A?}r*9)m zVZrQJ;y<4>cX&gnc+W%Ud9kWP)mmWJi_z`66b-=9R`A0D*mm2) zZ9=|;XkFm>WmwI3ubs3!vP6mDh)MO&g8{+=E`y+WJpHB){Rs z=W#-P5^*xu75+SGRw=H zk8SA=Zh3z79@i%Rt@B7+snH4cRNH4F#G#JXzHI36AG^7;0<@MGKDX_x`O12P*tk*$ zt+`9Z^py52Ba_mvJo0{fcj)m5GMZKs#-bk{$}tV!(P7m>hPK?lX%w)H9WyMEA+X6- z`24F=A`P88js1oZ7M>Cb3s-swfyBBR=t||bks3}$Mo#Ya%q$h;A#@v$-MTdZT*_%; zsbV{wOUowKar@yI*~Q_#yP!(h|4i2!1ca>2TPu&~xFgxDLt>cvqBGAUA9t(=qXq<{ z5O5Ldl~pZa-rIFvu2lRu_`Lrj;vp&3ypc*MkgAb7*Zj(^m`1CCjfhFbABqmtv6VQa z2IGObYVI;|Lh4|kXoHn-H)PzK>5y$?g$jXSM24P`?ABK}8CNf;Y=%$J&a=f2`{rxH 
z!dM@0c0E@_mBXgv7!;QO(%h8CX$z5fL&M&b&p9_UoqYA8O^;GyZf5S=IpCPLF}(Wm zGV zNZb*8_F#n5>-7pp{8YED@{AmS=FG-wy5E zA`2{~8}4@i7!I#e@V3j!!8UJjZ@@|hG1(Zp)_#9No|Z3x_Mf7zyA(`3l(>-pou5q0 znE5jMxjb+e_KMo{`QWEcmacmDEwUt>(@6EI4fqCC^PQy^(f;>|76SD9#A!IkyPz=m zWoy~4A#g?-$=5s@vlUP}>IBQNUlX}H@7@0G?ANjGO{u!-IiFK&UW^!6CeR_N$sH4Q z-q<&*=j?@?cSG~=GB)HzMQhu>%Y`EBQ(?4@XBuH(E46S9E>N;;6P~5Ll!pc=1h}9E zPIUe)7v_cL*+n|$>29 zyO(D$F~kY5i9kN68nA(Y~<3c|0nzEzQNxN^MyrJqbMOsV2(!PUki`SQHGmBp=yL>^F|ga z0$tNz3g2>nulDYNcDx1vo-nsU-ym02hf@G87R0)s@~Jfbjqr4H9nfYX;y~L!n;1YF zkd}x7#qbASp;u+xaSRt|panX*a8T^}&#h4R)m|tRsLKHHhh&jRC;j{{*m|KAu)+ai z?9W83eR_I`#GhtU;)$C~*)a-SCwOCL5AFsEl{fvF)<*e;Hw659Z$Ve6)zdAP<#Q-X z03nODMe~AkjwBG9p!2E>JIV*r8#ufOG$b&yJ;PY?V*~2XxHukxWtq=kUO&Px)+o$V zgRx|wNpZv^0Rw>!jry!_t<@G-1G5RXJ!1uKKlf$}&Yaeegi`Sm0^7+4;37`;01>_4 zESGOg0je|h=H)kq z4hqjaCb^Izu;@DR%BrTOJmNF8WKfL!Y`D~sOB8(zOpMC?Y~87Wv#cXV+>cf*5E z=1r4c^+22m<7SP;vp0x;9Qq{_ROj6K3G`4etGN`w5IZu=<|Dbskurk1ThpTFf->5N zJ$xQA)X8L)_Q&F*Hx)>;*SBcp18?{6YK?f|XxEN6VchIb^Ir1i8##i9{{t`7PGf8fTV6T&h0j*2RclC~cF>~R(L15)9@ zZ%Xd*_j4Ubk^qpe;OF;Q{Rv5qFqfK&Gx)YBx0O;&gTcgw6zyH3aru#IGyP-pgt0l_ z=Kmq1v0|}tx}TF8xB*aDm}#I@4I{CwXy0%QqTLQ4zR*Iqs#y1oq|fIz@5 zdolgsHgA7V$Sc)&q8X6a^CUnBSqphyly`}R#C$Kbjs!Nk0XVgrYNIZ-+_Kld+;Q~4 z^9rN-Hwm+q2Ha!pw1xFyiXHmpg{^UUA`OiMTMJu(3?pS4*CT4eS^D9pTRyO=ir5CNNhslNUb@1P36URt51%RaJZZ~0E=84Tzm>TzdTVn%u}6B4Q3cUt1@ zTf-Md{-8o1?~~{n8d(|UpZ$+!@W?r!Ao%YX^8{=}kYOe$n!@U@?HDmeiqFg>9^-_~ zUQP16X&-HG5Wh45Yj(IEwbpFtTeSCQH+?6~VLA52hD`oMFQ&hGb*ijKyj@{}YGBc5 zO|V9z`RCT1)9Q#5f-x1PHBl)EKPUVVb2psScwY}vB=IL<9zN(oOz6EoG>Q~dff7t; zX?N(W!mj+~ViDSraPaIB%cI_+&1>bw;4VomO#6uVl2Ku*ankf_j7z2|@2XQiubxoL zWPL53VwrXoI^#C>Q^3&q#Gs~-wrCMhFqMrVo3Qh!fTDo|eH|TO!=}SaoR?9C%`VUD7(2yc@O?T45##f;;V5obT51n5l_DVq= z%3l}zVMO#T8qcHhlZbYuxyc}BdO-B2a~CJJ{kHeWOXAZ!8OFq*_>77BylZBx#$ax` zh-suraEk7-%gDcbGUfRn?l~`K`#FcRmXeo|<>QV0nhKQx&$Z}MPz*p4r8~Yz^XnD_ zQQV5Ni)!}D?wj9;3Y)pyxo~CK^V-112!r$nsEB-M>URoJOA@LDwi?>_lGPwxFokgM zdSom=pDz?rij`hWQxmS7nJL(L<1MCc>@mS@rysSS0EY*=`1*EVG;0he0G$2WRc6BM 
zjgH#CQR*Twxs~e(xKh`YVdih5Vp0p$=tKETU78!hbiPZ3EFp4IMV@u8wwqGo8UXo` zkmy7fy*QxjQLPF}rUqQ3ZgJpRSYMvep#UeOToC3FDIC<`3zgTE4BhwJlvKwOfofgh zybtZKkgwZhrnwc!%`<-SwE|izV>+OaFXv} z?Uf7u8cAx@CVi*Ytvtf1j{BNSR@kYY;^5Sf5F_pbq0-U8HARcX_sb3AM{&!EBIzrqmqTVMM%apB$jX|6gvp5=;>NcWU23&4 zlO}^}pceB+*HL2`sFHdGq5nkZ=T+qZAvTlP3x^F!LJdISPoMF{E0tk9`{;@fp{8B8 z+Ox#Q<@&ngB6FMXX>18E+8z*ll@I}&j|&g^3LJDCFRwWbnu$JTu(|Jm+1to|V#NrQ z#Lk#AatAnKYLo_oFToq+4N(D02mqW9%;1{XGqA4(;#)SK$#}D8$4^>IwC?4@sPRPuMZyg zM!P*}^*tA-pKvoZ4DH*nvVPfa%U$0mZewuoI+owwD;>7}~!l zbE4(!BnHpSck_BAvA+tTJ$Zy&mbwwpD0DuGVJD`b5Wqt6hM zJ?QAyv_NxlXL)EnIPr*Snh04K+tkiVB8sFew?q6zG-))n#@%q)_is*n>FxVOLx+8r zvj9CbS0ehBZpHR&J~UtZ{y6cim@{`{TkgUUJ_N}Wn8 z0~*-D6=&b3kps9nM64v+HSRrQf{2K2G=wZ)O28C~fez)?50l5h_?q8iC1C%fWIxkl zmuT3tOfZG)f+TLc$qZG0H4(!4`tn^^`Qr_Lv2o0+-Yd@o3qU$0;u+fqK=K!|?EbDD zjtb+=(_9m7Eu_S%Y&i~)bjWhZFM(g0S($@(6APb5Mt}KXX}!|0_oZs-R^v<2>C#?*ge##e)YuaA_h(!PA z#2PC>UpGU`-^&cDtp#(ebL1iXt}%6Eo9oL8EY-tvx7H?jpu1jgP3JA9D!ZFiKJ&f1 zX6oV5?{;*7U&)ve6<@^wSCmo6PZ{X^3C5zB(uO>ii4sjDwm|p-Q5c<$L6LkP1ikFX zJ&VRG(E0hz#GMi&7aHh=B5T7^bl)ZpZ8}uGB(x)uD1t&^R$S~*^1GlN_lZgZZocN3 zubY?XCQ~k!BT~QpDk$%C&^0IjzJPD%Z4EEAg)JvEed7Dc1w-E7jB8p4SMEU6&(K^> z74uiK3E8oE0z;x?HMmNto)ao#Thw0eQFbdWBUAN^udXoYni6%h*s3a8zo+<^z5 zU0ERjr!vMZ07(I-;#>Z^SP#4V^{RZao_bOwOFvMkiGHR;*WW9`Ns>CKx+O-u{nD1U zln~LA<8*!JQd^x5Xu44Dm8++F|6rNr)7i}138WPR%DmJSijPzN%OTCoOkqD4MF1^R}Z3DPXou*Y$ekRFVJe&vC;W#n^k1HSyV3+mthM+M> z{`M#^LL1)mAdr7`w)1>66#^`;O#%%D??E-XIL+k`)&+RA@G9I)D6T5Bti^%r8=o`n z#6H|QdCRg&lu|mxMKg==Zk825q`2Fxg*~Uku~4py3`W;QREN^SHW}wK%8oQ$f&DP) z^C5sri*tC)v-(;6>)fkx=!!GN-7)teK29hG`7wFpoJ;LP>p7P6B1;S}6 zop)y|3Q%Z=1W{)f(^++a#6?`jmu=kcv)obZa(M4EC;g(ggF+c})|Dl%aeg+%mk z4b$%d^{qtT`h(ujh3tp}qLq`aRHO>~6MJeU*D7~NAMs{5iuOF)!5Ok?q%UL_^#~(^ zzG;-@A*&JVzySW>R4x?=9qmnZ=4ATq^ru>e({+ETFuZ|mJ$W1kWJiX())Khaqfzpg zvfQ4KY=1;3GVp(Uu%T`QA`AnY#W(1WBkdYT7!oj23lG(7ZD8}tuLQ&`gD zca$*Mp~I1Y2J)(>MUYIbx!&nk%rA`}V_+A+#g~XB5Y*~QlX-26&XH|jUXy---AIgY zMIx^6pL9sPz}iYQS%G9Ms~t!xR!gKEl^=%J89pgpviomjP&THiqYFg)FO^Od0B3IC 
zV+G<>PY~U%GF`rDgx`h8wQpgTvZfy-qibz@ATo0iA#-GxKd<>htP?1qd#w$yK1&oQE(Dt3{l$h-STM0;`^ogxe%;kGlrxbKMN z#N0sb`upmr&Pv6`pg28n&UG(TRLscSPYWs2Z>p4W<&k)buTF4-ZYw6_RKx&n=Z41r zz-drqS;?xw*mRGpL5Qk?eB|fc@(BiWR>)nr3nN07DYtu8w)YWu+|4nkUTOs zjw`WepqV&OtDj;$Nd63T+c|^nk-Lr-8S#JUQqzuZc{gM-$qf=+$&2|Y*A2Y;=W03O zX&G|3q?}y^I+Egu3tEvW4_d5P?XA%Bftzx7sS zoqAR_pQRc~g9n3pHQ-u%N6Y7)wtT~@oO_)mEa8LT5C8v7daekv2gM8|HYVHYV!R+u znj`<-9GzA+ej{7|3%L^%?yC1mYpC|VlNQUHGrKtLUt7ulw-00h#(bhJjv5y^37iFP$-z%d z<)GnyuC~Z*ob856jlx&X^JyH}uS-K_nS*~a{(gCTZOD?D>gte#EiT_L3apiO)c}kX#fh94&73;!QK3DLGl9A0ue{qdV{f zpr1kQJbw-j)UNvVR9WQEL71VcITW;h0PX~Tg;v?}unB7LdKoBAc7mEpzj>O!bWaVL zI`_eP`7b+M9wyZ^3MZ!~K6hp1%AopSQT))oYzf`M9(Mq24}>o6uW&IuPVd{dxDNsd za$V|Ks~@L>h_e8dhiU)*@3rgiiY^F(;_;(+(VX+mX)&Ck?E`ddN2cwYBx0{juMVMU zLmyIeVdg;bj!K^$T?!)VY8~D;xTagYPTF%CSrY~*GkgH!t>RZ#8`g4rPiBplV^>wr zmnGh%3fHf6p-t*yg=#qIA{DYET*kY4ZU3Z`Q!d6YGhb$7m7uY>=)`I@^;`eOcy=AK z)8_zGfnsjm+x8T&>7VuhFGY^jFt^>A~QOiuW;vtf8@ zqL(peTC^hntp?Gm8d?1k>FYo>JXr=_6~ zw-?($OOV7D6JMv2i!~ zT$k(!{|K@icU6k|&xZgPu}w^U0lumD47X8l4X^)Pn4$#bs2YydfB{KIa`wQoA)tad zMq24Xu0*Q_=rtWCBzQSgJg6M!QLORG&3&OTn2CguXyqQnwGYUPD3pxZ(GLRL4qDXg z)htX*XX^OxvmQRzP0TxohZL}@SC!gn1r(q*>0sn^J4+S0T=X29&RejFyLl9uvrS!C z@xzm9e*qJbC^Z=K!G)ety!ZKYGKgDo{kA+kvvC}8WEnoXwaHFfUMiesPcAIDwN|Y@ z(HNyt;t4>+ZK3Qj&klzM?h4RsDqU*vVv1!;1#d@&?WvSR;GV7x2|Gj9HDOjs#Gwya z2cdz7*rpT>%6-yG-e;crYrCf{T8W)QDM357nCD5rBQqT1?r{_WSKP-j0n+F)jmX0m^qU`vO_F)Yh$I*p8c1~Nc@rIJTrmtM zU~!eH1f(K>RKggg3;1m7CmOlB%`!-EYi!C|+vj}oD->_)i}~eHr5yWH?BtRcoc|>t zNb@#~Jb=r!10JlgXOqnmsmmZK`G!Zv9245aMvrxsVIG=IH$kzGTrXzCte*|X3{d0U zqs$Q`0sU=y`Y-SQrBEcHkpMoer2weWHqz<4(Pl7f`dm2U3+oc!;?G?9G+$-5QGAXo za05~K*Mgq_{I)b?T)>#(>FxCYy*FGPUi7+GvFHg*(7um($nq=-pew}GMXWS%bq+0C zytyuL*2A13wg2#CTD?J}()-<5EZVh;)%<|T7nv9cxt&9Q^d9)oG3d0J6{_%r`|bY& DFOY3- literal 0 HcmV?d00001 diff --git a/packages/lane_helpers/docs/index.rst b/packages/lane_helpers/docs/index.rst new file mode 100644 index 0000000..40095e5 --- /dev/null +++ b/packages/lane_helpers/docs/index.rst @@ -0,0 +1,11 
@@ +Lane Helpers +============ + +This is the documentation for the ``accvlab.lane_helpers`` package. + +.. toctree:: + :maxdepth: 1 + + introduction + api + example diff --git a/packages/lane_helpers/docs/introduction.rst b/packages/lane_helpers/docs/introduction.rst new file mode 100644 index 0000000..9bf3eac --- /dev/null +++ b/packages/lane_helpers/docs/introduction.rst @@ -0,0 +1,113 @@ +Introduction +============ + +Polyline Sampling +----------------- + +Functionality +^^^^^^^^^^^^^ + +The ``lane_helpers`` package provides utilities for lane-processing workloads. + +The main functionality is batched polyline interpolation. A polyline is a sequence of points in the +space :math:`\mathbb{R}^D`, written as :math:`\mathbf{p}_i`, where each pair of consecutive points defines +one line segment. + +Given sampling distances :math:`d_j` measured from the first point :math:`\mathbf{p}_0` along the +polyline, the sampling function :func:`~accvlab.lane_helpers.polyline.interpolate` returns the +corresponding sampled points :math:`\mathbf{q}_j`. + +.. figure:: images/polyline_sampling_illustration.png + :alt: Illustration of polyline sampling + :align: center + :scale: 45% + + Two-segment polyline sampled at two distances. The input points are shown as green circles, and the + sampled points are shown as red circles. + +Sampling distances do not need to be sorted. Distances can be provided either as absolute distances along +the polyline or as fractions of each polyline's total length. + +Point coordinates are not limited to 2D. The coordinate dimension is the last tensor dimension, and 2D, +3D, and higher-dimensional coordinates are supported. + +For batches with variable numbers of points or distances, use +:func:`~accvlab.lane_helpers.polyline.interpolate_var_size_batch` with +:class:`~accvlab.batching_helpers.RaggedBatch` inputs. 
+ +Functionality to compute the total length of each polyline is also provided (through +:func:`~accvlab.lane_helpers.polyline.lengths` and :func:`~accvlab.lane_helpers.polyline.lengths_var_size_batch`). + +Runtime Evaluation +^^^^^^^^^^^^^^^^^^ + +The runtime evaluation compares batched interpolation for both CPU and CUDA against a Shapely LineString +reference over a grid of point counts, numbers of sampled distances, and batch sizes. Runtime plots report +milliseconds per interpolation call, while speedup plots report the x-fold improvement over the Shapely +reference. + +.. seealso:: + + The evaluation script is available at ``packages/lane_helpers/evaluation/shapely_evaluation.py``. It can be + used to run the benchmark sweep for different problem sizes on your target system. + +Performance depends on the batch size for both CPU and CUDA execution. CUDA parallelism scales with the number +of polylines in the batch, so very small batch sizes may not fully utilize the GPU. + +For practical problem sizes, it is recommended to choose the implementation based primarily on where the +tensors already live: CPU inputs should generally stay on CPU, and CUDA inputs should generally stay on CUDA. +Moving tensors only to use a different implementation can dominate the interpolation cost. + +The plots below focus on batch sizes 1 and 64 as examples. The evaluation script runs for more batch sizes by +default, and other batch sizes can be easily added. + +.. note:: + + The following measurements are intended as directional guidance. Exact runtimes depend on the used system, + with performance primarily influenced by the CPU and GPU. + + The plots shown here were generated on a system with an ``NVIDIA RTX 5000 Ada Generation`` GPU and an + ``AMD Ryzen 9 7950X`` 16-Core Processor. + +.. note:: + + In the following runtime plots, markers highlight the smallest measured problem size, the largest measured + problem size, and the 100-point/100-distance cell. 
+ + In the speedup plots, markers highlight the smallest measured problem size and the largest speedup. If speedup is not + above 1x everywhere, they also mark representative cells near the first matching point-count and distance-count + configuration where speedup exceeds 1x. + +Batch size 1 shows behavior for the smallest batch configuration in the benchmark: + +.. figure:: _generated/polyline_runtime_evaluation/batch_1_runtime_comparison.png + :alt: Runtime comparison heatmaps for batch size 1 + :align: center + :width: 100% + + Runtime comparison for batch size 1. Rows vary the number of polyline points, and columns vary the number + of sampled distances. + +.. figure:: _generated/polyline_runtime_evaluation/batch_1_speedup_comparison.png + :alt: Speedup comparison heatmaps for batch size 1 + :align: center + :width: 100% + + Speedup comparison for batch size 1. + +For larger batch sizes, CUDA can expose more parallel work and its speedup over the other methods typically +becomes more pronounced. Batch size 64 shows this behavior: + +.. figure:: _generated/polyline_runtime_evaluation/batch_64_runtime_comparison.png + :alt: Runtime comparison heatmaps for batch size 64 + :align: center + :width: 100% + + Runtime comparison for batch size 64. + +.. figure:: _generated/polyline_runtime_evaluation/batch_64_speedup_comparison.png + :alt: Speedup comparison heatmaps for batch size 64 + :align: center + :width: 100% + + Speedup comparison for batch size 64. 
diff --git a/packages/lane_helpers/docu_referenced_dirs.txt b/packages/lane_helpers/docu_referenced_dirs.txt new file mode 100644 index 0000000..1e107f5 --- /dev/null +++ b/packages/lane_helpers/docu_referenced_dirs.txt @@ -0,0 +1 @@ +examples diff --git a/packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py b/packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py new file mode 100644 index 0000000..2cf0d9f --- /dev/null +++ b/packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py @@ -0,0 +1,157 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path + +import numpy as np + +_LARGE_SPEEDUP_THRESHOLD = 1_000.0 + + +# Helper function for formatting speedup values in result tables. +def _format_speedup_value(value: float) -> str: + if abs(value) >= _LARGE_SPEEDUP_THRESHOLD: + return f"{value:.2e}" + return f"{value:.2f}" + + +# Helper function for formatting one measured metric as a Markdown table. 
def _format_table(
    data: np.ndarray,
    nums_points: list[int],
    nums_distances: list[int],
    *,
    scientific: bool,
) -> str:
    """Render one metric grid as a Markdown table.

    Each entry of ``nums_points`` produces one table row and each entry of
    ``nums_distances`` one value column; ``data[i, j]`` supplies the value for
    row ``i`` and column ``j``. With ``scientific=True`` values are written in
    scientific notation with ``precision=3``; otherwise they are passed through
    ``_format_speedup_value``.
    """

    def render(value) -> str:
        if scientific:
            return np.format_float_scientific(value, precision=3)
        return _format_speedup_value(value)

    column_count = len(nums_distances)
    header = (
        "| # Points (down) / # Distances (right) | "
        + " | ".join(map(str, nums_distances))
        + " |"
    )
    # Label column is left-aligned, every value column is centered.
    separator = "| :----- |" + " :-----: |" * column_count

    lines = [header, separator]
    for row_idx, point_count in enumerate(nums_points):
        cells = [render(data[row_idx, col_idx]) for col_idx in range(column_count)]
        lines.append(f"| {point_count} | " + " | ".join(cells) + " |")
    return "\n".join(lines)


# Helper function for writing one Markdown table to disk.
def _write_markdown(
    data: np.ndarray,
    nums_points: list[int],
    nums_distances: list[int],
    *,
    filename: Path,
    scientific: bool,
) -> None:
    """Format ``data`` as a Markdown table and write it to ``filename`` (UTF-8, newline-terminated)."""
    rendered = _format_table(data, nums_points, nums_distances, scientific=scientific)
    filename.write_text(f"{rendered}\n", encoding="utf-8")


# Helper function for writing the Markdown table output for one metric.
def _write_metric_outputs(
    data: np.ndarray,
    nums_points: list[int],
    nums_distances: list[int],
    *,
    filename_stem: Path,
    scientific: bool,
) -> None:
    """Write the table for one metric to ``filename_stem`` with its suffix set to ``.md``."""
    markdown_path = filename_stem.with_suffix(".md")
    _write_markdown(
        data,
        nums_points,
        nums_distances,
        filename=markdown_path,
        scientific=scientific,
    )


# Entry point: write all Markdown tables for one evaluated batch size.
+def write_batch_results( + output_dir: Path, + batch_size: int, + nums_points: list[int], + nums_distances: list[int], + shapely_runtime_ms: np.ndarray | None, + cpu_runtime_ms: np.ndarray, + cuda_runtime_ms: np.ndarray, + skip_shapely: bool, + assert_results: bool, + max_abs_diff_cpu: np.ndarray | None, + max_abs_diff_cuda: np.ndarray | None, + max_abs_diff_cuda_vs_cpu: np.ndarray | None, +) -> None: + output_dir.mkdir(parents=True, exist_ok=True) + cuda_speedup_over_cpu = cpu_runtime_ms / cuda_runtime_ms + prefix = f"batch_{batch_size}" + + def write_metric( + metric_name: str, + data: np.ndarray, + *, + scientific: bool, + ) -> None: + _write_metric_outputs( + data, + nums_points, + nums_distances, + filename_stem=output_dir / f"{prefix}_{metric_name}", + scientific=scientific, + ) + + if not skip_shapely: + cuda_speedup_over_shapely = shapely_runtime_ms / cuda_runtime_ms + cpu_speedup_over_shapely = shapely_runtime_ms / cpu_runtime_ms + write_metric( + "runtime_shapely", + shapely_runtime_ms, + scientific=True, + ) + # CPU and CUDA outputs are always available; Shapely-related metrics are optional. 
+ write_metric("runtime_cuda", cuda_runtime_ms, scientific=True) + write_metric( + "runtime_cpu", + cpu_runtime_ms, + scientific=True, + ) + if not skip_shapely: + write_metric( + "speedup_cuda_vs_shapely", + cuda_speedup_over_shapely, + scientific=False, + ) + write_metric( + "speedup_cpu_vs_shapely", + cpu_speedup_over_shapely, + scientific=False, + ) + write_metric( + "speedup_cuda_vs_cpu", + cuda_speedup_over_cpu, + scientific=False, + ) + if assert_results: + write_metric( + "max_abs_diff_cuda_vs_cpu", + max_abs_diff_cuda_vs_cpu, + scientific=True, + ) + if assert_results and not skip_shapely: + write_metric("max_abs_diff", max_abs_diff_cuda, scientific=True) + write_metric("max_abs_diff_cpu", max_abs_diff_cpu, scientific=True) diff --git a/packages/lane_helpers/evaluation/plot_shapely_evaluation.py b/packages/lane_helpers/evaluation/plot_shapely_evaluation.py new file mode 100644 index 0000000..34acf06 --- /dev/null +++ b/packages/lane_helpers/evaluation/plot_shapely_evaluation.py @@ -0,0 +1,595 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import argparse
from dataclasses import dataclass
from pathlib import Path

import matplotlib

# Select the "Agg" backend before pyplot is imported below; keep this ordering.
matplotlib.use("Agg")

import matplotlib.colors as colors
from matplotlib.axes import Axes
from matplotlib import pyplot as plt
import numpy as np

DEFAULT_ANNOTATE_PLOTS = True
# Speedups whose magnitude reaches this value are printed in scientific notation
# (see `_format_speedup_value`).
_LARGE_SPEEDUP_THRESHOLD = 1_000.0
# Figure layout and typography constants; presumably consumed by the plotting
# helpers further down in this file (not visible from here).
_PLOT_FIGSIZE = (6.4, 5.2)
_COMPARISON_SUBPLOT_WIDTH = 6.6
_PLOT_SUPTITLE_FONT_SIZE = 22
_PLOT_TITLE_FONT_SIZE = 18
_PLOT_AXIS_LABEL_FONT_SIZE = 16
_PLOT_TICK_LABEL_FONT_SIZE = 14
_PLOT_COLORBAR_TICK_LABEL_FONT_SIZE = 14
_PLOT_ANNOTATION_FONT_SIZE = 16
_PLOT_ANNOTATION_MARKER_SIZE = 52
_PLOT_ANNOTATION_X_OFFSET = 0.25
_PLOT_COLORBAR_FRACTION = 0.046
_PLOT_COLORBAR_PAD = 0.02


@dataclass(frozen=True)
class _MetricPlotConfig:
    """Immutable per-metric plot settings."""

    # Human-readable title for the metric.
    title: str
    # Annotation mode; the table below uses "runtime", "speedup", or None.
    # NOTE(review): how each mode is interpreted is decided by code outside this block.
    annotation: str | None = None


# Plot settings keyed by metric name. The keys match the metric names used as
# filename suffixes by the Markdown writer (e.g. "runtime_cpu").
_METRIC_PLOT_CONFIGS = {
    "runtime_shapely": _MetricPlotConfig("Shapely", annotation="runtime"),
    "runtime_cuda": _MetricPlotConfig("CUDA", annotation="runtime"),
    "runtime_cpu": _MetricPlotConfig("CPU", annotation="runtime"),
    "speedup_cuda_vs_shapely": _MetricPlotConfig("CUDA vs. Shapely", annotation="speedup"),
    "speedup_cpu_vs_shapely": _MetricPlotConfig("CPU vs. Shapely", annotation="speedup"),
    "speedup_cuda_vs_cpu": _MetricPlotConfig("CUDA vs. CPU", annotation="speedup"),
    "max_abs_diff_cuda_vs_cpu": _MetricPlotConfig("CUDA max abs. difference to CPU"),
    "max_abs_diff": _MetricPlotConfig("CUDA max abs. difference to Shapely"),
    "max_abs_diff_cpu": _MetricPlotConfig("CPU max abs. difference to Shapely"),
}
# Metric names whose values involve the Shapely reference measurement.
_SHAPELY_DEPENDENT_METRICS = frozenset(
    {
        "runtime_shapely",
        "speedup_cuda_vs_shapely",
        "speedup_cpu_vs_shapely",
        "max_abs_diff",
        "max_abs_diff_cpu",
    }
)
# Ordered metric groups for the runtime and speedup views, with and without
# the Shapely-dependent entries.
_RUNTIME_METRICS_WITH_SHAPELY = ("runtime_shapely", "runtime_cpu", "runtime_cuda")
_RUNTIME_METRICS_WITHOUT_SHAPELY = ("runtime_cpu", "runtime_cuda")
_SPEEDUP_METRICS_WITH_SHAPELY = (
    "speedup_cpu_vs_shapely",
    "speedup_cuda_vs_shapely",
    "speedup_cuda_vs_cpu",
)
_SPEEDUP_METRICS_WITHOUT_SHAPELY = ("speedup_cuda_vs_cpu",)


# Helper function for formatting speedup values in tables and annotations.
def _format_speedup_value(value: float) -> str:
    """Format ``value`` with two decimals, switching to scientific notation for large magnitudes."""
    if abs(value) >= _LARGE_SPEEDUP_THRESHOLD:
        return f"{value:.2e}"
    return f"{value:.2f}"


# Helper function for splitting one Markdown table row into stripped cells.
def _split_markdown_table_row(row: str) -> list[str]:
    """Split a ``| a | b |`` style row into whitespace-stripped cell strings.

    Raises:
        ValueError: If the stripped row does not both start and end with ``|``.
    """
    row = row.strip()
    if not row.startswith("|") or not row.endswith("|"):
        raise ValueError(f"Expected Markdown table row, got: {row}")
    # NOTE(review): strip("|") removes *every* leading/trailing pipe, so an empty
    # edge cell written without padding (e.g. "||1|") would be dropped. Rows whose
    # cells are space-padded are unaffected.
    cells = [cell.strip() for cell in row.strip("|").split("|")]
    return cells


# Helper function for loading one metric table written by `_write_markdown`.
def _read_metric_table(filename: Path) -> tuple[list[int], list[int], np.ndarray]:
    """Parse a Markdown metric table into its axes and values.

    Only lines that start with ``|`` (after stripping) are considered. The first
    such line is the header, the second is skipped as the separator, and every
    remaining line is a data row.

    Returns:
        ``(nums_points, nums_distances, data)`` where ``nums_points`` are the row
        labels, ``nums_distances`` the header labels after the first cell, and
        ``data`` a ``float64`` array of the remaining cells.

    Raises:
        ValueError: If fewer than three table lines are found, the first header
            cell does not start with ``# Points``, a data row has the wrong number
            of cells, or a cell cannot be converted to a number.
    """
    stripped_lines = (line.strip() for line in filename.read_text(encoding="utf-8").splitlines())
    table_rows = [line for line in stripped_lines if line.startswith("|")]
    if len(table_rows) < 3:
        raise ValueError(f"Expected a Markdown header, separator, and at least one data row in {filename}")

    header_row, _separator_row, *body_rows = table_rows

    header_cells = _split_markdown_table_row(header_row)
    if not (header_cells and header_cells[0].startswith("# Points")):
        raise ValueError(f"Expected first Markdown header cell to describe point counts in {filename}")
    nums_distances = [int(cell) for cell in header_cells[1:]]
    expected_cell_count = len(nums_distances) + 1

    nums_points: list[int] = []
    values: list[list[float]] = []
    for body_row in body_rows:
        row_cells = _split_markdown_table_row(body_row)
        if len(row_cells) != expected_cell_count:
            raise ValueError(f"Expected {expected_cell_count} cells in {filename}, got {len(row_cells)}")
        point_label, *value_cells = row_cells
        nums_points.append(int(point_label))
        values.append([float(cell) for cell in value_cells])

    return nums_points, nums_distances, np.asarray(values, dtype=np.float64)


# Helper function for choosing which speedup heatmap cells should show numeric labels.
+def _selected_speedup_annotation_cells( + data: np.ndarray, + nums_points: list[int], + nums_distances: list[int], +) -> list[tuple[int, int]]: + def find_value_index(values: list[int], value: int) -> int | None: + try: + index = values.index(value) + except ValueError: + return None + return index + + def add_unique_cell(cells: list[tuple[int, int]], cell: tuple[int, int]) -> None: + if cell not in cells: + cells.append(cell) + + def find_first_faster_distance_idx(points_idx: int) -> int | None: + for distances_idx in range(len(nums_distances)): + if np.isfinite(data[points_idx, distances_idx]) and data[points_idx, distances_idx] >= 1.0: + return distances_idx + return None + + def find_first_faster_points_idx(distances_idx: int) -> int | None: + for points_idx in range(len(nums_points)): + if np.isfinite(data[points_idx, distances_idx]) and data[points_idx, distances_idx] >= 1.0: + return points_idx + return None + + cells: list[tuple[int, int]] = [] + + points_idx = find_value_index(nums_points, 2) + distances_idx = find_value_index(nums_distances, 1) + if points_idx is not None and distances_idx is not None: + add_unique_cell(cells, (points_idx, distances_idx)) + + finite_mask = np.isfinite(data) + if np.any(finite_mask): + finite_data = np.where(finite_mask, data, -np.inf) + points_idx, distances_idx = np.unravel_index(np.argmax(finite_data), data.shape) + add_unique_cell(cells, (int(points_idx), int(distances_idx))) + + finite_values = data[finite_mask] + if finite_values.size > 0 and not np.all(finite_values > 1.0): + distances_idx_by_value = {value: idx for idx, value in enumerate(nums_distances)} + for points_idx, num_points_current in enumerate(nums_points): + distances_idx = distances_idx_by_value.get(num_points_current) + if distances_idx is None: + continue + if np.isfinite(data[points_idx, distances_idx]) and data[points_idx, distances_idx] > 1.0: + add_unique_cell(cells, (points_idx, distances_idx)) + if points_idx > 0: + faster_distances_idx = 
find_first_faster_distance_idx(points_idx - 1) + if faster_distances_idx is not None: + add_unique_cell(cells, (points_idx - 1, faster_distances_idx)) + if distances_idx > 0: + faster_points_idx = find_first_faster_points_idx(distances_idx - 1) + if faster_points_idx is not None: + add_unique_cell(cells, (faster_points_idx, distances_idx - 1)) + break + + return cells + + +# Helper function for choosing which runtime heatmap cells should get marker labels. +def _selected_runtime_annotation_cells( + nums_points: list[int], + nums_distances: list[int], +) -> list[tuple[int, int]]: + def find_value_index(values: list[int], value: int) -> int | None: + try: + index = values.index(value) + except ValueError: + return None + return index + + def add_unique_cell(cells: list[tuple[int, int]], cell: tuple[int, int]) -> None: + if cell not in cells: + cells.append(cell) + + cells: list[tuple[int, int]] = [] + if nums_points and nums_distances: + add_unique_cell(cells, (0, 0)) + add_unique_cell(cells, (len(nums_points) - 1, len(nums_distances) - 1)) + + points_idx = find_value_index(nums_points, 100) + distances_idx = find_value_index(nums_distances, 100) + if points_idx is not None and distances_idx is not None: + add_unique_cell(cells, (points_idx, distances_idx)) + + return cells + + +# Helper function for placing numeric labels on selected speedup heatmap cells. 
def _speedup_annotation_text_position(
    points_idx: int,
    distances_idx: int,
    nums_points: list[int],
    nums_distances: list[int],
    selected_cells: list[tuple[int, int]],
    data: np.ndarray,
    max_speedup_cell: tuple[int, int] | None,
) -> tuple[float, str]:
    """Return the x position and horizontal alignment for one speedup label.

    The label goes to the left of the marker (alignment ``"right"``) when the
    cell is the maximum-speedup cell and not in the first column, when it is in
    the last column, or when it is not in the first column, its distance count is
    smaller than its point count, and a directly adjacent selected cell holds a
    finite value ``>= 1.0``. Otherwise it goes to the right (alignment
    ``"left"``).

    Args:
        points_idx: Row index of the cell being labelled.
        distances_idx: Column index of the cell being labelled.
        nums_points: Point counts labelling the rows.
        nums_distances: Distance counts labelling the columns.
        selected_cells: All cells that receive an annotation.
        data: 2-D array indexed as ``data[points_idx, distances_idx]``.
        max_speedup_cell: Cell holding the maximum finite value, or ``None``.

    Returns:
        ``(text_x, horizontal_alignment)`` for the label.
    """
    left_of_marker = (distances_idx - _PLOT_ANNOTATION_X_OFFSET, "right")
    right_of_marker = (distances_idx + _PLOT_ANNOTATION_X_OFFSET, "left")

    current_cell = (points_idx, distances_idx)
    if distances_idx > 0 and max_speedup_cell == current_cell:
        return left_of_marker

    def is_adjacent_above_one(other_cell: tuple[int, int]) -> bool:
        other_points_idx, other_distances_idx = other_cell
        # A Manhattan distance of exactly 1 also excludes the current cell itself.
        cell_distance = abs(other_points_idx - points_idx) + abs(other_distances_idx - distances_idx)
        if cell_distance != 1:
            return False
        other_value = data[other_points_idx, other_distances_idx]
        return bool(np.isfinite(other_value) and other_value >= 1.0)

    is_left_of_value_diagonal = nums_distances[distances_idx] < nums_points[points_idx]
    has_adjacent_above_one_annotation = any(is_adjacent_above_one(cell) for cell in selected_cells)

    if distances_idx == len(nums_distances) - 1:
        return left_of_marker
    if distances_idx > 0 and is_left_of_value_diagonal and has_adjacent_above_one_annotation:
        return left_of_marker
    return right_of_marker


# Helper function for drawing optional numeric labels on selected speedup heatmap cells.
def _annotate_speedup_heatmap(
    ax: Axes,
    data: np.ndarray,
    nums_points: list[int],
    nums_distances: list[int],
) -> None:
    """Draw a marker and a formatted value label on each selected speedup cell.

    Cells come from ``_selected_speedup_annotation_cells``; cells whose value is
    not finite are skipped.

    Args:
        ax: Axes that already shows the heatmap.
        data: 2-D array indexed as ``data[points_idx, distances_idx]``.
        nums_points: Point counts labelling the rows.
        nums_distances: Distance counts labelling the columns.
    """
    selected_cells = _selected_speedup_annotation_cells(data, nums_points, nums_distances)

    # Locate the maximum finite value; label placement treats that cell specially.
    max_speedup_cell = None
    finite_mask = np.isfinite(data)
    if np.any(finite_mask):
        masked_data = np.where(finite_mask, data, -np.inf)
        max_row, max_col = np.unravel_index(np.argmax(masked_data), data.shape)
        max_speedup_cell = (int(max_row), int(max_col))

    for points_idx, distances_idx in selected_cells:
        value = data[points_idx, distances_idx]
        if not np.isfinite(value):
            continue

        # x is the column (distances) index and y is the row (points) index.
        ax.scatter(
            [distances_idx],
            [points_idx],
            marker="o",
            s=_PLOT_ANNOTATION_MARKER_SIZE,
            c="black",
            edgecolors="white",
            linewidths=0.8,
            zorder=3,
        )

        text_x, horizontal_alignment = _speedup_annotation_text_position(
            points_idx,
            distances_idx,
            nums_points,
            nums_distances,
            selected_cells,
            data,
            max_speedup_cell,
        )
        label_box = {"boxstyle": "round,pad=0.12", "facecolor": "white", "edgecolor": "none", "alpha": 0.75}
        ax.text(
            text_x,
            points_idx,
            _format_speedup_value(value),
            ha=horizontal_alignment,
            va="center",
            fontsize=_PLOT_ANNOTATION_FONT_SIZE,
            color="black",
            bbox=label_box,
            zorder=4,
        )


# Helper function for drawing fixed reference markers on runtime heatmap cells.
def _annotate_runtime_heatmap(
    ax: Axes,
    data: np.ndarray,
    nums_points: list[int],
    nums_distances: list[int],
) -> None:
    """Draw a marker and a ``.1e``-formatted value label on each selected runtime cell.

    Cells come from ``_selected_runtime_annotation_cells``; cells whose value is
    not finite are skipped.

    Args:
        ax: Axes that already shows the heatmap.
        data: 2-D array indexed as ``data[points_idx, distances_idx]``.
        nums_points: Point counts labelling the rows.
        nums_distances: Distance counts labelling the columns.
    """
    last_distances_idx = len(nums_distances) - 1
    annotation_cells = _selected_runtime_annotation_cells(nums_points, nums_distances)

    for points_idx, distances_idx in annotation_cells:
        value = data[points_idx, distances_idx]
        if not np.isfinite(value):
            continue

        ax.scatter(
            [distances_idx],
            [points_idx],
            marker="o",
            s=_PLOT_ANNOTATION_MARKER_SIZE,
            c="black",
            edgecolors="white",
            linewidths=0.8,
            zorder=3,
        )

        # Labels in the last column go to the left of the marker, all others to the right.
        # NOTE(review): the 0.15 offset is hard-coded here while the speedup labels use
        # _PLOT_ANNOTATION_X_OFFSET - confirm whether the two are meant to differ.
        in_last_column = distances_idx == last_distances_idx
        text_x = distances_idx - 0.15 if in_last_column else distances_idx + 0.15
        horizontal_alignment = "right" if in_last_column else "left"

        label_box = {"boxstyle": "round,pad=0.12", "facecolor": "white", "edgecolor": "none", "alpha": 0.75}
        ax.text(
            text_x,
            points_idx,
            f"{value:.1e}",
            ha=horizontal_alignment,
            va="center",
            fontsize=_PLOT_ANNOTATION_FONT_SIZE,
            color="black",
            bbox=label_box,
            zorder=4,
        )


# Helper function for drawing one heatmap into an existing subplot.
def _draw_heatmap(
    ax: Axes,
    data: np.ndarray,
    nums_points: list[int],
    nums_distances: list[int],
    *,
    title: str,
    log_scale: bool,
    annotate_speedup: bool = False,
    annotate_runtime: bool = False,
) -> None:
    """Draw one metric heatmap, its colorbar, and optional annotations into ``ax``.

    Rows of ``data`` are labelled with ``nums_points`` (y axis) and columns with
    ``nums_distances`` (x axis).

    Args:
        ax: Target axes.
        data: 2-D array of metric values.
        nums_points: Tick labels for the rows.
        nums_distances: Tick labels for the columns.
        title: Subplot title.
        log_scale: If true, use a logarithmic colour scale spanning the finite,
            strictly positive values of ``data``. Falls back to the default
            linear scale when no such value exists.
        annotate_speedup: If true, add the speedup annotations.
        annotate_runtime: If true, add the runtime annotations.
    """
    norm = None
    if log_scale:
        # Restrict the colour limits to finite positive values: a stray +inf would
        # otherwise become vmax and make the logarithmic normalization unusable.
        positive_values = data[np.isfinite(data) & (data > 0)]
        if positive_values.size > 0:
            norm = colors.LogNorm(vmin=positive_values.min(), vmax=positive_values.max())

    image = ax.imshow(data, norm=norm)
    ax.set_yticks(list(range(len(nums_points))), labels=nums_points, fontsize=_PLOT_TICK_LABEL_FONT_SIZE)
    ax.set_ylabel("Number of points", fontsize=_PLOT_AXIS_LABEL_FONT_SIZE)
    ax.set_xticks(
        list(range(len(nums_distances))),
        labels=nums_distances,
        rotation=45,
        fontsize=_PLOT_TICK_LABEL_FONT_SIZE,
    )
    ax.set_xlabel("Number of distances", fontsize=_PLOT_AXIS_LABEL_FONT_SIZE)
    ax.set_title(title, fontsize=_PLOT_TITLE_FONT_SIZE, pad=12)
    colorbar = ax.figure.colorbar(
        image,
        ax=ax,
        fraction=_PLOT_COLORBAR_FRACTION,
        pad=_PLOT_COLORBAR_PAD,
    )
    colorbar.ax.tick_params(labelsize=_PLOT_COLORBAR_TICK_LABEL_FONT_SIZE)
    colorbar.ax.yaxis.offsetText.set_fontsize(_PLOT_COLORBAR_TICK_LABEL_FONT_SIZE)
    if annotate_speedup:
        _annotate_speedup_heatmap(ax, data, nums_points, nums_distances)
    if annotate_runtime:
        _annotate_runtime_heatmap(ax, data, nums_points, nums_distances)


# Helper function for writing a multi-subplot comparison plot for one metric group.
def _plot_metric_comparison(
    metric_names: tuple[str, ...],
    metric_data: dict[str, np.ndarray],
    nums_points: list[int],
    nums_distances: list[int],
    *,
    batch_size: int,
    figure_title: str,
    filename: Path,
    annotate_plots: bool,
) -> None:
    """Write one figure with a heatmap subplot per available metric.

    Metrics from ``metric_names`` that are missing in ``metric_data`` are
    skipped. If none is available, the function returns without writing a file.

    Args:
        metric_names: Metrics to plot, in subplot order.
        metric_data: Mapping from metric name to its 2-D value array.
        nums_points: Row labels shared by all heatmaps.
        nums_distances: Column labels shared by all heatmaps.
        batch_size: Batch size shown in the figure title.
        figure_title: Leading part of the figure title.
        filename: Output image path.
        annotate_plots: Whether to add the per-metric annotations configured in
            ``_METRIC_PLOT_CONFIGS``.
    """
    available_metric_names = tuple(metric_name for metric_name in metric_names if metric_name in metric_data)
    if not available_metric_names:
        return

    num_subplots = len(available_metric_names)
    subplot_width = _PLOT_FIGSIZE[0] if num_subplots == 1 else _COMPARISON_SUBPLOT_WIDTH
    fig, axes = plt.subplots(
        1,
        num_subplots,
        figsize=(subplot_width * num_subplots, _PLOT_FIGSIZE[1]),
        constrained_layout=True,
    )
    # Always close the figure, even if drawing or saving fails, so that a failed
    # plot does not leave an open figure behind in pyplot's global registry.
    try:
        fig.suptitle(f"{figure_title} (Batch Size {batch_size})", fontsize=_PLOT_SUPTITLE_FONT_SIZE)
        # plt.subplots returns a bare Axes for a single subplot; normalize to a list.
        axes = np.atleast_1d(axes).tolist()

        for ax, metric_name in zip(axes, available_metric_names):
            metric_config = _METRIC_PLOT_CONFIGS[metric_name]
            _draw_heatmap(
                ax,
                metric_data[metric_name],
                nums_points,
                nums_distances,
                title=metric_config.title,
                log_scale=True,
                annotate_speedup=metric_config.annotation == "speedup" and annotate_plots,
                annotate_runtime=metric_config.annotation == "runtime" and annotate_plots,
            )
        fig.savefig(filename)
    finally:
        plt.close(fig)


# Helper function for writing comparison plots whose subplot counts depend on Shapely availability.
def _write_comparison_outputs(
    output_dir: Path,
    batch_size: int,
    nums_points: list[int],
    nums_distances: list[int],
    metric_data: dict[str, np.ndarray],
    *,
    has_shapely_results: bool,
    annotate_plots: bool,
) -> list[Path]:
    """Write the runtime and speedup comparison figures for one batch size.

    Args:
        output_dir: Directory that receives the PNG files.
        batch_size: Batch size used in file names and figure titles.
        nums_points: Row labels of the heatmaps.
        nums_distances: Column labels of the heatmaps.
        metric_data: Mapping from metric name to its 2-D value array.
        has_shapely_results: Selects the metric groups that include the
            Shapely-based metrics.
        annotate_plots: Whether heatmaps are annotated.

    Returns:
        Paths of the figures that were actually written. A figure is skipped, and
        its path omitted, when none of its metrics is present in ``metric_data``.
    """
    runtime_metric_names = (
        _RUNTIME_METRICS_WITH_SHAPELY if has_shapely_results else _RUNTIME_METRICS_WITHOUT_SHAPELY
    )
    speedup_metric_names = (
        _SPEEDUP_METRICS_WITH_SHAPELY if has_shapely_results else _SPEEDUP_METRICS_WITHOUT_SHAPELY
    )
    prefix = f"batch_{batch_size}"
    comparison_plots = (
        (runtime_metric_names, "Runtime [ms]", output_dir / f"{prefix}_runtime_comparison.png"),
        (speedup_metric_names, "Speedup [x-fold]", output_dir / f"{prefix}_speedup_comparison.png"),
    )

    written_files: list[Path] = []
    for metric_names, figure_title, filename in comparison_plots:
        # The plotting helper writes nothing when no metric of the group is
        # available; do not report a path for a file that does not exist.
        if not any(metric_name in metric_data for metric_name in metric_names):
            continue
        _plot_metric_comparison(
            metric_names,
            metric_data,
            nums_points,
            nums_distances,
            batch_size=batch_size,
            figure_title=figure_title,
            filename=filename,
            annotate_plots=annotate_plots,
        )
        written_files.append(filename)
    return written_files


# Helper function for parsing comma-separated integer lists.
+def _parse_int_list(value: str) -> list[int]: + parsed_values = [int(item) for item in value.split(",") if item] + return parsed_values + + +def plot_batch_results_from_markdown( + input_dir: Path, + output_dir: Path, + batch_size: int, + annotate_plots: bool, +) -> list[Path]: + prefix = f"batch_{batch_size}_" + markdown_files = sorted(input_dir.glob(f"{prefix}*.md")) + if not markdown_files: + raise FileNotFoundError(f"No Markdown result tables found for batch={batch_size} in {input_dir}") + available_metric_names = {markdown_file.stem[len(prefix) :] for markdown_file in markdown_files} + has_shapely_results = "runtime_shapely" in available_metric_names + + metric_data: dict[str, np.ndarray] = {} + comparison_nums_points: list[int] | None = None + comparison_nums_distances: list[int] | None = None + for markdown_file in markdown_files: + metric_name = markdown_file.stem[len(prefix) :] + if metric_name not in _METRIC_PLOT_CONFIGS: + continue + if metric_name in _SHAPELY_DEPENDENT_METRICS and not has_shapely_results: + continue + + nums_points, nums_distances, data = _read_metric_table(markdown_file) + metric_data[metric_name] = data + comparison_nums_points = nums_points + comparison_nums_distances = nums_distances + + if comparison_nums_points is not None and comparison_nums_distances is not None: + output_dir.mkdir(parents=True, exist_ok=True) + plotted_files = _write_comparison_outputs( + output_dir, + batch_size, + comparison_nums_points, + comparison_nums_distances, + metric_data, + has_shapely_results=has_shapely_results, + annotate_plots=annotate_plots, + ) + else: + plotted_files = [] + + if not plotted_files: + raise FileNotFoundError( + f"No known Markdown result tables found for batch={batch_size} in {input_dir}" + ) + return plotted_files + + +def plot_from_markdown_directory( + *, + input_dir: Path, + output_dir: Path, + batch_sizes: list[int], + annotate_plots: bool = DEFAULT_ANNOTATE_PLOTS, +) -> list[Path]: + if not input_dir.exists(): + raise 
FileNotFoundError(f"Markdown input directory does not exist: {input_dir}") + + plotted_files: list[Path] = [] + for batch_size in batch_sizes: + batch_plotted_files = plot_batch_results_from_markdown( + input_dir=input_dir, + output_dir=output_dir, + batch_size=batch_size, + annotate_plots=annotate_plots, + ) + plotted_files.extend(batch_plotted_files) + return plotted_files + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate polyline runtime plot images from Markdown result tables.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--input-dir", + type=Path, + required=True, + help="Directory containing Markdown result tables.", + ) + parser.add_argument( + "--output-dir", + type=Path, + required=True, + help="Directory where plot images should be written.", + ) + parser.add_argument( + "--batch-sizes", + default="1,4,16,64", + help="Comma-separated batch sizes to plot.", + ) + no_annotate_plots_action = parser.add_argument( + "--no-annotate-plots", + dest="annotate_plots", + action="store_false", + help="Disable annotations in generated heatmaps.", + ) + parser.set_defaults(annotate_plots=DEFAULT_ANNOTATE_PLOTS) + no_annotate_plots_action.default = argparse.SUPPRESS + return parser.parse_args() + + +def main() -> None: + args = _parse_args() + batch_sizes = _parse_int_list(args.batch_sizes) + plotted_files = plot_from_markdown_directory( + input_dir=args.input_dir, + output_dir=args.output_dir, + batch_sizes=batch_sizes, + annotate_plots=args.annotate_plots, + ) + for plotted_file in plotted_files: + print(f"Generated plot: {plotted_file}") + + +if __name__ == "__main__": + main() diff --git a/packages/lane_helpers/evaluation/shapely_evaluation.py b/packages/lane_helpers/evaluation/shapely_evaluation.py new file mode 100644 index 0000000..1048191 --- /dev/null +++ b/packages/lane_helpers/evaluation/shapely_evaluation.py @@ -0,0 +1,681 @@ +# Copyright (c) 2026, NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from collections.abc import Callable, Iterator +import gc +import sys +import time +from pathlib import Path +from types import ModuleType + +import numpy as np +from shapely import get_coordinates, line_interpolate_point, linestrings +import torch + +# Import helpers for outputting results and plots +SCRIPT_DIR = Path(__file__).resolve().parent +if str(SCRIPT_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPT_DIR)) +import plot_shapely_evaluation +import _shapely_evaluation_outputs as shapely_evaluation_outputs + +# ==================== Default configuration for the evaluation ==================== + +# These constants are convenient local configuration knobs. However, these configurations can also +# be done with CLI arguments. +# When changing these constants, check the CLI arguments further below in the script, +# because some flags only override the default in one direction. + +# Sweep values for the heatmap axes and the batch-size examples. +DEFAULT_NUMS_POINTS = [2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000] +DEFAULT_NUMS_DISTANCES = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000] +DEFAULT_BATCH_SIZES = [1, 4, 16, 64] +# Keep the measured work roughly constant across batch sizes. +DEFAULT_NUM_POLYLINES_PER_MEASUREMENT = 64 * 10 +# Warm up a representative mid-sized configuration before timing the sweep. 
+DEFAULT_NUM_WARMUP_RUNS = 3 +DEFAULT_WARMUP_NUM_POINTS = 100 +DEFAULT_WARMUP_NUM_DISTANCES = 100 +# Shapely can be skipped for faster CPU/CUDA-only benchmark sweeps. +DEFAULT_SKIP_SHAPELY = False +# Result checks are optional because they add Shapely reference work to each configuration. +DEFAULT_ASSERT_RESULTS = False +DEFAULT_ASSERT_ATOL = 1e-3 +DEFAULT_ASSERT_RTOL = 0.0 +# Plot annotations call out representative cells in generated heatmaps. +DEFAULT_ANNOTATE_PLOTS = True +# ================== End: Default configuration for the evaluation ================= + + +# ================== Constants for the evaluation ================== +DEVICE = "cuda" +DTYPE_NP = np.float32 +DTYPE_TORCH = torch.float32 +_POLYLINE_MODULE: ModuleType | None = None +# ================ End: Constants for the evaluation =============== + + +# Helper function for lazily importing the compiled polyline module outside plotting-only mode. +def _get_polyline_module() -> ModuleType: + global _POLYLINE_MODULE + if _POLYLINE_MODULE is None: + from accvlab.lane_helpers import polyline as polyline_module + + _POLYLINE_MODULE = polyline_module + return _POLYLINE_MODULE + + +# Helper function for config parsing +def _parse_int_list(value: str) -> list[int]: + parsed_values = [int(item) for item in value.split(",") if item] + return parsed_values + + +# Helper function for computing the batched Shapely reference. +def _compute_batched_shapely_reference(points: np.ndarray, distances: np.ndarray) -> np.ndarray: + line_strings = linestrings(points) + interpolated_points = line_interpolate_point(line_strings[:, None], distances) + batched_reference = ( + get_coordinates(interpolated_points) + .reshape( + points.shape[0], + distances.shape[1], + points.shape[2], + ) + .astype(DTYPE_NP) + ) + return batched_reference + + +# Helper function for computing per-polyline lengths in NumPy. 
+def _compute_polyline_lengths_np(points: np.ndarray) -> np.ndarray: + if points.shape[1] <= 1: + lengths = np.zeros((points.shape[0],), dtype=DTYPE_NP) + return lengths + lengths = np.linalg.norm(points[:, 1:] - points[:, :-1], axis=2).sum(axis=1).astype(DTYPE_NP) + return lengths + + +# Helper function for comparing CPU and CUDA outputs against Shapely when requested. +def _assert_matches_shapely( + shapely_result: np.ndarray, + result: np.ndarray, + *, + implementation_name: str, + batch_size: int, + num_points: int, + num_distances: int, + atol: float, + rtol: float, +) -> None: + try: + np.testing.assert_allclose(result, shapely_result, atol=atol, rtol=rtol) + except AssertionError as exc: + max_abs_diff = np.abs(shapely_result - result).max() + raise AssertionError( + f"{implementation_name} result differs from Shapely for " + f"batch={batch_size}, points={num_points}, distances={num_distances}; " + f"max_abs_diff={max_abs_diff}, atol={atol}, rtol={rtol}" + ) from exc + + +# Helper function for constructing one deterministic benchmark input configuration. +def _make_evaluation_case( + batch_size: int, + num_points: int, + num_distances: int, + *, + seed: int, +) -> tuple[np.ndarray, np.ndarray]: + generator = np.random.default_rng(seed=seed) + # Set up the polylines + points = generator.uniform(0.0, 1.0, size=(batch_size, num_points, 2)).astype(DTYPE_NP) + lengths = _compute_polyline_lengths_np(points) + # Set up the distances to sample the polyline at + distances_normalized = generator.uniform(0.0, 1.0, size=(batch_size, num_distances)).astype(DTYPE_NP) + distances = distances_normalized * lengths[:, None] + return points, distances + + +# Helper function for iterating over deterministic benchmark configurations. 
+def _iter_evaluation_cases( + batch_size: int, + nums_points: list[int], + nums_distances: list[int], +) -> Iterator[tuple[int, int, int, int, int]]: + for points_idx, num_points_current in enumerate(nums_points): + for distances_idx, num_distances_current in enumerate(nums_distances): + seed = batch_size * 1_000_000 + num_points_current * 1_000 + num_distances_current + yield points_idx, distances_idx, num_points_current, num_distances_current, seed + + +# Helper function for placing the same NumPy inputs on CUDA and CPU. +def _make_torch_tensors( + *arrays: np.ndarray, +) -> tuple[torch.Tensor, ...]: + tensors_gpu = [torch.tensor(array, device=DEVICE, dtype=DTYPE_TORCH) for array in arrays] + tensors_cpu = [torch.tensor(array, device="cpu", dtype=DTYPE_TORCH) for array in arrays] + return *tensors_gpu, *tensors_cpu + + +# Helper function for placing NumPy inputs on one target device. +def _make_torch_tensors_on_device( + *arrays: np.ndarray, + device: str, +) -> tuple[torch.Tensor, ...]: + tensors = tuple(torch.tensor(array, device=device, dtype=DTYPE_TORCH) for array in arrays) + return tensors + + +# Helper function for timing repeated calls and synchronizing CUDA work when needed. +def _time_call( + function: Callable[[], object], + *, + num_runs: int, + synchronize_cuda: bool = False, +) -> float: + if synchronize_cuda: + # Ensure previous work is finished before starting the timing. + torch.cuda.synchronize() + start = time.perf_counter() + for _ in range(num_runs): + function() + if synchronize_cuda: + # Ensure all work is finished before stopping the timing. + torch.cuda.synchronize() + runtime = (time.perf_counter() - start) / num_runs + return runtime + + +# Helper function for reducing cross-implementation timing interference. +def _cleanup_between_implementation_sweeps() -> None: + gc.collect() + if torch.cuda.is_available(): + torch.cuda.synchronize() + torch.cuda.empty_cache() + + +# Helper function for timing the Shapely reference implementation. 
def _time_shapely(
    points: np.ndarray,
    distances: np.ndarray,
    *,
    num_runs: int,
) -> float:
    """Return the mean runtime in seconds of the batched Shapely reference.

    Args:
        points: Polyline vertices as a NumPy array.
        distances: Sample distances as a NumPy array.
        num_runs: Number of timed calls.
    """

    def run_once() -> np.ndarray:
        return _compute_batched_shapely_reference(points, distances)

    return _time_call(run_once, num_runs=num_runs)


# Helper function for timing the CUDA implementation.
def _time_cuda(
    points: torch.Tensor,
    distances: torch.Tensor,
    *,
    num_runs: int,
) -> float:
    """Return the mean runtime in seconds of ``interpolate`` with CUDA synchronization.

    Args:
        points: Polyline vertices; the caller is expected to pass CUDA tensors.
        distances: Sample distances; same device expectation as ``points``.
        num_runs: Number of timed calls.
    """
    polyline_module = _get_polyline_module()

    def run_once() -> object:
        return polyline_module.interpolate(points, distances)

    # Synchronize around the timed region so queued GPU work is included.
    return _time_call(run_once, num_runs=num_runs, synchronize_cuda=True)


# Helper function for timing the CPU implementation.
def _time_cpu(
    points: torch.Tensor,
    distances: torch.Tensor,
    *,
    num_runs: int,
) -> float:
    """Return the mean runtime in seconds of ``interpolate`` without CUDA synchronization.

    Args:
        points: Polyline vertices; the caller is expected to pass CPU tensors.
        distances: Sample distances; same device expectation as ``points``.
        num_runs: Number of timed calls.
    """
    polyline_module = _get_polyline_module()

    def run_once() -> object:
        return polyline_module.interpolate(points, distances)

    return _time_call(run_once, num_runs=num_runs)


# Helper function for warming up all selected implementations once before measured runs.
def _run_warmup(
    *,
    batch_size: int,
    num_points: int,
    num_distances: int,
    num_warmup_runs: int,
    skip_shapely: bool,
) -> None:
    """Run untimed calls of every selected implementation on one seeded input.

    Does nothing when ``num_warmup_runs`` is not positive.

    Args:
        batch_size: Number of polylines in the warmup input.
        num_points: Points per polyline in the warmup input.
        num_distances: Sample distances per polyline in the warmup input.
        num_warmup_runs: Number of warmup iterations.
        skip_shapely: If true, the Shapely reference is not warmed up.
    """
    if num_warmup_runs <= 0:
        return

    points_np, distances_np = _make_evaluation_case(batch_size, num_points, num_distances, seed=0)
    points_gpu, distances_gpu, points_cpu, distances_cpu = _make_torch_tensors(points_np, distances_np)
    polyline_module = _get_polyline_module()

    warmup_shapely = not skip_shapely
    for _ in range(num_warmup_runs):
        if warmup_shapely:
            _compute_batched_shapely_reference(points_np, distances_np)
        polyline_module.interpolate(points_cpu, distances_cpu)
        polyline_module.interpolate(points_gpu, distances_gpu)

    # Wait for the queued GPU warmup work before returning to the caller.
    torch.cuda.synchronize()


# Helper to (optionally) validate the results against the Shapely reference.
def _run_validation_sweep(
    batch_size: int,
    nums_points: list[int],
    nums_distances: list[int],
    *,
    assert_atol: float,
    assert_rtol: float,
    max_abs_diff_cpu: np.ndarray,
    max_abs_diff_cuda: np.ndarray,
    max_abs_diff_cuda_vs_cpu: np.ndarray,
) -> None:
    """Compare the CPU and CUDA results with Shapely for every configuration.

    For each configuration the maximum absolute differences are written into the
    three output arrays first; the CPU result and then the CUDA result are
    asserted against Shapely afterwards.

    Args:
        batch_size: Number of polylines per call.
        nums_points: Point counts to sweep.
        nums_distances: Distance counts to sweep.
        assert_atol: Absolute tolerance for the assertions.
        assert_rtol: Relative tolerance for the assertions.
        max_abs_diff_cpu: Output array, filled in place at
            ``[points_idx, distances_idx]`` with the Shapely-vs-CPU difference.
        max_abs_diff_cuda: Output array, filled in place with the Shapely-vs-CUDA
            difference.
        max_abs_diff_cuda_vs_cpu: Output array, filled in place with the
            CPU-vs-CUDA difference.

    Raises:
        AssertionError: If a result differs from Shapely beyond the tolerances.
    """
    print(f"Running validation sweep for batch={batch_size}")
    polyline_module = _get_polyline_module()

    evaluation_cases = _iter_evaluation_cases(batch_size, nums_points, nums_distances)
    for points_idx, distances_idx, num_points_current, num_distances_current, seed in evaluation_cases:
        print(
            f"Running validation batch={batch_size}, points={num_points_current}, "
            f"distances={num_distances_current}"
        )
        points_np, distances_np = _make_evaluation_case(
            batch_size,
            num_points_current,
            num_distances_current,
            seed=seed,
        )
        shapely_result = _compute_batched_shapely_reference(points_np, distances_np)
        points_gpu, distances_gpu, points_cpu, distances_cpu = _make_torch_tensors(points_np, distances_np)
        cpu_result = polyline_module.interpolate(points_cpu, distances_cpu).numpy()
        cuda_result = polyline_module.interpolate(points_gpu, distances_gpu).cpu().numpy()

        cell = (points_idx, distances_idx)
        max_abs_diff_cpu[cell] = np.abs(shapely_result - cpu_result).max()
        max_abs_diff_cuda[cell] = np.abs(shapely_result - cuda_result).max()
        max_abs_diff_cuda_vs_cpu[cell] = np.abs(cpu_result - cuda_result).max()

        # The differences are recorded before asserting: CPU is checked first, then CUDA.
        for implementation_name, implementation_result in (("CPU", cpu_result), ("CUDA", cuda_result)):
            _assert_matches_shapely(
                shapely_result,
                implementation_result,
                implementation_name=implementation_name,
                batch_size=batch_size,
                num_points=num_points_current,
                num_distances=num_distances_current,
                atol=assert_atol,
                rtol=assert_rtol,
            )


# Helper function for evaluating every point-count and distance-count pair for one batch size.
+def _evaluate_batch_size( + batch_size: int, + nums_points: list[int], + nums_distances: list[int], + *, + num_runs: int, + assert_results: bool, + assert_atol: float, + assert_rtol: float, + skip_shapely: bool, +) -> tuple[ + np.ndarray | None, np.ndarray, np.ndarray, np.ndarray | None, np.ndarray | None, np.ndarray | None +]: + result_shape = (len(nums_points), len(nums_distances)) + + shapely_runtime_ms = None if skip_shapely else np.zeros(result_shape, dtype=np.float64) + cuda_runtime_ms = np.zeros(result_shape, dtype=np.float64) + cpu_runtime_ms = np.zeros(result_shape, dtype=np.float64) + + max_abs_diff_cuda = np.zeros_like(cpu_runtime_ms) if assert_results else None + max_abs_diff_cpu = np.zeros_like(cpu_runtime_ms) if assert_results else None + max_abs_diff_cuda_vs_cpu = np.zeros_like(cpu_runtime_ms) if assert_results else None + + if not skip_shapely: + print(f"Running Shapely sweep for batch={batch_size}, runs={num_runs}") + for ( + points_idx, + distances_idx, + num_points_current, + num_distances_current, + seed, + ) in _iter_evaluation_cases(batch_size, nums_points, nums_distances): + print( + "Running Shapely evaluation " + f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}, " + f"runs={num_runs}" + ) + points_np, distances_np = _make_evaluation_case( + batch_size, + num_points_current, + num_distances_current, + seed=seed, + ) + + shapely_runtime = _time_shapely( + points_np, + distances_np, + num_runs=num_runs, + ) + shapely_runtime_ms[points_idx, distances_idx] = shapely_runtime * 1000 + _cleanup_between_implementation_sweeps() + + print(f"Running CPU sweep for batch={batch_size}, runs={num_runs}") + for points_idx, distances_idx, num_points_current, num_distances_current, seed in _iter_evaluation_cases( + batch_size, nums_points, nums_distances + ): + print( + "Running CPU evaluation " + f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}, " + f"runs={num_runs}" + ) + points_np, 
# Helper function for parsing command-line arguments.
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the polyline benchmark sweep.

    The sweep lists (``--num-points``, ``--num-distances``, ``--batch-sizes``) are kept as
    comma-separated strings; ``main`` converts them with ``_parse_int_list``.

    Returns:
        The populated :class:`argparse.Namespace`.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Evaluate batched CPU/CUDA polyline interpolation against a Shapely LineString reference "
            "over point-count, distance-count, and batch-size sweeps."
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--num-points",
        dest="nums_points",
        default=",".join(str(item) for item in DEFAULT_NUMS_POINTS),
        help="Comma-separated point counts for the polyline-length sweep.",
    )
    parser.add_argument(
        "--num-distances",
        dest="nums_distances",
        default=",".join(str(item) for item in DEFAULT_NUMS_DISTANCES),
        help="Comma-separated sample-distance counts for the interpolation sweep.",
    )
    parser.add_argument(
        "--batch-sizes",
        default=",".join(str(item) for item in DEFAULT_BATCH_SIZES),
        help="Comma-separated batch sizes to evaluate.",
    )
    parser.add_argument(
        "--num-polylines-per-measurement",
        type=int,
        default=DEFAULT_NUM_POLYLINES_PER_MEASUREMENT,
        help="Target number of polylines measured per configuration; divided by batch size to get runs.",
    )
    parser.add_argument(
        "--num-warmup-runs",
        type=int,
        default=DEFAULT_NUM_WARMUP_RUNS,
        help="Number of untimed warmup runs before the measured sweep.",
    )
    parser.add_argument(
        "--warmup-num-points",
        type=int,
        default=DEFAULT_WARMUP_NUM_POINTS,
        help="Point count used for warmup inputs.",
    )
    parser.add_argument(
        "--warmup-num-distances",
        type=int,
        default=DEFAULT_WARMUP_NUM_DISTANCES,
        help="Sample-distance count used for warmup inputs.",
    )
    parser.add_argument(
        "--skip-shapely",
        action="store_true",
        default=DEFAULT_SKIP_SHAPELY,
        help="Skip Shapely reference timing and Shapely-based speedup plots.",
    )
    parser.add_argument(
        "--assert-results",
        action="store_true",
        default=DEFAULT_ASSERT_RESULTS,
        help="Compare CPU and CUDA outputs against Shapely using the configured tolerances.",
    )
    parser.add_argument(
        "--assert-atol",
        type=float,
        default=DEFAULT_ASSERT_ATOL,
        help="Absolute tolerance used when asserting results against Shapely.",
    )
    parser.add_argument(
        "--assert-rtol",
        type=float,
        default=DEFAULT_ASSERT_RTOL,
        help="Relative tolerance used when asserting results against Shapely.",
    )
    no_annotate_plots_action = parser.add_argument(
        "--no-annotate-plots",
        dest="annotate_plots",
        action="store_false",
        help="Disable annotations in generated heatmaps.",
    )
    # The effective default comes from the parser-level default below. The action-level default
    # is then suppressed; NOTE(review): this looks intended to stop ArgumentDefaultsHelpFormatter
    # from appending a "(default: ...)" suffix to the help of this negated flag - confirm.
    parser.set_defaults(annotate_plots=DEFAULT_ANNOTATE_PLOTS)
    no_annotate_plots_action.default = argparse.SUPPRESS
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("polyline_shapely_eval_results"),
        help="Directory for Markdown result tables and generated plot images.",
    )
    return parser.parse_args()


# Main entry point for the full benchmark sweep.
def main() -> None:
    """Run the benchmark sweep for every requested batch size and plot the results.

    For each batch size the runtimes are measured, written to the output directory as
    Markdown tables, and printed together with the derived speedups. Finally the plots
    are generated from the Markdown files in the output directory.

    Raises:
        ValueError: If ``--batch-sizes`` is empty or contains a non-positive value.
        RuntimeError: If PyTorch reports that CUDA is not available.
    """
    args = _parse_args()
    nums_points = _parse_int_list(args.nums_points)
    nums_distances = _parse_int_list(args.nums_distances)
    batch_sizes = _parse_int_list(args.batch_sizes)

    # Fail fast with a clear message: the warmup uses ``max(batch_sizes)`` and the number of
    # measured runs is obtained by an integer division by each batch size.
    if len(batch_sizes) == 0:
        raise ValueError("--batch-sizes must contain at least one batch size.")
    invalid_batch_sizes = [batch_size for batch_size in batch_sizes if batch_size <= 0]
    if invalid_batch_sizes:
        raise ValueError(f"--batch-sizes must be positive integers, got {invalid_batch_sizes}.")

    # Make relative output paths independent of the caller's working directory.
    if not args.output_dir.is_absolute():
        args.output_dir = SCRIPT_DIR / args.output_dir

    if not torch.cuda.is_available():
        raise RuntimeError("This evaluation requires a CUDA-capable PyTorch installation.")

    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Result assertions require Shapely, so disabling Shapely also disables assertions.
    assert_results = args.assert_results and not args.skip_shapely

    print("Performing runtime evaluation...")
    print(f"Numbers of points: {nums_points}")
    print(f"Numbers of distances: {nums_distances}")
    print(f"Batch sizes: {batch_sizes}")
    print(f"Measured polylines per configuration: {args.num_polylines_per_measurement}")
    print(
        "Warmup configuration: "
        f"batch={max(batch_sizes)}, points={args.warmup_num_points}, "
        f"distances={args.warmup_num_distances}, runs={args.num_warmup_runs}"
    )
    print(f"Use Shapely reference: {not args.skip_shapely}")
    print(f"Assert results against Shapely: {assert_results}")
    print(f"Annotate plots: {args.annotate_plots}")
    print(f"Output directory: {args.output_dir}")

    # The warmup is run once, with the largest requested batch size.
    _run_warmup(
        batch_size=max(batch_sizes),
        num_points=args.warmup_num_points,
        num_distances=args.warmup_num_distances,
        num_warmup_runs=args.num_warmup_runs,
        skip_shapely=args.skip_shapely,
    )

    for batch_size in batch_sizes:
        # Keep (roughly) the same number of measured polylines per configuration across batch sizes.
        num_runs = max(1, args.num_polylines_per_measurement // batch_size)
        print(f"Using {num_runs} measured runs for batch={batch_size}")

        # Run evaluation & get results for one batch size (number of polylines in single call).
        (
            shapely_runtime_ms,
            cpu_runtime_ms,
            cuda_runtime_ms,
            max_abs_diff_cpu,
            max_abs_diff_cuda,
            max_abs_diff_cuda_vs_cpu,
        ) = _evaluate_batch_size(
            batch_size,
            nums_points,
            nums_distances,
            num_runs=num_runs,
            assert_results=assert_results,
            assert_atol=args.assert_atol,
            assert_rtol=args.assert_rtol,
            skip_shapely=args.skip_shapely,
        )

        # Write results to disk.
        shapely_evaluation_outputs.write_batch_results(
            args.output_dir,
            batch_size,
            nums_points,
            nums_distances,
            shapely_runtime_ms,
            cpu_runtime_ms,
            cuda_runtime_ms,
            args.skip_shapely,
            assert_results,
            max_abs_diff_cpu,
            max_abs_diff_cuda,
            max_abs_diff_cuda_vs_cpu,
        )

        # Print info.
        cuda_speedup_over_cpu = cpu_runtime_ms / cuda_runtime_ms
        if not args.skip_shapely:
            cuda_speedup_over_shapely = shapely_runtime_ms / cuda_runtime_ms
            cpu_speedup_over_shapely = shapely_runtime_ms / cpu_runtime_ms
            print(f"Average Shapely runtime [ms], batch={batch_size}:\n{shapely_runtime_ms}")
        print(f"Average CPU runtime [ms], batch={batch_size}:\n{cpu_runtime_ms}")
        print(f"Average CUDA runtime [ms], batch={batch_size}:\n{cuda_runtime_ms}")
        if not args.skip_shapely:
            print(f"CPU speedup over Shapely, batch={batch_size}:\n{cpu_speedup_over_shapely}")
            print(f"CUDA speedup over Shapely, batch={batch_size}:\n{cuda_speedup_over_shapely}")
        print(f"CUDA speedup over CPU, batch={batch_size}:\n{cuda_speedup_over_cpu}")
        if assert_results:
            print(f"CUDA max absolute difference to CPU, batch={batch_size}:\n{max_abs_diff_cuda_vs_cpu}")
            print(f"CPU max absolute difference to Shapely, batch={batch_size}:\n{max_abs_diff_cpu}")
            print(f"CUDA max absolute difference to Shapely, batch={batch_size}:\n{max_abs_diff_cuda}")

    # The plots are produced from the Markdown tables written above, not from in-memory results.
    plotted_files = plot_shapely_evaluation.plot_from_markdown_directory(
        input_dir=args.output_dir,
        output_dir=args.output_dir,
        batch_sizes=batch_sizes,
        annotate_plots=args.annotate_plots,
    )
    print(f"Generated {len(plotted_files)} plot image(s) from Markdown results.")


if __name__ == "__main__":
    main()
9.861e-04 | 1.020e-03 | 1.141e-03 | 1.175e-03 | 1.157e-03 | 1.294e-03 | 1.536e-03 | 2.404e-03 | 3.742e-03 | 6.497e-03 | 1.486e-02 | +| 5 | 1.006e-03 | 1.011e-03 | 1.022e-03 | 1.044e-03 | 1.109e-03 | 1.292e-03 | 1.406e-03 | 1.77e-03 | 2.851e-03 | 4.664e-03 | 8.391e-03 | 2.134e-02 | +| 10 | 1.017e-03 | 1.018e-03 | 1.031e-03 | 1.056e-03 | 1.118e-03 | 1.32e-03 | 1.477e-03 | 1.912e-03 | 3.197e-03 | 5.305e-03 | 9.844e-03 | 2.811e-02 | +| 20 | 1.021e-03 | 1.029e-03 | 1.050e-03 | 1.07e-03 | 1.209e-03 | 1.318e-03 | 1.579e-03 | 2.032e-03 | 3.506e-03 | 6.075e-03 | 1.094e-02 | 2.844e-02 | +| 50 | 1.105e-03 | 1.113e-03 | 1.134e-03 | 1.157e-03 | 1.246e-03 | 1.509e-03 | 1.846e-03 | 2.308e-03 | 4.140e-03 | 7.052e-03 | 1.320e-02 | 4.408e-02 | +| 100 | 1.216e-03 | 1.238e-03 | 1.248e-03 | 1.277e-03 | 1.469e-03 | 1.582e-03 | 1.909e-03 | 2.581e-03 | 4.646e-03 | 8.162e-03 | 1.514e-02 | 5.076e-02 | +| 200 | 1.5e-03 | 1.494e-03 | 1.536e-03 | 1.604e-03 | 1.652e-03 | 1.869e-03 | 2.262e-03 | 3.043e-03 | 5.500e-03 | 9.331e-03 | 1.722e-02 | 7.581e-02 | +| 500 | 2.238e-03 | 2.232e-03 | 2.302e-03 | 2.347e-03 | 2.408e-03 | 2.654e-03 | 3.131e-03 | 4.002e-03 | 6.978e-03 | 1.108e-02 | 2.022e-02 | 9.638e-02 | +| 1000 | 3.430e-03 | 3.437e-03 | 3.476e-03 | 3.52e-03 | 3.594e-03 | 3.868e-03 | 4.464e-03 | 5.305e-03 | 8.287e-03 | 1.318e-02 | 2.306e-02 | 1.374e-01 | +| 2000 | 5.753e-03 | 5.774e-03 | 5.815e-03 | 5.88e-03 | 5.95e-03 | 6.256e-03 | 6.811e-03 | 7.942e-03 | 1.099e-02 | 1.65e-02 | 2.710e-02 | 1.768e-01 | +| 5000 | 1.279e-02 | 1.289e-02 | 1.285e-02 | 1.336e-02 | 1.37e-02 | 1.370e-02 | 1.45e-02 | 1.527e-02 | 1.876e-02 | 2.451e-02 | 3.842e-02 | 2.290e-01 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md new file mode 100644 index 0000000..ee2c4c8 --- /dev/null +++ 
b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 3.535e-03 | 3.428e-03 | 3.44e-03 | 3.456e-03 | 3.432e-03 | 3.443e-03 | 3.43e-03 | 3.521e-03 | 3.464e-03 | 3.47e-03 | 3.486e-03 | 4.911e-03 | +| 5 | 3.394e-03 | 3.39e-03 | 3.455e-03 | 3.415e-03 | 3.385e-03 | 3.388e-03 | 3.478e-03 | 3.472e-03 | 3.471e-03 | 3.533e-03 | 3.648e-03 | 5.321e-03 | +| 10 | 3.437e-03 | 3.397e-03 | 3.396e-03 | 3.392e-03 | 3.454e-03 | 3.384e-03 | 3.467e-03 | 3.482e-03 | 3.480e-03 | 3.476e-03 | 3.841e-03 | 5.798e-03 | +| 20 | 3.419e-03 | 3.408e-03 | 3.403e-03 | 3.381e-03 | 3.393e-03 | 3.442e-03 | 3.482e-03 | 3.464e-03 | 3.467e-03 | 3.468e-03 | 3.970e-03 | 6.137e-03 | +| 50 | 3.470e-03 | 3.4e-03 | 3.388e-03 | 3.394e-03 | 3.405e-03 | 3.382e-03 | 3.48e-03 | 3.528e-03 | 3.462e-03 | 3.471e-03 | 4.098e-03 | 6.448e-03 | +| 100 | 3.400e-03 | 3.453e-03 | 3.415e-03 | 3.401e-03 | 3.412e-03 | 3.411e-03 | 3.491e-03 | 3.417e-03 | 3.481e-03 | 3.535e-03 | 4.291e-03 | 6.711e-03 | +| 200 | 3.486e-03 | 3.396e-03 | 3.396e-03 | 3.454e-03 | 3.396e-03 | 3.403e-03 | 3.478e-03 | 3.460e-03 | 3.405e-03 | 3.461e-03 | 4.406e-03 | 7.067e-03 | +| 500 | 3.479e-03 | 3.389e-03 | 3.394e-03 | 3.409e-03 | 3.476e-03 | 3.385e-03 | 3.471e-03 | 3.474e-03 | 3.471e-03 | 3.594e-03 | 4.706e-03 | 7.429e-03 | +| 1000 | 3.478e-03 | 3.409e-03 | 3.407e-03 | 3.382e-03 | 3.383e-03 | 3.435e-03 | 3.453e-03 | 3.456e-03 | 3.463e-03 | 3.689e-03 | 5.009e-03 | 7.908e-03 | +| 2000 | 3.765e-03 | 3.787e-03 | 3.771e-03 | 3.777e-03 | 3.785e-03 | 3.795e-03 | 3.822e-03 | 3.857e-03 | 4.169e-03 | 4.674e-03 | 5.979e-03 | 9.683e-03 | +| 5000 | 6.e-03 | 5.96e-03 | 6.107e-03 | 6.115e-03 | 6.131e-03 | 6.205e-03 | 6.26e-03 | 6.297e-03 | 6.666e-03 | 7.521e-03 | 
9.576e-03 | 1.49e-02 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md new file mode 100644 index 0000000..218e33c --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 5.920e-03 | 5.929e-03 | 6.514e-03 | 7.182e-03 | 9.292e-03 | 1.258e-02 | 1.976e-02 | 3.39e-02 | 7.530e-02 | 1.522e-01 | 3.996e-01 | 1.684e+00 | +| 5 | 5.928e-03 | 5.962e-03 | 6.537e-03 | 7.384e-03 | 1.018e-02 | 1.435e-02 | 2.246e-02 | 3.783e-02 | 9.250e-02 | 2.432e-01 | 4.686e-01 | 1.798e+00 | +| 10 | 6.044e-03 | 6.014e-03 | 6.824e-03 | 7.726e-03 | 9.796e-03 | 1.534e-02 | 2.687e-02 | 4.409e-02 | 1.118e-01 | 2.867e-01 | 5.987e-01 | 2.145e+00 | +| 20 | 6.032e-03 | 6.105e-03 | 6.865e-03 | 8.794e-03 | 1.120e-02 | 2.134e-02 | 3.391e-02 | 6.222e-02 | 1.556e-01 | 3.645e-01 | 6.786e-01 | 2.637e+00 | +| 50 | 6.499e-03 | 6.279e-03 | 8.662e-03 | 9.916e-03 | 1.697e-02 | 3.003e-02 | 5.847e-02 | 1.228e-01 | 2.529e-01 | 6.603e-01 | 1.177e+00 | 3.703e+00 | +| 100 | 6.479e-03 | 7.37e-03 | 1.051e-02 | 1.371e-02 | 2.748e-02 | 4.721e-02 | 1.027e-01 | 1.929e-01 | 5.007e-01 | 9.981e-01 | 2.129e+00 | 6.030e+00 | +| 200 | 7.691e-03 | 9.454e-03 | 1.444e-02 | 2.263e-02 | 3.906e-02 | 9.756e-02 | 1.886e-01 | 3.467e-01 | 8.815e-01 | 1.792e+00 | 3.648e+00 | 1.008e+01 | +| 500 | 8.883e-03 | 1.283e-02 | 3.616e-02 | 3.52e-02 | 8.427e-02 | 2.162e-01 | 4.081e-01 | 8.556e-01 | 2.177e+00 | 4.328e+00 | 8.686e+00 | 2.235e+01 | +| 1000 | 8.225e-03 | 2.181e-02 | 5.647e-02 | 8.705e-02 | 1.809e-01 | 4.044e-01 | 7.995e-01 | 1.644e+00 | 4.172e+00 | 8.583e+00 | 
1.69e+01 | 4.323e+01 | +| 2000 | 3.628e-02 | 4.154e-02 | 6.644e-02 | 1.663e-01 | 4.074e-01 | 8.366e-01 | 1.748e+00 | 3.414e+00 | 8.769e+00 | 1.775e+01 | 3.397e+01 | 8.585e+01 | +| 5000 | 5.857e-02 | 1.474e-01 | 2.871e-01 | 5.538e-01 | 8.380e-01 | 2.379e+00 | 4.255e+00 | 8.153e+00 | 2.054e+01 | 4.167e+01 | 8.298e+01 | 2.090e+02 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md new file mode 100644 index 0000000..4d3f267 --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 5.75 | 6.01 | 6.38 | 6.30 | 7.91 | 10.87 | 15.27 | 22.07 | 31.33 | 40.68 | 61.51 | 113.29 | +| 5 | 5.89 | 5.90 | 6.40 | 7.07 | 9.18 | 11.11 | 15.98 | 21.38 | 32.44 | 52.15 | 55.84 | 84.26 | +| 10 | 5.94 | 5.91 | 6.62 | 7.32 | 8.76 | 11.63 | 18.19 | 23.06 | 34.97 | 54.04 | 60.82 | 76.29 | +| 20 | 5.91 | 5.93 | 6.54 | 8.22 | 9.26 | 16.20 | 21.48 | 30.62 | 44.39 | 60.01 | 62.01 | 92.75 | +| 50 | 5.88 | 5.64 | 7.64 | 8.57 | 13.61 | 19.90 | 31.67 | 53.21 | 61.08 | 93.63 | 89.16 | 84.01 | +| 100 | 5.33 | 5.96 | 8.42 | 10.73 | 18.70 | 29.85 | 53.82 | 74.74 | 107.76 | 122.28 | 140.68 | 118.80 | +| 200 | 5.13 | 6.33 | 9.40 | 14.11 | 23.64 | 52.19 | 83.38 | 113.93 | 160.26 | 192.08 | 211.85 | 132.97 | +| 500 | 3.97 | 5.75 | 15.71 | 15.00 | 34.99 | 81.47 | 130.37 | 213.79 | 311.96 | 390.72 | 429.62 | 231.94 | +| 1000 | 2.40 | 6.35 | 16.25 | 24.73 | 50.34 | 104.54 | 179.10 | 309.90 | 503.46 | 651.02 | 732.57 | 314.65 | +| 2000 | 6.31 | 7.19 | 11.43 | 28.28 | 68.47 | 133.72 | 256.58 | 429.83 | 797.87 | 1.08e+03 | 
1.25e+03 | 485.50 | +| 5000 | 4.58 | 11.44 | 22.34 | 41.47 | 61.17 | 173.59 | 293.48 | 533.92 | 1.09e+03 | 1.70e+03 | 2.16e+03 | 912.55 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md new file mode 100644 index 0000000..487290e --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 0.29 | 0.29 | 0.30 | 0.33 | 0.34 | 0.34 | 0.38 | 0.44 | 0.69 | 1.08 | 1.86 | 3.03 | +| 5 | 0.30 | 0.30 | 0.30 | 0.31 | 0.33 | 0.38 | 0.40 | 0.51 | 0.82 | 1.32 | 2.30 | 4.01 | +| 10 | 0.30 | 0.30 | 0.30 | 0.31 | 0.32 | 0.39 | 0.43 | 0.55 | 0.92 | 1.53 | 2.56 | 4.85 | +| 20 | 0.30 | 0.30 | 0.31 | 0.32 | 0.36 | 0.38 | 0.45 | 0.59 | 1.01 | 1.75 | 2.76 | 4.63 | +| 50 | 0.32 | 0.33 | 0.33 | 0.34 | 0.37 | 0.45 | 0.53 | 0.65 | 1.20 | 2.03 | 3.22 | 6.84 | +| 100 | 0.36 | 0.36 | 0.37 | 0.38 | 0.43 | 0.46 | 0.55 | 0.76 | 1.33 | 2.31 | 3.53 | 7.56 | +| 200 | 0.43 | 0.44 | 0.45 | 0.46 | 0.49 | 0.55 | 0.65 | 0.88 | 1.62 | 2.70 | 3.91 | 10.73 | +| 500 | 0.64 | 0.66 | 0.68 | 0.69 | 0.69 | 0.78 | 0.90 | 1.15 | 2.01 | 3.08 | 4.30 | 12.97 | +| 1000 | 0.99 | 1.01 | 1.02 | 1.04 | 1.06 | 1.13 | 1.29 | 1.53 | 2.39 | 3.57 | 4.60 | 17.37 | +| 2000 | 1.53 | 1.52 | 1.54 | 1.56 | 1.57 | 1.65 | 1.78 | 2.06 | 2.64 | 3.53 | 4.53 | 18.26 | +| 5000 | 2.13 | 2.16 | 2.10 | 2.18 | 2.23 | 2.21 | 2.32 | 2.43 | 2.81 | 3.26 | 4.01 | 15.38 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md 
b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md new file mode 100644 index 0000000..1125800 --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 1.67 | 1.73 | 1.89 | 2.08 | 2.71 | 3.65 | 5.76 | 9.63 | 21.74 | 43.87 | 114.64 | 342.86 | +| 5 | 1.75 | 1.76 | 1.89 | 2.16 | 3.01 | 4.23 | 6.46 | 10.89 | 26.65 | 68.85 | 128.46 | 337.91 | +| 10 | 1.76 | 1.77 | 2.01 | 2.28 | 2.84 | 4.53 | 7.75 | 12.66 | 32.12 | 82.47 | 155.86 | 369.85 | +| 20 | 1.76 | 1.79 | 2.02 | 2.60 | 3.30 | 6.20 | 9.74 | 17.96 | 44.88 | 105.11 | 170.94 | 429.75 | +| 50 | 1.87 | 1.85 | 2.56 | 2.92 | 4.98 | 8.88 | 16.80 | 34.81 | 73.04 | 190.20 | 287.27 | 574.20 | +| 100 | 1.91 | 2.13 | 3.08 | 4.03 | 8.05 | 13.84 | 29.42 | 56.46 | 143.84 | 282.36 | 496.26 | 898.57 | +| 200 | 2.21 | 2.78 | 4.25 | 6.55 | 11.50 | 28.67 | 54.23 | 100.20 | 258.88 | 517.84 | 827.99 | 1.43e+03 | +| 500 | 2.55 | 3.79 | 10.65 | 10.32 | 24.24 | 63.87 | 117.57 | 246.30 | 627.28 | 1.20e+03 | 1.85e+03 | 3.01e+03 | +| 1000 | 2.36 | 6.40 | 16.58 | 25.74 | 53.47 | 117.71 | 231.55 | 475.60 | 1.20e+03 | 2.33e+03 | 3.37e+03 | 5.47e+03 | +| 2000 | 9.63 | 10.97 | 17.62 | 44.03 | 107.63 | 220.43 | 457.23 | 885.05 | 2.10e+03 | 3.80e+03 | 5.68e+03 | 8.87e+03 | +| 5000 | 9.76 | 24.73 | 47.01 | 90.57 | 136.68 | 383.37 | 679.79 | 1.29e+03 | 3.08e+03 | 5.54e+03 | 8.67e+03 | 1.40e+04 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md new file mode 100644 index 0000000..584833d --- /dev/null +++ 
b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 3.839e-03 | 2.263e-03 | 2.751e-03 | 3.603e-03 | 5.367e-03 | 1.043e-02 | 1.963e-02 | 3.62e-02 | 8.759e-02 | 1.75e-01 | 3.531e-01 | 8.816e-01 | +| 5 | 2.581e-03 | 2.852e-03 | 3.709e-03 | 5.404e-03 | 8.689e-03 | 2.09e-02 | 4.92e-02 | 1.152e-01 | 3.25e-01 | 6.775e-01 | 1.329e+00 | 3.191e+00 | +| 10 | 3.203e-03 | 3.694e-03 | 4.880e-03 | 6.879e-03 | 1.078e-02 | 2.716e-02 | 6.540e-02 | 1.654e-01 | 4.613e-01 | 9.447e-01 | 1.831e+00 | 4.547e+00 | +| 20 | 4.909e-03 | 5.348e-03 | 6.714e-03 | 8.853e-03 | 1.4e-02 | 3.155e-02 | 8.694e-02 | 2.126e-01 | 5.852e-01 | 1.196e+00 | 2.377e+00 | 5.819e+00 | +| 50 | 9.96e-03 | 1.06e-02 | 1.193e-02 | 1.534e-02 | 2.064e-02 | 4.532e-02 | 1.234e-01 | 2.898e-01 | 7.611e-01 | 1.517e+00 | 3.015e+00 | 7.419e+00 | +| 100 | 1.739e-02 | 1.813e-02 | 2.019e-02 | 2.349e-02 | 2.952e-02 | 5.775e-02 | 1.613e-01 | 3.547e-01 | 8.880e-01 | 1.768e+00 | 3.481e+00 | 8.625e+00 | +| 200 | 3.283e-02 | 3.274e-02 | 3.517e-02 | 3.895e-02 | 4.640e-02 | 8.618e-02 | 2.034e-01 | 4.200e-01 | 1.036e+00 | 2.037e+00 | 4.028e+00 | 9.911e+00 | +| 500 | 7.66e-02 | 7.725e-02 | 8.045e-02 | 8.389e-02 | 9.346e-02 | 1.439e-01 | 2.970e-01 | 5.451e-01 | 1.259e+00 | 2.425e+00 | 4.751e+00 | 1.168e+01 | +| 1000 | 1.509e-01 | 1.580e-01 | 1.543e-01 | 1.593e-01 | 1.691e-01 | 2.349e-01 | 4.054e-01 | 6.792e-01 | 1.476e+00 | 2.797e+00 | 5.417e+00 | 1.32e+01 | +| 2000 | 3.070e-01 | 3.025e-01 | 3.032e-01 | 3.147e-01 | 3.206e-01 | 4.106e-01 | 5.676e-01 | 9.091e-01 | 1.797e+00 | 3.308e+00 | 6.246e+00 | 1.515e+01 | +| 5000 | 7.461e-01 | 7.505e-01 | 7.574e-01 | 7.580e-01 | 7.719e-01 | 8.920e-01 | 1.084e+00 | 1.424e+00 | 2.472e+00 | 4.160e+00 | 
7.566e+00 | 1.774e+01 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md new file mode 100644 index 0000000..66190a3 --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 8.093e-03 | 4.831e-03 | 4.721e-03 | 4.601e-03 | 4.671e-03 | 4.493e-03 | 4.649e-03 | 4.688e-03 | 4.626e-03 | 4.676e-03 | 5.169e-03 | 8.322e-03 | +| 5 | 4.795e-03 | 4.643e-03 | 4.625e-03 | 4.689e-03 | 4.613e-03 | 4.672e-03 | 4.744e-03 | 4.554e-03 | 4.539e-03 | 5.416e-03 | 5.288e-03 | 8.217e-03 | +| 10 | 4.59e-03 | 4.752e-03 | 5.439e-03 | 4.58e-03 | 4.614e-03 | 4.622e-03 | 4.530e-03 | 4.682e-03 | 4.752e-03 | 4.708e-03 | 5.339e-03 | 8.593e-03 | +| 20 | 4.6e-03 | 4.605e-03 | 4.989e-03 | 4.693e-03 | 4.636e-03 | 5.333e-03 | 4.715e-03 | 4.553e-03 | 4.573e-03 | 4.735e-03 | 5.519e-03 | 9.150e-03 | +| 50 | 4.689e-03 | 4.66e-03 | 4.779e-03 | 4.624e-03 | 4.725e-03 | 4.518e-03 | 4.727e-03 | 4.716e-03 | 5.022e-03 | 4.643e-03 | 5.667e-03 | 9.064e-03 | +| 100 | 4.615e-03 | 4.651e-03 | 4.751e-03 | 4.623e-03 | 4.545e-03 | 4.623e-03 | 4.731e-03 | 4.651e-03 | 5.399e-03 | 4.741e-03 | 1.487e-02 | 9.328e-03 | +| 200 | 4.689e-03 | 5.153e-03 | 4.695e-03 | 4.742e-03 | 4.776e-03 | 4.716e-03 | 5.547e-03 | 4.72e-03 | 4.628e-03 | 4.870e-03 | 5.972e-03 | 9.408e-03 | +| 500 | 4.613e-03 | 4.76e-03 | 4.683e-03 | 4.732e-03 | 5.307e-03 | 4.693e-03 | 4.659e-03 | 4.737e-03 | 5.204e-03 | 4.969e-03 | 6.236e-03 | 9.575e-03 | +| 1000 | 4.701e-03 | 4.749e-03 | 4.716e-03 | 4.71e-03 | 4.676e-03 | 4.783e-03 | 4.781e-03 | 4.735e-03 | 4.820e-03 | 5.343e-03 | 6.652e-03 | 
1.028e-02 | +| 2000 | 5.166e-03 | 5.164e-03 | 5.182e-03 | 5.161e-03 | 5.152e-03 | 5.738e-03 | 5.541e-03 | 5.356e-03 | 5.642e-03 | 6.490e-03 | 7.966e-03 | 1.232e-02 | +| 5000 | 7.627e-03 | 7.593e-03 | 7.564e-03 | 7.658e-03 | 7.698e-03 | 7.712e-03 | 7.797e-03 | 7.8e-03 | 8.731e-03 | 1.037e-02 | 1.327e-02 | 2.230e-02 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md new file mode 100644 index 0000000..afd2356 --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 2.728e-02 | 3.092e-02 | 5.674e-02 | 1.079e-01 | 1.983e-01 | 4.919e-01 | 9.971e-01 | 7.669e+00 | 1.382e+01 | 3.652e+01 | 7.587e+01 | 2.001e+02 | +| 5 | 2.908e-02 | 3.597e-02 | 6.759e-02 | 1.316e-01 | 2.390e-01 | 5.987e-01 | 5.551e+00 | 5.53e+00 | 1.544e+01 | 3.595e+01 | 8.467e+01 | 2.072e+02 | +| 10 | 3.137e-02 | 4.208e-02 | 8.376e-02 | 1.58e-01 | 3.011e-01 | 4.584e+00 | 1.489e+00 | 5.831e+00 | 1.888e+01 | 3.929e+01 | 8.577e+01 | 2.247e+02 | +| 20 | 3.699e-02 | 5.445e-02 | 1.104e-01 | 2.123e-01 | 4.003e-01 | 1.003e+00 | 5.894e+00 | 6.842e+00 | 2.179e+01 | 4.142e+01 | 1.000e+02 | 2.533e+02 | +| 50 | 5.549e-02 | 8.991e-02 | 1.946e-01 | 3.639e-01 | 7.197e-01 | 1.785e+00 | 3.579e+00 | 7.337e+00 | 3.250e+01 | 5.705e+01 | 1.321e+02 | 3.331e+02 | +| 100 | 8.595e-02 | 1.505e-01 | 3.145e-01 | 6.342e-01 | 1.227e+00 | 3.114e+00 | 6.111e+00 | 1.251e+01 | 4.497e+01 | 8.732e+01 | 1.823e+02 | 4.563e+02 | +| 200 | 1.464e-01 | 2.415e-01 | 6.012e-01 | 1.157e+00 | 2.252e+00 | 5.739e+00 | 1.556e+01 | 2.64e+01 | 6.973e+01 | 1.423e+02 | 2.886e+02 | 
7.270e+02 | +| 500 | 3.363e-01 | 5.55e-01 | 1.525e+00 | 2.669e+00 | 5.563e+00 | 1.352e+01 | 3.084e+01 | 5.781e+01 | 1.467e+02 | 2.977e+02 | 6.112e+02 | 1.519e+03 | +| 1000 | 6.443e-01 | 1.201e+00 | 2.763e+00 | 5.361e+00 | 1.078e+01 | 2.608e+01 | 5.396e+01 | 1.076e+02 | 2.877e+02 | 5.609e+02 | 1.143e+03 | 2.861e+03 | +| 2000 | 1.189e+00 | 1.968e+00 | 5.459e+00 | 1.057e+01 | 2.136e+01 | 5.446e+01 | 1.096e+02 | 2.185e+02 | 5.670e+02 | 1.114e+03 | 2.244e+03 | 5.570e+03 | +| 5000 | 5.404e+00 | 5.521e+00 | 1.321e+01 | 2.739e+01 | 5.495e+01 | 1.321e+02 | 2.654e+02 | 5.364e+02 | 1.375e+03 | 2.726e+03 | 5.435e+03 | 1.369e+04 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md new file mode 100644 index 0000000..929b485 --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 7.11 | 13.66 | 20.62 | 29.95 | 36.94 | 47.14 | 50.78 | 211.88 | 157.80 | 208.71 | 214.88 | 226.96 | +| 5 | 11.27 | 12.61 | 18.22 | 24.35 | 27.51 | 28.65 | 112.84 | 48.00 | 47.51 | 53.07 | 63.71 | 64.93 | +| 10 | 9.79 | 11.39 | 17.16 | 22.96 | 27.92 | 168.75 | 22.77 | 35.24 | 40.93 | 41.59 | 46.86 | 49.41 | +| 20 | 7.53 | 10.18 | 16.45 | 23.99 | 28.60 | 31.78 | 67.79 | 32.19 | 37.23 | 34.62 | 42.08 | 43.52 | +| 50 | 5.57 | 8.48 | 16.31 | 23.72 | 34.86 | 39.38 | 29.00 | 25.31 | 42.71 | 37.61 | 43.83 | 44.90 | +| 100 | 4.94 | 8.30 | 15.58 | 27.00 | 41.57 | 53.93 | 37.89 | 35.28 | 50.64 | 49.39 | 52.37 | 52.91 | +| 200 | 4.46 | 7.38 | 17.09 | 29.70 | 48.53 | 66.59 | 76.49 | 62.85 | 67.33 | 69.88 | 71.66 | 73.36 
| +| 500 | 4.39 | 7.18 | 18.96 | 31.82 | 59.52 | 93.93 | 103.85 | 106.04 | 116.48 | 122.75 | 128.65 | 130.10 | +| 1000 | 4.27 | 7.60 | 17.91 | 33.65 | 63.73 | 111.04 | 133.13 | 158.49 | 194.95 | 200.54 | 210.95 | 216.73 | +| 2000 | 3.87 | 6.51 | 18.00 | 33.57 | 66.62 | 132.64 | 193.13 | 240.29 | 315.51 | 336.89 | 359.30 | 367.59 | +| 5000 | 7.24 | 7.36 | 17.44 | 36.14 | 71.19 | 148.05 | 244.92 | 376.69 | 556.45 | 655.34 | 718.35 | 771.81 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md new file mode 100644 index 0000000..39bcbd0 --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 0.47 | 0.47 | 0.58 | 0.78 | 1.15 | 2.32 | 4.22 | 7.72 | 18.93 | 37.42 | 68.31 | 105.94 | +| 5 | 0.54 | 0.61 | 0.80 | 1.15 | 1.88 | 4.47 | 10.37 | 25.30 | 71.59 | 125.09 | 251.34 | 388.32 | +| 10 | 0.70 | 0.78 | 0.90 | 1.50 | 2.34 | 5.88 | 14.44 | 35.34 | 97.07 | 200.67 | 342.88 | 529.19 | +| 20 | 1.07 | 1.16 | 1.35 | 1.89 | 3.02 | 5.92 | 18.44 | 46.69 | 127.98 | 252.67 | 430.68 | 636.00 | +| 50 | 2.12 | 2.27 | 2.50 | 3.32 | 4.37 | 10.03 | 26.11 | 61.46 | 151.55 | 326.74 | 532.01 | 818.56 | +| 100 | 3.77 | 3.90 | 4.25 | 5.08 | 6.49 | 12.49 | 34.09 | 76.28 | 164.48 | 372.91 | 234.16 | 924.55 | +| 200 | 7.00 | 6.35 | 7.49 | 8.21 | 9.72 | 18.28 | 36.67 | 89.00 | 223.79 | 418.30 | 674.41 | 1.05e+03 | +| 500 | 16.61 | 16.23 | 17.18 | 17.73 | 17.61 | 30.67 | 63.76 | 115.08 | 242.03 | 488.09 | 761.85 | 1.22e+03 | +| 1000 | 32.10 | 33.27 | 32.72 | 33.83 | 36.17 | 49.11 | 84.79 | 143.44 | 306.20 | 523.52 
| 814.37 | 1.28e+03 | +| 2000 | 59.44 | 58.58 | 58.52 | 60.99 | 62.24 | 71.56 | 102.44 | 169.74 | 318.56 | 509.73 | 784.05 | 1.23e+03 | +| 5000 | 97.82 | 98.84 | 100.13 | 98.98 | 100.27 | 115.66 | 139.00 | 182.58 | 283.08 | 401.24 | 570.04 | 795.20 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md new file mode 100644 index 0000000..12e13eb --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 3.37 | 6.40 | 12.02 | 23.45 | 42.45 | 109.46 | 214.50 | 1.64e+03 | 2.99e+03 | 7.81e+03 | 1.47e+04 | 2.40e+04 | +| 5 | 6.06 | 7.75 | 14.61 | 28.06 | 51.83 | 128.15 | 1.17e+03 | 1.21e+03 | 3.40e+03 | 6.64e+03 | 1.60e+04 | 2.52e+04 | +| 10 | 6.83 | 8.85 | 15.40 | 34.49 | 65.26 | 991.79 | 328.65 | 1.25e+03 | 3.97e+03 | 8.35e+03 | 1.61e+04 | 2.61e+04 | +| 20 | 8.04 | 11.83 | 22.13 | 45.25 | 86.35 | 188.01 | 1.25e+03 | 1.50e+03 | 4.77e+03 | 8.75e+03 | 1.81e+04 | 2.77e+04 | +| 50 | 11.84 | 19.30 | 40.72 | 78.71 | 152.33 | 394.99 | 757.11 | 1.56e+03 | 6.47e+03 | 1.23e+04 | 2.33e+04 | 3.68e+04 | +| 100 | 18.63 | 32.36 | 66.20 | 137.20 | 269.98 | 673.70 | 1.29e+03 | 2.69e+03 | 8.33e+03 | 1.84e+04 | 1.23e+04 | 4.89e+04 | +| 200 | 31.22 | 46.87 | 128.05 | 243.96 | 471.56 | 1.22e+03 | 2.80e+03 | 5.59e+03 | 1.51e+04 | 2.92e+04 | 4.83e+04 | 7.73e+04 | +| 500 | 72.90 | 116.59 | 325.71 | 564.06 | 1.05e+03 | 2.88e+03 | 6.62e+03 | 1.22e+04 | 2.82e+04 | 5.99e+04 | 9.80e+04 | 1.59e+05 | +| 1000 | 137.05 | 252.94 | 585.84 | 1.14e+03 | 2.31e+03 | 5.45e+03 | 1.13e+04 | 2.27e+04 | 5.97e+04 | 
def main() -> None:
    """Interpolate one rectangle polyline at a few distances on the GPU and print the results.

    Raises:
        RuntimeError: If PyTorch reports that CUDA is not available.
    """
    cuda_available = torch.cuda.is_available()
    if not cuda_available:
        raise RuntimeError("This example requires a CUDA-capable PyTorch installation.")

    # Both input tensors are created on the GPU as 32-bit floats.
    tensor_options = {"device": "cuda", "dtype": torch.float32}

    # @NOTE One closed rectangle polyline; the tensor shape is (batch=1, num_points=5, num_dims=2).
    rectangle_corners = [
        [0.0, 0.0],
        [1.0, 0.0],
        [1.0, 2.0],
        [0.0, 2.0],
        [0.0, 0.0],
    ]
    points = torch.tensor([rectangle_corners], **tensor_options)

    # @NOTE Distances at which the polyline is sampled; the tensor shape is (batch=1, num_distances=5).
    sample_distances = [0.0, 0.5, 1.0, 3.0, 6.0]
    distances = torch.tensor([sample_distances], **tensor_options)

    # @NOTE Sample the polyline at the requested distances.
    sampled_points = polyline.interpolate(points, distances)
    # @NOTE Query the length of the polyline.
    line_lengths = polyline.lengths(points)

    # @NOTE Show both results.
    print(f"Interpolated points:\n{sampled_points}")
    print(f"Line length(s): {line_lengths}")


if __name__ == "__main__":
    main()
+ +cmake_minimum_required(VERSION 3.18) +project(accvlab_lane_helpers_ext LANGUAGES CXX CUDA) + +if(NOT DEFINED CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +execute_process( + COMMAND "python3" -c "import torch; import os; print(os.path.join(os.path.dirname(torch.__file__), 'share', 'cmake'))" + OUTPUT_VARIABLE TORCH_CMAKE_PATH + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +list(APPEND CMAKE_PREFIX_PATH "${TORCH_CMAKE_PATH}") + +find_package(CUDA REQUIRED) +find_package(Torch REQUIRED) +find_package(Python COMPONENTS Interpreter Development REQUIRED) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -m pybind11 --cmakedir + OUTPUT_VARIABLE pybind11_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +find_package(pybind11 REQUIRED) + +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24") + set(CMAKE_CUDA_ARCHITECTURES native) + else() + set(CMAKE_CUDA_ARCHITECTURES "75;80;86") + endif() +endif() + +separate_arguments(TORCH_CXX_FLAGS_LIST NATIVE_COMMAND "${TORCH_CXX_FLAGS}") + +find_library(TORCH_PYTHON_LIBRARY torch_python PATHS ${TORCH_INSTALL_PREFIX}/lib) + +add_subdirectory(polyline) diff --git a/packages/lane_helpers/ext_impl/polyline/CMakeLists.txt b/packages/lane_helpers/ext_impl/polyline/CMakeLists.txt new file mode 100644 index 0000000..207431b --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/CMakeLists.txt @@ -0,0 +1,50 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +pybind11_add_module(accvlab_lane_helpers_polyline_sampling MODULE + src/polyline.cu + src/polyline_cpu.cpp + src/polyline.cpp +) + +set_target_properties(accvlab_lane_helpers_polyline_sampling PROPERTIES + CXX_STANDARD 17 + CUDA_STANDARD 17 + OUTPUT_NAME "_polyline_sampling" + PREFIX "" +) + +target_compile_definitions(accvlab_lane_helpers_polyline_sampling PRIVATE + TORCH_EXTENSION_NAME=_polyline_sampling + TORCH_API_INCLUDE_EXTENSION_H +) + +target_compile_options(accvlab_lane_helpers_polyline_sampling PRIVATE ${TORCH_CXX_FLAGS_LIST}) + +target_link_libraries(accvlab_lane_helpers_polyline_sampling PRIVATE + ${TORCH_LIBRARIES} + ${CUDA_LIBRARIES} + ${TORCH_PYTHON_LIBRARY} +) + +target_include_directories(accvlab_lane_helpers_polyline_sampling PRIVATE + ${TORCH_INCLUDE_DIRS} + ${CUDA_INCLUDE_DIRS} + include/ +) + +install(TARGETS accvlab_lane_helpers_polyline_sampling + LIBRARY DESTINATION . + RUNTIME DESTINATION . +) diff --git a/packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh b/packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh new file mode 100644 index 0000000..441220a --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef POLYLINE_SAMPLING_CUDA_HELPER_MACROS_CUH +#define POLYLINE_SAMPLING_CUDA_HELPER_MACROS_CUH + +#include + +#define CUDA_CHECK(error_code_or_call) C10_CUDA_CHECK(error_code_or_call) +#define CUDA_CHECK_LAST() C10_CUDA_CHECK(cudaGetLastError()) + +#endif \ No newline at end of file diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline.cuh new file mode 100644 index 0000000..01ce298 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/include/polyline.cuh @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Host-visible interface for the polyline interpolation CUDA +// implementation. This header is intentionally free of CUDA device intrinsics +// so it can be included from both C++ and CUDA translation units. 
+ +#ifndef LANE_HELPERS_POLYLINE_CUH +#define LANE_HELPERS_POLYLINE_CUH + +#include +#include + +#include +#include +#include + +namespace polyline { + +template +struct PolylineLaunchConfig { + dim3 block_dim; + dim3 grid_dim; + int num_points_full_blocks; + size_t shared_mem_size; + size_t distance_buffer_ext_size_elems; + bool use_shared_distances; + size_t max_shared_full; +}; + +template +PolylineLaunchConfig make_polyline_launch_config(int num_points, int num_samples, int device); + +template +void polyline_interpolation(dtype* points, int num_points, int num_dims, dtype* distances, int num_distances, + dtype* result_points, int num_samples, bool relative_distances, int device, + const PolylineLaunchConfig& cfg, dtype* distance_buffer_ext, + cudaStream_t stream); + +template +void polyline_lengths(dtype* points, int num_points, int num_dims, dtype* lengths, int num_samples, + cudaStream_t stream); + +template +void polyline_interpolation_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* distances, + int num_distances, dtype* result_points, int num_samples, + sample_size_dtype* sample_sizes_points, + sample_size_dtype* sample_sizes_distances_to_sample, + bool relative_distances, int device, + const PolylineLaunchConfig& cfg, dtype* distance_buffer_ext, + cudaStream_t stream); + +template +void polyline_lengths_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* lengths, + int num_samples, sample_size_dtype* sample_sizes_points, + cudaStream_t stream); + +template +void polyline_interpolation_cpu(const dtype* points, int num_points, int num_dims, const dtype* distances, + int num_distances, dtype* result_points, int num_samples, + bool relative_distances); + +template +void polyline_lengths_cpu(const dtype* points, int num_points, int num_dims, dtype* lengths, int num_samples); + +template +void polyline_interpolation_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims, + const dtype* distances, int 
num_distances, + dtype* result_points, int num_samples, + const sample_size_dtype* sample_sizes_points, + const sample_size_dtype* sample_sizes_distances_to_sample, + bool relative_distances); + +template +void polyline_lengths_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims, + dtype* lengths, int num_samples, + const sample_size_dtype* sample_sizes_points); + +// Explicit instantiations are provided in polyline.cu and polyline_cpu.cpp. +#define DECLARE_POLYLINE_LAUNCH_CONFIG_EXTERN(DTYPE) \ + extern template PolylineLaunchConfig make_polyline_launch_config( \ + int num_points, int num_samples, int device); + +#define DECLARE_POLYLINE_INTERPOLATION_EXTERN(DTYPE) \ + extern template void polyline_interpolation( \ + DTYPE * points, int num_points, int num_dims, DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, bool relative_distances, int device, \ + const PolylineLaunchConfig& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream); + +#define DECLARE_POLYLINE_LENGTHS_EXTERN(DTYPE) \ + extern template void polyline_lengths(DTYPE * points, int num_points, int num_dims, \ + DTYPE* lengths, int num_samples, cudaStream_t stream); + +#define DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE) \ + extern template void polyline_interpolation_var_size_batch( \ + DTYPE * points, int max_num_points, int num_dims, DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, SAMPLE_SIZE_DTYPE* sample_sizes_points, \ + SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances, int device, \ + const PolylineLaunchConfig& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream); + +#define DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE) \ + extern template void polyline_lengths_var_size_batch( \ + DTYPE * points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \ + SAMPLE_SIZE_DTYPE* sample_sizes_points, cudaStream_t stream); 
+ +#define DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(DTYPE) \ + DECLARE_POLYLINE_LAUNCH_CONFIG_EXTERN(DTYPE) \ + DECLARE_POLYLINE_INTERPOLATION_EXTERN(DTYPE) \ + DECLARE_POLYLINE_LENGTHS_EXTERN(DTYPE) \ + DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN(DTYPE, int) \ + DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN(DTYPE, int64_t) \ + DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN(DTYPE, int) \ + DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN(DTYPE, int64_t) + +DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(float) +DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(double) +DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(c10::Half) +DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(c10::BFloat16) + +#undef DECLARE_POLYLINE_CUDA_DTYPE_EXTERN +#undef DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN +#undef DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN +#undef DECLARE_POLYLINE_LENGTHS_EXTERN +#undef DECLARE_POLYLINE_INTERPOLATION_EXTERN +#undef DECLARE_POLYLINE_LAUNCH_CONFIG_EXTERN + +#define DECLARE_POLYLINE_INTERPOLATION_CPU_EXTERN(DTYPE) \ + extern template void polyline_interpolation_cpu( \ + const DTYPE* points, int num_points, int num_dims, const DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, bool relative_distances); + +#define DECLARE_POLYLINE_LENGTHS_CPU_EXTERN(DTYPE) \ + extern template void polyline_lengths_cpu(const DTYPE* points, int num_points, int num_dims, \ + DTYPE* lengths, int num_samples); + +#define DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE) \ + extern template void polyline_interpolation_var_size_batch_cpu( \ + const DTYPE* points, int max_num_points, int num_dims, const DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, const SAMPLE_SIZE_DTYPE* sample_sizes_points, \ + const SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances); + +#define DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE) \ + extern template void 
polyline_lengths_var_size_batch_cpu( \ + const DTYPE* points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \ + const SAMPLE_SIZE_DTYPE* sample_sizes_points); + +#define DECLARE_POLYLINE_CPU_DTYPE_EXTERN(DTYPE) \ + DECLARE_POLYLINE_INTERPOLATION_CPU_EXTERN(DTYPE) \ + DECLARE_POLYLINE_LENGTHS_CPU_EXTERN(DTYPE) \ + DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int) \ + DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int64_t) \ + DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int) \ + DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int64_t) + +DECLARE_POLYLINE_CPU_DTYPE_EXTERN(float) +DECLARE_POLYLINE_CPU_DTYPE_EXTERN(double) + +#undef DECLARE_POLYLINE_CPU_DTYPE_EXTERN +#undef DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN +#undef DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN +#undef DECLARE_POLYLINE_LENGTHS_CPU_EXTERN +#undef DECLARE_POLYLINE_INTERPOLATION_CPU_EXTERN +} // namespace polyline + +#endif // LANE_HELPERS_POLYLINE_CUH diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh new file mode 100644 index 0000000..625dacf --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LANE_HELPERS_POLYLINE_COMMON_CUH +#define LANE_HELPERS_POLYLINE_COMMON_CUH + +#include +#include + +#include "polyline_dtype_compat.cuh" + +#ifdef __CUDACC__ +// Keep scalar helpers callable from both CUDA kernels and CPU translation units. +#define POLYLINE_HOST_DEVICE_INLINE __host__ __device__ __forceinline__ +#else +#define POLYLINE_HOST_DEVICE_INLINE inline +#endif + +namespace polyline { + +template +POLYLINE_HOST_DEVICE_INLINE dtype polyline_nan() { + const dtype nan_value = static_cast(NAN); + return nan_value; +} + +template +POLYLINE_HOST_DEVICE_INLINE void fill_point_with_nan_common(point_dtype* res_point, int num_dims) { + const point_dtype nan_value = polyline_nan(); + for (int d = 0; d < num_dims; ++d) { + res_point[d] = nan_value; + } +} + +/** + * @brief Compute the Euclidean length of one polyline segment. + * + * @details + * `segment_idx` refers to the segment between points `segment_idx` and + * `segment_idx + 1`. The point coordinates are laid out consecutively as + * `(num_points, num_dims)`. + * + * The point dtype and accumulation dtype are intentionally separate so the CPU + * path can accumulate in a wider type while the CUDA path preserves its + * existing dtype behavior. + */ +template +POLYLINE_HOST_DEVICE_INLINE accum_dtype compute_segment_length_common(const point_dtype* points_sample, + int segment_idx, int num_dims) { + const point_dtype* first_point = points_sample + segment_idx * num_dims; + const point_dtype* second_point = points_sample + (segment_idx + 1) * num_dims; + accum_dtype accum_sqr = static_cast(0.0); + for (int d = 0; d < num_dims; ++d) { + const accum_dtype diff = + static_cast(first_point[d]) - static_cast(second_point[d]); + accum_sqr += diff * diff; + } + const accum_dtype segment_length = polyline_sqrt(accum_sqr); + return segment_length; +} + +/** + * @brief Find the last index whose value is lower than or equal to `value`. 
+ * + * @details + * The input sequence is expected to be monotonically non-decreasing cumulative + * distances. The return value can be: + * - `-1` when `value` lies before the first point. + * - `sequence_length - 1` when `value` lies strictly beyond the last point. + * - Any valid lower segment endpoint otherwise. + * + * This is used to locate the segment containing the requested interpolation + * distance. + */ +template +POLYLINE_HOST_DEVICE_INLINE int get_index_of_last_lower_or_equal_to_common(const accum_dtype* sequence, + accum_dtype value, + int sequence_length) { + int min_idx = 0; + int max_idx = sequence_length - 1; + + if (polyline_value_gt(sequence[0], value)) { + return -1; + } + if (polyline_value_lt(sequence[sequence_length - 1], value)) { + return sequence_length - 1; + } + + while (max_idx - min_idx > 1) { + const int curr_idx = (max_idx + min_idx) >> 1; + const accum_dtype curr_val = sequence[curr_idx]; + if (polyline_value_lt(curr_val, value)) { + min_idx = curr_idx; + } else if (polyline_value_gt(curr_val, value)) { + max_idx = curr_idx; + } else { + min_idx = curr_idx; + max_idx = curr_idx; + } + } + return min_idx; +} + +/** + * @brief Sample one point on a polyline at a requested absolute distance. + * + * @details + * `accum_distances` stores the distance from the start of the polyline to each + * point. Distances outside the polyline are clamped to the first or last point. + * Degenerate zero-length segments return the lower endpoint. 
+ */ +template +POLYLINE_HOST_DEVICE_INLINE void sample_at_distance_common(const point_dtype* points, + const accum_dtype* accum_distances, + accum_dtype distance_to_sample_at, int num_points, + int num_dims, point_dtype* res_point) { + const int index_min = get_index_of_last_lower_or_equal_to_common( + accum_distances, distance_to_sample_at, num_points); + if (index_min >= 0 && index_min < num_points - 1) { + const int index_max = index_min + 1; + const point_dtype* min_point = points + index_min * num_dims; + const point_dtype* max_point = points + index_max * num_dims; + const accum_dtype dist_min = accum_distances[index_min]; + const accum_dtype dist_max = accum_distances[index_max]; + const accum_dtype dist = dist_max - dist_min; + if (polyline_value_ge(dist, static_cast(std::numeric_limits::epsilon()))) { + const accum_dtype weight_max = (distance_to_sample_at - dist_min) / dist; + const accum_dtype weight_min = (dist_max - distance_to_sample_at) / dist; + for (int d = 0; d < num_dims; ++d) { + const accum_dtype interpolated = static_cast(min_point[d]) * weight_min + + static_cast(max_point[d]) * weight_max; + res_point[d] = static_cast(interpolated); + } + } else { + for (int d = 0; d < num_dims; ++d) { + res_point[d] = min_point[d]; + } + } + } else if (index_min == -1) { + for (int d = 0; d < num_dims; ++d) { + // Note that we are accessing the first point, so that points[d] corresponds to the element we + // want to access, and no offset is needed. 
+ res_point[d] = points[d]; + } + } else if (index_min == num_points - 1) { + for (int d = 0; d < num_dims; ++d) { + res_point[d] = points[(num_points - 1) * num_dims + d]; + } + } +} + +} // namespace polyline + +#undef POLYLINE_HOST_DEVICE_INLINE + +#endif // LANE_HELPERS_POLYLINE_COMMON_CUH diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh new file mode 100644 index 0000000..eae81c4 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LANE_HELPERS_POLYLINE_DTYPE_COMPAT_CUH +#define LANE_HELPERS_POLYLINE_DTYPE_COMPAT_CUH + +#include + +// CUDA provides native __half/__nv_bfloat16 comparison intrinsics and shuffle +// overloads, while c10 low-precision wrappers add extra conversion paths, +// leading to compilation errors. The CUDA-only specializations below route c10 values +// through the native CUDA operations where available; only scalar math such as sqrt +// intentionally computes via float. Keeping these variants CUDA-only keeps CPU builds +// free of these types. 
+#ifdef __CUDACC__ +#include +#include +#include +#include +#define POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE __host__ __device__ __forceinline__ +#else +#define POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE inline +#endif + +namespace polyline { + +#ifdef __CUDACC__ +template +__device__ __forceinline__ dtype shfl_xor_sync_compat(unsigned mask, dtype val, int laneMask) { + return __shfl_xor_sync(mask, val, laneMask); +} + +template <> +__device__ __forceinline__ c10::Half shfl_xor_sync_compat(unsigned mask, c10::Half val, int laneMask) { + return c10::Half(__shfl_xor_sync(mask, static_cast<__half>(val), laneMask)); +} + +template <> +__device__ __forceinline__ c10::BFloat16 shfl_xor_sync_compat(unsigned mask, c10::BFloat16 val, + int laneMask) { + return c10::BFloat16(__shfl_xor_sync(mask, static_cast<__nv_bfloat16>(val), laneMask)); +} +#endif + +template +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_lt(dtype lhs, dtype rhs) { + return lhs < rhs; +} + +template +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_gt(dtype lhs, dtype rhs) { + return lhs > rhs; +} + +template +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_ge(dtype lhs, dtype rhs) { + return lhs >= rhs; +} + +#ifdef __CUDACC__ +template <> +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_lt(c10::Half lhs, c10::Half rhs) { + return __hlt(static_cast<__half>(lhs), static_cast<__half>(rhs)); +} + +template <> +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_gt(c10::Half lhs, c10::Half rhs) { + return __hgt(static_cast<__half>(lhs), static_cast<__half>(rhs)); +} + +template <> +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_ge(c10::Half lhs, c10::Half rhs) { + return __hge(static_cast<__half>(lhs), static_cast<__half>(rhs)); +} + +template <> +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_lt(c10::BFloat16 lhs, + c10::BFloat16 rhs) { + return __hlt(static_cast<__nv_bfloat16>(lhs), static_cast<__nv_bfloat16>(rhs)); +} 
+ +template <> +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_gt(c10::BFloat16 lhs, + c10::BFloat16 rhs) { + return __hgt(static_cast<__nv_bfloat16>(lhs), static_cast<__nv_bfloat16>(rhs)); +} + +template <> +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_ge(c10::BFloat16 lhs, + c10::BFloat16 rhs) { + return __hge(static_cast<__nv_bfloat16>(lhs), static_cast<__nv_bfloat16>(rhs)); +} +#endif + +template +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE dtype polyline_sqrt(dtype value) { + return sqrt(value); +} + +#ifdef __CUDACC__ +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE c10::Half polyline_sqrt(c10::Half value) { + return static_cast(sqrtf(static_cast(value))); +} + +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE c10::BFloat16 polyline_sqrt(c10::BFloat16 value) { + return static_cast(sqrtf(static_cast(value))); +} +#endif + +} // namespace polyline + +#undef POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE + +#endif // LANE_HELPERS_POLYLINE_DTYPE_COMPAT_CUH diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh new file mode 100644 index 0000000..2354b54 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LANE_HELPERS_POLYLINE_KERNELS_CUH +#define LANE_HELPERS_POLYLINE_KERNELS_CUH + +#include "polyline_common.cuh" +#include "polyline_dtype_compat.cuh" + +namespace polyline { + +template +__device__ __forceinline__ void prefix_sum_warp(int index, dtype value, int num_values_in_scan, + dtype& scan_value, dtype& sum_all) { + sum_all = value; + scan_value = static_cast(0.0); + for (int i = 1; i < num_values_in_scan; i <<= 1) { + dtype sum_other = shfl_xor_sync_compat(0xffffffffu, sum_all, i); + scan_value += ((static_cast(index) & static_cast(i)) > 0) * sum_other; + sum_all += sum_other; + } +} + +/** + * @brief Perform a prefix sum on a block of values. + * + * @details + * The buffer `warp_scan_buffer` is used to store the sums of the individual warps, which is then used + * to compute the offsets to add to each warp. For that, a prefix sum is performed on the buffer in a second + * step (in-place). The size of the buffer is (in elements): `blockDim.y * num_warps_per_sample`. 
+ * + * @tparam dtype The type of the values to prefix sum + * + * @param value The value to prefix sum + * @param num_warps_per_sample The number of warps per sample + * @param warp_scan_buffer Scratch buffer for the per-warp sums (`blockDim.y * num_warps_per_sample` elements) + * + * @return The prefix sum for the current thread + */ +template +__device__ __forceinline__ dtype prefix_sum_block(dtype value, int num_warps_per_sample, + dtype* warp_scan_buffer) { + // ix corresponds to the thread index inside a single sample + const int ix = threadIdx.x; + const int iwx = threadIdx.x / 32; // index of the warp inside the sample + const int iw = threadIdx.y * num_warps_per_sample + iwx; // index of the warp in the block + // Get thread id (consecutive IDs correspond to consecutive values in the array) + //const int tid_shared = threadIdx.y * bxsize + ix; + + // Using warp shuffles iteratively, in two stages + + // First stage: perform warp scans + dtype warp_scan1; + dtype warp_sum1; + prefix_sum_warp(ix, value, 32, warp_scan1, warp_sum1); + // If this is the first thread in the warp, it is responsible for storing the partial sum for the warp + if (ix % 32 == 0) warp_scan_buffer[iw] = warp_sum1; + __syncthreads(); + + // Warp scan for the partial sums to obtain the offsets for each warp + // The first warp (i.e. ix < 32) participates in step 2 of the warp scan. + // Note that the whole first warp always participates in step 2, even if num_warps_per_sample < 32. + // This is to avoid a deadlock without using a complex mask generation method for `__shfl_xor_sync()`. + if (ix < 32) { + const int wid_shared = threadIdx.y * num_warps_per_sample + ix; + const bool is_inside = ix < num_warps_per_sample; + const dtype value = is_inside ? 
warp_scan_buffer[wid_shared] : static_cast(0.0); + dtype warp_scan2; + dtype warp_sum2; + prefix_sum_warp(ix, value, num_warps_per_sample, warp_scan2, warp_sum2); + if (ix < num_warps_per_sample) { + warp_scan_buffer[wid_shared] = warp_scan2; + } + } + __syncthreads(); + + // Apply offsets to the partial sums to obtain the final values + warp_scan1 += warp_scan_buffer[iw]; + + return warp_scan1; +} + +/** + * @brief Perform a prefix sum on a block of values. + * + * @details + * The buffer is split into 2 parts: + * - The first part (`blockDim.y` elements) stores the running sums of all + * elements processed so far for each sample in y (`sum_buffer`). + * - The second part (`warp_scan_buffer`) is scratch space for the intra‑block + * scan performed by `prefix_sum_block`. The size is: `blockDim.y * num_warps_per_sample` + * (see `prefix_sum_block` for more details) + * Hence, the total buffer size (in elements) is: + * `blockDim.y + blockDim.y * num_warps_per_sample`. + * or + * `blockDim.y * (1 + num_warps_per_sample)` + * + * The results are stored in the sequence array, overriding the input values. + * Note that the results are the accumulated values including the current value, i.e. the operation can be expressed as: + * `sequences[i] = sum(sequences[0:(i+1)])`, where the slicing is defined as in Python. + * + * + * + * @tparam dtype The type of the values to prefix sum + * + * @param sequence The sequences to prefix sum for the current thread block. Note that this means that + * the first sequence is the one corresponding to threadIdx.y == 0 of the current block, not necessarily the + * first sequence in the global array. + * @param buffer_block Combined temporary storage used by the prefix-sum. 
+ * @param numel_x The number of elements in the x dimension + * @param numel_x_full_blocks The number of elements in the x dimension extended to a multiple of blockDim.x + * @param numel_y The number of sequences in the y dimension + * @param offset The initial offset to add to the prefix sum of each sequence + */ +template +__device__ __forceinline__ void prefix_sum_looped(dtype* sequence, dtype* buffer_block, int numel_x, + int numel_x_full_blocks, int numel_y, dtype offset) { + const int ix = threadIdx.x; + const int iy = threadIdx.y; + + // Buffer for keeping the sums of the iterations so far for each sample + dtype* sum_buffer = buffer_block; + // Buffer as needed for the prefix sum implementation + dtype* warp_scan_buffer = buffer_block + blockDim.y; + + int num_warps_per_sample = (blockDim.x + 31) / 32; + + // Initialize the buffer containing the partial sums of the iterations so far for each sample + if (ix == 0) { + sum_buffer[iy] = static_cast(0.0); + } + // Compute the sum one `bxsize` at a time for each sample + for (int i = ix; i < numel_x_full_blocks; i += blockDim.x) { + // Make sure that sum_buffer is written to (either initially or in the previous iteration) + __syncthreads(); + const dtype value = i < numel_x ? sequence[i] : static_cast(0.0); + const dtype value_out = + prefix_sum_block(value, num_warps_per_sample, warp_scan_buffer) + sum_buffer[iy] + offset; + const dtype value_out_incl_current = value_out + value; + // Make sure that + // - sequences are not written to before they are read from for the current iteration + // - sum_buffer is not written to before it is read from for the current iteration + __syncthreads(); + if (i < numel_x) { + sequence[i] = value_out_incl_current; + } + // Update the sum buffer for the next iteration to the current value of the last processed sample. + // Note that the last thread may be out of bounds and not correspond to the last element. 
However, + // in this case, the value is not needed (and also still is the correct value as the values are + // extended with zeros, so that the cumulative sum (computed as including the current value) is + // the same as for the last element) + if (ix == blockDim.x - 1) { + sum_buffer[iy] = value_out_incl_current; + } + // Offset is only applied in the first iteration. Afterwards, the offset is already included in the + // partial sum as stored in `sum_buffer` and the offset must not be applied again. + offset = static_cast(0.0); + } + __syncthreads(); +} + +template +__device__ __forceinline__ dtype warp_reduce_sum(dtype value, int num_vals_per_partial) { + const int ix = threadIdx.x; + for (int i = 1; i < num_vals_per_partial; i <<= 1) { + const dtype val_other = shfl_xor_sync_compat(0xffffffffu, value, i); + value += val_other; + } + return value; +} + +template +__device__ __forceinline__ dtype sample_reduce_sum(dtype value, int num_warps_per_sample, + dtype* warp_temp_and_result_buffer) { + const int ix = threadIdx.x; // index of thread in the block + const int iwx = threadIdx.x / 32; //index of warp in the sample + const int iw = threadIdx.y * num_warps_per_sample + iwx; // index of the warp in the block + + const dtype warp_sum = warp_reduce_sum(value, 32); + // The first thread in the warp writes the result for the warp + if (ix % 32 == 0) { + warp_temp_and_result_buffer[iw] = warp_sum; + } + // Make sure all warps have written their results + __syncthreads(); + + dtype sample_sum = static_cast(0.0); + // The first warp reduces the results of the first stage + // Note that from now on, ix corresponds to the index of the warp (from stage 1) in the block (previously iw) + if (ix < 32) { + const int iw_base = threadIdx.y * num_warps_per_sample; + const dtype warp_sum_phase_1 = + ix < num_warps_per_sample ? 
warp_temp_and_result_buffer[iw_base + ix] : static_cast(0.0); + // The partial sum will have constant segments, each segment corresponding to one sample (and containing as many values as + // there are warps per sample). + sample_sum = warp_reduce_sum(warp_sum_phase_1, num_warps_per_sample); + } + __syncthreads(); + return sample_sum; +} + +template +__device__ __forceinline__ void sample_distances(const dtype* points_sample, + const dtype* accum_distances_sample, + const dtype* distances_to_sample_sample, + int num_distances_to_sample, int num_points, int num_dims, + dtype* res_points_sample, bool relative_distances) { + const int x = threadIdx.x; + if (num_points == 0) { + const int result_stride = blockDim.x * num_dims; + + dtype* res_points_current = + x < num_distances_to_sample ? res_points_sample + x * num_dims : res_points_sample; + for (int i = x; i < num_distances_to_sample; i += blockDim.x, res_points_current += result_stride) { + fill_point_with_nan_common(res_points_current, num_dims); + } + return; + } + + dtype total_length_if_needed = static_cast(0.0); + if (relative_distances) { + total_length_if_needed = accum_distances_sample[num_points - 1]; + } + for (int i = x; i < num_distances_to_sample; i += blockDim.x) { + dtype* res_points_current = res_points_sample + i * num_dims; + const dtype distance_to_sample = relative_distances + ? distances_to_sample_sample[i] * total_length_if_needed + : distances_to_sample_sample[i]; + sample_at_distance_common(points_sample, accum_distances_sample, distance_to_sample, + num_points, num_dims, res_points_current); + } +} + +template +__device__ __forceinline__ void compute_distances(dtype* points_sample, int num_points, int num_dims, + dtype* distances_sample) { + const int x = threadIdx.x; + if (num_points == 0) { + return; + } + + if (x == 0) { + // Distance from the start to the first point is zero. 
+ distances_sample[0] = static_cast(0.0); + } + // Store segment lengths starting at index 1 so that an *inclusive* prefix + // sum over `distances_sample` yields distances to points: + // distances_sample[j] = distance from start to point j. + for (int i = x; i < num_points - 1; i += blockDim.x) { + distances_sample[i + 1] = compute_segment_length_common(points_sample, i, num_dims); + } +} + +/** + * @brief Shared implementation for both fixed-size and variable-size batch kernels. + * + * @details + * This routine implements the common logic used by: + * - `polyline_sampling_fully_shared_kernel` (fixed-size batches), and + * - `polyline_sampling_fully_shared_var_batch_kernel` (variable-size batches). + * + * The shared memory is split into two parts: + * - The first part stores the distances and accumulated distances + * (conversion in-place) for all points and has size (in elements): + * `blockDim.y * max_num_points`. + * - The second part stores the temporary buffer used by + * `prefix_sum_looped` and has size (in elements): + * `blockDim.y * (num_warps_per_sample + 1)`. + * (see the documentation of `prefix_sum_looped` for details). + * The total shared memory size is therefore (in elements): + * `(blockDim.y * max_num_points + blockDim.y * (num_warps_per_sample + 1))`. 
+ * + * + * @tparam dtype The type of the points + * + * @param points The points to sample + * @param distances_to_sample The distances to sample at + * @param res_points The resulting sampled points + * @param max_num_points The maximum number of points per polyline in the batch + * @param max_num_points_full_blocks The maximum number of points extended to a multiple of blockDim.x + * @param num_dims The number of dimensions of the points + * @param max_num_distances_to_sample The maximum number of distances to sample at per polyline + * @param num_samples The number of samples (batch size) + * @param sample_sizes_points (optional) Per-sample number of points (variable-size batches) + * @param sample_sizes_distances_to_sample (optional) Per-sample number of distances (variable-size batches) + * @param relative_distances Interpret distances to sample as fractions of each polyline's total length + * @param distance_buffer_ext Optional external buffer for distances when shared memory is insufficient + */ +template +__device__ __forceinline__ void polyline_sampling_fully_shared_common( + dtype* points, dtype* distances_to_sample, dtype* res_points, int max_num_points, + int max_num_points_full_blocks, int num_dims, int max_num_distances_to_sample, int num_samples, + sample_size_dtype* sample_sizes_points, sample_size_dtype* sample_sizes_distances_to_sample, + bool relative_distances, dtype* distance_buffer_ext) { + extern __shared__ uint8_t shared_mem[]; + dtype* distances; + dtype* buffer; + if (use_shared_distances) { + // Shared-memory layout per block: + // distances: [blockDim.y][max_num_points] + // buffer : [blockDim.y * (1 + num_warps_per_sample)] + distances = reinterpret_cast(shared_mem); + buffer = reinterpret_cast(shared_mem + blockDim.y * max_num_points * sizeof(dtype)); + } else { + // External distances buffer is laid out per block as + // [blockIdx.y][blockDim.y][max_num_points] + // so each block gets its own contiguous slice. 
The scratch `buffer` + // always starts at the beginning of this block's shared memory. + distances = distance_buffer_ext + blockIdx.y * blockDim.y * max_num_points; + buffer = reinterpret_cast(shared_mem); + } + + const int y = threadIdx.y; + const int y_global = blockIdx.y * blockDim.y + y; + const bool is_active_sample = (y_global < num_samples); + + // 1) Compute per-point distances only for valid samples. Inactive rows in + // the final block still participate in sync-heavy code paths with zero work. + int curr_num_points = 0; + int curr_num_distances_to_sample = 0; + if (is_active_sample) { + if (use_variable_size_batch) { + curr_num_points = sample_sizes_points[y_global]; + curr_num_distances_to_sample = sample_sizes_distances_to_sample[y_global]; + } else { + curr_num_points = max_num_points; + curr_num_distances_to_sample = max_num_distances_to_sample; + } + + // Global index for points in device memory; distances remain indexed by the + // local y within the block because they live in shared memory. + dtype* points_sample = points + y_global * max_num_points * num_dims; + dtype* distances_sample = distances + y * max_num_points; + if (curr_num_points > 0) { + compute_distances(points_sample, curr_num_points, num_dims, distances_sample); + } + } + + // 2) Prefix-sum over distances for all rows in this block-local buffer. + // This operates purely on (shared or external) distances, so it is + // safe even for rows that don't correspond to a real sample; their + // results are never used. + // The `distances` are per-block, so we use the local index `y` to access the distances for the current block. 
+ dtype* distance = distances + y * max_num_points; + prefix_sum_looped(distance, // sequences + buffer, // buffer (sum_buffer + warp_scan_buffer) + curr_num_points, // numel_x + max_num_points_full_blocks, // numel_x_full_blocks (extended to full blocks) + blockDim.y, // numel_y (number of samples per block) + static_cast(0.0) // offset + ); + + // 3) Sample only for valid samples, using their (possibly shared or + // external) accumulated distances. + if (is_active_sample) { + // Get the points for the current sample (use of global offset) + const dtype* points_sample = points + y_global * max_num_points * num_dims; + // Get the distances for the current sample (use of block-local offset, as distances are stored in + // shared memory (or in an external buffer with `points` referring to points for this block)) + const dtype* distances_sample = distances + y * max_num_points; + // Get the distances to sample at for the current sample (use of global offset) + const dtype* distances_to_sample_sample = + distances_to_sample + y_global * max_num_distances_to_sample; + sample_distances(points_sample, distances_sample, distances_to_sample_sample, + curr_num_distances_to_sample, curr_num_points, num_dims, + res_points + y_global * max_num_distances_to_sample * num_dims, + relative_distances); + } +} + +/** + * @brief Sample the points at the distances (fixed-size batches). + * + * See `polyline_sampling_fully_shared_common` for implementation details. 
+ */ +template +__global__ void polyline_sampling_fully_shared_kernel(dtype* points, dtype* distances_to_sample, + dtype* res_points, int num_points, + int num_points_full_blocks, int num_dims, + int num_distances_to_sample, int num_samples, + bool relative_distances, dtype* distance_buffer_ext) { + polyline_sampling_fully_shared_common( + points, distances_to_sample, res_points, + num_points, // max_num_points + num_points_full_blocks, // max_num_points_full_blocks + num_dims, + num_distances_to_sample, // max_num_distances_to_sample + num_samples, + /*sample_sizes_points=*/nullptr, + /*sample_sizes_distances_to_sample=*/nullptr, relative_distances, distance_buffer_ext); +} + +// Variable-size batch version of the kernel. +template +__global__ void polyline_sampling_fully_shared_var_batch_kernel( + dtype* points, dtype* distances_to_sample, dtype* res_points, int max_num_points, + int max_num_points_full_blocks, int num_dims, int max_num_distances_to_sample, int num_samples, + sample_size_dtype* sample_sizes_points, sample_size_dtype* sample_sizes_distances_to_sample, + bool relative_distances, dtype* distance_buffer_ext) { + polyline_sampling_fully_shared_common( + points, distances_to_sample, res_points, max_num_points, max_num_points_full_blocks, num_dims, + max_num_distances_to_sample, num_samples, sample_sizes_points, sample_sizes_distances_to_sample, + relative_distances, distance_buffer_ext); +} + +template +__device__ __forceinline__ void polyline_lengths_common(dtype* points, dtype* lengths, int max_num_points, + int num_dims, int num_samples, + sample_size_dtype* sample_sizes_points, + dtype* reduction_buffer) { + const int x = threadIdx.x; + const int y_global = blockIdx.y * blockDim.y + threadIdx.y; + const bool is_active_sample = (y_global < num_samples); + + int curr_num_points = 0; + dtype local_length = static_cast(0.0); + if (is_active_sample) { + curr_num_points = use_variable_size_batch ? 
sample_sizes_points[y_global] : max_num_points; + const dtype* points_sample = points + y_global * max_num_points * num_dims; + for (int i = x; i < curr_num_points - 1; i += blockDim.x) { + local_length += compute_segment_length_common(points_sample, i, num_dims); + } + } + + const int num_warps_per_sample = (blockDim.x + 31) / 32; + const dtype length = sample_reduce_sum(local_length, num_warps_per_sample, reduction_buffer); + if (is_active_sample && x == 0) { + lengths[y_global] = curr_num_points == 0 ? polyline_nan() : length; + } +} + +template +__global__ void polyline_lengths_kernel(dtype* points, dtype* lengths, int num_points, int num_dims, + int num_samples) { + extern __shared__ uint8_t shared_mem[]; + dtype* reduction_buffer = reinterpret_cast(shared_mem); + polyline_lengths_common(points, lengths, num_points, num_dims, num_samples, + /*sample_sizes_points=*/nullptr, reduction_buffer); +} + +template +__global__ void polyline_lengths_var_batch_kernel(dtype* points, dtype* lengths, int max_num_points, + int num_dims, int num_samples, + sample_size_dtype* sample_sizes_points) { + extern __shared__ uint8_t shared_mem[]; + dtype* reduction_buffer = reinterpret_cast(shared_mem); + polyline_lengths_common( + points, lengths, max_num_points, num_dims, num_samples, sample_sizes_points, reduction_buffer); +} + +} // namespace polyline + +#endif // LANE_HELPERS_POLYLINE_KERNELS_CUH diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh new file mode 100644 index 0000000..2668917 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LANE_HELPERS_POLYLINE_SHARED_MEMORY_CONFIG_CUH +#define LANE_HELPERS_POLYLINE_SHARED_MEMORY_CONFIG_CUH + +#include +#include +#include + +#include + +#include "helper_macros.cuh" +#include "polyline_kernels.cuh" + +namespace polyline { + +static constexpr int MAX_CACHED_CUDA_DEVICES = 64; + +static void check_non_negative_cuda_device(int device) { + if (device < 0) { + throw std::runtime_error("CUDA device index must be non-negative."); + } +} + +static size_t query_polyline_max_shared_full_for_device(int device) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); + size_t max_shared_full = static_cast(prop.sharedMemPerBlock); + if (prop.sharedMemPerBlockOptin != 0) { + max_shared_full = static_cast(prop.sharedMemPerBlockOptin); + } + return max_shared_full; +} + +static size_t polyline_max_shared_full_for_device(int device) { + static std::once_flag configured_devices[MAX_CACHED_CUDA_DEVICES]; + static size_t max_shared_full_by_device[MAX_CACHED_CUDA_DEVICES] = {}; + + check_non_negative_cuda_device(device); + // Fallback if there are more devices than the maximum number of cached devices we use. 
+ if (device >= MAX_CACHED_CUDA_DEVICES) { + const size_t max_shared_full = query_polyline_max_shared_full_for_device(device); + return max_shared_full; + } + + std::call_once(configured_devices[device], [device]() { + max_shared_full_by_device[device] = query_polyline_max_shared_full_for_device(device); + }); + const size_t max_shared_full = max_shared_full_by_device[device]; + return max_shared_full; +} + +template +static void configure_polyline_sampling_kernel(size_t max_shared_full) { + CUDA_CHECK(cudaFuncSetAttribute(polyline_sampling_fully_shared_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + static_cast(max_shared_full))); + CUDA_CHECK(cudaFuncSetAttribute(polyline_sampling_fully_shared_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); +} + +template +static void configure_polyline_sampling_kernel_once(int device, size_t max_shared_full) { + static std::once_flag configured_devices[MAX_CACHED_CUDA_DEVICES]; + + check_non_negative_cuda_device(device); + if (device >= MAX_CACHED_CUDA_DEVICES) { + configure_polyline_sampling_kernel(max_shared_full); + return; + } + + std::call_once(configured_devices[device], [max_shared_full]() { + configure_polyline_sampling_kernel(max_shared_full); + }); +} + +template +static void configure_polyline_sampling_var_batch_kernel(size_t max_shared_full) { + CUDA_CHECK(cudaFuncSetAttribute( + polyline_sampling_fully_shared_var_batch_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, static_cast(max_shared_full))); + CUDA_CHECK(cudaFuncSetAttribute( + polyline_sampling_fully_shared_var_batch_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); +} + +template +static void configure_polyline_sampling_var_batch_kernel_once(int device, size_t max_shared_full) { + static std::once_flag configured_devices[MAX_CACHED_CUDA_DEVICES]; + + check_non_negative_cuda_device(device); + if (device >= MAX_CACHED_CUDA_DEVICES) { + configure_polyline_sampling_var_batch_kernel( + max_shared_full); + return; + 
} + + std::call_once(configured_devices[device], [max_shared_full]() { + configure_polyline_sampling_var_batch_kernel( + max_shared_full); + }); +} + +} // namespace polyline + +#endif // LANE_HELPERS_POLYLINE_SHARED_MEMORY_CONFIG_CUH diff --git a/packages/lane_helpers/ext_impl/polyline/src/polyline.cpp b/packages/lane_helpers/ext_impl/polyline/src/polyline.cpp new file mode 100644 index 0000000..a0d82a3 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/src/polyline.cpp @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "polyline.cuh" +#include "helper_macros.cuh" + +//#define PROFILE_AND_SYNC + +#ifdef PROFILE_AND_SYNC +#include +#endif + +namespace polyline { + +#define CHECK_DEVICE(x) check_device(x, #x) +#define CHECK_CONTIGUOUS(x) check_contiguous(x, #x) +#define CHECK_TYPE(x) check_type(x, #x) +#define CHECK_INPUT(x) \ + CHECK_DEVICE(x); \ + CHECK_CONTIGUOUS(x); \ + CHECK_TYPE(x); +inline void check_device(const at::Tensor& tensor, const char* description) { + TORCH_CHECK(tensor.is_cpu() || tensor.is_cuda(), description, " must be on CPU or CUDA"); +} + +inline void check_contiguous(const at::Tensor& tensor, const char* description) { + TORCH_CHECK(tensor.is_contiguous(), description, " must be contiguous"); +} + +inline void check_type(const at::Tensor& tensor, const char* description) { + if (tensor.is_cuda()) { + TORCH_CHECK(tensor.scalar_type() == torch::kFloat32 || tensor.scalar_type() == torch::kFloat64 || + tensor.scalar_type() == torch::kFloat16 || tensor.scalar_type() == torch::kBFloat16, + description, " must have dtype float16, float32, float64, or bfloat16 on CUDA"); + } else { + TORCH_CHECK(tensor.scalar_type() == torch::kFloat32 || tensor.scalar_type() == torch::kFloat64, + description, " must have dtype float32 or float64 on CPU"); + } +} + +inline void check_same_device(const at::Tensor& lhs, const at::Tensor& rhs, const char* message) { + TORCH_CHECK(lhs.device() == rhs.device(), message); +} + +inline void check_sample_size_type(const at::Tensor& sample_sizes, const char* description) { + TORCH_CHECK(sample_sizes.scalar_type() == at::kInt || sample_sizes.scalar_type() == at::kLong, + description, " must have dtype int32 or int64"); +} + +inline void check_sample_sizes(const at::Tensor& sample_sizes, int max_size, const char* description) { + if (sample_sizes.numel() == 0) { + return; + } + TORCH_CHECK( + !torch::any(sample_sizes < 0).item() && 
!torch::any(sample_sizes > max_size).item(), + description, " values must be in [0, ", max_size, "]"); +} + +at::Tensor make_external_distance_buffer(size_t size_elems, const at::TensorOptions& options) { + // Keep external CUDA scratch memory owned by PyTorch's stream-aware allocator. + // A raw cudaFree here can race with the asynchronous custom kernel that uses this buffer. + + // Return an empty tensor if no external distance buffer is needed. + if (size_elems == 0) { + return at::Tensor(); + } + + // Check that the size is not too large to allocate as a tensor. + TORCH_CHECK(size_elems <= static_cast(std::numeric_limits::max()), + "external polyline distance buffer is too large to allocate as a tensor"); + + // Allocate the buffer and return it. + at::Tensor buffer = at::empty({static_cast(size_elems)}, options); + return buffer; +} + +at::Tensor polyline_interpolation(at::Tensor points, at::Tensor distances, bool relative_distances) { + CHECK_DEVICE(points); + CHECK_DEVICE(distances); + CHECK_TYPE(points); + CHECK_TYPE(distances); + TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, num_points, num_dims)"); + TORCH_CHECK(distances.ndimension() == 2, "distances must have shape (batch, num_distances)"); + TORCH_CHECK(points.size(0) == distances.size(0), + "points and distances must contain the same number of polylines"); + TORCH_CHECK(points.scalar_type() == distances.scalar_type(), + "points and distances must have the same dtype"); + check_same_device(points, distances, "points and distances must be on the same device"); + + const int num_samples = points.size(0); + const int num_points = points.size(1); + const int num_distances = distances.size(1); + const int num_dims = points.size(2); + // Result has shape (batch, num_distances, point_dim) and otherwise + // matches `distances` (device, dtype). 
+ auto res = at::empty({num_samples, num_distances, num_dims}, distances.options()); + if (num_distances == 0) { + return res; + } + const at::Tensor points_contiguous = points.contiguous(); + const at::Tensor distances_contiguous = distances.contiguous(); + + if (points.is_cuda()) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_interpolation", [&] { + const int device = points.get_device(); + c10::cuda::CUDAGuard device_guard(static_cast(device)); + const auto stream = at::cuda::getCurrentCUDAStream(static_cast(device)); + at::cuda::CUDAStreamGuard stream_guard(stream); + const auto cfg = make_polyline_launch_config(num_points, num_samples, device); + // Allocate under the same stream used for the kernel launch so the caching allocator + // does not recycle this temporary scratch buffer before queued kernel work consumes it. + const at::Tensor distance_buffer_ext = + make_external_distance_buffer(cfg.distance_buffer_ext_size_elems, points.options()); + scalar_t* distance_buffer_ext_ptr = + distance_buffer_ext.defined() ? 
distance_buffer_ext.data_ptr() : nullptr; + polyline_interpolation(points_contiguous.data_ptr(), num_points, num_dims, + distances_contiguous.data_ptr(), num_distances, + res.data_ptr(), num_samples, relative_distances, + device, cfg, distance_buffer_ext_ptr, stream.stream()); + CUDA_CHECK_LAST(); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "polyline_interpolation_cpu", [&] { + polyline_interpolation_cpu(points_contiguous.data_ptr(), num_points, num_dims, + distances_contiguous.data_ptr(), num_distances, + res.data_ptr(), num_samples, relative_distances); + }); + } + + return res; +} + +at::Tensor polyline_lengths(at::Tensor points) { + CHECK_DEVICE(points); + CHECK_TYPE(points); + TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, num_points, num_dims)"); + + const int num_samples = points.size(0); + const int num_points = points.size(1); + const int num_dims = points.size(2); + auto res = at::empty({num_samples}, points.options()); + const at::Tensor points_contiguous = points.contiguous(); + + if (points.is_cuda()) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_lengths", [&] { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + polyline_lengths(points_contiguous.data_ptr(), // points + num_points, // num_points + num_dims, // num_dims + res.data_ptr(), // lengths + num_samples, // num_samples + stream // stream + ); + CUDA_CHECK_LAST(); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "polyline_lengths_cpu", [&] { + polyline_lengths_cpu(points_contiguous.data_ptr(), // points + num_points, // num_points + num_dims, // num_dims + res.data_ptr(), // lengths + num_samples // num_samples + ); + }); + } + + return res; +} + +at::Tensor polyline_interpolation_var_size_batch(at::Tensor points, at::Tensor distances, + at::Tensor sample_sizes_points, + at::Tensor sample_sizes_distances_to_sample, + bool relative_distances) { + CHECK_DEVICE(points); + 
CHECK_DEVICE(distances); + check_device(sample_sizes_points, "points.sample_sizes"); + check_device(sample_sizes_distances_to_sample, "distances.sample_sizes"); + CHECK_TYPE(points); + CHECK_TYPE(distances); + check_sample_size_type(sample_sizes_points, "points.sample_sizes"); + check_sample_size_type(sample_sizes_distances_to_sample, "distances.sample_sizes"); + + TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, max_num_points, num_dims)"); + TORCH_CHECK(distances.ndimension() == 2, "distances must have shape (batch, max_num_distances)"); + TORCH_CHECK(points.size(0) == distances.size(0), + "points and distances must contain the same number of polylines"); + TORCH_CHECK(points.scalar_type() == distances.scalar_type(), + "points and distances must have the same dtype"); + check_same_device(points, distances, "points and distances must be on the same device"); + TORCH_CHECK(sample_sizes_points.scalar_type() == sample_sizes_distances_to_sample.scalar_type(), + "points.sample_sizes and distances.sample_sizes must have the same dtype " + "(both int32 or both int64)"); + check_same_device(sample_sizes_points, points, + "points.sample_sizes must be on the same device as points"); + check_same_device(sample_sizes_distances_to_sample, distances, + "distances.sample_sizes must be on the same device as distances"); + TORCH_CHECK(sample_sizes_points.ndimension() == 1, "points.sample_sizes must be a 1D tensor"); + TORCH_CHECK(sample_sizes_distances_to_sample.ndimension() == 1, + "distances.sample_sizes must be a 1D tensor"); + + const int num_samples = points.size(0); + const int max_num_points = points.size(1); + const int max_num_distances = distances.size(1); + const int num_dims = points.size(2); + // Result has shape (batch, num_distances, point_dim) and otherwise + // matches `distances` (device, dtype). 
+ auto res = at::empty({num_samples, max_num_distances, num_dims}, distances.options()); + + TORCH_CHECK(sample_sizes_points.size(0) == num_samples, + "points.sample_sizes must contain one count per polyline in points"); + TORCH_CHECK(sample_sizes_distances_to_sample.size(0) == num_samples, + "distances.sample_sizes must contain one count per polyline in distances"); + check_sample_sizes(sample_sizes_points, max_num_points, "points.sample_sizes"); + check_sample_sizes(sample_sizes_distances_to_sample, max_num_distances, "distances.sample_sizes"); + if (max_num_distances == 0) { + return res; + } + + const at::Tensor points_contiguous = points.contiguous(); + const at::Tensor distances_contiguous = distances.contiguous(); + const at::Tensor sample_sizes_points_contiguous = sample_sizes_points.contiguous(); + const at::Tensor sample_sizes_distances_to_sample_contiguous = + sample_sizes_distances_to_sample.contiguous(); + + auto launch = [&](auto sample_size_type_tag) { + using sample_size_t = decltype(sample_size_type_tag); + if (points.is_cuda()) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_interpolation_var_size_batch", [&] { + const int device = points.get_device(); + c10::cuda::CUDAGuard device_guard(static_cast(device)); + const auto stream = at::cuda::getCurrentCUDAStream(static_cast(device)); + at::cuda::CUDAStreamGuard stream_guard(stream); + const auto cfg = + make_polyline_launch_config(max_num_points, num_samples, device); + // Allocate under the same stream used for the kernel launch so the caching allocator + // does not recycle this temporary scratch buffer before queued kernel work consumes it. + const at::Tensor distance_buffer_ext = + make_external_distance_buffer(cfg.distance_buffer_ext_size_elems, points.options()); + scalar_t* distance_buffer_ext_ptr = + distance_buffer_ext.defined() ? 
distance_buffer_ext.data_ptr() : nullptr; + polyline_interpolation_var_size_batch( + points_contiguous.data_ptr(), // points + max_num_points, // max_num_points + num_dims, // num_dims + distances_contiguous.data_ptr(), // distances + max_num_distances, // num_distances + res.data_ptr(), // result_points + num_samples, // num_samples + sample_sizes_points_contiguous.data_ptr(), // sample_sizes_points + sample_sizes_distances_to_sample_contiguous + .data_ptr(), // sample sizes distances + relative_distances, // relative_distances + device, // device + cfg, // launch config + distance_buffer_ext_ptr, // distance_buffer_ext + stream.stream() // stream + ); + CUDA_CHECK_LAST(); + }); + } else { + AT_DISPATCH_FLOATING_TYPES( + points.scalar_type(), "polyline_interpolation_var_size_batch_cpu", [&] { + polyline_interpolation_var_size_batch_cpu( + points_contiguous.data_ptr(), // points + max_num_points, // max_num_points + num_dims, // num_dims + distances_contiguous.data_ptr(), // distances + max_num_distances, // num_distances + res.data_ptr(), // result_points + num_samples, // num_samples + sample_sizes_points_contiguous.data_ptr(), // sample_sizes_points + sample_sizes_distances_to_sample_contiguous + .data_ptr(), // sample sizes distances + relative_distances // relative_distances + ); + }); + } + }; + if (sample_sizes_points.scalar_type() == at::kInt) { + launch(int32_t{}); + } else { + launch(int64_t{}); + } + + return res; +} + +at::Tensor polyline_lengths_var_size_batch(at::Tensor points, at::Tensor sample_sizes_points) { + CHECK_DEVICE(points); + check_device(sample_sizes_points, "points.sample_sizes"); + CHECK_TYPE(points); + check_sample_size_type(sample_sizes_points, "points.sample_sizes"); + + TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, max_num_points, num_dims)"); + TORCH_CHECK(sample_sizes_points.ndimension() == 1, "points.sample_sizes must be a 1D tensor"); + check_same_device(sample_sizes_points, points, + "points.sample_sizes 
must be on the same device as points"); + + const int num_samples = points.size(0); + const int max_num_points = points.size(1); + const int num_dims = points.size(2); + auto res = at::empty({num_samples}, points.options()); + + TORCH_CHECK(sample_sizes_points.size(0) == num_samples, + "points.sample_sizes must contain one count per polyline in points"); + check_sample_sizes(sample_sizes_points, max_num_points, "points.sample_sizes"); + + const at::Tensor points_contiguous = points.contiguous(); + const at::Tensor sample_sizes_points_contiguous = sample_sizes_points.contiguous(); + + auto launch = [&](auto sample_size_type_tag) { + using sample_size_t = decltype(sample_size_type_tag); + if (points.is_cuda()) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_lengths_var_size_batch", [&] { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + polyline_lengths_var_size_batch( + points_contiguous.data_ptr(), // points + max_num_points, // max_num_points + num_dims, // num_dims + res.data_ptr(), // lengths + num_samples, // num_samples + sample_sizes_points_contiguous.data_ptr(), // sample_sizes_points + stream // stream + ); + CUDA_CHECK_LAST(); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "polyline_lengths_var_size_batch_cpu", [&] { + polyline_lengths_var_size_batch_cpu( + points_contiguous.data_ptr(), // points + max_num_points, // max_num_points + num_dims, // num_dims + res.data_ptr(), // lengths + num_samples, // num_samples + sample_sizes_points_contiguous.data_ptr() // sample_sizes_points + ); + }); + } + }; + if (sample_sizes_points.scalar_type() == at::kInt) { + launch(int32_t{}); + } else { + launch(int64_t{}); + } + + return res; +} + +} // namespace polyline + +namespace py = pybind11; +using namespace polyline; + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.doc() = "Lane helpers polyline interpolation bindings"; + m.def("polyline_interpolation", (at::Tensor(*)(at::Tensor, 
at::Tensor, bool)) & polyline_interpolation, + py::arg("points"), py::arg("distances"), py::arg("relative") = false, + "Interpolate points along polylines at given distances."); + m.def("_polyline_lengths", (at::Tensor(*)(at::Tensor)) & polyline_lengths, py::arg("points"), + "Internal tensor-only entry point for fixed-size polyline length computation."); + m.def("_polyline_interpolation_var_size_batch", + (at::Tensor(*)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, bool)) & + polyline_interpolation_var_size_batch, + py::arg("points"), py::arg("distances"), py::arg("sample_sizes_points"), + py::arg("sample_sizes_distances_to_sample"), py::arg("relative") = false, + "Internal tensor-only entry point for variable-length polyline interpolation."); + m.def("_polyline_lengths_var_size_batch", + (at::Tensor(*)(at::Tensor, at::Tensor)) & polyline_lengths_var_size_batch, py::arg("points"), + py::arg("sample_sizes_points"), + "Internal tensor-only entry point for variable-length polyline length computation."); +} \ No newline at end of file diff --git a/packages/lane_helpers/ext_impl/polyline/src/polyline.cu b/packages/lane_helpers/ext_impl/polyline/src/polyline.cu new file mode 100644 index 0000000..7d62499 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/src/polyline.cu @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "helper_macros.cuh" + +#include "polyline.cuh" +#include "polyline_kernels.cuh" +#include "polyline_shared_memory_config.cuh" + +namespace polyline { + +// Return the largest power of two that is <= n. +// For n <= 1, this returns 0 for n == 0 and 1 for n == 1. +static inline int last_power_of_2(int n) { + if (n <= 0) { + return 0; + } + unsigned int v = static_cast(n); + // Propagate highest set bit to all lower bits. + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + // Now (v + 1) >> 1 is the highest power of two <= original n. + const int power_of_two = static_cast((v + 1u) >> 1); + return power_of_two; +} + +template +struct PolylineLengthLaunchConfig { + dim3 block_dim; + dim3 grid_dim; + size_t shared_mem_size; +}; + +static int polyline_launch_threads_x(int num_samples_per_block) { + const int max_num_threads = 1024; + + const int max_threads_x_for_y = max_num_threads / num_samples_per_block; + // Round down to a multiple of 32, but keep at least one warp. 
+ int threads_x = (max_threads_x_for_y / 32) * 32; + if (threads_x < 32) { + threads_x = 32; + } + return threads_x; +} + +template +static PolylineLaunchConfig make_polyline_launch_config_for_y(int num_points, int num_samples, + int num_samples_per_block) { + const int threads_x = polyline_launch_threads_x(num_samples_per_block); + + const dim3 block_dim(threads_x, num_samples_per_block, 1); + const dim3 grid_dim(1, (num_samples + block_dim.y - 1) / block_dim.y, 1); + const int num_points_full_blocks = ((num_points + block_dim.x - 1) / block_dim.x) * block_dim.x; + const int num_warps_per_sample = (block_dim.x + 31) / 32; + const size_t scratch_buffer_size_elems = block_dim.y * (num_warps_per_sample + 1); + const size_t distances_buffer_size_elems_shared = static_cast(block_dim.y) * num_points; + + PolylineLaunchConfig cfg; + cfg.block_dim = block_dim; + cfg.grid_dim = grid_dim; + cfg.num_points_full_blocks = num_points_full_blocks; + cfg.shared_mem_size = (distances_buffer_size_elems_shared + scratch_buffer_size_elems) * sizeof(dtype); + cfg.distance_buffer_ext_size_elems = 0; + cfg.use_shared_distances = true; + cfg.max_shared_full = 0; + return cfg; +} + +template +static size_t polyline_external_distance_buffer_size_elems(const PolylineLaunchConfig& cfg, + int num_points) { + const size_t buffer_size_elems = static_cast(cfg.grid_dim.y) * cfg.block_dim.y * num_points; + return buffer_size_elems; +} + +template +static size_t polyline_scratch_shared_mem_size(const PolylineLaunchConfig& cfg) { + const int num_warps_per_sample = (cfg.block_dim.x + 31) / 32; + const size_t shared_mem_size = + static_cast(cfg.block_dim.y) * (num_warps_per_sample + 1) * sizeof(dtype); + return shared_mem_size; +} + +template +PolylineLaunchConfig make_polyline_launch_config(int num_points, int num_samples, int device) { + // Keep blockDim.y at 1 so blockDim.x can use the full thread block for each sample. 
+ PolylineLaunchConfig cfg = make_polyline_launch_config_for_y(num_points, num_samples, 1); + + // Determine whether we can stay in the shared‑memory path using the opt‑in + // limit (`sharedMemPerBlockOptin`) instead of falling back to the external + // buffer. + const size_t max_shared_full = polyline_max_shared_full_for_device(device); + + if (cfg.shared_mem_size <= max_shared_full) { + cfg.max_shared_full = max_shared_full; + return cfg; + } + + cfg.shared_mem_size = polyline_scratch_shared_mem_size(cfg); + cfg.use_shared_distances = false; + cfg.max_shared_full = max_shared_full; + cfg.distance_buffer_ext_size_elems = polyline_external_distance_buffer_size_elems(cfg, num_points); + return cfg; +} + +template +static PolylineLengthLaunchConfig make_polyline_length_launch_config(int num_samples) { + const int max_num_threads = 1024; + const int max_y_by_threads = max_num_threads / 32; // assuming at least one warp in x + const int max_y_candidate = min(num_samples, max_y_by_threads); + int num_samples_per_block = last_power_of_2(max_y_candidate); + if (num_samples_per_block < 1) { + num_samples_per_block = 1; + } + + const int max_threads_x_for_y = max_num_threads / num_samples_per_block; + int threads_x = (max_threads_x_for_y / 32) * 32; + if (threads_x < 32) { + threads_x = 32; + } + + const dim3 block_dim(threads_x, num_samples_per_block, 1); + const dim3 grid_dim(1, (num_samples + block_dim.y - 1) / block_dim.y, 1); + const int num_warps_per_sample = (block_dim.x + 31) / 32; + + PolylineLengthLaunchConfig cfg; + cfg.block_dim = block_dim; + cfg.grid_dim = grid_dim; + cfg.shared_mem_size = static_cast(block_dim.y) * num_warps_per_sample * sizeof(dtype); + return cfg; +} + +template +void polyline_interpolation(dtype* points, int num_points, int num_dims, dtype* distances, int num_distances, + dtype* result_points, int num_samples, bool relative_distances, int device, + const PolylineLaunchConfig& cfg, dtype* distance_buffer_ext, + cudaStream_t stream) { + if 
(cfg.use_shared_distances) { + configure_polyline_sampling_kernel_once(device, cfg.max_shared_full); + polyline_sampling_fully_shared_kernel + <<>>( + points, distances, result_points, num_points, cfg.num_points_full_blocks, num_dims, + num_distances, num_samples, relative_distances, nullptr); + } else { + polyline_sampling_fully_shared_kernel + <<>>( + points, distances, result_points, num_points, cfg.num_points_full_blocks, num_dims, + num_distances, num_samples, relative_distances, distance_buffer_ext); + } + CUDA_CHECK_LAST(); +} + +template +void polyline_lengths(dtype* points, int num_points, int num_dims, dtype* lengths, int num_samples, + cudaStream_t stream) { + auto cfg = make_polyline_length_launch_config(num_samples); + polyline_lengths_kernel<<>>( + points, lengths, num_points, num_dims, num_samples); + CUDA_CHECK_LAST(); +} + +template +void polyline_interpolation_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* distances, + int num_distances, dtype* result_points, int num_samples, + sample_size_dtype* sample_sizes_points, + sample_size_dtype* sample_sizes_distances_to_sample, + bool relative_distances, int device, + const PolylineLaunchConfig& cfg, dtype* distance_buffer_ext, + cudaStream_t stream) { + if (cfg.use_shared_distances) { + configure_polyline_sampling_var_batch_kernel_once( + device, cfg.max_shared_full); + polyline_sampling_fully_shared_var_batch_kernel + <<>>( + points, distances, result_points, max_num_points, cfg.num_points_full_blocks, num_dims, + num_distances, num_samples, sample_sizes_points, sample_sizes_distances_to_sample, + relative_distances, nullptr); + } else { + polyline_sampling_fully_shared_var_batch_kernel + <<>>( + points, distances, result_points, max_num_points, cfg.num_points_full_blocks, num_dims, + num_distances, num_samples, sample_sizes_points, sample_sizes_distances_to_sample, + relative_distances, distance_buffer_ext); + } + CUDA_CHECK_LAST(); +} + +template +void 
polyline_lengths_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* lengths, + int num_samples, sample_size_dtype* sample_sizes_points, + cudaStream_t stream) { + auto cfg = make_polyline_length_launch_config(num_samples); + polyline_lengths_var_batch_kernel + <<>>( + points, lengths, max_num_points, num_dims, num_samples, sample_sizes_points); + CUDA_CHECK_LAST(); +} + +#define INSTANTIATE_POLYLINE_INTERPOLATION(DTYPE) \ + template void polyline_interpolation( \ + DTYPE * points, int num_points, int num_dims, DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, bool relative_distances, int device, \ + const PolylineLaunchConfig& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream); + +#define INSTANTIATE_POLYLINE_LAUNCH_CONFIG(DTYPE) \ + template PolylineLaunchConfig make_polyline_launch_config(int num_points, int num_samples, \ + int device); + +#define INSTANTIATE_POLYLINE_LENGTHS(DTYPE) \ + template void polyline_lengths(DTYPE * points, int num_points, int num_dims, DTYPE* lengths, \ + int num_samples, cudaStream_t stream); + +#define INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH(DTYPE, SAMPLE_SIZE_DTYPE) \ + template void polyline_interpolation_var_size_batch( \ + DTYPE * points, int max_num_points, int num_dims, DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, SAMPLE_SIZE_DTYPE* sample_sizes_points, \ + SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances, int device, \ + const PolylineLaunchConfig& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream); + +#define INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH(DTYPE, SAMPLE_SIZE_DTYPE) \ + template void polyline_lengths_var_size_batch( \ + DTYPE * points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \ + SAMPLE_SIZE_DTYPE* sample_sizes_points, cudaStream_t stream); + +#define INSTANTIATE_POLYLINE_CUDA_DTYPE(DTYPE) \ + INSTANTIATE_POLYLINE_LAUNCH_CONFIG(DTYPE) \ + 
INSTANTIATE_POLYLINE_INTERPOLATION(DTYPE) \ + INSTANTIATE_POLYLINE_LENGTHS(DTYPE) \ + INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH(DTYPE, int) \ + INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH(DTYPE, int64_t) \ + INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH(DTYPE, int) \ + INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH(DTYPE, int64_t) + +INSTANTIATE_POLYLINE_CUDA_DTYPE(float) +INSTANTIATE_POLYLINE_CUDA_DTYPE(double) +INSTANTIATE_POLYLINE_CUDA_DTYPE(c10::Half) +INSTANTIATE_POLYLINE_CUDA_DTYPE(c10::BFloat16) + +#undef INSTANTIATE_POLYLINE_CUDA_DTYPE +#undef INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH +#undef INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH +#undef INSTANTIATE_POLYLINE_LENGTHS +#undef INSTANTIATE_POLYLINE_LAUNCH_CONFIG +#undef INSTANTIATE_POLYLINE_INTERPOLATION +} // namespace polyline \ No newline at end of file diff --git a/packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp b/packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp new file mode 100644 index 0000000..f0d7cd0 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include "polyline_common.cuh" + +namespace polyline { + +template +using cpu_acc_t = at::acc_type; + +template +static void compute_accumulated_distances_cpu(const dtype* points_sample, int num_points, int num_dims, + cpu_acc_t* accum_distances) { + using acc_t = cpu_acc_t; + accum_distances[0] = static_cast(0.0); + for (int point_idx = 0; point_idx < num_points - 1; ++point_idx) { + accum_distances[point_idx + 1] = + accum_distances[point_idx] + + compute_segment_length_common(points_sample, point_idx, num_dims); + } +} + +template +static void sample_polyline_cpu(const dtype* points_sample, const dtype* distances_sample, int num_points, + int num_dims, int num_distances, dtype* result_sample, + bool relative_distances, std::vector>& accum_distances) { + using acc_t = cpu_acc_t; + if (num_distances == 0) { + return; + } + if (num_points == 0) { + dtype* result_sample_i = result_sample; + for (int distance_idx = 0; distance_idx < num_distances; + ++distance_idx, result_sample_i += num_dims) { + fill_point_with_nan_common(result_sample_i, num_dims); + } + return; + } + compute_accumulated_distances_cpu(points_sample, num_points, num_dims, accum_distances.data()); + const acc_t total_length = accum_distances[num_points - 1]; + for (int distance_idx = 0; distance_idx < num_distances; ++distance_idx) { + const acc_t distance_to_sample = + relative_distances ? 
static_cast(distances_sample[distance_idx]) * total_length + : static_cast(distances_sample[distance_idx]); + sample_at_distance_common(points_sample, accum_distances.data(), distance_to_sample, + num_points, num_dims, + result_sample + distance_idx * num_dims); + } +} + +template +void polyline_interpolation_cpu(const dtype* points, int num_points, int num_dims, const dtype* distances, + int num_distances, dtype* result_points, int num_samples, + bool relative_distances) { + using acc_t = cpu_acc_t; + const size_t stride_points = static_cast(num_points) * static_cast(num_dims); + const size_t stride_distances = static_cast(num_distances); + const size_t stride_result = static_cast(num_distances) * static_cast(num_dims); + at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) { + std::vector accum_distances(num_points); + for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) { + const dtype* points_sample = points + sample_idx * stride_points; + const dtype* distances_sample = distances + sample_idx * stride_distances; + dtype* result_sample = result_points + sample_idx * stride_result; + sample_polyline_cpu(points_sample, distances_sample, num_points, num_dims, num_distances, + result_sample, relative_distances, accum_distances); + } + }); +} + +template +void polyline_lengths_cpu(const dtype* points, int num_points, int num_dims, dtype* lengths, + int num_samples) { + using acc_t = cpu_acc_t; + const size_t stride_points = static_cast(num_points) * static_cast(num_dims); + at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) { + for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) { + const dtype* points_sample = points + sample_idx * stride_points; + acc_t length = static_cast(0.0); + if (num_points == 0) { + length = polyline_nan(); + } else { + for (int point_idx = 0; point_idx < num_points - 1; ++point_idx) { + length += compute_segment_length_common(points_sample, point_idx, num_dims); + } + } + 
lengths[sample_idx] = static_cast(length); + } + }); +} + +template +void polyline_interpolation_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims, + const dtype* distances, int num_distances, + dtype* result_points, int num_samples, + const sample_size_dtype* sample_sizes_points, + const sample_size_dtype* sample_sizes_distances_to_sample, + bool relative_distances) { + using acc_t = cpu_acc_t; + at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) { + std::vector accum_distances(max_num_points); + for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) { + const int curr_num_points = static_cast(sample_sizes_points[sample_idx]); + const int curr_num_distances = static_cast(sample_sizes_distances_to_sample[sample_idx]); + const dtype* points_sample = points + sample_idx * max_num_points * num_dims; + const dtype* distances_sample = distances + sample_idx * num_distances; + dtype* result_sample = result_points + sample_idx * num_distances * num_dims; + sample_polyline_cpu(points_sample, distances_sample, curr_num_points, num_dims, + curr_num_distances, result_sample, relative_distances, + accum_distances); + } + }); +} + +template +void polyline_lengths_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims, + dtype* lengths, int num_samples, + const sample_size_dtype* sample_sizes_points) { + using acc_t = cpu_acc_t; + at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) { + for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) { + const int curr_num_points = static_cast(sample_sizes_points[sample_idx]); + const dtype* points_sample = points + sample_idx * max_num_points * num_dims; + acc_t length = static_cast(0.0); + if (curr_num_points == 0) { + length = polyline_nan(); + } else { + for (int point_idx = 0; point_idx < curr_num_points - 1; ++point_idx) { + length += compute_segment_length_common(points_sample, point_idx, num_dims); + } + } + lengths[sample_idx] = 
static_cast(length); + } + }); +} + +#define INSTANTIATE_POLYLINE_INTERPOLATION_CPU(DTYPE) \ + template void polyline_interpolation_cpu( \ + const DTYPE* points, int num_points, int num_dims, const DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, bool relative_distances); + +#define INSTANTIATE_POLYLINE_LENGTHS_CPU(DTYPE) \ + template void polyline_lengths_cpu(const DTYPE* points, int num_points, int num_dims, \ + DTYPE* lengths, int num_samples); + +#define INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU(DTYPE, SAMPLE_SIZE_DTYPE) \ + template void polyline_interpolation_var_size_batch_cpu( \ + const DTYPE* points, int max_num_points, int num_dims, const DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, const SAMPLE_SIZE_DTYPE* sample_sizes_points, \ + const SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances); + +#define INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU(DTYPE, SAMPLE_SIZE_DTYPE) \ + template void polyline_lengths_var_size_batch_cpu( \ + const DTYPE* points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \ + const SAMPLE_SIZE_DTYPE* sample_sizes_points); + +#define INSTANTIATE_POLYLINE_CPU_DTYPE(DTYPE) \ + INSTANTIATE_POLYLINE_INTERPOLATION_CPU(DTYPE) \ + INSTANTIATE_POLYLINE_LENGTHS_CPU(DTYPE) \ + INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU(DTYPE, int) \ + INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU(DTYPE, int64_t) \ + INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU(DTYPE, int) \ + INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU(DTYPE, int64_t) + +INSTANTIATE_POLYLINE_CPU_DTYPE(float) +INSTANTIATE_POLYLINE_CPU_DTYPE(double) + +#undef INSTANTIATE_POLYLINE_CPU_DTYPE +#undef INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU +#undef INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU +#undef INSTANTIATE_POLYLINE_LENGTHS_CPU +#undef INSTANTIATE_POLYLINE_INTERPOLATION_CPU + +} // namespace polyline diff --git 
a/packages/lane_helpers/pyproject.toml b/packages/lane_helpers/pyproject.toml new file mode 100644 index 0000000..41b2940 --- /dev/null +++ b/packages/lane_helpers/pyproject.toml @@ -0,0 +1,35 @@ +[build-system] +requires = [ + "setuptools>=64", + "wheel", + "scikit-build>=0.17.0", + "pybind11>=2.10.0", + "setuptools-scm>=8", + "accvlab-build-config @ file:../../build_config", +] +build-backend = "setuptools.build_meta" + +[project] +name = "accvlab.lane_helpers" +dynamic = ["version"] +description = "Lane helper utilities for ACCV-Lab." +requires-python = ">=3.8" +dependencies = [ + "torch>=2.0.0", + "numpy>=1.22.2", +] + +[project.optional-dependencies] +optional = [ + "matplotlib", + "pytest", +] + +[tool.setuptools.packages.find] +where = ["."] +include = ["accvlab.lane_helpers*"] + +[tool.setuptools_scm] +version_scheme = "no-guess-dev" +fallback_version = "0.0.0" +root = "../.." diff --git a/packages/lane_helpers/setup.py b/packages/lane_helpers/setup.py new file mode 100644 index 0000000..557ed99 --- /dev/null +++ b/packages/lane_helpers/setup.py @@ -0,0 +1,53 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from skbuild import setup +from setuptools import find_namespace_packages + +_ACCVLAB_BUILD_CONFIG_IMPORT_ERROR = """ +######################################################################################### +# Missing build dependency: accvlab-build-config. 
# +# # +# ACCV-Lab package builds normally use --no-build-isolation, so the shared build helper # +# must already be installed in the active environment. Install it first with: # +# # +# pip install /build_config # +# # +# and retry. # +# # +# Alternatively, use /scripts/package_manager.sh to install packages in # +# the documented order. # +######################################################################################### +""" + +try: + from accvlab_build_config import build_cmake_args +except ModuleNotFoundError as exc: + if exc.name != "accvlab_build_config": + raise + raise RuntimeError(_ACCVLAB_BUILD_CONFIG_IMPORT_ERROR) from exc + +_cmake_args = build_cmake_args() + + +setup( + name="accvlab.lane_helpers", + description="Lane helper utilities for ACCV-Lab.", + packages=find_namespace_packages(include=["accvlab.lane_helpers*"]), + include_package_data=True, + zip_safe=False, + cmake_source_dir="ext_impl", + cmake_install_dir="accvlab/lane_helpers", + cmake_args=_cmake_args, +) diff --git a/packages/lane_helpers/tests/polyline_test_utils.py b/packages/lane_helpers/tests/polyline_test_utils.py new file mode 100644 index 0000000..8cad98f --- /dev/null +++ b/packages/lane_helpers/tests/polyline_test_utils.py @@ -0,0 +1,226 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import torch
+
+from accvlab.batching_helpers import RaggedBatch
+
+DEVICES = ["cpu", "cuda"]
+
+
+def sample_polyline_cpu(points: torch.Tensor, distances: torch.Tensor) -> torch.Tensor:
+    """Reference implementation: sample a single polyline at the given distances.
+
+    Args:
+        points: Polyline vertices, indexed as ``(point, dim)``.
+        distances: Distances along the polyline, indexed as ``(distance,)``. Each value is
+            clamped to ``[0, total_length]`` before sampling.
+
+    Returns:
+        Tensor indexed as ``(distance, dim)``. It is empty if there are no distances, and filled
+        with NaN if there are distances but no points.
+    """
+    # For no distances, the result is an empty tensor.
+    if distances.shape[0] == 0:
+        sampled_points = points.new_empty((0, points.shape[1]))
+        return sampled_points
+    # For no points, the result is NaN for every requested point coordinate.
+    if points.shape[0] == 0:
+        sampled_points = points.new_full((distances.shape[0], points.shape[1]), torch.nan)
+        return sampled_points
+
+    # Accumulated length at every vertex; accum[0] == 0 and accum[-1] is the total length.
+    segment_lengths = torch.linalg.vector_norm(points[1:] - points[:-1], dim=1)
+    accum = torch.cat([segment_lengths.new_zeros(1), torch.cumsum(segment_lengths, dim=0)])
+    total_length = accum[-1]
+
+    out = []
+    for distance in distances:
+        d = torch.clamp(distance, min=0.0, max=total_length)
+        # Last vertex whose accumulated length does not exceed the (clamped) distance.
+        lower_idx = int(torch.nonzero(accum <= d, as_tuple=False)[-1])
+        if lower_idx >= points.shape[0] - 1:
+            out.append(points[-1])
+            continue
+
+        upper_idx = lower_idx + 1
+        lower_dist = accum[lower_idx]
+        upper_dist = accum[upper_idx]
+        segment_dist = upper_dist - lower_dist
+        # Degenerate (near zero-length) segment: return its start vertex instead of dividing.
+        if segment_dist <= torch.finfo(points.dtype).eps:
+            out.append(points[lower_idx])
+            continue
+
+        # Linear interpolation between the two vertices enclosing the distance.
+        weight_upper = (d - lower_dist) / segment_dist
+        weight_lower = (upper_dist - d) / segment_dist
+        out.append(points[lower_idx] * weight_lower + points[upper_idx] * weight_upper)
+
+    sampled_points = torch.stack(out)
+    return sampled_points
+
+
+def sample_batch_cpu(points: torch.Tensor, distances: torch.Tensor) -> torch.Tensor:
+    """Apply :func:`sample_polyline_cpu` to each ``(points, distances)`` pair along the first
+    dimension and stack the per-sample results."""
+    sampled_points = torch.stack(
+        [
+            sample_polyline_cpu(points_sample, distances_sample)
+            for points_sample, distances_sample in zip(points, distances)
+        ]
+    )
+    return sampled_points
+
+
+def polyline_lengths_cpu(points: torch.Tensor) -> torch.Tensor:
+    """Reference implementation: total length of every polyline in a batch.
+
+    Args:
+        points: Polyline vertices, indexed as ``(sample, point, dim)``.
+
+    Returns:
+        One length per sample: NaN if there are no points, 0 for a single point, otherwise the
+        sum of the segment lengths.
+    """
+    # For no points, the length is undefined.
+    if points.shape[1] == 0:
+        lengths = points.new_full((points.shape[0],), torch.nan)
+        return lengths
+    # For a single point, the length is 0.
+    if points.shape[1] == 1:
+        lengths = points.new_zeros((points.shape[0],))
+        return lengths
+
+    lengths = torch.linalg.vector_norm(points[:, 1:] - points[:, :-1], dim=2).sum(dim=1)
+    return lengths
+
+
+def polyline_lengths_var_size_cpu(points: torch.Tensor, sample_sizes: torch.Tensor) -> torch.Tensor:
+    """Per-sample polyline lengths where only the first ``sample_sizes[i]`` points of sample ``i``
+    are taken into account (the remaining entries are ignored)."""
+    lengths = []
+    for sample_idx in range(points.shape[0]):
+        num_points = int(sample_sizes[sample_idx].item())
+        # Keep the batch dimension (size 1) so that the fixed-size helper can be reused.
+        lengths.append(polyline_lengths_cpu(points[sample_idx : sample_idx + 1, :num_points])[0])
+    lengths = torch.stack(lengths)
+    return lengths
+
+
+def assert_ragged_matches_cpu(
+    result: RaggedBatch,
+    points: torch.Tensor,
+    distances: torch.Tensor,
+    points_sample_sizes: torch.Tensor,
+    distances_sample_sizes: torch.Tensor,
+    *,
+    atol: float = 1e-5,
+) -> None:
+    """Assert that a ragged interpolation result matches the per-sample CPU reference.
+
+    The sample sizes of ``result`` must equal ``distances_sample_sizes``. For every sample, only
+    the first ``distances_sample_sizes[i]`` entries of the result are compared (absolute tolerance
+    ``atol``, no relative tolerance, NaNs compare equal).
+    """
+    assert torch.equal(result.sample_sizes.cpu(), distances_sample_sizes.cpu())
+
+    for sample_idx in range(points.shape[0]):
+
+        num_points = int(points_sample_sizes[sample_idx].item())
+        num_distances = int(distances_sample_sizes[sample_idx].item())
+        expected = sample_polyline_cpu(
+            points[sample_idx, :num_points].cpu(),
+            distances[sample_idx, :num_distances].cpu(),
+        )
+
+        actual = result.tensor[sample_idx, :num_distances].cpu()
+
+        assert torch.allclose(actual, expected, atol=atol, rtol=0.0, equal_nan=True)
+
+
+def make_random_ragged_polyline_case(
+    *,
+    seed: int,
+    batch_size: int = 7,
+    max_num_points: int = 12,
+    max_num_distances: int = 17,
+    num_dims: int = 3,
+) -> tuple[RaggedBatch, RaggedBatch]:
+    """Create a seeded random ragged test case.
+
+    Each sample gets between 1 and ``max_num_points`` points and between 0 and
+    ``max_num_distances`` distances. Distances are drawn as ``rand * total_length`` of the
+    respective polyline. The tensors are sized to the largest sample in the batch; unused entries
+    keep the filler values (9999.0 for points, -9999.0 for distances).
+
+    Returns:
+        ``(points_batch, distances_batch)`` as float32 ``RaggedBatch`` instances.
+    """
+    generator = torch.Generator().manual_seed(seed)
+    points_sample_sizes = torch.randint(1, max_num_points + 1, (batch_size,), generator=generator)
+    distances_sample_sizes = torch.randint(0, max_num_distances + 1, (batch_size,), generator=generator)
+
+    max_points_in_batch = int(points_sample_sizes.max().item())
+    max_distances_in_batch = int(distances_sample_sizes.max().item())
+
+    # Filler values make accidental reads of padded entries visible in the results.
+    points = torch.full((batch_size, max_points_in_batch, num_dims), 9999.0, dtype=torch.float32)
+    distances = torch.full((batch_size, max_distances_in_batch), -9999.0, dtype=torch.float32)
+
+    for sample_idx in range(batch_size):
+        num_points = int(points_sample_sizes[sample_idx].item())
+        num_distances = int(distances_sample_sizes[sample_idx].item())
+        points[sample_idx, :num_points] = torch.rand((num_points, num_dims), generator=generator)
+        total_length = polyline_lengths_cpu(points[sample_idx : sample_idx + 1, :num_points])[0]
+        distances[sample_idx, :num_distances] = (
+            torch.rand((num_distances,), generator=generator) * total_length
+        )
+
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+
+    return points_batch, distances_batch
+
+
+def make_padded_ragged_polyline_case(
+    device: str,
+) -> tuple[RaggedBatch, RaggedBatch]:
+    """Create a hand-written ragged test case on ``device``.
+
+    The four samples have 5, 3, 1 and 2 points and 11, 5, 0 and 3 distances, respectively. The
+    distances include negative values and values larger than the polyline length. Unused entries
+    are filled with 9999.0.
+
+    Returns:
+        ``(points_batch, distances_batch)`` as float32 ``RaggedBatch`` instances.
+    """
+    # Points data
+    points = torch.tensor(
+        [
+            [[0.0, 0.0], [1.0, 0.0], [1.0, 2.0], [0.0, 2.0], [0.0, 0.0]],
+            [[3.5, -1.25], [4.5, -1.25], [4.5, 0.75], [9999.0, 9999.0], [9999.0, 9999.0]],
+            [[-2.0, 3.0], [9999.0, 9999.0], [9999.0, 9999.0], [9999.0, 9999.0], [9999.0, 9999.0]],
+            [[10.0, 0.0], [12.0, 0.0], [9999.0, 9999.0], [9999.0, 9999.0], [9999.0, 9999.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    points_sample_sizes = torch.tensor([5, 3, 1, 2], device=device)
+    # Distances data
+    distances = torch.tensor(
+        [
+            [0.0, 0.5, 1.0, 2.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0],
+            [3.0, 2.0, 1.0, 0.0, -1.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0],
+            [9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0],
+            [-5.0, 1.0, 5.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    distances_sample_sizes = torch.tensor([11, 5, 0, 3], device=device)
+
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+
+    return points_batch, distances_batch
+
+
+def distances_for_mode(
+    points: torch.Tensor, absolute_distances: torch.Tensor, *, relative: bool
+) -> torch.Tensor:
+    """Convert absolute distances to the representation used for the requested mode.
+
+    If ``relative`` is false, ``absolute_distances`` is returned unchanged. Otherwise, each row is
+    divided by the length of the corresponding polyline; a divisor of 1.0 is used wherever the
+    length is not greater than zero (this includes NaN lengths of polylines without points).
+    """
+
+    if not relative:
+        return absolute_distances
+
+    lengths = polyline_lengths_cpu(points.cpu()).to(
+        device=absolute_distances.device, dtype=absolute_distances.dtype
+    )
+
+    # For zero length, use 1.0 to avoid division by zero.
+    safe_lengths = torch.where(lengths > 0, lengths, torch.ones_like(lengths))
+    relative_distances = absolute_distances / safe_lengths[:, None]
+
+    return relative_distances
+
+
+def ragged_distances_for_mode(
+    points: RaggedBatch,
+    absolute_distances: RaggedBatch,
+    *,
+    relative: bool,
+) -> RaggedBatch:
+    """Ragged counterpart of :func:`distances_for_mode`.
+
+    If ``relative`` is false, ``absolute_distances`` is returned unchanged. Otherwise, the valid
+    distances of each sample are divided by the length of the corresponding polyline, or set to
+    0.0 if that length is not greater than zero. Entries beyond the sample size are left as is.
+    """
+    if not relative:
+        return absolute_distances
+
+    # Work on a copy so that the input batch is not modified by the in-place updates below.
+    relative_distances = absolute_distances.tensor.clone()
+    lengths = polyline_lengths_var_size_cpu(points.tensor.cpu(), points.sample_sizes.cpu()).to(
+        device=absolute_distances.tensor.device, dtype=absolute_distances.tensor.dtype
+    )
+    for sample_idx in range(points.tensor.shape[0]):
+        num_distances = int(absolute_distances.sample_sizes[sample_idx].item())
+        if num_distances == 0:
+            continue
+        length = lengths[sample_idx]
+        if length > 0:
+            relative_distances[sample_idx, :num_distances] /= length
+        else:
+            relative_distances[sample_idx, :num_distances] = 0.0
+    relative_distances_batch = absolute_distances.create_with_sample_sizes_like_self(relative_distances)
+    return relative_distances_batch
diff --git a/packages/lane_helpers/tests/test_polyline_fixed_interpolation.py b/packages/lane_helpers/tests/test_polyline_fixed_interpolation.py
new file mode 100644
index 0000000..ffa4946
--- /dev/null
+++ b/packages/lane_helpers/tests/test_polyline_fixed_interpolation.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch + +from accvlab.lane_helpers import polyline + +from polyline_test_utils import DEVICES, distances_for_mode, sample_batch_cpu + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_rectangle_polyline_interpolation(relative: bool, device: str): + points = torch.tensor( + [ + [ + [0.0, 0.0], + [1.0, 0.0], + [1.0, 2.0], + [0.0, 2.0], + [0.0, 0.0], + ] + ], + device=device, + dtype=torch.float32, + ) + distances = torch.tensor( + [[0.0, 0.5, 1.0, 2.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0]], + device=device, + ) + + distances_input = distances_for_mode(points, distances, relative=relative) + + expected = sample_batch_cpu(points.cpu(), distances.cpu()) + result = polyline.interpolate(points, distances_input, relative=relative) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_batched_polyline_interpolation(relative: bool, device: str): + base_points = torch.tensor( + [ + [0.0, 0.0], + [1.0, 0.0], + [1.0, 2.0], + [0.0, 2.0], + [0.0, 0.0], + ], + dtype=torch.float32, + ) + offsets = torch.tensor([[0.0, 0.0], [3.5, -1.25]], dtype=torch.float32) + points = (base_points.unsqueeze(0) + offsets.unsqueeze(1)).to(device) + distances = torch.tensor( + [ + [0.0, 
0.5, 1.0, 3.0, 6.0], + [6.0, 5.0, 3.0, 1.0, 0.0], + ], + device=device, + dtype=torch.float32, + ) + + distances_input = distances_for_mode(points, distances, relative=relative) + + expected = sample_batch_cpu(points.cpu(), distances.cpu()) + result = polyline.interpolate(points.contiguous(), distances_input.contiguous(), relative=relative) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_interpolation_accepts_non_contiguous_inputs(relative: bool, device: str): + points_storage = torch.tensor( + [ + [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]], + [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]], + ], + device=device, + dtype=torch.float32, + ) + points = points_storage.transpose(1, 2) + distances = torch.tensor( + [[0.0, 4.0], [0.5, 2.0], [2.0, 0.5], [4.0, 0.0]], + device=device, + dtype=torch.float32, + ).transpose(0, 1) + assert not points.is_contiguous() + assert not distances.is_contiguous() + + distances_input = distances_for_mode(points, distances, relative=relative) + + expected = sample_batch_cpu(points.cpu(), distances.cpu()) + result = polyline.interpolate(points, distances_input, relative=relative) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_out_of_range_distances_clamp_to_endpoints(relative: bool, device: str): + points = torch.tensor([[[0.0, 0.0], [1.0, 0.0], [1.0, 2.0]]], device=device, dtype=torch.float32) + distances = torch.tensor([[-4.0, -1.0, 0.0, 3.0, 4.0]], device=device, dtype=torch.float32) + + distances_input = distances_for_mode(points, distances, relative=relative) + + expected = sample_batch_cpu(points.cpu(), distances.cpu()) + result = polyline.interpolate(points, distances_input, relative=relative) + + 
assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_single_point_polyline(relative: bool, device: str): + points = torch.tensor([[[1.0, 2.0]]], device=device, dtype=torch.float32) + distances = torch.tensor([[-1.0, 0.0, 1.0]], device=device, dtype=torch.float32) + + distances_input = distances_for_mode(points, distances, relative=relative) + + expected = sample_batch_cpu(points.cpu(), distances.cpu()) + result = polyline.interpolate(points, distances_input, relative=relative) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_zero_point_polyline_returns_nan(relative: bool, device: str): + points = torch.empty((2, 0, 3), device=device, dtype=torch.float32) + distances = torch.tensor([[0.0, 1.0], [-1.0, 2.0]], device=device, dtype=torch.float32) + distances_input = distances_for_mode(points, distances, relative=relative) + + result = polyline.interpolate(points, distances_input, relative=relative) + + assert result.shape == (2, 2, 3) + assert torch.isnan(result).all() + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_zero_point_polyline_with_zero_distances_returns_empty(relative: bool, device: str): + points = torch.empty((2, 0, 3), device=device, dtype=torch.float32) + distances = torch.empty((2, 0), device=device, dtype=torch.float32) + distances_input = distances_for_mode(points, distances, relative=relative) + + result = polyline.interpolate(points, distances_input, relative=relative) + + assert result.shape == (2, 0, 3) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def 
test_random_polyline_matches_cpu_reference(relative: bool, device: str): + num_iters = 100 + generator = torch.Generator().manual_seed(0) + for _ in range(num_iters): + num_points = int(torch.randint(15, 61, (), generator=generator).item()) + num_distances = int(torch.randint(15, 61, (), generator=generator).item()) + points_cpu = torch.rand((3, num_points, 2), generator=generator, dtype=torch.float32) + distances_cpu = torch.rand((3, num_distances), generator=generator, dtype=torch.float32) + + segment_lengths = torch.linalg.vector_norm(points_cpu[:, 1:] - points_cpu[:, :-1], dim=2) + total_lengths = torch.sum(segment_lengths, dim=1) + distances_cpu = distances_cpu * total_lengths[:, None] + + distances_input_cpu = distances_for_mode(points_cpu, distances_cpu, relative=relative) + + expected = sample_batch_cpu(points_cpu, distances_cpu) + result = polyline.interpolate( + points_cpu.to(device), distances_input_cpu.to(device), relative=relative + ) + + assert torch.allclose(result.cpu(), expected, atol=1e-4, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +def test_large_polyline_interpolation_external_distance_buffer(relative: bool): + # Create a large polyline to ensure that the external distance buffer is used. 
+ num_points = 200_000 + x = torch.linspace(0.0, 1.0, num_points, device="cuda", dtype=torch.float32) + points = torch.stack((x, torch.zeros_like(x)), dim=1).unsqueeze(0) + distances = torch.tensor([[0.0, 0.25, 0.5, 1.0, 2.0]], device="cuda", dtype=torch.float32) + expected = torch.tensor( + [[[0.0, 0.0], [0.25, 0.0], [0.5, 0.0], [1.0, 0.0], [1.0, 0.0]]], + device="cuda", + dtype=torch.float32, + ) + + torch.cuda.synchronize() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + result = polyline.interpolate(points, distances, relative=relative) + stream.synchronize() + + assert torch.allclose(result, expected, atol=1e-4, rtol=0.0) diff --git a/packages/lane_helpers/tests/test_polyline_lengths.py b/packages/lane_helpers/tests/test_polyline_lengths.py new file mode 100644 index 0000000..4b31de2 --- /dev/null +++ b/packages/lane_helpers/tests/test_polyline_lengths.py @@ -0,0 +1,179 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import torch + +from accvlab.batching_helpers import RaggedBatch +from accvlab.lane_helpers import polyline + +from polyline_test_utils import ( + DEVICES, + make_padded_ragged_polyline_case, + make_random_ragged_polyline_case, + polyline_lengths_cpu, + polyline_lengths_var_size_cpu, +) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_rectangle_and_single_point(device: str): + rectangle = torch.tensor( + [ + [ + [0.0, 0.0], + [1.0, 0.0], + [1.0, 2.0], + [0.0, 2.0], + [0.0, 0.0], + ] + ], + device=device, + dtype=torch.float32, + ) + single_point = torch.tensor([[[1.0, 2.0]]], device=device, dtype=torch.float32) + + assert torch.allclose(polyline.lengths(rectangle).cpu(), torch.tensor([6.0]), atol=1e-5, rtol=0.0) + assert torch.allclose(polyline.lengths(single_point).cpu(), torch.tensor([0.0]), atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_zero_point_batch_returns_nan(device: str): + points = torch.empty((3, 0, 2), device=device, dtype=torch.float32) + + result = polyline.lengths(points) + + assert result.shape == (3,) + assert torch.isnan(result).all() + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_random_nd_matches_cpu_reference(device: str): + generator = torch.Generator().manual_seed(1) + num_iters = 100 + for _ in range(num_iters): + points_cpu = torch.rand((5, 37, 4), generator=generator, dtype=torch.float32) + + expected = polyline_lengths_cpu(points_cpu) + result = polyline.lengths(points_cpu.to(device)) + + assert torch.allclose(result.cpu(), expected, atol=1e-4, rtol=0.0) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_accepts_non_contiguous_points(device: str): + points_storage = torch.tensor( + [ + [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]], + [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]], + ], + device=device, + dtype=torch.float32, + ) + points = points_storage.transpose(1, 2) + assert not 
points.is_contiguous() + + result = polyline.lengths(points) + expected = polyline_lengths_cpu(points.cpu()) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_var_size_batch_random_matches_cpu_reference(device: str): + num_iters = 100 + for i in range(num_iters): + points_batch_cpu, _ = make_random_ragged_polyline_case(seed=i * 100) + points_batch = points_batch_cpu.to(device) + + result = polyline.lengths_var_size_batch(points_batch) + expected = polyline_lengths_var_size_cpu(points_batch_cpu.tensor, points_batch_cpu.sample_sizes) + + assert result.shape == (points_batch.tensor.shape[0],) + assert torch.allclose(result.cpu(), expected, atol=1e-4, rtol=0.0) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_var_size_batch_matches_cpu_reference_and_ignores_padding(device: str): + points_batch, _ = make_padded_ragged_polyline_case(device) + + result = polyline.lengths_var_size_batch(points_batch) + expected = polyline_lengths_var_size_cpu(points_batch.tensor.cpu(), points_batch.sample_sizes.cpu()) + + assert result.shape == (4,) + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_var_size_batch_zero_point_row_returns_nan(device: str): + points = torch.tensor( + [ + [[9999.0, 9999.0], [9999.0, 9999.0]], + [[0.0, 0.0], [1.0, 0.0]], + [[2.0, 3.0], [9999.0, 9999.0]], + ], + device=device, + dtype=torch.float32, + ) + sample_sizes = torch.tensor([0, 2, 1], device=device, dtype=torch.int32) + + result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes)) + expected = polyline_lengths_var_size_cpu(points.cpu(), sample_sizes.cpu()) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0, equal_nan=True) + + +@pytest.mark.parametrize("device", DEVICES) +def 
test_polyline_lengths_var_size_batch_all_zero_point_rows_return_nan(device: str): + points = torch.empty((3, 0, 2), device=device, dtype=torch.float32) + sample_sizes = torch.zeros(3, device=device, dtype=torch.int32) + + result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes)) + + assert result.shape == (3,) + assert torch.isnan(result).all() + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_var_size_batch_accepts_int32_sample_sizes_and_non_contiguous_points(device: str): + points_storage = torch.tensor( + [ + [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]], + [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]], + ], + device=device, + dtype=torch.float32, + ) + points = points_storage.transpose(1, 2) + sample_sizes = torch.tensor([4, 3], device=device, dtype=torch.int32) + assert not points.is_contiguous() + + result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes)) + expected = polyline_lengths_var_size_cpu(points.cpu(), sample_sizes.cpu()) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +def test_polyline_lengths_var_size_batch_handles_inactive_cuda_rows(): + num_samples = 33 + points = torch.empty((num_samples, 2, 2), device="cuda", dtype=torch.float32) + points[:, 0, 0] = torch.arange(num_samples, device="cuda", dtype=torch.float32) + points[:, 0, 1] = 0.0 + points[:, 1, 0] = points[:, 0, 0] + 1.0 + points[:, 1, 1] = 0.0 + sample_sizes = torch.full((num_samples,), 2, device="cuda") + + result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes)) + + assert torch.allclose(result.cpu(), torch.ones(num_samples), atol=1e-5, rtol=0.0) diff --git a/packages/lane_helpers/tests/test_polyline_validation.py b/packages/lane_helpers/tests/test_polyline_validation.py new file mode 100644 index 0000000..fe7db26 --- /dev/null +++ b/packages/lane_helpers/tests/test_polyline_validation.py @@ -0,0 +1,166 @@ +# Copyright (c) 2026, NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch + +from accvlab.batching_helpers import RaggedBatch +from accvlab.lane_helpers import polyline + +from polyline_test_utils import DEVICES + + +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_rejects_invalid_ragged_layout(device: str): + points = torch.randn((2, 3, 4), device=device) + distances = torch.randn((2, 4), device=device) + points_batch = RaggedBatch( + points.transpose(1, 2).contiguous(), + sample_sizes=torch.tensor([3, 2], device=device, dtype=torch.int32), + non_uniform_dim=2, + ) + distances_batch = RaggedBatch( + distances, sample_sizes=torch.tensor([4, 2], device=device, dtype=torch.int32) + ) + + # Polyline points must use dimension 1 as the non-uniform point dimension. + with pytest.raises(AssertionError, match="points.non_uniform_dim"): + polyline.interpolate_var_size_batch(points_batch, distances_batch) + + +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_validates_inputs(device: str): + points = torch.randn((1, 3, 2), device=device) + distances = torch.randn((1, 4), device=device) + points_batch = RaggedBatch(points, sample_sizes=torch.tensor([3], device=device)) + distances_batch = RaggedBatch(distances, sample_sizes=torch.tensor([4], device=device)) + + # Points sample sizes must not be negative. 
+ bad_points_sizes = RaggedBatch(points, sample_sizes=torch.tensor([-1], device=device)) + with pytest.raises(RuntimeError, match="points.sample_sizes"): + polyline.interpolate_var_size_batch(bad_points_sizes, distances_batch) + + # Distance sample sizes must not exceed the padded distance dimension. + bad_distances_sizes = RaggedBatch(distances, sample_sizes=torch.tensor([5], device=device)) + with pytest.raises(RuntimeError, match="distances.sample_sizes"): + polyline.interpolate_var_size_batch(points_batch, bad_distances_sizes) + + # Points and distances must have the same dtype. + distances_double = distances_batch.double() + with pytest.raises(RuntimeError, match="same dtype"): + polyline.interpolate_var_size_batch(points_batch, distances_double) + + # Points and distances must have the same sample size dtype. + mismatched_sample_size_dtype = RaggedBatch( + distances, + sample_sizes=torch.tensor([4], device=device, dtype=torch.int32), + ) + with pytest.raises(RuntimeError, match="same dtype"): + polyline.interpolate_var_size_batch(points_batch, mismatched_sample_size_dtype) + + +def test_polyline_functions_reject_mixed_cpu_cuda_inputs(): + points_cpu = torch.randn((1, 3, 2), device="cpu") + distances_cpu = torch.randn((1, 4), device="cpu") + points_cuda = points_cpu.cuda() + distances_cuda = distances_cpu.cuda() + + # Fixed-size points and distances must live on the same device. + with pytest.raises(RuntimeError, match="same device"): + polyline.interpolate(points_cpu, distances_cuda) + + # Ragged points and distances must live on the same device. + with pytest.raises(RuntimeError, match="same device"): + polyline.interpolate_var_size_batch( + RaggedBatch(points_cpu, sample_sizes=torch.tensor([3], device="cpu")), + RaggedBatch(distances_cuda, sample_sizes=torch.tensor([4], device="cuda")), + ) + + # Ragged sample sizes must live on the same device as their data tensor. 
+ with pytest.raises(RuntimeError, match="same device"): + polyline.interpolate_var_size_batch( + RaggedBatch(points_cuda, sample_sizes=torch.tensor([3], device="cpu")), + RaggedBatch(distances_cuda, sample_sizes=torch.tensor([4], device="cuda")), + ) + + # Lengths use only points, but points.sample_sizes must still match the points device. + with pytest.raises(RuntimeError, match="same device"): + polyline.lengths_var_size_batch( + RaggedBatch(points_cuda, sample_sizes=torch.tensor([3], device="cpu")) + ) + + +def test_cpu_polyline_functions_reject_low_precision_dtypes(): + for dtype in (torch.float16, torch.bfloat16): + # CPU kernels intentionally support only float32 and float64. + points = torch.tensor([[[0.0, 0.0], [1.0, 0.0]]], dtype=dtype) + distances = torch.tensor([[0.0, 1.0]], dtype=dtype) + points_batch = RaggedBatch(points, sample_sizes=torch.tensor([2])) + distances_batch = RaggedBatch(distances, sample_sizes=torch.tensor([2])) + + with pytest.raises(RuntimeError, match="float32 or float64 on CPU"): + polyline.interpolate(points, distances) + with pytest.raises(RuntimeError, match="float32 or float64 on CPU"): + polyline.lengths(points) + with pytest.raises(RuntimeError, match="float32 or float64 on CPU"): + polyline.interpolate_var_size_batch(points_batch, distances_batch) + with pytest.raises(RuntimeError, match="float32 or float64 on CPU"): + polyline.lengths_var_size_batch(points_batch) + + +def test_cuda_polyline_functions_accept_low_precision_dtypes(): + for dtype in (torch.float16, torch.bfloat16): + points = torch.tensor([[[0.0, 0.0], [1.0, 0.0]]], device="cuda", dtype=dtype) + distances = torch.tensor([[0.0, 1.0]], device="cuda", dtype=dtype) + points_batch = RaggedBatch(points, sample_sizes=torch.tensor([2], device="cuda")) + distances_batch = RaggedBatch(distances, sample_sizes=torch.tensor([2], device="cuda")) + + expected_points = torch.tensor([[[0.0, 0.0], [1.0, 0.0]]], device="cuda", dtype=dtype) + expected_lengths = torch.tensor([1.0], 
device="cuda", dtype=dtype) + + assert torch.equal(polyline.interpolate(points, distances), expected_points) + assert torch.equal(polyline.lengths(points), expected_lengths) + assert torch.equal( + polyline.interpolate_var_size_batch(points_batch, distances_batch).tensor, expected_points + ) + assert torch.equal(polyline.lengths_var_size_batch(points_batch), expected_lengths) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_var_size_batch_rejects_invalid_ragged_layout(device: str): + points = torch.randn((2, 3, 4), device=device) + points_batch = RaggedBatch( + points.transpose(1, 2).contiguous(), + sample_sizes=torch.tensor([3, 2], device=device, dtype=torch.int32), + non_uniform_dim=2, + ) + + # Polyline points must use dimension 1 as the non-uniform point dimension. + with pytest.raises(AssertionError, match="points.non_uniform_dim"): + polyline.lengths_var_size_batch(points_batch) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_var_size_batch_validates_sample_sizes(device: str): + points = torch.randn((1, 3, 2), device=device) + + # Length sample sizes must not be negative. + bad_small = RaggedBatch(points, sample_sizes=torch.tensor([-1], device=device)) + with pytest.raises(RuntimeError, match="points.sample_sizes"): + polyline.lengths_var_size_batch(bad_small) + + # Length sample sizes must not exceed the padded point dimension. + bad_large = RaggedBatch(points, sample_sizes=torch.tensor([4], device=device)) + with pytest.raises(RuntimeError, match="points.sample_sizes"): + polyline.lengths_var_size_batch(bad_large) diff --git a/packages/lane_helpers/tests/test_polyline_var_size_interpolation.py b/packages/lane_helpers/tests/test_polyline_var_size_interpolation.py new file mode 100644 index 0000000..7234ef0 --- /dev/null +++ b/packages/lane_helpers/tests/test_polyline_var_size_interpolation.py @@ -0,0 +1,314 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch + +from accvlab.batching_helpers import RaggedBatch +from accvlab.lane_helpers import polyline + +from polyline_test_utils import ( + DEVICES, + assert_ragged_matches_cpu, + make_padded_ragged_polyline_case, + make_random_ragged_polyline_case, + ragged_distances_for_mode, +) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_matches_cpu_reference(relative: bool, device: str): + points_batch, distances_batch = make_padded_ragged_polyline_case(device) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch(points_batch, distances_input_batch, relative=relative) + + assert isinstance(result, RaggedBatch) + assert result.tensor.shape == (4, 11, 2) + assert result.non_uniform_dim == 1 + assert_ragged_matches_cpu( + result, + points_batch.tensor, + distances_batch.tensor, + points_batch.sample_sizes, + distances_batch.sample_sizes, + ) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_random_matches_cpu_reference(relative: bool, device: str): + num_iters = 100 + for i in range(num_iters): + points_batch_cpu, distances_batch_cpu = 
make_random_ragged_polyline_case(seed=i) + points_batch = points_batch_cpu.to(device) + distances_batch = distances_batch_cpu.to(device) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + + assert isinstance(result, RaggedBatch) + assert result.tensor.shape == ( + points_batch.tensor.shape[0], + distances_batch.tensor.shape[1], + points_batch.tensor.shape[2], + ) + assert_ragged_matches_cpu( + result, + points_batch.tensor, + distances_batch.tensor, + points_batch.sample_sizes, + distances_batch.sample_sizes, + atol=1e-4, + ) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_matches_fixed_size_when_uniform(relative: bool, device: str): + points = torch.tensor( + [ + [[0.0, 0.0], [1.0, 0.0], [1.0, 2.0], [0.0, 2.0]], + [[2.0, 2.0], [3.0, 2.0], [3.0, 4.0], [2.0, 4.0]], + ], + device=device, + dtype=torch.float32, + ) + distances = torch.tensor( + [[0.0, 0.5, 2.0, 4.0], [4.0, 2.0, 0.5, 0.0]], + device=device, + dtype=torch.float32, + ) + sample_sizes = torch.tensor([points.shape[1], points.shape[1]], device=device, dtype=torch.int32) + distances_sample_sizes = torch.tensor( + [distances.shape[1], distances.shape[1]], device=device, dtype=torch.int32 + ) + points_batch = RaggedBatch(points, sample_sizes=sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + expected = polyline.interpolate( + points.contiguous(), distances_input_batch.tensor.contiguous(), relative=relative + ) + + assert torch.allclose(result.tensor, 
expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_accepts_non_contiguous_inputs(relative: bool, device: str): + points_storage = torch.tensor( + [ + [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]], + [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]], + ], + device=device, + dtype=torch.float32, + ) + points = points_storage.transpose(1, 2) + distances = torch.tensor( + [[0.0, 4.0], [0.5, 2.0], [2.0, 0.5], [4.0, 0.0]], + device=device, + dtype=torch.float32, + ).transpose(0, 1) + assert not points.is_contiguous() + assert not distances.is_contiguous() + + points_sample_sizes = torch.tensor([4, 3], device=device, dtype=torch.int32) + distances_sample_sizes = torch.tensor([4, 2], device=device, dtype=torch.int32) + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + + assert_ragged_matches_cpu( + result, + points, + distances, + points_sample_sizes, + distances_sample_sizes, + ) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_zero_point_row_returns_nan(relative: bool, device: str): + points = torch.tensor( + [ + [[9999.0, 9999.0], [9999.0, 9999.0]], + [[0.0, 0.0], [1.0, 0.0]], + [[2.0, 3.0], [9999.0, 9999.0]], + ], + device=device, + dtype=torch.float32, + ) + distances = torch.tensor( + [[0.0, 1.0], [0.0, 0.5], [-1.0, 2.0]], + device=device, + dtype=torch.float32, + ) + points_sample_sizes = torch.tensor([0, 2, 1], device=device, dtype=torch.int32) + distances_sample_sizes = 
torch.tensor([2, 2, 2], device=device, dtype=torch.int32) + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + + assert_ragged_matches_cpu( + result, + points, + distances, + points_sample_sizes, + distances_sample_sizes, + ) + assert torch.isnan(result.tensor[0, :2]).all() + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_zero_max_distances_returns_empty(relative: bool, device: str): + points = torch.tensor( + [ + [[0.0, 0.0], [1.0, 0.0]], + [[2.0, 3.0], [9999.0, 9999.0]], + ], + device=device, + dtype=torch.float32, + ) + distances = torch.empty((2, 0), device=device, dtype=torch.float32) + points_sample_sizes = torch.tensor([2, 1], device=device, dtype=torch.int32) + distances_sample_sizes = torch.tensor([0, 0], device=device, dtype=torch.int32) + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + + assert isinstance(result, RaggedBatch) + assert result.tensor.shape == (2, 0, 2) + assert torch.equal(result.sample_sizes.cpu(), torch.zeros(2, dtype=torch.int32)) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_all_zero_point_rows_return_nan(relative: bool, device: str): + points = 
torch.empty((2, 0, 2), device=device, dtype=torch.float32) + distances = torch.tensor([[0.0, 1.0, 2.0], [-1.0, 0.5, 3.0]], device=device, dtype=torch.float32) + points_sample_sizes = torch.zeros(2, device=device, dtype=torch.int32) + distances_sample_sizes = torch.full((2,), 3, device=device, dtype=torch.int32) + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + + assert result.tensor.shape == (2, 3, 2) + assert torch.isnan(result.tensor).all() + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +def test_variable_size_large_polyline_interpolation_external_distance_buffer(relative: bool): + # Create a large polyline to ensure that the external distance buffer is used. 
+ num_points = 200_000 + x = torch.linspace(0.0, 1.0, num_points, device="cuda", dtype=torch.float32) + first_polyline = torch.stack((x, torch.zeros_like(x)), dim=1) + second_polyline = torch.stack((x, torch.ones_like(x)), dim=1) + points = torch.stack((first_polyline, second_polyline), dim=0) + distances = torch.tensor( + # Note that 9999.0 is a filler and is not part of the distances used for interpolation (due to `distances_sample_sizes`) + [[0.0, 0.25, 0.5, 1.0, 2.0], [1.0, 0.5, 0.0, -1.0, 9999.0]], + device="cuda", + dtype=torch.float32, + ) + points_sample_sizes = torch.full((2,), num_points, device="cuda", dtype=torch.int32) + distances_sample_sizes = torch.tensor([5, 4], device="cuda", dtype=torch.int32) + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + expected = torch.tensor( + [ + [[0.0, 0.0], [0.25, 0.0], [0.5, 0.0], [1.0, 0.0], [1.0, 0.0]], + # Note that 9999.0 is a filler and is not checked for equality in the test. 
+ [[1.0, 1.0], [0.5, 1.0], [0.0, 1.0], [0.0, 1.0], [9999.0, 9999.0]], + ], + device="cuda", + dtype=torch.float32, + ) + + torch.cuda.synchronize() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + result = polyline.interpolate_var_size_batch(points_batch, distances_batch, relative=relative) + stream.synchronize() + + assert torch.equal(result.sample_sizes.cpu(), distances_sample_sizes.cpu()) + assert torch.allclose(result.tensor[0, :5], expected[0, :5], atol=1e-4, rtol=0.0) + assert torch.allclose(result.tensor[1, :4], expected[1, :4], atol=1e-4, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +def test_variable_size_polyline_interpolation_handles_inactive_cuda_rows(relative: bool): + num_samples = 33 + points = torch.empty((num_samples, 2, 2), device="cuda", dtype=torch.float32) + points[:, 0, 0] = torch.arange(num_samples, device="cuda", dtype=torch.float32) + points[:, 0, 1] = 0.0 + points[:, 1, 0] = points[:, 0, 0] + 1.0 + points[:, 1, 1] = 0.0 + distances = ( + torch.tensor([[0.0, 0.25, 1.0]], device="cuda", dtype=torch.float32).expand(num_samples, -1).clone() + ) + points_sample_sizes = torch.full((num_samples,), 2, device="cuda") + distances_sample_sizes = torch.full((num_samples,), 3, device="cuda") + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + + assert_ragged_matches_cpu(result, points, distances, points_sample_sizes, distances_sample_sizes) + + +if __name__ == "__main__": + pytest.main([__file__])