diff --git a/.gitignore b/.gitignore index 31a607c..7a579ee 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,3 @@ __pycache__/ *.whl *.log - diff --git a/docker/Dockerfile b/docker/Dockerfile index 6ae40b1..82c44b8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -113,8 +113,8 @@ RUN pip install pandas==1.5.3 \ numba==0.59 \ pyquaternion==0.9.9 -RUN pip install nuscenes-devkit && \ - pip install shapely tqdm pillow networkx fire +RUN pip install nuscenes-devkit==1.2.0 && \ + pip install shapely==2.0.7 tqdm==4.67.3 pillow==12.2.0 networkx==3.4.2 fire==0.7.1 RUN pip install pytest pytest-timeout RUN pip install pynvml diff --git a/docs/Makefile b/docs/Makefile index 51667be..987e408 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -23,8 +23,9 @@ help: # Generate namespace package documentation before building generate: - python3 mirror_referenced_dirs.py python3 generate_new_namespace_package_docs.py + python3 generate_package_docs_assets.py + python3 mirror_referenced_dirs.py python3 update_docs_index.py # Sync the root README into the docs tree before building @@ -41,7 +42,7 @@ clean: @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) rm -rf $(BUILDDIR)/ rm -rf api/generated/ - rm -rf ../packages/*/docs/generated/ + rm -rf ../packages/*/docs/_generated/ # Auto-build documentation (watches for changes) livehtml: sync-readme generate diff --git a/docs/generate_package_docs_assets.py b/docs/generate_package_docs_assets.py new file mode 100644 index 0000000..e205e28 --- /dev/null +++ b/docs/generate_package_docs_assets.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Run optional package-local documentation asset generation hooks.

For every namespace package listed in ``namespace_packages_config.NAMESPACE_PACKAGES``
this script looks for ``packages/{package_name}/docs/_on_doc_generation.py``. If the
file exists, it is imported and its ``generate_docs_assets(context)`` function is
called with a :class:`PackageDocsContext` describing the package's paths.
"""

import argparse
from dataclasses import dataclass
import importlib.util
from pathlib import Path
import sys
from types import ModuleType
from typing import Callable, Iterable, List, Optional, Sequence, Set, Tuple


@dataclass(frozen=True)
class PackageDocsContext:
    """Immutable bundle of paths handed to a package's docs asset hook.

    Attributes:
        project_root: Repository root (the directory containing ``packages/``).
        namespace_package: Dotted namespace package name as configured.
        package_name: Last dotted component of ``namespace_package``.
        package_root: ``project_root / "packages" / package_name``.
        docs_root: ``package_root / "docs"``.
        generated_dir: ``docs_root / "_generated"``; created before the hook runs.
    """

    project_root: Path
    namespace_package: str
    package_name: str
    package_root: Path
    docs_root: Path
    generated_dir: Path


HookFunction = Callable[[PackageDocsContext], None]
# A ``.gitignore`` containing ``*`` ignores everything in its directory, itself included.
_GENERATED_ASSET_GITIGNORE = "*\n"


def _leaf_name(namespace_package: str) -> str:
    """Return the last dotted component of a namespace package name."""
    return namespace_package.split(".")[-1]


def _load_hook_module(hook_path: Path, package_name: str) -> ModuleType:
    """Import the hook file at ``hook_path`` under a package-specific module name.

    Args:
        hook_path: Path of the ``_on_doc_generation.py`` file to import.
        package_name: Package name used to build a unique module name.

    Returns:
        The executed hook module.

    Raises:
        ImportError: If no import spec or loader can be created for ``hook_path``.
        Exception: Whatever the hook's module-level code raises while executing.
    """
    module_name = f"_accvlab_docs_assets_{package_name}"

    spec = importlib.util.spec_from_file_location(module_name, hook_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Could not create import spec for docs asset hook: {hook_path}")
    module = importlib.util.module_from_spec(spec)

    # Register before executing (as in the importlib "import a source file
    # directly" recipe) so code that looks its own module up in ``sys.modules``
    # (dataclasses with string annotations, pickle, inspect) works in the hook.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(module_name, None)
        raise

    return module


def _get_hook_function(module: ModuleType, hook_path: Path) -> HookFunction:
    """Return the module's ``generate_docs_assets`` callable.

    Args:
        module: The imported hook module.
        hook_path: Path of the hook file; only used in the error message.

    Raises:
        AttributeError: If the attribute is missing or not callable.
    """
    hook_function = getattr(module, "generate_docs_assets", None)
    if not callable(hook_function):
        raise AttributeError(
            f"Docs asset hook must define a callable generate_docs_assets(context): {hook_path}"
        )
    return hook_function


def _prepare_generated_dir(context: PackageDocsContext) -> None:
    """Create the package's generated docs asset directory and keep it untracked."""
    context.generated_dir.mkdir(parents=True, exist_ok=True)
    (context.generated_dir / ".gitignore").write_text(_GENERATED_ASSET_GITIGNORE, encoding="utf-8")


def _build_context(project_root: Path, namespace_package: str) -> PackageDocsContext:
    """Derive all package paths for ``namespace_package`` below ``project_root``."""
    package_name = _leaf_name(namespace_package)
    package_root = project_root / "packages" / package_name
    docs_root = package_root / "docs"
    return PackageDocsContext(
        project_root=project_root,
        namespace_package=namespace_package,
        package_name=package_name,
        package_root=package_root,
        docs_root=docs_root,
        generated_dir=docs_root / "_generated",
    )


def _generate_assets_for_package(
    *,
    project_root: Path,
    namespace_package: str,
    verbose: bool,
) -> bool:
    """Run the docs asset hook of one package, if it has one.

    Args:
        project_root: Repository root.
        namespace_package: Dotted namespace package name.
        verbose: Print progress information when true.

    Returns:
        ``True`` if a hook was found and ran, ``False`` if the package has no hook.

    Raises:
        ImportError: If the hook file cannot be imported.
        AttributeError: If the hook does not define ``generate_docs_assets``.
        Exception: Whatever the hook itself raises.
    """
    context = _build_context(project_root, namespace_package)
    hook_path = context.docs_root / "_on_doc_generation.py"
    if not hook_path.exists():
        if verbose:
            print(f"No docs asset hook for {context.package_name}")
        return False

    if verbose:
        print(f"Running docs asset hook for {context.package_name}: {hook_path}")
    module = _load_hook_module(hook_path, context.package_name)
    # Validate the hook before touching the file system so that an invalid hook
    # does not leave a stray ``_generated/`` directory behind.
    hook_function = _get_hook_function(module, hook_path)
    _prepare_generated_dir(context)
    hook_function(context)
    print(f"Generated docs assets for {context.package_name}")
    return True


def _select_namespace_packages(
    namespace_packages: Iterable[str],
    package_filter: Set[str],
) -> Tuple[List[str], List[str]]:
    """Apply the ``--package`` filter to the configured namespace packages.

    Args:
        namespace_packages: Configured dotted namespace package names.
        package_filter: Requested package (leaf) names; empty selects everything.

    Returns:
        ``(selected, missing)``. ``selected`` keeps the configured order.
        ``missing`` is the sorted list of requested names that match no
        configured package.
    """
    selected = [
        namespace_package
        for namespace_package in namespace_packages
        if not package_filter or _leaf_name(namespace_package) in package_filter
    ]
    # Compare names, not counts: two namespace packages may share a leaf name,
    # which makes a length comparison both miss unknown names and report
    # spurious errors.
    found_package_names = {_leaf_name(namespace_package) for namespace_package in selected}
    missing = sorted(package_filter - found_package_names)
    return selected, missing


def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        argv: Argument list to parse; ``None`` makes argparse use ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Run optional package-local documentation asset generation hooks.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose output.",
    )
    parser.add_argument(
        "--package",
        dest="package_names",
        action="append",
        help="Package name to process, such as lane_helpers. Can be passed more than once.",
    )
    return parser.parse_args(argv)


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the docs asset hooks of all (or the selected) namespace packages.

    Args:
        argv: Argument list to parse; ``None`` uses ``sys.argv[1:]``.

    Returns:
        Process exit code: ``0`` on success, ``1`` if the package configuration
        cannot be imported, an unknown package was requested, or a hook failed.
    """
    args = _parse_args(argv)
    docs_dir = Path(__file__).resolve().parent
    project_root = docs_dir.parent
    # namespace_packages_config.py is imported from the project root.
    sys.path.insert(0, str(project_root))

    try:
        from namespace_packages_config import NAMESPACE_PACKAGES
    except ImportError as exc:
        print(
            f"Error: Could not import NAMESPACE_PACKAGES from namespace_packages_config.py: {exc}",
            file=sys.stderr,
        )
        return 1

    package_filter = set(args.package_names or [])
    namespace_packages, missing_package_names = _select_namespace_packages(
        NAMESPACE_PACKAGES, package_filter
    )
    if missing_package_names:
        print(f"Error: Unknown namespace package(s): {', '.join(missing_package_names)}", file=sys.stderr)
        return 1

    hook_count = 0
    for namespace_package in namespace_packages:
        package_name = _leaf_name(namespace_package)
        try:
            hook_ran = _generate_assets_for_package(
                project_root=project_root,
                namespace_package=namespace_package,
                verbose=args.verbose,
            )
        except Exception as exc:
            # Top-level boundary: report any hook failure and stop the docs build.
            print(f"Error: docs asset generation failed for {package_name}: {exc}", file=sys.stderr)
            return 1
        if hook_ran:
            hook_count += 1

    if args.verbose:
        print(f"Ran {hook_count} package docs asset hook(s).")
    return 0


if __name__ == "__main__":
    sys.exit(main())

# ---------------------------------------------------------------------------
# Patch text that followed this module on the same source line (start of the
# next file's diff), kept verbatim:
#
# diff --git a/docs/guides/DEVELOPMENT_GUIDE.md b/docs/guides/DEVELOPMENT_GUIDE.md
# index ffb1f5a..08077d4 100644
# --- a/docs/guides/DEVELOPMENT_GUIDE.md
# +++ b/docs/guides/DEVELOPMENT_GUIDE.md
# @@ -48,7 +48,8 @@ There are two example projects which showcase how a namespace package is structu
#  - `packages/example_package`: Showcases a package containing PyTorch extensions built using
`CppExtension` and `CUDAExtension` provided by PyTorch as well as an external implementation (see [External Implementations](#external-implementations) section for more details on external implementations) - as described below. + as described below. It also includes a package-local documentation asset hook that generates a simple plot + from committed CSV data under `evaluation_results/` during the docs build. - `packages/example_skbuild_package`: Showcases a package using `scikit-build` for C++/CUDA implementation (see the [Alternative: SKBuild-Based Packages](#alternative-skbuild-based-packages) section for more details on this approach). @@ -70,6 +71,8 @@ To add a new namespace package (e.g., `example_package`), you need to create: | **Setup** | `packages/example_package/setup.py` | Package build configuration | | **Project Config** | `packages/example_package/pyproject.toml` | Modern Python project configuration and authoritative dependency definition | | **Documentation include list (optional)** | `packages/example_package/docu_referenced_dirs.txt` | List additional directories referenced by the docs (besides `docs/`). See [Documentation Setup Guide](DOCUMENTATION_SETUP_GUIDE.md) for more details.| +| **Documentation asset hook (optional)** | `packages/example_package/docs/_on_doc_generation.py` | Generate package-owned docs assets such as plots from committed evaluation data. See [Documentation Setup Guide](DOCUMENTATION_SETUP_GUIDE.md#package-local-generated-assets). | +| **Evaluation results (optional)** | `packages/example_package/evaluation_results/` | Package-owned committed inputs for generating docs assets, such as data to plot. | > **ℹ️ Note**: Apart from the above, further folders/files can be included (and made use of manually or added to the > documentation) if needed. A typical use case is to include e.g. 
an `examples` directory which is: @@ -84,26 +87,29 @@ The following diagram shows the relevant project structure containing the folder ``` accvlab/ -├── packages/ # Namespace packages directory +├── packages/ # Namespace packages directory │ ├── optim_test_tools/... │ ├── batching_helpers/... -│ └── example_package/ # ← New namespace package -│ ├── accvlab/ # ← Namespace root -│ │ └── example_package/ # ← Implementation for "example_package" package +│ └── example_package/ # ← New namespace package +│ ├── accvlab/ # ← Namespace root +│ │ └── example_package/ # ← Implementation for "example_package" package │ │ ├── __init__.py -│ │ ├── csrc/ # ← C++/CUDA sources -│ │ └── include/ # ← Headers -│ ├── ext_impl/ # ← Optional: external implementation +│ │ ├── csrc/ # ← C++/CUDA sources +│ │ └── include/ # ← Headers +│ ├── ext_impl/ # ← Optional: external implementation │ │ ├── build_and_copy.sh │ │ └── ... -│ ├── tests/ # ← Tests for "example_package" package -│ ├── docs/ # ← Documentation for "example_package" package -│ ├── setup.py # ← Package build configuration -│ ├── pyproject.toml # ← Project configuration (including dependencies) -│ └── docu_referenced_dirs.txt # ← Optional: list additional directories referenced by the docs (besides `docs/`) -├── build_config/ # Shared build utilities -├── docs/ # Main documentation -└── namespace_packages_config.py # ← Namespace package needs to be listed here +│ ├── tests/ # ← Tests for "example_package" package +│ ├── evaluation_results/ # ← Optional committed inputs for generated docs assets +│ ├── docs/ # ← Documentation for "example_package" package +│ │ ├── _on_doc_generation.py # ← Optional docs asset hook +│ │ └── ... 
+│ ├── setup.py # ← Package build configuration +│ ├── pyproject.toml # ← Project configuration (including dependencies) +│ └── docu_referenced_dirs.txt # ← Optional: list additional directories referenced by the docs (besides `docs/`) +├── build_config/ # Shared build utilities +├── docs/ # Main documentation +└── namespace_packages_config.py # ← Namespace package needs to be listed here ``` Note that inside the package, there is the directory structure `accvlab/example_package`. This is where the @@ -238,6 +244,11 @@ root = "../.." Use this pattern for your own namespace package, adapting the dependency names as needed. +Use `[project.optional-dependencies].optional` for dependencies needed by tests, examples, or package-local +documentation asset hooks, but not by the core package at runtime. For example, if a docs hook generates plots +from committed data, put the plotting library in the package's optional dependencies rather than in the base +`[project].dependencies`. + > **ℹ️ Note**: The `accvlab-build-config @ file:../../build_config` build dependency is intentionally a > local path reference. From a package under `packages//`, it resolves to the repository's `build_config/` package > so isolated pip builds use the local helper package. See @@ -317,6 +328,18 @@ Most of the contained packages extend this basic structure considerably to provi documentation. Please see the [Documentation Setup Guide](DOCUMENTATION_SETUP_GUIDE.md) for more details on the documentation system and how to set it up. +If your package needs generated docs assets, add `packages//docs/_on_doc_generation.py`. The +documentation build creates `packages//docs/_generated/`, keeps it untracked, and passes that +directory to the hook. Keep user-facing `.rst`/`.md` files static and reference generated assets with relative +paths such as `_generated/.png`. The hook should generate those assets from committed inputs and +fail clearly if required inputs are missing. 
Store committed plot or evaluation inputs outside the package +`docs/` folder, for example under `packages//evaluation_results/`, so Sphinx does not discover +data tables as standalone documentation pages. + +> **⚠️ Important**: Documentation asset hooks must not run evaluations, benchmarks, or other measurement +> workflows. They should only regenerate documentation assets, such as plots, from data that is already +> available in the repository. + #### 8. Test Your Package ```bash @@ -352,6 +375,10 @@ When adding a new namespace package, ensure you have: - [ ] **Documentation**: Generated with docs scripts and customized intro - [ ] **Documentation include list (optional)**: `docu_referenced_dirs.txt` created and populated if extra folders (e.g. `examples/`) are referenced and are needed to build the documentation +- [ ] **Documentation asset hook (optional)**: `_on_doc_generation.py` added if the package needs generated + documentation assets +- [ ] **Evaluation results (optional)**: `packages//evaluation_results/` contains committed + inputs for generated docs assets if needed - [ ] **Examples (optional)**: `packages//examples/` created and referenced from docs if used - [ ] **Dependencies**: Declared runtime and optional dependencies in `pyproject.toml` - [ ] **External implementation**: (Optional) `packages//ext_impl/` for external builds diff --git a/docs/guides/DOCUMENTATION_SETUP_GUIDE.md b/docs/guides/DOCUMENTATION_SETUP_GUIDE.md index 521725f..667ce0e 100644 --- a/docs/guides/DOCUMENTATION_SETUP_GUIDE.md +++ b/docs/guides/DOCUMENTATION_SETUP_GUIDE.md @@ -9,6 +9,7 @@ The documentation system provides: - **Explicit namespace package configuration** through `namespace_packages_config.py` - **Dynamic documentation generation** for each configured namespace package +- **Optional package-local asset generation** for generated documentation assets such as plots - **Comprehensive API reference** with auto-generated content (extracted from docstrings) - 
**Referenced directories mirroring** to access files from the individual namespace packages in the documentation by @@ -43,6 +44,10 @@ The documentation generation makes use of multiple scripts: - **Template-based**: Uses consistent templates for all namespace packages (but generated files may be modified as needed) - **Safe regeneration**: Only creates missing files if no `index.rst` is present for the namespace package +- **`generate_package_docs_assets.py`**: Runs optional package-local documentation asset hooks + - **Package-owned**: Each package can decide whether it needs generated assets and how to create them + - **Format-agnostic**: The hook can read any package-owned input files and write any output files in the output folder; + The core docs system does not prescribe a data format - **`update_docs_index.py`**: Updates main index file by including references to newly added namespace packages - **`mirror_referenced_dirs.py`**: Mirrors (symlinks by default) the `docs` directory and other needed @@ -64,27 +69,28 @@ The documentation generation makes use of multiple scripts: #### Main Documentation Directory (`docs/`) ``` docs/ -├── conf.py # Sphinx configuration using namespace_packages_config -├── index.rst # Main documentation index +├── conf.py # Sphinx configuration using namespace_packages_config +├── index.rst # Main documentation index ├── generate_new_namespace_package_docs.py # Creates structure for new namespace packages -├── update_docs_index.py # Updates navigation and indices -├── mirror_referenced_dirs.py # Mirrors referenced directories (symlinks by default) -├── sync_root_readme_for_docs.py # Syncs project root README into docs/project_overview -├── Makefile # Build commands -├── requirements.txt # Documentation dependencies -├── project_overview/ # Synced copy of the project root README used as docs overview -├── contained_package_docs_mirror/ # Mirrored package documentation via symlinks (or copies) -│ ├── example_package/ # Example 
namespace package docs (representative) -│ │ ├── docs/ # Documentation files -│ │ │ ├── index.rst # Namespace package overview -│ │ │ ├── intro.rst # Introduction (manual content) -│ │ │ └── api.rst # API reference (auto-generated) -│ │ └── examples/ # Additional mirrored directory (referenced in docs) -│ └── [other_packages]/ # Other configured namespace packages -├── common/ # Shared documentation resources +├── generate_package_docs_assets.py # Runs optional package-local docs asset hooks +├── update_docs_index.py # Updates navigation and indices +├── mirror_referenced_dirs.py # Mirrors referenced directories (symlinks by default) +├── sync_root_readme_for_docs.py # Syncs project root README into docs/project_overview +├── Makefile # Build commands +├── requirements.txt # Documentation dependencies +├── project_overview/ # Synced copy of the project root README used as docs overview +├── contained_package_docs_mirror/ # Mirrored package documentation via symlinks (or copies) +│ ├── example_package/ # Example namespace package docs (representative) +│ │ ├── docs/ # Documentation files +│ │ │ ├── index.rst # Namespace package overview +│ │ │ ├── intro.rst # Introduction (manual content) +│ │ │ └── api.rst # API reference (auto-generated) +│ │ └── examples/ # Additional mirrored directory (referenced in docs) +│ └── [other_packages]/ # Other configured namespace packages +├── common/ # Shared documentation resources ├── _static/css/ -│ └── custom.css # Custom styling -└── _build/ # Built documentation output +│ └── custom.css # Custom styling +└── _build/ # Built documentation output ``` **Notes**: @@ -103,7 +109,10 @@ packages/ ├── docs/ # Source documentation files │ ├── index.rst # Namespace package overview │ ├── intro.rst # Introduction (manual content) - │ └── api.rst # API reference (auto-generated) + │ ├── api.rst # API reference (auto-generated) + │ ├── _on_doc_generation.py # Optional package-local docs asset hook + │ └── _generated/ # Generated assets 
created at docs build time + ├── evaluation_results/ # Optional committed inputs for generated docs assets ├── docu_referenced_dirs.txt # List of additional directories to copy ├── examples/ # Example code (mirrored and referenced by docs) └── [other_dirs]/ # Other package directories @@ -112,6 +121,9 @@ packages/ **Notes**: - The `packages/example_package/` structure shows the source documentation that gets mirrored during build +- The `example_package` includes a small generated plot example: committed CSV data under + `packages/example_package/evaluation_results/` is converted into an image under + `packages/example_package/docs/_generated/` during the docs build - **⚠️ Important**: Content should be edited in the source locations (`packages//docs/`), not in the mirrored locations - In case of the `example_package`, the `examples/` directory is mirrored to maintain documentation references @@ -172,6 +184,65 @@ etc.) can still be found after the documentation is mirrored to the build locati - Only list additional directories that are referenced by your documentation. Note that the API documentation does not rely on this mirroring, but is extracted from the installed packages. +### Package-Local Generated Assets + +Packages can generate documentation assets during the docs build by adding an optional hook: + +```text +packages//docs/_on_doc_generation.py +``` + +If present, `generate_package_docs_assets.py` imports the hook and calls: + +```python +def generate_docs_assets(context): + ... +``` + +The hook receives a context with package and documentation paths, including: + +- `context.project_root` +- `context.package_root` +- `context.docs_root` +- `context.generated_dir` + +The docs asset generator creates `context.generated_dir` before calling the hook. This directory is always: + +```text +packages//docs/_generated/ +``` + +It also writes a local `.gitignore` file there so generated assets remain untracked. 
The hook should write +generated images or other generated files directly into `context.generated_dir`, or into subdirectories below +it if a package needs additional structure. + +Source documentation files remain static. For example, an `.rst` file can reference a generated image with a +normal relative path: + +```rst +.. figure:: _generated/runtime_plot.png + :alt: Runtime plot +``` + +Packages own the input data and generation logic. For example, a package can commit benchmark result tables +under `packages/<package_name>/evaluation_results/` and generate plots from those tables during the docs +build. If a generated asset is required by the static docs, the hook should fail with a clear error when the +required input data is missing or malformed. + +> **⚠️ Important**: Documentation asset hooks must not run evaluations, benchmarks, or other measurement +> workflows. They should only regenerate documentation assets, such as plots, from data that is already +> available in the repository. It is recommended to store results in simple formats such as `.csv` or `.md`, +> and use those as the source of truth for the plots. +> +> Keep committed plot or evaluation inputs outside the package `docs/` folder, for example under +> `packages/<package_name>/evaluation_results/`. This prevents Sphinx from discovering e.g. `.md` data tables as +> standalone documentation pages while keeping the inputs package-local. + +Package-specific dependencies needed only by the hook should be declared in that package's optional +dependencies in `pyproject.toml`. The default local installation path (`./scripts/install_local.sh`) installs +optional package dependencies. If you build docs after installing packages without optional dependencies, +package-local asset hooks may fail when their optional plotting or parsing dependencies are missing. 
+ ### Building Documentation Locally **Quick build using the script** (can be run from any directory, example shows running from the project @@ -208,6 +279,9 @@ make livehtml in sequence - The `html` target ensures all scripts run before building - The `livehtml` target also runs the scripts for development builds +- Package-local docs asset hooks run before package docs are mirrored, so generated assets under + `packages//docs/_generated/` are available from both the package docs source tree and the + mirrored docs tree. - When running spelling via the script, the generation scripts are executed first to ensure mirrored package docs are up to date. Spelling findings are written to `docs/_build/spelling/output.txt`. @@ -220,6 +294,9 @@ make livehtml > - It does **not** reinstall or rebuild packages for you. This means that if you change the docstrings in > the source tree of a package, you need to reinstall the package (for example via > `./scripts/install_local.sh`) and then restart `make livehtml` to see updated docstrings. +> - It does **not** rerun package-local docs asset hooks for you after startup. This means that if you change +> committed plot data or hook code, you need to restart `make livehtml` (or run `make generate`) to +> regenerate plots and other generated docs assets. ### Spell-checking @@ -456,6 +533,10 @@ the per-package runtime dependencies defined in each package's `pyproject.toml`) - Theme packages - Other documentation-specific dependencies +Package-specific docs asset dependencies belong to the corresponding package's optional dependencies. This keeps the +global documentation requirements focused on the Sphinx build itself while allowing package-owned hooks to declare their +own plotting or data-processing dependencies. 
+ ### File Descriptions #### Core Configuration Files @@ -478,6 +559,12 @@ automatically as part of the docs build; you normally do not need to run them ma - **`packages//docs/index.rst`**: Namespace package overview (source) - **`packages//docs/intro.rst`**: Manual introduction content (source) - **`packages//docs/api.rst`**: Auto-generated API reference (source) +- **`packages//docs/_on_doc_generation.py`**: Optional hook for package-local generated docs + assets +- **`packages//docs/_generated/`**: Generated documentation assets created during docs + generation and ignored by Git +- **`packages//evaluation_results/`**: Optional package-owned committed inputs for generated + docs assets, such as benchmark tables used for plots - **`packages//docu_referenced_dirs.txt`**: List of directories containing files used in the documentation in addition to `docs` (to mirror into the documentation source directory). - **`docs/contained_package_docs_mirror//docs/`**: Mirrored documentation (symlink to the diff --git a/docs/guides/INSTALLATION_GUIDE.md b/docs/guides/INSTALLATION_GUIDE.md index df0fe2e..b9fe12f 100644 --- a/docs/guides/INSTALLATION_GUIDE.md +++ b/docs/guides/INSTALLATION_GUIDE.md @@ -53,10 +53,11 @@ dependencies (needed for some tests and examples), pass the `--optional` flag ex ./scripts/package_manager.sh install -e --optional ``` -> **⚠️ Important**: Installing with optional dependencies is required if you plan to run the contained -> tests, as they rely on optional dependencies such as `pytest` (and possibly other dependencies). It may be -> also required for the contained examples, as they may use additional packages which are otherwise -> not used in the core library. +> **⚠️ Important**: Installing with optional dependencies is required for workflows that rely on packages +> outside the core library, including contained tests, contained examples, and documentation generation. 
+> Documentation generation may run package-local asset hooks, for example to regenerate plots from committed +> data, and those hooks can require plotting or data-processing packages. Tests commonly require tools such as +> `pytest` and may require further packages. The package manager script: - Automatically installs the required `accvlab_build_config` helper package (see the `build_config` directory diff --git a/docs/index.rst b/docs/index.rst index 273cc02..6492146 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -71,6 +71,7 @@ Please see the documentation of each namespace package for usage instructions (a contained_package_docs_mirror/batching_helpers/docs/index contained_package_docs_mirror/multi_tensor_copier/docs/index contained_package_docs_mirror/dali_pipeline_framework/docs/index + contained_package_docs_mirror/lane_helpers/docs/index contained_package_docs_mirror/draw_heatmap/docs/index contained_package_docs_mirror/optim_test_tools/docs/index diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index bee6c51..bda1b02 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -212,3 +212,7 @@ ABI aggregator multimodal cubin +Polyline +polyline +Polylines +polylines diff --git a/namespace_packages_config.py b/namespace_packages_config.py index a94ab18..aaf9d3d 100644 --- a/namespace_packages_config.py +++ b/namespace_packages_config.py @@ -27,6 +27,7 @@ 'accvlab.batching_helpers', 'accvlab.multi_tensor_copier', 'accvlab.dali_pipeline_framework', + 'accvlab.lane_helpers', 'accvlab.draw_heatmap', 'accvlab.optim_test_tools', # Add new namespace packages in the same way as above diff --git a/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h b/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h index 9e66681..4df9745 100644 --- a/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h +++ 
b/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h @@ -64,48 +64,48 @@ #define DISPATCH_INDEX_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_INDEX_TYPES(__VA_ARGS__)) -#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CPU(x) AT_ASSERTM(x.is_cpu(), #x " must be a CPU tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_SAME_CUDA_DEVICE(tensors_list...) \ - { \ - const std::vector tensors = {tensors_list}; \ - CHECK_CUDA(tensors[0]); \ - const auto& device = tensors[0].device(); \ - for (size_t i = 1; i < tensors.size(); ++i) { \ - AT_ASSERTM(tensors[i].device() == device, "All input tensors must be on the same device"); \ - } \ +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CPU(x) TORCH_CHECK(x.is_cpu(), #x " must be a CPU tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_SAME_CUDA_DEVICE(tensors_list...) \ + { \ + const std::vector tensors = {tensors_list}; \ + CHECK_CUDA(tensors[0]); \ + const auto& device = tensors[0].device(); \ + for (size_t i = 1; i < tensors.size(); ++i) { \ + TORCH_CHECK(tensors[i].device() == device, "All input tensors must be on the same device"); \ + } \ } -#define CHECK_SAME_DTYPE(error_msg, tensors_list...) \ - { \ - const std::vector tensors = {tensors_list}; \ - for (size_t i = 1; i < tensors.size(); ++i) { \ - AT_ASSERTM(tensors[i].scalar_type() == tensors[0].scalar_type(), error_msg); \ - } \ +#define CHECK_SAME_DTYPE(error_msg, tensors_list...) 
\ + { \ + const std::vector tensors = {tensors_list}; \ + for (size_t i = 1; i < tensors.size(); ++i) { \ + TORCH_CHECK(tensors[i].scalar_type() == tensors[0].scalar_type(), error_msg); \ + } \ } -#define CHECK_SIZE_MATCH(tensor1, tensor2) \ - { \ - /* If the tensors are empty, the actual sizes are not relevant */ \ - if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) { \ - AT_ASSERTM((tensor1).dim() == (tensor2).dim(), \ - #tensor1 " and " #tensor2 " must have the same number of dimensions"); \ - for (size_t i = 0; i < (tensor1).dim(); ++i) { \ - AT_ASSERTM((tensor1).size(i) == (tensor2).size(i), \ - #tensor1 " and " #tensor2 " must have the same size"); \ - } \ - } \ +#define CHECK_SIZE_MATCH(tensor1, tensor2) \ + { \ + /* If the tensors are empty, the actual sizes are not relevant */ \ + if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) { \ + TORCH_CHECK((tensor1).dim() == (tensor2).dim(), \ + #tensor1 " and " #tensor2 " must have the same number of dimensions"); \ + for (size_t i = 0; i < (tensor1).dim(); ++i) { \ + TORCH_CHECK((tensor1).size(i) == (tensor2).size(i), \ + #tensor1 " and " #tensor2 " must have the same size"); \ + } \ + } \ } #define CHECK_SIZE_MATCH_FIRST_DIMS(tensor1, tensor2, num_dims_to_check) \ { \ /* If the tensors are empty, the actual sizes are not relevant */ \ if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) { \ - AT_ASSERTM((tensor1).dim() >= (num_dims_to_check) && (tensor2).dim() >= (num_dims_to_check), \ - #tensor1 " and " #tensor2 " must have at least " + \ - std::to_string(num_dims_to_check) + " dimensions"); \ + TORCH_CHECK((tensor1).dim() >= (num_dims_to_check) && (tensor2).dim() >= (num_dims_to_check), \ + #tensor1 " and " #tensor2 " must have at least " + \ + std::to_string(num_dims_to_check) + " dimensions"); \ for (size_t i = 0; i < (num_dims_to_check); ++i) { \ - AT_ASSERTM( \ + TORCH_CHECK( \ (tensor1).size(i) == (tensor2).size(i), \ #tensor1 " and " #tensor2 " must have the same size in dimension " + 
std::to_string(i)); \ } \ @@ -116,35 +116,35 @@ { \ /* If the tensors are empty, the actual sizes are not relevant */ \ if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) { \ - AT_ASSERTM((tensor1).dim() == (tensor2).dim(), \ - #tensor1 " and " #tensor2 " must have the same number of dimensions"); \ + TORCH_CHECK((tensor1).dim() == (tensor2).dim(), \ + #tensor1 " and " #tensor2 " must have the same number of dimensions"); \ for (size_t i = 0; i < (tensor1).dim(); ++i) { \ if (i == (dim_to_allow_mismatch)) { \ continue; \ } \ - AT_ASSERTM( \ + TORCH_CHECK( \ (tensor1).size(i) == (tensor2).size(i), \ #tensor1 " and " #tensor2 " must have the same size in dimension " + std::to_string(i)); \ } \ } \ } -#define CHECK_NUM_DIMS(tensor, num_dims) \ - { \ - /* If the tensor is empty, the number of dimensions is not relevant */ \ - if (!((tensor).numel() == 0)) { \ - AT_ASSERTM((tensor).dim() == (num_dims), \ - #tensor " must have " + std::to_string(num_dims) + " dimensions"); \ - } \ +#define CHECK_NUM_DIMS(tensor, num_dims) \ + { \ + /* If the tensor is empty, the number of dimensions is not relevant */ \ + if (!((tensor).numel() == 0)) { \ + TORCH_CHECK((tensor).dim() == (num_dims), \ + #tensor " must have " + std::to_string(num_dims) + " dimensions"); \ + } \ } -#define CHECK_NUM_DIMS_AT_LEAST(tensor, num_dims) \ - { \ - /* If the tensor is empty, the number of dimensions is not relevant */ \ - if (!((tensor).numel() == 0)) { \ - AT_ASSERTM((tensor).dim() >= (num_dims), \ - #tensor " must have at least " + std::to_string(num_dims) + " dimensions"); \ - } \ +#define CHECK_NUM_DIMS_AT_LEAST(tensor, num_dims) \ + { \ + /* If the tensor is empty, the number of dimensions is not relevant */ \ + if (!((tensor).numel() == 0)) { \ + TORCH_CHECK((tensor).dim() >= (num_dims), \ + #tensor " must have at least " + std::to_string(num_dims) + " dimensions"); \ + } \ } static inline int64_t get_number_data_elements_per_index(const torch::Tensor& input_data, diff --git 
a/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu b/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu index ed5e9c4..85d5d38 100644 --- a/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu +++ b/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu @@ -20,8 +20,8 @@ #include #include -#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x); @@ -69,11 +69,11 @@ void draw_heatmap_launcher(at::Tensor& heatmap, const at::Tensor& centers, const CHECK_INPUT(radii); CHECK_INPUT(heatmap_idxes); - AT_ASSERTM(centers.size(0) == radii.size(0), "centers and radii must have the same size at dim0"); - AT_ASSERTM(centers.size(0) == heatmap_idxes.size(0), - "centers and heatmap_idxes must have the same size at dim0"); - AT_ASSERTM(heatmap.dim() == 3, "heatmap must be of shape [num_heatmaps, height, width]"); - AT_ASSERTM(centers.dim() == 2 && centers.size(1) == 2, "centers must be of shape [num_targets, 2]"); + TORCH_CHECK(centers.size(0) == radii.size(0), "centers and radii must have the same size at dim0"); + TORCH_CHECK(centers.size(0) == heatmap_idxes.size(0), + "centers and heatmap_idxes must have the same size at dim0"); + TORCH_CHECK(heatmap.dim() == 3, "heatmap must be of shape [num_heatmaps, height, width]"); + TORCH_CHECK(centers.dim() == 2 && centers.size(1) == 2, "centers must be of shape [num_targets, 2]"); const int num_targets = centers.size(0); const int num_heatmaps = heatmap.size(0); @@ -101,15 +101,15 @@ void draw_heatmap_batched_launcher(at::Tensor& heatmap, const at::Tensor& center const int batch_size = heatmap.size(0); const int num_targets = radii.size(1); - 
AT_ASSERTM( + TORCH_CHECK( batch_size == radii.size(0) && batch_size == centers.size(0) && batch_size == nums_targets.size(0), "batch_size (dim 0) need to be the same for all inputs"); - AT_ASSERTM(num_targets == centers.size(1), - "maximum number of targets (dim 1) need to be the same centers and radii"); - AT_ASSERTM(heatmap.dim() == 3, "heatmap must be of shape [batch_size, height, width]"); - AT_ASSERTM(centers.dim() == 3 && centers.size(2) == 2, - "centers must be of shape [batch_size, num_targets, 2]"); - AT_ASSERTM(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]"); + TORCH_CHECK(num_targets == centers.size(1), + "maximum number of targets (dim 1) need to be the same centers and radii"); + TORCH_CHECK(heatmap.dim() == 3, "heatmap must be of shape [batch_size, height, width]"); + TORCH_CHECK(centers.dim() == 3 && centers.size(2) == 2, + "centers must be of shape [batch_size, num_targets, 2]"); + TORCH_CHECK(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]"); const int height = heatmap.size(1); const int width = heatmap.size(2); @@ -138,23 +138,23 @@ void draw_heatmap_batched_classwise_launcher(at::Tensor& heatmap, const at::Tens const int batch_size = heatmap.size(0); const int num_targets = radii.size(1); - AT_ASSERTM( + TORCH_CHECK( batch_size == radii.size(0) && batch_size == centers.size(0) && batch_size == nums_targets.size(0), "batch_size (dim 0) need to be the same for all inputs"); - AT_ASSERTM(num_targets == centers.size(1), - "maximum number of targets (dim 1) need to be the same centers and radii"); - AT_ASSERTM(heatmap.dim() == 4, "heatmap must be of shape [batch_size, max_num_classes, height, width]"); - AT_ASSERTM(centers.dim() == 3 && centers.size(2) == 2, - "centers must be of shape [batch_size, num_targets, 2]"); - AT_ASSERTM(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]"); + TORCH_CHECK(num_targets == centers.size(1), + "maximum number of targets (dim 1) need to be the same centers 
and radii"); + TORCH_CHECK(heatmap.dim() == 4, "heatmap must be of shape [batch_size, max_num_classes, height, width]"); + TORCH_CHECK(centers.dim() == 3 && centers.size(2) == 2, + "centers must be of shape [batch_size, num_targets, 2]"); + TORCH_CHECK(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]"); const int height = heatmap.size(2); const int width = heatmap.size(3); const int max_num_classes = heatmap.size(1); // Validate labels tensor shape and range before launching the kernel - AT_ASSERTM(labels.dim() == 2, "labels must be of shape [batch_size, radii.size(1)]"); - AT_ASSERTM(labels.size(0) == batch_size && labels.size(1) == num_targets, - "labels shape must be [batch_size, radii.size(1)]"); + TORCH_CHECK(labels.dim() == 2, "labels must be of shape [batch_size, radii.size(1)]"); + TORCH_CHECK(labels.size(0) == batch_size && labels.size(1) == num_targets, + "labels shape must be [batch_size, radii.size(1)]"); AT_DISPATCH_FLOATING_TYPES( heatmap.scalar_type(), "draw_heatmap_cuda_batched", ([&] { draw_heatmap_batched_cuda( diff --git a/packages/example_package/docs/_on_doc_generation.py b/packages/example_package/docs/_on_doc_generation.py new file mode 100644 index 0000000..28bb331 --- /dev/null +++ b/packages/example_package/docs/_on_doc_generation.py @@ -0,0 +1,59 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Committed input data (relative to the package root) and the generated image name
# (relative to the package's generated docs asset directory).
_DATA_FILE = Path("evaluation_results") / "simple_plot.csv"
_OUTPUT_FILE = "simple_plot.png"


def _read_plot_data(input_file: Path) -> tuple[list[float], list[float]]:
    """Read the ``x``/``y`` columns of the example CSV file.

    Args:
        input_file: CSV file whose header row must be exactly ``x,y``.

    Returns:
        Two lists of equal length with the parsed ``x`` and ``y`` values, in file order.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        ValueError: If the header is not ``x,y``, if a data row is incomplete or not numeric,
            or if the file contains no data rows.
    """
    if not input_file.exists():
        raise FileNotFoundError(f"Required example plot input data is missing: {input_file}")

    x_values: list[float] = []
    y_values: list[float] = []
    with input_file.open("r", encoding="utf-8", newline="") as csv_file:
        reader = csv.DictReader(csv_file)
        if reader.fieldnames != ["x", "y"]:
            raise ValueError(f"Expected CSV columns 'x,y' in {input_file}")
        for row in reader:
            # DictReader fills missing trailing fields with ``None``; ``float(None)`` raises
            # ``TypeError``. Report both failure modes uniformly, with the offending line.
            try:
                x_value = float(row["x"])
                y_value = float(row["y"])
            except (TypeError, ValueError) as err:
                raise ValueError(
                    f"Invalid numeric data in {input_file} at line {reader.line_num}: {row}"
                ) from err
            x_values.append(x_value)
            y_values.append(y_value)

    if not x_values:
        raise ValueError(f"Expected at least one data row in {input_file}")
    return x_values, y_values


def generate_docs_assets(context: Any) -> None:
    """Render the example plot from the committed CSV data into the generated docs directory.

    Args:
        context: Docs generation context. Only ``context.package_root`` and
            ``context.generated_dir`` are read here.

    Raises:
        FileNotFoundError: If the committed CSV input is missing.
        ValueError: If the CSV input is malformed or empty.
    """
    input_file = context.package_root / _DATA_FILE
    output_file = context.generated_dir / _OUTPUT_FILE
    x_values, y_values = _read_plot_data(input_file)

    figure, axis = plt.subplots(figsize=(5.0, 3.2), constrained_layout=True)
    try:
        axis.plot(x_values, y_values, marker="o")
        axis.set_title("Generated Example Plot")
        axis.set_xlabel("x")
        axis.set_ylabel("y")
        axis.grid(True)
        figure.savefig(output_file)
    finally:
        # Release the figure even when plotting or saving fails, so a failed docs build
        # does not leave it registered with pyplot.
        plt.close(figure)
+Generated Documentation Assets +------------------------------ + +This package also demonstrates package-local documentation asset generation. The docs build reads committed +data from ``evaluation_results/simple_plot.csv`` and writes the generated plot to +``docs/_generated/simple_plot.png``. The source documentation remains static and references the generated +image using a normal relative path. + +.. figure:: _generated/simple_plot.png + :alt: Simple generated plot from committed CSV data + :align: center + :width: 70% + + Example plot generated from committed CSV data during documentation generation. + .. toctree:: :maxdepth: 2 :caption: Examples diff --git a/packages/example_package/evaluation_results/simple_plot.csv b/packages/example_package/evaluation_results/simple_plot.csv new file mode 100644 index 0000000..9675110 --- /dev/null +++ b/packages/example_package/evaluation_results/simple_plot.csv @@ -0,0 +1,6 @@ +x,y +0,0 +1,1 +2,4 +3,9 +4,16 diff --git a/packages/example_package/pyproject.toml b/packages/example_package/pyproject.toml index ef14420..cbe6014 100644 --- a/packages/example_package/pyproject.toml +++ b/packages/example_package/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ [project.optional-dependencies] optional = [ + "matplotlib", "pytest", ] diff --git a/packages/lane_helpers/accvlab/lane_helpers/__init__.py b/packages/lane_helpers/accvlab/lane_helpers/__init__.py new file mode 100644 index 0000000..e5b5150 --- /dev/null +++ b/packages/lane_helpers/accvlab/lane_helpers/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from importlib.metadata import PackageNotFoundError, version + +from . import polyline + +try: + __version__ = version("accvlab.lane_helpers") +except PackageNotFoundError: + __version__ = "0.0.0" + + +__all__ = [ + "__version__", + "polyline", +] diff --git a/packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py b/packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py new file mode 100644 index 0000000..e1ec3a1 --- /dev/null +++ b/packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def interpolate(points: torch.Tensor, distances: torch.Tensor, *, relative: bool = False) -> torch.Tensor:
    """Sample every polyline of a fixed-size batch at the requested distances.

    Args:
        points: CPU or CUDA tensor with shape ``(batch, num_points, num_dims)``.
        distances: Tensor with shape ``(batch, num_distances)`` on the same device as ``points``.
            Negative distances are clamped to the first point of the polyline, and distances past
            the total polyline length are clamped to the last point. With ``relative=True`` this
            means values below ``0`` and above ``1`` are clamped.
        relative: When ``True``, ``distances`` are fractions of each polyline's total length.
            When ``False``, they are absolute distances measured from the start of each polyline.

    Returns:
        Tensor with shape ``(batch, num_distances, num_dims)`` on the same device as ``points``.
    """
    # NOTE(review): the other wrappers in this module call underscore-prefixed backend functions
    # (``_polyline_lengths`` etc.); confirm ``polyline_interpolation`` is the intended backend name.
    return _polyline_sampling.polyline_interpolation(points, distances, relative=relative)


def lengths(points: torch.Tensor) -> torch.Tensor:
    """Return the total length of every polyline in a fixed-size batch.

    Args:
        points: CPU or CUDA tensor with shape ``(batch, num_points, num_dims)``.

    Returns:
        Tensor with shape ``(batch,)`` on the same device as ``points``.
    """
    return _polyline_sampling._polyline_lengths(points)


def interpolate_var_size_batch(
    points: RaggedBatch, distances: RaggedBatch, *, relative: bool = False
) -> RaggedBatch:
    """Sample every polyline of a variable-size batch at the requested distances.

    Args:
        points: RaggedBatch-like object with tensor data on CPU or CUDA and shape
            ``(batch, max_num_points, num_dims)``.
        distances: RaggedBatch-like object with shape ``(batch, max_num_distances)`` and tensor data
            on the same device as ``points``. Negative distances are clamped to the first point of
            the polyline, and distances past the total polyline length are clamped to the last
            point. With ``relative=True`` this means values below ``0`` and above ``1`` are clamped.
        relative: When ``True``, ``distances`` are fractions of each polyline's total length.
            When ``False``, they are absolute distances measured from the start of each polyline.

    Returns:
        RaggedBatch-like object with shape ``(batch, max_num_distances, num_dims)`` and tensor data
        on the same device as ``points``. It is created from ``distances`` and therefore carries
        the sample sizes of ``distances``.
    """
    # Layout checks: one batch dimension, with dimension 1 as the non-uniform one.
    assert points.num_batch_dims == 1, "points must have exactly one batch dimension"
    assert distances.num_batch_dims == 1, "distances must have exactly one batch dimension"
    assert (
        points.non_uniform_dim == 1
    ), "points.non_uniform_dim must be 1 for shape (batch, max_num_points, num_dims)"
    assert (
        distances.non_uniform_dim == 1
    ), "distances.non_uniform_dim must be 1 for shape (batch, max_num_distances)"

    sampled = _polyline_sampling._polyline_interpolation_var_size_batch(
        points.tensor,
        distances.tensor,
        points.sample_sizes,
        distances.sample_sizes,
        relative=relative,
    )
    return distances.create_with_sample_sizes_like_self(sampled)


def lengths_var_size_batch(points: RaggedBatch) -> torch.Tensor:
    """Return the total length of every polyline in a variable-size batch.

    Args:
        points: RaggedBatch-like object with tensor data on CPU or CUDA and shape
            ``(batch, max_num_points, num_dims)``.

    Returns:
        Tensor with shape ``(batch,)`` on the same device as ``points``.
    """
    assert points.num_batch_dims == 1, "points must have exactly one batch dimension"
    assert (
        points.non_uniform_dim == 1
    ), "points.non_uniform_dim must be 1 for shape (batch, max_num_points, num_dims)"
    return _polyline_sampling._polyline_lengths_var_size_batch(points.tensor, points.sample_sizes)
# Committed benchmark tables (relative to the package root) and the sub-directory of the
# generated docs asset directory that receives the rendered plots.
_RESULTS_SUBDIR = Path("evaluation_results") / "polyline_runtime_evaluation"
_GENERATED_IMAGE_SUBDIR = Path("polyline_runtime_evaluation")
# Batch sizes for which plots are generated.
_DOC_BATCH_SIZES = [1, 64]
# Metrics that must be available as Markdown tables for every documented batch size.
_DOC_REQUIRED_MARKDOWN_METRICS = (
    "runtime_shapely",
    "runtime_cpu",
    "runtime_cuda",
    "speedup_cpu_vs_shapely",
    "speedup_cuda_vs_shapely",
    "speedup_cuda_vs_cpu",
)
# Image files that must exist after plot generation.
_DOC_REQUIRED_IMAGE_NAMES = tuple(
    f"batch_{batch_size}_{plot_kind}_comparison.png"
    for batch_size in _DOC_BATCH_SIZES
    for plot_kind in ("runtime", "speedup")
)


def _required_markdown_paths(input_dir: Path) -> list[Path]:
    """Return the paths of all Markdown tables required to build the documented plots."""
    return [
        input_dir / f"batch_{batch_size}_{metric_name}.md"
        for batch_size in _DOC_BATCH_SIZES
        for metric_name in _DOC_REQUIRED_MARKDOWN_METRICS
    ]


def _validate_required_markdown_inputs(input_dir: Path) -> None:
    """Check that the committed Markdown inputs are complete.

    Raises:
        FileNotFoundError: If ``input_dir`` is not a directory or a required table is missing.
    """
    # ``is_dir`` also rejects a regular file that happens to sit at the expected directory path.
    if not input_dir.is_dir():
        raise FileNotFoundError(
            "Required committed Markdown input directory is missing for lane_helpers docs asset generation: "
            f"{input_dir}."
        )

    missing_inputs = [path for path in _required_markdown_paths(input_dir) if not path.exists()]
    if missing_inputs:
        missing_list = "\n".join(f"  - {path}" for path in missing_inputs)
        raise FileNotFoundError(
            "Missing required committed Markdown input file(s) for lane_helpers docs asset generation:\n"
            f"{missing_list}"
        )


def _validate_required_images(output_dir: Path) -> None:
    """Check that plot generation produced every required image.

    Raises:
        FileNotFoundError: If at least one required image is missing from ``output_dir``.
    """
    expected_outputs = [output_dir / image_name for image_name in _DOC_REQUIRED_IMAGE_NAMES]
    missing_outputs = [path for path in expected_outputs if not path.exists()]
    if missing_outputs:
        missing_list = "\n".join(f"  - {path}" for path in missing_outputs)
        raise FileNotFoundError(
            "Polyline runtime docs asset generation did not produce all images referenced by introduction.rst:\n"
            f"{missing_list}"
        )


def generate_docs_assets(context: Any) -> None:
    """Generate the runtime/speedup plots from the committed Markdown tables.

    Args:
        context: Docs generation context. Only ``context.package_root`` and
            ``context.generated_dir`` are read here.

    Raises:
        FileNotFoundError: If required Markdown inputs are missing, or if not all required
            images exist after plot generation.
    """
    input_dir = context.package_root / _RESULTS_SUBDIR
    output_dir = context.generated_dir / _GENERATED_IMAGE_SUBDIR

    _validate_required_markdown_inputs(input_dir)

    # The plotting script is not an installed module; make it importable only while it is
    # needed, so the evaluation directory does not stay at the front of ``sys.path`` and
    # shadow modules for whatever runs later in the same process.
    evaluation_path = str(context.package_root / "evaluation")
    sys.path.insert(0, evaluation_path)
    try:
        import plot_shapely_evaluation

        plot_shapely_evaluation.plot_from_markdown_directory(
            input_dir=input_dir,
            output_dir=output_dir,
            batch_sizes=_DOC_BATCH_SIZES,
            annotate_plots=True,
        )
    finally:
        try:
            sys.path.remove(evaluation_path)
        except ValueError:
            # The entry was already removed by the called code; nothing to undo.
            pass
    _validate_required_images(output_dir)
automodule:: accvlab.lane_helpers.polyline diff --git a/packages/lane_helpers/docs/example.rst b/packages/lane_helpers/docs/example.rst new file mode 100644 index 0000000..4f74766 --- /dev/null +++ b/packages/lane_helpers/docs/example.rst @@ -0,0 +1,16 @@ +Example +======= + +Polyline Interpolation +---------------------- + +The example below samples a rectangle-shaped polyline at a handful of distances. + +.. important:: + + You can run the example using the script ``packages/lane_helpers/examples/basic_usage.py``. + +.. note-literalinclude:: ../examples/basic_usage.py + :language: python + :caption: packages/lane_helpers/examples/basic_usage.py + :linenos: diff --git a/packages/lane_helpers/docs/images/polyline_sampling_illustration.png b/packages/lane_helpers/docs/images/polyline_sampling_illustration.png new file mode 100644 index 0000000..1caaf78 Binary files /dev/null and b/packages/lane_helpers/docs/images/polyline_sampling_illustration.png differ diff --git a/packages/lane_helpers/docs/index.rst b/packages/lane_helpers/docs/index.rst new file mode 100644 index 0000000..40095e5 --- /dev/null +++ b/packages/lane_helpers/docs/index.rst @@ -0,0 +1,11 @@ +Lane Helpers +============ + +This is the documentation for the ``accvlab.lane_helpers`` package. + +.. toctree:: + :maxdepth: 1 + + introduction + api + example diff --git a/packages/lane_helpers/docs/introduction.rst b/packages/lane_helpers/docs/introduction.rst new file mode 100644 index 0000000..9bf3eac --- /dev/null +++ b/packages/lane_helpers/docs/introduction.rst @@ -0,0 +1,113 @@ +Introduction +============ + +Polyline Sampling +----------------- + +Functionality +^^^^^^^^^^^^^ + +The ``lane_helpers`` package provides utilities for lane-processing workloads. + +The main functionality is batched polyline interpolation. A polyline is a sequence of points in the +space :math:`\mathbb{R}^D`, written as :math:`\mathbf{p}_i`, where each pair of consecutive points defines +one line segment. 
+ +Given sampling distances :math:`d_j` measured from the first point :math:`\mathbf{p}_0` along the +polyline, the sampling function :func:`~accvlab.lane_helpers.polyline.interpolate` returns the +corresponding sampled points :math:`\mathbf{q}_j`. + +.. figure:: images/polyline_sampling_illustration.png + :alt: Illustration of polyline sampling + :align: center + :scale: 45% + + Two-segment polyline sampled at two distances. The input points are shown as green circles, and the + sampled points are shown as red circles. + +Sampling distances do not need to be sorted. Distances can be provided either as absolute distances along +the polyline or as fractions of each polyline's total length. + +Point coordinates are not limited to 2D. The coordinate dimension is the last tensor dimension, and 2D, +3D, and higher-dimensional coordinates are supported. + +For batches with variable numbers of points or distances, use +:func:`~accvlab.lane_helpers.polyline.interpolate_var_size_batch` with +:class:`~accvlab.batching_helpers.RaggedBatch` inputs. + +Functionality to compute the total length of each polyline is also provided (through +:func:`~accvlab.lane_helpers.polyline.lengths` and :func:`~accvlab.lane_helpers.polyline.lengths_var_size_batch`). + +Runtime Evaluation +^^^^^^^^^^^^^^^^^^ + +The runtime evaluation compares batched interpolation for both CPU and CUDA against a Shapely LineString +reference over a grid of point counts, numbers of sampled distances, and batch sizes. Runtime plots report +milliseconds per interpolation call, while speedup plots report the x-fold improvement over the Shapely +reference. + +.. seealso:: + + The evaluation script is available at ``packages/lane_helpers/evaluation/shapely_evaluation.py``. It can be + used to run the benchmark sweep for different problem sizes on your target system. + +Performance depends on the batch size for both CPU and CUDA execution. 
CUDA parallelism scales with the number +of polylines in the batch, so very small batch sizes may not fully utilize the GPU. + +For practical problem sizes, it is recommended to choose the implementation based primarily on where the +tensors already live: CPU inputs should generally stay on CPU, and CUDA inputs should generally stay on CUDA. +Moving tensors only to use a different implementation can dominate the interpolation cost. + +The plots below focus on batch sizes 1 and 64 as examples. The evaluation script runs for more batch sizes by +default, and other batch sizes can be easily added. + +.. note:: + + The following measurements are intended as directional guidance. Exact runtimes depend on the used system, + with performance primarily influenced by the CPU and GPU. + + The plots shown here were generated on a system with an ``NVIDIA RTX 5000 Ada Generation`` GPU and an + ``AMD Ryzen 9 7950X`` 16-Core Processor. + +.. note:: + + In the following runtime plots, markers highlight the smallest measured problem size, the largest measured + problem size, and the 100-point/100-distance cell. + + In the speedup plots, markers highlight the smallest measured problem size and the largest speedup. If speedup is not + above 1x everywhere, they also mark representative cells near the first matching point-count and distance-count + configuration where speedup exceeds 1x. + +Batch size 1 shows behavior for the smallest batch configuration in the benchmark: + +.. figure:: _generated/polyline_runtime_evaluation/batch_1_runtime_comparison.png + :alt: Runtime comparison heatmaps for batch size 1 + :align: center + :width: 100% + + Runtime comparison for batch size 1. Rows vary the number of polyline points, and columns vary the number + of sampled distances. + +.. figure:: _generated/polyline_runtime_evaluation/batch_1_speedup_comparison.png + :alt: Speedup comparison heatmaps for batch size 1 + :align: center + :width: 100% + + Speedup comparison for batch size 1. 
+ +For larger batch sizes, CUDA can expose more parallel work and its speedup over the other methods typically +becomes more pronounced. Batch size 64 shows this behavior: + +.. figure:: _generated/polyline_runtime_evaluation/batch_64_runtime_comparison.png + :alt: Runtime comparison heatmaps for batch size 64 + :align: center + :width: 100% + + Runtime comparison for batch size 64. + +.. figure:: _generated/polyline_runtime_evaluation/batch_64_speedup_comparison.png + :alt: Speedup comparison heatmaps for batch size 64 + :align: center + :width: 100% + + Speedup comparison for batch size 64. diff --git a/packages/lane_helpers/docu_referenced_dirs.txt b/packages/lane_helpers/docu_referenced_dirs.txt new file mode 100644 index 0000000..1e107f5 --- /dev/null +++ b/packages/lane_helpers/docu_referenced_dirs.txt @@ -0,0 +1 @@ +examples diff --git a/packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py b/packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py new file mode 100644 index 0000000..2cf0d9f --- /dev/null +++ b/packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py @@ -0,0 +1,157 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path + +import numpy as np + +_LARGE_SPEEDUP_THRESHOLD = 1_000.0 + + +# Helper function for formatting speedup values in result tables. 
def _format_speedup_value(value: float) -> str:
    """Format a speedup factor: scientific notation for large magnitudes, two decimals otherwise."""
    is_large = abs(value) >= _LARGE_SPEEDUP_THRESHOLD
    return f"{value:.2e}" if is_large else f"{value:.2f}"


def _format_table(
    data: np.ndarray,
    nums_points: list[int],
    nums_distances: list[int],
    *,
    scientific: bool,
) -> str:
    """Render one measured metric as a Markdown table.

    Rows correspond to ``nums_points`` and columns to ``nums_distances``; ``data`` is indexed as
    ``data[points_idx, distances_idx]``. With ``scientific=True`` cells use scientific notation
    with precision 3, otherwise the speedup format.
    """

    def render_cell(value) -> str:
        if scientific:
            return np.format_float_scientific(value, precision=3)
        return _format_speedup_value(value)

    num_columns = len(nums_distances)
    header_row = "| # Points (down) / # Distances (right) | " + " | ".join(map(str, nums_distances)) + " |"
    alignment_row = "| :----- |" + " :-----: |" * num_columns
    lines = [header_row, alignment_row]
    for row_idx, num_points_current in enumerate(nums_points):
        cells = [render_cell(data[row_idx, col_idx]) for col_idx in range(num_columns)]
        lines.append(f"| {num_points_current} | " + " | ".join(cells) + " |")
    return "\n".join(lines)


def _write_markdown(
    data: np.ndarray,
    nums_points: list[int],
    nums_distances: list[int],
    *,
    filename: Path,
    scientific: bool,
) -> None:
    """Write one Markdown table, followed by a trailing newline, to ``filename`` as UTF-8."""
    markdown = _format_table(data, nums_points, nums_distances, scientific=scientific)
    filename.write_text(markdown + "\n", encoding="utf-8")


def _write_metric_outputs(
    data: np.ndarray,
    nums_points: list[int],
    nums_distances: list[int],
    *,
    filename_stem: Path,
    scientific: bool,
) -> None:
    """Write the Markdown table for one metric to ``filename_stem`` with a ``.md`` suffix."""
    markdown_file = filename_stem.with_suffix(".md")
    _write_markdown(
        data,
        nums_points,
        nums_distances,
        filename=markdown_file,
        scientific=scientific,
    )


# Entry point: write all Markdown tables for one evaluated batch size.
+def write_batch_results( + output_dir: Path, + batch_size: int, + nums_points: list[int], + nums_distances: list[int], + shapely_runtime_ms: np.ndarray | None, + cpu_runtime_ms: np.ndarray, + cuda_runtime_ms: np.ndarray, + skip_shapely: bool, + assert_results: bool, + max_abs_diff_cpu: np.ndarray | None, + max_abs_diff_cuda: np.ndarray | None, + max_abs_diff_cuda_vs_cpu: np.ndarray | None, +) -> None: + output_dir.mkdir(parents=True, exist_ok=True) + cuda_speedup_over_cpu = cpu_runtime_ms / cuda_runtime_ms + prefix = f"batch_{batch_size}" + + def write_metric( + metric_name: str, + data: np.ndarray, + *, + scientific: bool, + ) -> None: + _write_metric_outputs( + data, + nums_points, + nums_distances, + filename_stem=output_dir / f"{prefix}_{metric_name}", + scientific=scientific, + ) + + if not skip_shapely: + cuda_speedup_over_shapely = shapely_runtime_ms / cuda_runtime_ms + cpu_speedup_over_shapely = shapely_runtime_ms / cpu_runtime_ms + write_metric( + "runtime_shapely", + shapely_runtime_ms, + scientific=True, + ) + # CPU and CUDA outputs are always available; Shapely-related metrics are optional. 
+ write_metric("runtime_cuda", cuda_runtime_ms, scientific=True) + write_metric( + "runtime_cpu", + cpu_runtime_ms, + scientific=True, + ) + if not skip_shapely: + write_metric( + "speedup_cuda_vs_shapely", + cuda_speedup_over_shapely, + scientific=False, + ) + write_metric( + "speedup_cpu_vs_shapely", + cpu_speedup_over_shapely, + scientific=False, + ) + write_metric( + "speedup_cuda_vs_cpu", + cuda_speedup_over_cpu, + scientific=False, + ) + if assert_results: + write_metric( + "max_abs_diff_cuda_vs_cpu", + max_abs_diff_cuda_vs_cpu, + scientific=True, + ) + if assert_results and not skip_shapely: + write_metric("max_abs_diff", max_abs_diff_cuda, scientific=True) + write_metric("max_abs_diff_cpu", max_abs_diff_cpu, scientific=True) diff --git a/packages/lane_helpers/evaluation/plot_shapely_evaluation.py b/packages/lane_helpers/evaluation/plot_shapely_evaluation.py new file mode 100644 index 0000000..34acf06 --- /dev/null +++ b/packages/lane_helpers/evaluation/plot_shapely_evaluation.py @@ -0,0 +1,595 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +from dataclasses import dataclass +from pathlib import Path + +import matplotlib + +matplotlib.use("Agg") + +import matplotlib.colors as colors +from matplotlib.axes import Axes +from matplotlib import pyplot as plt +import numpy as np + +DEFAULT_ANNOTATE_PLOTS = True +_LARGE_SPEEDUP_THRESHOLD = 1_000.0 +_PLOT_FIGSIZE = (6.4, 5.2) +_COMPARISON_SUBPLOT_WIDTH = 6.6 +_PLOT_SUPTITLE_FONT_SIZE = 22 +_PLOT_TITLE_FONT_SIZE = 18 +_PLOT_AXIS_LABEL_FONT_SIZE = 16 +_PLOT_TICK_LABEL_FONT_SIZE = 14 +_PLOT_COLORBAR_TICK_LABEL_FONT_SIZE = 14 +_PLOT_ANNOTATION_FONT_SIZE = 16 +_PLOT_ANNOTATION_MARKER_SIZE = 52 +_PLOT_ANNOTATION_X_OFFSET = 0.25 +_PLOT_COLORBAR_FRACTION = 0.046 +_PLOT_COLORBAR_PAD = 0.02 + + +@dataclass(frozen=True) +class _MetricPlotConfig: + title: str + annotation: str | None = None + + +_METRIC_PLOT_CONFIGS = { + "runtime_shapely": _MetricPlotConfig("Shapely", annotation="runtime"), + "runtime_cuda": _MetricPlotConfig("CUDA", annotation="runtime"), + "runtime_cpu": _MetricPlotConfig("CPU", annotation="runtime"), + "speedup_cuda_vs_shapely": _MetricPlotConfig("CUDA vs. Shapely", annotation="speedup"), + "speedup_cpu_vs_shapely": _MetricPlotConfig("CPU vs. Shapely", annotation="speedup"), + "speedup_cuda_vs_cpu": _MetricPlotConfig("CUDA vs. CPU", annotation="speedup"), + "max_abs_diff_cuda_vs_cpu": _MetricPlotConfig("CUDA max abs. difference to CPU"), + "max_abs_diff": _MetricPlotConfig("CUDA max abs. difference to Shapely"), + "max_abs_diff_cpu": _MetricPlotConfig("CPU max abs. 
difference to Shapely"), +} +_SHAPELY_DEPENDENT_METRICS = frozenset( + { + "runtime_shapely", + "speedup_cuda_vs_shapely", + "speedup_cpu_vs_shapely", + "max_abs_diff", + "max_abs_diff_cpu", + } +) +_RUNTIME_METRICS_WITH_SHAPELY = ("runtime_shapely", "runtime_cpu", "runtime_cuda") +_RUNTIME_METRICS_WITHOUT_SHAPELY = ("runtime_cpu", "runtime_cuda") +_SPEEDUP_METRICS_WITH_SHAPELY = ( + "speedup_cpu_vs_shapely", + "speedup_cuda_vs_shapely", + "speedup_cuda_vs_cpu", +) +_SPEEDUP_METRICS_WITHOUT_SHAPELY = ("speedup_cuda_vs_cpu",) + + +# Helper function for formatting speedup values in tables and annotations. +def _format_speedup_value(value: float) -> str: + if abs(value) >= _LARGE_SPEEDUP_THRESHOLD: + return f"{value:.2e}" + return f"{value:.2f}" + + +# Helper function for splitting one Markdown table row into stripped cells. +def _split_markdown_table_row(row: str) -> list[str]: + row = row.strip() + if not row.startswith("|") or not row.endswith("|"): + raise ValueError(f"Expected Markdown table row, got: {row}") + cells = [cell.strip() for cell in row.strip("|").split("|")] + return cells + + +# Helper function for loading one metric table written by `_write_markdown`. 
def _read_metric_table(filename: Path) -> tuple[list[int], list[int], np.ndarray]:
    """Parse one Markdown metric table back into its axes and values.

    Only lines that start with ``|`` (after stripping) are considered. The first
    such line is the header (point-count description followed by the distance
    counts), the second is skipped as the separator, and every further line is a
    data row whose first cell is the point count.

    Returns:
        ``(nums_points, nums_distances, data)`` where ``data`` is a float64 array
        with one row per parsed data row.

    Raises:
        ValueError: If fewer than three table lines exist, the header does not
            start with ``# Points``, or a data row has the wrong number of cells.
    """
    table_rows = []
    for raw_line in filename.read_text(encoding="utf-8").splitlines():
        stripped_line = raw_line.strip()
        if stripped_line.startswith("|"):
            table_rows.append(stripped_line)
    if len(table_rows) < 3:
        raise ValueError(f"Expected a Markdown header, separator, and at least one data row in {filename}")

    header_cells = _split_markdown_table_row(table_rows[0])
    if not header_cells or not header_cells[0].startswith("# Points"):
        raise ValueError(f"Expected first Markdown header cell to describe point counts in {filename}")
    nums_distances = [int(cell) for cell in header_cells[1:]]

    nums_points: list[int] = []
    values: list[list[float]] = []
    # Index 1 is the alignment/separator row, so data starts at index 2.
    for data_row in table_rows[2:]:
        row_cells = _split_markdown_table_row(data_row)
        if len(row_cells) != len(nums_distances) + 1:
            raise ValueError(f"Expected {len(nums_distances) + 1} cells in {filename}, got {len(row_cells)}")
        point_count_cell, value_cells = row_cells[0], row_cells[1:]
        nums_points.append(int(point_count_cell))
        values.append([float(cell) for cell in value_cells])

    return nums_points, nums_distances, np.asarray(values, dtype=np.float64)


# Helper function for choosing which speedup heatmap cells should show numeric labels.
+def _selected_speedup_annotation_cells( + data: np.ndarray, + nums_points: list[int], + nums_distances: list[int], +) -> list[tuple[int, int]]: + def find_value_index(values: list[int], value: int) -> int | None: + try: + index = values.index(value) + except ValueError: + return None + return index + + def add_unique_cell(cells: list[tuple[int, int]], cell: tuple[int, int]) -> None: + if cell not in cells: + cells.append(cell) + + def find_first_faster_distance_idx(points_idx: int) -> int | None: + for distances_idx in range(len(nums_distances)): + if np.isfinite(data[points_idx, distances_idx]) and data[points_idx, distances_idx] >= 1.0: + return distances_idx + return None + + def find_first_faster_points_idx(distances_idx: int) -> int | None: + for points_idx in range(len(nums_points)): + if np.isfinite(data[points_idx, distances_idx]) and data[points_idx, distances_idx] >= 1.0: + return points_idx + return None + + cells: list[tuple[int, int]] = [] + + points_idx = find_value_index(nums_points, 2) + distances_idx = find_value_index(nums_distances, 1) + if points_idx is not None and distances_idx is not None: + add_unique_cell(cells, (points_idx, distances_idx)) + + finite_mask = np.isfinite(data) + if np.any(finite_mask): + finite_data = np.where(finite_mask, data, -np.inf) + points_idx, distances_idx = np.unravel_index(np.argmax(finite_data), data.shape) + add_unique_cell(cells, (int(points_idx), int(distances_idx))) + + finite_values = data[finite_mask] + if finite_values.size > 0 and not np.all(finite_values > 1.0): + distances_idx_by_value = {value: idx for idx, value in enumerate(nums_distances)} + for points_idx, num_points_current in enumerate(nums_points): + distances_idx = distances_idx_by_value.get(num_points_current) + if distances_idx is None: + continue + if np.isfinite(data[points_idx, distances_idx]) and data[points_idx, distances_idx] > 1.0: + add_unique_cell(cells, (points_idx, distances_idx)) + if points_idx > 0: + faster_distances_idx = 
find_first_faster_distance_idx(points_idx - 1) + if faster_distances_idx is not None: + add_unique_cell(cells, (points_idx - 1, faster_distances_idx)) + if distances_idx > 0: + faster_points_idx = find_first_faster_points_idx(distances_idx - 1) + if faster_points_idx is not None: + add_unique_cell(cells, (faster_points_idx, distances_idx - 1)) + break + + return cells + + +# Helper function for choosing which runtime heatmap cells should get marker labels. +def _selected_runtime_annotation_cells( + nums_points: list[int], + nums_distances: list[int], +) -> list[tuple[int, int]]: + def find_value_index(values: list[int], value: int) -> int | None: + try: + index = values.index(value) + except ValueError: + return None + return index + + def add_unique_cell(cells: list[tuple[int, int]], cell: tuple[int, int]) -> None: + if cell not in cells: + cells.append(cell) + + cells: list[tuple[int, int]] = [] + if nums_points and nums_distances: + add_unique_cell(cells, (0, 0)) + add_unique_cell(cells, (len(nums_points) - 1, len(nums_distances) - 1)) + + points_idx = find_value_index(nums_points, 100) + distances_idx = find_value_index(nums_distances, 100) + if points_idx is not None and distances_idx is not None: + add_unique_cell(cells, (points_idx, distances_idx)) + + return cells + + +# Helper function for placing numeric labels on selected speedup heatmap cells. 
def _speedup_annotation_text_position(
    points_idx: int,
    distances_idx: int,
    nums_points: list[int],
    nums_distances: list[int],
    selected_cells: list[tuple[int, int]],
    data: np.ndarray,
    max_speedup_cell: tuple[int, int] | None,
) -> tuple[float, str]:
    """Return the label x-position and horizontal alignment for one annotated cell.

    The label goes to the left of the marker (right-aligned) when the cell is the
    maximum-speedup cell and not in the first column, when it is in the last
    column, or when it lies left of the value diagonal, is not in the first
    column, and another selected cell directly adjacent to it holds a finite
    value of at least 1.0. Otherwise the label goes to the right (left-aligned).
    """
    this_cell = (points_idx, distances_idx)
    left_of_marker = (distances_idx - _PLOT_ANNOTATION_X_OFFSET, "right")
    right_of_marker = (distances_idx + _PLOT_ANNOTATION_X_OFFSET, "left")

    if max_speedup_cell == this_cell and distances_idx > 0:
        return left_of_marker

    is_left_of_value_diagonal = nums_distances[distances_idx] < nums_points[points_idx]

    # Look for a different selected cell at Manhattan distance 1 with value >= 1.0.
    has_adjacent_above_one_annotation = False
    for other_points_idx, other_distances_idx in selected_cells:
        if (other_points_idx, other_distances_idx) == this_cell:
            continue
        if abs(other_points_idx - points_idx) + abs(other_distances_idx - distances_idx) != 1:
            continue
        neighbour_value = data[other_points_idx, other_distances_idx]
        if np.isfinite(neighbour_value) and neighbour_value >= 1.0:
            has_adjacent_above_one_annotation = True
            break

    if distances_idx == len(nums_distances) - 1:
        return left_of_marker
    if distances_idx > 0 and is_left_of_value_diagonal and has_adjacent_above_one_annotation:
        return left_of_marker
    return right_of_marker


# Helper function for drawing optional numeric labels on selected speedup heatmap cells.
def _annotate_speedup_heatmap(
    ax: Axes,
    data: np.ndarray,
    nums_points: list[int],
    nums_distances: list[int],
) -> None:
    """Mark the selected speedup cells on ``ax`` and print their values next to the markers.

    Cells with non-finite values are skipped. Label placement is delegated to
    ``_speedup_annotation_text_position``.
    """
    cells_to_label = _selected_speedup_annotation_cells(data, nums_points, nums_distances)

    # The cell with the largest finite value is passed on to the placement helper.
    best_cell = None
    is_finite = np.isfinite(data)
    if np.any(is_finite):
        masked = np.where(is_finite, data, -np.inf)
        best_row, best_column = np.unravel_index(np.argmax(masked), data.shape)
        best_cell = (int(best_row), int(best_column))

    for row, column in cells_to_label:
        speedup = data[row, column]
        if np.isfinite(speedup):
            ax.scatter(
                [column],
                [row],
                marker="o",
                s=_PLOT_ANNOTATION_MARKER_SIZE,
                c="black",
                edgecolors="white",
                linewidths=0.8,
                zorder=3,
            )

            label_x, label_alignment = _speedup_annotation_text_position(
                row,
                column,
                nums_points,
                nums_distances,
                cells_to_label,
                data,
                best_cell,
            )
            label_box = {"boxstyle": "round,pad=0.12", "facecolor": "white", "edgecolor": "none", "alpha": 0.75}
            ax.text(
                label_x,
                row,
                _format_speedup_value(speedup),
                ha=label_alignment,
                va="center",
                fontsize=_PLOT_ANNOTATION_FONT_SIZE,
                color="black",
                bbox=label_box,
                zorder=4,
            )


# Helper function for drawing fixed reference markers on runtime heatmap cells.
def _annotate_runtime_heatmap(
    ax: Axes,
    data: np.ndarray,
    nums_points: list[int],
    nums_distances: list[int],
) -> None:
    """Mark the fixed runtime reference cells on ``ax`` and label them with their values.

    Cells with non-finite values are skipped. Labels sit to the right of the
    marker, except in the last column where they are placed to the left.
    """
    last_column = len(nums_distances) - 1
    for row, column in _selected_runtime_annotation_cells(nums_points, nums_distances):
        runtime_value = data[row, column]
        if np.isfinite(runtime_value):
            ax.scatter(
                [column],
                [row],
                marker="o",
                s=_PLOT_ANNOTATION_MARKER_SIZE,
                c="black",
                edgecolors="white",
                linewidths=0.8,
                zorder=3,
            )

            in_last_column = column == last_column
            label_x = column - 0.15 if in_last_column else column + 0.15
            label_alignment = "right" if in_last_column else "left"
            label_box = {"boxstyle": "round,pad=0.12", "facecolor": "white", "edgecolor": "none", "alpha": 0.75}
            ax.text(
                label_x,
                row,
                f"{runtime_value:.1e}",
                ha=label_alignment,
                va="center",
                fontsize=_PLOT_ANNOTATION_FONT_SIZE,
                color="black",
                bbox=label_box,
                zorder=4,
            )


# Helper function for drawing one heatmap into an existing subplot.
def _draw_heatmap(
    ax: Axes,
    data: np.ndarray,
    nums_points: list[int],
    nums_distances: list[int],
    *,
    title: str,
    log_scale: bool,
    annotate_speedup: bool = False,
    annotate_runtime: bool = False,
) -> None:
    """Draw one metric as a heatmap into ``ax``.

    Args:
        ax: Target axes; the colorbar is attached to ``ax.figure``.
        data: 2D values, rows indexed like ``nums_points`` and columns like
            ``nums_distances``.
        nums_points: Tick labels for the y axis.
        nums_distances: Tick labels for the x axis.
        title: Subplot title.
        log_scale: Use a logarithmic color scale when the data contains at least
            one finite, strictly positive value; otherwise the default linear
            normalization is used.
        annotate_speedup: Add the speedup value labels.
        annotate_runtime: Add the runtime reference labels.
    """
    norm = None
    if log_scale:
        # Only finite, strictly positive values are valid LogNorm limits. Without the
        # finiteness filter a single +inf cell (e.g. a speedup over a zero runtime)
        # would become vmax and make the normalization fail.
        valid_values = data[np.isfinite(data) & (data > 0)]
        if valid_values.size > 0:
            norm = colors.LogNorm(vmin=valid_values.min(), vmax=valid_values.max())

    image = ax.imshow(data, norm=norm)
    ax.set_yticks(list(range(len(nums_points))), labels=nums_points, fontsize=_PLOT_TICK_LABEL_FONT_SIZE)
    ax.set_ylabel("Number of points", fontsize=_PLOT_AXIS_LABEL_FONT_SIZE)
    ax.set_xticks(
        list(range(len(nums_distances))),
        labels=nums_distances,
        rotation=45,
        fontsize=_PLOT_TICK_LABEL_FONT_SIZE,
    )
    ax.set_xlabel("Number of distances", fontsize=_PLOT_AXIS_LABEL_FONT_SIZE)
    ax.set_title(title, fontsize=_PLOT_TITLE_FONT_SIZE, pad=12)
    colorbar = ax.figure.colorbar(
        image,
        ax=ax,
        fraction=_PLOT_COLORBAR_FRACTION,
        pad=_PLOT_COLORBAR_PAD,
    )
    colorbar.ax.tick_params(labelsize=_PLOT_COLORBAR_TICK_LABEL_FONT_SIZE)
    colorbar.ax.yaxis.offsetText.set_fontsize(_PLOT_COLORBAR_TICK_LABEL_FONT_SIZE)
    if annotate_speedup:
        _annotate_speedup_heatmap(ax, data, nums_points, nums_distances)
    if annotate_runtime:
        _annotate_runtime_heatmap(ax, data, nums_points, nums_distances)


# Helper function for writing a multi-subplot comparison plot for one metric group.
def _plot_metric_comparison(
    metric_names: tuple[str, ...],
    metric_data: dict[str, np.ndarray],
    nums_points: list[int],
    nums_distances: list[int],
    *,
    batch_size: int,
    figure_title: str,
    filename: Path,
    annotate_plots: bool,
) -> None:
    """Write one figure with a heatmap subplot per available metric.

    Args:
        metric_names: Metrics to show, in subplot order. Names missing from
            ``metric_data`` are skipped.
        metric_data: Loaded metric arrays keyed by metric name.
        nums_points: Row axis values shared by all metrics.
        nums_distances: Column axis values shared by all metrics.
        batch_size: Batch size shown in the figure title.
        figure_title: Leading part of the figure title.
        filename: Output image path.
        annotate_plots: Whether the per-metric annotations are drawn.

    Nothing is written when none of ``metric_names`` is available.
    """
    available_metric_names = tuple(metric_name for metric_name in metric_names if metric_name in metric_data)
    if not available_metric_names:
        return

    subplot_width = _PLOT_FIGSIZE[0] if len(available_metric_names) == 1 else _COMPARISON_SUBPLOT_WIDTH
    fig, axes = plt.subplots(
        1,
        len(available_metric_names),
        figsize=(subplot_width * len(available_metric_names), _PLOT_FIGSIZE[1]),
        constrained_layout=True,
    )
    # pyplot keeps every figure alive until it is closed explicitly, so close it
    # even when drawing or saving fails.
    try:
        fig.suptitle(f"{figure_title} (Batch Size {batch_size})", fontsize=_PLOT_SUPTITLE_FONT_SIZE)
        # A single subplot is returned as a bare Axes object; normalize to a list.
        axes = np.atleast_1d(axes).tolist()

        for ax, metric_name in zip(axes, available_metric_names):
            metric_config = _METRIC_PLOT_CONFIGS[metric_name]
            _draw_heatmap(
                ax,
                metric_data[metric_name],
                nums_points,
                nums_distances,
                title=metric_config.title,
                log_scale=True,
                annotate_speedup=metric_config.annotation == "speedup" and annotate_plots,
                annotate_runtime=metric_config.annotation == "runtime" and annotate_plots,
            )
        fig.savefig(filename)
    finally:
        plt.close(fig)


# Helper function for writing comparison plots whose subplot counts depend on Shapely availability.
def _write_comparison_outputs(
    output_dir: Path,
    batch_size: int,
    nums_points: list[int],
    nums_distances: list[int],
    metric_data: dict[str, np.ndarray],
    *,
    has_shapely_results: bool,
    annotate_plots: bool,
) -> list[Path]:
    """Write the runtime and speedup comparison plots for one batch size.

    Args:
        output_dir: Directory the PNG files are written to.
        batch_size: Batch size used for the file prefix and the figure titles.
        nums_points: Row axis values of the metric tables.
        nums_distances: Column axis values of the metric tables.
        metric_data: Loaded metric arrays keyed by metric name.
        has_shapely_results: Selects the metric groups that include Shapely.
        annotate_plots: Whether heatmap annotations are drawn.

    Returns:
        The paths of the plots that were actually written. A group for which no
        metric is available in ``metric_data`` produces no file and is therefore
        not reported.
    """
    runtime_metric_names = (
        _RUNTIME_METRICS_WITH_SHAPELY if has_shapely_results else _RUNTIME_METRICS_WITHOUT_SHAPELY
    )
    speedup_metric_names = (
        _SPEEDUP_METRICS_WITH_SHAPELY if has_shapely_results else _SPEEDUP_METRICS_WITHOUT_SHAPELY
    )
    prefix = f"batch_{batch_size}"
    plot_requests = (
        (runtime_metric_names, "Runtime [ms]", output_dir / f"{prefix}_runtime_comparison.png"),
        (speedup_metric_names, "Speedup [x-fold]", output_dir / f"{prefix}_speedup_comparison.png"),
    )

    comparison_files: list[Path] = []
    for metric_names, figure_title, filename in plot_requests:
        # _plot_metric_comparison silently writes nothing when no metric of the
        # group was loaded; do not report a file that does not exist.
        if not any(metric_name in metric_data for metric_name in metric_names):
            continue
        _plot_metric_comparison(
            metric_names,
            metric_data,
            nums_points,
            nums_distances,
            batch_size=batch_size,
            figure_title=figure_title,
            filename=filename,
            annotate_plots=annotate_plots,
        )
        comparison_files.append(filename)
    return comparison_files


# Helper function for parsing comma-separated integer lists.
+def _parse_int_list(value: str) -> list[int]: + parsed_values = [int(item) for item in value.split(",") if item] + return parsed_values + + +def plot_batch_results_from_markdown( + input_dir: Path, + output_dir: Path, + batch_size: int, + annotate_plots: bool, +) -> list[Path]: + prefix = f"batch_{batch_size}_" + markdown_files = sorted(input_dir.glob(f"{prefix}*.md")) + if not markdown_files: + raise FileNotFoundError(f"No Markdown result tables found for batch={batch_size} in {input_dir}") + available_metric_names = {markdown_file.stem[len(prefix) :] for markdown_file in markdown_files} + has_shapely_results = "runtime_shapely" in available_metric_names + + metric_data: dict[str, np.ndarray] = {} + comparison_nums_points: list[int] | None = None + comparison_nums_distances: list[int] | None = None + for markdown_file in markdown_files: + metric_name = markdown_file.stem[len(prefix) :] + if metric_name not in _METRIC_PLOT_CONFIGS: + continue + if metric_name in _SHAPELY_DEPENDENT_METRICS and not has_shapely_results: + continue + + nums_points, nums_distances, data = _read_metric_table(markdown_file) + metric_data[metric_name] = data + comparison_nums_points = nums_points + comparison_nums_distances = nums_distances + + if comparison_nums_points is not None and comparison_nums_distances is not None: + output_dir.mkdir(parents=True, exist_ok=True) + plotted_files = _write_comparison_outputs( + output_dir, + batch_size, + comparison_nums_points, + comparison_nums_distances, + metric_data, + has_shapely_results=has_shapely_results, + annotate_plots=annotate_plots, + ) + else: + plotted_files = [] + + if not plotted_files: + raise FileNotFoundError( + f"No known Markdown result tables found for batch={batch_size} in {input_dir}" + ) + return plotted_files + + +def plot_from_markdown_directory( + *, + input_dir: Path, + output_dir: Path, + batch_sizes: list[int], + annotate_plots: bool = DEFAULT_ANNOTATE_PLOTS, +) -> list[Path]: + if not input_dir.exists(): + raise 
FileNotFoundError(f"Markdown input directory does not exist: {input_dir}") + + plotted_files: list[Path] = [] + for batch_size in batch_sizes: + batch_plotted_files = plot_batch_results_from_markdown( + input_dir=input_dir, + output_dir=output_dir, + batch_size=batch_size, + annotate_plots=annotate_plots, + ) + plotted_files.extend(batch_plotted_files) + return plotted_files + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate polyline runtime plot images from Markdown result tables.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--input-dir", + type=Path, + required=True, + help="Directory containing Markdown result tables.", + ) + parser.add_argument( + "--output-dir", + type=Path, + required=True, + help="Directory where plot images should be written.", + ) + parser.add_argument( + "--batch-sizes", + default="1,4,16,64", + help="Comma-separated batch sizes to plot.", + ) + no_annotate_plots_action = parser.add_argument( + "--no-annotate-plots", + dest="annotate_plots", + action="store_false", + help="Disable annotations in generated heatmaps.", + ) + parser.set_defaults(annotate_plots=DEFAULT_ANNOTATE_PLOTS) + no_annotate_plots_action.default = argparse.SUPPRESS + return parser.parse_args() + + +def main() -> None: + args = _parse_args() + batch_sizes = _parse_int_list(args.batch_sizes) + plotted_files = plot_from_markdown_directory( + input_dir=args.input_dir, + output_dir=args.output_dir, + batch_sizes=batch_sizes, + annotate_plots=args.annotate_plots, + ) + for plotted_file in plotted_files: + print(f"Generated plot: {plotted_file}") + + +if __name__ == "__main__": + main() diff --git a/packages/lane_helpers/evaluation/shapely_evaluation.py b/packages/lane_helpers/evaluation/shapely_evaluation.py new file mode 100644 index 0000000..1048191 --- /dev/null +++ b/packages/lane_helpers/evaluation/shapely_evaluation.py @@ -0,0 +1,681 @@ +# Copyright (c) 2026, NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from collections.abc import Callable, Iterator +import gc +import sys +import time +from pathlib import Path +from types import ModuleType + +import numpy as np +from shapely import get_coordinates, line_interpolate_point, linestrings +import torch + +# Import helpers for outputting results and plots +SCRIPT_DIR = Path(__file__).resolve().parent +if str(SCRIPT_DIR) not in sys.path: + sys.path.insert(0, str(SCRIPT_DIR)) +import plot_shapely_evaluation +import _shapely_evaluation_outputs as shapely_evaluation_outputs + +# ==================== Default configuration for the evaluation ==================== + +# These constants are convenient local configuration knobs. However, these configurations can also +# be done with CLI arguments. +# When changing these constants, check the CLI arguments further below in the script, +# because some flags only override the default in one direction. + +# Sweep values for the heatmap axes and the batch-size examples. +DEFAULT_NUMS_POINTS = [2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000] +DEFAULT_NUMS_DISTANCES = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000] +DEFAULT_BATCH_SIZES = [1, 4, 16, 64] +# Keep the measured work roughly constant across batch sizes. +DEFAULT_NUM_POLYLINES_PER_MEASUREMENT = 64 * 10 +# Warm up a representative mid-sized configuration before timing the sweep. 
+DEFAULT_NUM_WARMUP_RUNS = 3 +DEFAULT_WARMUP_NUM_POINTS = 100 +DEFAULT_WARMUP_NUM_DISTANCES = 100 +# Shapely can be skipped for faster CPU/CUDA-only benchmark sweeps. +DEFAULT_SKIP_SHAPELY = False +# Result checks are optional because they add Shapely reference work to each configuration. +DEFAULT_ASSERT_RESULTS = False +DEFAULT_ASSERT_ATOL = 1e-3 +DEFAULT_ASSERT_RTOL = 0.0 +# Plot annotations call out representative cells in generated heatmaps. +DEFAULT_ANNOTATE_PLOTS = True +# ================== End: Default configuration for the evaluation ================= + + +# ================== Constants for the evaluation ================== +DEVICE = "cuda" +DTYPE_NP = np.float32 +DTYPE_TORCH = torch.float32 +_POLYLINE_MODULE: ModuleType | None = None +# ================ End: Constants for the evaluation =============== + + +# Helper function for lazily importing the compiled polyline module outside plotting-only mode. +def _get_polyline_module() -> ModuleType: + global _POLYLINE_MODULE + if _POLYLINE_MODULE is None: + from accvlab.lane_helpers import polyline as polyline_module + + _POLYLINE_MODULE = polyline_module + return _POLYLINE_MODULE + + +# Helper function for config parsing +def _parse_int_list(value: str) -> list[int]: + parsed_values = [int(item) for item in value.split(",") if item] + return parsed_values + + +# Helper function for computing the batched Shapely reference. +def _compute_batched_shapely_reference(points: np.ndarray, distances: np.ndarray) -> np.ndarray: + line_strings = linestrings(points) + interpolated_points = line_interpolate_point(line_strings[:, None], distances) + batched_reference = ( + get_coordinates(interpolated_points) + .reshape( + points.shape[0], + distances.shape[1], + points.shape[2], + ) + .astype(DTYPE_NP) + ) + return batched_reference + + +# Helper function for computing per-polyline lengths in NumPy. 
+def _compute_polyline_lengths_np(points: np.ndarray) -> np.ndarray: + if points.shape[1] <= 1: + lengths = np.zeros((points.shape[0],), dtype=DTYPE_NP) + return lengths + lengths = np.linalg.norm(points[:, 1:] - points[:, :-1], axis=2).sum(axis=1).astype(DTYPE_NP) + return lengths + + +# Helper function for comparing CPU and CUDA outputs against Shapely when requested. +def _assert_matches_shapely( + shapely_result: np.ndarray, + result: np.ndarray, + *, + implementation_name: str, + batch_size: int, + num_points: int, + num_distances: int, + atol: float, + rtol: float, +) -> None: + try: + np.testing.assert_allclose(result, shapely_result, atol=atol, rtol=rtol) + except AssertionError as exc: + max_abs_diff = np.abs(shapely_result - result).max() + raise AssertionError( + f"{implementation_name} result differs from Shapely for " + f"batch={batch_size}, points={num_points}, distances={num_distances}; " + f"max_abs_diff={max_abs_diff}, atol={atol}, rtol={rtol}" + ) from exc + + +# Helper function for constructing one deterministic benchmark input configuration. +def _make_evaluation_case( + batch_size: int, + num_points: int, + num_distances: int, + *, + seed: int, +) -> tuple[np.ndarray, np.ndarray]: + generator = np.random.default_rng(seed=seed) + # Set up the polylines + points = generator.uniform(0.0, 1.0, size=(batch_size, num_points, 2)).astype(DTYPE_NP) + lengths = _compute_polyline_lengths_np(points) + # Set up the distances to sample the polyline at + distances_normalized = generator.uniform(0.0, 1.0, size=(batch_size, num_distances)).astype(DTYPE_NP) + distances = distances_normalized * lengths[:, None] + return points, distances + + +# Helper function for iterating over deterministic benchmark configurations. 
+def _iter_evaluation_cases( + batch_size: int, + nums_points: list[int], + nums_distances: list[int], +) -> Iterator[tuple[int, int, int, int, int]]: + for points_idx, num_points_current in enumerate(nums_points): + for distances_idx, num_distances_current in enumerate(nums_distances): + seed = batch_size * 1_000_000 + num_points_current * 1_000 + num_distances_current + yield points_idx, distances_idx, num_points_current, num_distances_current, seed + + +# Helper function for placing the same NumPy inputs on CUDA and CPU. +def _make_torch_tensors( + *arrays: np.ndarray, +) -> tuple[torch.Tensor, ...]: + tensors_gpu = [torch.tensor(array, device=DEVICE, dtype=DTYPE_TORCH) for array in arrays] + tensors_cpu = [torch.tensor(array, device="cpu", dtype=DTYPE_TORCH) for array in arrays] + return *tensors_gpu, *tensors_cpu + + +# Helper function for placing NumPy inputs on one target device. +def _make_torch_tensors_on_device( + *arrays: np.ndarray, + device: str, +) -> tuple[torch.Tensor, ...]: + tensors = tuple(torch.tensor(array, device=device, dtype=DTYPE_TORCH) for array in arrays) + return tensors + + +# Helper function for timing repeated calls and synchronizing CUDA work when needed. +def _time_call( + function: Callable[[], object], + *, + num_runs: int, + synchronize_cuda: bool = False, +) -> float: + if synchronize_cuda: + # Ensure previous work is finished before starting the timing. + torch.cuda.synchronize() + start = time.perf_counter() + for _ in range(num_runs): + function() + if synchronize_cuda: + # Ensure all work is finished before stopping the timing. + torch.cuda.synchronize() + runtime = (time.perf_counter() - start) / num_runs + return runtime + + +# Helper function for reducing cross-implementation timing interference. +def _cleanup_between_implementation_sweeps() -> None: + gc.collect() + if torch.cuda.is_available(): + torch.cuda.synchronize() + torch.cuda.empty_cache() + + +# Helper function for timing the Shapely reference implementation. 
def _time_shapely(
    points: np.ndarray,
    distances: np.ndarray,
    *,
    num_runs: int,
) -> float:
    """Return the mean seconds per call of the Shapely reference on the given inputs."""

    def run_once() -> object:
        return _compute_batched_shapely_reference(points, distances)

    return _time_call(run_once, num_runs=num_runs)


# Helper function for timing the CUDA implementation.
def _time_cuda(
    points: torch.Tensor,
    distances: torch.Tensor,
    *,
    num_runs: int,
) -> float:
    """Return the mean seconds per ``interpolate`` call, with CUDA synchronization around the timing."""
    polyline_module = _get_polyline_module()

    def run_once() -> object:
        return polyline_module.interpolate(points, distances)

    return _time_call(run_once, num_runs=num_runs, synchronize_cuda=True)


# Helper function for timing the CPU implementation.
def _time_cpu(
    points: torch.Tensor,
    distances: torch.Tensor,
    *,
    num_runs: int,
) -> float:
    """Return the mean seconds per ``interpolate`` call, timed without CUDA synchronization."""
    polyline_module = _get_polyline_module()

    def run_once() -> object:
        return polyline_module.interpolate(points, distances)

    return _time_call(run_once, num_runs=num_runs)


# Helper function for warming up all selected implementations once before measured runs.
def _run_warmup(
    *,
    batch_size: int,
    num_points: int,
    num_distances: int,
    num_warmup_runs: int,
    skip_shapely: bool,
) -> None:
    """Run every selected implementation ``num_warmup_runs`` times on one seeded case.

    Does nothing when ``num_warmup_runs`` is not positive. Shapely is left out
    when ``skip_shapely`` is set. CUDA is synchronized once at the end.
    """
    if num_warmup_runs <= 0:
        return

    points_np, distances_np = _make_evaluation_case(
        batch_size,
        num_points,
        num_distances,
        seed=0,
    )
    gpu_and_cpu_tensors = _make_torch_tensors(points_np, distances_np)
    points_gpu, distances_gpu, points_cpu, distances_cpu = gpu_and_cpu_tensors
    polyline_module = _get_polyline_module()

    remaining_runs = num_warmup_runs
    while remaining_runs > 0:
        if not skip_shapely:
            _compute_batched_shapely_reference(points_np, distances_np)
        polyline_module.interpolate(points_cpu, distances_cpu)
        polyline_module.interpolate(points_gpu, distances_gpu)
        remaining_runs -= 1

    torch.cuda.synchronize()


# Helper to (optionally) validate the results against the Shapely reference.
def _run_validation_sweep(
    batch_size: int,
    nums_points: list[int],
    nums_distances: list[int],
    *,
    assert_atol: float,
    assert_rtol: float,
    max_abs_diff_cpu: np.ndarray,
    max_abs_diff_cuda: np.ndarray,
    max_abs_diff_cuda_vs_cpu: np.ndarray,
) -> None:
    """Compare the CPU and CUDA results with the Shapely reference for every configuration.

    For each ``(points, distances)`` pair the maximum absolute differences are
    stored in place in the three ``max_abs_diff_*`` arrays at
    ``[points_idx, distances_idx]``. Afterwards the CPU result and then the CUDA
    result are checked against Shapely with the given tolerances.

    Raises:
        AssertionError: On the first configuration whose CPU or CUDA result is
            outside the tolerances.
    """
    print(f"Running validation sweep for batch={batch_size}")
    polyline_module = _get_polyline_module()
    evaluation_cases = _iter_evaluation_cases(batch_size, nums_points, nums_distances)
    for points_idx, distances_idx, num_points_current, num_distances_current, seed in evaluation_cases:
        print(
            "Running validation "
            f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}"
        )
        points_np, distances_np = _make_evaluation_case(
            batch_size,
            num_points_current,
            num_distances_current,
            seed=seed,
        )
        shapely_result = _compute_batched_shapely_reference(points_np, distances_np)
        points_gpu, distances_gpu, points_cpu, distances_cpu = _make_torch_tensors(points_np, distances_np)
        cpu_result = polyline_module.interpolate(points_cpu, distances_cpu).numpy()
        cuda_result = polyline_module.interpolate(points_gpu, distances_gpu).cpu().numpy()

        cell = (points_idx, distances_idx)
        max_abs_diff_cpu[cell] = np.abs(shapely_result - cpu_result).max()
        max_abs_diff_cuda[cell] = np.abs(shapely_result - cuda_result).max()
        max_abs_diff_cuda_vs_cpu[cell] = np.abs(cpu_result - cuda_result).max()

        # The differences are recorded before asserting; CPU is checked first, then CUDA.
        for implementation_name, implementation_result in (("CPU", cpu_result), ("CUDA", cuda_result)):
            _assert_matches_shapely(
                shapely_result,
                implementation_result,
                implementation_name=implementation_name,
                batch_size=batch_size,
                num_points=num_points_current,
                num_distances=num_distances_current,
                atol=assert_atol,
                rtol=assert_rtol,
            )


# Helper function for evaluating every point-count and distance-count pair for one batch size.
+def _evaluate_batch_size(
+    batch_size: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    num_runs: int,
+    assert_results: bool,
+    assert_atol: float,
+    assert_rtol: float,
+    skip_shapely: bool,
+) -> tuple[
+    np.ndarray | None, np.ndarray, np.ndarray, np.ndarray | None, np.ndarray | None, np.ndarray | None
+]:
+    """Time every (point count, distance count) pair for one batch size.
+
+    The implementations are measured in separate sweeps (Shapely unless
+    ``skip_shapely``, then CPU, then CUDA). Each sweep regenerates its inputs
+    with ``_make_evaluation_case`` using the seed yielded by
+    ``_iter_evaluation_cases`` and stores the timer result multiplied by 1000 at
+    ``[points_idx, distances_idx]``. When ``assert_results`` is set, a final
+    validation sweep fills the ``max_abs_diff_*`` arrays.
+
+    Returns:
+        ``(shapely_runtime_ms, cpu_runtime_ms, cuda_runtime_ms, max_abs_diff_cpu,
+        max_abs_diff_cuda, max_abs_diff_cuda_vs_cpu)``. All arrays are float64 with
+        shape ``(len(nums_points), len(nums_distances))``. ``shapely_runtime_ms``
+        is ``None`` when ``skip_shapely`` is set; the three difference arrays are
+        ``None`` when ``assert_results`` is not set.
+    """
+    result_shape = (len(nums_points), len(nums_distances))
+
+    shapely_runtime_ms = None if skip_shapely else np.zeros(result_shape, dtype=np.float64)
+    cuda_runtime_ms = np.zeros(result_shape, dtype=np.float64)
+    cpu_runtime_ms = np.zeros(result_shape, dtype=np.float64)
+
+    # Difference buffers are only allocated when validation was requested.
+    max_abs_diff_cuda = np.zeros_like(cpu_runtime_ms) if assert_results else None
+    max_abs_diff_cpu = np.zeros_like(cpu_runtime_ms) if assert_results else None
+    max_abs_diff_cuda_vs_cpu = np.zeros_like(cpu_runtime_ms) if assert_results else None
+
+    if not skip_shapely:
+        print(f"Running Shapely sweep for batch={batch_size}, runs={num_runs}")
+        for (
+            points_idx,
+            distances_idx,
+            num_points_current,
+            num_distances_current,
+            seed,
+        ) in _iter_evaluation_cases(batch_size, nums_points, nums_distances):
+            print(
+                "Running Shapely evaluation "
+                f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}, "
+                f"runs={num_runs}"
+            )
+            points_np, distances_np = _make_evaluation_case(
+                batch_size,
+                num_points_current,
+                num_distances_current,
+                seed=seed,
+            )
+
+            shapely_runtime = _time_shapely(
+                points_np,
+                distances_np,
+                num_runs=num_runs,
+            )
+            # NOTE(review): the factor 1000 presumably converts seconds to milliseconds --
+            # confirm the unit returned by _time_call.
+            shapely_runtime_ms[points_idx, distances_idx] = shapely_runtime * 1000
+        _cleanup_between_implementation_sweeps()
+
+    print(f"Running CPU sweep for batch={batch_size}, runs={num_runs}")
+    for points_idx, distances_idx, num_points_current, num_distances_current, seed in _iter_evaluation_cases(
+        batch_size, nums_points, nums_distances
+    ):
+        print(
+            "Running CPU evaluation "
+            f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}, "
+            f"runs={num_runs}"
+        )
+        points_np, distances_np = _make_evaluation_case(
+            batch_size,
+            num_points_current,
+            num_distances_current,
+            seed=seed,
+        )
+        points_cpu, distances_cpu = _make_torch_tensors_on_device(
+            points_np,
+            distances_np,
+            device="cpu",
+        )
+
+        cpu_runtime_ms[points_idx, distances_idx] = (
+            _time_cpu(
+                points_cpu,
+                distances_cpu,
+                num_runs=num_runs,
+            )
+            * 1000
+        )
+    _cleanup_between_implementation_sweeps()
+
+    print(f"Running CUDA sweep for batch={batch_size}, runs={num_runs}")
+    for points_idx, distances_idx, num_points_current, num_distances_current, seed in _iter_evaluation_cases(
+        batch_size, nums_points, nums_distances
+    ):
+        print(
+            "Running CUDA evaluation "
+            f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}, "
+            f"runs={num_runs}"
+        )
+        points_np, distances_np = _make_evaluation_case(
+            batch_size,
+            num_points_current,
+            num_distances_current,
+            seed=seed,
+        )
+        points_gpu, distances_gpu = _make_torch_tensors_on_device(
+            points_np,
+            distances_np,
+            device=DEVICE,
+        )
+
+        cuda_runtime_ms[points_idx, distances_idx] = (
+            _time_cuda(
+                points_gpu,
+                distances_gpu,
+                num_runs=num_runs,
+            )
+            * 1000
+        )
+    _cleanup_between_implementation_sweeps()
+
+    if assert_results:
+        # The validation sweep writes into the three difference arrays in place.
+        _run_validation_sweep(
+            batch_size,
+            nums_points,
+            nums_distances,
+            assert_atol=assert_atol,
+            assert_rtol=assert_rtol,
+            max_abs_diff_cpu=max_abs_diff_cpu,
+            max_abs_diff_cuda=max_abs_diff_cuda,
+            max_abs_diff_cuda_vs_cpu=max_abs_diff_cuda_vs_cpu,
+        )
+        _cleanup_between_implementation_sweeps()
+
+    # Return order is Shapely, CPU, CUDA (the arrays above are allocated CUDA before CPU).
+    return (
+        shapely_runtime_ms,
+        cpu_runtime_ms,
+        cuda_runtime_ms,
+        max_abs_diff_cpu,
+        max_abs_diff_cuda,
+        max_abs_diff_cuda_vs_cpu,
+    )
+
+
+# Helper function for parsing command-line arguments.
+def _parse_args() -> argparse.Namespace:
+    """Build the command-line parser and parse the process arguments.
+
+    The sweep lists (``nums_points``, ``nums_distances``, ``batch_sizes``) are
+    returned as comma-separated strings; ``main`` converts them with
+    ``_parse_int_list``. ``output_dir`` is returned as a ``Path``.
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "Evaluate batched CPU/CUDA polyline interpolation against a Shapely LineString reference "
+            "over point-count, distance-count, and batch-size sweeps."
+        ),
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--num-points",
+        dest="nums_points",
+        default=",".join(str(item) for item in DEFAULT_NUMS_POINTS),
+        help="Comma-separated point counts for the polyline-length sweep.",
+    )
+    parser.add_argument(
+        "--num-distances",
+        dest="nums_distances",
+        default=",".join(str(item) for item in DEFAULT_NUMS_DISTANCES),
+        help="Comma-separated sample-distance counts for the interpolation sweep.",
+    )
+    parser.add_argument(
+        "--batch-sizes",
+        default=",".join(str(item) for item in DEFAULT_BATCH_SIZES),
+        help="Comma-separated batch sizes to evaluate.",
+    )
+    parser.add_argument(
+        "--num-polylines-per-measurement",
+        type=int,
+        default=DEFAULT_NUM_POLYLINES_PER_MEASUREMENT,
+        help="Target number of polylines measured per configuration; divided by batch size to get runs.",
+    )
+    parser.add_argument(
+        "--num-warmup-runs",
+        type=int,
+        default=DEFAULT_NUM_WARMUP_RUNS,
+        help="Number of untimed warmup runs before the measured sweep.",
+    )
+    parser.add_argument(
+        "--warmup-num-points",
+        type=int,
+        default=DEFAULT_WARMUP_NUM_POINTS,
+        help="Point count used for warmup inputs.",
+    )
+    parser.add_argument(
+        "--warmup-num-distances",
+        type=int,
+        default=DEFAULT_WARMUP_NUM_DISTANCES,
+        help="Sample-distance count used for warmup inputs.",
+    )
+    # NOTE(review): both store_true flags below take their default from a module constant;
+    # if such a constant were True the flag could not switch the option off -- confirm the constants.
+    parser.add_argument(
+        "--skip-shapely",
+        action="store_true",
+        default=DEFAULT_SKIP_SHAPELY,
+        help="Skip Shapely reference timing and Shapely-based speedup plots.",
+    )
+    parser.add_argument(
+        "--assert-results",
+        action="store_true",
+        default=DEFAULT_ASSERT_RESULTS,
+        help="Compare CPU and CUDA outputs against Shapely using the configured tolerances.",
+    )
+    parser.add_argument(
+        "--assert-atol",
+        type=float,
+        default=DEFAULT_ASSERT_ATOL,
+        help="Absolute tolerance used when asserting results against Shapely.",
+    )
+    parser.add_argument(
+        "--assert-rtol",
+        type=float,
+        default=DEFAULT_ASSERT_RTOL,
+        help="Relative tolerance used when asserting results against Shapely.",
+    )
+    no_annotate_plots_action = parser.add_argument(
+        "--no-annotate-plots",
+        dest="annotate_plots",
+        action="store_false",
+        help="Disable annotations in generated heatmaps.",
+    )
+    # The parser-level default supplies annotate_plots when the flag is absent, while the
+    # action's own default is set to SUPPRESS so that ArgumentDefaultsHelpFormatter does not
+    # append a "(default: ...)" note to this negative flag. The order matters: set_defaults()
+    # also rewrites the default of the matching action, so SUPPRESS has to be assigned afterwards.
+    parser.set_defaults(annotate_plots=DEFAULT_ANNOTATE_PLOTS)
+    no_annotate_plots_action.default = argparse.SUPPRESS
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("polyline_shapely_eval_results"),
+        help="Directory for Markdown result tables and generated plot images.",
+    )
+    args = parser.parse_args()
+    return args
+
+
+# Main entry point for the full benchmark sweep.
+def main() -> None:
+    """Run the complete evaluation: parse arguments, warm up, measure, write, and plot.
+
+    Raises:
+        RuntimeError: If ``torch.cuda.is_available()`` is false.
+    """
+    args = _parse_args()
+    nums_points = _parse_int_list(args.nums_points)
+    nums_distances = _parse_int_list(args.nums_distances)
+    batch_sizes = _parse_int_list(args.batch_sizes)
+    # Make relative output paths independent of the caller's working directory.
+    if not args.output_dir.is_absolute():
+        args.output_dir = SCRIPT_DIR / args.output_dir
+
+    if not torch.cuda.is_available():
+        raise RuntimeError("This evaluation requires a CUDA-capable PyTorch installation.")
+
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Result assertions require Shapely, so disabling Shapely also disables assertions.
+    assert_results = args.assert_results and not args.skip_shapely
+
+    print("Performing runtime evaluation...")
+    print(f"Numbers of points: {nums_points}")
+    print(f"Numbers of distances: {nums_distances}")
+    print(f"Batch sizes: {batch_sizes}")
+    print(f"Measured polylines per configuration: {args.num_polylines_per_measurement}")
+    # NOTE(review): max(batch_sizes) raises ValueError for an empty list -- confirm that
+    # _parse_int_list never returns one.
+    print(
+        "Warmup configuration: "
+        f"batch={max(batch_sizes)}, points={args.warmup_num_points}, "
+        f"distances={args.warmup_num_distances}, runs={args.num_warmup_runs}"
+    )
+    print(f"Use Shapely reference: {not args.skip_shapely}")
+    print(f"Assert results against Shapely: {assert_results}")
+    print(f"Annotate plots: {args.annotate_plots}")
+    print(f"Output directory: {args.output_dir}")
+
+    # Warm up once with the largest requested batch size.
+    _run_warmup(
+        batch_size=max(batch_sizes),
+        num_points=args.warmup_num_points,
+        num_distances=args.warmup_num_distances,
+        num_warmup_runs=args.num_warmup_runs,
+        skip_shapely=args.skip_shapely,
+    )
+
+    for batch_size in batch_sizes:
+        # Keep (roughly) the same number of measured polylines per configuration across batch sizes.
+        # NOTE(review): a batch size of 0 would raise ZeroDivisionError here -- confirm inputs are positive.
+        num_runs = max(1, args.num_polylines_per_measurement // batch_size)
+        print(f"Using {num_runs} measured runs for batch={batch_size}")
+
+        # Run evaluation & get results for one batch size (number of polylines in single call).
+        (
+            shapely_runtime_ms,
+            cpu_runtime_ms,
+            cuda_runtime_ms,
+            max_abs_diff_cpu,
+            max_abs_diff_cuda,
+            max_abs_diff_cuda_vs_cpu,
+        ) = _evaluate_batch_size(
+            batch_size,
+            nums_points,
+            nums_distances,
+            num_runs=num_runs,
+            assert_results=assert_results,
+            assert_atol=args.assert_atol,
+            assert_rtol=args.assert_rtol,
+            skip_shapely=args.skip_shapely,
+        )
+
+        # Write results to disk.
+        shapely_evaluation_outputs.write_batch_results(
+            args.output_dir,
+            batch_size,
+            nums_points,
+            nums_distances,
+            shapely_runtime_ms,
+            cpu_runtime_ms,
+            cuda_runtime_ms,
+            args.skip_shapely,
+            assert_results,
+            max_abs_diff_cpu,
+            max_abs_diff_cuda,
+            max_abs_diff_cuda_vs_cpu,
+        )
+
+        # Print info.
+        # Speedups are element-wise ratios of the runtime arrays (baseline divided by candidate).
+        cuda_speedup_over_cpu = cpu_runtime_ms / cuda_runtime_ms
+        if not args.skip_shapely:
+            cuda_speedup_over_shapely = shapely_runtime_ms / cuda_runtime_ms
+            cpu_speedup_over_shapely = shapely_runtime_ms / cpu_runtime_ms
+            print(f"Average Shapely runtime [ms], batch={batch_size}:\n{shapely_runtime_ms}")
+        print(f"Average CPU runtime [ms], batch={batch_size}:\n{cpu_runtime_ms}")
+        print(f"Average CUDA runtime [ms], batch={batch_size}:\n{cuda_runtime_ms}")
+        if not args.skip_shapely:
+            print(f"CPU speedup over Shapely, batch={batch_size}:\n{cpu_speedup_over_shapely}")
+            print(f"CUDA speedup over Shapely, batch={batch_size}:\n{cuda_speedup_over_shapely}")
+        print(f"CUDA speedup over CPU, batch={batch_size}:\n{cuda_speedup_over_cpu}")
+        if assert_results:
+            print(f"CUDA max absolute difference to CPU, batch={batch_size}:\n{max_abs_diff_cuda_vs_cpu}")
+            print(f"CPU max absolute difference to Shapely, batch={batch_size}:\n{max_abs_diff_cpu}")
+            print(f"CUDA max absolute difference to Shapely, batch={batch_size}:\n{max_abs_diff_cuda}")
+
+    # Plots are generated from the Markdown files written above, using the output
+    # directory as both input and output.
+    plotted_files = plot_shapely_evaluation.plot_from_markdown_directory(
+        input_dir=args.output_dir,
+        output_dir=args.output_dir,
+        batch_sizes=batch_sizes,
+        annotate_plots=args.annotate_plots,
+    )
+    print(f"Generated {len(plotted_files)} plot image(s) from Markdown results.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cpu.md
new file mode 100644
index 0000000..1ac6974
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cpu.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 1.03e-03 |
9.861e-04 | 1.020e-03 | 1.141e-03 | 1.175e-03 | 1.157e-03 | 1.294e-03 | 1.536e-03 | 2.404e-03 | 3.742e-03 | 6.497e-03 | 1.486e-02 | +| 5 | 1.006e-03 | 1.011e-03 | 1.022e-03 | 1.044e-03 | 1.109e-03 | 1.292e-03 | 1.406e-03 | 1.77e-03 | 2.851e-03 | 4.664e-03 | 8.391e-03 | 2.134e-02 | +| 10 | 1.017e-03 | 1.018e-03 | 1.031e-03 | 1.056e-03 | 1.118e-03 | 1.32e-03 | 1.477e-03 | 1.912e-03 | 3.197e-03 | 5.305e-03 | 9.844e-03 | 2.811e-02 | +| 20 | 1.021e-03 | 1.029e-03 | 1.050e-03 | 1.07e-03 | 1.209e-03 | 1.318e-03 | 1.579e-03 | 2.032e-03 | 3.506e-03 | 6.075e-03 | 1.094e-02 | 2.844e-02 | +| 50 | 1.105e-03 | 1.113e-03 | 1.134e-03 | 1.157e-03 | 1.246e-03 | 1.509e-03 | 1.846e-03 | 2.308e-03 | 4.140e-03 | 7.052e-03 | 1.320e-02 | 4.408e-02 | +| 100 | 1.216e-03 | 1.238e-03 | 1.248e-03 | 1.277e-03 | 1.469e-03 | 1.582e-03 | 1.909e-03 | 2.581e-03 | 4.646e-03 | 8.162e-03 | 1.514e-02 | 5.076e-02 | +| 200 | 1.5e-03 | 1.494e-03 | 1.536e-03 | 1.604e-03 | 1.652e-03 | 1.869e-03 | 2.262e-03 | 3.043e-03 | 5.500e-03 | 9.331e-03 | 1.722e-02 | 7.581e-02 | +| 500 | 2.238e-03 | 2.232e-03 | 2.302e-03 | 2.347e-03 | 2.408e-03 | 2.654e-03 | 3.131e-03 | 4.002e-03 | 6.978e-03 | 1.108e-02 | 2.022e-02 | 9.638e-02 | +| 1000 | 3.430e-03 | 3.437e-03 | 3.476e-03 | 3.52e-03 | 3.594e-03 | 3.868e-03 | 4.464e-03 | 5.305e-03 | 8.287e-03 | 1.318e-02 | 2.306e-02 | 1.374e-01 | +| 2000 | 5.753e-03 | 5.774e-03 | 5.815e-03 | 5.88e-03 | 5.95e-03 | 6.256e-03 | 6.811e-03 | 7.942e-03 | 1.099e-02 | 1.65e-02 | 2.710e-02 | 1.768e-01 | +| 5000 | 1.279e-02 | 1.289e-02 | 1.285e-02 | 1.336e-02 | 1.37e-02 | 1.370e-02 | 1.45e-02 | 1.527e-02 | 1.876e-02 | 2.451e-02 | 3.842e-02 | 2.290e-01 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md new file mode 100644 index 0000000..ee2c4c8 --- /dev/null +++ 
b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 3.535e-03 | 3.428e-03 | 3.44e-03 | 3.456e-03 | 3.432e-03 | 3.443e-03 | 3.43e-03 | 3.521e-03 | 3.464e-03 | 3.47e-03 | 3.486e-03 | 4.911e-03 | +| 5 | 3.394e-03 | 3.39e-03 | 3.455e-03 | 3.415e-03 | 3.385e-03 | 3.388e-03 | 3.478e-03 | 3.472e-03 | 3.471e-03 | 3.533e-03 | 3.648e-03 | 5.321e-03 | +| 10 | 3.437e-03 | 3.397e-03 | 3.396e-03 | 3.392e-03 | 3.454e-03 | 3.384e-03 | 3.467e-03 | 3.482e-03 | 3.480e-03 | 3.476e-03 | 3.841e-03 | 5.798e-03 | +| 20 | 3.419e-03 | 3.408e-03 | 3.403e-03 | 3.381e-03 | 3.393e-03 | 3.442e-03 | 3.482e-03 | 3.464e-03 | 3.467e-03 | 3.468e-03 | 3.970e-03 | 6.137e-03 | +| 50 | 3.470e-03 | 3.4e-03 | 3.388e-03 | 3.394e-03 | 3.405e-03 | 3.382e-03 | 3.48e-03 | 3.528e-03 | 3.462e-03 | 3.471e-03 | 4.098e-03 | 6.448e-03 | +| 100 | 3.400e-03 | 3.453e-03 | 3.415e-03 | 3.401e-03 | 3.412e-03 | 3.411e-03 | 3.491e-03 | 3.417e-03 | 3.481e-03 | 3.535e-03 | 4.291e-03 | 6.711e-03 | +| 200 | 3.486e-03 | 3.396e-03 | 3.396e-03 | 3.454e-03 | 3.396e-03 | 3.403e-03 | 3.478e-03 | 3.460e-03 | 3.405e-03 | 3.461e-03 | 4.406e-03 | 7.067e-03 | +| 500 | 3.479e-03 | 3.389e-03 | 3.394e-03 | 3.409e-03 | 3.476e-03 | 3.385e-03 | 3.471e-03 | 3.474e-03 | 3.471e-03 | 3.594e-03 | 4.706e-03 | 7.429e-03 | +| 1000 | 3.478e-03 | 3.409e-03 | 3.407e-03 | 3.382e-03 | 3.383e-03 | 3.435e-03 | 3.453e-03 | 3.456e-03 | 3.463e-03 | 3.689e-03 | 5.009e-03 | 7.908e-03 | +| 2000 | 3.765e-03 | 3.787e-03 | 3.771e-03 | 3.777e-03 | 3.785e-03 | 3.795e-03 | 3.822e-03 | 3.857e-03 | 4.169e-03 | 4.674e-03 | 5.979e-03 | 9.683e-03 | +| 5000 | 6.e-03 | 5.96e-03 | 6.107e-03 | 6.115e-03 | 6.131e-03 | 6.205e-03 | 6.26e-03 | 6.297e-03 | 6.666e-03 | 7.521e-03 | 
9.576e-03 | 1.49e-02 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md new file mode 100644 index 0000000..218e33c --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 5.920e-03 | 5.929e-03 | 6.514e-03 | 7.182e-03 | 9.292e-03 | 1.258e-02 | 1.976e-02 | 3.39e-02 | 7.530e-02 | 1.522e-01 | 3.996e-01 | 1.684e+00 | +| 5 | 5.928e-03 | 5.962e-03 | 6.537e-03 | 7.384e-03 | 1.018e-02 | 1.435e-02 | 2.246e-02 | 3.783e-02 | 9.250e-02 | 2.432e-01 | 4.686e-01 | 1.798e+00 | +| 10 | 6.044e-03 | 6.014e-03 | 6.824e-03 | 7.726e-03 | 9.796e-03 | 1.534e-02 | 2.687e-02 | 4.409e-02 | 1.118e-01 | 2.867e-01 | 5.987e-01 | 2.145e+00 | +| 20 | 6.032e-03 | 6.105e-03 | 6.865e-03 | 8.794e-03 | 1.120e-02 | 2.134e-02 | 3.391e-02 | 6.222e-02 | 1.556e-01 | 3.645e-01 | 6.786e-01 | 2.637e+00 | +| 50 | 6.499e-03 | 6.279e-03 | 8.662e-03 | 9.916e-03 | 1.697e-02 | 3.003e-02 | 5.847e-02 | 1.228e-01 | 2.529e-01 | 6.603e-01 | 1.177e+00 | 3.703e+00 | +| 100 | 6.479e-03 | 7.37e-03 | 1.051e-02 | 1.371e-02 | 2.748e-02 | 4.721e-02 | 1.027e-01 | 1.929e-01 | 5.007e-01 | 9.981e-01 | 2.129e+00 | 6.030e+00 | +| 200 | 7.691e-03 | 9.454e-03 | 1.444e-02 | 2.263e-02 | 3.906e-02 | 9.756e-02 | 1.886e-01 | 3.467e-01 | 8.815e-01 | 1.792e+00 | 3.648e+00 | 1.008e+01 | +| 500 | 8.883e-03 | 1.283e-02 | 3.616e-02 | 3.52e-02 | 8.427e-02 | 2.162e-01 | 4.081e-01 | 8.556e-01 | 2.177e+00 | 4.328e+00 | 8.686e+00 | 2.235e+01 | +| 1000 | 8.225e-03 | 2.181e-02 | 5.647e-02 | 8.705e-02 | 1.809e-01 | 4.044e-01 | 7.995e-01 | 1.644e+00 | 4.172e+00 | 8.583e+00 | 
1.69e+01 | 4.323e+01 | +| 2000 | 3.628e-02 | 4.154e-02 | 6.644e-02 | 1.663e-01 | 4.074e-01 | 8.366e-01 | 1.748e+00 | 3.414e+00 | 8.769e+00 | 1.775e+01 | 3.397e+01 | 8.585e+01 | +| 5000 | 5.857e-02 | 1.474e-01 | 2.871e-01 | 5.538e-01 | 8.380e-01 | 2.379e+00 | 4.255e+00 | 8.153e+00 | 2.054e+01 | 4.167e+01 | 8.298e+01 | 2.090e+02 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md new file mode 100644 index 0000000..4d3f267 --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 5.75 | 6.01 | 6.38 | 6.30 | 7.91 | 10.87 | 15.27 | 22.07 | 31.33 | 40.68 | 61.51 | 113.29 | +| 5 | 5.89 | 5.90 | 6.40 | 7.07 | 9.18 | 11.11 | 15.98 | 21.38 | 32.44 | 52.15 | 55.84 | 84.26 | +| 10 | 5.94 | 5.91 | 6.62 | 7.32 | 8.76 | 11.63 | 18.19 | 23.06 | 34.97 | 54.04 | 60.82 | 76.29 | +| 20 | 5.91 | 5.93 | 6.54 | 8.22 | 9.26 | 16.20 | 21.48 | 30.62 | 44.39 | 60.01 | 62.01 | 92.75 | +| 50 | 5.88 | 5.64 | 7.64 | 8.57 | 13.61 | 19.90 | 31.67 | 53.21 | 61.08 | 93.63 | 89.16 | 84.01 | +| 100 | 5.33 | 5.96 | 8.42 | 10.73 | 18.70 | 29.85 | 53.82 | 74.74 | 107.76 | 122.28 | 140.68 | 118.80 | +| 200 | 5.13 | 6.33 | 9.40 | 14.11 | 23.64 | 52.19 | 83.38 | 113.93 | 160.26 | 192.08 | 211.85 | 132.97 | +| 500 | 3.97 | 5.75 | 15.71 | 15.00 | 34.99 | 81.47 | 130.37 | 213.79 | 311.96 | 390.72 | 429.62 | 231.94 | +| 1000 | 2.40 | 6.35 | 16.25 | 24.73 | 50.34 | 104.54 | 179.10 | 309.90 | 503.46 | 651.02 | 732.57 | 314.65 | +| 2000 | 6.31 | 7.19 | 11.43 | 28.28 | 68.47 | 133.72 | 256.58 | 429.83 | 797.87 | 1.08e+03 | 
1.25e+03 | 485.50 | +| 5000 | 4.58 | 11.44 | 22.34 | 41.47 | 61.17 | 173.59 | 293.48 | 533.92 | 1.09e+03 | 1.70e+03 | 2.16e+03 | 912.55 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md new file mode 100644 index 0000000..487290e --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 0.29 | 0.29 | 0.30 | 0.33 | 0.34 | 0.34 | 0.38 | 0.44 | 0.69 | 1.08 | 1.86 | 3.03 | +| 5 | 0.30 | 0.30 | 0.30 | 0.31 | 0.33 | 0.38 | 0.40 | 0.51 | 0.82 | 1.32 | 2.30 | 4.01 | +| 10 | 0.30 | 0.30 | 0.30 | 0.31 | 0.32 | 0.39 | 0.43 | 0.55 | 0.92 | 1.53 | 2.56 | 4.85 | +| 20 | 0.30 | 0.30 | 0.31 | 0.32 | 0.36 | 0.38 | 0.45 | 0.59 | 1.01 | 1.75 | 2.76 | 4.63 | +| 50 | 0.32 | 0.33 | 0.33 | 0.34 | 0.37 | 0.45 | 0.53 | 0.65 | 1.20 | 2.03 | 3.22 | 6.84 | +| 100 | 0.36 | 0.36 | 0.37 | 0.38 | 0.43 | 0.46 | 0.55 | 0.76 | 1.33 | 2.31 | 3.53 | 7.56 | +| 200 | 0.43 | 0.44 | 0.45 | 0.46 | 0.49 | 0.55 | 0.65 | 0.88 | 1.62 | 2.70 | 3.91 | 10.73 | +| 500 | 0.64 | 0.66 | 0.68 | 0.69 | 0.69 | 0.78 | 0.90 | 1.15 | 2.01 | 3.08 | 4.30 | 12.97 | +| 1000 | 0.99 | 1.01 | 1.02 | 1.04 | 1.06 | 1.13 | 1.29 | 1.53 | 2.39 | 3.57 | 4.60 | 17.37 | +| 2000 | 1.53 | 1.52 | 1.54 | 1.56 | 1.57 | 1.65 | 1.78 | 2.06 | 2.64 | 3.53 | 4.53 | 18.26 | +| 5000 | 2.13 | 2.16 | 2.10 | 2.18 | 2.23 | 2.21 | 2.32 | 2.43 | 2.81 | 3.26 | 4.01 | 15.38 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md 
b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md new file mode 100644 index 0000000..1125800 --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 1.67 | 1.73 | 1.89 | 2.08 | 2.71 | 3.65 | 5.76 | 9.63 | 21.74 | 43.87 | 114.64 | 342.86 | +| 5 | 1.75 | 1.76 | 1.89 | 2.16 | 3.01 | 4.23 | 6.46 | 10.89 | 26.65 | 68.85 | 128.46 | 337.91 | +| 10 | 1.76 | 1.77 | 2.01 | 2.28 | 2.84 | 4.53 | 7.75 | 12.66 | 32.12 | 82.47 | 155.86 | 369.85 | +| 20 | 1.76 | 1.79 | 2.02 | 2.60 | 3.30 | 6.20 | 9.74 | 17.96 | 44.88 | 105.11 | 170.94 | 429.75 | +| 50 | 1.87 | 1.85 | 2.56 | 2.92 | 4.98 | 8.88 | 16.80 | 34.81 | 73.04 | 190.20 | 287.27 | 574.20 | +| 100 | 1.91 | 2.13 | 3.08 | 4.03 | 8.05 | 13.84 | 29.42 | 56.46 | 143.84 | 282.36 | 496.26 | 898.57 | +| 200 | 2.21 | 2.78 | 4.25 | 6.55 | 11.50 | 28.67 | 54.23 | 100.20 | 258.88 | 517.84 | 827.99 | 1.43e+03 | +| 500 | 2.55 | 3.79 | 10.65 | 10.32 | 24.24 | 63.87 | 117.57 | 246.30 | 627.28 | 1.20e+03 | 1.85e+03 | 3.01e+03 | +| 1000 | 2.36 | 6.40 | 16.58 | 25.74 | 53.47 | 117.71 | 231.55 | 475.60 | 1.20e+03 | 2.33e+03 | 3.37e+03 | 5.47e+03 | +| 2000 | 9.63 | 10.97 | 17.62 | 44.03 | 107.63 | 220.43 | 457.23 | 885.05 | 2.10e+03 | 3.80e+03 | 5.68e+03 | 8.87e+03 | +| 5000 | 9.76 | 24.73 | 47.01 | 90.57 | 136.68 | 383.37 | 679.79 | 1.29e+03 | 3.08e+03 | 5.54e+03 | 8.67e+03 | 1.40e+04 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md new file mode 100644 index 0000000..584833d --- /dev/null +++ 
b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 3.839e-03 | 2.263e-03 | 2.751e-03 | 3.603e-03 | 5.367e-03 | 1.043e-02 | 1.963e-02 | 3.62e-02 | 8.759e-02 | 1.75e-01 | 3.531e-01 | 8.816e-01 | +| 5 | 2.581e-03 | 2.852e-03 | 3.709e-03 | 5.404e-03 | 8.689e-03 | 2.09e-02 | 4.92e-02 | 1.152e-01 | 3.25e-01 | 6.775e-01 | 1.329e+00 | 3.191e+00 | +| 10 | 3.203e-03 | 3.694e-03 | 4.880e-03 | 6.879e-03 | 1.078e-02 | 2.716e-02 | 6.540e-02 | 1.654e-01 | 4.613e-01 | 9.447e-01 | 1.831e+00 | 4.547e+00 | +| 20 | 4.909e-03 | 5.348e-03 | 6.714e-03 | 8.853e-03 | 1.4e-02 | 3.155e-02 | 8.694e-02 | 2.126e-01 | 5.852e-01 | 1.196e+00 | 2.377e+00 | 5.819e+00 | +| 50 | 9.96e-03 | 1.06e-02 | 1.193e-02 | 1.534e-02 | 2.064e-02 | 4.532e-02 | 1.234e-01 | 2.898e-01 | 7.611e-01 | 1.517e+00 | 3.015e+00 | 7.419e+00 | +| 100 | 1.739e-02 | 1.813e-02 | 2.019e-02 | 2.349e-02 | 2.952e-02 | 5.775e-02 | 1.613e-01 | 3.547e-01 | 8.880e-01 | 1.768e+00 | 3.481e+00 | 8.625e+00 | +| 200 | 3.283e-02 | 3.274e-02 | 3.517e-02 | 3.895e-02 | 4.640e-02 | 8.618e-02 | 2.034e-01 | 4.200e-01 | 1.036e+00 | 2.037e+00 | 4.028e+00 | 9.911e+00 | +| 500 | 7.66e-02 | 7.725e-02 | 8.045e-02 | 8.389e-02 | 9.346e-02 | 1.439e-01 | 2.970e-01 | 5.451e-01 | 1.259e+00 | 2.425e+00 | 4.751e+00 | 1.168e+01 | +| 1000 | 1.509e-01 | 1.580e-01 | 1.543e-01 | 1.593e-01 | 1.691e-01 | 2.349e-01 | 4.054e-01 | 6.792e-01 | 1.476e+00 | 2.797e+00 | 5.417e+00 | 1.32e+01 | +| 2000 | 3.070e-01 | 3.025e-01 | 3.032e-01 | 3.147e-01 | 3.206e-01 | 4.106e-01 | 5.676e-01 | 9.091e-01 | 1.797e+00 | 3.308e+00 | 6.246e+00 | 1.515e+01 | +| 5000 | 7.461e-01 | 7.505e-01 | 7.574e-01 | 7.580e-01 | 7.719e-01 | 8.920e-01 | 1.084e+00 | 1.424e+00 | 2.472e+00 | 4.160e+00 | 
7.566e+00 | 1.774e+01 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md new file mode 100644 index 0000000..66190a3 --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 8.093e-03 | 4.831e-03 | 4.721e-03 | 4.601e-03 | 4.671e-03 | 4.493e-03 | 4.649e-03 | 4.688e-03 | 4.626e-03 | 4.676e-03 | 5.169e-03 | 8.322e-03 | +| 5 | 4.795e-03 | 4.643e-03 | 4.625e-03 | 4.689e-03 | 4.613e-03 | 4.672e-03 | 4.744e-03 | 4.554e-03 | 4.539e-03 | 5.416e-03 | 5.288e-03 | 8.217e-03 | +| 10 | 4.59e-03 | 4.752e-03 | 5.439e-03 | 4.58e-03 | 4.614e-03 | 4.622e-03 | 4.530e-03 | 4.682e-03 | 4.752e-03 | 4.708e-03 | 5.339e-03 | 8.593e-03 | +| 20 | 4.6e-03 | 4.605e-03 | 4.989e-03 | 4.693e-03 | 4.636e-03 | 5.333e-03 | 4.715e-03 | 4.553e-03 | 4.573e-03 | 4.735e-03 | 5.519e-03 | 9.150e-03 | +| 50 | 4.689e-03 | 4.66e-03 | 4.779e-03 | 4.624e-03 | 4.725e-03 | 4.518e-03 | 4.727e-03 | 4.716e-03 | 5.022e-03 | 4.643e-03 | 5.667e-03 | 9.064e-03 | +| 100 | 4.615e-03 | 4.651e-03 | 4.751e-03 | 4.623e-03 | 4.545e-03 | 4.623e-03 | 4.731e-03 | 4.651e-03 | 5.399e-03 | 4.741e-03 | 1.487e-02 | 9.328e-03 | +| 200 | 4.689e-03 | 5.153e-03 | 4.695e-03 | 4.742e-03 | 4.776e-03 | 4.716e-03 | 5.547e-03 | 4.72e-03 | 4.628e-03 | 4.870e-03 | 5.972e-03 | 9.408e-03 | +| 500 | 4.613e-03 | 4.76e-03 | 4.683e-03 | 4.732e-03 | 5.307e-03 | 4.693e-03 | 4.659e-03 | 4.737e-03 | 5.204e-03 | 4.969e-03 | 6.236e-03 | 9.575e-03 | +| 1000 | 4.701e-03 | 4.749e-03 | 4.716e-03 | 4.71e-03 | 4.676e-03 | 4.783e-03 | 4.781e-03 | 4.735e-03 | 4.820e-03 | 5.343e-03 | 6.652e-03 | 
1.028e-02 | +| 2000 | 5.166e-03 | 5.164e-03 | 5.182e-03 | 5.161e-03 | 5.152e-03 | 5.738e-03 | 5.541e-03 | 5.356e-03 | 5.642e-03 | 6.490e-03 | 7.966e-03 | 1.232e-02 | +| 5000 | 7.627e-03 | 7.593e-03 | 7.564e-03 | 7.658e-03 | 7.698e-03 | 7.712e-03 | 7.797e-03 | 7.8e-03 | 8.731e-03 | 1.037e-02 | 1.327e-02 | 2.230e-02 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md new file mode 100644 index 0000000..afd2356 --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 2.728e-02 | 3.092e-02 | 5.674e-02 | 1.079e-01 | 1.983e-01 | 4.919e-01 | 9.971e-01 | 7.669e+00 | 1.382e+01 | 3.652e+01 | 7.587e+01 | 2.001e+02 | +| 5 | 2.908e-02 | 3.597e-02 | 6.759e-02 | 1.316e-01 | 2.390e-01 | 5.987e-01 | 5.551e+00 | 5.53e+00 | 1.544e+01 | 3.595e+01 | 8.467e+01 | 2.072e+02 | +| 10 | 3.137e-02 | 4.208e-02 | 8.376e-02 | 1.58e-01 | 3.011e-01 | 4.584e+00 | 1.489e+00 | 5.831e+00 | 1.888e+01 | 3.929e+01 | 8.577e+01 | 2.247e+02 | +| 20 | 3.699e-02 | 5.445e-02 | 1.104e-01 | 2.123e-01 | 4.003e-01 | 1.003e+00 | 5.894e+00 | 6.842e+00 | 2.179e+01 | 4.142e+01 | 1.000e+02 | 2.533e+02 | +| 50 | 5.549e-02 | 8.991e-02 | 1.946e-01 | 3.639e-01 | 7.197e-01 | 1.785e+00 | 3.579e+00 | 7.337e+00 | 3.250e+01 | 5.705e+01 | 1.321e+02 | 3.331e+02 | +| 100 | 8.595e-02 | 1.505e-01 | 3.145e-01 | 6.342e-01 | 1.227e+00 | 3.114e+00 | 6.111e+00 | 1.251e+01 | 4.497e+01 | 8.732e+01 | 1.823e+02 | 4.563e+02 | +| 200 | 1.464e-01 | 2.415e-01 | 6.012e-01 | 1.157e+00 | 2.252e+00 | 5.739e+00 | 1.556e+01 | 2.64e+01 | 6.973e+01 | 1.423e+02 | 2.886e+02 | 
7.270e+02 | +| 500 | 3.363e-01 | 5.55e-01 | 1.525e+00 | 2.669e+00 | 5.563e+00 | 1.352e+01 | 3.084e+01 | 5.781e+01 | 1.467e+02 | 2.977e+02 | 6.112e+02 | 1.519e+03 | +| 1000 | 6.443e-01 | 1.201e+00 | 2.763e+00 | 5.361e+00 | 1.078e+01 | 2.608e+01 | 5.396e+01 | 1.076e+02 | 2.877e+02 | 5.609e+02 | 1.143e+03 | 2.861e+03 | +| 2000 | 1.189e+00 | 1.968e+00 | 5.459e+00 | 1.057e+01 | 2.136e+01 | 5.446e+01 | 1.096e+02 | 2.185e+02 | 5.670e+02 | 1.114e+03 | 2.244e+03 | 5.570e+03 | +| 5000 | 5.404e+00 | 5.521e+00 | 1.321e+01 | 2.739e+01 | 5.495e+01 | 1.321e+02 | 2.654e+02 | 5.364e+02 | 1.375e+03 | 2.726e+03 | 5.435e+03 | 1.369e+04 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md new file mode 100644 index 0000000..929b485 --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 7.11 | 13.66 | 20.62 | 29.95 | 36.94 | 47.14 | 50.78 | 211.88 | 157.80 | 208.71 | 214.88 | 226.96 | +| 5 | 11.27 | 12.61 | 18.22 | 24.35 | 27.51 | 28.65 | 112.84 | 48.00 | 47.51 | 53.07 | 63.71 | 64.93 | +| 10 | 9.79 | 11.39 | 17.16 | 22.96 | 27.92 | 168.75 | 22.77 | 35.24 | 40.93 | 41.59 | 46.86 | 49.41 | +| 20 | 7.53 | 10.18 | 16.45 | 23.99 | 28.60 | 31.78 | 67.79 | 32.19 | 37.23 | 34.62 | 42.08 | 43.52 | +| 50 | 5.57 | 8.48 | 16.31 | 23.72 | 34.86 | 39.38 | 29.00 | 25.31 | 42.71 | 37.61 | 43.83 | 44.90 | +| 100 | 4.94 | 8.30 | 15.58 | 27.00 | 41.57 | 53.93 | 37.89 | 35.28 | 50.64 | 49.39 | 52.37 | 52.91 | +| 200 | 4.46 | 7.38 | 17.09 | 29.70 | 48.53 | 66.59 | 76.49 | 62.85 | 67.33 | 69.88 | 71.66 | 73.36 
| +| 500 | 4.39 | 7.18 | 18.96 | 31.82 | 59.52 | 93.93 | 103.85 | 106.04 | 116.48 | 122.75 | 128.65 | 130.10 | +| 1000 | 4.27 | 7.60 | 17.91 | 33.65 | 63.73 | 111.04 | 133.13 | 158.49 | 194.95 | 200.54 | 210.95 | 216.73 | +| 2000 | 3.87 | 6.51 | 18.00 | 33.57 | 66.62 | 132.64 | 193.13 | 240.29 | 315.51 | 336.89 | 359.30 | 367.59 | +| 5000 | 7.24 | 7.36 | 17.44 | 36.14 | 71.19 | 148.05 | 244.92 | 376.69 | 556.45 | 655.34 | 718.35 | 771.81 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md new file mode 100644 index 0000000..39bcbd0 --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 0.47 | 0.47 | 0.58 | 0.78 | 1.15 | 2.32 | 4.22 | 7.72 | 18.93 | 37.42 | 68.31 | 105.94 | +| 5 | 0.54 | 0.61 | 0.80 | 1.15 | 1.88 | 4.47 | 10.37 | 25.30 | 71.59 | 125.09 | 251.34 | 388.32 | +| 10 | 0.70 | 0.78 | 0.90 | 1.50 | 2.34 | 5.88 | 14.44 | 35.34 | 97.07 | 200.67 | 342.88 | 529.19 | +| 20 | 1.07 | 1.16 | 1.35 | 1.89 | 3.02 | 5.92 | 18.44 | 46.69 | 127.98 | 252.67 | 430.68 | 636.00 | +| 50 | 2.12 | 2.27 | 2.50 | 3.32 | 4.37 | 10.03 | 26.11 | 61.46 | 151.55 | 326.74 | 532.01 | 818.56 | +| 100 | 3.77 | 3.90 | 4.25 | 5.08 | 6.49 | 12.49 | 34.09 | 76.28 | 164.48 | 372.91 | 234.16 | 924.55 | +| 200 | 7.00 | 6.35 | 7.49 | 8.21 | 9.72 | 18.28 | 36.67 | 89.00 | 223.79 | 418.30 | 674.41 | 1.05e+03 | +| 500 | 16.61 | 16.23 | 17.18 | 17.73 | 17.61 | 30.67 | 63.76 | 115.08 | 242.03 | 488.09 | 761.85 | 1.22e+03 | +| 1000 | 32.10 | 33.27 | 32.72 | 33.83 | 36.17 | 49.11 | 84.79 | 143.44 | 306.20 | 523.52 
| 814.37 | 1.28e+03 | +| 2000 | 59.44 | 58.58 | 58.52 | 60.99 | 62.24 | 71.56 | 102.44 | 169.74 | 318.56 | 509.73 | 784.05 | 1.23e+03 | +| 5000 | 97.82 | 98.84 | 100.13 | 98.98 | 100.27 | 115.66 | 139.00 | 182.58 | 283.08 | 401.24 | 570.04 | 795.20 | diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md new file mode 100644 index 0000000..12e13eb --- /dev/null +++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md @@ -0,0 +1,13 @@ +| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 | +| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| 2 | 3.37 | 6.40 | 12.02 | 23.45 | 42.45 | 109.46 | 214.50 | 1.64e+03 | 2.99e+03 | 7.81e+03 | 1.47e+04 | 2.40e+04 | +| 5 | 6.06 | 7.75 | 14.61 | 28.06 | 51.83 | 128.15 | 1.17e+03 | 1.21e+03 | 3.40e+03 | 6.64e+03 | 1.60e+04 | 2.52e+04 | +| 10 | 6.83 | 8.85 | 15.40 | 34.49 | 65.26 | 991.79 | 328.65 | 1.25e+03 | 3.97e+03 | 8.35e+03 | 1.61e+04 | 2.61e+04 | +| 20 | 8.04 | 11.83 | 22.13 | 45.25 | 86.35 | 188.01 | 1.25e+03 | 1.50e+03 | 4.77e+03 | 8.75e+03 | 1.81e+04 | 2.77e+04 | +| 50 | 11.84 | 19.30 | 40.72 | 78.71 | 152.33 | 394.99 | 757.11 | 1.56e+03 | 6.47e+03 | 1.23e+04 | 2.33e+04 | 3.68e+04 | +| 100 | 18.63 | 32.36 | 66.20 | 137.20 | 269.98 | 673.70 | 1.29e+03 | 2.69e+03 | 8.33e+03 | 1.84e+04 | 1.23e+04 | 4.89e+04 | +| 200 | 31.22 | 46.87 | 128.05 | 243.96 | 471.56 | 1.22e+03 | 2.80e+03 | 5.59e+03 | 1.51e+04 | 2.92e+04 | 4.83e+04 | 7.73e+04 | +| 500 | 72.90 | 116.59 | 325.71 | 564.06 | 1.05e+03 | 2.88e+03 | 6.62e+03 | 1.22e+04 | 2.82e+04 | 5.99e+04 | 9.80e+04 | 1.59e+05 | +| 1000 | 137.05 | 252.94 | 585.84 | 1.14e+03 | 2.31e+03 | 5.45e+03 | 1.13e+04 | 2.27e+04 | 5.97e+04 | 
1.05e+05 | 1.72e+05 | 2.78e+05 | +| 2000 | 230.13 | 381.18 | 1.05e+03 | 2.05e+03 | 4.15e+03 | 9.49e+03 | 1.98e+04 | 4.08e+04 | 1.01e+05 | 1.72e+05 | 2.82e+05 | 4.52e+05 | +| 5000 | 708.50 | 727.06 | 1.75e+03 | 3.58e+03 | 7.14e+03 | 1.71e+04 | 3.40e+04 | 6.88e+04 | 1.58e+05 | 2.63e+05 | 4.09e+05 | 6.14e+05 | diff --git a/packages/lane_helpers/examples/basic_usage.py b/packages/lane_helpers/examples/basic_usage.py new file mode 100644 index 0000000..a099fd8 --- /dev/null +++ b/packages/lane_helpers/examples/basic_usage.py @@ -0,0 +1,53 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from accvlab.lane_helpers import polyline + + +def main() -> None: + if not torch.cuda.is_available(): + raise RuntimeError("This example requires a CUDA-capable PyTorch installation.") + + # @NOTE Use one closed 1 x 2 rectangle as the polyline; the tensor shape is (batch=1, num_points=5, num_dims=2). + points = torch.tensor( + [ + [ + [0.0, 0.0], + [1.0, 0.0], + [1.0, 2.0], + [0.0, 2.0], + [0.0, 0.0], + ] + ], + device="cuda", + dtype=torch.float32, + ) + + # @NOTE Distances at which to sample the polyline, with shape (batch=1, num_distances=5); the rectangle's perimeter is 6.0. + distances = torch.tensor([[0.0, 0.5, 1.0, 3.0, 6.0]], device="cuda", dtype=torch.float32) + + # @NOTE Interpolate the polyline at the given distances. + sampled_points = polyline.interpolate(points, distances) + # @NOTE Compute the length of each polyline in the batch. 
+ line_lengths = polyline.lengths(points) + + # @NOTE Print the results. + print(f"Interpolated points:\n{sampled_points}") + print(f"Line length(s): {line_lengths}") + + +if __name__ == "__main__": + main() diff --git a/packages/lane_helpers/ext_impl/CMakeLists.txt b/packages/lane_helpers/ext_impl/CMakeLists.txt new file mode 100644 index 0000000..bfab1b2 --- /dev/null +++ b/packages/lane_helpers/ext_impl/CMakeLists.txt @@ -0,0 +1,55 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cmake_minimum_required(VERSION 3.18) + +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT DEFINED ENV{CUDAARCHS}) # Must precede project(): enabling the CUDA language initializes this variable, which would make the check always false. + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24") + set(CMAKE_CUDA_ARCHITECTURES native) + else() + set(CMAKE_CUDA_ARCHITECTURES "75;80;86") + endif() +endif() +project(accvlab_lane_helpers_ext LANGUAGES CXX CUDA) + +if(NOT DEFINED CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +find_package(Python COMPONENTS Interpreter Development REQUIRED) # Resolve the interpreter first so torch is queried with the same Python the extension is built for. +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import torch; import os; print(os.path.join(os.path.dirname(torch.__file__), 'share', 'cmake'))" + OUTPUT_VARIABLE TORCH_CMAKE_PATH + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +list(APPEND CMAKE_PREFIX_PATH "${TORCH_CMAKE_PATH}") + +find_package(CUDA REQUIRED) +find_package(Torch REQUIRED) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -m pybind11 --cmakedir + OUTPUT_VARIABLE pybind11_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +find_package(pybind11 REQUIRED) + +separate_arguments(TORCH_CXX_FLAGS_LIST NATIVE_COMMAND "${TORCH_CXX_FLAGS}") + +find_library(TORCH_PYTHON_LIBRARY torch_python PATHS ${TORCH_INSTALL_PREFIX}/lib) + +add_subdirectory(polyline) diff --git a/packages/lane_helpers/ext_impl/polyline/CMakeLists.txt b/packages/lane_helpers/ext_impl/polyline/CMakeLists.txt new file mode 100644 index 0000000..207431b --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/CMakeLists.txt @@ -0,0 +1,50 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +pybind11_add_module(accvlab_lane_helpers_polyline_sampling MODULE + src/polyline.cu + src/polyline_cpu.cpp + src/polyline.cpp +) + +set_target_properties(accvlab_lane_helpers_polyline_sampling PROPERTIES + CXX_STANDARD 17 + CUDA_STANDARD 17 + OUTPUT_NAME "_polyline_sampling" + PREFIX "" +) + +target_compile_definitions(accvlab_lane_helpers_polyline_sampling PRIVATE + TORCH_EXTENSION_NAME=_polyline_sampling + TORCH_API_INCLUDE_EXTENSION_H +) + +target_compile_options(accvlab_lane_helpers_polyline_sampling PRIVATE ${TORCH_CXX_FLAGS_LIST}) + +target_link_libraries(accvlab_lane_helpers_polyline_sampling PRIVATE + ${TORCH_LIBRARIES} + ${CUDA_LIBRARIES} + ${TORCH_PYTHON_LIBRARY} +) + +target_include_directories(accvlab_lane_helpers_polyline_sampling PRIVATE + ${TORCH_INCLUDE_DIRS} + ${CUDA_INCLUDE_DIRS} + include/ +) + +install(TARGETS accvlab_lane_helpers_polyline_sampling + LIBRARY DESTINATION . + RUNTIME DESTINATION . +) diff --git a/packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh b/packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh new file mode 100644 index 0000000..441220a --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef POLYLINE_SAMPLING_CUDA_HELPER_MACROS_CUH +#define POLYLINE_SAMPLING_CUDA_HELPER_MACROS_CUH + +#include + +#define CUDA_CHECK(error_code_or_call) C10_CUDA_CHECK(error_code_or_call) +#define CUDA_CHECK_LAST() C10_CUDA_CHECK(cudaGetLastError()) + +#endif \ No newline at end of file diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline.cuh new file mode 100644 index 0000000..01ce298 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/include/polyline.cuh @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Host-visible interface for the polyline interpolation CUDA +// implementation. This header is intentionally free of CUDA device intrinsics +// so it can be included from both C++ and CUDA translation units. 
+ +#ifndef LANE_HELPERS_POLYLINE_CUH +#define LANE_HELPERS_POLYLINE_CUH + +#include +#include + +#include +#include +#include + +namespace polyline { + +template +struct PolylineLaunchConfig { + dim3 block_dim; + dim3 grid_dim; + int num_points_full_blocks; + size_t shared_mem_size; + size_t distance_buffer_ext_size_elems; + bool use_shared_distances; + size_t max_shared_full; +}; + +template +PolylineLaunchConfig make_polyline_launch_config(int num_points, int num_samples, int device); + +template +void polyline_interpolation(dtype* points, int num_points, int num_dims, dtype* distances, int num_distances, + dtype* result_points, int num_samples, bool relative_distances, int device, + const PolylineLaunchConfig& cfg, dtype* distance_buffer_ext, + cudaStream_t stream); + +template +void polyline_lengths(dtype* points, int num_points, int num_dims, dtype* lengths, int num_samples, + cudaStream_t stream); + +template +void polyline_interpolation_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* distances, + int num_distances, dtype* result_points, int num_samples, + sample_size_dtype* sample_sizes_points, + sample_size_dtype* sample_sizes_distances_to_sample, + bool relative_distances, int device, + const PolylineLaunchConfig& cfg, dtype* distance_buffer_ext, + cudaStream_t stream); + +template +void polyline_lengths_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* lengths, + int num_samples, sample_size_dtype* sample_sizes_points, + cudaStream_t stream); + +template +void polyline_interpolation_cpu(const dtype* points, int num_points, int num_dims, const dtype* distances, + int num_distances, dtype* result_points, int num_samples, + bool relative_distances); + +template +void polyline_lengths_cpu(const dtype* points, int num_points, int num_dims, dtype* lengths, int num_samples); + +template +void polyline_interpolation_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims, + const dtype* distances, int 
num_distances, + dtype* result_points, int num_samples, + const sample_size_dtype* sample_sizes_points, + const sample_size_dtype* sample_sizes_distances_to_sample, + bool relative_distances); + +template +void polyline_lengths_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims, + dtype* lengths, int num_samples, + const sample_size_dtype* sample_sizes_points); + +// Explicit instantiations are provided in polyline.cu and polyline_cpu.cpp. +#define DECLARE_POLYLINE_LAUNCH_CONFIG_EXTERN(DTYPE) \ + extern template PolylineLaunchConfig make_polyline_launch_config( \ + int num_points, int num_samples, int device); + +#define DECLARE_POLYLINE_INTERPOLATION_EXTERN(DTYPE) \ + extern template void polyline_interpolation( \ + DTYPE * points, int num_points, int num_dims, DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, bool relative_distances, int device, \ + const PolylineLaunchConfig& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream); + +#define DECLARE_POLYLINE_LENGTHS_EXTERN(DTYPE) \ + extern template void polyline_lengths(DTYPE * points, int num_points, int num_dims, \ + DTYPE* lengths, int num_samples, cudaStream_t stream); + +#define DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE) \ + extern template void polyline_interpolation_var_size_batch( \ + DTYPE * points, int max_num_points, int num_dims, DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, SAMPLE_SIZE_DTYPE* sample_sizes_points, \ + SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances, int device, \ + const PolylineLaunchConfig& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream); + +#define DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE) \ + extern template void polyline_lengths_var_size_batch( \ + DTYPE * points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \ + SAMPLE_SIZE_DTYPE* sample_sizes_points, cudaStream_t stream); 
+ +#define DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(DTYPE) \ + DECLARE_POLYLINE_LAUNCH_CONFIG_EXTERN(DTYPE) \ + DECLARE_POLYLINE_INTERPOLATION_EXTERN(DTYPE) \ + DECLARE_POLYLINE_LENGTHS_EXTERN(DTYPE) \ + DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN(DTYPE, int) \ + DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN(DTYPE, int64_t) \ + DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN(DTYPE, int) \ + DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN(DTYPE, int64_t) + +DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(float) +DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(double) +DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(c10::Half) +DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(c10::BFloat16) + +#undef DECLARE_POLYLINE_CUDA_DTYPE_EXTERN +#undef DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN +#undef DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN +#undef DECLARE_POLYLINE_LENGTHS_EXTERN +#undef DECLARE_POLYLINE_INTERPOLATION_EXTERN +#undef DECLARE_POLYLINE_LAUNCH_CONFIG_EXTERN + +#define DECLARE_POLYLINE_INTERPOLATION_CPU_EXTERN(DTYPE) \ + extern template void polyline_interpolation_cpu( \ + const DTYPE* points, int num_points, int num_dims, const DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, bool relative_distances); + +#define DECLARE_POLYLINE_LENGTHS_CPU_EXTERN(DTYPE) \ + extern template void polyline_lengths_cpu(const DTYPE* points, int num_points, int num_dims, \ + DTYPE* lengths, int num_samples); + +#define DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE) \ + extern template void polyline_interpolation_var_size_batch_cpu( \ + const DTYPE* points, int max_num_points, int num_dims, const DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, const SAMPLE_SIZE_DTYPE* sample_sizes_points, \ + const SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances); + +#define DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE) \ + extern template void 
polyline_lengths_var_size_batch_cpu( \ + const DTYPE* points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \ + const SAMPLE_SIZE_DTYPE* sample_sizes_points); + +#define DECLARE_POLYLINE_CPU_DTYPE_EXTERN(DTYPE) \ + DECLARE_POLYLINE_INTERPOLATION_CPU_EXTERN(DTYPE) \ + DECLARE_POLYLINE_LENGTHS_CPU_EXTERN(DTYPE) \ + DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int) \ + DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int64_t) \ + DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int) \ + DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int64_t) + +DECLARE_POLYLINE_CPU_DTYPE_EXTERN(float) +DECLARE_POLYLINE_CPU_DTYPE_EXTERN(double) + +#undef DECLARE_POLYLINE_CPU_DTYPE_EXTERN +#undef DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN +#undef DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN +#undef DECLARE_POLYLINE_LENGTHS_CPU_EXTERN +#undef DECLARE_POLYLINE_INTERPOLATION_CPU_EXTERN +} // namespace polyline + +#endif // LANE_HELPERS_POLYLINE_CUH diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh new file mode 100644 index 0000000..625dacf --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LANE_HELPERS_POLYLINE_COMMON_CUH +#define LANE_HELPERS_POLYLINE_COMMON_CUH + +#include +#include + +#include "polyline_dtype_compat.cuh" + +#ifdef __CUDACC__ +// Keep scalar helpers callable from both CUDA kernels and CPU translation units. +#define POLYLINE_HOST_DEVICE_INLINE __host__ __device__ __forceinline__ +#else +#define POLYLINE_HOST_DEVICE_INLINE inline +#endif + +namespace polyline { + +template +POLYLINE_HOST_DEVICE_INLINE dtype polyline_nan() { + const dtype nan_value = static_cast(NAN); + return nan_value; +} + +template +POLYLINE_HOST_DEVICE_INLINE void fill_point_with_nan_common(point_dtype* res_point, int num_dims) { + const point_dtype nan_value = polyline_nan(); + for (int d = 0; d < num_dims; ++d) { + res_point[d] = nan_value; + } +} + +/** + * @brief Compute the Euclidean length of one polyline segment. + * + * @details + * `segment_idx` refers to the segment between points `segment_idx` and + * `segment_idx + 1`. The point coordinates are laid out consecutively as + * `(num_points, num_dims)`. + * + * The point dtype and accumulation dtype are intentionally separate so the CPU + * path can accumulate in a wider type while the CUDA path preserves its + * existing dtype behavior. + */ +template +POLYLINE_HOST_DEVICE_INLINE accum_dtype compute_segment_length_common(const point_dtype* points_sample, + int segment_idx, int num_dims) { + const point_dtype* first_point = points_sample + segment_idx * num_dims; + const point_dtype* second_point = points_sample + (segment_idx + 1) * num_dims; + accum_dtype accum_sqr = static_cast(0.0); + for (int d = 0; d < num_dims; ++d) { + const accum_dtype diff = + static_cast(first_point[d]) - static_cast(second_point[d]); + accum_sqr += diff * diff; + } + const accum_dtype segment_length = polyline_sqrt(accum_sqr); + return segment_length; +} + +/** + * @brief Find the last index whose value is lower than or equal to `value`. 
+ * + * @details + * The input sequence is expected to be monotonically non-decreasing cumulative + * distances. The return value can be: + * - `-1` when `value` lies before the first point. + * - `sequence_length - 1` when `value` lies strictly beyond the last point (a `value` equal to the last entry goes through the binary search and may yield a lower index). + * - Any valid lower segment endpoint otherwise. + * + * This is used to locate the segment containing the requested interpolation + * distance. + */ +template +POLYLINE_HOST_DEVICE_INLINE int get_index_of_last_lower_or_equal_to_common(const accum_dtype* sequence, + accum_dtype value, + int sequence_length) { + int min_idx = 0; + int max_idx = sequence_length - 1; + + if (polyline_value_gt(sequence[0], value)) { + return -1; + } + if (polyline_value_lt(sequence[sequence_length - 1], value)) { + return sequence_length - 1; + } + + while (max_idx - min_idx > 1) { + const int curr_idx = (max_idx + min_idx) >> 1; + const accum_dtype curr_val = sequence[curr_idx]; + if (polyline_value_lt(curr_val, value)) { + min_idx = curr_idx; + } else if (polyline_value_gt(curr_val, value)) { + max_idx = curr_idx; + } else { + min_idx = curr_idx; + max_idx = curr_idx; + } + } + return min_idx; +} + +/** + * @brief Sample one point on a polyline at a requested absolute distance. + * + * @details + * `accum_distances` stores the distance from the start of the polyline to each + * point. Distances outside the polyline are clamped to the first or last point. + * Degenerate segments (length below the `std::numeric_limits` epsilon threshold) return the lower endpoint. 
+ */ +template +POLYLINE_HOST_DEVICE_INLINE void sample_at_distance_common(const point_dtype* points, + const accum_dtype* accum_distances, + accum_dtype distance_to_sample_at, int num_points, + int num_dims, point_dtype* res_point) { + const int index_min = get_index_of_last_lower_or_equal_to_common( + accum_distances, distance_to_sample_at, num_points); + if (index_min >= 0 && index_min < num_points - 1) { + const int index_max = index_min + 1; + const point_dtype* min_point = points + index_min * num_dims; + const point_dtype* max_point = points + index_max * num_dims; + const accum_dtype dist_min = accum_distances[index_min]; + const accum_dtype dist_max = accum_distances[index_max]; + const accum_dtype dist = dist_max - dist_min; + if (polyline_value_ge(dist, static_cast(std::numeric_limits::epsilon()))) { + const accum_dtype weight_max = (distance_to_sample_at - dist_min) / dist; + const accum_dtype weight_min = (dist_max - distance_to_sample_at) / dist; + for (int d = 0; d < num_dims; ++d) { + const accum_dtype interpolated = static_cast(min_point[d]) * weight_min + + static_cast(max_point[d]) * weight_max; + res_point[d] = static_cast(interpolated); + } + } else { + for (int d = 0; d < num_dims; ++d) { + res_point[d] = min_point[d]; + } + } + } else if (index_min == -1) { + for (int d = 0; d < num_dims; ++d) { + // Note that we are accessing the first point, so that points[d] corresponds to the element we + // want to access, and no offset is needed. 
+ res_point[d] = points[d]; + } + } else if (index_min == num_points - 1) { + for (int d = 0; d < num_dims; ++d) { + res_point[d] = points[(num_points - 1) * num_dims + d]; + } + } +} + +} // namespace polyline + +#undef POLYLINE_HOST_DEVICE_INLINE + +#endif // LANE_HELPERS_POLYLINE_COMMON_CUH diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh new file mode 100644 index 0000000..eae81c4 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LANE_HELPERS_POLYLINE_DTYPE_COMPAT_CUH +#define LANE_HELPERS_POLYLINE_DTYPE_COMPAT_CUH + +#include + +// CUDA provides native __half/__nv_bfloat16 comparison intrinsics and shuffle +// overloads, while c10 low-precision wrappers add extra conversion paths, +// leading to compilation errors. The CUDA-only specializations below route c10 values +// through the native CUDA operations where available; only scalar math such as sqrt +// intentionally computes via float. Keeping these variants CUDA-only keeps CPU builds +// free of these types. 
+#ifdef __CUDACC__ +#include +#include +#include +#include +#define POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE __host__ __device__ __forceinline__ +#else +#define POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE inline +#endif + +namespace polyline { + +#ifdef __CUDACC__ +template +__device__ __forceinline__ dtype shfl_xor_sync_compat(unsigned mask, dtype val, int laneMask) { + return __shfl_xor_sync(mask, val, laneMask); +} + +template <> +__device__ __forceinline__ c10::Half shfl_xor_sync_compat(unsigned mask, c10::Half val, int laneMask) { + return c10::Half(__shfl_xor_sync(mask, static_cast<__half>(val), laneMask)); +} + +template <> +__device__ __forceinline__ c10::BFloat16 shfl_xor_sync_compat(unsigned mask, c10::BFloat16 val, + int laneMask) { + return c10::BFloat16(__shfl_xor_sync(mask, static_cast<__nv_bfloat16>(val), laneMask)); +} +#endif + +template +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_lt(dtype lhs, dtype rhs) { + return lhs < rhs; +} + +template +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_gt(dtype lhs, dtype rhs) { + return lhs > rhs; +} + +template +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_ge(dtype lhs, dtype rhs) { + return lhs >= rhs; +} + +#ifdef __CUDACC__ +template <> +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_lt(c10::Half lhs, c10::Half rhs) { + return __hlt(static_cast<__half>(lhs), static_cast<__half>(rhs)); +} + +template <> +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_gt(c10::Half lhs, c10::Half rhs) { + return __hgt(static_cast<__half>(lhs), static_cast<__half>(rhs)); +} + +template <> +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_ge(c10::Half lhs, c10::Half rhs) { + return __hge(static_cast<__half>(lhs), static_cast<__half>(rhs)); +} + +template <> +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_lt(c10::BFloat16 lhs, + c10::BFloat16 rhs) { + return __hlt(static_cast<__nv_bfloat16>(lhs), static_cast<__nv_bfloat16>(rhs)); +} 
+ +template <> +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_gt(c10::BFloat16 lhs, + c10::BFloat16 rhs) { + return __hgt(static_cast<__nv_bfloat16>(lhs), static_cast<__nv_bfloat16>(rhs)); +} + +template <> +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_ge(c10::BFloat16 lhs, + c10::BFloat16 rhs) { + return __hge(static_cast<__nv_bfloat16>(lhs), static_cast<__nv_bfloat16>(rhs)); +} +#endif + +template +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE dtype polyline_sqrt(dtype value) { + return sqrt(value); +} + +#ifdef __CUDACC__ +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE c10::Half polyline_sqrt(c10::Half value) { + return static_cast(sqrtf(static_cast(value))); +} + +POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE c10::BFloat16 polyline_sqrt(c10::BFloat16 value) { + return static_cast(sqrtf(static_cast(value))); +} +#endif + +} // namespace polyline + +#undef POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE + +#endif // LANE_HELPERS_POLYLINE_DTYPE_COMPAT_CUH diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh new file mode 100644 index 0000000..2354b54 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LANE_HELPERS_POLYLINE_KERNELS_CUH +#define LANE_HELPERS_POLYLINE_KERNELS_CUH + +#include "polyline_common.cuh" +#include "polyline_dtype_compat.cuh" + +namespace polyline { + +template +__device__ __forceinline__ void prefix_sum_warp(int index, dtype value, int num_values_in_scan, + dtype& scan_value, dtype& sum_all) { + sum_all = value; + scan_value = static_cast(0.0); + for (int i = 1; i < num_values_in_scan; i <<= 1) { + dtype sum_other = shfl_xor_sync_compat(0xffffffffu, sum_all, i); + scan_value += ((static_cast(index) & static_cast(i)) > 0) * sum_other; + sum_all += sum_other; + } +} + +/** + * @brief Perform a prefix sum on a block of values. + * + * @details + * The buffer `warp_scan_buffer` is used to store the sums of the individual warps, which is then used + * to compute the offsets to add to each warp. For that, a prefix sum is performed on the buffer in a second + * step (in-place). The size of the buffer is (in elements): `blockDim.y * num_warps_per_sample`. 
+ * + * @tparam dtype The type of the values to prefix sum + * + * @param value The value to prefix sum + * @param num_warps_per_sample The number of warps per sample + * @param warp_scan_buffer Scratch buffer holding one partial sum per warp (`blockDim.y * num_warps_per_sample` elements) + * + * @return The exclusive prefix sum (i.e. not including `value`) for the current thread + */ +template +__device__ __forceinline__ dtype prefix_sum_block(dtype value, int num_warps_per_sample, + dtype* warp_scan_buffer) { + // ix corresponds to the thread index inside a single sample + const int ix = threadIdx.x; + const int iwx = threadIdx.x / 32; // index of the warp inside the sample + const int iw = threadIdx.y * num_warps_per_sample + iwx; // index of the warp in the block + // Get thread id (consecutive IDs correspond to consecutive values in the array) + //const int tid_shared = threadIdx.y * bxsize + ix; + + // Using warp shuffles iteratively, in two stages + + // First stage: perform warp scans + dtype warp_scan1; + dtype warp_sum1; + prefix_sum_warp(ix, value, 32, warp_scan1, warp_sum1); + // If this is the first thread in the warp, it is responsible for storing the partial sum for the warp + if (ix % 32 == 0) warp_scan_buffer[iw] = warp_sum1; + __syncthreads(); + + // Second stage: warp scan of the per-warp partial sums to obtain the offsets for each warp + // The first warp (i.e. ix < 32) participates in step 2 of the warp scan. + // Note that the whole first warp always participates in step 2, even if num_warps_per_sample < 32. + // This is to avoid a deadlock without using a complex mask generation method for `__shfl_xor_sync()`. + if (ix < 32) { + const int wid_shared = threadIdx.y * num_warps_per_sample + ix; + const bool is_inside = ix < num_warps_per_sample; + const dtype value = is_inside ? 
warp_scan_buffer[wid_shared] : static_cast(0.0); + dtype warp_scan2; + dtype warp_sum2; + prefix_sum_warp(ix, value, num_warps_per_sample, warp_scan2, warp_sum2); + if (ix < num_warps_per_sample) { + warp_scan_buffer[wid_shared] = warp_scan2; + } + } + __syncthreads(); + + // Apply offsets to the partial sums to obtain the final values + warp_scan1 += warp_scan_buffer[iw]; + + return warp_scan1; +} + +/** + * @brief Perform a prefix sum on a block of values. + * + * @details + * The buffer is split into 2 parts: + * - The first part (`blockDim.y` elements) stores the running sums of all + * elements processed so far for each sample in y (`sum_buffer`). + * - The second part (`warp_scan_buffer`) is scratch space for the intra‑block + * scan performed by `prefix_sum_block`. The size is: `blockDim.y * num_warps_per_sample` + * (see `prefix_sum_block` for more details) + * Hence, the total buffer size in elements is (in elements): + * `blockDim.y + blockDim.y * num_warps_per_sample`. + * or + * `blockDim.y * (1 + num_warps_per_sample)` + * + * The results are stored in the sequence array, overriding the input values. + * Note that the results are the accumulated values including the current value, i.e. the operation can be expressed as: + * `sequences[i] = sum(sequences[0:(i+1)])`, where the slicing is defined as in Python. + * + * + * + * @tparam dtype The type of the values to prefix sum + * + * @param sequences The sequences to prefix sum for the current thread block. Note that this means that + * the first sequence is the one corresponding to threadIdx.y == 0 of the current block, not necessarily the + * first sequence in the global array. + * @param buffer Combined temporary storage used by the prefix-sum. 
+ * @param numel_x The number of elements in the x dimension + * @param numel_x_full_blocks The number of elements in the x dimension extended to a multiple of blockDim.x + * @param numel_y The number of sequences in the y dimension + * @param offset The initial offset to add to the prefix sum of each sequence + */ +template +__device__ __forceinline__ void prefix_sum_looped(dtype* sequence, dtype* buffer_block, int numel_x, + int numel_x_full_blocks, int numel_y, dtype offset) { + const int ix = threadIdx.x; + const int iy = threadIdx.y; + + // Buffer for keeping the sums of the iterations so far for each sample + dtype* sum_buffer = buffer_block; + // Buffer as needed for the prefix sum implementation + dtype* warp_scan_buffer = buffer_block + blockDim.y; + + int num_warps_per_sample = (blockDim.x + 31) / 32; + + // Initialize the buffer containing the partial sums of the iterations so far for each sample + if (ix == 0) { + sum_buffer[iy] = static_cast(0.0); + } + // Compute the sum one `bxsize` at a time for each sample + for (int i = ix; i < numel_x_full_blocks; i += blockDim.x) { + // Make sure that sum_buffer is written to (either initially or in the previous iteration) + __syncthreads(); + const dtype value = i < numel_x ? sequence[i] : static_cast(0.0); + const dtype value_out = + prefix_sum_block(value, num_warps_per_sample, warp_scan_buffer) + sum_buffer[iy] + offset; + const dtype value_out_incl_current = value_out + value; + // Make sure that + // - sequences are not written to before they are read from for the current iteration + // - sum_buffer is not written to before it is read from for the current iteration + __syncthreads(); + if (i < numel_x) { + sequence[i] = value_out_incl_current; + } + // Update the sum buffer for the next iteration to the current value of the last processed sample. + // Note that the last thread may be out of bounds and not correspond to the last element. 
However, + // in this case, the value is not needed (and also still is the correct value as the values are + // extended with zeros, so that the cumulative sum (computed as including the current value) is + // the same as for the last element) + if (ix == blockDim.x - 1) { + sum_buffer[iy] = value_out_incl_current; + } + // Offset is only applied in the first iteration. Afterwards, the offset is already included in the + // partial sum as stored in `sum_buffer` and the offset must not be applied again. + offset = static_cast(0.0); + } + __syncthreads(); +} + +template +__device__ __forceinline__ dtype warp_reduce_sum(dtype value, int num_vals_per_partial) { + const int ix = threadIdx.x; + for (int i = 1; i < num_vals_per_partial; i <<= 1) { + const dtype val_other = shfl_xor_sync_compat(0xffffffffu, value, i); + value += val_other; + } + return value; +} + +template +__device__ __forceinline__ dtype sample_reduce_sum(dtype value, int num_warps_per_sample, + dtype* warp_temp_and_result_buffer) { + const int ix = threadIdx.x; // index of the thread inside the sample + const int iwx = threadIdx.x / 32; // index of the warp in the sample + const int iw = threadIdx.y * num_warps_per_sample + iwx; // index of the warp in the block + + const dtype warp_sum = warp_reduce_sum(value, 32); + // The first thread in the warp writes the result for the warp + if (ix % 32 == 0) { + warp_temp_and_result_buffer[iw] = warp_sum; + } + // Make sure all warps have written their results + __syncthreads(); + + dtype sample_sum = static_cast(0.0); + // The first warp reduces the results of the first stage + // Note that from now on, ix corresponds to the index of the warp (from stage 1) in the sample (previously iwx) + if (ix < 32) { + const int iw_base = threadIdx.y * num_warps_per_sample; + const dtype warp_sum_phase_1 = + ix < num_warps_per_sample ? 
warp_temp_and_result_buffer[iw_base + ix] : static_cast(0.0); + // The partial sum will have constant segments, each segment corresponding to one sample (and containing as many values as + // there are warps per sample). + sample_sum = warp_reduce_sum(warp_sum_phase_1, num_warps_per_sample); + } + __syncthreads(); + return sample_sum; +} + +template +__device__ __forceinline__ void sample_distances(const dtype* points_sample, + const dtype* accum_distances_sample, + const dtype* distances_to_sample_sample, + int num_distances_to_sample, int num_points, int num_dims, + dtype* res_points_sample, bool relative_distances) { + const int x = threadIdx.x; + if (num_points == 0) { + const int result_stride = blockDim.x * num_dims; + + dtype* res_points_current = + x < num_distances_to_sample ? res_points_sample + x * num_dims : res_points_sample; + for (int i = x; i < num_distances_to_sample; i += blockDim.x, res_points_current += result_stride) { + fill_point_with_nan_common(res_points_current, num_dims); + } + return; + } + + dtype total_length_if_needed = static_cast(0.0); + if (relative_distances) { + total_length_if_needed = accum_distances_sample[num_points - 1]; + } + for (int i = x; i < num_distances_to_sample; i += blockDim.x) { + dtype* res_points_current = res_points_sample + i * num_dims; + const dtype distance_to_sample = relative_distances + ? distances_to_sample_sample[i] * total_length_if_needed + : distances_to_sample_sample[i]; + sample_at_distance_common(points_sample, accum_distances_sample, distance_to_sample, + num_points, num_dims, res_points_current); + } +} + +template +__device__ __forceinline__ void compute_distances(dtype* points_sample, int num_points, int num_dims, + dtype* distances_sample) { + const int x = threadIdx.x; + if (num_points == 0) { + return; + } + + if (x == 0) { + // Distance from the start to the first point is zero. 
+ distances_sample[0] = static_cast(0.0); + } + // Store segment lengths starting at index 1 so that an *inclusive* prefix + // sum over `distances_sample` yields distances to points: + // distances_sample[j] = distance from start to point j. + for (int i = x; i < num_points - 1; i += blockDim.x) { + distances_sample[i + 1] = compute_segment_length_common(points_sample, i, num_dims); + } +} + +/** + * @brief Shared implementation for both fixed-size and variable-size batch kernels. + * + * @details + * This routine implements the common logic used by: + * - `polyline_sampling_fully_shared_kernel` (fixed-size batches), and + * - `polyline_sampling_fully_shared_var_batch_kernel` (variable-size batches). + * + * The shared memory is split into two parts: + * - The first part stores the distances and accumulated distances + * (conversion in-place) for all points and has size (in elements): + * `blockDim.y * max_num_points`. + * - The second part stores the temporary buffer used by + * `prefix_sum_looped` and has size (in elements): + * `blockDim.y * (num_warps_per_sample + 1)`. + * (see the documentation of `prefix_sum_looped` for details). + * The total shared memory size is therefore (in elements): + * `(blockDim.y * max_num_points + blockDim.y * (num_warps_per_sample + 1))`. 
+ * + * + * @tparam dtype The type of the points + * + * @param points The points to sample + * @param distances_to_sample The distances to sample at + * @param res_points The resulting sampled points + * @param max_num_points The maximum number of points per polyline in the batch + * @param max_num_points_full_blocks The maximum number of points extended to a multiple of blockDim.x + * @param num_dims The number of dimensions of the points + * @param max_num_distances_to_sample The maximum number of distances to sample at per polyline + * @param num_samples The number of samples (batch size) + * @param sample_sizes_points (optional) Per-sample number of points (variable-size batches) + * @param sample_sizes_distances_to_sample (optional) Per-sample number of distances (variable-size batches) + * @param relative_distances Interpret distances to sample as fractions of each polyline's total length + * @param distance_buffer_ext Optional external buffer for distances when shared memory is insufficient + */ +template +__device__ __forceinline__ void polyline_sampling_fully_shared_common( + dtype* points, dtype* distances_to_sample, dtype* res_points, int max_num_points, + int max_num_points_full_blocks, int num_dims, int max_num_distances_to_sample, int num_samples, + sample_size_dtype* sample_sizes_points, sample_size_dtype* sample_sizes_distances_to_sample, + bool relative_distances, dtype* distance_buffer_ext) { + extern __shared__ uint8_t shared_mem[]; + dtype* distances; + dtype* buffer; + if (use_shared_distances) { + // Shared-memory layout per block: + // distances: [blockDim.y][max_num_points] + // buffer : [blockDim.y * (1 + num_warps_per_sample)] + distances = reinterpret_cast(shared_mem); + buffer = reinterpret_cast(shared_mem + blockDim.y * max_num_points * sizeof(dtype)); + } else { + // External distances buffer is laid out per block as + // [blockIdx.y][blockDim.y][max_num_points] + // so each block gets its own contiguous slice. 
The scratch `buffer` + // always starts at the beginning of this block's shared memory. + distances = distance_buffer_ext + blockIdx.y * blockDim.y * max_num_points; + buffer = reinterpret_cast(shared_mem); + } + + const int y = threadIdx.y; + const int y_global = blockIdx.y * blockDim.y + y; + const bool is_active_sample = (y_global < num_samples); + + // 1) Compute per-point distances only for valid samples. Inactive rows in + // the final block still participate in sync-heavy code paths with zero work. + int curr_num_points = 0; + int curr_num_distances_to_sample = 0; + if (is_active_sample) { + if (use_variable_size_batch) { + curr_num_points = sample_sizes_points[y_global]; + curr_num_distances_to_sample = sample_sizes_distances_to_sample[y_global]; + } else { + curr_num_points = max_num_points; + curr_num_distances_to_sample = max_num_distances_to_sample; + } + + // Global index for points in device memory; distances remain indexed by the + // local y within the block because they live in shared memory. + dtype* points_sample = points + y_global * max_num_points * num_dims; + dtype* distances_sample = distances + y * max_num_points; + if (curr_num_points > 0) { + compute_distances(points_sample, curr_num_points, num_dims, distances_sample); + } + } + + // 2) Prefix-sum over distances for all rows in this block-local buffer. + // This operates purely on (shared or external) distances, so it is + // safe even for rows that don't correspond to a real sample; their + // results are never used. + // The `distances` are per-block, so we use the local index `y` to access the distances for the current block. 
+ dtype* distance = distances + y * max_num_points; + prefix_sum_looped(distance, // sequences + buffer, // buffer (sum_buffer + warp_scan_buffer) + curr_num_points, // numel_x + max_num_points_full_blocks, // numel_x_full_blocks (extended to full blocks) + blockDim.y, // numel_y (number of samples per block) + static_cast(0.0) // offset + ); + + // 3) Sample only for valid samples, using their (possibly shared or + // external) accumulated distances. + if (is_active_sample) { + // Get the points for the current sample (use of global offset) + const dtype* points_sample = points + y_global * max_num_points * num_dims; + // Get the distances for the current sample (use of block-local offset, as distances are stored in + // shared memory (or in an external buffer with `points` referring to points for this block)) + const dtype* distances_sample = distances + y * max_num_points; + // Get the distances to sample at for the current sample (use of global offset) + const dtype* distances_to_sample_sample = + distances_to_sample + y_global * max_num_distances_to_sample; + sample_distances(points_sample, distances_sample, distances_to_sample_sample, + curr_num_distances_to_sample, curr_num_points, num_dims, + res_points + y_global * max_num_distances_to_sample * num_dims, + relative_distances); + } +} + +/** + * @brief Sample the points at the distances (fixed-size batches). + * + * See `polyline_sampling_fully_shared_common` for implementation details. 
+ */ +template +__global__ void polyline_sampling_fully_shared_kernel(dtype* points, dtype* distances_to_sample, + dtype* res_points, int num_points, + int num_points_full_blocks, int num_dims, + int num_distances_to_sample, int num_samples, + bool relative_distances, dtype* distance_buffer_ext) { + polyline_sampling_fully_shared_common( + points, distances_to_sample, res_points, + num_points, // max_num_points + num_points_full_blocks, // max_num_points_full_blocks + num_dims, + num_distances_to_sample, // max_num_distances_to_sample + num_samples, + /*sample_sizes_points=*/nullptr, + /*sample_sizes_distances_to_sample=*/nullptr, relative_distances, distance_buffer_ext); +} + +// Variable-size batch version of the kernel. +template +__global__ void polyline_sampling_fully_shared_var_batch_kernel( + dtype* points, dtype* distances_to_sample, dtype* res_points, int max_num_points, + int max_num_points_full_blocks, int num_dims, int max_num_distances_to_sample, int num_samples, + sample_size_dtype* sample_sizes_points, sample_size_dtype* sample_sizes_distances_to_sample, + bool relative_distances, dtype* distance_buffer_ext) { + polyline_sampling_fully_shared_common( + points, distances_to_sample, res_points, max_num_points, max_num_points_full_blocks, num_dims, + max_num_distances_to_sample, num_samples, sample_sizes_points, sample_sizes_distances_to_sample, + relative_distances, distance_buffer_ext); +} + +template +__device__ __forceinline__ void polyline_lengths_common(dtype* points, dtype* lengths, int max_num_points, + int num_dims, int num_samples, + sample_size_dtype* sample_sizes_points, + dtype* reduction_buffer) { + const int x = threadIdx.x; + const int y_global = blockIdx.y * blockDim.y + threadIdx.y; + const bool is_active_sample = (y_global < num_samples); + + int curr_num_points = 0; + dtype local_length = static_cast(0.0); + if (is_active_sample) { + curr_num_points = use_variable_size_batch ? 
sample_sizes_points[y_global] : max_num_points; + const dtype* points_sample = points + y_global * max_num_points * num_dims; + for (int i = x; i < curr_num_points - 1; i += blockDim.x) { + local_length += compute_segment_length_common(points_sample, i, num_dims); + } + } + + const int num_warps_per_sample = (blockDim.x + 31) / 32; + const dtype length = sample_reduce_sum(local_length, num_warps_per_sample, reduction_buffer); + if (is_active_sample && x == 0) { + lengths[y_global] = curr_num_points == 0 ? polyline_nan() : length; + } +} + +template +__global__ void polyline_lengths_kernel(dtype* points, dtype* lengths, int num_points, int num_dims, + int num_samples) { + extern __shared__ uint8_t shared_mem[]; + dtype* reduction_buffer = reinterpret_cast(shared_mem); + polyline_lengths_common(points, lengths, num_points, num_dims, num_samples, + /*sample_sizes_points=*/nullptr, reduction_buffer); +} + +template +__global__ void polyline_lengths_var_batch_kernel(dtype* points, dtype* lengths, int max_num_points, + int num_dims, int num_samples, + sample_size_dtype* sample_sizes_points) { + extern __shared__ uint8_t shared_mem[]; + dtype* reduction_buffer = reinterpret_cast(shared_mem); + polyline_lengths_common( + points, lengths, max_num_points, num_dims, num_samples, sample_sizes_points, reduction_buffer); +} + +} // namespace polyline + +#endif // LANE_HELPERS_POLYLINE_KERNELS_CUH diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh new file mode 100644 index 0000000..2668917 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LANE_HELPERS_POLYLINE_SHARED_MEMORY_CONFIG_CUH +#define LANE_HELPERS_POLYLINE_SHARED_MEMORY_CONFIG_CUH + +#include +#include +#include + +#include + +#include "helper_macros.cuh" +#include "polyline_kernels.cuh" + +namespace polyline { + +static constexpr int MAX_CACHED_CUDA_DEVICES = 64; + +static void check_non_negative_cuda_device(int device) { + if (device < 0) { + throw std::runtime_error("CUDA device index must be non-negative."); + } +} + +static size_t query_polyline_max_shared_full_for_device(int device) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); + size_t max_shared_full = static_cast(prop.sharedMemPerBlock); + if (prop.sharedMemPerBlockOptin != 0) { + max_shared_full = static_cast(prop.sharedMemPerBlockOptin); + } + return max_shared_full; +} + +static size_t polyline_max_shared_full_for_device(int device) { + static std::once_flag configured_devices[MAX_CACHED_CUDA_DEVICES]; + static size_t max_shared_full_by_device[MAX_CACHED_CUDA_DEVICES] = {}; + + check_non_negative_cuda_device(device); + // Fallback if there are more devices than the maximum number of cached devices we use. 
+ if (device >= MAX_CACHED_CUDA_DEVICES) { + const size_t max_shared_full = query_polyline_max_shared_full_for_device(device); + return max_shared_full; + } + + std::call_once(configured_devices[device], [device]() { + max_shared_full_by_device[device] = query_polyline_max_shared_full_for_device(device); + }); + const size_t max_shared_full = max_shared_full_by_device[device]; + return max_shared_full; +} + +template +static void configure_polyline_sampling_kernel(size_t max_shared_full) { + CUDA_CHECK(cudaFuncSetAttribute(polyline_sampling_fully_shared_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + static_cast(max_shared_full))); + CUDA_CHECK(cudaFuncSetAttribute(polyline_sampling_fully_shared_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); +} + +template +static void configure_polyline_sampling_kernel_once(int device, size_t max_shared_full) { + static std::once_flag configured_devices[MAX_CACHED_CUDA_DEVICES]; + + check_non_negative_cuda_device(device); + if (device >= MAX_CACHED_CUDA_DEVICES) { + configure_polyline_sampling_kernel(max_shared_full); + return; + } + + std::call_once(configured_devices[device], [max_shared_full]() { + configure_polyline_sampling_kernel(max_shared_full); + }); +} + +template +static void configure_polyline_sampling_var_batch_kernel(size_t max_shared_full) { + CUDA_CHECK(cudaFuncSetAttribute( + polyline_sampling_fully_shared_var_batch_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, static_cast(max_shared_full))); + CUDA_CHECK(cudaFuncSetAttribute( + polyline_sampling_fully_shared_var_batch_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); +} + +template +static void configure_polyline_sampling_var_batch_kernel_once(int device, size_t max_shared_full) { + static std::once_flag configured_devices[MAX_CACHED_CUDA_DEVICES]; + + check_non_negative_cuda_device(device); + if (device >= MAX_CACHED_CUDA_DEVICES) { + configure_polyline_sampling_var_batch_kernel( + max_shared_full); + return; + 
} + + std::call_once(configured_devices[device], [max_shared_full]() { + configure_polyline_sampling_var_batch_kernel( + max_shared_full); + }); +} + +} // namespace polyline + +#endif // LANE_HELPERS_POLYLINE_SHARED_MEMORY_CONFIG_CUH diff --git a/packages/lane_helpers/ext_impl/polyline/src/polyline.cpp b/packages/lane_helpers/ext_impl/polyline/src/polyline.cpp new file mode 100644 index 0000000..a0d82a3 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/src/polyline.cpp @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "polyline.cuh" +#include "helper_macros.cuh" + +//#define PROFILE_AND_SYNC + +#ifdef PROFILE_AND_SYNC +#include +#endif + +namespace polyline { + +#define CHECK_DEVICE(x) check_device(x, #x) +#define CHECK_CONTIGUOUS(x) check_contiguous(x, #x) +#define CHECK_TYPE(x) check_type(x, #x) +#define CHECK_INPUT(x) \ + CHECK_DEVICE(x); \ + CHECK_CONTIGUOUS(x); \ + CHECK_TYPE(x); +inline void check_device(const at::Tensor& tensor, const char* description) { + TORCH_CHECK(tensor.is_cpu() || tensor.is_cuda(), description, " must be on CPU or CUDA"); +} + +inline void check_contiguous(const at::Tensor& tensor, const char* description) { + TORCH_CHECK(tensor.is_contiguous(), description, " must be contiguous"); +} + +inline void check_type(const at::Tensor& tensor, const char* description) { + if (tensor.is_cuda()) { + TORCH_CHECK(tensor.scalar_type() == torch::kFloat32 || tensor.scalar_type() == torch::kFloat64 || + tensor.scalar_type() == torch::kFloat16 || tensor.scalar_type() == torch::kBFloat16, + description, " must have dtype float16, float32, float64, or bfloat16 on CUDA"); + } else { + TORCH_CHECK(tensor.scalar_type() == torch::kFloat32 || tensor.scalar_type() == torch::kFloat64, + description, " must have dtype float32 or float64 on CPU"); + } +} + +inline void check_same_device(const at::Tensor& lhs, const at::Tensor& rhs, const char* message) { + TORCH_CHECK(lhs.device() == rhs.device(), message); +} + +inline void check_sample_size_type(const at::Tensor& sample_sizes, const char* description) { + TORCH_CHECK(sample_sizes.scalar_type() == at::kInt || sample_sizes.scalar_type() == at::kLong, + description, " must have dtype int32 or int64"); +} + +inline void check_sample_sizes(const at::Tensor& sample_sizes, int max_size, const char* description) { + if (sample_sizes.numel() == 0) { + return; + } + TORCH_CHECK( + !torch::any(sample_sizes < 0).item() && 
!torch::any(sample_sizes > max_size).item(), + description, " values must be in [0, ", max_size, "]"); +} + +at::Tensor make_external_distance_buffer(size_t size_elems, const at::TensorOptions& options) { + // Keep external CUDA scratch memory owned by PyTorch's stream-aware allocator. + // A raw cudaFree here can race with the asynchronous custom kernel that uses this buffer. + + // Return an empty tensor if no external distance buffer is needed. + if (size_elems == 0) { + return at::Tensor(); + } + + // Check that the size is not too large to allocate as a tensor. + TORCH_CHECK(size_elems <= static_cast(std::numeric_limits::max()), + "external polyline distance buffer is too large to allocate as a tensor"); + + // Allocate the buffer and return it. + at::Tensor buffer = at::empty({static_cast(size_elems)}, options); + return buffer; +} + +at::Tensor polyline_interpolation(at::Tensor points, at::Tensor distances, bool relative_distances) { + CHECK_DEVICE(points); + CHECK_DEVICE(distances); + CHECK_TYPE(points); + CHECK_TYPE(distances); + TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, num_points, num_dims)"); + TORCH_CHECK(distances.ndimension() == 2, "distances must have shape (batch, num_distances)"); + TORCH_CHECK(points.size(0) == distances.size(0), + "points and distances must contain the same number of polylines"); + TORCH_CHECK(points.scalar_type() == distances.scalar_type(), + "points and distances must have the same dtype"); + check_same_device(points, distances, "points and distances must be on the same device"); + + const int num_samples = points.size(0); + const int num_points = points.size(1); + const int num_distances = distances.size(1); + const int num_dims = points.size(2); + // Result has shape (batch, num_distances, point_dim) and otherwise + // matches `distances` (device, dtype). 
+ auto res = at::empty({num_samples, num_distances, num_dims}, distances.options()); + if (num_distances == 0) { + return res; + } + const at::Tensor points_contiguous = points.contiguous(); + const at::Tensor distances_contiguous = distances.contiguous(); + + if (points.is_cuda()) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_interpolation", [&] { + const int device = points.get_device(); + c10::cuda::CUDAGuard device_guard(static_cast(device)); + const auto stream = at::cuda::getCurrentCUDAStream(static_cast(device)); + at::cuda::CUDAStreamGuard stream_guard(stream); + const auto cfg = make_polyline_launch_config(num_points, num_samples, device); + // Allocate under the same stream used for the kernel launch so the caching allocator + // does not recycle this temporary scratch buffer before queued kernel work consumes it. + const at::Tensor distance_buffer_ext = + make_external_distance_buffer(cfg.distance_buffer_ext_size_elems, points.options()); + scalar_t* distance_buffer_ext_ptr = + distance_buffer_ext.defined() ? 
distance_buffer_ext.data_ptr() : nullptr; + polyline_interpolation(points_contiguous.data_ptr(), num_points, num_dims, + distances_contiguous.data_ptr(), num_distances, + res.data_ptr(), num_samples, relative_distances, + device, cfg, distance_buffer_ext_ptr, stream.stream()); + CUDA_CHECK_LAST(); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "polyline_interpolation_cpu", [&] { + polyline_interpolation_cpu(points_contiguous.data_ptr(), num_points, num_dims, + distances_contiguous.data_ptr(), num_distances, + res.data_ptr(), num_samples, relative_distances); + }); + } + + return res; +} + +at::Tensor polyline_lengths(at::Tensor points) { + CHECK_DEVICE(points); + CHECK_TYPE(points); + TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, num_points, num_dims)"); + + const int num_samples = points.size(0); + const int num_points = points.size(1); + const int num_dims = points.size(2); + auto res = at::empty({num_samples}, points.options()); + const at::Tensor points_contiguous = points.contiguous(); + + if (points.is_cuda()) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_lengths", [&] { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + polyline_lengths(points_contiguous.data_ptr(), // points + num_points, // num_points + num_dims, // num_dims + res.data_ptr(), // lengths + num_samples, // num_samples + stream // stream + ); + CUDA_CHECK_LAST(); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "polyline_lengths_cpu", [&] { + polyline_lengths_cpu(points_contiguous.data_ptr(), // points + num_points, // num_points + num_dims, // num_dims + res.data_ptr(), // lengths + num_samples // num_samples + ); + }); + } + + return res; +} + +at::Tensor polyline_interpolation_var_size_batch(at::Tensor points, at::Tensor distances, + at::Tensor sample_sizes_points, + at::Tensor sample_sizes_distances_to_sample, + bool relative_distances) { + CHECK_DEVICE(points); + 
CHECK_DEVICE(distances); + check_device(sample_sizes_points, "points.sample_sizes"); + check_device(sample_sizes_distances_to_sample, "distances.sample_sizes"); + CHECK_TYPE(points); + CHECK_TYPE(distances); + check_sample_size_type(sample_sizes_points, "points.sample_sizes"); + check_sample_size_type(sample_sizes_distances_to_sample, "distances.sample_sizes"); + + TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, max_num_points, num_dims)"); + TORCH_CHECK(distances.ndimension() == 2, "distances must have shape (batch, max_num_distances)"); + TORCH_CHECK(points.size(0) == distances.size(0), + "points and distances must contain the same number of polylines"); + TORCH_CHECK(points.scalar_type() == distances.scalar_type(), + "points and distances must have the same dtype"); + check_same_device(points, distances, "points and distances must be on the same device"); + TORCH_CHECK(sample_sizes_points.scalar_type() == sample_sizes_distances_to_sample.scalar_type(), + "points.sample_sizes and distances.sample_sizes must have the same dtype " + "(both int32 or both int64)"); + check_same_device(sample_sizes_points, points, + "points.sample_sizes must be on the same device as points"); + check_same_device(sample_sizes_distances_to_sample, distances, + "distances.sample_sizes must be on the same device as distances"); + TORCH_CHECK(sample_sizes_points.ndimension() == 1, "points.sample_sizes must be a 1D tensor"); + TORCH_CHECK(sample_sizes_distances_to_sample.ndimension() == 1, + "distances.sample_sizes must be a 1D tensor"); + + const int num_samples = points.size(0); + const int max_num_points = points.size(1); + const int max_num_distances = distances.size(1); + const int num_dims = points.size(2); + // Result has shape (batch, num_distances, point_dim) and otherwise + // matches `distances` (device, dtype). 
+ auto res = at::empty({num_samples, max_num_distances, num_dims}, distances.options()); + + TORCH_CHECK(sample_sizes_points.size(0) == num_samples, + "points.sample_sizes must contain one count per polyline in points"); + TORCH_CHECK(sample_sizes_distances_to_sample.size(0) == num_samples, + "distances.sample_sizes must contain one count per polyline in distances"); + check_sample_sizes(sample_sizes_points, max_num_points, "points.sample_sizes"); + check_sample_sizes(sample_sizes_distances_to_sample, max_num_distances, "distances.sample_sizes"); + if (max_num_distances == 0) { + return res; + } + + const at::Tensor points_contiguous = points.contiguous(); + const at::Tensor distances_contiguous = distances.contiguous(); + const at::Tensor sample_sizes_points_contiguous = sample_sizes_points.contiguous(); + const at::Tensor sample_sizes_distances_to_sample_contiguous = + sample_sizes_distances_to_sample.contiguous(); + + auto launch = [&](auto sample_size_type_tag) { + using sample_size_t = decltype(sample_size_type_tag); + if (points.is_cuda()) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_interpolation_var_size_batch", [&] { + const int device = points.get_device(); + c10::cuda::CUDAGuard device_guard(static_cast(device)); + const auto stream = at::cuda::getCurrentCUDAStream(static_cast(device)); + at::cuda::CUDAStreamGuard stream_guard(stream); + const auto cfg = + make_polyline_launch_config(max_num_points, num_samples, device); + // Allocate under the same stream used for the kernel launch so the caching allocator + // does not recycle this temporary scratch buffer before queued kernel work consumes it. + const at::Tensor distance_buffer_ext = + make_external_distance_buffer(cfg.distance_buffer_ext_size_elems, points.options()); + scalar_t* distance_buffer_ext_ptr = + distance_buffer_ext.defined() ? 
distance_buffer_ext.data_ptr() : nullptr; + polyline_interpolation_var_size_batch( + points_contiguous.data_ptr(), // points + max_num_points, // max_num_points + num_dims, // num_dims + distances_contiguous.data_ptr(), // distances + max_num_distances, // num_distances + res.data_ptr(), // result_points + num_samples, // num_samples + sample_sizes_points_contiguous.data_ptr(), // sample_sizes_points + sample_sizes_distances_to_sample_contiguous + .data_ptr(), // sample sizes distances + relative_distances, // relative_distances + device, // device + cfg, // launch config + distance_buffer_ext_ptr, // distance_buffer_ext + stream.stream() // stream + ); + CUDA_CHECK_LAST(); + }); + } else { + AT_DISPATCH_FLOATING_TYPES( + points.scalar_type(), "polyline_interpolation_var_size_batch_cpu", [&] { + polyline_interpolation_var_size_batch_cpu( + points_contiguous.data_ptr(), // points + max_num_points, // max_num_points + num_dims, // num_dims + distances_contiguous.data_ptr(), // distances + max_num_distances, // num_distances + res.data_ptr(), // result_points + num_samples, // num_samples + sample_sizes_points_contiguous.data_ptr(), // sample_sizes_points + sample_sizes_distances_to_sample_contiguous + .data_ptr(), // sample sizes distances + relative_distances // relative_distances + ); + }); + } + }; + if (sample_sizes_points.scalar_type() == at::kInt) { + launch(int32_t{}); + } else { + launch(int64_t{}); + } + + return res; +} + +at::Tensor polyline_lengths_var_size_batch(at::Tensor points, at::Tensor sample_sizes_points) { + CHECK_DEVICE(points); + check_device(sample_sizes_points, "points.sample_sizes"); + CHECK_TYPE(points); + check_sample_size_type(sample_sizes_points, "points.sample_sizes"); + + TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, max_num_points, num_dims)"); + TORCH_CHECK(sample_sizes_points.ndimension() == 1, "points.sample_sizes must be a 1D tensor"); + check_same_device(sample_sizes_points, points, + "points.sample_sizes 
must be on the same device as points"); + + const int num_samples = points.size(0); + const int max_num_points = points.size(1); + const int num_dims = points.size(2); + auto res = at::empty({num_samples}, points.options()); + + TORCH_CHECK(sample_sizes_points.size(0) == num_samples, + "points.sample_sizes must contain one count per polyline in points"); + check_sample_sizes(sample_sizes_points, max_num_points, "points.sample_sizes"); + + const at::Tensor points_contiguous = points.contiguous(); + const at::Tensor sample_sizes_points_contiguous = sample_sizes_points.contiguous(); + + auto launch = [&](auto sample_size_type_tag) { + using sample_size_t = decltype(sample_size_type_tag); + if (points.is_cuda()) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_lengths_var_size_batch", [&] { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + polyline_lengths_var_size_batch( + points_contiguous.data_ptr(), // points + max_num_points, // max_num_points + num_dims, // num_dims + res.data_ptr(), // lengths + num_samples, // num_samples + sample_sizes_points_contiguous.data_ptr(), // sample_sizes_points + stream // stream + ); + CUDA_CHECK_LAST(); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "polyline_lengths_var_size_batch_cpu", [&] { + polyline_lengths_var_size_batch_cpu( + points_contiguous.data_ptr(), // points + max_num_points, // max_num_points + num_dims, // num_dims + res.data_ptr(), // lengths + num_samples, // num_samples + sample_sizes_points_contiguous.data_ptr() // sample_sizes_points + ); + }); + } + }; + if (sample_sizes_points.scalar_type() == at::kInt) { + launch(int32_t{}); + } else { + launch(int64_t{}); + } + + return res; +} + +} // namespace polyline + +namespace py = pybind11; +using namespace polyline; + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.doc() = "Lane helpers polyline interpolation bindings"; + m.def("polyline_interpolation", (at::Tensor(*)(at::Tensor, 
at::Tensor, bool)) & polyline_interpolation, + py::arg("points"), py::arg("distances"), py::arg("relative") = false, + "Interpolate points along polylines at given distances."); + m.def("_polyline_lengths", (at::Tensor(*)(at::Tensor)) & polyline_lengths, py::arg("points"), + "Internal tensor-only entry point for fixed-size polyline length computation."); + m.def("_polyline_interpolation_var_size_batch", + (at::Tensor(*)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, bool)) & + polyline_interpolation_var_size_batch, + py::arg("points"), py::arg("distances"), py::arg("sample_sizes_points"), + py::arg("sample_sizes_distances_to_sample"), py::arg("relative") = false, + "Internal tensor-only entry point for variable-length polyline interpolation."); + m.def("_polyline_lengths_var_size_batch", + (at::Tensor(*)(at::Tensor, at::Tensor)) & polyline_lengths_var_size_batch, py::arg("points"), + py::arg("sample_sizes_points"), + "Internal tensor-only entry point for variable-length polyline length computation."); +} \ No newline at end of file diff --git a/packages/lane_helpers/ext_impl/polyline/src/polyline.cu b/packages/lane_helpers/ext_impl/polyline/src/polyline.cu new file mode 100644 index 0000000..7d62499 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/src/polyline.cu @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "helper_macros.cuh" + +#include "polyline.cuh" +#include "polyline_kernels.cuh" +#include "polyline_shared_memory_config.cuh" + +namespace polyline { + +// Return the largest power of two that is <= n. +// For n <= 1, this returns 0 for n == 0 and 1 for n == 1. +static inline int last_power_of_2(int n) { + if (n <= 0) { + return 0; + } + unsigned int v = static_cast(n); + // Propagate highest set bit to all lower bits. + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + // Now (v + 1) >> 1 is the highest power of two <= original n. + const int power_of_two = static_cast((v + 1u) >> 1); + return power_of_two; +} + +template +struct PolylineLengthLaunchConfig { + dim3 block_dim; + dim3 grid_dim; + size_t shared_mem_size; +}; + +static int polyline_launch_threads_x(int num_samples_per_block) { + const int max_num_threads = 1024; + + const int max_threads_x_for_y = max_num_threads / num_samples_per_block; + // Round down to a multiple of 32, but keep at least one warp. 
+ int threads_x = (max_threads_x_for_y / 32) * 32; + if (threads_x < 32) { + threads_x = 32; + } + return threads_x; +} + +template +static PolylineLaunchConfig make_polyline_launch_config_for_y(int num_points, int num_samples, + int num_samples_per_block) { + const int threads_x = polyline_launch_threads_x(num_samples_per_block); + + const dim3 block_dim(threads_x, num_samples_per_block, 1); + const dim3 grid_dim(1, (num_samples + block_dim.y - 1) / block_dim.y, 1); + const int num_points_full_blocks = ((num_points + block_dim.x - 1) / block_dim.x) * block_dim.x; + const int num_warps_per_sample = (block_dim.x + 31) / 32; + const size_t scratch_buffer_size_elems = block_dim.y * (num_warps_per_sample + 1); + const size_t distances_buffer_size_elems_shared = static_cast(block_dim.y) * num_points; + + PolylineLaunchConfig cfg; + cfg.block_dim = block_dim; + cfg.grid_dim = grid_dim; + cfg.num_points_full_blocks = num_points_full_blocks; + cfg.shared_mem_size = (distances_buffer_size_elems_shared + scratch_buffer_size_elems) * sizeof(dtype); + cfg.distance_buffer_ext_size_elems = 0; + cfg.use_shared_distances = true; + cfg.max_shared_full = 0; + return cfg; +} + +template +static size_t polyline_external_distance_buffer_size_elems(const PolylineLaunchConfig& cfg, + int num_points) { + const size_t buffer_size_elems = static_cast(cfg.grid_dim.y) * cfg.block_dim.y * num_points; + return buffer_size_elems; +} + +template +static size_t polyline_scratch_shared_mem_size(const PolylineLaunchConfig& cfg) { + const int num_warps_per_sample = (cfg.block_dim.x + 31) / 32; + const size_t shared_mem_size = + static_cast(cfg.block_dim.y) * (num_warps_per_sample + 1) * sizeof(dtype); + return shared_mem_size; +} + +template +PolylineLaunchConfig make_polyline_launch_config(int num_points, int num_samples, int device) { + // Keep blockDim.y at 1 so blockDim.x can use the full thread block for each sample. 
+ PolylineLaunchConfig cfg = make_polyline_launch_config_for_y(num_points, num_samples, 1); + + // Determine whether we can stay in the shared‑memory path using the opt‑in + // limit (`sharedMemPerBlockOptin`) instead of falling back to the external + // buffer. + const size_t max_shared_full = polyline_max_shared_full_for_device(device); + + if (cfg.shared_mem_size <= max_shared_full) { + cfg.max_shared_full = max_shared_full; + return cfg; + } + + cfg.shared_mem_size = polyline_scratch_shared_mem_size(cfg); + cfg.use_shared_distances = false; + cfg.max_shared_full = max_shared_full; + cfg.distance_buffer_ext_size_elems = polyline_external_distance_buffer_size_elems(cfg, num_points); + return cfg; +} + +template +static PolylineLengthLaunchConfig make_polyline_length_launch_config(int num_samples) { + const int max_num_threads = 1024; + const int max_y_by_threads = max_num_threads / 32; // assuming at least one warp in x + const int max_y_candidate = min(num_samples, max_y_by_threads); + int num_samples_per_block = last_power_of_2(max_y_candidate); + if (num_samples_per_block < 1) { + num_samples_per_block = 1; + } + + const int max_threads_x_for_y = max_num_threads / num_samples_per_block; + int threads_x = (max_threads_x_for_y / 32) * 32; + if (threads_x < 32) { + threads_x = 32; + } + + const dim3 block_dim(threads_x, num_samples_per_block, 1); + const dim3 grid_dim(1, (num_samples + block_dim.y - 1) / block_dim.y, 1); + const int num_warps_per_sample = (block_dim.x + 31) / 32; + + PolylineLengthLaunchConfig cfg; + cfg.block_dim = block_dim; + cfg.grid_dim = grid_dim; + cfg.shared_mem_size = static_cast(block_dim.y) * num_warps_per_sample * sizeof(dtype); + return cfg; +} + +template +void polyline_interpolation(dtype* points, int num_points, int num_dims, dtype* distances, int num_distances, + dtype* result_points, int num_samples, bool relative_distances, int device, + const PolylineLaunchConfig& cfg, dtype* distance_buffer_ext, + cudaStream_t stream) { + if 
(cfg.use_shared_distances) { + configure_polyline_sampling_kernel_once(device, cfg.max_shared_full); + polyline_sampling_fully_shared_kernel + <<>>( + points, distances, result_points, num_points, cfg.num_points_full_blocks, num_dims, + num_distances, num_samples, relative_distances, nullptr); + } else { + polyline_sampling_fully_shared_kernel + <<>>( + points, distances, result_points, num_points, cfg.num_points_full_blocks, num_dims, + num_distances, num_samples, relative_distances, distance_buffer_ext); + } + CUDA_CHECK_LAST(); +} + +template +void polyline_lengths(dtype* points, int num_points, int num_dims, dtype* lengths, int num_samples, + cudaStream_t stream) { + auto cfg = make_polyline_length_launch_config(num_samples); + polyline_lengths_kernel<<>>( + points, lengths, num_points, num_dims, num_samples); + CUDA_CHECK_LAST(); +} + +template +void polyline_interpolation_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* distances, + int num_distances, dtype* result_points, int num_samples, + sample_size_dtype* sample_sizes_points, + sample_size_dtype* sample_sizes_distances_to_sample, + bool relative_distances, int device, + const PolylineLaunchConfig& cfg, dtype* distance_buffer_ext, + cudaStream_t stream) { + if (cfg.use_shared_distances) { + configure_polyline_sampling_var_batch_kernel_once( + device, cfg.max_shared_full); + polyline_sampling_fully_shared_var_batch_kernel + <<>>( + points, distances, result_points, max_num_points, cfg.num_points_full_blocks, num_dims, + num_distances, num_samples, sample_sizes_points, sample_sizes_distances_to_sample, + relative_distances, nullptr); + } else { + polyline_sampling_fully_shared_var_batch_kernel + <<>>( + points, distances, result_points, max_num_points, cfg.num_points_full_blocks, num_dims, + num_distances, num_samples, sample_sizes_points, sample_sizes_distances_to_sample, + relative_distances, distance_buffer_ext); + } + CUDA_CHECK_LAST(); +} + +template +void 
polyline_lengths_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* lengths, + int num_samples, sample_size_dtype* sample_sizes_points, + cudaStream_t stream) { + auto cfg = make_polyline_length_launch_config(num_samples); + polyline_lengths_var_batch_kernel + <<>>( + points, lengths, max_num_points, num_dims, num_samples, sample_sizes_points); + CUDA_CHECK_LAST(); +} + +#define INSTANTIATE_POLYLINE_INTERPOLATION(DTYPE) \ + template void polyline_interpolation( \ + DTYPE * points, int num_points, int num_dims, DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, bool relative_distances, int device, \ + const PolylineLaunchConfig& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream); + +#define INSTANTIATE_POLYLINE_LAUNCH_CONFIG(DTYPE) \ + template PolylineLaunchConfig make_polyline_launch_config(int num_points, int num_samples, \ + int device); + +#define INSTANTIATE_POLYLINE_LENGTHS(DTYPE) \ + template void polyline_lengths(DTYPE * points, int num_points, int num_dims, DTYPE* lengths, \ + int num_samples, cudaStream_t stream); + +#define INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH(DTYPE, SAMPLE_SIZE_DTYPE) \ + template void polyline_interpolation_var_size_batch( \ + DTYPE * points, int max_num_points, int num_dims, DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, SAMPLE_SIZE_DTYPE* sample_sizes_points, \ + SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances, int device, \ + const PolylineLaunchConfig& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream); + +#define INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH(DTYPE, SAMPLE_SIZE_DTYPE) \ + template void polyline_lengths_var_size_batch( \ + DTYPE * points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \ + SAMPLE_SIZE_DTYPE* sample_sizes_points, cudaStream_t stream); + +#define INSTANTIATE_POLYLINE_CUDA_DTYPE(DTYPE) \ + INSTANTIATE_POLYLINE_LAUNCH_CONFIG(DTYPE) \ + 
INSTANTIATE_POLYLINE_INTERPOLATION(DTYPE) \ + INSTANTIATE_POLYLINE_LENGTHS(DTYPE) \ + INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH(DTYPE, int) \ + INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH(DTYPE, int64_t) \ + INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH(DTYPE, int) \ + INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH(DTYPE, int64_t) + +INSTANTIATE_POLYLINE_CUDA_DTYPE(float) +INSTANTIATE_POLYLINE_CUDA_DTYPE(double) +INSTANTIATE_POLYLINE_CUDA_DTYPE(c10::Half) +INSTANTIATE_POLYLINE_CUDA_DTYPE(c10::BFloat16) + +#undef INSTANTIATE_POLYLINE_CUDA_DTYPE +#undef INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH +#undef INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH +#undef INSTANTIATE_POLYLINE_LENGTHS +#undef INSTANTIATE_POLYLINE_LAUNCH_CONFIG +#undef INSTANTIATE_POLYLINE_INTERPOLATION +} // namespace polyline \ No newline at end of file diff --git a/packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp b/packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp new file mode 100644 index 0000000..f0d7cd0 --- /dev/null +++ b/packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include "polyline_common.cuh" + +namespace polyline { + +template +using cpu_acc_t = at::acc_type; + +template +static void compute_accumulated_distances_cpu(const dtype* points_sample, int num_points, int num_dims, + cpu_acc_t* accum_distances) { + using acc_t = cpu_acc_t; + accum_distances[0] = static_cast(0.0); + for (int point_idx = 0; point_idx < num_points - 1; ++point_idx) { + accum_distances[point_idx + 1] = + accum_distances[point_idx] + + compute_segment_length_common(points_sample, point_idx, num_dims); + } +} + +template +static void sample_polyline_cpu(const dtype* points_sample, const dtype* distances_sample, int num_points, + int num_dims, int num_distances, dtype* result_sample, + bool relative_distances, std::vector>& accum_distances) { + using acc_t = cpu_acc_t; + if (num_distances == 0) { + return; + } + if (num_points == 0) { + dtype* result_sample_i = result_sample; + for (int distance_idx = 0; distance_idx < num_distances; + ++distance_idx, result_sample_i += num_dims) { + fill_point_with_nan_common(result_sample_i, num_dims); + } + return; + } + compute_accumulated_distances_cpu(points_sample, num_points, num_dims, accum_distances.data()); + const acc_t total_length = accum_distances[num_points - 1]; + for (int distance_idx = 0; distance_idx < num_distances; ++distance_idx) { + const acc_t distance_to_sample = + relative_distances ? 
static_cast(distances_sample[distance_idx]) * total_length + : static_cast(distances_sample[distance_idx]); + sample_at_distance_common(points_sample, accum_distances.data(), distance_to_sample, + num_points, num_dims, + result_sample + distance_idx * num_dims); + } +} + +template +void polyline_interpolation_cpu(const dtype* points, int num_points, int num_dims, const dtype* distances, + int num_distances, dtype* result_points, int num_samples, + bool relative_distances) { + using acc_t = cpu_acc_t; + const size_t stride_points = static_cast(num_points) * static_cast(num_dims); + const size_t stride_distances = static_cast(num_distances); + const size_t stride_result = static_cast(num_distances) * static_cast(num_dims); + at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) { + std::vector accum_distances(num_points); + for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) { + const dtype* points_sample = points + sample_idx * stride_points; + const dtype* distances_sample = distances + sample_idx * stride_distances; + dtype* result_sample = result_points + sample_idx * stride_result; + sample_polyline_cpu(points_sample, distances_sample, num_points, num_dims, num_distances, + result_sample, relative_distances, accum_distances); + } + }); +} + +template +void polyline_lengths_cpu(const dtype* points, int num_points, int num_dims, dtype* lengths, + int num_samples) { + using acc_t = cpu_acc_t; + const size_t stride_points = static_cast(num_points) * static_cast(num_dims); + at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) { + for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) { + const dtype* points_sample = points + sample_idx * stride_points; + acc_t length = static_cast(0.0); + if (num_points == 0) { + length = polyline_nan(); + } else { + for (int point_idx = 0; point_idx < num_points - 1; ++point_idx) { + length += compute_segment_length_common(points_sample, point_idx, num_dims); + } + } + 
lengths[sample_idx] = static_cast(length); + } + }); +} + +template +void polyline_interpolation_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims, + const dtype* distances, int num_distances, + dtype* result_points, int num_samples, + const sample_size_dtype* sample_sizes_points, + const sample_size_dtype* sample_sizes_distances_to_sample, + bool relative_distances) { + using acc_t = cpu_acc_t; + at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) { + std::vector accum_distances(max_num_points); + for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) { + const int curr_num_points = static_cast(sample_sizes_points[sample_idx]); + const int curr_num_distances = static_cast(sample_sizes_distances_to_sample[sample_idx]); + const dtype* points_sample = points + sample_idx * max_num_points * num_dims; + const dtype* distances_sample = distances + sample_idx * num_distances; + dtype* result_sample = result_points + sample_idx * num_distances * num_dims; + sample_polyline_cpu(points_sample, distances_sample, curr_num_points, num_dims, + curr_num_distances, result_sample, relative_distances, + accum_distances); + } + }); +} + +template +void polyline_lengths_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims, + dtype* lengths, int num_samples, + const sample_size_dtype* sample_sizes_points) { + using acc_t = cpu_acc_t; + at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) { + for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) { + const int curr_num_points = static_cast(sample_sizes_points[sample_idx]); + const dtype* points_sample = points + sample_idx * max_num_points * num_dims; + acc_t length = static_cast(0.0); + if (curr_num_points == 0) { + length = polyline_nan(); + } else { + for (int point_idx = 0; point_idx < curr_num_points - 1; ++point_idx) { + length += compute_segment_length_common(points_sample, point_idx, num_dims); + } + } + lengths[sample_idx] = 
static_cast(length); + } + }); +} + +#define INSTANTIATE_POLYLINE_INTERPOLATION_CPU(DTYPE) \ + template void polyline_interpolation_cpu( \ + const DTYPE* points, int num_points, int num_dims, const DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, bool relative_distances); + +#define INSTANTIATE_POLYLINE_LENGTHS_CPU(DTYPE) \ + template void polyline_lengths_cpu(const DTYPE* points, int num_points, int num_dims, \ + DTYPE* lengths, int num_samples); + +#define INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU(DTYPE, SAMPLE_SIZE_DTYPE) \ + template void polyline_interpolation_var_size_batch_cpu( \ + const DTYPE* points, int max_num_points, int num_dims, const DTYPE* distances, int num_distances, \ + DTYPE* result_points, int num_samples, const SAMPLE_SIZE_DTYPE* sample_sizes_points, \ + const SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances); + +#define INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU(DTYPE, SAMPLE_SIZE_DTYPE) \ + template void polyline_lengths_var_size_batch_cpu( \ + const DTYPE* points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \ + const SAMPLE_SIZE_DTYPE* sample_sizes_points); + +#define INSTANTIATE_POLYLINE_CPU_DTYPE(DTYPE) \ + INSTANTIATE_POLYLINE_INTERPOLATION_CPU(DTYPE) \ + INSTANTIATE_POLYLINE_LENGTHS_CPU(DTYPE) \ + INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU(DTYPE, int) \ + INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU(DTYPE, int64_t) \ + INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU(DTYPE, int) \ + INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU(DTYPE, int64_t) + +INSTANTIATE_POLYLINE_CPU_DTYPE(float) +INSTANTIATE_POLYLINE_CPU_DTYPE(double) + +#undef INSTANTIATE_POLYLINE_CPU_DTYPE +#undef INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU +#undef INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU +#undef INSTANTIATE_POLYLINE_LENGTHS_CPU +#undef INSTANTIATE_POLYLINE_INTERPOLATION_CPU + +} // namespace polyline diff --git 
a/packages/lane_helpers/pyproject.toml b/packages/lane_helpers/pyproject.toml new file mode 100644 index 0000000..41b2940 --- /dev/null +++ b/packages/lane_helpers/pyproject.toml @@ -0,0 +1,35 @@ +[build-system] +requires = [ + "setuptools>=64", + "wheel", + "scikit-build>=0.17.0", + "pybind11>=2.10.0", + "setuptools-scm>=8", + "accvlab-build-config @ file:../../build_config", +] +build-backend = "setuptools.build_meta" + +[project] +name = "accvlab.lane_helpers" +dynamic = ["version"] +description = "Lane helper utilities for ACCV-Lab." +requires-python = ">=3.8" +dependencies = [ + "torch>=2.0.0", + "numpy>=1.22.2", +] + +[project.optional-dependencies] +optional = [ + "matplotlib", + "pytest", +] + +[tool.setuptools.packages.find] +where = ["."] +include = ["accvlab.lane_helpers*"] + +[tool.setuptools_scm] +version_scheme = "no-guess-dev" +fallback_version = "0.0.0" +root = "../.." diff --git a/packages/lane_helpers/setup.py b/packages/lane_helpers/setup.py new file mode 100644 index 0000000..557ed99 --- /dev/null +++ b/packages/lane_helpers/setup.py @@ -0,0 +1,53 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from skbuild import setup +from setuptools import find_namespace_packages + +_ACCVLAB_BUILD_CONFIG_IMPORT_ERROR = """ +######################################################################################### +# Missing build dependency: accvlab-build-config. 
# +# # +# ACCV-Lab package builds normally use --no-build-isolation, so the shared build helper # +# must already be installed in the active environment. Install it first with: # +# # +# pip install /build_config # +# # +# and retry. # +# # +# Alternatively, use /scripts/package_manager.sh to install packages in # +# the documented order. # +######################################################################################### +""" + +try: + from accvlab_build_config import build_cmake_args +except ModuleNotFoundError as exc: + if exc.name != "accvlab_build_config": + raise + raise RuntimeError(_ACCVLAB_BUILD_CONFIG_IMPORT_ERROR) from exc + +_cmake_args = build_cmake_args() + + +setup( + name="accvlab.lane_helpers", + description="Lane helper utilities for ACCV-Lab.", + packages=find_namespace_packages(include=["accvlab.lane_helpers*"]), + include_package_data=True, + zip_safe=False, + cmake_source_dir="ext_impl", + cmake_install_dir="accvlab/lane_helpers", + cmake_args=_cmake_args, +) diff --git a/packages/lane_helpers/tests/polyline_test_utils.py b/packages/lane_helpers/tests/polyline_test_utils.py new file mode 100644 index 0000000..8cad98f --- /dev/null +++ b/packages/lane_helpers/tests/polyline_test_utils.py @@ -0,0 +1,226 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch + +from accvlab.batching_helpers import RaggedBatch + +DEVICES = ["cpu", "cuda"] + + +def sample_polyline_cpu(points: torch.Tensor, distances: torch.Tensor) -> torch.Tensor: + # For no distances, the result is an empty tensor. + if distances.shape[0] == 0: + sampled_points = points.new_empty((0, points.shape[1])) + return sampled_points + # For no points, the result is NaN for every requested point coordinate. + if points.shape[0] == 0: + sampled_points = points.new_full((distances.shape[0], points.shape[1]), torch.nan) + return sampled_points + + segment_lengths = torch.linalg.vector_norm(points[1:] - points[:-1], dim=1) + accum = torch.cat([segment_lengths.new_zeros(1), torch.cumsum(segment_lengths, dim=0)]) + total_length = accum[-1] + + out = [] + for distance in distances: + d = torch.clamp(distance, min=0.0, max=total_length) + lower_idx = int(torch.nonzero(accum <= d, as_tuple=False)[-1]) + if lower_idx >= points.shape[0] - 1: + out.append(points[-1]) + continue + + upper_idx = lower_idx + 1 + lower_dist = accum[lower_idx] + upper_dist = accum[upper_idx] + segment_dist = upper_dist - lower_dist + if segment_dist <= torch.finfo(points.dtype).eps: + out.append(points[lower_idx]) + continue + + weight_upper = (d - lower_dist) / segment_dist + weight_lower = (upper_dist - d) / segment_dist + out.append(points[lower_idx] * weight_lower + points[upper_idx] * weight_upper) + + sampled_points = torch.stack(out) + return sampled_points + + +def sample_batch_cpu(points: torch.Tensor, distances: torch.Tensor) -> torch.Tensor: + sampled_points = torch.stack( + [ + sample_polyline_cpu(points_sample, distances_sample) + for points_sample, distances_sample in zip(points, distances) + ] + ) + return sampled_points + + +def polyline_lengths_cpu(points: torch.Tensor) -> torch.Tensor: + # For no points, the length is undefined. 
+ if points.shape[1] == 0: + lengths = points.new_full((points.shape[0],), torch.nan) + return lengths + # For a single point, the length is 0. + if points.shape[1] == 1: + lengths = points.new_zeros((points.shape[0],)) + return lengths + + lengths = torch.linalg.vector_norm(points[:, 1:] - points[:, :-1], dim=2).sum(dim=1) + return lengths + + +def polyline_lengths_var_size_cpu(points: torch.Tensor, sample_sizes: torch.Tensor) -> torch.Tensor: + lengths = [] + for sample_idx in range(points.shape[0]): + num_points = int(sample_sizes[sample_idx].item()) + lengths.append(polyline_lengths_cpu(points[sample_idx : sample_idx + 1, :num_points])[0]) + lengths = torch.stack(lengths) + return lengths + + +def assert_ragged_matches_cpu( + result: RaggedBatch, + points: torch.Tensor, + distances: torch.Tensor, + points_sample_sizes: torch.Tensor, + distances_sample_sizes: torch.Tensor, + *, + atol: float = 1e-5, +) -> None: + assert torch.equal(result.sample_sizes.cpu(), distances_sample_sizes.cpu()) + + for sample_idx in range(points.shape[0]): + + num_points = int(points_sample_sizes[sample_idx].item()) + num_distances = int(distances_sample_sizes[sample_idx].item()) + expected = sample_polyline_cpu( + points[sample_idx, :num_points].cpu(), + distances[sample_idx, :num_distances].cpu(), + ) + + actual = result.tensor[sample_idx, :num_distances].cpu() + + assert torch.allclose(actual, expected, atol=atol, rtol=0.0, equal_nan=True) + + +def make_random_ragged_polyline_case( + *, + seed: int, + batch_size: int = 7, + max_num_points: int = 12, + max_num_distances: int = 17, + num_dims: int = 3, +) -> tuple[RaggedBatch, RaggedBatch]: + generator = torch.Generator().manual_seed(seed) + points_sample_sizes = torch.randint(1, max_num_points + 1, (batch_size,), generator=generator) + distances_sample_sizes = torch.randint(0, max_num_distances + 1, (batch_size,), generator=generator) + + max_points_in_batch = int(points_sample_sizes.max().item()) + max_distances_in_batch = 
int(distances_sample_sizes.max().item()) + + points = torch.full((batch_size, max_points_in_batch, num_dims), 9999.0, dtype=torch.float32) + distances = torch.full((batch_size, max_distances_in_batch), -9999.0, dtype=torch.float32) + + for sample_idx in range(batch_size): + num_points = int(points_sample_sizes[sample_idx].item()) + num_distances = int(distances_sample_sizes[sample_idx].item()) + points[sample_idx, :num_points] = torch.rand((num_points, num_dims), generator=generator) + total_length = polyline_lengths_cpu(points[sample_idx : sample_idx + 1, :num_points])[0] + distances[sample_idx, :num_distances] = ( + torch.rand((num_distances,), generator=generator) * total_length + ) + + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + + return points_batch, distances_batch + + +def make_padded_ragged_polyline_case( + device: str, +) -> tuple[RaggedBatch, RaggedBatch]: + # Poitns data + points = torch.tensor( + [ + [[0.0, 0.0], [1.0, 0.0], [1.0, 2.0], [0.0, 2.0], [0.0, 0.0]], + [[3.5, -1.25], [4.5, -1.25], [4.5, 0.75], [9999.0, 9999.0], [9999.0, 9999.0]], + [[-2.0, 3.0], [9999.0, 9999.0], [9999.0, 9999.0], [9999.0, 9999.0], [9999.0, 9999.0]], + [[10.0, 0.0], [12.0, 0.0], [9999.0, 9999.0], [9999.0, 9999.0], [9999.0, 9999.0]], + ], + device=device, + dtype=torch.float32, + ) + points_sample_sizes = torch.tensor([5, 3, 1, 2], device=device) + # Distances data + distances = torch.tensor( + [ + [0.0, 0.5, 1.0, 2.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0], + [3.0, 2.0, 1.0, 0.0, -1.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0], + [9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0], + [-5.0, 1.0, 5.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0], + ], + device=device, + dtype=torch.float32, + ) + distances_sample_sizes = torch.tensor([11, 5, 0, 3], device=device) + + points_batch = RaggedBatch(points, 
sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + + return points_batch, distances_batch + + +def distances_for_mode( + points: torch.Tensor, absolute_distances: torch.Tensor, *, relative: bool +) -> torch.Tensor: + + if not relative: + return absolute_distances + + lengths = polyline_lengths_cpu(points.cpu()).to( + device=absolute_distances.device, dtype=absolute_distances.dtype + ) + + # For zero length, use 1.0 to avoid division by zero. + safe_lengths = torch.where(lengths > 0, lengths, torch.ones_like(lengths)) + relative_distances = absolute_distances / safe_lengths[:, None] + + return relative_distances + + +def ragged_distances_for_mode( + points: RaggedBatch, + absolute_distances: RaggedBatch, + *, + relative: bool, +) -> RaggedBatch: + if not relative: + return absolute_distances + + relative_distances = absolute_distances.tensor.clone() + lengths = polyline_lengths_var_size_cpu(points.tensor.cpu(), points.sample_sizes.cpu()).to( + device=absolute_distances.tensor.device, dtype=absolute_distances.tensor.dtype + ) + for sample_idx in range(points.tensor.shape[0]): + num_distances = int(absolute_distances.sample_sizes[sample_idx].item()) + if num_distances == 0: + continue + length = lengths[sample_idx] + if length > 0: + relative_distances[sample_idx, :num_distances] /= length + else: + relative_distances[sample_idx, :num_distances] = 0.0 + relative_distances_batch = absolute_distances.create_with_sample_sizes_like_self(relative_distances) + return relative_distances_batch diff --git a/packages/lane_helpers/tests/test_polyline_fixed_interpolation.py b/packages/lane_helpers/tests/test_polyline_fixed_interpolation.py new file mode 100644 index 0000000..ffa4946 --- /dev/null +++ b/packages/lane_helpers/tests/test_polyline_fixed_interpolation.py @@ -0,0 +1,209 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch + +from accvlab.lane_helpers import polyline + +from polyline_test_utils import DEVICES, distances_for_mode, sample_batch_cpu + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_rectangle_polyline_interpolation(relative: bool, device: str): + points = torch.tensor( + [ + [ + [0.0, 0.0], + [1.0, 0.0], + [1.0, 2.0], + [0.0, 2.0], + [0.0, 0.0], + ] + ], + device=device, + dtype=torch.float32, + ) + distances = torch.tensor( + [[0.0, 0.5, 1.0, 2.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0]], + device=device, + ) + + distances_input = distances_for_mode(points, distances, relative=relative) + + expected = sample_batch_cpu(points.cpu(), distances.cpu()) + result = polyline.interpolate(points, distances_input, relative=relative) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_batched_polyline_interpolation(relative: bool, device: str): + base_points = torch.tensor( + [ + [0.0, 0.0], + [1.0, 0.0], + [1.0, 2.0], + [0.0, 2.0], + [0.0, 0.0], + ], + dtype=torch.float32, + ) + offsets = torch.tensor([[0.0, 0.0], [3.5, -1.25]], dtype=torch.float32) + points = (base_points.unsqueeze(0) + offsets.unsqueeze(1)).to(device) + distances = torch.tensor( + [ + [0.0, 
0.5, 1.0, 3.0, 6.0], + [6.0, 5.0, 3.0, 1.0, 0.0], + ], + device=device, + dtype=torch.float32, + ) + + distances_input = distances_for_mode(points, distances, relative=relative) + + expected = sample_batch_cpu(points.cpu(), distances.cpu()) + result = polyline.interpolate(points.contiguous(), distances_input.contiguous(), relative=relative) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_interpolation_accepts_non_contiguous_inputs(relative: bool, device: str): + points_storage = torch.tensor( + [ + [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]], + [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]], + ], + device=device, + dtype=torch.float32, + ) + points = points_storage.transpose(1, 2) + distances = torch.tensor( + [[0.0, 4.0], [0.5, 2.0], [2.0, 0.5], [4.0, 0.0]], + device=device, + dtype=torch.float32, + ).transpose(0, 1) + assert not points.is_contiguous() + assert not distances.is_contiguous() + + distances_input = distances_for_mode(points, distances, relative=relative) + + expected = sample_batch_cpu(points.cpu(), distances.cpu()) + result = polyline.interpolate(points, distances_input, relative=relative) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_out_of_range_distances_clamp_to_endpoints(relative: bool, device: str): + points = torch.tensor([[[0.0, 0.0], [1.0, 0.0], [1.0, 2.0]]], device=device, dtype=torch.float32) + distances = torch.tensor([[-4.0, -1.0, 0.0, 3.0, 4.0]], device=device, dtype=torch.float32) + + distances_input = distances_for_mode(points, distances, relative=relative) + + expected = sample_batch_cpu(points.cpu(), distances.cpu()) + result = polyline.interpolate(points, distances_input, relative=relative) + + 
assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_single_point_polyline(relative: bool, device: str): + points = torch.tensor([[[1.0, 2.0]]], device=device, dtype=torch.float32) + distances = torch.tensor([[-1.0, 0.0, 1.0]], device=device, dtype=torch.float32) + + distances_input = distances_for_mode(points, distances, relative=relative) + + expected = sample_batch_cpu(points.cpu(), distances.cpu()) + result = polyline.interpolate(points, distances_input, relative=relative) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_zero_point_polyline_returns_nan(relative: bool, device: str): + points = torch.empty((2, 0, 3), device=device, dtype=torch.float32) + distances = torch.tensor([[0.0, 1.0], [-1.0, 2.0]], device=device, dtype=torch.float32) + distances_input = distances_for_mode(points, distances, relative=relative) + + result = polyline.interpolate(points, distances_input, relative=relative) + + assert result.shape == (2, 2, 3) + assert torch.isnan(result).all() + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_zero_point_polyline_with_zero_distances_returns_empty(relative: bool, device: str): + points = torch.empty((2, 0, 3), device=device, dtype=torch.float32) + distances = torch.empty((2, 0), device=device, dtype=torch.float32) + distances_input = distances_for_mode(points, distances, relative=relative) + + result = polyline.interpolate(points, distances_input, relative=relative) + + assert result.shape == (2, 0, 3) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def 
test_random_polyline_matches_cpu_reference(relative: bool, device: str): + num_iters = 100 + generator = torch.Generator().manual_seed(0) + for _ in range(num_iters): + num_points = int(torch.randint(15, 61, (), generator=generator).item()) + num_distances = int(torch.randint(15, 61, (), generator=generator).item()) + points_cpu = torch.rand((3, num_points, 2), generator=generator, dtype=torch.float32) + distances_cpu = torch.rand((3, num_distances), generator=generator, dtype=torch.float32) + + segment_lengths = torch.linalg.vector_norm(points_cpu[:, 1:] - points_cpu[:, :-1], dim=2) + total_lengths = torch.sum(segment_lengths, dim=1) + distances_cpu = distances_cpu * total_lengths[:, None] + + distances_input_cpu = distances_for_mode(points_cpu, distances_cpu, relative=relative) + + expected = sample_batch_cpu(points_cpu, distances_cpu) + result = polyline.interpolate( + points_cpu.to(device), distances_input_cpu.to(device), relative=relative + ) + + assert torch.allclose(result.cpu(), expected, atol=1e-4, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +def test_large_polyline_interpolation_external_distance_buffer(relative: bool): + # Create a large polyline to ensure that the external distance buffer is used. 
+ num_points = 200_000 + x = torch.linspace(0.0, 1.0, num_points, device="cuda", dtype=torch.float32) + points = torch.stack((x, torch.zeros_like(x)), dim=1).unsqueeze(0) + distances = torch.tensor([[0.0, 0.25, 0.5, 1.0, 2.0]], device="cuda", dtype=torch.float32) + expected = torch.tensor( + [[[0.0, 0.0], [0.25, 0.0], [0.5, 0.0], [1.0, 0.0], [1.0, 0.0]]], + device="cuda", + dtype=torch.float32, + ) + + torch.cuda.synchronize() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + result = polyline.interpolate(points, distances, relative=relative) + stream.synchronize() + + assert torch.allclose(result, expected, atol=1e-4, rtol=0.0) diff --git a/packages/lane_helpers/tests/test_polyline_lengths.py b/packages/lane_helpers/tests/test_polyline_lengths.py new file mode 100644 index 0000000..4b31de2 --- /dev/null +++ b/packages/lane_helpers/tests/test_polyline_lengths.py @@ -0,0 +1,179 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import torch + +from accvlab.batching_helpers import RaggedBatch +from accvlab.lane_helpers import polyline + +from polyline_test_utils import ( + DEVICES, + make_padded_ragged_polyline_case, + make_random_ragged_polyline_case, + polyline_lengths_cpu, + polyline_lengths_var_size_cpu, +) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_rectangle_and_single_point(device: str): + rectangle = torch.tensor( + [ + [ + [0.0, 0.0], + [1.0, 0.0], + [1.0, 2.0], + [0.0, 2.0], + [0.0, 0.0], + ] + ], + device=device, + dtype=torch.float32, + ) + single_point = torch.tensor([[[1.0, 2.0]]], device=device, dtype=torch.float32) + + assert torch.allclose(polyline.lengths(rectangle).cpu(), torch.tensor([6.0]), atol=1e-5, rtol=0.0) + assert torch.allclose(polyline.lengths(single_point).cpu(), torch.tensor([0.0]), atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_zero_point_batch_returns_nan(device: str): + points = torch.empty((3, 0, 2), device=device, dtype=torch.float32) + + result = polyline.lengths(points) + + assert result.shape == (3,) + assert torch.isnan(result).all() + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_random_nd_matches_cpu_reference(device: str): + generator = torch.Generator().manual_seed(1) + num_iters = 100 + for _ in range(num_iters): + points_cpu = torch.rand((5, 37, 4), generator=generator, dtype=torch.float32) + + expected = polyline_lengths_cpu(points_cpu) + result = polyline.lengths(points_cpu.to(device)) + + assert torch.allclose(result.cpu(), expected, atol=1e-4, rtol=0.0) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_accepts_non_contiguous_points(device: str): + points_storage = torch.tensor( + [ + [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]], + [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]], + ], + device=device, + dtype=torch.float32, + ) + points = points_storage.transpose(1, 2) + assert not 
points.is_contiguous() + + result = polyline.lengths(points) + expected = polyline_lengths_cpu(points.cpu()) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_var_size_batch_random_matches_cpu_reference(device: str): + num_iters = 100 + for i in range(num_iters): + points_batch_cpu, _ = make_random_ragged_polyline_case(seed=i * 100) + points_batch = points_batch_cpu.to(device) + + result = polyline.lengths_var_size_batch(points_batch) + expected = polyline_lengths_var_size_cpu(points_batch_cpu.tensor, points_batch_cpu.sample_sizes) + + assert result.shape == (points_batch.tensor.shape[0],) + assert torch.allclose(result.cpu(), expected, atol=1e-4, rtol=0.0) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_var_size_batch_matches_cpu_reference_and_ignores_padding(device: str): + points_batch, _ = make_padded_ragged_polyline_case(device) + + result = polyline.lengths_var_size_batch(points_batch) + expected = polyline_lengths_var_size_cpu(points_batch.tensor.cpu(), points_batch.sample_sizes.cpu()) + + assert result.shape == (4,) + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_var_size_batch_zero_point_row_returns_nan(device: str): + points = torch.tensor( + [ + [[9999.0, 9999.0], [9999.0, 9999.0]], + [[0.0, 0.0], [1.0, 0.0]], + [[2.0, 3.0], [9999.0, 9999.0]], + ], + device=device, + dtype=torch.float32, + ) + sample_sizes = torch.tensor([0, 2, 1], device=device, dtype=torch.int32) + + result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes)) + expected = polyline_lengths_var_size_cpu(points.cpu(), sample_sizes.cpu()) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0, equal_nan=True) + + +@pytest.mark.parametrize("device", DEVICES) +def 
test_polyline_lengths_var_size_batch_all_zero_point_rows_return_nan(device: str): + points = torch.empty((3, 0, 2), device=device, dtype=torch.float32) + sample_sizes = torch.zeros(3, device=device, dtype=torch.int32) + + result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes)) + + assert result.shape == (3,) + assert torch.isnan(result).all() + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_var_size_batch_accepts_int32_sample_sizes_and_non_contiguous_points(device: str): + points_storage = torch.tensor( + [ + [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]], + [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]], + ], + device=device, + dtype=torch.float32, + ) + points = points_storage.transpose(1, 2) + sample_sizes = torch.tensor([4, 3], device=device, dtype=torch.int32) + assert not points.is_contiguous() + + result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes)) + expected = polyline_lengths_var_size_cpu(points.cpu(), sample_sizes.cpu()) + + assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0) + + +def test_polyline_lengths_var_size_batch_handles_inactive_cuda_rows(): + num_samples = 33 + points = torch.empty((num_samples, 2, 2), device="cuda", dtype=torch.float32) + points[:, 0, 0] = torch.arange(num_samples, device="cuda", dtype=torch.float32) + points[:, 0, 1] = 0.0 + points[:, 1, 0] = points[:, 0, 0] + 1.0 + points[:, 1, 1] = 0.0 + sample_sizes = torch.full((num_samples,), 2, device="cuda") + + result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes)) + + assert torch.allclose(result.cpu(), torch.ones(num_samples), atol=1e-5, rtol=0.0) diff --git a/packages/lane_helpers/tests/test_polyline_validation.py b/packages/lane_helpers/tests/test_polyline_validation.py new file mode 100644 index 0000000..fe7db26 --- /dev/null +++ b/packages/lane_helpers/tests/test_polyline_validation.py @@ -0,0 +1,166 @@ +# Copyright (c) 2026, NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch + +from accvlab.batching_helpers import RaggedBatch +from accvlab.lane_helpers import polyline + +from polyline_test_utils import DEVICES + + +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_rejects_invalid_ragged_layout(device: str): + points = torch.randn((2, 3, 4), device=device) + distances = torch.randn((2, 4), device=device) + points_batch = RaggedBatch( + points.transpose(1, 2).contiguous(), + sample_sizes=torch.tensor([3, 2], device=device, dtype=torch.int32), + non_uniform_dim=2, + ) + distances_batch = RaggedBatch( + distances, sample_sizes=torch.tensor([4, 2], device=device, dtype=torch.int32) + ) + + # Polyline points must use dimension 1 as the non-uniform point dimension. + with pytest.raises(AssertionError, match="points.non_uniform_dim"): + polyline.interpolate_var_size_batch(points_batch, distances_batch) + + +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_validates_inputs(device: str): + points = torch.randn((1, 3, 2), device=device) + distances = torch.randn((1, 4), device=device) + points_batch = RaggedBatch(points, sample_sizes=torch.tensor([3], device=device)) + distances_batch = RaggedBatch(distances, sample_sizes=torch.tensor([4], device=device)) + + # Points sample sizes must not be negative. 
+ bad_points_sizes = RaggedBatch(points, sample_sizes=torch.tensor([-1], device=device)) + with pytest.raises(RuntimeError, match="points.sample_sizes"): + polyline.interpolate_var_size_batch(bad_points_sizes, distances_batch) + + # Distance sample sizes must not exceed the padded distance dimension. + bad_distances_sizes = RaggedBatch(distances, sample_sizes=torch.tensor([5], device=device)) + with pytest.raises(RuntimeError, match="distances.sample_sizes"): + polyline.interpolate_var_size_batch(points_batch, bad_distances_sizes) + + # Points and distances must have the same dtype. + distances_double = distances_batch.double() + with pytest.raises(RuntimeError, match="same dtype"): + polyline.interpolate_var_size_batch(points_batch, distances_double) + + # Points and distances must have the same sample size dtype. + mismatched_sample_size_dtype = RaggedBatch( + distances, + sample_sizes=torch.tensor([4], device=device, dtype=torch.int32), + ) + with pytest.raises(RuntimeError, match="same dtype"): + polyline.interpolate_var_size_batch(points_batch, mismatched_sample_size_dtype) + + +def test_polyline_functions_reject_mixed_cpu_cuda_inputs(): + points_cpu = torch.randn((1, 3, 2), device="cpu") + distances_cpu = torch.randn((1, 4), device="cpu") + points_cuda = points_cpu.cuda() + distances_cuda = distances_cpu.cuda() + + # Fixed-size points and distances must live on the same device. + with pytest.raises(RuntimeError, match="same device"): + polyline.interpolate(points_cpu, distances_cuda) + + # Ragged points and distances must live on the same device. + with pytest.raises(RuntimeError, match="same device"): + polyline.interpolate_var_size_batch( + RaggedBatch(points_cpu, sample_sizes=torch.tensor([3], device="cpu")), + RaggedBatch(distances_cuda, sample_sizes=torch.tensor([4], device="cuda")), + ) + + # Ragged sample sizes must live on the same device as their data tensor. 
+ with pytest.raises(RuntimeError, match="same device"): + polyline.interpolate_var_size_batch( + RaggedBatch(points_cuda, sample_sizes=torch.tensor([3], device="cpu")), + RaggedBatch(distances_cuda, sample_sizes=torch.tensor([4], device="cuda")), + ) + + # Lengths use only points, but points.sample_sizes must still match the points device. + with pytest.raises(RuntimeError, match="same device"): + polyline.lengths_var_size_batch( + RaggedBatch(points_cuda, sample_sizes=torch.tensor([3], device="cpu")) + ) + + +def test_cpu_polyline_functions_reject_low_precision_dtypes(): + for dtype in (torch.float16, torch.bfloat16): + # CPU kernels intentionally support only float32 and float64. + points = torch.tensor([[[0.0, 0.0], [1.0, 0.0]]], dtype=dtype) + distances = torch.tensor([[0.0, 1.0]], dtype=dtype) + points_batch = RaggedBatch(points, sample_sizes=torch.tensor([2])) + distances_batch = RaggedBatch(distances, sample_sizes=torch.tensor([2])) + + with pytest.raises(RuntimeError, match="float32 or float64 on CPU"): + polyline.interpolate(points, distances) + with pytest.raises(RuntimeError, match="float32 or float64 on CPU"): + polyline.lengths(points) + with pytest.raises(RuntimeError, match="float32 or float64 on CPU"): + polyline.interpolate_var_size_batch(points_batch, distances_batch) + with pytest.raises(RuntimeError, match="float32 or float64 on CPU"): + polyline.lengths_var_size_batch(points_batch) + + +def test_cuda_polyline_functions_accept_low_precision_dtypes(): + for dtype in (torch.float16, torch.bfloat16): + points = torch.tensor([[[0.0, 0.0], [1.0, 0.0]]], device="cuda", dtype=dtype) + distances = torch.tensor([[0.0, 1.0]], device="cuda", dtype=dtype) + points_batch = RaggedBatch(points, sample_sizes=torch.tensor([2], device="cuda")) + distances_batch = RaggedBatch(distances, sample_sizes=torch.tensor([2], device="cuda")) + + expected_points = torch.tensor([[[0.0, 0.0], [1.0, 0.0]]], device="cuda", dtype=dtype) + expected_lengths = torch.tensor([1.0], 
device="cuda", dtype=dtype) + + assert torch.equal(polyline.interpolate(points, distances), expected_points) + assert torch.equal(polyline.lengths(points), expected_lengths) + assert torch.equal( + polyline.interpolate_var_size_batch(points_batch, distances_batch).tensor, expected_points + ) + assert torch.equal(polyline.lengths_var_size_batch(points_batch), expected_lengths) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_var_size_batch_rejects_invalid_ragged_layout(device: str): + points = torch.randn((2, 3, 4), device=device) + points_batch = RaggedBatch( + points.transpose(1, 2).contiguous(), + sample_sizes=torch.tensor([3, 2], device=device, dtype=torch.int32), + non_uniform_dim=2, + ) + + # Polyline points must use dimension 1 as the non-uniform point dimension. + with pytest.raises(AssertionError, match="points.non_uniform_dim"): + polyline.lengths_var_size_batch(points_batch) + + +@pytest.mark.parametrize("device", DEVICES) +def test_polyline_lengths_var_size_batch_validates_sample_sizes(device: str): + points = torch.randn((1, 3, 2), device=device) + + # Length sample sizes must not be negative. + bad_small = RaggedBatch(points, sample_sizes=torch.tensor([-1], device=device)) + with pytest.raises(RuntimeError, match="points.sample_sizes"): + polyline.lengths_var_size_batch(bad_small) + + # Length sample sizes must not exceed the padded point dimension. + bad_large = RaggedBatch(points, sample_sizes=torch.tensor([4], device=device)) + with pytest.raises(RuntimeError, match="points.sample_sizes"): + polyline.lengths_var_size_batch(bad_large) diff --git a/packages/lane_helpers/tests/test_polyline_var_size_interpolation.py b/packages/lane_helpers/tests/test_polyline_var_size_interpolation.py new file mode 100644 index 0000000..7234ef0 --- /dev/null +++ b/packages/lane_helpers/tests/test_polyline_var_size_interpolation.py @@ -0,0 +1,314 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch + +from accvlab.batching_helpers import RaggedBatch +from accvlab.lane_helpers import polyline + +from polyline_test_utils import ( + DEVICES, + assert_ragged_matches_cpu, + make_padded_ragged_polyline_case, + make_random_ragged_polyline_case, + ragged_distances_for_mode, +) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_matches_cpu_reference(relative: bool, device: str): + points_batch, distances_batch = make_padded_ragged_polyline_case(device) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch(points_batch, distances_input_batch, relative=relative) + + assert isinstance(result, RaggedBatch) + assert result.tensor.shape == (4, 11, 2) + assert result.non_uniform_dim == 1 + assert_ragged_matches_cpu( + result, + points_batch.tensor, + distances_batch.tensor, + points_batch.sample_sizes, + distances_batch.sample_sizes, + ) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_random_matches_cpu_reference(relative: bool, device: str): + num_iters = 100 + for i in range(num_iters): + points_batch_cpu, distances_batch_cpu = 
make_random_ragged_polyline_case(seed=i) + points_batch = points_batch_cpu.to(device) + distances_batch = distances_batch_cpu.to(device) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + + assert isinstance(result, RaggedBatch) + assert result.tensor.shape == ( + points_batch.tensor.shape[0], + distances_batch.tensor.shape[1], + points_batch.tensor.shape[2], + ) + assert_ragged_matches_cpu( + result, + points_batch.tensor, + distances_batch.tensor, + points_batch.sample_sizes, + distances_batch.sample_sizes, + atol=1e-4, + ) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_matches_fixed_size_when_uniform(relative: bool, device: str): + points = torch.tensor( + [ + [[0.0, 0.0], [1.0, 0.0], [1.0, 2.0], [0.0, 2.0]], + [[2.0, 2.0], [3.0, 2.0], [3.0, 4.0], [2.0, 4.0]], + ], + device=device, + dtype=torch.float32, + ) + distances = torch.tensor( + [[0.0, 0.5, 2.0, 4.0], [4.0, 2.0, 0.5, 0.0]], + device=device, + dtype=torch.float32, + ) + sample_sizes = torch.tensor([points.shape[1], points.shape[1]], device=device, dtype=torch.int32) + distances_sample_sizes = torch.tensor( + [distances.shape[1], distances.shape[1]], device=device, dtype=torch.int32 + ) + points_batch = RaggedBatch(points, sample_sizes=sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + expected = polyline.interpolate( + points.contiguous(), distances_input_batch.tensor.contiguous(), relative=relative + ) + + assert torch.allclose(result.tensor, 
expected, atol=1e-5, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_accepts_non_contiguous_inputs(relative: bool, device: str): + points_storage = torch.tensor( + [ + [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]], + [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]], + ], + device=device, + dtype=torch.float32, + ) + points = points_storage.transpose(1, 2) + distances = torch.tensor( + [[0.0, 4.0], [0.5, 2.0], [2.0, 0.5], [4.0, 0.0]], + device=device, + dtype=torch.float32, + ).transpose(0, 1) + assert not points.is_contiguous() + assert not distances.is_contiguous() + + points_sample_sizes = torch.tensor([4, 3], device=device, dtype=torch.int32) + distances_sample_sizes = torch.tensor([4, 2], device=device, dtype=torch.int32) + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + + assert_ragged_matches_cpu( + result, + points, + distances, + points_sample_sizes, + distances_sample_sizes, + ) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_zero_point_row_returns_nan(relative: bool, device: str): + points = torch.tensor( + [ + [[9999.0, 9999.0], [9999.0, 9999.0]], + [[0.0, 0.0], [1.0, 0.0]], + [[2.0, 3.0], [9999.0, 9999.0]], + ], + device=device, + dtype=torch.float32, + ) + distances = torch.tensor( + [[0.0, 1.0], [0.0, 0.5], [-1.0, 2.0]], + device=device, + dtype=torch.float32, + ) + points_sample_sizes = torch.tensor([0, 2, 1], device=device, dtype=torch.int32) + distances_sample_sizes = 
torch.tensor([2, 2, 2], device=device, dtype=torch.int32) + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + + assert_ragged_matches_cpu( + result, + points, + distances, + points_sample_sizes, + distances_sample_sizes, + ) + assert torch.isnan(result.tensor[0, :2]).all() + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_zero_max_distances_returns_empty(relative: bool, device: str): + points = torch.tensor( + [ + [[0.0, 0.0], [1.0, 0.0]], + [[2.0, 3.0], [9999.0, 9999.0]], + ], + device=device, + dtype=torch.float32, + ) + distances = torch.empty((2, 0), device=device, dtype=torch.float32) + points_sample_sizes = torch.tensor([2, 1], device=device, dtype=torch.int32) + distances_sample_sizes = torch.tensor([0, 0], device=device, dtype=torch.int32) + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + + assert isinstance(result, RaggedBatch) + assert result.tensor.shape == (2, 0, 2) + assert torch.equal(result.sample_sizes.cpu(), torch.zeros(2, dtype=torch.int32)) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +@pytest.mark.parametrize("device", DEVICES) +def test_variable_size_polyline_interpolation_all_zero_point_rows_return_nan(relative: bool, device: str): + points = 
torch.empty((2, 0, 2), device=device, dtype=torch.float32) + distances = torch.tensor([[0.0, 1.0, 2.0], [-1.0, 0.5, 3.0]], device=device, dtype=torch.float32) + points_sample_sizes = torch.zeros(2, device=device, dtype=torch.int32) + distances_sample_sizes = torch.full((2,), 3, device=device, dtype=torch.int32) + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + + assert result.tensor.shape == (2, 3, 2) + assert torch.isnan(result.tensor).all() + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +def test_variable_size_large_polyline_interpolation_external_distance_buffer(relative: bool): + # Create a large polyline to ensure that the external distance buffer is used. 
+ num_points = 200_000 + x = torch.linspace(0.0, 1.0, num_points, device="cuda", dtype=torch.float32) + first_polyline = torch.stack((x, torch.zeros_like(x)), dim=1) + second_polyline = torch.stack((x, torch.ones_like(x)), dim=1) + points = torch.stack((first_polyline, second_polyline), dim=0) + distances = torch.tensor( + # Note that 9999.0 is a filler and is not part of the distances used for interpolation (due to `distances_sample_sizes`) + [[0.0, 0.25, 0.5, 1.0, 2.0], [1.0, 0.5, 0.0, -1.0, 9999.0]], + device="cuda", + dtype=torch.float32, + ) + points_sample_sizes = torch.full((2,), num_points, device="cuda", dtype=torch.int32) + distances_sample_sizes = torch.tensor([5, 4], device="cuda", dtype=torch.int32) + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + expected = torch.tensor( + [ + [[0.0, 0.0], [0.25, 0.0], [0.5, 0.0], [1.0, 0.0], [1.0, 0.0]], + # Note that 9999.0 is a filler and is not checked for equality in the test. 
+ [[1.0, 1.0], [0.5, 1.0], [0.0, 1.0], [0.0, 1.0], [9999.0, 9999.0]], + ], + device="cuda", + dtype=torch.float32, + ) + + torch.cuda.synchronize() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + result = polyline.interpolate_var_size_batch(points_batch, distances_batch, relative=relative) + stream.synchronize() + + assert torch.equal(result.sample_sizes.cpu(), distances_sample_sizes.cpu()) + assert torch.allclose(result.tensor[0, :5], expected[0, :5], atol=1e-4, rtol=0.0) + assert torch.allclose(result.tensor[1, :4], expected[1, :4], atol=1e-4, rtol=0.0) + + +@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"]) +def test_variable_size_polyline_interpolation_handles_inactive_cuda_rows(relative: bool): + num_samples = 33 + points = torch.empty((num_samples, 2, 2), device="cuda", dtype=torch.float32) + points[:, 0, 0] = torch.arange(num_samples, device="cuda", dtype=torch.float32) + points[:, 0, 1] = 0.0 + points[:, 1, 0] = points[:, 0, 0] + 1.0 + points[:, 1, 1] = 0.0 + distances = ( + torch.tensor([[0.0, 0.25, 1.0]], device="cuda", dtype=torch.float32).expand(num_samples, -1).clone() + ) + points_sample_sizes = torch.full((num_samples,), 2, device="cuda") + distances_sample_sizes = torch.full((num_samples,), 3, device="cuda") + points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes) + distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes) + distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative) + + result = polyline.interpolate_var_size_batch( + points_batch, + distances_input_batch, + relative=relative, + ) + + assert_ragged_matches_cpu(result, points, distances, points_sample_sizes, distances_sample_sizes) + + +if __name__ == "__main__": + pytest.main([__file__])