diff --git a/.gitignore b/.gitignore
index 31a607c..7a579ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,4 +18,3 @@ __pycache__/
 *.whl
 
 *.log
-
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 6ae40b1..82c44b8 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -113,8 +113,8 @@ RUN pip install pandas==1.5.3 \
     numba==0.59 \
     pyquaternion==0.9.9
 
-RUN pip install nuscenes-devkit && \
-    pip install shapely tqdm pillow networkx fire
+RUN pip install nuscenes-devkit==1.2.0 && \
+    pip install shapely==2.0.7 tqdm==4.67.3 pillow==12.2.0 networkx==3.4.2 fire==0.7.1
 
 RUN pip install pytest pytest-timeout
 RUN pip install pynvml
diff --git a/docs/Makefile b/docs/Makefile
index 51667be..987e408 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -23,8 +23,9 @@ help:
 
 # Generate namespace package documentation before building
 generate:
-	python3 mirror_referenced_dirs.py
 	python3 generate_new_namespace_package_docs.py
+	python3 generate_package_docs_assets.py
+	python3 mirror_referenced_dirs.py
 	python3 update_docs_index.py
 
 # Sync the root README into the docs tree before building
@@ -41,7 +42,7 @@ clean:
 	@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 	rm -rf $(BUILDDIR)/
 	rm -rf api/generated/
-	rm -rf ../packages/*/docs/generated/
+	rm -rf ../packages/*/docs/_generated/
 
 # Auto-build documentation (watches for changes)
 livehtml: sync-readme generate
diff --git a/docs/generate_package_docs_assets.py b/docs/generate_package_docs_assets.py
new file mode 100644
index 0000000..e205e28
--- /dev/null
+++ b/docs/generate_package_docs_assets.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from dataclasses import dataclass
+import importlib.util
+from pathlib import Path
+import sys
+from types import ModuleType
+from typing import Callable
+
+
+@dataclass(frozen=True)  # immutable bundle of names/paths handed to a package's docs asset hook
+class PackageDocsContext:
+    project_root: Path  # repository root (parent of the top-level docs/ directory)
+    namespace_package: str  # dotted name as listed in NAMESPACE_PACKAGES
+    package_name: str  # last dotted component of namespace_package
+    package_root: Path  # <project_root>/packages/<package_name> when built by _build_context
+    docs_root: Path  # <package_root>/docs when built by _build_context
+    generated_dir: Path  # <docs_root>/_generated; output directory for the hook
+
+
+HookFunction = Callable[[PackageDocsContext], None]  # signature expected of generate_docs_assets
+_GENERATED_ASSET_GITIGNORE = "*\n"  # .gitignore content written into each _generated/ directory
+
+
+def _load_hook_module(hook_path: Path, package_name: str) -> ModuleType:
+    """Import the hook file at ``hook_path`` under a package-specific private module name."""
+    module_name = f"_accvlab_docs_assets_{package_name}"
+    spec = importlib.util.spec_from_file_location(module_name, hook_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Could not create import spec for docs asset hook: {hook_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Register before executing, as the importlib recipe does: code that resolves a class's module
+    # through sys.modules (e.g. pickle, some dataclasses/typing paths) fails for unregistered hooks.
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+def _get_hook_function(module: ModuleType, hook_path: Path) -> HookFunction:
+    """Return the module's ``generate_docs_assets`` callable; raise AttributeError if it has none."""
+    candidate = getattr(module, "generate_docs_assets", None)
+    if callable(candidate):
+        return candidate
+    message = f"Docs asset hook must define a callable generate_docs_assets(context): {hook_path}"
+    raise AttributeError(message)
+
+
+def _prepare_generated_dir(context: PackageDocsContext) -> None:
+    """Make sure the package's generated-assets directory exists and carries its ``.gitignore``."""
+    context.generated_dir.mkdir(exist_ok=True, parents=True)
+    context.generated_dir.joinpath(".gitignore").write_text(_GENERATED_ASSET_GITIGNORE, encoding="utf-8")
+
+
+def _build_context(project_root: Path, namespace_package: str) -> PackageDocsContext:
+    """Derive the package and docs paths for a dotted namespace package name."""
+    # The package directory under packages/ is named after the last dotted component.
+    name = namespace_package.rsplit(".", 1)[-1]
+    pkg_dir = project_root / "packages" / name
+    docs_dir = pkg_dir / "docs"
+    return PackageDocsContext(
+        project_root=project_root,
+        namespace_package=namespace_package,
+        package_name=name,
+        package_root=pkg_dir,
+        docs_root=docs_dir,
+        generated_dir=docs_dir / "_generated",
+    )
+
+
+def _generate_assets_for_package(
+    *,
+    project_root: Path,
+    namespace_package: str,
+    verbose: bool,
+) -> bool:
+    """Run the package's optional docs asset hook; return False when the package has no hook file."""
+    context = _build_context(project_root, namespace_package)
+    hook_path = context.docs_root / "_on_doc_generation.py"
+    if not hook_path.exists():
+        if verbose:
+            print(f"No docs asset hook for {context.package_name}")
+        return False
+    if verbose:
+        print(f"Running docs asset hook for {context.package_name}: {hook_path}")
+    # Resolve the hook before touching the filesystem so an invalid hook leaves no _generated/ behind.
+    hook_function = _get_hook_function(_load_hook_module(hook_path, context.package_name), hook_path)
+    _prepare_generated_dir(context)
+    hook_function(context)
+    print(f"Generated docs assets for {context.package_name}")
+    return True
+
+
+def _parse_args() -> argparse.Namespace:
+    """Parse the command line of this script.
+
+    Options: ``-v/--verbose`` (flag) and a repeatable ``--package`` whose values are collected
+    into ``package_names`` (``None`` when the option is not given).
+    """
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="Run optional package-local documentation asset generation hooks.",
+    )
+    cli.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output.")
+    cli.add_argument(
+        "--package",
+        action="append",
+        dest="package_names",
+        help="Package name to process, such as lane_helpers. Can be passed more than once.",
+    )
+    return cli.parse_args()
+
+
+def main() -> int:
+    """Run the docs asset hooks of all configured (or the selected) packages; return the exit code."""
+    args = _parse_args()
+    project_root = Path(__file__).resolve().parent.parent  # this script lives in <project_root>/docs/
+    sys.path.insert(0, str(project_root))  # namespace_packages_config.py sits in the project root
+
+    try:
+        from namespace_packages_config import NAMESPACE_PACKAGES
+    except ImportError as exc:
+        print(
+            f"Error: Could not import NAMESPACE_PACKAGES from namespace_packages_config.py: {exc}",
+            file=sys.stderr,
+        )
+        return 1
+
+    package_filter = set(args.package_names or [])
+    namespace_packages = [
+        namespace_package
+        for namespace_package in NAMESPACE_PACKAGES
+        if not package_filter or namespace_package.split(".")[-1] in package_filter
+    ]
+    # Compare names rather than counts: counts also differ when several config entries share a name.
+    found_package_names = {namespace_package.split(".")[-1] for namespace_package in namespace_packages}
+    missing_package_names = sorted(package_filter - found_package_names)
+    if missing_package_names:
+        print(f"Error: Unknown namespace package(s): {', '.join(missing_package_names)}", file=sys.stderr)
+        return 1
+
+    hook_count = 0
+    for namespace_package in namespace_packages:
+        package_name = namespace_package.split(".")[-1]
+        try:
+            hook_ran = _generate_assets_for_package(
+                project_root=project_root,
+                namespace_package=namespace_package,
+                verbose=args.verbose,
+            )
+        except Exception as exc:
+            print(f"Error: docs asset generation failed for {package_name}: {exc}", file=sys.stderr)
+            return 1
+        hook_count += int(hook_ran)
+
+    if args.verbose:
+        print(f"Ran {hook_count} package docs asset hook(s).")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main() returns the process exit status: 0 on success, 1 on error
diff --git a/docs/guides/DEVELOPMENT_GUIDE.md b/docs/guides/DEVELOPMENT_GUIDE.md
index ffb1f5a..08077d4 100644
--- a/docs/guides/DEVELOPMENT_GUIDE.md
+++ b/docs/guides/DEVELOPMENT_GUIDE.md
@@ -48,7 +48,8 @@ There are two example projects which showcase how a namespace package is structu
 - `packages/example_package`: Showcases a package containing PyTorch extensions built using
   `CppExtension` and `CUDAExtension` provided by PyTorch as well as an external implementation (see
   [External Implementations](#external-implementations) section for more details on external implementations)
-  as described below.
+  as described below. It also includes a package-local documentation asset hook that generates a simple plot
+  from committed CSV data under `evaluation_results/` during the docs build.
 - `packages/example_skbuild_package`: Showcases a package using `scikit-build` for C++/CUDA implementation 
   (see the [Alternative: SKBuild-Based Packages](#alternative-skbuild-based-packages) section for more 
   details on this approach).
@@ -70,6 +71,8 @@ To add a new namespace package (e.g., `example_package`), you need to create:
 | **Setup** | `packages/example_package/setup.py` | Package build configuration |
 | **Project Config** | `packages/example_package/pyproject.toml` | Modern Python project configuration and authoritative dependency definition |
 | **Documentation include list (optional)** | `packages/example_package/docu_referenced_dirs.txt` | List additional directories referenced by the docs (besides `docs/`). See [Documentation Setup Guide](DOCUMENTATION_SETUP_GUIDE.md) for more details.|
+| **Documentation asset hook (optional)** | `packages/example_package/docs/_on_doc_generation.py` | Generate package-owned docs assets such as plots from committed evaluation data. See [Documentation Setup Guide](DOCUMENTATION_SETUP_GUIDE.md#package-local-generated-assets). |
+| **Evaluation results (optional)** | `packages/example_package/evaluation_results/` | Package-owned committed inputs for generating docs assets, such as data to plot. |
 
 > **ℹ️ Note**: Apart from the above, further folders/files can be included (and made use of manually or added to the 
 > documentation) if needed. A typical use case is to include e.g. an `examples` directory which is:
@@ -84,26 +87,29 @@ The following diagram shows the relevant project structure containing the folder
 
 ```
 accvlab/
-├── packages/                        # Namespace packages directory
+├── packages/                         # Namespace packages directory
 │   ├── optim_test_tools/...
 │   ├── batching_helpers/...
-│   └── example_package/             # ← New namespace package
-│       ├── accvlab/                 # ← Namespace root
-│       │   └── example_package/     # ← Implementation for "example_package" package
+│   └── example_package/              # ← New namespace package
+│       ├── accvlab/                  # ← Namespace root
+│       │   └── example_package/      # ← Implementation for "example_package" package
 │       │       ├── __init__.py
-│       │       ├── csrc/            # ← C++/CUDA sources
-│       │       └── include/         # ← Headers
-│       ├── ext_impl/                # ← Optional: external implementation
+│       │       ├── csrc/             # ← C++/CUDA sources
+│       │       └── include/          # ← Headers
+│       ├── ext_impl/                 # ← Optional: external implementation
 │       │   ├── build_and_copy.sh
 │       │   └── ...
-│       ├── tests/                   # ← Tests for "example_package" package
-│       ├── docs/                    # ← Documentation for "example_package" package
-│       ├── setup.py                 # ← Package build configuration
-│       ├── pyproject.toml           # ← Project configuration (including dependencies)
-│       └── docu_referenced_dirs.txt # ← Optional: list additional directories referenced by the docs (besides `docs/`)
-├── build_config/                    # Shared build utilities
-├── docs/                            # Main documentation
-└── namespace_packages_config.py     # ← Namespace package needs to be listed here
+│       ├── tests/                    # ← Tests for "example_package" package
+│       ├── evaluation_results/       # ← Optional committed inputs for generated docs assets
+│       ├── docs/                     # ← Documentation for "example_package" package
+│       │   ├── _on_doc_generation.py # ← Optional docs asset hook
+│       │   └── ...
+│       ├── setup.py                  # ← Package build configuration
+│       ├── pyproject.toml            # ← Project configuration (including dependencies)
+│       └── docu_referenced_dirs.txt  # ← Optional: list additional directories referenced by the docs (besides `docs/`)
+├── build_config/                     # Shared build utilities
+├── docs/                             # Main documentation
+└── namespace_packages_config.py      # ← Namespace package needs to be listed here
 ```
 
 Note that inside the package, there is the directory structure `accvlab/example_package`. This is where the 
@@ -238,6 +244,11 @@ root = "../.."
 
 Use this pattern for your own namespace package, adapting the dependency names as needed.
 
+Use `[project.optional-dependencies].optional` for dependencies needed by tests, examples, or package-local
+documentation asset hooks, but not by the core package at runtime. For example, if a docs hook generates plots
+from committed data, put the plotting library in the package's optional dependencies rather than in the base
+`[project].dependencies`.
+
 > **ℹ️ Note**: The `accvlab-build-config @ file:../../build_config` build dependency is intentionally a
 > local path reference. From a package under `packages/<package_name>/`, it resolves to the repository's `build_config/` package 
 > so isolated pip builds use the local helper package. See
@@ -317,6 +328,18 @@ Most of the contained packages extend this basic structure considerably to provi
 documentation. Please see the [Documentation Setup Guide](DOCUMENTATION_SETUP_GUIDE.md) for more details on 
 the documentation system and how to set it up.
 
+If your package needs generated docs assets, add `packages/<package_name>/docs/_on_doc_generation.py`. The
+documentation build creates `packages/<package_name>/docs/_generated/`, keeps it untracked, and passes that
+directory to the hook. Keep user-facing `.rst`/`.md` files static and reference generated assets with relative
+paths such as `_generated/<asset_name>.png`. The hook should generate those assets from committed inputs and
+fail clearly if required inputs are missing. Store committed plot or evaluation inputs outside the package
+`docs/` folder, for example under `packages/<package_name>/evaluation_results/`, so Sphinx does not discover
+data tables as standalone documentation pages.
+
+> **⚠️ Important**: Documentation asset hooks must not run evaluations, benchmarks, or other measurement
+> workflows. They should only regenerate documentation assets, such as plots, from data that is already
+> available in the repository.
+
 #### 8. Test Your Package
 
 ```bash
@@ -352,6 +375,10 @@ When adding a new namespace package, ensure you have:
 - [ ] **Documentation**: Generated with docs scripts and customized intro
 - [ ] **Documentation include list (optional)**: `docu_referenced_dirs.txt` created and populated if extra 
   folders (e.g. `examples/`) are referenced and are needed to build the documentation
+- [ ] **Documentation asset hook (optional)**: `_on_doc_generation.py` added if the package needs generated
+  documentation assets
+- [ ] **Evaluation results (optional)**: `packages/<package_name>/evaluation_results/` contains committed
+  inputs for generated docs assets if needed
 - [ ] **Examples (optional)**: `packages/<package_name>/examples/` created and referenced from docs if used
 - [ ] **Dependencies**: Declared runtime and optional dependencies in `pyproject.toml`
 - [ ] **External implementation**: (Optional) `packages/<package_name>/ext_impl/` for external builds
diff --git a/docs/guides/DOCUMENTATION_SETUP_GUIDE.md b/docs/guides/DOCUMENTATION_SETUP_GUIDE.md
index 521725f..667ce0e 100644
--- a/docs/guides/DOCUMENTATION_SETUP_GUIDE.md
+++ b/docs/guides/DOCUMENTATION_SETUP_GUIDE.md
@@ -9,6 +9,7 @@ The documentation system provides:
 
 - **Explicit namespace package configuration** through `namespace_packages_config.py`
 - **Dynamic documentation generation** for each configured namespace package
+- **Optional package-local asset generation** for generated documentation assets such as plots
 - **Comprehensive API reference** with auto-generated content (extracted from docstrings)
 - **Referenced directories mirroring** to access files from the individual namespace packages in the 
   documentation by
@@ -43,6 +44,10 @@ The documentation generation makes use of multiple scripts:
   - **Template-based**: Uses consistent templates for all namespace packages (but generated files may be 
     modified as needed)
   - **Safe regeneration**: Only creates missing files if no `index.rst` is present for the namespace package
+- **`generate_package_docs_assets.py`**: Runs optional package-local documentation asset hooks
+  - **Package-owned**: Each package can decide whether it needs generated assets and how to create them
+  - **Format-agnostic**: The hook can read any package-owned input files and write any output files in the output folder; 
+    The core docs system does not prescribe a data format
 - **`update_docs_index.py`**: Updates main index file by including references to newly added namespace 
   packages
 - **`mirror_referenced_dirs.py`**: Mirrors (symlinks by default) the `docs` directory and other needed 
@@ -64,27 +69,28 @@ The documentation generation makes use of multiple scripts:
 #### Main Documentation Directory (`docs/`)
 ```
 docs/
-├── conf.py                        # Sphinx configuration using namespace_packages_config
-├── index.rst                      # Main documentation index
+├── conf.py                         # Sphinx configuration using namespace_packages_config
+├── index.rst                       # Main documentation index
 ├── generate_new_namespace_package_docs.py   # Creates structure for new namespace packages
-├── update_docs_index.py           # Updates navigation and indices
-├── mirror_referenced_dirs.py      # Mirrors referenced directories (symlinks by default)
-├── sync_root_readme_for_docs.py   # Syncs project root README into docs/project_overview
-├── Makefile                       # Build commands
-├── requirements.txt               # Documentation dependencies
-├── project_overview/              # Synced copy of the project root README used as docs overview
-├── contained_package_docs_mirror/ # Mirrored package documentation via symlinks (or copies)
-│   ├── example_package/           # Example namespace package docs (representative)
-│   │   ├── docs/                  # Documentation files
-│   │   │   ├── index.rst          # Namespace package overview
-│   │   │   ├── intro.rst          # Introduction (manual content)
-│   │   │   └── api.rst            # API reference (auto-generated)
-│   │   └── examples/              # Additional mirrored directory (referenced in docs)
-│   └── [other_packages]/          # Other configured namespace packages
-├── common/                        # Shared documentation resources
+├── generate_package_docs_assets.py # Runs optional package-local docs asset hooks
+├── update_docs_index.py            # Updates navigation and indices
+├── mirror_referenced_dirs.py       # Mirrors referenced directories (symlinks by default)
+├── sync_root_readme_for_docs.py    # Syncs project root README into docs/project_overview
+├── Makefile                        # Build commands
+├── requirements.txt                # Documentation dependencies
+├── project_overview/               # Synced copy of the project root README used as docs overview
+├── contained_package_docs_mirror/  # Mirrored package documentation via symlinks (or copies)
+│   ├── example_package/            # Example namespace package docs (representative)
+│   │   ├── docs/                   # Documentation files
+│   │   │   ├── index.rst           # Namespace package overview
+│   │   │   ├── intro.rst           # Introduction (manual content)
+│   │   │   └── api.rst             # API reference (auto-generated)
+│   │   └── examples/               # Additional mirrored directory (referenced in docs)
+│   └── [other_packages]/           # Other configured namespace packages
+├── common/                         # Shared documentation resources
 ├── _static/css/
-│           └── custom.css         # Custom styling
-└── _build/                        # Built documentation output
+│           └── custom.css          # Custom styling
+└── _build/                         # Built documentation output
 ```
 
 **Notes**:
@@ -103,7 +109,10 @@ packages/
     ├── docs/                      # Source documentation files
     │   ├── index.rst              # Namespace package overview
     │   ├── intro.rst              # Introduction (manual content)
-    │   └── api.rst                # API reference (auto-generated)
+    │   ├── api.rst                # API reference (auto-generated)
+    │   ├── _on_doc_generation.py  # Optional package-local docs asset hook
+    │   └── _generated/            # Generated assets created at docs build time
+    ├── evaluation_results/        # Optional committed inputs for generated docs assets
     ├── docu_referenced_dirs.txt   # List of additional directories to copy
     ├── examples/                  # Example code (mirrored and referenced by docs)
     └── [other_dirs]/              # Other package directories
@@ -112,6 +121,9 @@ packages/
 
 **Notes**:
 - The `packages/example_package/` structure shows the source documentation that gets mirrored during build
+- The `example_package` includes a small generated plot example: committed CSV data under
+  `packages/example_package/evaluation_results/` is converted into an image under
+  `packages/example_package/docs/_generated/` during the docs build
 - **⚠️ Important**: Content should be edited in the source locations (`packages/<package_name>/docs/`), not in 
   the mirrored locations
 - In case of the `example_package`, the `examples/` directory is mirrored to maintain documentation references 
@@ -172,6 +184,65 @@ etc.) can still be found after the documentation is mirrored to the build locati
 - Only list additional directories that are referenced by your documentation. Note that the API documentation
   does not rely on this mirroring, but is extracted from the installed packages.
 
+### Package-Local Generated Assets
+
+Packages can generate documentation assets during the docs build by adding an optional hook:
+
+```text
+packages/<package_name>/docs/_on_doc_generation.py
+```
+
+If present, `generate_package_docs_assets.py` imports the hook and calls:
+
+```python
+def generate_docs_assets(context):
+    ...
+```
+
+The hook receives a context with package and documentation paths, including:
+
+- `context.project_root`
+- `context.package_root`
+- `context.docs_root`
+- `context.generated_dir`
+
+The docs asset generator creates `context.generated_dir` before calling the hook. This directory is always:
+
+```text
+packages/<package_name>/docs/_generated/
+```
+
+It also writes a local `.gitignore` file there so generated assets remain untracked. The hook should write
+generated images or other generated files directly into `context.generated_dir`, or into subdirectories below
+it if a package needs additional structure.
+
+Source documentation files remain static. For example, an `.rst` file can reference a generated image with a
+normal relative path:
+
+```rst
+.. figure:: _generated/runtime_plot.png
+   :alt: Runtime plot
+```
+
+Packages own the input data and generation logic. For example, a package can commit benchmark result tables
+under `packages/<package_name>/evaluation_results/` and generate plots from those tables during the docs
+build. If a generated asset is required by the static docs, the hook should fail with a clear error when the
+required input data is missing or malformed.
+
+> **⚠️ Important**: Documentation asset hooks must not run evaluations, benchmarks, or other measurement
+> workflows. They should only regenerate documentation assets, such as plots, from data that is already
+> available in the repository. It is recommended to store results in simple formats such as .csv or .md, 
+> and use those as the source of truth for the plots.
+>
+> Keep committed plot or evaluation inputs outside the package `docs/` folder, for example under
+> `packages/<package_name>/evaluation_results/`. This prevents Sphinx from discovering e.g. `.md` data tables as
+> standalone documentation pages while keeping the inputs package-local.
+
+Package-specific dependencies needed only by the hook should be declared in that package's optional
+dependencies in `pyproject.toml`. The default local installation path (`./scripts/install_local.sh`) installs
+optional package dependencies. If you build docs after installing packages without optional dependencies,
+package-local asset hooks may fail when their optional plotting or parsing dependencies are missing.
+
 ### Building Documentation Locally
 
 **Quick build using the script** (can be run from any directory, example shows running from the project 
@@ -208,6 +279,9 @@ make livehtml
   in sequence
 - The `html` target ensures all scripts run before building
 - The `livehtml` target also runs the scripts for development builds
+- Package-local docs asset hooks run before package docs are mirrored, so generated assets under
+  `packages/<package_name>/docs/_generated/` are available from both the package docs source tree and the
+  mirrored docs tree.
 - When running spelling via the script, the generation scripts are executed first to ensure mirrored package 
   docs are up to date. Spelling findings are written to `docs/_build/spelling/output.txt`.
 
@@ -220,6 +294,9 @@ make livehtml
 >  - It does **not** reinstall or rebuild packages for you. This means that if you change the docstrings in 
 >    the source tree of a package, you need to reinstall the package (for example via 
 >    `./scripts/install_local.sh`) and then restart `make livehtml` to see updated docstrings.
+>  - It does **not** rerun package-local docs asset hooks for you after startup. This means that if you change
+>    committed plot data or hook code, you need to restart `make livehtml` (or run `make generate`) to
+>    regenerate plots and other generated docs assets.
 
 ### Spell-checking
 
@@ -456,6 +533,10 @@ the per-package runtime dependencies defined in each package's `pyproject.toml`)
 - Theme packages
 - Other documentation-specific dependencies
 
+Package-specific docs asset dependencies belong to the corresponding package's optional dependencies. This keeps the 
+global documentation requirements focused on the Sphinx build itself while allowing package-owned hooks to declare their 
+own plotting or data-processing dependencies.
+
 ### File Descriptions
 
 #### Core Configuration Files
@@ -478,6 +559,12 @@ automatically as part of the docs build; you normally do not need to run them ma
 - **`packages/<package_name>/docs/index.rst`**: Namespace package overview (source)
 - **`packages/<package_name>/docs/intro.rst`**: Manual introduction content (source)
 - **`packages/<package_name>/docs/api.rst`**: Auto-generated API reference (source)
+- **`packages/<package_name>/docs/_on_doc_generation.py`**: Optional hook for package-local generated docs
+  assets
+- **`packages/<package_name>/docs/_generated/`**: Generated documentation assets created during docs
+  generation and ignored by Git
+- **`packages/<package_name>/evaluation_results/`**: Optional package-owned committed inputs for generated
+  docs assets, such as benchmark tables used for plots
 - **`packages/<package_name>/docu_referenced_dirs.txt`**: List of directories containing files used in the 
   documentation in addition to `docs` (to mirror into the documentation source directory).
 - **`docs/contained_package_docs_mirror/<package_name>/docs/`**: Mirrored documentation (symlink to the 
diff --git a/docs/guides/INSTALLATION_GUIDE.md b/docs/guides/INSTALLATION_GUIDE.md
index df0fe2e..b9fe12f 100644
--- a/docs/guides/INSTALLATION_GUIDE.md
+++ b/docs/guides/INSTALLATION_GUIDE.md
@@ -53,10 +53,11 @@ dependencies (needed for some tests and examples), pass the `--optional` flag ex
 ./scripts/package_manager.sh install -e --optional
 ```
 
-> **⚠️ Important**: Installing with optional dependencies is required if you plan to run the contained
-> tests, as they rely on optional dependencies such as `pytest` (and possibly other dependencies). It may be 
-> also required for the contained examples, as they may use additional packages which are otherwise 
-> not used in the core library.
+> **⚠️ Important**: Installing with optional dependencies is required for workflows that rely on packages
+> outside the core library, including contained tests, contained examples, and documentation generation.
+> Documentation generation may run package-local asset hooks, for example to regenerate plots from committed
+> data, and those hooks can require plotting or data-processing packages. Tests commonly require tools such as
+> `pytest` and may require further packages.
 
 The package manager script:
 - Automatically installs the required `accvlab_build_config` helper package (see the `build_config` directory
diff --git a/docs/index.rst b/docs/index.rst
index 273cc02..6492146 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -71,6 +71,7 @@ Please see the documentation of each namespace package for usage instructions (a
    contained_package_docs_mirror/batching_helpers/docs/index
    contained_package_docs_mirror/multi_tensor_copier/docs/index
    contained_package_docs_mirror/dali_pipeline_framework/docs/index
+   contained_package_docs_mirror/lane_helpers/docs/index
    contained_package_docs_mirror/draw_heatmap/docs/index
    contained_package_docs_mirror/optim_test_tools/docs/index
 
diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt
index bee6c51..bda1b02 100644
--- a/docs/spelling_wordlist.txt
+++ b/docs/spelling_wordlist.txt
@@ -212,3 +212,7 @@ ABI
 aggregator
 multimodal
 cubin
+Polyline
+polyline
+Polylines
+polylines
diff --git a/namespace_packages_config.py b/namespace_packages_config.py
index a94ab18..aaf9d3d 100644
--- a/namespace_packages_config.py
+++ b/namespace_packages_config.py
@@ -27,6 +27,7 @@
     'accvlab.batching_helpers',
     'accvlab.multi_tensor_copier',
     'accvlab.dali_pipeline_framework',
+    'accvlab.lane_helpers',
     'accvlab.draw_heatmap',
     'accvlab.optim_test_tools',
     # Add new namespace packages in the same way as above
diff --git a/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h b/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h
index 9e66681..4df9745 100644
--- a/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h
+++ b/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h
@@ -64,48 +64,48 @@
 #define DISPATCH_INDEX_TYPES(TYPE, NAME, ...) \
     AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_INDEX_TYPES(__VA_ARGS__))
 
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CPU(x) AT_ASSERTM(x.is_cpu(), #x " must be a CPU tensor")
-#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_SAME_CUDA_DEVICE(tensors_list...)                                                        \
-    {                                                                                                  \
-        const std::vector<torch::Tensor> tensors = {tensors_list};                                     \
-        CHECK_CUDA(tensors[0]);                                                                        \
-        const auto& device = tensors[0].device();                                                      \
-        for (size_t i = 1; i < tensors.size(); ++i) {                                                  \
-            AT_ASSERTM(tensors[i].device() == device, "All input tensors must be on the same device"); \
-        }                                                                                              \
+#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CPU(x) TORCH_CHECK(x.is_cpu(), #x " must be a CPU tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_SAME_CUDA_DEVICE(tensors_list...)                                                         \
+    {                                                                                                   \
+        const std::vector<torch::Tensor> tensors = {tensors_list};                                      \
+        CHECK_CUDA(tensors[0]);                                                                         \
+        const auto& device = tensors[0].device();                                                       \
+        for (size_t i = 1; i < tensors.size(); ++i) {                                                   \
+            TORCH_CHECK(tensors[i].device() == device, "All input tensors must be on the same device"); \
+        }                                                                                               \
     }
-#define CHECK_SAME_DTYPE(error_msg, tensors_list...)                                     \
-    {                                                                                    \
-        const std::vector<torch::Tensor> tensors = {tensors_list};                       \
-        for (size_t i = 1; i < tensors.size(); ++i) {                                    \
-            AT_ASSERTM(tensors[i].scalar_type() == tensors[0].scalar_type(), error_msg); \
-        }                                                                                \
+#define CHECK_SAME_DTYPE(error_msg, tensors_list...)                                      \
+    {                                                                                     \
+        const std::vector<torch::Tensor> tensors = {tensors_list};                        \
+        for (size_t i = 1; i < tensors.size(); ++i) {                                     \
+            TORCH_CHECK(tensors[i].scalar_type() == tensors[0].scalar_type(), error_msg); \
+        }                                                                                 \
     }
 
-#define CHECK_SIZE_MATCH(tensor1, tensor2)                                                    \
-    {                                                                                         \
-        /* If the tensors are empty, the actual sizes are not relevant */                     \
-        if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) {                            \
-            AT_ASSERTM((tensor1).dim() == (tensor2).dim(),                                    \
-                       #tensor1 " and " #tensor2 " must have the same number of dimensions"); \
-            for (size_t i = 0; i < (tensor1).dim(); ++i) {                                    \
-                AT_ASSERTM((tensor1).size(i) == (tensor2).size(i),                            \
-                           #tensor1 " and " #tensor2 " must have the same size");             \
-            }                                                                                 \
-        }                                                                                     \
+#define CHECK_SIZE_MATCH(tensor1, tensor2)                                                     \
+    {                                                                                          \
+        /* If the tensors are empty, the actual sizes are not relevant */                      \
+        if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) {                             \
+            TORCH_CHECK((tensor1).dim() == (tensor2).dim(),                                    \
+                        #tensor1 " and " #tensor2 " must have the same number of dimensions"); \
+            for (size_t i = 0; i < (tensor1).dim(); ++i) {                                     \
+                TORCH_CHECK((tensor1).size(i) == (tensor2).size(i),                            \
+                            #tensor1 " and " #tensor2 " must have the same size");             \
+            }                                                                                  \
+        }                                                                                      \
     }
 
 #define CHECK_SIZE_MATCH_FIRST_DIMS(tensor1, tensor2, num_dims_to_check)                                     \
     {                                                                                                        \
         /* If the tensors are empty, the actual sizes are not relevant */                                    \
         if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) {                                           \
-            AT_ASSERTM((tensor1).dim() >= (num_dims_to_check) && (tensor2).dim() >= (num_dims_to_check),     \
-                       #tensor1 " and " #tensor2 " must have at least " +                                    \
-                           std::to_string(num_dims_to_check) + " dimensions");                               \
+            TORCH_CHECK((tensor1).dim() >= (num_dims_to_check) && (tensor2).dim() >= (num_dims_to_check),    \
+                        #tensor1 " and " #tensor2 " must have at least " +                                   \
+                            std::to_string(num_dims_to_check) + " dimensions");                              \
             for (size_t i = 0; i < (num_dims_to_check); ++i) {                                               \
-                AT_ASSERTM(                                                                                  \
+                TORCH_CHECK(                                                                                 \
                     (tensor1).size(i) == (tensor2).size(i),                                                  \
                     #tensor1 " and " #tensor2 " must have the same size in dimension " + std::to_string(i)); \
             }                                                                                                \
@@ -116,35 +116,35 @@
     {                                                                                                        \
         /* If the tensors are empty, the actual sizes are not relevant */                                    \
         if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) {                                           \
-            AT_ASSERTM((tensor1).dim() == (tensor2).dim(),                                                   \
-                       #tensor1 " and " #tensor2 " must have the same number of dimensions");                \
+            TORCH_CHECK((tensor1).dim() == (tensor2).dim(),                                                  \
+                        #tensor1 " and " #tensor2 " must have the same number of dimensions");               \
             for (size_t i = 0; i < (tensor1).dim(); ++i) {                                                   \
                 if (i == (dim_to_allow_mismatch)) {                                                          \
                     continue;                                                                                \
                 }                                                                                            \
-                AT_ASSERTM(                                                                                  \
+                TORCH_CHECK(                                                                                 \
                     (tensor1).size(i) == (tensor2).size(i),                                                  \
                     #tensor1 " and " #tensor2 " must have the same size in dimension " + std::to_string(i)); \
             }                                                                                                \
         }                                                                                                    \
     }
 
-#define CHECK_NUM_DIMS(tensor, num_dims)                                                  \
-    {                                                                                     \
-        /* If the tensor is empty, the number of dimensions is not relevant */            \
-        if (!((tensor).numel() == 0)) {                                                   \
-            AT_ASSERTM((tensor).dim() == (num_dims),                                      \
-                       #tensor " must have " + std::to_string(num_dims) + " dimensions"); \
-        }                                                                                 \
+#define CHECK_NUM_DIMS(tensor, num_dims)                                                   \
+    {                                                                                      \
+        /* If the tensor is empty, the number of dimensions is not relevant */             \
+        if (!((tensor).numel() == 0)) {                                                    \
+            TORCH_CHECK((tensor).dim() == (num_dims),                                      \
+                        #tensor " must have " + std::to_string(num_dims) + " dimensions"); \
+        }                                                                                  \
     }
 
-#define CHECK_NUM_DIMS_AT_LEAST(tensor, num_dims)                                                  \
-    {                                                                                              \
-        /* If the tensor is empty, the number of dimensions is not relevant */                     \
-        if (!((tensor).numel() == 0)) {                                                            \
-            AT_ASSERTM((tensor).dim() >= (num_dims),                                               \
-                       #tensor " must have at least " + std::to_string(num_dims) + " dimensions"); \
-        }                                                                                          \
+#define CHECK_NUM_DIMS_AT_LEAST(tensor, num_dims)                                                   \
+    {                                                                                               \
+        /* If the tensor is empty, the number of dimensions is not relevant */                      \
+        if (!((tensor).numel() == 0)) {                                                             \
+            TORCH_CHECK((tensor).dim() >= (num_dims),                                               \
+                        #tensor " must have at least " + std::to_string(num_dims) + " dimensions"); \
+        }                                                                                           \
     }
 
 static inline int64_t get_number_data_elements_per_index(const torch::Tensor& input_data,
diff --git a/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu b/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu
index ed5e9c4..85d5d38 100644
--- a/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu
+++ b/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu
@@ -20,8 +20,8 @@
 #include <cuda_runtime.h>
 #include <torch/extension.h>
 
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
 #define CHECK_INPUT(x) \
     CHECK_CUDA(x);     \
     CHECK_CONTIGUOUS(x);
@@ -69,11 +69,11 @@ void draw_heatmap_launcher(at::Tensor& heatmap, const at::Tensor& centers, const
     CHECK_INPUT(radii);
     CHECK_INPUT(heatmap_idxes);
 
-    AT_ASSERTM(centers.size(0) == radii.size(0), "centers and radii must have the same size at dim0");
-    AT_ASSERTM(centers.size(0) == heatmap_idxes.size(0),
-               "centers and heatmap_idxes must have the same size at dim0");
-    AT_ASSERTM(heatmap.dim() == 3, "heatmap must be of shape [num_heatmaps, height, width]");
-    AT_ASSERTM(centers.dim() == 2 && centers.size(1) == 2, "centers must be of shape [num_targets, 2]");
+    TORCH_CHECK(centers.size(0) == radii.size(0), "centers and radii must have the same size at dim0");
+    TORCH_CHECK(centers.size(0) == heatmap_idxes.size(0),
+                "centers and heatmap_idxes must have the same size at dim0");
+    TORCH_CHECK(heatmap.dim() == 3, "heatmap must be of shape [num_heatmaps, height, width]");
+    TORCH_CHECK(centers.dim() == 2 && centers.size(1) == 2, "centers must be of shape [num_targets, 2]");
 
     const int num_targets = centers.size(0);
     const int num_heatmaps = heatmap.size(0);
@@ -101,15 +101,15 @@ void draw_heatmap_batched_launcher(at::Tensor& heatmap, const at::Tensor& center
 
     const int batch_size = heatmap.size(0);
     const int num_targets = radii.size(1);
-    AT_ASSERTM(
+    TORCH_CHECK(
         batch_size == radii.size(0) && batch_size == centers.size(0) && batch_size == nums_targets.size(0),
         "batch_size (dim 0) need to be the same for all inputs");
-    AT_ASSERTM(num_targets == centers.size(1),
-               "maximum number of targets (dim 1) need to be the same centers and radii");
-    AT_ASSERTM(heatmap.dim() == 3, "heatmap must be of shape [batch_size, height, width]");
-    AT_ASSERTM(centers.dim() == 3 && centers.size(2) == 2,
-               "centers must be of shape [batch_size, num_targets, 2]");
-    AT_ASSERTM(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]");
+    TORCH_CHECK(num_targets == centers.size(1),
+                "maximum number of targets (dim 1) need to be the same centers and radii");
+    TORCH_CHECK(heatmap.dim() == 3, "heatmap must be of shape [batch_size, height, width]");
+    TORCH_CHECK(centers.dim() == 3 && centers.size(2) == 2,
+                "centers must be of shape [batch_size, num_targets, 2]");
+    TORCH_CHECK(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]");
 
     const int height = heatmap.size(1);
     const int width = heatmap.size(2);
@@ -138,23 +138,23 @@ void draw_heatmap_batched_classwise_launcher(at::Tensor& heatmap, const at::Tens
 
     const int batch_size = heatmap.size(0);
     const int num_targets = radii.size(1);
-    AT_ASSERTM(
+    TORCH_CHECK(
         batch_size == radii.size(0) && batch_size == centers.size(0) && batch_size == nums_targets.size(0),
         "batch_size (dim 0) need to be the same for all inputs");
-    AT_ASSERTM(num_targets == centers.size(1),
-               "maximum number of targets (dim 1) need to be the same centers and radii");
-    AT_ASSERTM(heatmap.dim() == 4, "heatmap must be of shape [batch_size, max_num_classes, height, width]");
-    AT_ASSERTM(centers.dim() == 3 && centers.size(2) == 2,
-               "centers must be of shape [batch_size, num_targets, 2]");
-    AT_ASSERTM(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]");
+    TORCH_CHECK(num_targets == centers.size(1),
+                "maximum number of targets (dim 1) need to be the same centers and radii");
+    TORCH_CHECK(heatmap.dim() == 4, "heatmap must be of shape [batch_size, max_num_classes, height, width]");
+    TORCH_CHECK(centers.dim() == 3 && centers.size(2) == 2,
+                "centers must be of shape [batch_size, num_targets, 2]");
+    TORCH_CHECK(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]");
 
     const int height = heatmap.size(2);
     const int width = heatmap.size(3);
     const int max_num_classes = heatmap.size(1);
     // Validate labels tensor shape and range before launching the kernel
-    AT_ASSERTM(labels.dim() == 2, "labels must be of shape [batch_size, radii.size(1)]");
-    AT_ASSERTM(labels.size(0) == batch_size && labels.size(1) == num_targets,
-               "labels shape must be [batch_size, radii.size(1)]");
+    TORCH_CHECK(labels.dim() == 2, "labels must be of shape [batch_size, radii.size(1)]");
+    TORCH_CHECK(labels.size(0) == batch_size && labels.size(1) == num_targets,
+                "labels shape must be [batch_size, radii.size(1)]");
     AT_DISPATCH_FLOATING_TYPES(
         heatmap.scalar_type(), "draw_heatmap_cuda_batched", ([&] {
             draw_heatmap_batched_cuda(
diff --git a/packages/example_package/docs/_on_doc_generation.py b/packages/example_package/docs/_on_doc_generation.py
new file mode 100644
index 0000000..28bb331
--- /dev/null
+++ b/packages/example_package/docs/_on_doc_generation.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+from pathlib import Path
+from typing import Any
+
+import matplotlib
+
+matplotlib.use("Agg")  # choose the non-interactive Agg backend before pyplot is imported below
+from matplotlib import pyplot as plt
+
+_DATA_FILE = Path("evaluation_results") / "simple_plot.csv"  # joined onto context.package_root
+_OUTPUT_FILE = "simple_plot.png"  # file name created inside context.generated_dir
+
+
+def _read_plot_data(input_file: Path) -> tuple[list[float], list[float]]:
+    """Load the ``x`` and ``y`` columns of the example CSV as floats, raising on bad input."""
+    if not input_file.exists():
+        raise FileNotFoundError(f"Required example plot input data is missing: {input_file}")
+
+    with input_file.open("r", encoding="utf-8", newline="") as csv_file:
+        reader = csv.DictReader(csv_file)
+        if reader.fieldnames != ["x", "y"]:
+            raise ValueError(f"Expected CSV columns 'x,y' in {input_file}")
+        # Consume the reader while the file is still open; each row is converted x first, then y.
+        rows = [(float(row["x"]), float(row["y"])) for row in reader]
+
+    if not rows:
+        raise ValueError(f"Expected at least one data row in {input_file}")
+    x_values = [x for x, _ in rows]
+    y_values = [y for _, y in rows]
+    return x_values, y_values
+
+
+def generate_docs_assets(context: Any) -> None:
+    """Plot the example CSV data and save the figure as a PNG inside ``context.generated_dir``."""
+    x_values, y_values = _read_plot_data(context.package_root / _DATA_FILE)
+    figure, axis = plt.subplots(figsize=(5.0, 3.2), constrained_layout=True)
+    try:  # close the figure even if plotting or saving fails, so pyplot does not keep it open
+        axis.plot(x_values, y_values, marker="o")
+        axis.set_title("Generated Example Plot")
+        axis.set_xlabel("x")
+        axis.set_ylabel("y")
+        axis.grid(True)
+        figure.savefig(context.generated_dir / _OUTPUT_FILE)
+    finally:
+        plt.close(figure)
diff --git a/packages/example_package/docs/intro.rst b/packages/example_package/docs/intro.rst
index 104d8ec..237fba1 100644
--- a/packages/example_package/docs/intro.rst
+++ b/packages/example_package/docs/intro.rst
@@ -56,6 +56,21 @@ Examples
 For examples, see :doc:`examples`. The example makes use of ``note-literalinclude`` to include the 
 example code in the documentation and highlight notes in the code (comment blocks starting with ``# @NOTE``).
 
+Generated Documentation Assets
+------------------------------
+
+This package also demonstrates package-local documentation asset generation. The docs build reads committed
+data from ``evaluation_results/simple_plot.csv`` and writes the generated plot to
+``docs/_generated/simple_plot.png``. The source documentation remains static and references the generated
+image using a normal relative path.
+
+.. figure:: _generated/simple_plot.png
+   :alt: Simple generated plot from committed CSV data
+   :align: center
+   :width: 70%
+
+   Example plot generated from committed CSV data during documentation generation.
+
 .. toctree::
    :maxdepth: 2
    :caption: Examples
diff --git a/packages/example_package/evaluation_results/simple_plot.csv b/packages/example_package/evaluation_results/simple_plot.csv
new file mode 100644
index 0000000..9675110
--- /dev/null
+++ b/packages/example_package/evaluation_results/simple_plot.csv
@@ -0,0 +1,6 @@
+x,y
+0,0
+1,1
+2,4
+3,9
+4,16
diff --git a/packages/example_package/pyproject.toml b/packages/example_package/pyproject.toml
index ef14420..cbe6014 100644
--- a/packages/example_package/pyproject.toml
+++ b/packages/example_package/pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
 
 [project.optional-dependencies]
 optional = [
+    "matplotlib",
     "pytest",
 ]
 
diff --git a/packages/lane_helpers/accvlab/lane_helpers/__init__.py b/packages/lane_helpers/accvlab/lane_helpers/__init__.py
new file mode 100644
index 0000000..e5b5150
--- /dev/null
+++ b/packages/lane_helpers/accvlab/lane_helpers/__init__.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from importlib.metadata import PackageNotFoundError, version
+
+from . import polyline
+
+try:
+    __version__ = version("accvlab.lane_helpers")
+except PackageNotFoundError:
+    __version__ = "0.0.0"  # placeholder used when no distribution metadata is found for the package
+
+
+__all__ = [
+    "__version__",
+    "polyline",
+]
diff --git a/packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py b/packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py
new file mode 100644
index 0000000..e1ec3a1
--- /dev/null
+++ b/packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .functions import (
+    interpolate,
+    interpolate_var_size_batch,
+    lengths,
+    lengths_var_size_batch,
+)
+
+__all__ = [
+    "interpolate",
+    "interpolate_var_size_batch",
+    "lengths",
+    "lengths_var_size_batch",
+]
diff --git a/packages/lane_helpers/accvlab/lane_helpers/polyline/functions.py b/packages/lane_helpers/accvlab/lane_helpers/polyline/functions.py
new file mode 100644
index 0000000..417575b
--- /dev/null
+++ b/packages/lane_helpers/accvlab/lane_helpers/polyline/functions.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from .. import _polyline_sampling
+
+if TYPE_CHECKING:
+    from accvlab.batching_helpers import RaggedBatch
+
+
+def interpolate(points: torch.Tensor, distances: torch.Tensor, *, relative: bool = False) -> torch.Tensor:
+    """Sample every polyline of a fixed-size batch at the given distances along it.
+
+    Args:
+        points: Polyline vertices, a CPU or CUDA tensor of shape ``(batch, num_points, num_dims)``.
+        distances: Sampling positions of shape ``(batch, num_distances)``, located on the same
+            device as ``points``. Out-of-range positions are clamped: negative values map to the
+            first point and values past the total length map to the last point (for
+            ``relative=True`` that means values below ``0`` and above ``1``).
+        relative: When ``True``, ``distances`` are fractions of each polyline's total length;
+            when ``False``, they are absolute distances measured from the polyline start.
+
+    Returns:
+        The sampled points, shape ``(batch, num_distances, num_dims)``, on the device of
+        ``points``.
+    """
+    return _polyline_sampling.polyline_interpolation(points, distances, relative=relative)
+
+
+def lengths(points: torch.Tensor) -> torch.Tensor:
+    """Return the total length of each polyline in a fixed-size batch.
+
+    Args:
+        points: Polyline vertices, a CPU or CUDA tensor of shape ``(batch, num_points, num_dims)``.
+
+    Returns:
+        One length per polyline: a tensor of shape ``(batch,)`` located on the same device as
+        ``points``.
+    """
+    return _polyline_sampling._polyline_lengths(points)
+
+
+def interpolate_var_size_batch(
+    points: RaggedBatch, distances: RaggedBatch, *, relative: bool = False
+) -> RaggedBatch:
+    """Sample every polyline of a variable-size batch at the given distances along it.
+
+    Args:
+        points: RaggedBatch-like polyline vertices of shape ``(batch, max_num_points, num_dims)``
+            with tensor data on CPU or CUDA.
+        distances: RaggedBatch-like sampling positions of shape ``(batch, max_num_distances)``
+            with tensor data on the same device as ``points``. Out-of-range positions are
+            clamped: negative values map to the first point and values past the total length map
+            to the last point (for ``relative=True``: values below ``0`` and above ``1``).
+        relative: When ``True``, ``distances`` are fractions of each polyline's total length;
+            when ``False``, they are absolute distances measured from the polyline start.
+
+    Returns:
+        RaggedBatch-like result of shape ``(batch, max_num_distances, num_dims)`` on the device
+        of ``points``, built via ``distances.create_with_sample_sizes_like_self``.
+
+    Raises:
+        AssertionError: If either input does not have exactly one batch dimension or its
+            ``non_uniform_dim`` is not ``1``.
+    """
+    assert points.num_batch_dims == 1, "points must have exactly one batch dimension"
+    assert distances.num_batch_dims == 1, "distances must have exactly one batch dimension"
+    assert (
+        points.non_uniform_dim == 1
+    ), "points.non_uniform_dim must be 1 for shape (batch, max_num_points, num_dims)"
+    assert (
+        distances.non_uniform_dim == 1
+    ), "distances.non_uniform_dim must be 1 for shape (batch, max_num_distances)"
+
+    # Hand the raw tensors and their sample sizes to the sampling backend.
+    sampled = _polyline_sampling._polyline_interpolation_var_size_batch(
+        points.tensor, distances.tensor, points.sample_sizes, distances.sample_sizes, relative=relative
+    )
+    return distances.create_with_sample_sizes_like_self(sampled)
+
+
+def lengths_var_size_batch(points: RaggedBatch) -> torch.Tensor:
+    """Return the total length of each polyline in a variable-size batch.
+
+    Args:
+        points: RaggedBatch-like polyline vertices of shape ``(batch, max_num_points, num_dims)``
+            with tensor data on CPU or CUDA.
+
+    Returns:
+        One length per polyline: a tensor of shape ``(batch,)`` on the same device as ``points``.
+    """
+    assert points.num_batch_dims == 1, "points must have exactly one batch dimension"
+    assert (
+        points.non_uniform_dim == 1
+    ), "points.non_uniform_dim must be 1 for shape (batch, max_num_points, num_dims)"
+    data, sizes = points.tensor, points.sample_sizes
+    return _polyline_sampling._polyline_lengths_var_size_batch(data, sizes)
diff --git a/packages/lane_helpers/docs/_on_doc_generation.py b/packages/lane_helpers/docs/_on_doc_generation.py
new file mode 100644
index 0000000..1d6c43d
--- /dev/null
+++ b/packages/lane_helpers/docs/_on_doc_generation.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+import sys
+from typing import Any
+
+_RESULTS_SUBDIR = Path("evaluation_results") / "polyline_runtime_evaluation"  # joined onto context.package_root
+_GENERATED_IMAGE_SUBDIR = Path("polyline_runtime_evaluation")  # joined onto context.generated_dir
+_DOC_BATCH_SIZES = [1, 64]  # batch sizes whose inputs are required and which are passed to the plotting call
+_DOC_REQUIRED_MARKDOWN_METRICS = (  # one ``batch_{batch_size}_{metric_name}.md`` input per size and metric
+    "runtime_shapely",
+    "runtime_cpu",
+    "runtime_cuda",
+    "speedup_cpu_vs_shapely",
+    "speedup_cuda_vs_shapely",
+    "speedup_cuda_vs_cpu",
+)
+_DOC_REQUIRED_IMAGE_NAMES = tuple(  # image files that must exist in the output directory after generation
+    f"batch_{batch_size}_{plot_kind}_comparison.png"
+    for batch_size in _DOC_BATCH_SIZES
+    for plot_kind in ("runtime", "speedup")
+)
+
+
+def _required_markdown_paths(input_dir: Path) -> list[Path]:
+    """Build the expected Markdown input paths below ``input_dir``, ordered by batch size, then metric."""
+    required: list[Path] = []
+    for size in _DOC_BATCH_SIZES:
+        required.extend(input_dir / f"batch_{size}_{name}.md" for name in _DOC_REQUIRED_MARKDOWN_METRICS)
+    return required
+
+
+def _validate_required_markdown_inputs(input_dir: Path) -> None:
+    """Raise ``FileNotFoundError`` unless ``input_dir`` and every required Markdown input exist."""
+    if not input_dir.exists():
+        raise FileNotFoundError(
+            "Required committed Markdown input directory is missing for lane_helpers docs asset generation: "
+            f"{input_dir}."
+        )
+    absent = [candidate for candidate in _required_markdown_paths(input_dir) if not candidate.exists()]
+    if not absent:
+        return
+    bullet_list = "\n".join(f"  - {candidate}" for candidate in absent)
+    raise FileNotFoundError(
+        "Missing required committed Markdown input file(s) for lane_helpers docs asset generation:\n" + bullet_list
+    )
+
+
+def _validate_required_images(output_dir: Path) -> None:
+    """Raise ``FileNotFoundError`` listing every required image that is missing from ``output_dir``."""
+    # Resolve all expected paths first, then keep the ones that do not exist.
+    expected = [output_dir / image_name for image_name in _DOC_REQUIRED_IMAGE_NAMES]
+    absent = [image_path for image_path in expected if not image_path.exists()]
+    if not absent:
+        return
+    bullet_list = "\n".join(f"  - {image_path}" for image_path in absent)
+    raise FileNotFoundError(
+        "Polyline runtime docs asset generation did not produce all images referenced by introduction.rst:\n"
+        f"{bullet_list}"
+    )
+
+
+def generate_docs_assets(context: Any) -> None:
+    """Generate the polyline runtime-evaluation images from the committed Markdown results."""
+    input_dir = context.package_root / _RESULTS_SUBDIR
+    output_dir = context.generated_dir / _GENERATED_IMAGE_SUBDIR
+    _validate_required_markdown_inputs(input_dir)
+
+    evaluation_dir = str(context.package_root / "evaluation")
+    sys.path.insert(0, evaluation_dir)  # only needed to import the plotting script; undone below
+    try:
+        import plot_shapely_evaluation
+        plot_shapely_evaluation.plot_from_markdown_directory(
+            input_dir=input_dir, output_dir=output_dir, batch_sizes=_DOC_BATCH_SIZES, annotate_plots=True
+        )
+    finally:
+        if evaluation_dir in sys.path:  # do not leak this package's path into later imports
+            sys.path.remove(evaluation_dir)
+    _validate_required_images(output_dir)
diff --git a/packages/lane_helpers/docs/api.rst b/packages/lane_helpers/docs/api.rst
new file mode 100644
index 0000000..072f978
--- /dev/null
+++ b/packages/lane_helpers/docs/api.rst
@@ -0,0 +1,9 @@
+API Reference
+=============
+
+.. automodule:: accvlab.lane_helpers
+
+polyline
+--------
+
+.. automodule:: accvlab.lane_helpers.polyline
diff --git a/packages/lane_helpers/docs/example.rst b/packages/lane_helpers/docs/example.rst
new file mode 100644
index 0000000..4f74766
--- /dev/null
+++ b/packages/lane_helpers/docs/example.rst
@@ -0,0 +1,16 @@
+Example
+=======
+
+Polyline Interpolation
+----------------------
+
+The example below samples a rectangle-shaped polyline at a handful of distances.
+
+.. important::
+
+   You can run the example using the script ``packages/lane_helpers/examples/basic_usage.py``.
+
+.. note-literalinclude:: ../examples/basic_usage.py
+   :language: python
+   :caption: packages/lane_helpers/examples/basic_usage.py
+   :linenos:
diff --git a/packages/lane_helpers/docs/images/polyline_sampling_illustration.png b/packages/lane_helpers/docs/images/polyline_sampling_illustration.png
new file mode 100644
index 0000000..1caaf78
Binary files /dev/null and b/packages/lane_helpers/docs/images/polyline_sampling_illustration.png differ
diff --git a/packages/lane_helpers/docs/index.rst b/packages/lane_helpers/docs/index.rst
new file mode 100644
index 0000000..40095e5
--- /dev/null
+++ b/packages/lane_helpers/docs/index.rst
@@ -0,0 +1,11 @@
+Lane Helpers
+============
+
+This is the documentation for the ``accvlab.lane_helpers`` package.
+
+.. toctree::
+   :maxdepth: 1
+
+   introduction
+   api
+   example
diff --git a/packages/lane_helpers/docs/introduction.rst b/packages/lane_helpers/docs/introduction.rst
new file mode 100644
index 0000000..9bf3eac
--- /dev/null
+++ b/packages/lane_helpers/docs/introduction.rst
@@ -0,0 +1,113 @@
+Introduction
+============
+
+Polyline Sampling
+-----------------
+
+Functionality
+^^^^^^^^^^^^^
+
+The ``lane_helpers`` package provides utilities for lane-processing workloads.
+
+The main functionality is batched polyline interpolation. A polyline is a sequence of points in the
+space :math:`\mathbb{R}^D`, written as :math:`\mathbf{p}_i`, where each pair of consecutive points defines
+one line segment.
+
+Given sampling distances :math:`d_j` measured from the first point :math:`\mathbf{p}_0` along the
+polyline, the sampling function :func:`~accvlab.lane_helpers.polyline.interpolate` returns the
+corresponding sampled points :math:`\mathbf{q}_j`.
+
+.. figure:: images/polyline_sampling_illustration.png
+   :alt: Illustration of polyline sampling
+   :align: center
+   :scale: 45%
+
+   Two-segment polyline sampled at two distances. The input points are shown as green circles, and the
+   sampled points are shown as red circles.
+
+Sampling distances do not need to be sorted. Distances can be provided either as absolute distances along
+the polyline or as fractions of each polyline's total length.
+
+Point coordinates are not limited to 2D. The coordinate dimension is the last tensor dimension, and 2D,
+3D, and higher-dimensional coordinates are supported.
+
+For batches with variable numbers of points or distances, use
+:func:`~accvlab.lane_helpers.polyline.interpolate_var_size_batch` with
+:class:`~accvlab.batching_helpers.RaggedBatch` inputs.
+
+Functionality to compute the total length of each polyline is also provided (through
+:func:`~accvlab.lane_helpers.polyline.lengths` and :func:`~accvlab.lane_helpers.polyline.lengths_var_size_batch`).
+
+Runtime Evaluation
+^^^^^^^^^^^^^^^^^^
+
+The runtime evaluation compares batched interpolation for both CPU and CUDA against a Shapely LineString
+reference over a grid of point counts, numbers of sampled distances, and batch sizes. Runtime plots report
+milliseconds per interpolation call, while speedup plots report the x-fold improvement over the Shapely
+reference.
+
+.. seealso::
+
+   The evaluation script is available at ``packages/lane_helpers/evaluation/shapely_evaluation.py``. It can be
+   used to run the benchmark sweep for different problem sizes on your target system.
+
+Performance depends on the batch size for both CPU and CUDA execution. CUDA parallelism scales with the number
+of polylines in the batch, so very small batch sizes may not fully utilize the GPU.
+
+For practical problem sizes, it is recommended to choose the implementation based primarily on where the
+tensors already live: CPU inputs should generally stay on CPU, and CUDA inputs should generally stay on CUDA.
+Moving tensors only to use a different implementation can dominate the interpolation cost.
+
+The plots below focus on batch sizes 1 and 64 as examples. The evaluation script runs for more batch sizes by
+default, and other batch sizes can be easily added.
+
+.. note::
+
+   The following measurements are intended as directional guidance. Exact runtimes depend on the system used,
+   with performance primarily influenced by the CPU and GPU.
+
+   The plots shown here were generated on a system with an ``NVIDIA RTX 5000 Ada Generation`` GPU and an
+   ``AMD Ryzen 9 7950X`` 16-Core Processor.
+
+.. note::
+
+   In the following runtime plots, markers highlight the smallest measured problem size, the largest measured
+   problem size, and the 100-point/100-distance cell.
+
+   In the speedup plots, markers highlight the smallest measured problem size and the largest speedup. If speedup is not
+   above 1x everywhere, they also mark representative cells near the first matching point-count and distance-count
+   configuration where speedup exceeds 1x.
+
+Batch size 1 shows behavior for the smallest batch configuration in the benchmark:
+
+.. figure:: _generated/polyline_runtime_evaluation/batch_1_runtime_comparison.png
+   :alt: Runtime comparison heatmaps for batch size 1
+   :align: center
+   :width: 100%
+
+   Runtime comparison for batch size 1. Rows vary the number of polyline points, and columns vary the number
+   of sampled distances.
+
+.. figure:: _generated/polyline_runtime_evaluation/batch_1_speedup_comparison.png
+   :alt: Speedup comparison heatmaps for batch size 1
+   :align: center
+   :width: 100%
+
+   Speedup comparison for batch size 1.
+
+For larger batch sizes, CUDA can expose more parallel work and its speedup over the other methods typically
+becomes more pronounced. Batch size 64 shows this behavior:
+
+.. figure:: _generated/polyline_runtime_evaluation/batch_64_runtime_comparison.png
+   :alt: Runtime comparison heatmaps for batch size 64
+   :align: center
+   :width: 100%
+
+   Runtime comparison for batch size 64.
+
+.. figure:: _generated/polyline_runtime_evaluation/batch_64_speedup_comparison.png
+   :alt: Speedup comparison heatmaps for batch size 64
+   :align: center
+   :width: 100%
+
+   Speedup comparison for batch size 64.
diff --git a/packages/lane_helpers/docu_referenced_dirs.txt b/packages/lane_helpers/docu_referenced_dirs.txt
new file mode 100644
index 0000000..1e107f5
--- /dev/null
+++ b/packages/lane_helpers/docu_referenced_dirs.txt
@@ -0,0 +1 @@
+examples
diff --git a/packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py b/packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py
new file mode 100644
index 0000000..2cf0d9f
--- /dev/null
+++ b/packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py
@@ -0,0 +1,157 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+import numpy as np
+
+_LARGE_SPEEDUP_THRESHOLD = 1_000.0
+
+
+# Helper function for formatting speedup values in result tables.
+def _format_speedup_value(value: float) -> str:
+    """Format ``value`` with two decimals, switching to scientific notation for large magnitudes."""
+    format_spec = ".2e" if abs(value) >= _LARGE_SPEEDUP_THRESHOLD else ".2f"
+    return format(value, format_spec)
+
+
+# Helper function for formatting one measured metric as a Markdown table.
+def _format_table(
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    scientific: bool,
+) -> str:
+    """Render ``data`` as a Markdown table with one row per point count and one column per distance count."""
+
+    def format_cell(cell_value: float) -> str:
+        if scientific:
+            return np.format_float_scientific(cell_value, precision=3)
+        return _format_speedup_value(cell_value)
+
+    column_titles = " | ".join(map(str, nums_distances))
+    lines = [f"| # Points (down) / # Distances (right) | {column_titles} |"]
+    lines.append("| :----- |" + " :-----: |" * len(nums_distances))
+    # Cells are addressed by index, so only the first len(nums_distances) columns of each row are emitted.
+    for row_idx, point_count in enumerate(nums_points):
+        cells = " | ".join(format_cell(data[row_idx, col_idx]) for col_idx in range(len(nums_distances)))
+        lines.append(f"| {point_count} | {cells} |")
+    return "\n".join(lines)
+
+
+# Helper function for writing one Markdown table to disk.
+def _write_markdown(
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    filename: Path,
+    scientific: bool,
+) -> None:
+    table = _format_table(data, nums_points, nums_distances, scientific=scientific)
+    filename.write_text(table + "\n", encoding="utf-8")  # UTF-8 file ending with a single newline
+
+
+# Helper function for writing the Markdown table output for one metric.
+def _write_metric_outputs(
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    filename_stem: Path,
+    scientific: bool,
+) -> None:
+    _write_markdown(
+        data,
+        nums_points,
+        nums_distances,
+        filename=filename_stem.with_suffix(".md"),  # NOTE: replaces any suffix already in the stem
+        scientific=scientific,
+    )
+
+
+# Entry point: write all Markdown tables for one evaluated batch size.
+def write_batch_results(
+    output_dir: Path,
+    batch_size: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+    shapely_runtime_ms: np.ndarray | None,
+    cpu_runtime_ms: np.ndarray,
+    cuda_runtime_ms: np.ndarray,
+    skip_shapely: bool,
+    assert_results: bool,
+    max_abs_diff_cpu: np.ndarray | None,
+    max_abs_diff_cuda: np.ndarray | None,
+    max_abs_diff_cuda_vs_cpu: np.ndarray | None,
+) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    cuda_speedup_over_cpu = cpu_runtime_ms / cuda_runtime_ms  # element-wise ratio; > 1 means CUDA was faster
+    prefix = f"batch_{batch_size}"  # every table is written as batch_<batch_size>_<metric>.md
+
+    def write_metric(
+        metric_name: str,
+        data: np.ndarray,
+        *,
+        scientific: bool,
+    ) -> None:
+        _write_metric_outputs(
+            data,
+            nums_points,
+            nums_distances,
+            filename_stem=output_dir / f"{prefix}_{metric_name}",
+            scientific=scientific,
+        )
+
+    if not skip_shapely:  # assumes shapely_runtime_ms is not None in this case — TODO confirm callers
+        cuda_speedup_over_shapely = shapely_runtime_ms / cuda_runtime_ms
+        cpu_speedup_over_shapely = shapely_runtime_ms / cpu_runtime_ms
+        write_metric(
+            "runtime_shapely",
+            shapely_runtime_ms,
+            scientific=True,
+        )
+    # CPU and CUDA outputs are always available; Shapely-related metrics are optional.
+    write_metric("runtime_cuda", cuda_runtime_ms, scientific=True)
+    write_metric(
+        "runtime_cpu",
+        cpu_runtime_ms,
+        scientific=True,
+    )
+    if not skip_shapely:
+        write_metric(
+            "speedup_cuda_vs_shapely",
+            cuda_speedup_over_shapely,
+            scientific=False,
+        )
+        write_metric(
+            "speedup_cpu_vs_shapely",
+            cpu_speedup_over_shapely,
+            scientific=False,
+        )
+    write_metric(
+        "speedup_cuda_vs_cpu",
+        cuda_speedup_over_cpu,
+        scientific=False,
+    )
+    if assert_results:  # assumes the max_abs_diff* arrays are provided in this case — TODO confirm callers
+        write_metric(
+            "max_abs_diff_cuda_vs_cpu",
+            max_abs_diff_cuda_vs_cpu,
+            scientific=True,
+        )
+    if assert_results and not skip_shapely:
+        write_metric("max_abs_diff", max_abs_diff_cuda, scientific=True)  # CUDA vs. Shapely; no "_cuda" infix
+        write_metric("max_abs_diff_cpu", max_abs_diff_cpu, scientific=True)  # CPU vs. Shapely
diff --git a/packages/lane_helpers/evaluation/plot_shapely_evaluation.py b/packages/lane_helpers/evaluation/plot_shapely_evaluation.py
new file mode 100644
index 0000000..34acf06
--- /dev/null
+++ b/packages/lane_helpers/evaluation/plot_shapely_evaluation.py
@@ -0,0 +1,595 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from dataclasses import dataclass
+from pathlib import Path
+
+import matplotlib
+
+matplotlib.use("Agg")
+
+import matplotlib.colors as colors
+from matplotlib.axes import Axes
+from matplotlib import pyplot as plt
+import numpy as np
+
+DEFAULT_ANNOTATE_PLOTS = True  # annotations are on unless --no-annotate-plots is passed
+_LARGE_SPEEDUP_THRESHOLD = 1_000.0  # speedups at or above this magnitude are shown in scientific notation
+_PLOT_FIGSIZE = (6.4, 5.2)  # (width, height); the width applies to single-subplot figures only
+_COMPARISON_SUBPLOT_WIDTH = 6.6  # per-subplot width of figures with more than one subplot
+_PLOT_SUPTITLE_FONT_SIZE = 22
+_PLOT_TITLE_FONT_SIZE = 18
+_PLOT_AXIS_LABEL_FONT_SIZE = 16
+_PLOT_TICK_LABEL_FONT_SIZE = 14
+_PLOT_COLORBAR_TICK_LABEL_FONT_SIZE = 14
+_PLOT_ANNOTATION_FONT_SIZE = 16
+_PLOT_ANNOTATION_MARKER_SIZE = 52  # passed as scatter(s=...) for the annotation markers
+_PLOT_ANNOTATION_X_OFFSET = 0.25  # horizontal gap between a speedup marker and its label, in cell widths
+_PLOT_COLORBAR_FRACTION = 0.046
+_PLOT_COLORBAR_PAD = 0.02
+
+
+@dataclass(frozen=True)
+class _MetricPlotConfig:
+    title: str  # subplot title of the metric's heatmap
+    annotation: str | None = None  # "runtime", "speedup", or None for a heatmap without annotations
+
+
+_METRIC_PLOT_CONFIGS = {  # keyed by the <metric> part of "batch_<N>_<metric>.md"
+    "runtime_shapely": _MetricPlotConfig("Shapely", annotation="runtime"),
+    "runtime_cuda": _MetricPlotConfig("CUDA", annotation="runtime"),
+    "runtime_cpu": _MetricPlotConfig("CPU", annotation="runtime"),
+    "speedup_cuda_vs_shapely": _MetricPlotConfig("CUDA vs. Shapely", annotation="speedup"),
+    "speedup_cpu_vs_shapely": _MetricPlotConfig("CPU vs. Shapely", annotation="speedup"),
+    "speedup_cuda_vs_cpu": _MetricPlotConfig("CUDA vs. CPU", annotation="speedup"),
+    "max_abs_diff_cuda_vs_cpu": _MetricPlotConfig("CUDA max abs. difference to CPU"),
+    "max_abs_diff": _MetricPlotConfig("CUDA max abs. difference to Shapely"),
+    "max_abs_diff_cpu": _MetricPlotConfig("CPU max abs. difference to Shapely"),
+}
+_SHAPELY_DEPENDENT_METRICS = frozenset(  # ignored when no runtime_shapely table exists for the batch
+    {
+        "runtime_shapely",
+        "speedup_cuda_vs_shapely",
+        "speedup_cpu_vs_shapely",
+        "max_abs_diff",
+        "max_abs_diff_cpu",
+    }
+)
+_RUNTIME_METRICS_WITH_SHAPELY = ("runtime_shapely", "runtime_cpu", "runtime_cuda")  # subplot order
+_RUNTIME_METRICS_WITHOUT_SHAPELY = ("runtime_cpu", "runtime_cuda")  # subplot order
+_SPEEDUP_METRICS_WITH_SHAPELY = (  # subplot order
+    "speedup_cpu_vs_shapely",
+    "speedup_cuda_vs_shapely",
+    "speedup_cuda_vs_cpu",
+)
+_SPEEDUP_METRICS_WITHOUT_SHAPELY = ("speedup_cuda_vs_cpu",)
+
+
+# Helper function for formatting speedup values in tables and annotations.
+def _format_speedup_value(value: float) -> str:
+    """Format ``value`` with two decimals, switching to scientific notation for large magnitudes."""
+    format_spec = ".2e" if abs(value) >= _LARGE_SPEEDUP_THRESHOLD else ".2f"
+    return format(value, format_spec)
+
+
+# Helper function for splitting one Markdown table row into stripped cells.
+def _split_markdown_table_row(row: str) -> list[str]:
+    """Return the stripped cells of a ``|``-delimited Markdown row; raise ``ValueError`` if it is not one."""
+    row = row.strip()
+    if not row.startswith("|") or not row.endswith("|"):
+        raise ValueError(f"Expected Markdown table row, got: {row}")
+    return [cell.strip() for cell in row[1:-1].split("|")]  # drop only the two outer pipes; keep empty cells
+
+
+# Helper function for loading one metric table written by `_write_markdown`.
+def _read_metric_table(filename: Path) -> tuple[list[int], list[int], np.ndarray]:
+    """Parse a metric table file into ``(point counts, distance counts, float64 value matrix)``."""
+    stripped_lines = (line.strip() for line in filename.read_text(encoding="utf-8").splitlines())
+    table_rows = [line for line in stripped_lines if line.startswith("|")]
+    if len(table_rows) < 3:
+        raise ValueError(f"Expected a Markdown header, separator, and at least one data row in {filename}")
+    header_row, _separator_row, *data_rows = table_rows
+
+    header_cells = _split_markdown_table_row(header_row)
+    if not (header_cells and header_cells[0].startswith("# Points")):
+        raise ValueError(f"Expected first Markdown header cell to describe point counts in {filename}")
+    nums_distances = list(map(int, header_cells[1:]))
+    expected_cell_count = len(nums_distances) + 1
+
+    nums_points: list[int] = []
+    value_rows: list[list[float]] = []
+    # Each data row holds the point count followed by one value per distance count.
+    for data_row in data_rows:
+        row_cells = _split_markdown_table_row(data_row)
+        if len(row_cells) != expected_cell_count:
+            raise ValueError(f"Expected {expected_cell_count} cells in {filename}, got {len(row_cells)}")
+        nums_points.append(int(row_cells[0]))
+        value_rows.append([float(cell) for cell in row_cells[1:]])
+
+    return nums_points, nums_distances, np.asarray(value_rows, dtype=np.float64)
+
+
+# Helper function for choosing which speedup heatmap cells should show numeric labels.
+def _selected_speedup_annotation_cells(
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+) -> list[tuple[int, int]]:
+    def find_value_index(values: list[int], value: int) -> int | None:
+        try:
+            index = values.index(value)
+        except ValueError:
+            return None
+        return index
+
+    def add_unique_cell(cells: list[tuple[int, int]], cell: tuple[int, int]) -> None:
+        if cell not in cells:
+            cells.append(cell)
+
+    def find_first_faster_distance_idx(points_idx: int) -> int | None:  # first column of the row at >= 1x
+        for distances_idx in range(len(nums_distances)):
+            if np.isfinite(data[points_idx, distances_idx]) and data[points_idx, distances_idx] >= 1.0:
+                return distances_idx  # NOTE(review): ">= 1.0" here, but "> 1.0" in the diagonal search below
+        return None
+
+    def find_first_faster_points_idx(distances_idx: int) -> int | None:  # first row of the column at >= 1x
+        for points_idx in range(len(nums_points)):
+            if np.isfinite(data[points_idx, distances_idx]) and data[points_idx, distances_idx] >= 1.0:
+                return points_idx
+        return None
+
+    cells: list[tuple[int, int]] = []  # (points_idx, distances_idx) pairs in insertion order, no duplicates
+
+    points_idx = find_value_index(nums_points, 2)  # cell for 2 points / 1 distance, if both are in the sweep
+    distances_idx = find_value_index(nums_distances, 1)
+    if points_idx is not None and distances_idx is not None:
+        add_unique_cell(cells, (points_idx, distances_idx))
+
+    finite_mask = np.isfinite(data)
+    if np.any(finite_mask):  # cell holding the largest finite speedup
+        finite_data = np.where(finite_mask, data, -np.inf)
+        points_idx, distances_idx = np.unravel_index(np.argmax(finite_data), data.shape)
+        add_unique_cell(cells, (int(points_idx), int(distances_idx)))
+
+    finite_values = data[finite_mask]
+    if finite_values.size > 0 and not np.all(finite_values > 1.0):  # some finite cell is not above 1x
+        distances_idx_by_value = {value: idx for idx, value in enumerate(nums_distances)}
+        for points_idx, num_points_current in enumerate(nums_points):
+            distances_idx = distances_idx_by_value.get(num_points_current)  # column with the same count
+            if distances_idx is None:
+                continue
+            if np.isfinite(data[points_idx, distances_idx]) and data[points_idx, distances_idx] > 1.0:
+                add_unique_cell(cells, (points_idx, distances_idx))
+                if points_idx > 0:  # also mark a cell in the row above
+                    faster_distances_idx = find_first_faster_distance_idx(points_idx - 1)
+                    if faster_distances_idx is not None:
+                        add_unique_cell(cells, (points_idx - 1, faster_distances_idx))
+                if distances_idx > 0:  # also mark a cell in the column to the left
+                    faster_points_idx = find_first_faster_points_idx(distances_idx - 1)
+                    if faster_points_idx is not None:
+                        add_unique_cell(cells, (faster_points_idx, distances_idx - 1))
+                break  # only the first matching equal-count cell is used
+
+    return cells
+
+
+# Helper function for choosing which runtime heatmap cells should get marker labels.
+def _selected_runtime_annotation_cells(
+    nums_points: list[int],
+    nums_distances: list[int],
+) -> list[tuple[int, int]]:
+    """Return the ``(points_idx, distances_idx)`` cells to mark on a runtime heatmap.
+
+    The first and last grid cells come first (when both axes are non-empty), followed by the cell for
+    100 points / 100 distances if both values are part of the sweep. Duplicate cells are reported once.
+    """
+    reference_value = 100
+    candidates: list[tuple[int, int]] = []
+
+    # Corner cells: first and last entry of both axes.
+    if nums_points and nums_distances:
+        candidates.append((0, 0))
+        candidates.append((len(nums_points) - 1, len(nums_distances) - 1))
+    # Fixed reference cell, located by value rather than by position.
+    if reference_value in nums_points and reference_value in nums_distances:
+        candidates.append((nums_points.index(reference_value), nums_distances.index(reference_value)))
+
+    # Drop repeats while keeping first-seen order, e.g. when the grid has a single cell.
+    selected: list[tuple[int, int]] = []
+    for candidate in candidates:
+        if candidate not in selected:
+            selected.append(candidate)
+    return selected
+
+
+# Helper function for placing numeric labels on selected speedup heatmap cells.
+def _speedup_annotation_text_position(
+    points_idx: int,
+    distances_idx: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+    selected_cells: list[tuple[int, int]],
+    data: np.ndarray,
+    max_speedup_cell: tuple[int, int] | None,
+) -> tuple[float, str]:
+    """Return the label x position and horizontal alignment for one annotated speedup cell."""
+    this_cell = (points_idx, distances_idx)
+    left_of_marker = (distances_idx - _PLOT_ANNOTATION_X_OFFSET, "right")
+    right_of_marker = (distances_idx + _PLOT_ANNOTATION_X_OFFSET, "left")
+    if max_speedup_cell == this_cell and distances_idx > 0:
+        return left_of_marker
+    def is_adjacent_at_least_one(other_cell: tuple[int, int]) -> bool:
+        manhattan_distance = abs(other_cell[0] - points_idx) + abs(other_cell[1] - distances_idx)
+        if other_cell == this_cell or manhattan_distance != 1:
+            return False
+        return bool(np.isfinite(data[other_cell]) and data[other_cell] >= 1.0)
+
+    in_lower_triangle = nums_distances[distances_idx] < nums_points[points_idx]
+    crowded = any(is_adjacent_at_least_one(tuple(cell)) for cell in selected_cells)
+    if distances_idx == len(nums_distances) - 1 or (distances_idx > 0 and in_lower_triangle and crowded):
+        return left_of_marker
+    return right_of_marker
+
+
+# Helper function for drawing optional numeric labels on selected speedup heatmap cells.
+def _annotate_speedup_heatmap(
+    ax: Axes,
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+) -> None:
+    selected_cells = _selected_speedup_annotation_cells(data, nums_points, nums_distances)
+    finite_mask = np.isfinite(data)
+    max_speedup_cell = None  # stays None when no cell is finite
+    if np.any(finite_mask):
+        finite_data = np.where(finite_mask, data, -np.inf)
+        points_idx, distances_idx = np.unravel_index(np.argmax(finite_data), data.shape)
+        max_speedup_cell = (int(points_idx), int(distances_idx))
+
+    for points_idx, distances_idx in selected_cells:
+        value = data[points_idx, distances_idx]
+        if not np.isfinite(value):
+            continue  # non-finite cells get neither a marker nor a label
+
+        ax.scatter(
+            [distances_idx],  # x: column index
+            [points_idx],  # y: row index
+            marker="o",
+            s=_PLOT_ANNOTATION_MARKER_SIZE,
+            c="black",
+            edgecolors="white",
+            linewidths=0.8,
+            zorder=3,
+        )
+
+        text_x, horizontal_alignment = _speedup_annotation_text_position(
+            points_idx,
+            distances_idx,
+            nums_points,
+            nums_distances,
+            selected_cells,
+            data,
+            max_speedup_cell,
+        )
+        ax.text(
+            text_x,
+            points_idx,
+            _format_speedup_value(value),
+            ha=horizontal_alignment,
+            va="center",
+            fontsize=_PLOT_ANNOTATION_FONT_SIZE,
+            color="black",
+            bbox={"boxstyle": "round,pad=0.12", "facecolor": "white", "edgecolor": "none", "alpha": 0.75},
+            zorder=4,  # higher zorder than the marker (3)
+        )
+
+
+# Helper function for drawing fixed reference markers on runtime heatmap cells.
+def _annotate_runtime_heatmap(
+    ax: Axes,
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+) -> None:
+    for points_idx, distances_idx in _selected_runtime_annotation_cells(nums_points, nums_distances):
+        value = data[points_idx, distances_idx]
+        if not np.isfinite(value):
+            continue  # non-finite cells get neither a marker nor a label
+
+        ax.scatter(
+            [distances_idx],  # x: column index
+            [points_idx],  # y: row index
+            marker="o",
+            s=_PLOT_ANNOTATION_MARKER_SIZE,
+            c="black",
+            edgecolors="white",
+            linewidths=0.8,
+            zorder=3,
+        )
+
+        if distances_idx == len(nums_distances) - 1:  # last column: label goes to the left of the marker
+            text_x = distances_idx - 0.15
+            horizontal_alignment = "right"
+        else:
+            text_x = distances_idx + 0.15
+            horizontal_alignment = "left"
+        ax.text(
+            text_x,
+            points_idx,
+            f"{value:.1e}",  # scientific notation with one decimal
+            ha=horizontal_alignment,
+            va="center",
+            fontsize=_PLOT_ANNOTATION_FONT_SIZE,
+            color="black",
+            bbox={"boxstyle": "round,pad=0.12", "facecolor": "white", "edgecolor": "none", "alpha": 0.75},
+            zorder=4,  # higher zorder than the marker (3)
+        )
+
+
+# Helper function for drawing one heatmap into an existing subplot.
+def _draw_heatmap(
+    ax: Axes,
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    title: str,
+    log_scale: bool,
+    annotate_speedup: bool = False,
+    annotate_runtime: bool = False,
+) -> None:
+    """Draw ``data`` as a heatmap with labelled axes, a colorbar, and optional cell annotations."""
+    norm = None
+    if log_scale:
+        # LogNorm rejects non-finite limits, so infinite cells (e.g. a speedup over a 0 ms runtime) must
+        # not define the color range; otherwise plotting the figure fails.
+        positive_values = data[np.isfinite(data) & (data > 0)]
+        if positive_values.size > 0:
+            norm = colors.LogNorm(vmin=positive_values.min(), vmax=positive_values.max())
+
+    image = ax.imshow(data, norm=norm)
+    ax.set_yticks(list(range(len(nums_points))), labels=nums_points, fontsize=_PLOT_TICK_LABEL_FONT_SIZE)
+    ax.set_ylabel("Number of points", fontsize=_PLOT_AXIS_LABEL_FONT_SIZE)
+    ax.set_xticks(
+        list(range(len(nums_distances))),
+        labels=nums_distances,
+        rotation=45,
+        fontsize=_PLOT_TICK_LABEL_FONT_SIZE,
+    )
+    ax.set_xlabel("Number of distances", fontsize=_PLOT_AXIS_LABEL_FONT_SIZE)
+    ax.set_title(title, fontsize=_PLOT_TITLE_FONT_SIZE, pad=12)
+    colorbar = ax.figure.colorbar(
+        image, ax=ax, fraction=_PLOT_COLORBAR_FRACTION, pad=_PLOT_COLORBAR_PAD
+    )
+    colorbar.ax.tick_params(labelsize=_PLOT_COLORBAR_TICK_LABEL_FONT_SIZE)
+    colorbar.ax.yaxis.offsetText.set_fontsize(_PLOT_COLORBAR_TICK_LABEL_FONT_SIZE)
+    if annotate_speedup:
+        _annotate_speedup_heatmap(ax, data, nums_points, nums_distances)
+    if annotate_runtime:
+        _annotate_runtime_heatmap(ax, data, nums_points, nums_distances)
+
+
+# Helper function for writing a multi-subplot comparison plot for one metric group.
+def _plot_metric_comparison(
+    metric_names: tuple[str, ...],
+    metric_data: dict[str, np.ndarray],
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    batch_size: int,
+    figure_title: str,
+    filename: Path,
+    annotate_plots: bool,
+) -> None:
+    available_metric_names = tuple(metric_name for metric_name in metric_names if metric_name in metric_data)
+    if not available_metric_names:
+        return  # nothing to plot: no figure is created and `filename` is not written
+
+    subplot_width = _PLOT_FIGSIZE[0] if len(available_metric_names) == 1 else _COMPARISON_SUBPLOT_WIDTH
+    fig, axes = plt.subplots(
+        1,
+        len(available_metric_names),  # one subplot per available metric, in `metric_names` order
+        figsize=(subplot_width * len(available_metric_names), _PLOT_FIGSIZE[1]),
+        constrained_layout=True,
+    )
+    fig.suptitle(f"{figure_title} (Batch Size {batch_size})", fontsize=_PLOT_SUPTITLE_FONT_SIZE)
+    axes = np.atleast_1d(axes).tolist()  # plt.subplots returns a bare Axes when there is one subplot
+
+    for ax, metric_name in zip(axes, available_metric_names):
+        metric_config = _METRIC_PLOT_CONFIGS[metric_name]
+        _draw_heatmap(
+            ax,
+            metric_data[metric_name],
+            nums_points,
+            nums_distances,
+            title=metric_config.title,
+            log_scale=True,  # comparison heatmaps always request the logarithmic color scale
+            annotate_speedup=metric_config.annotation == "speedup" and annotate_plots,
+            annotate_runtime=metric_config.annotation == "runtime" and annotate_plots,
+        )
+    fig.savefig(filename)
+    plt.close(fig)  # close explicitly; pyplot otherwise keeps the figure registered
+
+
+# Helper function for writing comparison plots whose subplot counts depend on Shapely availability.
+def _write_comparison_outputs(
+    output_dir: Path,
+    batch_size: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+    metric_data: dict[str, np.ndarray],
+    *,
+    has_shapely_results: bool,
+    annotate_plots: bool,
+) -> list[Path]:
+    """Write the runtime and speedup comparison figures for one batch size.
+
+    Returns only the paths that were written; a figure with no available metric in ``metric_data`` is skipped.
+    """
+    prefix = f"batch_{batch_size}"
+    runtime_metric_names, speedup_metric_names = (
+        (_RUNTIME_METRICS_WITH_SHAPELY, _SPEEDUP_METRICS_WITH_SHAPELY)
+        if has_shapely_results
+        else (_RUNTIME_METRICS_WITHOUT_SHAPELY, _SPEEDUP_METRICS_WITHOUT_SHAPELY)
+    )
+    figure_specs = (
+        (runtime_metric_names, "Runtime [ms]", output_dir / f"{prefix}_runtime_comparison.png"),
+        (speedup_metric_names, "Speedup [x-fold]", output_dir / f"{prefix}_speedup_comparison.png"),
+    )
+
+    written_files: list[Path] = []
+    for metric_names, figure_title, filename in figure_specs:
+        # _plot_metric_comparison returns without saving when none of its metrics is available.
+        if not any(metric_name in metric_data for metric_name in metric_names):
+            continue
+        _plot_metric_comparison(
+            metric_names,
+            metric_data,
+            nums_points,
+            nums_distances,
+            batch_size=batch_size,
+            figure_title=figure_title,
+            filename=filename,
+            annotate_plots=annotate_plots,
+        )
+        written_files.append(filename)
+    return written_files
+
+
+# Helper function for parsing comma-separated integer lists.
+def _parse_int_list(value: str) -> list[int]:
+    """Parse a comma-separated list of integers, ignoring empty and whitespace-only items."""
+    return [int(item) for item in value.split(",") if item.strip()]
+
+
+def plot_batch_results_from_markdown(
+    input_dir: Path,
+    output_dir: Path,
+    batch_size: int,
+    annotate_plots: bool,
+) -> list[Path]:
+    """Plot the comparison figures for one batch size from its ``batch_<batch_size>_*.md`` tables.
+
+    Raises ``FileNotFoundError`` without usable tables and ``ValueError`` when tables disagree on their axes.
+    """
+    prefix = f"batch_{batch_size}_"
+    markdown_files = sorted(input_dir.glob(f"{prefix}*.md"))
+    if not markdown_files:
+        raise FileNotFoundError(f"No Markdown result tables found for batch={batch_size} in {input_dir}")
+    files_by_metric = {path.stem[len(prefix) :]: path for path in markdown_files}
+    has_shapely_results = "runtime_shapely" in files_by_metric
+
+    metric_data: dict[str, np.ndarray] = {}
+    axes: tuple[list[int], list[int]] | None = None  # shared tick labels of all heatmaps for this batch
+    for metric_name, markdown_file in files_by_metric.items():
+        if metric_name not in _METRIC_PLOT_CONFIGS:
+            continue
+        if metric_name in _SHAPELY_DEPENDENT_METRICS and not has_shapely_results:
+            continue
+        nums_points, nums_distances, data = _read_metric_table(markdown_file)
+        if axes is not None and axes != (nums_points, nums_distances):
+            raise ValueError(f"Axes of {markdown_file} differ from the other tables for batch={batch_size}")
+        axes = (nums_points, nums_distances)
+        metric_data[metric_name] = data
+
+    plotted_files: list[Path] = []
+    if axes is not None:
+        output_dir.mkdir(parents=True, exist_ok=True)
+        plotted_files = _write_comparison_outputs(
+            output_dir,
+            batch_size,
+            axes[0],
+            axes[1],
+            metric_data,
+            has_shapely_results=has_shapely_results,
+            annotate_plots=annotate_plots,
+        )
+    if not plotted_files:
+        raise FileNotFoundError(
+            f"No known Markdown result tables found for batch={batch_size} in {input_dir}"
+        )
+    return plotted_files
+
+
+def plot_from_markdown_directory(
+    *,
+    input_dir: Path,
+    output_dir: Path,
+    batch_sizes: list[int],
+    annotate_plots: bool = DEFAULT_ANNOTATE_PLOTS,
+) -> list[Path]:
+    """Plot the comparison figures for every requested batch size and return the collected plot paths."""
+    if not input_dir.exists():
+        raise FileNotFoundError(f"Markdown input directory does not exist: {input_dir}")
+
+    per_batch_files = (
+        plot_batch_results_from_markdown(
+            input_dir=input_dir, output_dir=output_dir, batch_size=batch_size, annotate_plots=annotate_plots
+        )
+        for batch_size in batch_sizes
+    )
+
+    # Flattening drives the generator: batches are plotted one after another, in ``batch_sizes`` order.
+    return [plotted_file for batch_files in per_batch_files for plotted_file in batch_files]
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Generate polyline runtime plot images from Markdown result tables.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--input-dir",
+        type=Path,
+        required=True,
+        help="Directory containing Markdown result tables.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        required=True,
+        help="Directory where plot images should be written.",
+    )
+    parser.add_argument(
+        "--batch-sizes",
+        default="1,4,16,64",  # kept as a string here; main() splits it with _parse_int_list
+        help="Comma-separated batch sizes to plot.",
+    )
+    no_annotate_plots_action = parser.add_argument(
+        "--no-annotate-plots",
+        dest="annotate_plots",
+        action="store_false",
+        help="Disable annotations in generated heatmaps.",
+    )
+    parser.set_defaults(annotate_plots=DEFAULT_ANNOTATE_PLOTS)  # parser-level default used when the flag is absent
+    no_annotate_plots_action.default = argparse.SUPPRESS  # so --help shows no "(default: ...)" for this flag
+    return parser.parse_args()
+
+
+def main() -> None:
+    """Command-line entry point: plot the requested batch sizes and print each resulting plot path."""
+    cli_args = _parse_args()
+    generated_paths = plot_from_markdown_directory(
+        input_dir=cli_args.input_dir,
+        output_dir=cli_args.output_dir,
+        batch_sizes=_parse_int_list(cli_args.batch_sizes),
+        annotate_plots=cli_args.annotate_plots,
+    )
+    for generated_path in generated_paths:
+        print(f"Generated plot: {generated_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/lane_helpers/evaluation/shapely_evaluation.py b/packages/lane_helpers/evaluation/shapely_evaluation.py
new file mode 100644
index 0000000..1048191
--- /dev/null
+++ b/packages/lane_helpers/evaluation/shapely_evaluation.py
@@ -0,0 +1,681 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from collections.abc import Callable, Iterator
+import gc
+import sys
+import time
+from pathlib import Path
+from types import ModuleType
+
+import numpy as np
+from shapely import get_coordinates, line_interpolate_point, linestrings
+import torch
+
+# Import helpers for outputting results and plots
+SCRIPT_DIR = Path(__file__).resolve().parent
+if str(SCRIPT_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPT_DIR))
+import plot_shapely_evaluation
+import _shapely_evaluation_outputs as shapely_evaluation_outputs
+
+# ==================== Default configuration for the evaluation ====================
+
+# These constants are convenient local configuration knobs. The same settings can also be
+# supplied as CLI arguments.
+# When changing these constants, check the CLI arguments further below in the script, because
+# some flags only override the default in one direction (e.g. `--skip-shapely` can only enable it).
+
+# Sweep values for the heatmap axes and the batch-size examples.
+DEFAULT_NUMS_POINTS = [2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]
+DEFAULT_NUMS_DISTANCES = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]
+DEFAULT_BATCH_SIZES = [1, 4, 16, 64]
+# Keep the measured work roughly constant across batch sizes.
+DEFAULT_NUM_POLYLINES_PER_MEASUREMENT = 64 * 10
+# Warm up a representative mid-sized configuration before timing the sweep.
+DEFAULT_NUM_WARMUP_RUNS = 3
+DEFAULT_WARMUP_NUM_POINTS = 100
+DEFAULT_WARMUP_NUM_DISTANCES = 100
+# Shapely can be skipped for faster CPU/CUDA-only benchmark sweeps.
+DEFAULT_SKIP_SHAPELY = False
+# Result checks are optional because they add Shapely reference work to each configuration.
+DEFAULT_ASSERT_RESULTS = False
+DEFAULT_ASSERT_ATOL = 1e-3
+DEFAULT_ASSERT_RTOL = 0.0
+# Plot annotations call out representative cells in generated heatmaps.
+DEFAULT_ANNOTATE_PLOTS = True
+# ================== End: Default configuration for the evaluation =================
+
+
+# ================== Constants for the evaluation ==================
+DEVICE = "cuda"  # device passed to torch.tensor() for the CUDA-side inputs
+DTYPE_NP = np.float32  # dtype of the generated NumPy inputs and of the Shapely reference output
+DTYPE_TORCH = torch.float32  # dtype of every tensor created from the NumPy inputs
+_POLYLINE_MODULE: ModuleType | None = None  # cache filled on first use by _get_polyline_module()
+# ================ End: Constants for the evaluation ===============
+
+
+# Helper function for lazily importing the compiled polyline module outside plotting-only mode.
+def _get_polyline_module() -> ModuleType:
+    """Return the polyline module, importing it on first use and caching it afterwards."""
+    global _POLYLINE_MODULE
+    if _POLYLINE_MODULE is None:
+        from accvlab.lane_helpers import polyline
+        _POLYLINE_MODULE = polyline
+    return _POLYLINE_MODULE
+
+
+# Helper function for config parsing; raises ValueError when an item is not a valid integer.
+def _parse_int_list(value: str) -> list[int]:
+    """Parse a comma-separated string into ints, ignoring empty or whitespace-only items."""
+    return [int(item) for item in value.split(",") if item.strip()]
+
+
+# Helper function for computing the batched Shapely reference.
+def _compute_batched_shapely_reference(points: np.ndarray, distances: np.ndarray) -> np.ndarray:
+    """Interpolate every polyline at its distances with Shapely.
+
+    ``points`` is indexed as (batch, point, coordinate) and ``distances`` as (batch, distance);
+    the result is reshaped to (batch, distance, coordinate) and cast to ``DTYPE_NP``.
+    """
+    # One LineString per batch entry; the added axis pairs each line with its own row of distances.
+    lines = linestrings(points)
+    sampled = line_interpolate_point(lines[:, None], distances)
+    # Collect the coordinates of all sampled points, then restore the batched layout.
+    coords = get_coordinates(sampled)
+    target_shape = (points.shape[0], distances.shape[1], points.shape[2])
+    return coords.reshape(target_shape).astype(DTYPE_NP)
+
+
+# Helper function for computing per-polyline lengths in NumPy.
+def _compute_polyline_lengths_np(points: np.ndarray) -> np.ndarray:
+    """Return the summed segment length of each polyline (zeros when there are no segments)."""
+    if points.shape[1] <= 1:
+        return np.zeros((points.shape[0],), dtype=DTYPE_NP)
+    segment_lengths = np.linalg.norm(np.diff(points, axis=1), axis=2)
+    return segment_lengths.sum(axis=1).astype(DTYPE_NP)
+
+
+# Helper function that asserts an implementation's output is within tolerance of the Shapely result.
+def _assert_matches_shapely(
+    shapely_result: np.ndarray,  # reference values
+    result: np.ndarray,  # values under test
+    *,
+    implementation_name: str,  # label used in the failure message (callers pass "CPU" / "CUDA")
+    batch_size: int,  # batch_size, num_points and num_distances only appear in the failure message
+    num_points: int,
+    num_distances: int,
+    atol: float,  # absolute tolerance forwarded to assert_allclose
+    rtol: float,  # relative tolerance forwarded to assert_allclose
+) -> None:
+    try:
+        np.testing.assert_allclose(result, shapely_result, atol=atol, rtol=rtol)
+    except AssertionError as exc:  # re-raise with the failing configuration in the message
+        max_abs_diff = np.abs(shapely_result - result).max()
+        raise AssertionError(
+            f"{implementation_name} result differs from Shapely for "
+            f"batch={batch_size}, points={num_points}, distances={num_distances}; "
+            f"max_abs_diff={max_abs_diff}, atol={atol}, rtol={rtol}"
+        ) from exc
+
+
+# Helper function for constructing one deterministic benchmark input configuration.
+def _make_evaluation_case(
+    batch_size: int,
+    num_points: int,
+    num_distances: int,
+    *,
+    seed: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Build seeded random 2D polylines plus sampling distances scaled to each polyline's length."""
+    rng = np.random.default_rng(seed=seed)
+    # Polyline vertices are drawn uniformly from the unit square.
+    points = rng.uniform(0.0, 1.0, size=(batch_size, num_points, 2)).astype(DTYPE_NP)
+    # Draw normalized positions first, then scale each row by the length of its polyline.
+    fractions = rng.uniform(0.0, 1.0, size=(batch_size, num_distances)).astype(DTYPE_NP)
+    polyline_lengths = _compute_polyline_lengths_np(points)
+    return points, fractions * polyline_lengths[:, None]
+
+
+# Helper function for iterating over deterministic benchmark configurations.
+def _iter_evaluation_cases(
+    batch_size: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+) -> Iterator[tuple[int, int, int, int, int]]:
+    """Yield (points_idx, distances_idx, num_points, num_distances, seed) for the whole grid."""
+    for row, n_points in enumerate(nums_points):
+        for col, n_distances in enumerate(nums_distances):
+            yield row, col, n_points, n_distances, batch_size * 1_000_000 + n_points * 1_000 + n_distances
+
+
+# Helper function for placing the same NumPy inputs on CUDA and CPU.
+def _make_torch_tensors(*arrays: np.ndarray) -> tuple[torch.Tensor, ...]:
+    """Return copies of ``arrays`` as tensors: all DEVICE tensors first, then all CPU tensors."""
+    on_device = tuple(torch.tensor(a, device=DEVICE, dtype=DTYPE_TORCH) for a in arrays)
+    # Every DEVICE tensor is created before the first CPU tensor.
+    on_cpu = tuple(torch.tensor(a, device="cpu", dtype=DTYPE_TORCH) for a in arrays)
+    return on_device + on_cpu
+
+
+# Helper function for placing NumPy inputs on one target device.
+def _make_torch_tensors_on_device(
+    *arrays: np.ndarray,
+    device: str,
+) -> tuple[torch.Tensor, ...]:
+    """Return a ``DTYPE_TORCH`` tensor copy of each array on ``device``, in argument order."""
+    return tuple([torch.tensor(array, device=device, dtype=DTYPE_TORCH) for array in arrays])
+
+
+# Helper function for timing repeated calls and synchronizing CUDA work when needed.
+def _time_call(
+    function: Callable[[], object],
+    *,
+    num_runs: int,
+    synchronize_cuda: bool = False,
+) -> float:
+    """Return the mean wall-clock seconds per call over ``num_runs`` back-to-back calls."""
+    if synchronize_cuda:
+        # Drain already-queued CUDA work so it is not attributed to this measurement.
+        torch.cuda.synchronize()
+    started_at = time.perf_counter()
+    for _ in range(num_runs):
+        function()
+    if synchronize_cuda:
+        # Wait for the work launched above to finish before reading the clock.
+        torch.cuda.synchronize()
+    return (time.perf_counter() - started_at) / num_runs
+
+
+# Helper function for reducing cross-implementation timing interference.
+def _cleanup_between_implementation_sweeps() -> None:
+    gc.collect()  # run a full garbage collection pass
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()  # wait for outstanding CUDA work
+        torch.cuda.empty_cache()  # release unused cached GPU memory held by PyTorch's allocator
+
+
+# Helper function for timing the Shapely reference implementation.
+def _time_shapely(
+    points: np.ndarray,
+    distances: np.ndarray,
+    *,
+    num_runs: int,
+) -> float:
+    """Return the mean seconds per batched Shapely interpolation of ``points`` at ``distances``."""
+
+    def run_once() -> np.ndarray:
+        return _compute_batched_shapely_reference(points, distances)
+
+    return _time_call(run_once, num_runs=num_runs)
+
+
+# Helper function for timing the CUDA implementation.
+def _time_cuda(
+    points: torch.Tensor,
+    distances: torch.Tensor,
+    *,
+    num_runs: int,
+) -> float:
+    """Return the mean seconds per ``interpolate`` call, synchronizing CUDA around the timing."""
+    module = _get_polyline_module()
+
+    def run_once() -> object:
+        return module.interpolate(points, distances)
+
+    # Synchronize before and after the timed loop so the measurement covers the full CUDA work.
+    return _time_call(run_once, num_runs=num_runs, synchronize_cuda=True)
+
+
+# Helper function for timing the CPU implementation.
+def _time_cpu(
+    points: torch.Tensor,
+    distances: torch.Tensor,
+    *,
+    num_runs: int,
+) -> float:
+    """Return the mean seconds per ``interpolate`` call on the given tensors (no CUDA sync)."""
+    module = _get_polyline_module()
+    # synchronize_cuda keeps its default (False) for this path.
+    return _time_call(lambda: module.interpolate(points, distances), num_runs=num_runs)
+
+
+# Helper function for warming up all selected implementations once before measured runs.
+def _run_warmup(
+    *,
+    batch_size: int,
+    num_points: int,
+    num_distances: int,
+    num_warmup_runs: int,
+    skip_shapely: bool,
+) -> None:
+    """Run every selected implementation ``num_warmup_runs`` times on one untimed input.
+
+    Does nothing when ``num_warmup_runs`` is not positive. Shapely is skipped when
+    ``skip_shapely`` is set; the CPU and CUDA paths always run.
+    """
+    if num_warmup_runs <= 0:
+        return
+
+    points_np, distances_np = _make_evaluation_case(batch_size, num_points, num_distances, seed=0)
+    points_gpu, distances_gpu, points_cpu, distances_cpu = _make_torch_tensors(points_np, distances_np)
+    interpolate = _get_polyline_module().interpolate
+    for _ in range(num_warmup_runs):
+        if not skip_shapely:
+            _compute_batched_shapely_reference(points_np, distances_np)
+        interpolate(points_cpu, distances_cpu)
+        interpolate(points_gpu, distances_gpu)
+
+    # Wait for the queued CUDA warmup work to finish before returning.
+    torch.cuda.synchronize()
+
+
+# Helper that checks CPU and CUDA outputs against Shapely for every case and records the max differences.
+def _run_validation_sweep(
+    batch_size: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    assert_atol: float,
+    assert_rtol: float,
+    max_abs_diff_cpu: np.ndarray,  # output: |Shapely - CPU| maximum, written in place per case
+    max_abs_diff_cuda: np.ndarray,  # output: |Shapely - CUDA| maximum, written in place per case
+    max_abs_diff_cuda_vs_cpu: np.ndarray,  # output: |CPU - CUDA| maximum, written in place per case
+) -> None:
+    print(f"Running validation sweep for batch={batch_size}")
+    polyline_module = _get_polyline_module()
+    for points_idx, distances_idx, num_points_current, num_distances_current, seed in _iter_evaluation_cases(
+        batch_size, nums_points, nums_distances
+    ):
+        print(
+            "Running validation "
+            f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}"
+        )
+        points_np, distances_np = _make_evaluation_case(
+            batch_size,
+            num_points_current,
+            num_distances_current,
+            seed=seed,
+        )
+        shapely_result = _compute_batched_shapely_reference(points_np, distances_np)
+        points_gpu, distances_gpu, points_cpu, distances_cpu = _make_torch_tensors(points_np, distances_np)
+        cpu_result = polyline_module.interpolate(points_cpu, distances_cpu).numpy()
+        cuda_result = polyline_module.interpolate(points_gpu, distances_gpu).cpu().numpy()
+
+        max_abs_diff_cpu[points_idx, distances_idx] = np.abs(shapely_result - cpu_result).max()
+        max_abs_diff_cuda[points_idx, distances_idx] = np.abs(shapely_result - cuda_result).max()
+        max_abs_diff_cuda_vs_cpu[points_idx, distances_idx] = np.abs(cpu_result - cuda_result).max()
+
+        _assert_matches_shapely(  # raises AssertionError on the first CPU mismatch
+            shapely_result,
+            cpu_result,
+            implementation_name="CPU",
+            batch_size=batch_size,
+            num_points=num_points_current,
+            num_distances=num_distances_current,
+            atol=assert_atol,
+            rtol=assert_rtol,
+        )
+        _assert_matches_shapely(  # the CUDA check only runs if the CPU check passed
+            shapely_result,
+            cuda_result,
+            implementation_name="CUDA",
+            batch_size=batch_size,
+            num_points=num_points_current,
+            num_distances=num_distances_current,
+            atol=assert_atol,
+            rtol=assert_rtol,
+        )
+
+
+# Helper function timing Shapely, CPU and CUDA on each point-count/distance-count pair of one batch size.
+def _evaluate_batch_size(
+    batch_size: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    num_runs: int,  # timed calls per configuration
+    assert_results: bool,  # when set, a validation sweep runs after the timing sweeps
+    assert_atol: float,
+    assert_rtol: float,
+    skip_shapely: bool,  # when set, the Shapely sweep is skipped and its runtime grid is None
+) -> tuple[
+    np.ndarray | None, np.ndarray, np.ndarray, np.ndarray | None, np.ndarray | None, np.ndarray | None
+]:  # (shapely_ms, cpu_ms, cuda_ms, max_diff_cpu, max_diff_cuda, max_diff_cuda_vs_cpu)
+    result_shape = (len(nums_points), len(nums_distances))  # rows: point counts, columns: distance counts
+
+    shapely_runtime_ms = None if skip_shapely else np.zeros(result_shape, dtype=np.float64)
+    cuda_runtime_ms = np.zeros(result_shape, dtype=np.float64)
+    cpu_runtime_ms = np.zeros(result_shape, dtype=np.float64)
+
+    max_abs_diff_cuda = np.zeros_like(cpu_runtime_ms) if assert_results else None
+    max_abs_diff_cpu = np.zeros_like(cpu_runtime_ms) if assert_results else None
+    max_abs_diff_cuda_vs_cpu = np.zeros_like(cpu_runtime_ms) if assert_results else None
+
+    if not skip_shapely:
+        print(f"Running Shapely sweep for batch={batch_size}, runs={num_runs}")
+        for (
+            points_idx,
+            distances_idx,
+            num_points_current,
+            num_distances_current,
+            seed,
+        ) in _iter_evaluation_cases(batch_size, nums_points, nums_distances):
+            print(
+                "Running Shapely evaluation "
+                f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}, "
+                f"runs={num_runs}"
+            )
+            points_np, distances_np = _make_evaluation_case(
+                batch_size,
+                num_points_current,
+                num_distances_current,
+                seed=seed,
+            )
+
+            shapely_runtime = _time_shapely(
+                points_np,
+                distances_np,
+                num_runs=num_runs,
+            )
+            shapely_runtime_ms[points_idx, distances_idx] = shapely_runtime * 1000  # seconds -> milliseconds
+        _cleanup_between_implementation_sweeps()
+
+    print(f"Running CPU sweep for batch={batch_size}, runs={num_runs}")
+    for points_idx, distances_idx, num_points_current, num_distances_current, seed in _iter_evaluation_cases(
+        batch_size, nums_points, nums_distances
+    ):
+        print(
+            "Running CPU evaluation "
+            f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}, "
+            f"runs={num_runs}"
+        )
+        points_np, distances_np = _make_evaluation_case(  # same seed per case in every sweep
+            batch_size,
+            num_points_current,
+            num_distances_current,
+            seed=seed,
+        )
+        points_cpu, distances_cpu = _make_torch_tensors_on_device(
+            points_np,
+            distances_np,
+            device="cpu",
+        )
+
+        cpu_runtime_ms[points_idx, distances_idx] = (
+            _time_cpu(
+                points_cpu,
+                distances_cpu,
+                num_runs=num_runs,
+            )
+            * 1000  # seconds -> milliseconds
+        )
+    _cleanup_between_implementation_sweeps()
+
+    print(f"Running CUDA sweep for batch={batch_size}, runs={num_runs}")
+    for points_idx, distances_idx, num_points_current, num_distances_current, seed in _iter_evaluation_cases(
+        batch_size, nums_points, nums_distances
+    ):
+        print(
+            "Running CUDA evaluation "
+            f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}, "
+            f"runs={num_runs}"
+        )
+        points_np, distances_np = _make_evaluation_case(  # same seed per case in every sweep
+            batch_size,
+            num_points_current,
+            num_distances_current,
+            seed=seed,
+        )
+        points_gpu, distances_gpu = _make_torch_tensors_on_device(
+            points_np,
+            distances_np,
+            device=DEVICE,
+        )
+
+        cuda_runtime_ms[points_idx, distances_idx] = (
+            _time_cuda(
+                points_gpu,
+                distances_gpu,
+                num_runs=num_runs,
+            )
+            * 1000  # seconds -> milliseconds
+        )
+    _cleanup_between_implementation_sweeps()
+
+    if assert_results:  # validation runs after all timing sweeps and fills the max_abs_diff_* grids
+        _run_validation_sweep(
+            batch_size,
+            nums_points,
+            nums_distances,
+            assert_atol=assert_atol,
+            assert_rtol=assert_rtol,
+            max_abs_diff_cpu=max_abs_diff_cpu,
+            max_abs_diff_cuda=max_abs_diff_cuda,
+            max_abs_diff_cuda_vs_cpu=max_abs_diff_cuda_vs_cpu,
+        )
+        _cleanup_between_implementation_sweeps()
+
+    return (
+        shapely_runtime_ms,  # None when skip_shapely is set
+        cpu_runtime_ms,
+        cuda_runtime_ms,
+        max_abs_diff_cpu,  # the three difference grids are None unless assert_results is set
+        max_abs_diff_cuda,
+        max_abs_diff_cuda_vs_cpu,
+    )
+
+
+# Helper function for parsing command-line arguments; most defaults come from the DEFAULT_* constants.
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Evaluate batched CPU/CUDA polyline interpolation against a Shapely LineString reference "
+            "over point-count, distance-count, and batch-size sweeps."
+        ),
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--num-points",
+        dest="nums_points",
+        default=",".join(str(item) for item in DEFAULT_NUMS_POINTS),  # stays a string; main() parses it
+        help="Comma-separated point counts for the polyline-length sweep.",
+    )
+    parser.add_argument(
+        "--num-distances",
+        dest="nums_distances",
+        default=",".join(str(item) for item in DEFAULT_NUMS_DISTANCES),  # stays a string; main() parses it
+        help="Comma-separated sample-distance counts for the interpolation sweep.",
+    )
+    parser.add_argument(
+        "--batch-sizes",
+        default=",".join(str(item) for item in DEFAULT_BATCH_SIZES),  # stays a string; main() parses it
+        help="Comma-separated batch sizes to evaluate.",
+    )
+    parser.add_argument(
+        "--num-polylines-per-measurement",
+        type=int,
+        default=DEFAULT_NUM_POLYLINES_PER_MEASUREMENT,
+        help="Target number of polylines measured per configuration; divided by batch size to get runs.",
+    )
+    parser.add_argument(
+        "--num-warmup-runs",
+        type=int,
+        default=DEFAULT_NUM_WARMUP_RUNS,
+        help="Number of untimed warmup runs before the measured sweep.",
+    )
+    parser.add_argument(
+        "--warmup-num-points",
+        type=int,
+        default=DEFAULT_WARMUP_NUM_POINTS,
+        help="Point count used for warmup inputs.",
+    )
+    parser.add_argument(
+        "--warmup-num-distances",
+        type=int,
+        default=DEFAULT_WARMUP_NUM_DISTANCES,
+        help="Sample-distance count used for warmup inputs.",
+    )
+    parser.add_argument(
+        "--skip-shapely",
+        action="store_true",  # the flag can only switch skipping on; it cannot undo a True default
+        default=DEFAULT_SKIP_SHAPELY,
+        help="Skip Shapely reference timing and Shapely-based speedup plots.",
+    )
+    parser.add_argument(
+        "--assert-results",
+        action="store_true",  # the flag can only switch assertions on; it cannot undo a True default
+        default=DEFAULT_ASSERT_RESULTS,
+        help="Compare CPU and CUDA outputs against Shapely using the configured tolerances.",
+    )
+    parser.add_argument(
+        "--assert-atol",
+        type=float,
+        default=DEFAULT_ASSERT_ATOL,
+        help="Absolute tolerance used when asserting results against Shapely.",
+    )
+    parser.add_argument(
+        "--assert-rtol",
+        type=float,
+        default=DEFAULT_ASSERT_RTOL,
+        help="Relative tolerance used when asserting results against Shapely.",
+    )
+    no_annotate_plots_action = parser.add_argument(
+        "--no-annotate-plots",
+        dest="annotate_plots",
+        action="store_false",
+        help="Disable annotations in generated heatmaps.",
+    )
+    parser.set_defaults(annotate_plots=DEFAULT_ANNOTATE_PLOTS)  # value used when the flag is absent
+    no_annotate_plots_action.default = argparse.SUPPRESS  # hides "(default: ...)" in this flag's help
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("polyline_shapely_eval_results"),  # main() resolves relative paths against SCRIPT_DIR
+        help="Directory for Markdown result tables and generated plot images.",
+    )
+    args = parser.parse_args()
+    return args
+
+
+# Main entry point: warm up, run the sweep for each batch size, write the results, then plot them.
+def main() -> None:
+    args = _parse_args()
+    nums_points = _parse_int_list(args.nums_points)
+    nums_distances = _parse_int_list(args.nums_distances)
+    batch_sizes = _parse_int_list(args.batch_sizes)
+    # Make relative output paths independent of the caller's working directory.
+    if not args.output_dir.is_absolute():
+        args.output_dir = SCRIPT_DIR / args.output_dir
+
+    if not torch.cuda.is_available():  # checked before the output directory is created
+        raise RuntimeError("This evaluation requires a CUDA-capable PyTorch installation.")
+
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Result assertions require Shapely, so disabling Shapely also disables assertions.
+    assert_results = args.assert_results and not args.skip_shapely
+
+    print("Performing runtime evaluation...")
+    print(f"Numbers of points: {nums_points}")
+    print(f"Numbers of distances: {nums_distances}")
+    print(f"Batch sizes: {batch_sizes}")
+    print(f"Measured polylines per configuration: {args.num_polylines_per_measurement}")
+    print(
+        "Warmup configuration: "
+        f"batch={max(batch_sizes)}, points={args.warmup_num_points}, "
+        f"distances={args.warmup_num_distances}, runs={args.num_warmup_runs}"
+    )
+    print(f"Use Shapely reference: {not args.skip_shapely}")
+    print(f"Assert results against Shapely: {assert_results}")
+    print(f"Annotate plots: {args.annotate_plots}")
+    print(f"Output directory: {args.output_dir}")
+
+    _run_warmup(
+        batch_size=max(batch_sizes),  # warm up with the largest requested batch size
+        num_points=args.warmup_num_points,
+        num_distances=args.warmup_num_distances,
+        num_warmup_runs=args.num_warmup_runs,
+        skip_shapely=args.skip_shapely,
+    )
+
+    for batch_size in batch_sizes:
+        # Keep (roughly) the same number of measured polylines per configuration across batch sizes.
+        num_runs = max(1, args.num_polylines_per_measurement // batch_size)  # always at least one run
+        print(f"Using {num_runs} measured runs for batch={batch_size}")
+
+        # Run evaluation & get results for one batch size (number of polylines in single call).
+        (
+            shapely_runtime_ms,  # None when --skip-shapely is given
+            cpu_runtime_ms,
+            cuda_runtime_ms,
+            max_abs_diff_cpu,  # the three difference grids are None unless assert_results is True
+            max_abs_diff_cuda,
+            max_abs_diff_cuda_vs_cpu,
+        ) = _evaluate_batch_size(
+            batch_size,
+            nums_points,
+            nums_distances,
+            num_runs=num_runs,
+            assert_results=assert_results,
+            assert_atol=args.assert_atol,
+            assert_rtol=args.assert_rtol,
+            skip_shapely=args.skip_shapely,
+        )
+
+        # Write results to disk.
+        shapely_evaluation_outputs.write_batch_results(
+            args.output_dir,
+            batch_size,
+            nums_points,
+            nums_distances,
+            shapely_runtime_ms,
+            cpu_runtime_ms,
+            cuda_runtime_ms,
+            args.skip_shapely,
+            assert_results,
+            max_abs_diff_cpu,
+            max_abs_diff_cuda,
+            max_abs_diff_cuda_vs_cpu,
+        )
+
+        # Print info.
+        cuda_speedup_over_cpu = cpu_runtime_ms / cuda_runtime_ms  # element-wise; > 1 means CUDA was faster
+        if not args.skip_shapely:
+            cuda_speedup_over_shapely = shapely_runtime_ms / cuda_runtime_ms
+            cpu_speedup_over_shapely = shapely_runtime_ms / cpu_runtime_ms
+            print(f"Average Shapely runtime [ms], batch={batch_size}:\n{shapely_runtime_ms}")
+        print(f"Average CPU runtime [ms], batch={batch_size}:\n{cpu_runtime_ms}")
+        print(f"Average CUDA runtime [ms], batch={batch_size}:\n{cuda_runtime_ms}")
+        if not args.skip_shapely:
+            print(f"CPU speedup over Shapely, batch={batch_size}:\n{cpu_speedup_over_shapely}")
+            print(f"CUDA speedup over Shapely, batch={batch_size}:\n{cuda_speedup_over_shapely}")
+        print(f"CUDA speedup over CPU, batch={batch_size}:\n{cuda_speedup_over_cpu}")
+        if assert_results:
+            print(f"CUDA max absolute difference to CPU, batch={batch_size}:\n{max_abs_diff_cuda_vs_cpu}")
+            print(f"CPU max absolute difference to Shapely, batch={batch_size}:\n{max_abs_diff_cpu}")
+            print(f"CUDA max absolute difference to Shapely, batch={batch_size}:\n{max_abs_diff_cuda}")
+
+    plotted_files = plot_shapely_evaluation.plot_from_markdown_directory(
+        input_dir=args.output_dir,  # plots are built from the result files in the same directory
+        output_dir=args.output_dir,
+        batch_sizes=batch_sizes,
+        annotate_plots=args.annotate_plots,
+    )
+    print(f"Generated {len(plotted_files)} plot image(s) from Markdown results.")
+
+
+if __name__ == "__main__":  # run the benchmark only when executed as a script, not on import
+    main()
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cpu.md
new file mode 100644
index 0000000..1ac6974
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cpu.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 1.03e-03 | 9.861e-04 | 1.020e-03 | 1.141e-03 | 1.175e-03 | 1.157e-03 | 1.294e-03 | 1.536e-03 | 2.404e-03 | 3.742e-03 | 6.497e-03 | 1.486e-02 |
+| 5 | 1.006e-03 | 1.011e-03 | 1.022e-03 | 1.044e-03 | 1.109e-03 | 1.292e-03 | 1.406e-03 | 1.77e-03 | 2.851e-03 | 4.664e-03 | 8.391e-03 | 2.134e-02 |
+| 10 | 1.017e-03 | 1.018e-03 | 1.031e-03 | 1.056e-03 | 1.118e-03 | 1.32e-03 | 1.477e-03 | 1.912e-03 | 3.197e-03 | 5.305e-03 | 9.844e-03 | 2.811e-02 |
+| 20 | 1.021e-03 | 1.029e-03 | 1.050e-03 | 1.07e-03 | 1.209e-03 | 1.318e-03 | 1.579e-03 | 2.032e-03 | 3.506e-03 | 6.075e-03 | 1.094e-02 | 2.844e-02 |
+| 50 | 1.105e-03 | 1.113e-03 | 1.134e-03 | 1.157e-03 | 1.246e-03 | 1.509e-03 | 1.846e-03 | 2.308e-03 | 4.140e-03 | 7.052e-03 | 1.320e-02 | 4.408e-02 |
+| 100 | 1.216e-03 | 1.238e-03 | 1.248e-03 | 1.277e-03 | 1.469e-03 | 1.582e-03 | 1.909e-03 | 2.581e-03 | 4.646e-03 | 8.162e-03 | 1.514e-02 | 5.076e-02 |
+| 200 | 1.5e-03 | 1.494e-03 | 1.536e-03 | 1.604e-03 | 1.652e-03 | 1.869e-03 | 2.262e-03 | 3.043e-03 | 5.500e-03 | 9.331e-03 | 1.722e-02 | 7.581e-02 |
+| 500 | 2.238e-03 | 2.232e-03 | 2.302e-03 | 2.347e-03 | 2.408e-03 | 2.654e-03 | 3.131e-03 | 4.002e-03 | 6.978e-03 | 1.108e-02 | 2.022e-02 | 9.638e-02 |
+| 1000 | 3.430e-03 | 3.437e-03 | 3.476e-03 | 3.52e-03 | 3.594e-03 | 3.868e-03 | 4.464e-03 | 5.305e-03 | 8.287e-03 | 1.318e-02 | 2.306e-02 | 1.374e-01 |
+| 2000 | 5.753e-03 | 5.774e-03 | 5.815e-03 | 5.88e-03 | 5.95e-03 | 6.256e-03 | 6.811e-03 | 7.942e-03 | 1.099e-02 | 1.65e-02 | 2.710e-02 | 1.768e-01 |
+| 5000 | 1.279e-02 | 1.289e-02 | 1.285e-02 | 1.336e-02 | 1.37e-02 | 1.370e-02 | 1.45e-02 | 1.527e-02 | 1.876e-02 | 2.451e-02 | 3.842e-02 | 2.290e-01 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md
new file mode 100644
index 0000000..ee2c4c8
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 3.535e-03 | 3.428e-03 | 3.44e-03 | 3.456e-03 | 3.432e-03 | 3.443e-03 | 3.43e-03 | 3.521e-03 | 3.464e-03 | 3.47e-03 | 3.486e-03 | 4.911e-03 |
+| 5 | 3.394e-03 | 3.39e-03 | 3.455e-03 | 3.415e-03 | 3.385e-03 | 3.388e-03 | 3.478e-03 | 3.472e-03 | 3.471e-03 | 3.533e-03 | 3.648e-03 | 5.321e-03 |
+| 10 | 3.437e-03 | 3.397e-03 | 3.396e-03 | 3.392e-03 | 3.454e-03 | 3.384e-03 | 3.467e-03 | 3.482e-03 | 3.480e-03 | 3.476e-03 | 3.841e-03 | 5.798e-03 |
+| 20 | 3.419e-03 | 3.408e-03 | 3.403e-03 | 3.381e-03 | 3.393e-03 | 3.442e-03 | 3.482e-03 | 3.464e-03 | 3.467e-03 | 3.468e-03 | 3.970e-03 | 6.137e-03 |
+| 50 | 3.470e-03 | 3.4e-03 | 3.388e-03 | 3.394e-03 | 3.405e-03 | 3.382e-03 | 3.48e-03 | 3.528e-03 | 3.462e-03 | 3.471e-03 | 4.098e-03 | 6.448e-03 |
+| 100 | 3.400e-03 | 3.453e-03 | 3.415e-03 | 3.401e-03 | 3.412e-03 | 3.411e-03 | 3.491e-03 | 3.417e-03 | 3.481e-03 | 3.535e-03 | 4.291e-03 | 6.711e-03 |
+| 200 | 3.486e-03 | 3.396e-03 | 3.396e-03 | 3.454e-03 | 3.396e-03 | 3.403e-03 | 3.478e-03 | 3.460e-03 | 3.405e-03 | 3.461e-03 | 4.406e-03 | 7.067e-03 |
+| 500 | 3.479e-03 | 3.389e-03 | 3.394e-03 | 3.409e-03 | 3.476e-03 | 3.385e-03 | 3.471e-03 | 3.474e-03 | 3.471e-03 | 3.594e-03 | 4.706e-03 | 7.429e-03 |
+| 1000 | 3.478e-03 | 3.409e-03 | 3.407e-03 | 3.382e-03 | 3.383e-03 | 3.435e-03 | 3.453e-03 | 3.456e-03 | 3.463e-03 | 3.689e-03 | 5.009e-03 | 7.908e-03 |
+| 2000 | 3.765e-03 | 3.787e-03 | 3.771e-03 | 3.777e-03 | 3.785e-03 | 3.795e-03 | 3.822e-03 | 3.857e-03 | 4.169e-03 | 4.674e-03 | 5.979e-03 | 9.683e-03 |
+| 5000 | 6.e-03 | 5.96e-03 | 6.107e-03 | 6.115e-03 | 6.131e-03 | 6.205e-03 | 6.26e-03 | 6.297e-03 | 6.666e-03 | 7.521e-03 | 9.576e-03 | 1.49e-02 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md
new file mode 100644
index 0000000..218e33c
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 5.920e-03 | 5.929e-03 | 6.514e-03 | 7.182e-03 | 9.292e-03 | 1.258e-02 | 1.976e-02 | 3.39e-02 | 7.530e-02 | 1.522e-01 | 3.996e-01 | 1.684e+00 |
+| 5 | 5.928e-03 | 5.962e-03 | 6.537e-03 | 7.384e-03 | 1.018e-02 | 1.435e-02 | 2.246e-02 | 3.783e-02 | 9.250e-02 | 2.432e-01 | 4.686e-01 | 1.798e+00 |
+| 10 | 6.044e-03 | 6.014e-03 | 6.824e-03 | 7.726e-03 | 9.796e-03 | 1.534e-02 | 2.687e-02 | 4.409e-02 | 1.118e-01 | 2.867e-01 | 5.987e-01 | 2.145e+00 |
+| 20 | 6.032e-03 | 6.105e-03 | 6.865e-03 | 8.794e-03 | 1.120e-02 | 2.134e-02 | 3.391e-02 | 6.222e-02 | 1.556e-01 | 3.645e-01 | 6.786e-01 | 2.637e+00 |
+| 50 | 6.499e-03 | 6.279e-03 | 8.662e-03 | 9.916e-03 | 1.697e-02 | 3.003e-02 | 5.847e-02 | 1.228e-01 | 2.529e-01 | 6.603e-01 | 1.177e+00 | 3.703e+00 |
+| 100 | 6.479e-03 | 7.37e-03 | 1.051e-02 | 1.371e-02 | 2.748e-02 | 4.721e-02 | 1.027e-01 | 1.929e-01 | 5.007e-01 | 9.981e-01 | 2.129e+00 | 6.030e+00 |
+| 200 | 7.691e-03 | 9.454e-03 | 1.444e-02 | 2.263e-02 | 3.906e-02 | 9.756e-02 | 1.886e-01 | 3.467e-01 | 8.815e-01 | 1.792e+00 | 3.648e+00 | 1.008e+01 |
+| 500 | 8.883e-03 | 1.283e-02 | 3.616e-02 | 3.52e-02 | 8.427e-02 | 2.162e-01 | 4.081e-01 | 8.556e-01 | 2.177e+00 | 4.328e+00 | 8.686e+00 | 2.235e+01 |
+| 1000 | 8.225e-03 | 2.181e-02 | 5.647e-02 | 8.705e-02 | 1.809e-01 | 4.044e-01 | 7.995e-01 | 1.644e+00 | 4.172e+00 | 8.583e+00 | 1.69e+01 | 4.323e+01 |
+| 2000 | 3.628e-02 | 4.154e-02 | 6.644e-02 | 1.663e-01 | 4.074e-01 | 8.366e-01 | 1.748e+00 | 3.414e+00 | 8.769e+00 | 1.775e+01 | 3.397e+01 | 8.585e+01 |
+| 5000 | 5.857e-02 | 1.474e-01 | 2.871e-01 | 5.538e-01 | 8.380e-01 | 2.379e+00 | 4.255e+00 | 8.153e+00 | 2.054e+01 | 4.167e+01 | 8.298e+01 | 2.090e+02 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md
new file mode 100644
index 0000000..4d3f267
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 5.75 | 6.01 | 6.38 | 6.30 | 7.91 | 10.87 | 15.27 | 22.07 | 31.33 | 40.68 | 61.51 | 113.29 |
+| 5 | 5.89 | 5.90 | 6.40 | 7.07 | 9.18 | 11.11 | 15.98 | 21.38 | 32.44 | 52.15 | 55.84 | 84.26 |
+| 10 | 5.94 | 5.91 | 6.62 | 7.32 | 8.76 | 11.63 | 18.19 | 23.06 | 34.97 | 54.04 | 60.82 | 76.29 |
+| 20 | 5.91 | 5.93 | 6.54 | 8.22 | 9.26 | 16.20 | 21.48 | 30.62 | 44.39 | 60.01 | 62.01 | 92.75 |
+| 50 | 5.88 | 5.64 | 7.64 | 8.57 | 13.61 | 19.90 | 31.67 | 53.21 | 61.08 | 93.63 | 89.16 | 84.01 |
+| 100 | 5.33 | 5.96 | 8.42 | 10.73 | 18.70 | 29.85 | 53.82 | 74.74 | 107.76 | 122.28 | 140.68 | 118.80 |
+| 200 | 5.13 | 6.33 | 9.40 | 14.11 | 23.64 | 52.19 | 83.38 | 113.93 | 160.26 | 192.08 | 211.85 | 132.97 |
+| 500 | 3.97 | 5.75 | 15.71 | 15.00 | 34.99 | 81.47 | 130.37 | 213.79 | 311.96 | 390.72 | 429.62 | 231.94 |
+| 1000 | 2.40 | 6.35 | 16.25 | 24.73 | 50.34 | 104.54 | 179.10 | 309.90 | 503.46 | 651.02 | 732.57 | 314.65 |
+| 2000 | 6.31 | 7.19 | 11.43 | 28.28 | 68.47 | 133.72 | 256.58 | 429.83 | 797.87 | 1.08e+03 | 1.25e+03 | 485.50 |
+| 5000 | 4.58 | 11.44 | 22.34 | 41.47 | 61.17 | 173.59 | 293.48 | 533.92 | 1.09e+03 | 1.70e+03 | 2.16e+03 | 912.55 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md
new file mode 100644
index 0000000..487290e
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 0.29 | 0.29 | 0.30 | 0.33 | 0.34 | 0.34 | 0.38 | 0.44 | 0.69 | 1.08 | 1.86 | 3.03 |
+| 5 | 0.30 | 0.30 | 0.30 | 0.31 | 0.33 | 0.38 | 0.40 | 0.51 | 0.82 | 1.32 | 2.30 | 4.01 |
+| 10 | 0.30 | 0.30 | 0.30 | 0.31 | 0.32 | 0.39 | 0.43 | 0.55 | 0.92 | 1.53 | 2.56 | 4.85 |
+| 20 | 0.30 | 0.30 | 0.31 | 0.32 | 0.36 | 0.38 | 0.45 | 0.59 | 1.01 | 1.75 | 2.76 | 4.63 |
+| 50 | 0.32 | 0.33 | 0.33 | 0.34 | 0.37 | 0.45 | 0.53 | 0.65 | 1.20 | 2.03 | 3.22 | 6.84 |
+| 100 | 0.36 | 0.36 | 0.37 | 0.38 | 0.43 | 0.46 | 0.55 | 0.76 | 1.33 | 2.31 | 3.53 | 7.56 |
+| 200 | 0.43 | 0.44 | 0.45 | 0.46 | 0.49 | 0.55 | 0.65 | 0.88 | 1.62 | 2.70 | 3.91 | 10.73 |
+| 500 | 0.64 | 0.66 | 0.68 | 0.69 | 0.69 | 0.78 | 0.90 | 1.15 | 2.01 | 3.08 | 4.30 | 12.97 |
+| 1000 | 0.99 | 1.01 | 1.02 | 1.04 | 1.06 | 1.13 | 1.29 | 1.53 | 2.39 | 3.57 | 4.60 | 17.37 |
+| 2000 | 1.53 | 1.52 | 1.54 | 1.56 | 1.57 | 1.65 | 1.78 | 2.06 | 2.64 | 3.53 | 4.53 | 18.26 |
+| 5000 | 2.13 | 2.16 | 2.10 | 2.18 | 2.23 | 2.21 | 2.32 | 2.43 | 2.81 | 3.26 | 4.01 | 15.38 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md
new file mode 100644
index 0000000..1125800
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 1.67 | 1.73 | 1.89 | 2.08 | 2.71 | 3.65 | 5.76 | 9.63 | 21.74 | 43.87 | 114.64 | 342.86 |
+| 5 | 1.75 | 1.76 | 1.89 | 2.16 | 3.01 | 4.23 | 6.46 | 10.89 | 26.65 | 68.85 | 128.46 | 337.91 |
+| 10 | 1.76 | 1.77 | 2.01 | 2.28 | 2.84 | 4.53 | 7.75 | 12.66 | 32.12 | 82.47 | 155.86 | 369.85 |
+| 20 | 1.76 | 1.79 | 2.02 | 2.60 | 3.30 | 6.20 | 9.74 | 17.96 | 44.88 | 105.11 | 170.94 | 429.75 |
+| 50 | 1.87 | 1.85 | 2.56 | 2.92 | 4.98 | 8.88 | 16.80 | 34.81 | 73.04 | 190.20 | 287.27 | 574.20 |
+| 100 | 1.91 | 2.13 | 3.08 | 4.03 | 8.05 | 13.84 | 29.42 | 56.46 | 143.84 | 282.36 | 496.26 | 898.57 |
+| 200 | 2.21 | 2.78 | 4.25 | 6.55 | 11.50 | 28.67 | 54.23 | 100.20 | 258.88 | 517.84 | 827.99 | 1.43e+03 |
+| 500 | 2.55 | 3.79 | 10.65 | 10.32 | 24.24 | 63.87 | 117.57 | 246.30 | 627.28 | 1.20e+03 | 1.85e+03 | 3.01e+03 |
+| 1000 | 2.36 | 6.40 | 16.58 | 25.74 | 53.47 | 117.71 | 231.55 | 475.60 | 1.20e+03 | 2.33e+03 | 3.37e+03 | 5.47e+03 |
+| 2000 | 9.63 | 10.97 | 17.62 | 44.03 | 107.63 | 220.43 | 457.23 | 885.05 | 2.10e+03 | 3.80e+03 | 5.68e+03 | 8.87e+03 |
+| 5000 | 9.76 | 24.73 | 47.01 | 90.57 | 136.68 | 383.37 | 679.79 | 1.29e+03 | 3.08e+03 | 5.54e+03 | 8.67e+03 | 1.40e+04 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md
new file mode 100644
index 0000000..584833d
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 3.839e-03 | 2.263e-03 | 2.751e-03 | 3.603e-03 | 5.367e-03 | 1.043e-02 | 1.963e-02 | 3.62e-02 | 8.759e-02 | 1.75e-01 | 3.531e-01 | 8.816e-01 |
+| 5 | 2.581e-03 | 2.852e-03 | 3.709e-03 | 5.404e-03 | 8.689e-03 | 2.09e-02 | 4.92e-02 | 1.152e-01 | 3.25e-01 | 6.775e-01 | 1.329e+00 | 3.191e+00 |
+| 10 | 3.203e-03 | 3.694e-03 | 4.880e-03 | 6.879e-03 | 1.078e-02 | 2.716e-02 | 6.540e-02 | 1.654e-01 | 4.613e-01 | 9.447e-01 | 1.831e+00 | 4.547e+00 |
+| 20 | 4.909e-03 | 5.348e-03 | 6.714e-03 | 8.853e-03 | 1.4e-02 | 3.155e-02 | 8.694e-02 | 2.126e-01 | 5.852e-01 | 1.196e+00 | 2.377e+00 | 5.819e+00 |
+| 50 | 9.96e-03 | 1.06e-02 | 1.193e-02 | 1.534e-02 | 2.064e-02 | 4.532e-02 | 1.234e-01 | 2.898e-01 | 7.611e-01 | 1.517e+00 | 3.015e+00 | 7.419e+00 |
+| 100 | 1.739e-02 | 1.813e-02 | 2.019e-02 | 2.349e-02 | 2.952e-02 | 5.775e-02 | 1.613e-01 | 3.547e-01 | 8.880e-01 | 1.768e+00 | 3.481e+00 | 8.625e+00 |
+| 200 | 3.283e-02 | 3.274e-02 | 3.517e-02 | 3.895e-02 | 4.640e-02 | 8.618e-02 | 2.034e-01 | 4.200e-01 | 1.036e+00 | 2.037e+00 | 4.028e+00 | 9.911e+00 |
+| 500 | 7.66e-02 | 7.725e-02 | 8.045e-02 | 8.389e-02 | 9.346e-02 | 1.439e-01 | 2.970e-01 | 5.451e-01 | 1.259e+00 | 2.425e+00 | 4.751e+00 | 1.168e+01 |
+| 1000 | 1.509e-01 | 1.580e-01 | 1.543e-01 | 1.593e-01 | 1.691e-01 | 2.349e-01 | 4.054e-01 | 6.792e-01 | 1.476e+00 | 2.797e+00 | 5.417e+00 | 1.32e+01 |
+| 2000 | 3.070e-01 | 3.025e-01 | 3.032e-01 | 3.147e-01 | 3.206e-01 | 4.106e-01 | 5.676e-01 | 9.091e-01 | 1.797e+00 | 3.308e+00 | 6.246e+00 | 1.515e+01 |
+| 5000 | 7.461e-01 | 7.505e-01 | 7.574e-01 | 7.580e-01 | 7.719e-01 | 8.920e-01 | 1.084e+00 | 1.424e+00 | 2.472e+00 | 4.160e+00 | 7.566e+00 | 1.774e+01 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md
new file mode 100644
index 0000000..66190a3
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 8.093e-03 | 4.831e-03 | 4.721e-03 | 4.601e-03 | 4.671e-03 | 4.493e-03 | 4.649e-03 | 4.688e-03 | 4.626e-03 | 4.676e-03 | 5.169e-03 | 8.322e-03 |
+| 5 | 4.795e-03 | 4.643e-03 | 4.625e-03 | 4.689e-03 | 4.613e-03 | 4.672e-03 | 4.744e-03 | 4.554e-03 | 4.539e-03 | 5.416e-03 | 5.288e-03 | 8.217e-03 |
+| 10 | 4.59e-03 | 4.752e-03 | 5.439e-03 | 4.58e-03 | 4.614e-03 | 4.622e-03 | 4.530e-03 | 4.682e-03 | 4.752e-03 | 4.708e-03 | 5.339e-03 | 8.593e-03 |
+| 20 | 4.6e-03 | 4.605e-03 | 4.989e-03 | 4.693e-03 | 4.636e-03 | 5.333e-03 | 4.715e-03 | 4.553e-03 | 4.573e-03 | 4.735e-03 | 5.519e-03 | 9.150e-03 |
+| 50 | 4.689e-03 | 4.66e-03 | 4.779e-03 | 4.624e-03 | 4.725e-03 | 4.518e-03 | 4.727e-03 | 4.716e-03 | 5.022e-03 | 4.643e-03 | 5.667e-03 | 9.064e-03 |
+| 100 | 4.615e-03 | 4.651e-03 | 4.751e-03 | 4.623e-03 | 4.545e-03 | 4.623e-03 | 4.731e-03 | 4.651e-03 | 5.399e-03 | 4.741e-03 | 1.487e-02 | 9.328e-03 |
+| 200 | 4.689e-03 | 5.153e-03 | 4.695e-03 | 4.742e-03 | 4.776e-03 | 4.716e-03 | 5.547e-03 | 4.72e-03 | 4.628e-03 | 4.870e-03 | 5.972e-03 | 9.408e-03 |
+| 500 | 4.613e-03 | 4.76e-03 | 4.683e-03 | 4.732e-03 | 5.307e-03 | 4.693e-03 | 4.659e-03 | 4.737e-03 | 5.204e-03 | 4.969e-03 | 6.236e-03 | 9.575e-03 |
+| 1000 | 4.701e-03 | 4.749e-03 | 4.716e-03 | 4.71e-03 | 4.676e-03 | 4.783e-03 | 4.781e-03 | 4.735e-03 | 4.820e-03 | 5.343e-03 | 6.652e-03 | 1.028e-02 |
+| 2000 | 5.166e-03 | 5.164e-03 | 5.182e-03 | 5.161e-03 | 5.152e-03 | 5.738e-03 | 5.541e-03 | 5.356e-03 | 5.642e-03 | 6.490e-03 | 7.966e-03 | 1.232e-02 |
+| 5000 | 7.627e-03 | 7.593e-03 | 7.564e-03 | 7.658e-03 | 7.698e-03 | 7.712e-03 | 7.797e-03 | 7.8e-03 | 8.731e-03 | 1.037e-02 | 1.327e-02 | 2.230e-02 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md
new file mode 100644
index 0000000..afd2356
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 2.728e-02 | 3.092e-02 | 5.674e-02 | 1.079e-01 | 1.983e-01 | 4.919e-01 | 9.971e-01 | 7.669e+00 | 1.382e+01 | 3.652e+01 | 7.587e+01 | 2.001e+02 |
+| 5 | 2.908e-02 | 3.597e-02 | 6.759e-02 | 1.316e-01 | 2.390e-01 | 5.987e-01 | 5.551e+00 | 5.53e+00 | 1.544e+01 | 3.595e+01 | 8.467e+01 | 2.072e+02 |
+| 10 | 3.137e-02 | 4.208e-02 | 8.376e-02 | 1.58e-01 | 3.011e-01 | 4.584e+00 | 1.489e+00 | 5.831e+00 | 1.888e+01 | 3.929e+01 | 8.577e+01 | 2.247e+02 |
+| 20 | 3.699e-02 | 5.445e-02 | 1.104e-01 | 2.123e-01 | 4.003e-01 | 1.003e+00 | 5.894e+00 | 6.842e+00 | 2.179e+01 | 4.142e+01 | 1.000e+02 | 2.533e+02 |
+| 50 | 5.549e-02 | 8.991e-02 | 1.946e-01 | 3.639e-01 | 7.197e-01 | 1.785e+00 | 3.579e+00 | 7.337e+00 | 3.250e+01 | 5.705e+01 | 1.321e+02 | 3.331e+02 |
+| 100 | 8.595e-02 | 1.505e-01 | 3.145e-01 | 6.342e-01 | 1.227e+00 | 3.114e+00 | 6.111e+00 | 1.251e+01 | 4.497e+01 | 8.732e+01 | 1.823e+02 | 4.563e+02 |
+| 200 | 1.464e-01 | 2.415e-01 | 6.012e-01 | 1.157e+00 | 2.252e+00 | 5.739e+00 | 1.556e+01 | 2.64e+01 | 6.973e+01 | 1.423e+02 | 2.886e+02 | 7.270e+02 |
+| 500 | 3.363e-01 | 5.55e-01 | 1.525e+00 | 2.669e+00 | 5.563e+00 | 1.352e+01 | 3.084e+01 | 5.781e+01 | 1.467e+02 | 2.977e+02 | 6.112e+02 | 1.519e+03 |
+| 1000 | 6.443e-01 | 1.201e+00 | 2.763e+00 | 5.361e+00 | 1.078e+01 | 2.608e+01 | 5.396e+01 | 1.076e+02 | 2.877e+02 | 5.609e+02 | 1.143e+03 | 2.861e+03 |
+| 2000 | 1.189e+00 | 1.968e+00 | 5.459e+00 | 1.057e+01 | 2.136e+01 | 5.446e+01 | 1.096e+02 | 2.185e+02 | 5.670e+02 | 1.114e+03 | 2.244e+03 | 5.570e+03 |
+| 5000 | 5.404e+00 | 5.521e+00 | 1.321e+01 | 2.739e+01 | 5.495e+01 | 1.321e+02 | 2.654e+02 | 5.364e+02 | 1.375e+03 | 2.726e+03 | 5.435e+03 | 1.369e+04 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md
new file mode 100644
index 0000000..929b485
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 7.11 | 13.66 | 20.62 | 29.95 | 36.94 | 47.14 | 50.78 | 211.88 | 157.80 | 208.71 | 214.88 | 226.96 |
+| 5 | 11.27 | 12.61 | 18.22 | 24.35 | 27.51 | 28.65 | 112.84 | 48.00 | 47.51 | 53.07 | 63.71 | 64.93 |
+| 10 | 9.79 | 11.39 | 17.16 | 22.96 | 27.92 | 168.75 | 22.77 | 35.24 | 40.93 | 41.59 | 46.86 | 49.41 |
+| 20 | 7.53 | 10.18 | 16.45 | 23.99 | 28.60 | 31.78 | 67.79 | 32.19 | 37.23 | 34.62 | 42.08 | 43.52 |
+| 50 | 5.57 | 8.48 | 16.31 | 23.72 | 34.86 | 39.38 | 29.00 | 25.31 | 42.71 | 37.61 | 43.83 | 44.90 |
+| 100 | 4.94 | 8.30 | 15.58 | 27.00 | 41.57 | 53.93 | 37.89 | 35.28 | 50.64 | 49.39 | 52.37 | 52.91 |
+| 200 | 4.46 | 7.38 | 17.09 | 29.70 | 48.53 | 66.59 | 76.49 | 62.85 | 67.33 | 69.88 | 71.66 | 73.36 |
+| 500 | 4.39 | 7.18 | 18.96 | 31.82 | 59.52 | 93.93 | 103.85 | 106.04 | 116.48 | 122.75 | 128.65 | 130.10 |
+| 1000 | 4.27 | 7.60 | 17.91 | 33.65 | 63.73 | 111.04 | 133.13 | 158.49 | 194.95 | 200.54 | 210.95 | 216.73 |
+| 2000 | 3.87 | 6.51 | 18.00 | 33.57 | 66.62 | 132.64 | 193.13 | 240.29 | 315.51 | 336.89 | 359.30 | 367.59 |
+| 5000 | 7.24 | 7.36 | 17.44 | 36.14 | 71.19 | 148.05 | 244.92 | 376.69 | 556.45 | 655.34 | 718.35 | 771.81 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md
new file mode 100644
index 0000000..39bcbd0
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 0.47 | 0.47 | 0.58 | 0.78 | 1.15 | 2.32 | 4.22 | 7.72 | 18.93 | 37.42 | 68.31 | 105.94 |
+| 5 | 0.54 | 0.61 | 0.80 | 1.15 | 1.88 | 4.47 | 10.37 | 25.30 | 71.59 | 125.09 | 251.34 | 388.32 |
+| 10 | 0.70 | 0.78 | 0.90 | 1.50 | 2.34 | 5.88 | 14.44 | 35.34 | 97.07 | 200.67 | 342.88 | 529.19 |
+| 20 | 1.07 | 1.16 | 1.35 | 1.89 | 3.02 | 5.92 | 18.44 | 46.69 | 127.98 | 252.67 | 430.68 | 636.00 |
+| 50 | 2.12 | 2.27 | 2.50 | 3.32 | 4.37 | 10.03 | 26.11 | 61.46 | 151.55 | 326.74 | 532.01 | 818.56 |
+| 100 | 3.77 | 3.90 | 4.25 | 5.08 | 6.49 | 12.49 | 34.09 | 76.28 | 164.48 | 372.91 | 234.16 | 924.55 |
+| 200 | 7.00 | 6.35 | 7.49 | 8.21 | 9.72 | 18.28 | 36.67 | 89.00 | 223.79 | 418.30 | 674.41 | 1.05e+03 |
+| 500 | 16.61 | 16.23 | 17.18 | 17.73 | 17.61 | 30.67 | 63.76 | 115.08 | 242.03 | 488.09 | 761.85 | 1.22e+03 |
+| 1000 | 32.10 | 33.27 | 32.72 | 33.83 | 36.17 | 49.11 | 84.79 | 143.44 | 306.20 | 523.52 | 814.37 | 1.28e+03 |
+| 2000 | 59.44 | 58.58 | 58.52 | 60.99 | 62.24 | 71.56 | 102.44 | 169.74 | 318.56 | 509.73 | 784.05 | 1.23e+03 |
+| 5000 | 97.82 | 98.84 | 100.13 | 98.98 | 100.27 | 115.66 | 139.00 | 182.58 | 283.08 | 401.24 | 570.04 | 795.20 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md
new file mode 100644
index 0000000..12e13eb
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 3.37 | 6.40 | 12.02 | 23.45 | 42.45 | 109.46 | 214.50 | 1.64e+03 | 2.99e+03 | 7.81e+03 | 1.47e+04 | 2.40e+04 |
+| 5 | 6.06 | 7.75 | 14.61 | 28.06 | 51.83 | 128.15 | 1.17e+03 | 1.21e+03 | 3.40e+03 | 6.64e+03 | 1.60e+04 | 2.52e+04 |
+| 10 | 6.83 | 8.85 | 15.40 | 34.49 | 65.26 | 991.79 | 328.65 | 1.25e+03 | 3.97e+03 | 8.35e+03 | 1.61e+04 | 2.61e+04 |
+| 20 | 8.04 | 11.83 | 22.13 | 45.25 | 86.35 | 188.01 | 1.25e+03 | 1.50e+03 | 4.77e+03 | 8.75e+03 | 1.81e+04 | 2.77e+04 |
+| 50 | 11.84 | 19.30 | 40.72 | 78.71 | 152.33 | 394.99 | 757.11 | 1.56e+03 | 6.47e+03 | 1.23e+04 | 2.33e+04 | 3.68e+04 |
+| 100 | 18.63 | 32.36 | 66.20 | 137.20 | 269.98 | 673.70 | 1.29e+03 | 2.69e+03 | 8.33e+03 | 1.84e+04 | 1.23e+04 | 4.89e+04 |
+| 200 | 31.22 | 46.87 | 128.05 | 243.96 | 471.56 | 1.22e+03 | 2.80e+03 | 5.59e+03 | 1.51e+04 | 2.92e+04 | 4.83e+04 | 7.73e+04 |
+| 500 | 72.90 | 116.59 | 325.71 | 564.06 | 1.05e+03 | 2.88e+03 | 6.62e+03 | 1.22e+04 | 2.82e+04 | 5.99e+04 | 9.80e+04 | 1.59e+05 |
+| 1000 | 137.05 | 252.94 | 585.84 | 1.14e+03 | 2.31e+03 | 5.45e+03 | 1.13e+04 | 2.27e+04 | 5.97e+04 | 1.05e+05 | 1.72e+05 | 2.78e+05 |
+| 2000 | 230.13 | 381.18 | 1.05e+03 | 2.05e+03 | 4.15e+03 | 9.49e+03 | 1.98e+04 | 4.08e+04 | 1.01e+05 | 1.72e+05 | 2.82e+05 | 4.52e+05 |
+| 5000 | 708.50 | 727.06 | 1.75e+03 | 3.58e+03 | 7.14e+03 | 1.71e+04 | 3.40e+04 | 6.88e+04 | 1.58e+05 | 2.63e+05 | 4.09e+05 | 6.14e+05 |
diff --git a/packages/lane_helpers/examples/basic_usage.py b/packages/lane_helpers/examples/basic_usage.py
new file mode 100644
index 0000000..a099fd8
--- /dev/null
+++ b/packages/lane_helpers/examples/basic_usage.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from accvlab.lane_helpers import polyline
+
+
+def main() -> None:
+    if not torch.cuda.is_available():
+        raise RuntimeError("This example requires a CUDA-capable PyTorch installation.")
+
+    # @NOTE Use one rectangle polyline with shape (batch=1, num_points=5, num_dims=2).
+    points = torch.tensor(
+        [
+            [
+                [0.0, 0.0],
+                [1.0, 0.0],
+                [1.0, 2.0],
+                [0.0, 2.0],
+                [0.0, 0.0],
+            ]
+        ],
+        device="cuda",
+        dtype=torch.float32,
+    )
+
+    # @NOTE Use a tensor of distances to sample the polyline at (batch=1, num_distances=5).
+    distances = torch.tensor([[0.0, 0.5, 1.0, 3.0, 6.0]], device="cuda", dtype=torch.float32)
+
+    # @NOTE Interpolate the polyline at the given distances.
+    sampled_points = polyline.interpolate(points, distances)
+    # @NOTE Compute the length of the polyline.
+    line_lengths = polyline.lengths(points)
+
+    # @NOTE Print the results.
+    print(f"Interpolated points:\n{sampled_points}")
+    print(f"Line length(s): {line_lengths}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/lane_helpers/ext_impl/CMakeLists.txt b/packages/lane_helpers/ext_impl/CMakeLists.txt
new file mode 100644
index 0000000..bfab1b2
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/CMakeLists.txt
@@ -0,0 +1,55 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.18)
+project(accvlab_lane_helpers_ext LANGUAGES CXX CUDA)
+
+if(NOT DEFINED CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Locate Python first so the torch and pybind11 queries below run in the interpreter we build against.
+find_package(Python COMPONENTS Interpreter Development REQUIRED)
+
+execute_process(
+    COMMAND "${Python_EXECUTABLE}" -c "import torch; import os; print(os.path.join(os.path.dirname(torch.__file__), 'share', 'cmake'))"
+    OUTPUT_VARIABLE TORCH_CMAKE_PATH
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+list(APPEND CMAKE_PREFIX_PATH "${TORCH_CMAKE_PATH}")
+
+find_package(CUDA REQUIRED)
+find_package(Torch REQUIRED)
+
+execute_process(
+    COMMAND "${Python_EXECUTABLE}" -m pybind11 --cmakedir
+    OUTPUT_VARIABLE pybind11_DIR
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+find_package(pybind11 REQUIRED)
+
+# NOTE(review): project() above already enabled CUDA, which may initialize this variable (CMP0104) - confirm.
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
+    set(CMAKE_CUDA_ARCHITECTURES native)
+  else()
+    set(CMAKE_CUDA_ARCHITECTURES "75;80;86")
+  endif()
+endif()
+
+separate_arguments(TORCH_CXX_FLAGS_LIST NATIVE_COMMAND "${TORCH_CXX_FLAGS}")
+find_library(TORCH_PYTHON_LIBRARY torch_python PATHS ${TORCH_INSTALL_PREFIX}/lib)
+
+add_subdirectory(polyline)
diff --git a/packages/lane_helpers/ext_impl/polyline/CMakeLists.txt b/packages/lane_helpers/ext_impl/polyline/CMakeLists.txt
new file mode 100644
index 0000000..207431b
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/CMakeLists.txt
@@ -0,0 +1,50 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Python extension module for the polyline routines.
+pybind11_add_module(accvlab_lane_helpers_polyline_sampling MODULE
+    src/polyline.cu
+    src/polyline_cpu.cpp
+    src/polyline.cpp
+)
+
+# Emit the module as `_polyline_sampling` (no file-name prefix), the same name as TORCH_EXTENSION_NAME.
+set_target_properties(accvlab_lane_helpers_polyline_sampling PROPERTIES
+    OUTPUT_NAME "_polyline_sampling"
+    PREFIX ""
+    CXX_STANDARD 17
+    CUDA_STANDARD 17
+)
+target_compile_definitions(accvlab_lane_helpers_polyline_sampling PRIVATE
+    TORCH_EXTENSION_NAME=_polyline_sampling
+    TORCH_API_INCLUDE_EXTENSION_H
+)
+
+# Headers: Torch and CUDA first, then this extension's own include/ directory.
+target_include_directories(accvlab_lane_helpers_polyline_sampling PRIVATE
+    ${TORCH_INCLUDE_DIRS}
+    ${CUDA_INCLUDE_DIRS}
+    include/
+)
+
+# TORCH_CXX_FLAGS_LIST is prepared by the parent CMakeLists.txt.
+target_compile_options(accvlab_lane_helpers_polyline_sampling PRIVATE ${TORCH_CXX_FLAGS_LIST})
+target_link_libraries(accvlab_lane_helpers_polyline_sampling PRIVATE
+    ${TORCH_LIBRARIES}
+    ${CUDA_LIBRARIES}
+    ${TORCH_PYTHON_LIBRARY}
+)
+
+# Install the module directly into the root of the install prefix.
+install(TARGETS accvlab_lane_helpers_polyline_sampling LIBRARY DESTINATION . RUNTIME DESTINATION .)
diff --git a/packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh b/packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh
new file mode 100644
index 0000000..441220a
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef POLYLINE_SAMPLING_CUDA_HELPER_MACROS_CUH
+#define POLYLINE_SAMPLING_CUDA_HELPER_MACROS_CUH
+
+#include <c10/cuda/CUDAException.h>
+// CUDA_CHECK forwards its argument to C10_CUDA_CHECK; CUDA_CHECK_LAST applies it to cudaGetLastError().
+#define CUDA_CHECK(error_code_or_call) C10_CUDA_CHECK(error_code_or_call)
+#define CUDA_CHECK_LAST() C10_CUDA_CHECK(cudaGetLastError())
+
+#endif  // POLYLINE_SAMPLING_CUDA_HELPER_MACROS_CUH
diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline.cuh
new file mode 100644
index 0000000..01ce298
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/include/polyline.cuh
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Host-visible interface for the polyline interpolation and length routines
+// (CUDA and CPU variants). This header is intentionally free of CUDA device intrinsics
+// so it can be included from both C++ and CUDA translation units.
+
+#ifndef LANE_HELPERS_POLYLINE_CUH
+#define LANE_HELPERS_POLYLINE_CUH
+
+#include <cstddef>
+#include <cstdint>
+
+#include <cuda_runtime.h>
+#include <c10/util/Half.h>
+#include <c10/util/BFloat16.h>
+
+namespace polyline {
+
+template <typename dtype>  // `dtype` is not referenced by any member below.
+struct PolylineLaunchConfig {  // Returned by make_polyline_launch_config(); passed as `cfg` to interpolation.
+    dim3 block_dim;
+    dim3 grid_dim;
+    int num_points_full_blocks;
+    size_t shared_mem_size;
+    size_t distance_buffer_ext_size_elems;  // NOTE(review): presumably sizes `distance_buffer_ext` - confirm.
+    bool use_shared_distances;
+    size_t max_shared_full;
+};
+// Builds the launch configuration for the given `num_points`, `num_samples` and `device`.
+template <typename dtype>
+PolylineLaunchConfig<dtype> make_polyline_launch_config(int num_points, int num_samples, int device);
+// CUDA interpolation without per-sample sizes; takes `cfg`, `distance_buffer_ext` and `stream` from the caller.
+template <typename dtype>
+void polyline_interpolation(dtype* points, int num_points, int num_dims, dtype* distances, int num_distances,
+                            dtype* result_points, int num_samples, bool relative_distances, int device,
+                            const PolylineLaunchConfig<dtype>& cfg, dtype* distance_buffer_ext,
+                            cudaStream_t stream);
+// CUDA polyline lengths without per-sample sizes.
+template <typename dtype>
+void polyline_lengths(dtype* points, int num_points, int num_dims, dtype* lengths, int num_samples,
+                      cudaStream_t stream);
+// CUDA interpolation with per-sample sizes (`sample_sizes_points`, `sample_sizes_distances_to_sample`).
+template <typename dtype, typename sample_size_dtype>
+void polyline_interpolation_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* distances,
+                                           int num_distances, dtype* result_points, int num_samples,
+                                           sample_size_dtype* sample_sizes_points,
+                                           sample_size_dtype* sample_sizes_distances_to_sample,
+                                           bool relative_distances, int device,
+                                           const PolylineLaunchConfig<dtype>& cfg, dtype* distance_buffer_ext,
+                                           cudaStream_t stream);
+// CUDA polyline lengths with per-sample sizes in `sample_sizes_points`.
+template <typename dtype, typename sample_size_dtype>
+void polyline_lengths_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* lengths,
+                                     int num_samples, sample_size_dtype* sample_sizes_points,
+                                     cudaStream_t stream);
+// CPU counterpart of polyline_interpolation(): const inputs, no launch config, extra buffer or stream.
+template <typename dtype>
+void polyline_interpolation_cpu(const dtype* points, int num_points, int num_dims, const dtype* distances,
+                                int num_distances, dtype* result_points, int num_samples,
+                                bool relative_distances);
+// CPU counterpart of polyline_lengths().
+template <typename dtype>
+void polyline_lengths_cpu(const dtype* points, int num_points, int num_dims, dtype* lengths, int num_samples);
+// CPU counterpart of polyline_interpolation_var_size_batch().
+template <typename dtype, typename sample_size_dtype>
+void polyline_interpolation_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims,
+                                               const dtype* distances, int num_distances,
+                                               dtype* result_points, int num_samples,
+                                               const sample_size_dtype* sample_sizes_points,
+                                               const sample_size_dtype* sample_sizes_distances_to_sample,
+                                               bool relative_distances);
+// CPU counterpart of polyline_lengths_var_size_batch().
+template <typename dtype, typename sample_size_dtype>
+void polyline_lengths_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims,
+                                         dtype* lengths, int num_samples,
+                                         const sample_size_dtype* sample_sizes_points);
+
+// Instantiated explicitly in polyline.cu and polyline_cpu.cpp; `extern template` stops includers re-instantiating.
+#define DECLARE_POLYLINE_LAUNCH_CONFIG_EXTERN(DTYPE)                                \
+    extern template PolylineLaunchConfig<DTYPE> make_polyline_launch_config<DTYPE>( \
+        int num_points, int num_samples, int device);
+
+#define DECLARE_POLYLINE_INTERPOLATION_EXTERN(DTYPE)                                       \
+    extern template void polyline_interpolation<DTYPE>(                                    \
+        DTYPE * points, int num_points, int num_dims, DTYPE* distances, int num_distances, \
+        DTYPE* result_points, int num_samples, bool relative_distances, int device,        \
+        const PolylineLaunchConfig<DTYPE>& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream);
+
+#define DECLARE_POLYLINE_LENGTHS_EXTERN(DTYPE)                                                 \
+    extern template void polyline_lengths<DTYPE>(DTYPE * points, int num_points, int num_dims, \
+                                                 DTYPE* lengths, int num_samples, cudaStream_t stream);
+
+#define DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE)            \
+    extern template void polyline_interpolation_var_size_batch<DTYPE, SAMPLE_SIZE_DTYPE>(         \
+        DTYPE * points, int max_num_points, int num_dims, DTYPE* distances, int num_distances,    \
+        DTYPE* result_points, int num_samples, SAMPLE_SIZE_DTYPE* sample_sizes_points,            \
+        SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances, int device, \
+        const PolylineLaunchConfig<DTYPE>& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream);
+
+#define DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE)           \
+    extern template void polyline_lengths_var_size_batch<DTYPE, SAMPLE_SIZE_DTYPE>(        \
+        DTYPE * points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \
+        SAMPLE_SIZE_DTYPE* sample_sizes_points, cudaStream_t stream);
+
+#define DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(DTYPE)                        \
+    DECLARE_POLYLINE_LAUNCH_CONFIG_EXTERN(DTYPE)                         \
+    DECLARE_POLYLINE_INTERPOLATION_EXTERN(DTYPE)                         \
+    DECLARE_POLYLINE_LENGTHS_EXTERN(DTYPE)                               \
+    DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN(DTYPE, int)     \
+    DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN(DTYPE, int64_t) \
+    DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN(DTYPE, int)           \
+    DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN(DTYPE, int64_t)
+
+DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(float)
+DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(double)
+DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(c10::Half)
+DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(c10::BFloat16)
+
+#undef DECLARE_POLYLINE_CUDA_DTYPE_EXTERN
+#undef DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN
+#undef DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN
+#undef DECLARE_POLYLINE_LENGTHS_EXTERN
+#undef DECLARE_POLYLINE_INTERPOLATION_EXTERN
+#undef DECLARE_POLYLINE_LAUNCH_CONFIG_EXTERN
+
+#define DECLARE_POLYLINE_INTERPOLATION_CPU_EXTERN(DTYPE)                                              \
+    extern template void polyline_interpolation_cpu<DTYPE>(                                           \
+        const DTYPE* points, int num_points, int num_dims, const DTYPE* distances, int num_distances, \
+        DTYPE* result_points, int num_samples, bool relative_distances);
+
+#define DECLARE_POLYLINE_LENGTHS_CPU_EXTERN(DTYPE)                                                      \
+    extern template void polyline_lengths_cpu<DTYPE>(const DTYPE* points, int num_points, int num_dims, \
+                                                     DTYPE* lengths, int num_samples);
+
+#define DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE)                \
+    extern template void polyline_interpolation_var_size_batch_cpu<DTYPE, SAMPLE_SIZE_DTYPE>(             \
+        const DTYPE* points, int max_num_points, int num_dims, const DTYPE* distances, int num_distances, \
+        DTYPE* result_points, int num_samples, const SAMPLE_SIZE_DTYPE* sample_sizes_points,              \
+        const SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances);
+
+#define DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE)            \
+    extern template void polyline_lengths_var_size_batch_cpu<DTYPE, SAMPLE_SIZE_DTYPE>(         \
+        const DTYPE* points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \
+        const SAMPLE_SIZE_DTYPE* sample_sizes_points);
+
+#define DECLARE_POLYLINE_CPU_DTYPE_EXTERN(DTYPE)                             \
+    DECLARE_POLYLINE_INTERPOLATION_CPU_EXTERN(DTYPE)                         \
+    DECLARE_POLYLINE_LENGTHS_CPU_EXTERN(DTYPE)                               \
+    DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int)     \
+    DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int64_t) \
+    DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int)           \
+    DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int64_t)
+
+DECLARE_POLYLINE_CPU_DTYPE_EXTERN(float)
+DECLARE_POLYLINE_CPU_DTYPE_EXTERN(double)
+
+#undef DECLARE_POLYLINE_CPU_DTYPE_EXTERN
+#undef DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN
+#undef DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN
+#undef DECLARE_POLYLINE_LENGTHS_CPU_EXTERN
+#undef DECLARE_POLYLINE_INTERPOLATION_CPU_EXTERN
+}  // namespace polyline
+
+#endif  // LANE_HELPERS_POLYLINE_CUH
diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh
new file mode 100644
index 0000000..625dacf
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LANE_HELPERS_POLYLINE_COMMON_CUH
+#define LANE_HELPERS_POLYLINE_COMMON_CUH
+
+#include <cmath>
+#include <limits>
+
+#include "polyline_dtype_compat.cuh"
+
+#ifdef __CUDACC__
+// Keep scalar helpers callable from both CUDA kernels and CPU translation units.
+#define POLYLINE_HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+#else
+#define POLYLINE_HOST_DEVICE_INLINE inline
+#endif
+
+namespace polyline {
+
+template <typename dtype>
+POLYLINE_HOST_DEVICE_INLINE dtype polyline_nan() {
+    const dtype nan_value = static_cast<dtype>(NAN);
+    return nan_value;
+}
+
+template <typename point_dtype>
+POLYLINE_HOST_DEVICE_INLINE void fill_point_with_nan_common(point_dtype* res_point, int num_dims) {
+    const point_dtype nan_value = polyline_nan<point_dtype>();
+    for (int d = 0; d < num_dims; ++d) {
+        res_point[d] = nan_value;
+    }
+}
+
+/**
+ * @brief Compute the Euclidean length of one polyline segment.
+ *
+ * @details
+ * `segment_idx` refers to the segment between points `segment_idx` and
+ * `segment_idx + 1`. The point coordinates are laid out consecutively as
+ * `(num_points, num_dims)`.
+ *
+ * The point dtype and accumulation dtype are intentionally separate so the CPU
+ * path can accumulate in a wider type while the CUDA path preserves its
+ * existing dtype behavior.
+ */
+template <typename point_dtype, typename accum_dtype>
+POLYLINE_HOST_DEVICE_INLINE accum_dtype compute_segment_length_common(const point_dtype* points_sample,
+                                                                      int segment_idx, int num_dims) {
+    const point_dtype* first_point = points_sample + segment_idx * num_dims;
+    const point_dtype* second_point = points_sample + (segment_idx + 1) * num_dims;
+    accum_dtype accum_sqr = static_cast<accum_dtype>(0.0);
+    for (int d = 0; d < num_dims; ++d) {
+        const accum_dtype diff =
+            static_cast<accum_dtype>(first_point[d]) - static_cast<accum_dtype>(second_point[d]);
+        accum_sqr += diff * diff;
+    }
+    const accum_dtype segment_length = polyline_sqrt(accum_sqr);
+    return segment_length;
+}
+
+/**
+ * @brief Find the lower endpoint index of the segment that brackets `value`.
+ *
+ * @details
+ * `sequence` must hold at least one element (`sequence[0]` is read without a length check) and is
+ * expected to be monotonically non-decreasing cumulative distances. The return value is:
+ * - `-1` when `value` is strictly smaller than the first element.
+ * - `sequence_length - 1` when `value` is strictly larger than the last element.
+ * - Otherwise an index `i` found by bisection with `sequence[i] <= value`; for two or more elements
+ *   `i < sequence_length - 1` and `value <= sequence[i + 1]`. With repeated values this need not be
+ *   the last such index, and a `value` equal to the last element maps to a segment ending there.
+ * The caller uses the result to pick the segment containing the requested interpolation distance.
+ */
+template <typename accum_dtype>
+POLYLINE_HOST_DEVICE_INLINE int get_index_of_last_lower_or_equal_to_common(const accum_dtype* sequence,
+                                                                           accum_dtype value,
+                                                                           int sequence_length) {
+    int min_idx = 0;
+    int max_idx = sequence_length - 1;
+
+    if (polyline_value_gt(sequence[0], value)) {
+        return -1;
+    }
+    if (polyline_value_lt(sequence[sequence_length - 1], value)) {
+        return sequence_length - 1;
+    }
+
+    while (max_idx - min_idx > 1) {
+        const int curr_idx = (max_idx + min_idx) >> 1;
+        const accum_dtype curr_val = sequence[curr_idx];
+        if (polyline_value_lt(curr_val, value)) {
+            min_idx = curr_idx;
+        } else if (polyline_value_gt(curr_val, value)) {
+            max_idx = curr_idx;
+        } else {
+            min_idx = curr_idx;
+            max_idx = curr_idx;
+        }
+    }
+    return min_idx;
+}
+
+/**
+ * @brief Sample one point on a polyline at a requested absolute distance.
+ *
+ * @details
+ * `accum_distances[i]` is the distance from the polyline start to point `i`; `points` is laid out as
+ * `(num_points, num_dims)` and `res_point` receives `num_dims` values. Distances outside the polyline
+ * are clamped to the first or last point. Segments shorter than `epsilon()` of `accum_dtype` are
+ * treated as degenerate and return the lower endpoint. Requires `num_points >= 1`.
+ */
+template <typename point_dtype, typename accum_dtype>
+POLYLINE_HOST_DEVICE_INLINE void sample_at_distance_common(const point_dtype* points,
+                                                           const accum_dtype* accum_distances,
+                                                           accum_dtype distance_to_sample_at, int num_points,
+                                                           int num_dims, point_dtype* res_point) {
+    const int index_min = get_index_of_last_lower_or_equal_to_common<accum_dtype>(
+        accum_distances, distance_to_sample_at, num_points);
+    if (index_min >= 0 && index_min < num_points - 1) {
+        const int index_max = index_min + 1;
+        const point_dtype* min_point = points + index_min * num_dims;
+        const point_dtype* max_point = points + index_max * num_dims;
+        const accum_dtype dist_min = accum_distances[index_min];
+        const accum_dtype dist_max = accum_distances[index_max];
+        const accum_dtype dist = dist_max - dist_min;
+        if (polyline_value_ge(dist, static_cast<accum_dtype>(std::numeric_limits<accum_dtype>::epsilon()))) {
+            const accum_dtype weight_max = (distance_to_sample_at - dist_min) / dist;
+            const accum_dtype weight_min = (dist_max - distance_to_sample_at) / dist;
+            for (int d = 0; d < num_dims; ++d) {
+                const accum_dtype interpolated = static_cast<accum_dtype>(min_point[d]) * weight_min +
+                                                 static_cast<accum_dtype>(max_point[d]) * weight_max;
+                res_point[d] = static_cast<point_dtype>(interpolated);
+            }
+        } else {
+            for (int d = 0; d < num_dims; ++d) {
+                res_point[d] = min_point[d];
+            }
+        }
+    } else if (index_min == -1) {
+        for (int d = 0; d < num_dims; ++d) {
+            // Note that we are accessing the first point, so that points[d] corresponds to the element we
+            // want to access, and no offset is needed.
+            res_point[d] = points[d];
+        }
+    } else if (index_min == num_points - 1) {
+        for (int d = 0; d < num_dims; ++d) {
+            res_point[d] = points[(num_points - 1) * num_dims + d];
+        }
+    }
+}
+
+}  // namespace polyline
+
+#undef POLYLINE_HOST_DEVICE_INLINE
+
+#endif  // LANE_HELPERS_POLYLINE_COMMON_CUH
diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh
new file mode 100644
index 0000000..eae81c4
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LANE_HELPERS_POLYLINE_DTYPE_COMPAT_CUH
+#define LANE_HELPERS_POLYLINE_DTYPE_COMPAT_CUH
+
+#include <cmath>
+
+// CUDA provides native __half/__nv_bfloat16 comparison intrinsics and shuffle
+// overloads, while c10 low-precision wrappers add extra conversion paths,
+// leading to compilation errors. The CUDA-only specializations below route c10 values
+// through the native CUDA operations where available; only scalar math such as sqrt
+// intentionally computes via float. Keeping these variants CUDA-only keeps CPU builds
+// free of these types.
+#ifdef __CUDACC__
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+#define POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+#else
+#define POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE inline
+#endif
+
+namespace polyline {
+
+#ifdef __CUDACC__
+template <typename dtype>
+__device__ __forceinline__ dtype shfl_xor_sync_compat(unsigned mask, dtype val, int laneMask) {
+    return __shfl_xor_sync(mask, val, laneMask);
+}
+
+template <>
+__device__ __forceinline__ c10::Half shfl_xor_sync_compat(unsigned mask, c10::Half val, int laneMask) {
+    return c10::Half(__shfl_xor_sync(mask, static_cast<__half>(val), laneMask));
+}
+
+template <>
+__device__ __forceinline__ c10::BFloat16 shfl_xor_sync_compat(unsigned mask, c10::BFloat16 val,
+                                                              int laneMask) {
+    return c10::BFloat16(__shfl_xor_sync(mask, static_cast<__nv_bfloat16>(val), laneMask));
+}
+#endif
+
+template <typename dtype>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_lt(dtype lhs, dtype rhs) {
+    return lhs < rhs;
+}
+
+template <typename dtype>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_gt(dtype lhs, dtype rhs) {
+    return lhs > rhs;
+}
+
+template <typename dtype>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_ge(dtype lhs, dtype rhs) {
+    return lhs >= rhs;
+}
+
+#ifdef __CUDACC__
+template <>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_lt<c10::Half>(c10::Half lhs, c10::Half rhs) {
+    return __hlt(static_cast<__half>(lhs), static_cast<__half>(rhs));
+}
+
+template <>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_gt<c10::Half>(c10::Half lhs, c10::Half rhs) {
+    return __hgt(static_cast<__half>(lhs), static_cast<__half>(rhs));
+}
+
+template <>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_ge<c10::Half>(c10::Half lhs, c10::Half rhs) {
+    return __hge(static_cast<__half>(lhs), static_cast<__half>(rhs));
+}
+
+template <>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_lt<c10::BFloat16>(c10::BFloat16 lhs,
+                                                                               c10::BFloat16 rhs) {
+    return __hlt(static_cast<__nv_bfloat16>(lhs), static_cast<__nv_bfloat16>(rhs));
+}
+
+template <>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_gt<c10::BFloat16>(c10::BFloat16 lhs,
+                                                                               c10::BFloat16 rhs) {
+    return __hgt(static_cast<__nv_bfloat16>(lhs), static_cast<__nv_bfloat16>(rhs));
+}
+
+template <>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_ge<c10::BFloat16>(c10::BFloat16 lhs,
+                                                                               c10::BFloat16 rhs) {
+    return __hge(static_cast<__nv_bfloat16>(lhs), static_cast<__nv_bfloat16>(rhs));
+}
+#endif
+
+template <typename dtype>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE dtype polyline_sqrt(dtype value) {
+    return sqrt(value);
+}
+
+#ifdef __CUDACC__
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE c10::Half polyline_sqrt(c10::Half value) {
+    return static_cast<c10::Half>(sqrtf(static_cast<float>(value)));
+}
+
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE c10::BFloat16 polyline_sqrt(c10::BFloat16 value) {
+    return static_cast<c10::BFloat16>(sqrtf(static_cast<float>(value)));
+}
+#endif
+
+}  // namespace polyline
+
+#undef POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE
+
+#endif  // LANE_HELPERS_POLYLINE_DTYPE_COMPAT_CUH
diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh
new file mode 100644
index 0000000..2354b54
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LANE_HELPERS_POLYLINE_KERNELS_CUH
+#define LANE_HELPERS_POLYLINE_KERNELS_CUH
+
+#include "polyline_common.cuh"
+#include "polyline_dtype_compat.cuh"
+
+namespace polyline {
+
+template <typename dtype>
+__device__ __forceinline__ void prefix_sum_warp(int index, dtype value, int num_values_in_scan,
+                                                dtype& scan_value, dtype& sum_all) {
+    sum_all = value;                       // total over the lane group merged so far (own value first)
+    scan_value = static_cast<dtype>(0.0);  // exclusive prefix: own value is never added to it
+    for (int i = 1; i < num_values_in_scan; i <<= 1) {  // XOR butterfly: partner distance doubles
+        dtype sum_other = shfl_xor_sync_compat<dtype>(0xffffffffu, sum_all, i);
+        scan_value += ((static_cast<uint32_t>(index) & static_cast<uint32_t>(i)) > 0) * sum_other;
+        sum_all += sum_other;
+    }
+}
+
+/**
+ * @brief Compute the exclusive prefix sum of `value` over the threads in x of each sample row.
+ *
+ * @details
+ * The buffer `warp_scan_buffer` is used to store the sums of the individual warps, which is then used
+ * to compute the offsets to add to each warp. For that, a prefix sum is performed on the buffer in a second
+ * step (in-place). The size of the buffer is (in elements): `blockDim.y * num_warps_per_sample`.
+ * 
+ * @tparam dtype The type of the values to prefix sum
+ *
+ * @param value The value to prefix sum
+ * @param num_warps_per_sample The number of warps per sample
+ * @param warp_scan_buffer The buffer to store the partial sums of the iterations so far for each sample
+ *
+ * @return The exclusive prefix sum for the current thread (the thread's own `value` is not included)
+ */
+template <typename dtype>
+__device__ __forceinline__ dtype prefix_sum_block(dtype value, int num_warps_per_sample,
+                                                  dtype* warp_scan_buffer) {
+    // ix corresponds to the thread index inside a single sample
+    const int ix = threadIdx.x;
+    const int iwx = threadIdx.x / 32;                         // index of the warp inside the sample
+    const int iw = threadIdx.y * num_warps_per_sample + iwx;  // index of the warp in the block
+    // NOTE(review): warps are taken as 32 consecutive threadIdx.x of one row, which presumes blockDim.x
+    // is a multiple of 32 - confirm against the launch configuration.
+
+    // Using warp shuffles iteratively, in two stages
+
+    // First stage: perform warp scans
+    dtype warp_scan1;
+    dtype warp_sum1;
+    prefix_sum_warp(ix, value, 32, warp_scan1, warp_sum1);
+    // If this is the first thread in the warp, it is responsible for storing the partial sum for the warp
+    if (ix % 32 == 0) warp_scan_buffer[iw] = warp_sum1;
+    __syncthreads();
+
+    // Warp scan for the partial sums to obtain the offsets for each warp
+    // The first warp (i.e. ix < 32) participates in step 2 of the warp scan.
+    // Note that the whole first warp always participates in step 2, even if num_warps_per_sample < 32.
+    // This is to avoid a deadlock without using a complex mask generation method for `__shfl_xor_sync()`.
+    if (ix < 32) {
+        const int wid_shared = threadIdx.y * num_warps_per_sample + ix;
+        const bool is_inside = ix < num_warps_per_sample;
+        const dtype value = is_inside ? warp_scan_buffer[wid_shared] : static_cast<dtype>(0.0);
+        dtype warp_scan2;
+        dtype warp_sum2;
+        prefix_sum_warp(ix, value, num_warps_per_sample, warp_scan2, warp_sum2);
+        if (ix < num_warps_per_sample) {
+            warp_scan_buffer[wid_shared] = warp_scan2;
+        }
+    }
+    __syncthreads();
+
+    // Apply offsets to the partial sums to obtain the final values
+    warp_scan1 += warp_scan_buffer[iw];
+
+    return warp_scan1;
+}
+
+/**
+ * @brief Inclusive prefix sum of one sequence per `threadIdx.y`, processed `blockDim.x` values at a time.
+ *
+ * @details
+ * The buffer is split into 2 parts:
+ * - The first part (`blockDim.y` elements) stores the running sums of all
+ *   elements processed so far for each sample in y (`sum_buffer`).
+ * - The second part (`warp_scan_buffer`) is scratch space for the intra-block
+ *   scan performed by `prefix_sum_block`. The size is: `blockDim.y * num_warps_per_sample`
+ *   (see `prefix_sum_block` for more details)
+ * Hence, the total buffer size (in elements) is:
+ * `blockDim.y + blockDim.y * num_warps_per_sample`.
+ * or
+ * `blockDim.y * (1 + num_warps_per_sample)`
+ * 
+ * The results are stored in the sequence array, overriding the input values.
+ * Note that the results are the accumulated values including the current value, i.e. the operation can be expressed as:
+ * `sequence[i] = sum(sequence[0:(i+1)])`, where the slicing is defined as in Python.
+ *
+ * All threads of the block must call this with the same `numel_x_full_blocks` (barriers inside the loop).
+ *
+ * @tparam dtype The type of the values to prefix sum
+ *
+ * @param sequence The sequence to prefix sum for the current `threadIdx.y`. It is indexed directly by
+ *   the element index, without any `threadIdx.y` offset, so the caller passes the row of this thread.
+ *   Only the first `numel_x` elements are read and written.
+ * @param buffer_block Combined temporary storage used by the prefix-sum.
+ * @param numel_x The number of elements in the x dimension
+ * @param numel_x_full_blocks The number of elements in the x dimension extended to a multiple of blockDim.x
+ * @param numel_y The number of sequences in the y dimension (currently not used by the implementation)
+ * @param offset The initial offset to add to the prefix sum of each sequence
+ */
+template <typename dtype>
+__device__ __forceinline__ void prefix_sum_looped(dtype* sequence, dtype* buffer_block, int numel_x,
+                                                  int numel_x_full_blocks, int numel_y, dtype offset) {
+    const int ix = threadIdx.x;
+    const int iy = threadIdx.y;
+
+    // Buffer for keeping the sums of the iterations so far for each sample
+    dtype* sum_buffer = buffer_block;
+    // Buffer as needed for the prefix sum implementation
+    dtype* warp_scan_buffer = buffer_block + blockDim.y;
+
+    int num_warps_per_sample = (blockDim.x + 31) / 32;
+
+    // Initialize the buffer containing the partial sums of the iterations so far for each sample
+    if (ix == 0) {
+        sum_buffer[iy] = static_cast<dtype>(0.0);
+    }
+    // Compute the sum `blockDim.x` elements at a time for each sample
+    for (int i = ix; i < numel_x_full_blocks; i += blockDim.x) {
+        // Make sure that sum_buffer is written to (either initially or in the previous iteration)
+        __syncthreads();
+        const dtype value = i < numel_x ? sequence[i] : static_cast<dtype>(0.0);
+        const dtype value_out =
+            prefix_sum_block<dtype>(value, num_warps_per_sample, warp_scan_buffer) + sum_buffer[iy] + offset;
+        const dtype value_out_incl_current = value_out + value;
+        // Make sure that
+        //   - sequences are not written to before they are read from for the current iteration
+        //   - sum_buffer is not written to before it is read from for the current iteration
+        __syncthreads();
+        if (i < numel_x) {
+            sequence[i] = value_out_incl_current;
+        }
+        // Update the sum buffer for the next iteration to the current value of the last processed sample.
+        // Note that the last thread may be out of bounds and not correspond to the last element. However,
+        // in this case, the value is not needed (and also still is the correct value as the values are
+        // extended with zeros, so that the cumulative sum (computed as including the current value) is
+        // the same as for the last element)
+        if (ix == blockDim.x - 1) {
+            sum_buffer[iy] = value_out_incl_current;
+        }
+        // Offset is only applied in the first iteration. Afterwards, the offset is already included in the
+        // partial sum as stored in `sum_buffer` and the offset must not be applied again.
+        offset = static_cast<dtype>(0.0);
+    }
+    __syncthreads();
+}
+
+template <typename dtype>
+__device__ __forceinline__ dtype warp_reduce_sum(dtype value, int num_vals_per_partial) {
+    // XOR-butterfly reduction using full-mask shuffles: all 32 lanes of the warp must call this.
+    for (int i = 1; i < num_vals_per_partial; i <<= 1) {
+        const dtype val_other = shfl_xor_sync_compat<dtype>(0xffffffffu, value, i);
+        value += val_other;
+    }
+    return value;  // sum over this lane's aligned lane group; identical in every lane of the group
+}
+
+template <typename dtype>
+__device__ __forceinline__ dtype sample_reduce_sum(dtype value, int num_warps_per_sample,
+                                                   dtype* warp_temp_and_result_buffer) {
+    const int ix = threadIdx.x;                               // index of the thread inside the sample
+    const int iwx = threadIdx.x / 32;                         // index of warp in the sample
+    const int iw = threadIdx.y * num_warps_per_sample + iwx;  // index of the warp in the block
+
+    const dtype warp_sum = warp_reduce_sum(value, 32);
+    // The first thread in the warp writes the result for the warp
+    if (ix % 32 == 0) {
+        warp_temp_and_result_buffer[iw] = warp_sum;
+    }
+    // Make sure all warps have written their results
+    __syncthreads();
+
+    dtype sample_sum = static_cast<dtype>(0.0);
+    // The first warp reduces the results of the first stage
+    // Note that from now on, ix selects the warp (from stage 1) inside the sample (previously iwx)
+    if (ix < 32) {
+        const int iw_base = threadIdx.y * num_warps_per_sample;
+        const dtype warp_sum_phase_1 =
+            ix < num_warps_per_sample ? warp_temp_and_result_buffer[iw_base + ix] : static_cast<dtype>(0.0);
+        // Lanes with ix >= num_warps_per_sample contribute zeros, so the lanes of the first reduction
+        // group (which includes ix == 0) obtain the sum over all warps of this sample.
+        sample_sum = warp_reduce_sum(warp_sum_phase_1, num_warps_per_sample);
+    }
+    __syncthreads();
+    return sample_sum;  // sample sum for ix == 0 (and its reduction-group lanes); 0 for ix >= 32
+}
+
+template <typename dtype>
+__device__ __forceinline__ void sample_distances(const dtype* points_sample,
+                                                 const dtype* accum_distances_sample,
+                                                 const dtype* distances_to_sample_sample,
+                                                 int num_distances_to_sample, int num_points, int num_dims,
+                                                 dtype* res_points_sample, bool relative_distances) {
+    // The requested distances are strided over the threads in x: this thread handles the indices
+    // threadIdx.x, threadIdx.x + blockDim.x, ... below num_distances_to_sample.
+    const int first_idx = threadIdx.x;
+
+    if (num_points == 0) {
+        // An empty polyline has nothing to sample from: each requested output point is set to NaN
+        // in all of its num_dims coordinates.
+        for (int k = first_idx; k < num_distances_to_sample; k += blockDim.x) {
+            fill_point_with_nan_common<dtype>(res_points_sample + k * num_dims, num_dims);
+        }
+        return;
+    }
+
+    // Relative distances are fractions of the total length, i.e. of the accumulated distance of the
+    // last point. The total length is only read (and only used) when relative distances are requested.
+    const dtype total_length =
+        relative_distances ? accum_distances_sample[num_points - 1] : static_cast<dtype>(0.0);
+
+    for (int k = first_idx; k < num_distances_to_sample; k += blockDim.x) {
+        const dtype requested = distances_to_sample_sample[k];
+        const dtype distance_to_sample = relative_distances ? requested * total_length : requested;
+        sample_at_distance_common<dtype, dtype>(points_sample, accum_distances_sample, distance_to_sample,
+                                                num_points, num_dims, res_points_sample + k * num_dims);
+    }
+}
+
+template <typename dtype>
+__device__ __forceinline__ void compute_distances(dtype* points_sample, int num_points, int num_dims,
+                                                  dtype* distances_sample) {
+    const int x = threadIdx.x;
+    if (num_points == 0) {
+        return;
+    }
+
+    if (x == 0) {
+        // Distance from the start to the first point is zero.
+        distances_sample[0] = static_cast<dtype>(0.0);
+    }
+    // Store segment lengths starting at index 1 so that an *inclusive* prefix
+    // sum over `distances_sample` yields distances to points:
+    //   distances_sample[j] = distance from start to point j.
+    for (int i = x; i < num_points - 1; i += blockDim.x) {
+        distances_sample[i + 1] = compute_segment_length_common<dtype, dtype>(points_sample, i, num_dims);
+    }
+}
+
+/**
+ * @brief Shared implementation for both fixed-size and variable-size batch kernels.
+ *
+ * @details
+ * This routine implements the common logic used by:
+ *  - `polyline_sampling_fully_shared_kernel` (fixed-size batches), and
+ *  - `polyline_sampling_fully_shared_var_batch_kernel` (variable-size batches).
+ *
+ * The shared memory is split into two parts:
+ * - The first part stores the distances and accumulated distances
+ *   (conversion in-place) for all points and has size (in elements):
+ *   `blockDim.y * max_num_points`.
+ * - The second part stores the temporary buffer used by
+ *   `prefix_sum_looped` and has size (in elements):
+ *   `blockDim.y * (num_warps_per_sample + 1)`.
+ *   (see the documentation of `prefix_sum_looped` for details).
+ * The total shared memory size is therefore (in elements):
+ * `(blockDim.y * max_num_points + blockDim.y * (num_warps_per_sample + 1))`.
+ * With `use_shared_distances == false`, the first part is taken from `distance_buffer_ext` instead.
+ *
+ * @tparam dtype The type of the points
+ *
+ * @param points The points to sample
+ * @param distances_to_sample The distances to sample at
+ * @param res_points The resulting sampled points
+ * @param max_num_points The maximum number of points per polyline in the batch
+ * @param max_num_points_full_blocks The maximum number of points extended to a multiple of blockDim.x
+ * @param num_dims The number of dimensions of the points
+ * @param max_num_distances_to_sample The maximum number of distances to sample at per polyline
+ * @param num_samples The number of samples (batch size)
+ * @param sample_sizes_points (optional) Per-sample number of points (variable-size batches)
+ * @param sample_sizes_distances_to_sample (optional) Per-sample number of distances (variable-size batches)
+ * @param relative_distances Interpret distances to sample as fractions of each polyline's total length
+ * @param distance_buffer_ext Optional external buffer for distances when shared memory is insufficient
+ */
+template <typename dtype, typename sample_size_dtype, bool use_shared_distances, bool use_variable_size_batch>
+__device__ __forceinline__ void polyline_sampling_fully_shared_common(
+    dtype* points, dtype* distances_to_sample, dtype* res_points, int max_num_points,
+    int max_num_points_full_blocks, int num_dims, int max_num_distances_to_sample, int num_samples,
+    sample_size_dtype* sample_sizes_points, sample_size_dtype* sample_sizes_distances_to_sample,
+    bool relative_distances, dtype* distance_buffer_ext) {
+    extern __shared__ uint8_t shared_mem[];
+    dtype* distances;
+    dtype* buffer;
+    if (use_shared_distances) {
+        // Shared-memory layout per block:
+        //   distances: [blockDim.y][max_num_points]
+        //   buffer   : [blockDim.y * (1 + num_warps_per_sample)]
+        distances = reinterpret_cast<dtype*>(shared_mem);
+        buffer = reinterpret_cast<dtype*>(shared_mem + blockDim.y * max_num_points * sizeof(dtype));
+    } else {
+        // External distances buffer is laid out per block as
+        //   [blockIdx.y][blockDim.y][max_num_points]
+        // so each block gets its own contiguous slice. The scratch `buffer`
+        // always starts at the beginning of this block's shared memory.
+        distances = distance_buffer_ext + static_cast<int64_t>(blockIdx.y) * blockDim.y * max_num_points;
+        buffer = reinterpret_cast<dtype*>(shared_mem);
+    }
+
+    const int y = threadIdx.y;
+    const int64_t y_global = blockIdx.y * blockDim.y + y;  // 64-bit: global offsets may exceed 2^31
+    const bool is_active_sample = (y_global < num_samples);
+
+    // 1) Compute per-point distances only for valid samples. Inactive rows in
+    // the final block still participate in sync-heavy code paths with zero work.
+    int curr_num_points = 0;
+    int curr_num_distances_to_sample = 0;
+    if (is_active_sample) {
+        if (use_variable_size_batch) {
+            curr_num_points = sample_sizes_points[y_global];
+            curr_num_distances_to_sample = sample_sizes_distances_to_sample[y_global];
+        } else {
+            curr_num_points = max_num_points;
+            curr_num_distances_to_sample = max_num_distances_to_sample;
+        }
+
+        // Global index for points in device memory; distances remain indexed by the
+        // local y within the block because `distances` already points at this block's rows.
+        dtype* points_sample = points + y_global * max_num_points * num_dims;
+        dtype* distances_sample = distances + y * max_num_points;
+        if (curr_num_points > 0) {
+            compute_distances<dtype>(points_sample, curr_num_points, num_dims, distances_sample);
+        }
+    }
+
+    // 2) Prefix-sum over distances for all rows in this block-local buffer.
+    //    This operates purely on (shared or external) distances, so it is
+    //    safe even for rows that don't correspond to a real sample; their
+    //    results are never used.
+    // The `distances` are per-block, so we use the local index `y` to access the distances for the current block.
+    dtype* distance = distances + y * max_num_points;
+    prefix_sum_looped<dtype>(distance,                    // sequences
+                             buffer,                      // buffer (sum_buffer + warp_scan_buffer)
+                             curr_num_points,             // numel_x
+                             max_num_points_full_blocks,  // numel_x_full_blocks (extended to full blocks)
+                             blockDim.y,                  // numel_y (number of samples per block)
+                             static_cast<dtype>(0.0)      // offset
+    );
+
+    // 3) Sample only for valid samples, using their (possibly shared or
+    //    external) accumulated distances.
+    if (is_active_sample) {
+        // Get the points for the current sample (use of global offset)
+        const dtype* points_sample = points + y_global * max_num_points * num_dims;
+        // Get the distances for the current sample (use of block-local offset, as distances are stored in
+        // shared memory or in this block's slice of the external buffer)
+        const dtype* distances_sample = distances + y * max_num_points;
+        // Get the distances to sample at for the current sample (use of global offset)
+        const dtype* distances_to_sample_sample =
+            distances_to_sample + y_global * max_num_distances_to_sample;
+        sample_distances<dtype>(points_sample, distances_sample, distances_to_sample_sample,
+                                curr_num_distances_to_sample, curr_num_points, num_dims,
+                                res_points + y_global * max_num_distances_to_sample * num_dims,
+                                relative_distances);
+    }
+}
+
+/**
+ * @brief Fixed-size-batch kernel entry point for fully-shared polyline sampling.
+ *
+ * Thin wrapper around `polyline_sampling_fully_shared_common`: the uniform `num_points` and
+ * `num_distances_to_sample` are passed in the "max" slots, the variable-size template flag is
+ * disabled and both per-sample size arrays are passed as `nullptr`.
+ */
+template <typename dtype, bool use_shared_distances>
+__global__ void polyline_sampling_fully_shared_kernel(dtype* points, dtype* distances_to_sample,
+                                                      dtype* res_points, int num_points,
+                                                      int num_points_full_blocks, int num_dims,
+                                                      int num_distances_to_sample, int num_samples,
+                                                      bool relative_distances, dtype* distance_buffer_ext) {
+    // No per-sample sizes exist in fixed-size mode, so the size dtype is an arbitrary `int`.
+    constexpr bool kVariableSizeBatch = false;
+    polyline_sampling_fully_shared_common<dtype, int, use_shared_distances, kVariableSizeBatch>(
+        points, distances_to_sample, res_points,
+        /*max_num_points=*/num_points,
+        /*max_num_points_full_blocks=*/num_points_full_blocks, num_dims,
+        /*max_num_distances_to_sample=*/num_distances_to_sample, num_samples,
+        /*sample_sizes_points=*/nullptr,
+        /*sample_sizes_distances_to_sample=*/nullptr, relative_distances, distance_buffer_ext);
+}
+
+/**
+ * @brief Variable-size-batch kernel entry point for fully-shared polyline sampling.
+ *
+ * Thin wrapper around `polyline_sampling_fully_shared_common` with the variable-size template
+ * flag enabled; the per-sample size arrays are forwarded unchanged.
+ */
+template <typename dtype, typename sample_size_dtype, bool use_shared_distances>
+__global__ void polyline_sampling_fully_shared_var_batch_kernel(
+    dtype* points, dtype* distances_to_sample, dtype* res_points, int max_num_points,
+    int max_num_points_full_blocks, int num_dims, int max_num_distances_to_sample, int num_samples,
+    sample_size_dtype* sample_sizes_points, sample_size_dtype* sample_sizes_distances_to_sample,
+    bool relative_distances, dtype* distance_buffer_ext) {
+    constexpr bool kVariableSizeBatch = true;
+    polyline_sampling_fully_shared_common<dtype, sample_size_dtype, use_shared_distances,
+                                          kVariableSizeBatch>(
+        points, distances_to_sample, res_points,
+        max_num_points, max_num_points_full_blocks, num_dims,
+        max_num_distances_to_sample, num_samples,
+        sample_sizes_points, sample_sizes_distances_to_sample,
+        relative_distances, distance_buffer_ext);
+}
+
+/**
+ * @brief Shared implementation of the polyline length kernels.
+ *
+ * Each `threadIdx.y` row of a block handles one sample (`y_global`); the threads along x stride
+ * over that sample's segments, accumulate partial lengths, and the partial sums are combined via
+ * `sample_reduce_sum`.
+ *
+ * @param points              Input points; sample `s` starts at element
+ *                            `s * max_num_points * num_dims`.
+ * @param lengths             Output, one value per sample (written by the `x == 0` thread only).
+ * @param max_num_points      Row stride (in points) of `points`; also the point count of every
+ *                            sample when `use_variable_size_batch` is false.
+ * @param num_dims            Number of coordinates per point.
+ * @param num_samples         Number of samples; rows with `y_global >= num_samples` are inactive.
+ * @param sample_sizes_points Per-sample point counts; only read when `use_variable_size_batch`
+ *                            is true (the fixed-size kernel passes `nullptr`).
+ * @param reduction_buffer    Scratch memory handed to `sample_reduce_sum`.
+ */
+template <typename dtype, typename sample_size_dtype, bool use_variable_size_batch>
+__device__ __forceinline__ void polyline_lengths_common(dtype* points, dtype* lengths, int max_num_points,
+                                                        int num_dims, int num_samples,
+                                                        sample_size_dtype* sample_sizes_points,
+                                                        dtype* reduction_buffer) {
+    const int x = threadIdx.x;
+    const int y_global = blockIdx.y * blockDim.y + threadIdx.y;
+    const bool is_active_sample = (y_global < num_samples);
+
+    int curr_num_points = 0;
+    dtype local_length = static_cast<dtype>(0.0);
+    if (is_active_sample) {
+        curr_num_points = use_variable_size_batch ? sample_sizes_points[y_global] : max_num_points;
+        // Compute the element offset in 64-bit: the product of three `int`s overflows 32-bit
+        // arithmetic once the batch holds more than 2^31 - 1 coordinates.
+        const long long sample_offset = static_cast<long long>(y_global) *
+                                        static_cast<long long>(max_num_points) *
+                                        static_cast<long long>(num_dims);
+        const dtype* points_sample = points + sample_offset;
+        // A polyline with P points has P - 1 segments; threads along x stride over them.
+        for (int i = x; i < curr_num_points - 1; i += blockDim.x) {
+            local_length += compute_segment_length_common<dtype, dtype>(points_sample, i, num_dims);
+        }
+    }
+
+    // Called by every thread, including inactive rows, which contribute 0.
+    // NOTE(review): presumably required because `sample_reduce_sum` synchronizes -- confirm.
+    const int num_warps_per_sample = (blockDim.x + 31) / 32;
+    const dtype length = sample_reduce_sum<dtype>(local_length, num_warps_per_sample, reduction_buffer);
+    if (is_active_sample && x == 0) {
+        // A sample without any points has no defined length and is reported as NaN.
+        lengths[y_global] = curr_num_points == 0 ? polyline_nan<dtype>() : length;
+    }
+}
+
+/**
+ * @brief Kernel entry point computing polyline lengths for fixed-size batches.
+ *
+ * Forwards to `polyline_lengths_common` with the variable-size flag disabled and no per-sample
+ * size array; `num_points` is passed as the common implementation's `max_num_points`.
+ */
+template <typename dtype>
+__global__ void polyline_lengths_kernel(dtype* points, dtype* lengths, int num_points, int num_dims,
+                                        int num_samples) {
+    // Dynamic shared memory is declared as raw bytes and viewed as `dtype` reduction scratch.
+    extern __shared__ uint8_t shared_mem[];
+    constexpr bool kVariableSizeBatch = false;
+    polyline_lengths_common<dtype, int, kVariableSizeBatch>(
+        points, lengths, /*max_num_points=*/num_points, num_dims, num_samples,
+        /*sample_sizes_points=*/nullptr,
+        /*reduction_buffer=*/reinterpret_cast<dtype*>(shared_mem));
+}
+
+/**
+ * @brief Kernel entry point computing polyline lengths for variable-size batches.
+ *
+ * Forwards to `polyline_lengths_common` with the variable-size flag enabled, so the point count
+ * of each sample is read from `sample_sizes_points`.
+ */
+template <typename dtype, typename sample_size_dtype>
+__global__ void polyline_lengths_var_batch_kernel(dtype* points, dtype* lengths, int max_num_points,
+                                                  int num_dims, int num_samples,
+                                                  sample_size_dtype* sample_sizes_points) {
+    // Dynamic shared memory is declared as raw bytes and viewed as `dtype` reduction scratch.
+    extern __shared__ uint8_t shared_mem[];
+    constexpr bool kVariableSizeBatch = true;
+    polyline_lengths_common<dtype, sample_size_dtype, kVariableSizeBatch>(
+        points, lengths, max_num_points, num_dims, num_samples, sample_sizes_points,
+        /*reduction_buffer=*/reinterpret_cast<dtype*>(shared_mem));
+}
+
+}  // namespace polyline
+
+#endif  // LANE_HELPERS_POLYLINE_KERNELS_CUH
diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh
new file mode 100644
index 0000000..2668917
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LANE_HELPERS_POLYLINE_SHARED_MEMORY_CONFIG_CUH
+#define LANE_HELPERS_POLYLINE_SHARED_MEMORY_CONFIG_CUH
+
+#include <cstddef>
+#include <mutex>
+#include <stdexcept>
+
+#include <cuda_runtime.h>
+
+#include "helper_macros.cuh"
+#include "polyline_kernels.cuh"
+
+namespace polyline {
+
+// Number of CUDA device indices that get a slot in the per-device `std::once_flag` / cache arrays
+// used by the functions in this header. Devices with an index >= this value are still supported,
+// but are queried / configured on every call instead of once.
+static constexpr int MAX_CACHED_CUDA_DEVICES = 64;
+
+// Guard used before indexing the per-device arrays.
+// Throws std::runtime_error when `device` is negative; otherwise does nothing.
+static void check_non_negative_cuda_device(int device) {
+    const bool is_valid_index = (device >= 0);
+    if (is_valid_index) {
+        return;
+    }
+    throw std::runtime_error("CUDA device index must be non-negative.");
+}
+
+// Query the CUDA runtime for the shared-memory-per-block limit of `device`.
+// The opt-in limit (`sharedMemPerBlockOptin`) is returned when the device reports a non-zero
+// value for it; otherwise the default `sharedMemPerBlock` limit is returned.
+static size_t query_polyline_max_shared_full_for_device(int device) {
+    cudaDeviceProp device_properties;
+    CUDA_CHECK(cudaGetDeviceProperties(&device_properties, device));
+
+    const bool has_optin_limit = (device_properties.sharedMemPerBlockOptin != 0);
+    return has_optin_limit ? static_cast<size_t>(device_properties.sharedMemPerBlockOptin)
+                           : static_cast<size_t>(device_properties.sharedMemPerBlock);
+}
+
+// Return the shared-memory-per-block limit for `device`, querying the runtime at most once per
+// device index below MAX_CACHED_CUDA_DEVICES (guarded by one std::once_flag per index).
+// Throws std::runtime_error for negative device indices.
+static size_t polyline_max_shared_full_for_device(int device) {
+    static std::once_flag query_once[MAX_CACHED_CUDA_DEVICES];
+    static size_t cached_limits[MAX_CACHED_CUDA_DEVICES] = {};
+
+    check_non_negative_cuda_device(device);
+
+    // Device indices without a cache slot are queried directly on every call.
+    const bool has_cache_slot = (device < MAX_CACHED_CUDA_DEVICES);
+    if (!has_cache_slot) {
+        return query_polyline_max_shared_full_for_device(device);
+    }
+
+    size_t& cached_limit = cached_limits[device];
+    std::call_once(query_once[device], [&cached_limit, device]() {
+        cached_limit = query_polyline_max_shared_full_for_device(device);
+    });
+    return cached_limit;
+}
+
+// Raise the dynamic shared memory limit of the fixed-size sampling kernel to `max_shared_full`
+// bytes and request the maximum shared-memory carveout (100 percent) for it.
+// NOTE(review): the attributes are set through the runtime API without selecting a device here,
+// so the caller is presumably expected to have made the target device current -- confirm.
+template <typename dtype, bool use_shared_distances>
+static void configure_polyline_sampling_kernel(size_t max_shared_full) {
+    const auto kernel = polyline_sampling_fully_shared_kernel<dtype, use_shared_distances>;
+    const int dynamic_shared_bytes = static_cast<int>(max_shared_full);
+    constexpr int kCarveoutPercent = 100;
+
+    CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    dynamic_shared_bytes));
+    CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout,
+                                    kCarveoutPercent));
+}
+
+// Configure the fixed-size sampling kernel at most once per (template instantiation, device
+// index) pair. Device indices >= MAX_CACHED_CUDA_DEVICES have no once-flag and are configured
+// on every call. Throws std::runtime_error for negative device indices.
+template <typename dtype, bool use_shared_distances>
+static void configure_polyline_sampling_kernel_once(int device, size_t max_shared_full) {
+    static std::once_flag configure_once[MAX_CACHED_CUDA_DEVICES];
+
+    check_non_negative_cuda_device(device);
+
+    const auto configure = [max_shared_full]() {
+        configure_polyline_sampling_kernel<dtype, use_shared_distances>(max_shared_full);
+    };
+
+    if (device < MAX_CACHED_CUDA_DEVICES) {
+        std::call_once(configure_once[device], configure);
+    } else {
+        configure();
+    }
+}
+
+// Raise the dynamic shared memory limit of the variable-size sampling kernel to
+// `max_shared_full` bytes and request the maximum shared-memory carveout (100 percent) for it.
+// NOTE(review): the attributes are set through the runtime API without selecting a device here,
+// so the caller is presumably expected to have made the target device current -- confirm.
+template <typename dtype, typename sample_size_dtype, bool use_shared_distances>
+static void configure_polyline_sampling_var_batch_kernel(size_t max_shared_full) {
+    const auto kernel =
+        polyline_sampling_fully_shared_var_batch_kernel<dtype, sample_size_dtype, use_shared_distances>;
+    const int dynamic_shared_bytes = static_cast<int>(max_shared_full);
+    constexpr int kCarveoutPercent = 100;
+
+    CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    dynamic_shared_bytes));
+    CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout,
+                                    kCarveoutPercent));
+}
+
+// Configure the variable-size sampling kernel at most once per (template instantiation, device
+// index) pair. Device indices >= MAX_CACHED_CUDA_DEVICES have no once-flag and are configured
+// on every call. Throws std::runtime_error for negative device indices.
+template <typename dtype, typename sample_size_dtype, bool use_shared_distances>
+static void configure_polyline_sampling_var_batch_kernel_once(int device, size_t max_shared_full) {
+    static std::once_flag configure_once[MAX_CACHED_CUDA_DEVICES];
+
+    check_non_negative_cuda_device(device);
+
+    const auto configure = [max_shared_full]() {
+        configure_polyline_sampling_var_batch_kernel<dtype, sample_size_dtype, use_shared_distances>(
+            max_shared_full);
+    };
+
+    if (device < MAX_CACHED_CUDA_DEVICES) {
+        std::call_once(configure_once[device], configure);
+    } else {
+        configure();
+    }
+}
+
+}  // namespace polyline
+
+#endif  // LANE_HELPERS_POLYLINE_SHARED_MEMORY_CONFIG_CUH
diff --git a/packages/lane_helpers/ext_impl/polyline/src/polyline.cpp b/packages/lane_helpers/ext_impl/polyline/src/polyline.cpp
new file mode 100644
index 0000000..a0d82a3
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/src/polyline.cpp
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <pybind11/pybind11.h>
+#include <cstdint>
+#include <limits>
+
+#include <c10/util/Half.h>
+#include <c10/util/BFloat16.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/torch.h>
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include "polyline.cuh"
+#include "helper_macros.cuh"
+
+//#define PROFILE_AND_SYNC
+
+#ifdef PROFILE_AND_SYNC
+#include <nvtx3/nvToolsExt.h>
+#endif
+
+namespace polyline {
+
+#define CHECK_DEVICE(x) check_device(x, #x)
+#define CHECK_CONTIGUOUS(x) check_contiguous(x, #x)
+#define CHECK_TYPE(x) check_type(x, #x)
+#define CHECK_INPUT(x)   \
+    CHECK_DEVICE(x);     \
+    CHECK_CONTIGUOUS(x); \
+    CHECK_TYPE(x);
+// Raise (via TORCH_CHECK) unless `tensor` lives on the CPU or on a CUDA device.
+// `description` names the tensor in the error message.
+inline void check_device(const at::Tensor& tensor, const char* description) {
+    const bool on_supported_device = tensor.is_cpu() || tensor.is_cuda();
+    TORCH_CHECK(on_supported_device, description, " must be on CPU or CUDA");
+}
+
+// Raise (via TORCH_CHECK) unless `tensor` is contiguous.
+// `description` names the tensor in the error message.
+inline void check_contiguous(const at::Tensor& tensor, const char* description) {
+    const bool is_dense = tensor.is_contiguous();
+    TORCH_CHECK(is_dense, description, " must be contiguous");
+}
+
+// Raise (via TORCH_CHECK) unless the dtype of `tensor` is supported on its device:
+// float32 / float64 everywhere, plus float16 / bfloat16 on CUDA only.
+// `description` names the tensor in the error message.
+inline void check_type(const at::Tensor& tensor, const char* description) {
+    const auto dtype = tensor.scalar_type();
+    const bool is_single_or_double = (dtype == torch::kFloat32 || dtype == torch::kFloat64);
+
+    if (!tensor.is_cuda()) {
+        TORCH_CHECK(is_single_or_double, description, " must have dtype float32 or float64 on CPU");
+        return;
+    }
+
+    const bool is_reduced_precision = (dtype == torch::kFloat16 || dtype == torch::kBFloat16);
+    TORCH_CHECK(is_single_or_double || is_reduced_precision, description,
+                " must have dtype float16, float32, float64, or bfloat16 on CUDA");
+}
+
+// Raise (via TORCH_CHECK) with `message` unless both tensors report the same device.
+inline void check_same_device(const at::Tensor& lhs, const at::Tensor& rhs, const char* message) {
+    const bool devices_match = (lhs.device() == rhs.device());
+    TORCH_CHECK(devices_match, message);
+}
+
+// Raise (via TORCH_CHECK) unless `sample_sizes` has dtype int32 or int64.
+// `description` names the tensor in the error message.
+inline void check_sample_size_type(const at::Tensor& sample_sizes, const char* description) {
+    const auto size_dtype = sample_sizes.scalar_type();
+    const bool is_supported_integer = (size_dtype == at::kInt || size_dtype == at::kLong);
+    TORCH_CHECK(is_supported_integer, description, " must have dtype int32 or int64");
+}
+
+// Raise (via TORCH_CHECK) unless every entry of `sample_sizes` lies in [0, max_size].
+// An empty tensor is accepted without inspection. `description` names the tensor in the error
+// message.
+inline void check_sample_sizes(const at::Tensor& sample_sizes, int max_size, const char* description) {
+    if (sample_sizes.numel() == 0) {
+        return;
+    }
+    // Fold both bounds into one mask so only a single reduction result is read back with
+    // `item<bool>()`; for CUDA tensors every such read waits for the device, and the previous
+    // two-step check paid that cost twice on the (common) valid path.
+    const at::Tensor out_of_range = (sample_sizes < 0).logical_or(sample_sizes > max_size);
+    TORCH_CHECK(!out_of_range.any().item<bool>(), description, " values must be in [0, ", max_size, "]");
+}
+
+// Allocate the external distance scratch buffer as a 1-D tensor of `size_elems` elements with
+// the given `options`, or return an undefined tensor when `size_elems` is 0.
+//
+// The scratch memory is deliberately a tensor so that it stays owned by PyTorch's stream-aware
+// allocator: a raw cudaFree could race with the asynchronous custom kernel that uses the buffer.
+at::Tensor make_external_distance_buffer(size_t size_elems, const at::TensorOptions& options) {
+    const bool needs_buffer = (size_elems != 0);
+    if (!needs_buffer) {
+        // No external buffer required: hand back an undefined tensor.
+        return at::Tensor();
+    }
+
+    // Tensor sizes are int64_t, so the element count has to fit into that range.
+    const size_t max_tensor_elems = static_cast<size_t>(std::numeric_limits<int64_t>::max());
+    TORCH_CHECK(size_elems <= max_tensor_elems,
+                "external polyline distance buffer is too large to allocate as a tensor");
+
+    const int64_t buffer_length = static_cast<int64_t>(size_elems);
+    return at::empty({buffer_length}, options);
+}
+
+// Tensor-level entry point for fixed-size polyline interpolation.
+//
+// Args:
+//   points:             (batch, num_points, num_dims) tensor on CPU or CUDA.
+//   distances:          (batch, num_distances) tensor with the same dtype and device as `points`.
+//   relative_distances: forwarded unchanged to the CPU / CUDA implementation.
+//
+// Returns a (batch, num_distances, num_dims) tensor with the dtype and device of `distances`.
+// Raises (via TORCH_CHECK) when the device, dtype or shape requirements above are violated.
+at::Tensor polyline_interpolation(at::Tensor points, at::Tensor distances, bool relative_distances) {
+    CHECK_DEVICE(points);
+    CHECK_DEVICE(distances);
+    CHECK_TYPE(points);
+    CHECK_TYPE(distances);
+    TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, num_points, num_dims)");
+    TORCH_CHECK(distances.ndimension() == 2, "distances must have shape (batch, num_distances)");
+    TORCH_CHECK(points.size(0) == distances.size(0),
+                "points and distances must contain the same number of polylines");
+    TORCH_CHECK(points.scalar_type() == distances.scalar_type(),
+                "points and distances must have the same dtype");
+    check_same_device(points, distances, "points and distances must be on the same device");
+
+    // NOTE(review): the int64_t sizes are narrowed to int here without a range check.
+    const int num_samples = points.size(0);
+    const int num_points = points.size(1);
+    const int num_distances = distances.size(1);
+    const int num_dims = points.size(2);
+    // Result has shape (batch, num_distances, point_dim) and otherwise
+    // matches `distances` (device, dtype).
+    auto res = at::empty({num_samples, num_distances, num_dims}, distances.options());
+    // Nothing to sample: return the (empty) result without dispatching any work.
+    if (num_distances == 0) {
+        return res;
+    }
+    // The implementations take raw pointers, so make sure the data is densely laid out.
+    const at::Tensor points_contiguous = points.contiguous();
+    const at::Tensor distances_contiguous = distances.contiguous();
+
+    if (points.is_cuda()) {
+        AT_DISPATCH_FLOATING_TYPES_AND2(
+            at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_interpolation", [&] {
+                // Make the tensor's device current and pin its current stream for the
+                // allocation and the launch below.
+                const int device = points.get_device();
+                c10::cuda::CUDAGuard device_guard(static_cast<c10::DeviceIndex>(device));
+                const auto stream = at::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(device));
+                at::cuda::CUDAStreamGuard stream_guard(stream);
+                const auto cfg = make_polyline_launch_config<scalar_t>(num_points, num_samples, device);
+                // Allocate under the same stream used for the kernel launch so the caching allocator
+                // does not recycle this temporary scratch buffer before queued kernel work consumes it.
+                const at::Tensor distance_buffer_ext =
+                    make_external_distance_buffer(cfg.distance_buffer_ext_size_elems, points.options());
+                // An undefined tensor means no external buffer is needed; pass nullptr in that case.
+                scalar_t* distance_buffer_ext_ptr =
+                    distance_buffer_ext.defined() ? distance_buffer_ext.data_ptr<scalar_t>() : nullptr;
+                polyline_interpolation<scalar_t>(points_contiguous.data_ptr<scalar_t>(), num_points, num_dims,
+                                                 distances_contiguous.data_ptr<scalar_t>(), num_distances,
+                                                 res.data_ptr<scalar_t>(), num_samples, relative_distances,
+                                                 device, cfg, distance_buffer_ext_ptr, stream.stream());
+                CUDA_CHECK_LAST();
+            });
+    } else {
+        // CPU path: only float32 / float64 are dispatched (see check_type).
+        AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "polyline_interpolation_cpu", [&] {
+            polyline_interpolation_cpu<scalar_t>(points_contiguous.data_ptr<scalar_t>(), num_points, num_dims,
+                                                 distances_contiguous.data_ptr<scalar_t>(), num_distances,
+                                                 res.data_ptr<scalar_t>(), num_samples, relative_distances);
+        });
+    }
+
+    return res;
+}
+
+// Tensor-level entry point for fixed-size polyline length computation.
+//
+// Args:
+//   points: (batch, num_points, num_dims) tensor on CPU or CUDA.
+//
+// Returns a (batch,) tensor with the dtype and device of `points`.
+// Raises (via TORCH_CHECK) when the device, dtype or shape requirements above are violated.
+at::Tensor polyline_lengths(at::Tensor points) {
+    CHECK_DEVICE(points);
+    CHECK_TYPE(points);
+    TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, num_points, num_dims)");
+
+    const int num_samples = points.size(0);
+    const int num_points = points.size(1);
+    const int num_dims = points.size(2);
+    auto res = at::empty({num_samples}, points.options());
+    // An empty batch has no lengths to compute, so skip the dispatch entirely.
+    if (num_samples == 0) {
+        return res;
+    }
+    // The implementations take raw pointers, so make sure the data is densely laid out.
+    const at::Tensor points_contiguous = points.contiguous();
+
+    if (points.is_cuda()) {
+        AT_DISPATCH_FLOATING_TYPES_AND2(
+            at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_lengths", [&] {
+                // Launch on the device that owns `points` (which is not necessarily the current
+                // one) and on that device's current stream, mirroring polyline_interpolation.
+                const int device = points.get_device();
+                c10::cuda::CUDAGuard device_guard(static_cast<c10::DeviceIndex>(device));
+                const auto stream = at::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(device));
+                polyline_lengths<scalar_t>(points_contiguous.data_ptr<scalar_t>(),  // points
+                                           num_points,                              // num_points
+                                           num_dims,                                // num_dims
+                                           res.data_ptr<scalar_t>(),                // lengths
+                                           num_samples,                             // num_samples
+                                           stream.stream()                          // stream
+                );
+                CUDA_CHECK_LAST();
+            });
+    } else {
+        // CPU path: only float32 / float64 are dispatched (see check_type).
+        AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "polyline_lengths_cpu", [&] {
+            polyline_lengths_cpu<scalar_t>(points_contiguous.data_ptr<scalar_t>(),  // points
+                                           num_points,                              // num_points
+                                           num_dims,                                // num_dims
+                                           res.data_ptr<scalar_t>(),                // lengths
+                                           num_samples                              // num_samples
+            );
+        });
+    }
+
+    return res;
+}
+
+// Tensor-level entry point for variable-size polyline interpolation.
+//
+// Args:
+//   points:                           (batch, max_num_points, num_dims) tensor on CPU or CUDA.
+//   distances:                        (batch, max_num_distances) tensor, same dtype and device
+//                                     as `points`.
+//   sample_sizes_points:              1D int32/int64 tensor with one entry per polyline, values
+//                                     in [0, max_num_points], on the same device as `points`.
+//   sample_sizes_distances_to_sample: 1D tensor of the same dtype as `sample_sizes_points` with
+//                                     one entry per polyline, values in [0, max_num_distances],
+//                                     on the same device as `distances`.
+//   relative_distances:               forwarded unchanged to the CPU / CUDA implementation.
+//
+// Returns a (batch, max_num_distances, num_dims) tensor with the dtype and device of `distances`.
+// Raises (via TORCH_CHECK) when any of the requirements above is violated.
+at::Tensor polyline_interpolation_var_size_batch(at::Tensor points, at::Tensor distances,
+                                                 at::Tensor sample_sizes_points,
+                                                 at::Tensor sample_sizes_distances_to_sample,
+                                                 bool relative_distances) {
+    CHECK_DEVICE(points);
+    CHECK_DEVICE(distances);
+    check_device(sample_sizes_points, "points.sample_sizes");
+    check_device(sample_sizes_distances_to_sample, "distances.sample_sizes");
+    CHECK_TYPE(points);
+    CHECK_TYPE(distances);
+    check_sample_size_type(sample_sizes_points, "points.sample_sizes");
+    check_sample_size_type(sample_sizes_distances_to_sample, "distances.sample_sizes");
+
+    TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, max_num_points, num_dims)");
+    TORCH_CHECK(distances.ndimension() == 2, "distances must have shape (batch, max_num_distances)");
+    TORCH_CHECK(points.size(0) == distances.size(0),
+                "points and distances must contain the same number of polylines");
+    TORCH_CHECK(points.scalar_type() == distances.scalar_type(),
+                "points and distances must have the same dtype");
+    check_same_device(points, distances, "points and distances must be on the same device");
+    TORCH_CHECK(sample_sizes_points.scalar_type() == sample_sizes_distances_to_sample.scalar_type(),
+                "points.sample_sizes and distances.sample_sizes must have the same dtype "
+                "(both int32 or both int64)");
+    check_same_device(sample_sizes_points, points,
+                      "points.sample_sizes must be on the same device as points");
+    check_same_device(sample_sizes_distances_to_sample, distances,
+                      "distances.sample_sizes must be on the same device as distances");
+    TORCH_CHECK(sample_sizes_points.ndimension() == 1, "points.sample_sizes must be a 1D tensor");
+    TORCH_CHECK(sample_sizes_distances_to_sample.ndimension() == 1,
+                "distances.sample_sizes must be a 1D tensor");
+
+    // NOTE(review): the int64_t sizes are narrowed to int here without a range check.
+    const int num_samples = points.size(0);
+    const int max_num_points = points.size(1);
+    const int max_num_distances = distances.size(1);
+    const int num_dims = points.size(2);
+    // Result has shape (batch, num_distances, point_dim) and otherwise
+    // matches `distances` (device, dtype).
+    auto res = at::empty({num_samples, max_num_distances, num_dims}, distances.options());
+
+    TORCH_CHECK(sample_sizes_points.size(0) == num_samples,
+                "points.sample_sizes must contain one count per polyline in points");
+    TORCH_CHECK(sample_sizes_distances_to_sample.size(0) == num_samples,
+                "distances.sample_sizes must contain one count per polyline in distances");
+    // Validate the per-sample counts against the padded sizes of the tensors.
+    check_sample_sizes(sample_sizes_points, max_num_points, "points.sample_sizes");
+    check_sample_sizes(sample_sizes_distances_to_sample, max_num_distances, "distances.sample_sizes");
+    // Nothing to sample: return the (empty) result without dispatching any work.
+    if (max_num_distances == 0) {
+        return res;
+    }
+
+    // The implementations take raw pointers, so make sure the data is densely laid out.
+    const at::Tensor points_contiguous = points.contiguous();
+    const at::Tensor distances_contiguous = distances.contiguous();
+    const at::Tensor sample_sizes_points_contiguous = sample_sizes_points.contiguous();
+    const at::Tensor sample_sizes_distances_to_sample_contiguous =
+        sample_sizes_distances_to_sample.contiguous();
+
+    // Generic lambda instantiated once per sample-size integer type; the argument is only a
+    // type tag, its value is never used.
+    auto launch = [&](auto sample_size_type_tag) {
+        using sample_size_t = decltype(sample_size_type_tag);
+        if (points.is_cuda()) {
+            AT_DISPATCH_FLOATING_TYPES_AND2(
+                at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_interpolation_var_size_batch", [&] {
+                    // Make the tensor's device current and pin its current stream for the
+                    // allocation and the launch below.
+                    const int device = points.get_device();
+                    c10::cuda::CUDAGuard device_guard(static_cast<c10::DeviceIndex>(device));
+                    const auto stream = at::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(device));
+                    at::cuda::CUDAStreamGuard stream_guard(stream);
+                    const auto cfg =
+                        make_polyline_launch_config<scalar_t>(max_num_points, num_samples, device);
+                    // Allocate under the same stream used for the kernel launch so the caching allocator
+                    // does not recycle this temporary scratch buffer before queued kernel work consumes it.
+                    const at::Tensor distance_buffer_ext =
+                        make_external_distance_buffer(cfg.distance_buffer_ext_size_elems, points.options());
+                    // An undefined tensor means no external buffer is needed; pass nullptr then.
+                    scalar_t* distance_buffer_ext_ptr =
+                        distance_buffer_ext.defined() ? distance_buffer_ext.data_ptr<scalar_t>() : nullptr;
+                    polyline_interpolation_var_size_batch<scalar_t, sample_size_t>(
+                        points_contiguous.data_ptr<scalar_t>(),                    // points
+                        max_num_points,                                            // max_num_points
+                        num_dims,                                                  // num_dims
+                        distances_contiguous.data_ptr<scalar_t>(),                 // distances
+                        max_num_distances,                                         // num_distances
+                        res.data_ptr<scalar_t>(),                                  // result_points
+                        num_samples,                                               // num_samples
+                        sample_sizes_points_contiguous.data_ptr<sample_size_t>(),  // sample_sizes_points
+                        sample_sizes_distances_to_sample_contiguous
+                            .data_ptr<sample_size_t>(),  // sample sizes distances
+                        relative_distances,              // relative_distances
+                        device,                          // device
+                        cfg,                             // launch config
+                        distance_buffer_ext_ptr,         // distance_buffer_ext
+                        stream.stream()                  // stream
+                    );
+                    CUDA_CHECK_LAST();
+                });
+        } else {
+            // CPU path: only float32 / float64 are dispatched (see check_type).
+            AT_DISPATCH_FLOATING_TYPES(
+                points.scalar_type(), "polyline_interpolation_var_size_batch_cpu", [&] {
+                    polyline_interpolation_var_size_batch_cpu<scalar_t, sample_size_t>(
+                        points_contiguous.data_ptr<scalar_t>(),                    // points
+                        max_num_points,                                            // max_num_points
+                        num_dims,                                                  // num_dims
+                        distances_contiguous.data_ptr<scalar_t>(),                 // distances
+                        max_num_distances,                                         // num_distances
+                        res.data_ptr<scalar_t>(),                                  // result_points
+                        num_samples,                                               // num_samples
+                        sample_sizes_points_contiguous.data_ptr<sample_size_t>(),  // sample_sizes_points
+                        sample_sizes_distances_to_sample_contiguous
+                            .data_ptr<sample_size_t>(),  // sample sizes distances
+                        relative_distances               // relative_distances
+                    );
+                });
+        }
+    };
+    // Both size tensors share one dtype (checked above), so dispatching on one is sufficient.
+    if (sample_sizes_points.scalar_type() == at::kInt) {
+        launch(int32_t{});
+    } else {
+        launch(int64_t{});
+    }
+
+    return res;
+}
+
+// Tensor-level entry point for variable-size polyline length computation.
+//
+// Args:
+//   points:              (batch, max_num_points, num_dims) tensor on CPU or CUDA.
+//   sample_sizes_points: 1D int32/int64 tensor with one entry per polyline, values in
+//                        [0, max_num_points], on the same device as `points`.
+//
+// Returns a (batch,) tensor with the dtype and device of `points`.
+// Raises (via TORCH_CHECK) when any of the requirements above is violated.
+at::Tensor polyline_lengths_var_size_batch(at::Tensor points, at::Tensor sample_sizes_points) {
+    CHECK_DEVICE(points);
+    check_device(sample_sizes_points, "points.sample_sizes");
+    CHECK_TYPE(points);
+    check_sample_size_type(sample_sizes_points, "points.sample_sizes");
+
+    TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, max_num_points, num_dims)");
+    TORCH_CHECK(sample_sizes_points.ndimension() == 1, "points.sample_sizes must be a 1D tensor");
+    check_same_device(sample_sizes_points, points,
+                      "points.sample_sizes must be on the same device as points");
+
+    const int num_samples = points.size(0);
+    const int max_num_points = points.size(1);
+    const int num_dims = points.size(2);
+    auto res = at::empty({num_samples}, points.options());
+
+    TORCH_CHECK(sample_sizes_points.size(0) == num_samples,
+                "points.sample_sizes must contain one count per polyline in points");
+    check_sample_sizes(sample_sizes_points, max_num_points, "points.sample_sizes");
+    // An empty batch has no lengths to compute, so skip the dispatch entirely.
+    if (num_samples == 0) {
+        return res;
+    }
+
+    // The implementations take raw pointers, so make sure the data is densely laid out.
+    const at::Tensor points_contiguous = points.contiguous();
+    const at::Tensor sample_sizes_points_contiguous = sample_sizes_points.contiguous();
+
+    // Generic lambda instantiated once per sample-size integer type; the argument is only a
+    // type tag, its value is never used.
+    auto launch = [&](auto sample_size_type_tag) {
+        using sample_size_t = decltype(sample_size_type_tag);
+        if (points.is_cuda()) {
+            AT_DISPATCH_FLOATING_TYPES_AND2(
+                at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_lengths_var_size_batch", [&] {
+                    // Launch on the device that owns `points` (which is not necessarily the
+                    // current one) and on that device's current stream, mirroring
+                    // polyline_interpolation_var_size_batch.
+                    const int device = points.get_device();
+                    c10::cuda::CUDAGuard device_guard(static_cast<c10::DeviceIndex>(device));
+                    const auto stream = at::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(device));
+                    polyline_lengths_var_size_batch<scalar_t, sample_size_t>(
+                        points_contiguous.data_ptr<scalar_t>(),                    // points
+                        max_num_points,                                            // max_num_points
+                        num_dims,                                                  // num_dims
+                        res.data_ptr<scalar_t>(),                                  // lengths
+                        num_samples,                                               // num_samples
+                        sample_sizes_points_contiguous.data_ptr<sample_size_t>(),  // sample_sizes_points
+                        stream.stream()                                            // stream
+                    );
+                    CUDA_CHECK_LAST();
+                });
+        } else {
+            // CPU path: only float32 / float64 are dispatched (see check_type).
+            AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "polyline_lengths_var_size_batch_cpu", [&] {
+                polyline_lengths_var_size_batch_cpu<scalar_t, sample_size_t>(
+                    points_contiguous.data_ptr<scalar_t>(),                   // points
+                    max_num_points,                                           // max_num_points
+                    num_dims,                                                 // num_dims
+                    res.data_ptr<scalar_t>(),                                 // lengths
+                    num_samples,                                              // num_samples
+                    sample_sizes_points_contiguous.data_ptr<sample_size_t>()  // sample_sizes_points
+                );
+            });
+        }
+    };
+    if (sample_sizes_points.scalar_type() == at::kInt) {
+        launch(int32_t{});
+    } else {
+        launch(int64_t{});
+    }
+
+    return res;
+}
+
+}  // namespace polyline
+
+namespace py = pybind11;
+using namespace polyline;
+
+// Python bindings. The polyline names are overloaded (the tensor-level functions above share
+// their names with the templated implementations called from them), so each binding selects
+// the tensor-level overload explicitly through a function-pointer type.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    using InterpolationFn = at::Tensor (*)(at::Tensor, at::Tensor, bool);
+    using LengthsFn = at::Tensor (*)(at::Tensor);
+    using InterpolationVarSizeFn = at::Tensor (*)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, bool);
+    using LengthsVarSizeFn = at::Tensor (*)(at::Tensor, at::Tensor);
+
+    m.doc() = "Lane helpers polyline interpolation bindings";
+
+    // Public entry point.
+    m.def("polyline_interpolation", static_cast<InterpolationFn>(&polyline_interpolation),
+          py::arg("points"), py::arg("distances"), py::arg("relative") = false,
+          "Interpolate points along polylines at given distances.");
+
+    // Internal (underscore-prefixed) tensor-only entry points.
+    m.def("_polyline_lengths", static_cast<LengthsFn>(&polyline_lengths), py::arg("points"),
+          "Internal tensor-only entry point for fixed-size polyline length computation.");
+    m.def("_polyline_interpolation_var_size_batch",
+          static_cast<InterpolationVarSizeFn>(&polyline_interpolation_var_size_batch),
+          py::arg("points"), py::arg("distances"), py::arg("sample_sizes_points"),
+          py::arg("sample_sizes_distances_to_sample"), py::arg("relative") = false,
+          "Internal tensor-only entry point for variable-length polyline interpolation.");
+    m.def("_polyline_lengths_var_size_batch",
+          static_cast<LengthsVarSizeFn>(&polyline_lengths_var_size_batch), py::arg("points"),
+          py::arg("sample_sizes_points"),
+          "Internal tensor-only entry point for variable-length polyline length computation.");
+}
\ No newline at end of file
diff --git a/packages/lane_helpers/ext_impl/polyline/src/polyline.cu b/packages/lane_helpers/ext_impl/polyline/src/polyline.cu
new file mode 100644
index 0000000..7d62499
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/src/polyline.cu
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <limits>
+#include <cstdint>
+
+#include <cuda_runtime.h>
+#include <cub/cub.cuh>
+#include <c10/util/Half.h>
+#include <c10/util/BFloat16.h>
+
+#include "helper_macros.cuh"
+
+#include "polyline.cuh"
+#include "polyline_kernels.cuh"
+#include "polyline_shared_memory_config.cuh"
+
+namespace polyline {
+
+// Return the largest power of two that is <= n (e.g. 5 -> 4, 8 -> 8, 1 -> 1).
+// Non-positive n (zero or negative) has no such power of two and yields 0.
+static inline int last_power_of_2(int n) {
+    if (n <= 0) {
+        return 0;
+    }
+    unsigned int v = static_cast<unsigned int>(n);
+    // Smear the highest set bit into every lower bit, so v becomes 2^(k+1) - 1.
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;  // shifts of 1, 2, 4, 8 and 16 are enough for a 32-bit value
+    // v + 1 is then 2^(k+1); halving it gives 2^k, the highest power of two <= n.
+    const int power_of_two = static_cast<int>((v + 1u) >> 1);
+    return power_of_two;
+}
+
+template <typename dtype>
+struct PolylineLengthLaunchConfig {
+    dim3 block_dim;  // block dimensions used when launching the polyline length kernels
+    dim3 grid_dim;  // grid dimensions used when launching the polyline length kernels
+    size_t shared_mem_size;  // dynamic shared memory for the launch, in bytes
+};
+
+static int polyline_launch_threads_x(int num_samples_per_block) {
+    const int max_num_threads = 1024;  // thread budget that is divided among the samples of one block
+
+    const int max_threads_x_for_y = max_num_threads / num_samples_per_block;  // divisor must be non-zero
+    // Round down to a multiple of 32, but keep at least one warp.
+    int threads_x = (max_threads_x_for_y / 32) * 32;
+    if (threads_x < 32) {
+        threads_x = 32;  // with more than 32 samples per block, x * y then exceeds max_num_threads
+    }
+    return threads_x;
+}
+
+template <typename dtype>
+static PolylineLaunchConfig<dtype> make_polyline_launch_config_for_y(int num_points, int num_samples,
+                                                                     int num_samples_per_block) {
+    const int threads_x = polyline_launch_threads_x(num_samples_per_block);
+
+    const dim3 block_dim(threads_x, num_samples_per_block, 1);  // x: threads, y: samples per block
+    const dim3 grid_dim(1, (num_samples + block_dim.y - 1) / block_dim.y, 1);  // ceil-div over samples
+    const int num_points_full_blocks = ((num_points + block_dim.x - 1) / block_dim.x) * block_dim.x;
+    const int num_warps_per_sample = (block_dim.x + 31) / 32;  // ceil(block_dim.x / 32)
+    const size_t scratch_buffer_size_elems = block_dim.y * (num_warps_per_sample + 1);
+    const size_t distances_buffer_size_elems_shared = static_cast<size_t>(block_dim.y) * num_points;
+
+    PolylineLaunchConfig<dtype> cfg;
+    cfg.block_dim = block_dim;
+    cfg.grid_dim = grid_dim;
+    cfg.num_points_full_blocks = num_points_full_blocks;  // num_points rounded up to a multiple of x
+    cfg.shared_mem_size = (distances_buffer_size_elems_shared + scratch_buffer_size_elems) * sizeof(dtype);
+    cfg.distance_buffer_ext_size_elems = 0;  // no external buffer while distances are held in shared memory
+    cfg.use_shared_distances = true;  // provisional; make_polyline_launch_config may switch this off
+    cfg.max_shared_full = 0;  // placeholder; make_polyline_launch_config fills in the device limit
+    return cfg;
+}
+
+template <typename dtype>
+static size_t polyline_external_distance_buffer_size_elems(const PolylineLaunchConfig<dtype>& cfg,
+                                                           int num_points) {
+    const size_t buffer_size_elems = static_cast<size_t>(cfg.grid_dim.y) * cfg.block_dim.y * num_points;
+    return buffer_size_elems;  // element count (not bytes): num_points for each grid-y * block-y slot
+}
+
+template <typename dtype>
+static size_t polyline_scratch_shared_mem_size(const PolylineLaunchConfig<dtype>& cfg) {
+    const int num_warps_per_sample = (cfg.block_dim.x + 31) / 32;  // ceil(block_dim.x / 32)
+    const size_t shared_mem_size =
+        static_cast<size_t>(cfg.block_dim.y) * (num_warps_per_sample + 1) * sizeof(dtype);
+    return shared_mem_size;  // bytes; same scratch term that make_polyline_launch_config_for_y adds
+}
+
+template <typename dtype>
+PolylineLaunchConfig<dtype> make_polyline_launch_config(int num_points, int num_samples, int device) {
+    // Keep blockDim.y at 1 so blockDim.x can use the full thread block for each sample.
+    PolylineLaunchConfig<dtype> cfg = make_polyline_launch_config_for_y<dtype>(num_points, num_samples, 1);
+
+    // Stay on the shared-memory path whenever the full request fits within the
+    // device's opt-in limit (`sharedMemPerBlockOptin`); only otherwise fall back
+    // to the external distance buffer.
+    const size_t max_shared_full = polyline_max_shared_full_for_device(device);
+
+    if (cfg.shared_mem_size <= max_shared_full) {
+        cfg.max_shared_full = max_shared_full;
+        return cfg;  // distances and scratch both fit; keep the all-shared configuration
+    }
+
+    cfg.shared_mem_size = polyline_scratch_shared_mem_size(cfg);  // keep only the scratch part shared
+    cfg.use_shared_distances = false;
+    cfg.max_shared_full = max_shared_full;
+    cfg.distance_buffer_ext_size_elems = polyline_external_distance_buffer_size_elems(cfg, num_points);
+    return cfg;  // caller must supply an external buffer of distance_buffer_ext_size_elems elements
+}
+
+template <typename dtype>
+static PolylineLengthLaunchConfig<dtype> make_polyline_length_launch_config(int num_samples) {
+    const int max_num_threads = 1024;
+    const int max_y_by_threads = max_num_threads / 32;  // assuming at least one warp in x
+    const int max_y_candidate = min(num_samples, max_y_by_threads);
+    int num_samples_per_block = last_power_of_2(max_y_candidate);  // power of two <= 32, or 0
+    if (num_samples_per_block < 1) {
+        num_samples_per_block = 1;  // last_power_of_2 returns 0 when num_samples <= 0
+    }
+
+    const int max_threads_x_for_y = max_num_threads / num_samples_per_block;
+    int threads_x = (max_threads_x_for_y / 32) * 32;  // round down to a whole number of warps
+    if (threads_x < 32) {
+        threads_x = 32;  // defensive: with at most 32 samples per block this is already >= 32
+    }
+
+    const dim3 block_dim(threads_x, num_samples_per_block, 1);
+    const dim3 grid_dim(1, (num_samples + block_dim.y - 1) / block_dim.y, 1);  // ceil-div over samples
+    const int num_warps_per_sample = (block_dim.x + 31) / 32;
+
+    PolylineLengthLaunchConfig<dtype> cfg;
+    cfg.block_dim = block_dim;
+    cfg.grid_dim = grid_dim;
+    cfg.shared_mem_size = static_cast<size_t>(block_dim.y) * num_warps_per_sample * sizeof(dtype);
+    return cfg;
+}
+
+template <typename dtype>
+void polyline_interpolation(dtype* points, int num_points, int num_dims, dtype* distances, int num_distances,
+                            dtype* result_points, int num_samples, bool relative_distances, int device,
+                            const PolylineLaunchConfig<dtype>& cfg, dtype* distance_buffer_ext,
+                            cudaStream_t stream) {
+    if (cfg.use_shared_distances) {  // cfg.shared_mem_size was sized to include the distance buffer
+        configure_polyline_sampling_kernel_once<dtype, true>(device, cfg.max_shared_full);
+        polyline_sampling_fully_shared_kernel<dtype, true>
+            <<<cfg.grid_dim, cfg.block_dim, cfg.shared_mem_size, stream>>>(
+                points, distances, result_points, num_points, cfg.num_points_full_blocks, num_dims,
+                num_distances, num_samples, relative_distances, nullptr);  // no external buffer passed
+    } else {  // fallback: the kernel receives distance_buffer_ext; no configure call on this path
+        polyline_sampling_fully_shared_kernel<dtype, false>
+            <<<cfg.grid_dim, cfg.block_dim, cfg.shared_mem_size, stream>>>(
+                points, distances, result_points, num_points, cfg.num_points_full_blocks, num_dims,
+                num_distances, num_samples, relative_distances, distance_buffer_ext);
+    }
+    CUDA_CHECK_LAST();  // macro defined outside this file; presumably reports launch errors
+}
+
+template <typename dtype>
+void polyline_lengths(dtype* points, int num_points, int num_dims, dtype* lengths, int num_samples,
+                      cudaStream_t stream) {
+    auto cfg = make_polyline_length_launch_config<dtype>(num_samples);  // depends only on num_samples
+    polyline_lengths_kernel<dtype><<<cfg.grid_dim, cfg.block_dim, cfg.shared_mem_size, stream>>>(
+        points, lengths, num_points, num_dims, num_samples);
+    CUDA_CHECK_LAST();  // macro defined outside this file; presumably reports launch errors
+}
+
+template <typename dtype, typename sample_size_dtype>
+void polyline_interpolation_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* distances,
+                                           int num_distances, dtype* result_points, int num_samples,
+                                           sample_size_dtype* sample_sizes_points,
+                                           sample_size_dtype* sample_sizes_distances_to_sample,
+                                           bool relative_distances, int device,
+                                           const PolylineLaunchConfig<dtype>& cfg, dtype* distance_buffer_ext,
+                                           cudaStream_t stream) {
+    if (cfg.use_shared_distances) {  // cfg.shared_mem_size was sized to include the distance buffer
+        configure_polyline_sampling_var_batch_kernel_once<dtype, sample_size_dtype, true>(
+            device, cfg.max_shared_full);
+        polyline_sampling_fully_shared_var_batch_kernel<dtype, sample_size_dtype, true>
+            <<<cfg.grid_dim, cfg.block_dim, cfg.shared_mem_size, stream>>>(
+                points, distances, result_points, max_num_points, cfg.num_points_full_blocks, num_dims,
+                num_distances, num_samples, sample_sizes_points, sample_sizes_distances_to_sample,
+                relative_distances, nullptr);  // no external buffer passed
+    } else {  // fallback: the kernel receives distance_buffer_ext; no configure call on this path
+        polyline_sampling_fully_shared_var_batch_kernel<dtype, sample_size_dtype, false>
+            <<<cfg.grid_dim, cfg.block_dim, cfg.shared_mem_size, stream>>>(
+                points, distances, result_points, max_num_points, cfg.num_points_full_blocks, num_dims,
+                num_distances, num_samples, sample_sizes_points, sample_sizes_distances_to_sample,
+                relative_distances, distance_buffer_ext);
+    }
+    CUDA_CHECK_LAST();  // macro defined outside this file; presumably reports launch errors
+}
+
+template <typename dtype, typename sample_size_dtype>
+void polyline_lengths_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* lengths,
+                                     int num_samples, sample_size_dtype* sample_sizes_points,
+                                     cudaStream_t stream) {
+    auto cfg = make_polyline_length_launch_config<dtype>(num_samples);  // depends only on num_samples
+    polyline_lengths_var_batch_kernel<dtype, sample_size_dtype>
+        <<<cfg.grid_dim, cfg.block_dim, cfg.shared_mem_size, stream>>>(
+            points, lengths, max_num_points, num_dims, num_samples, sample_sizes_points);
+    CUDA_CHECK_LAST();  // macro defined outside this file; presumably reports launch errors
+}
+
+#define INSTANTIATE_POLYLINE_INTERPOLATION(DTYPE)                                          \
+    template void polyline_interpolation<DTYPE>(                                           \
+        DTYPE * points, int num_points, int num_dims, DTYPE* distances, int num_distances, \
+        DTYPE* result_points, int num_samples, bool relative_distances, int device,        \
+        const PolylineLaunchConfig<DTYPE>& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream);
+
+#define INSTANTIATE_POLYLINE_LAUNCH_CONFIG(DTYPE)                                                            \
+    template PolylineLaunchConfig<DTYPE> make_polyline_launch_config<DTYPE>(int num_points, int num_samples, \
+                                                                            int device);
+
+#define INSTANTIATE_POLYLINE_LENGTHS(DTYPE)                                                             \
+    template void polyline_lengths<DTYPE>(DTYPE * points, int num_points, int num_dims, DTYPE* lengths, \
+                                          int num_samples, cudaStream_t stream);
+
+#define INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH(DTYPE, SAMPLE_SIZE_DTYPE)               \
+    template void polyline_interpolation_var_size_batch<DTYPE, SAMPLE_SIZE_DTYPE>(                \
+        DTYPE * points, int max_num_points, int num_dims, DTYPE* distances, int num_distances,    \
+        DTYPE* result_points, int num_samples, SAMPLE_SIZE_DTYPE* sample_sizes_points,            \
+        SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances, int device, \
+        const PolylineLaunchConfig<DTYPE>& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream);
+
+#define INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH(DTYPE, SAMPLE_SIZE_DTYPE)              \
+    template void polyline_lengths_var_size_batch<DTYPE, SAMPLE_SIZE_DTYPE>(               \
+        DTYPE * points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \
+        SAMPLE_SIZE_DTYPE* sample_sizes_points, cudaStream_t stream);
+
+#define INSTANTIATE_POLYLINE_CUDA_DTYPE(DTYPE)                        \
+    INSTANTIATE_POLYLINE_LAUNCH_CONFIG(DTYPE)                         \
+    INSTANTIATE_POLYLINE_INTERPOLATION(DTYPE)                         \
+    INSTANTIATE_POLYLINE_LENGTHS(DTYPE)                               \
+    INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH(DTYPE, int)     \
+    INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH(DTYPE, int64_t) \
+    INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH(DTYPE, int)           \
+    INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH(DTYPE, int64_t)
+
+INSTANTIATE_POLYLINE_CUDA_DTYPE(float)
+INSTANTIATE_POLYLINE_CUDA_DTYPE(double)
+INSTANTIATE_POLYLINE_CUDA_DTYPE(c10::Half)
+INSTANTIATE_POLYLINE_CUDA_DTYPE(c10::BFloat16)
+
+#undef INSTANTIATE_POLYLINE_CUDA_DTYPE
+#undef INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH
+#undef INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH
+#undef INSTANTIATE_POLYLINE_LENGTHS
+#undef INSTANTIATE_POLYLINE_LAUNCH_CONFIG
+#undef INSTANTIATE_POLYLINE_INTERPOLATION
+}  // namespace polyline
\ No newline at end of file
diff --git a/packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp b/packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp
new file mode 100644
index 0000000..f0d7cd0
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <vector>
+
+#include <ATen/AccumulateType.h>
+#include <ATen/Parallel.h>
+
+#include "polyline_common.cuh"
+
+namespace polyline {
+
+template <typename dtype>
+using cpu_acc_t = at::acc_type<dtype, false>;
+
+template <typename dtype>
+static void compute_accumulated_distances_cpu(const dtype* points_sample, int num_points, int num_dims,
+                                              cpu_acc_t<dtype>* accum_distances) {
+    using acc_t = cpu_acc_t<dtype>;
+    accum_distances[0] = static_cast<acc_t>(0.0);  // written unconditionally: buffer needs >= 1 element
+    for (int point_idx = 0; point_idx < num_points - 1; ++point_idx) {  // prefix sum over the segments
+        accum_distances[point_idx + 1] =
+            accum_distances[point_idx] +
+            compute_segment_length_common<dtype, acc_t>(points_sample, point_idx, num_dims);
+    }
+}
+
+template <typename dtype>
+static void sample_polyline_cpu(const dtype* points_sample, const dtype* distances_sample, int num_points,
+                                int num_dims, int num_distances, dtype* result_sample,
+                                bool relative_distances, std::vector<cpu_acc_t<dtype>>& accum_distances) {
+    using acc_t = cpu_acc_t<dtype>;
+    if (num_distances == 0) {  // nothing requested, so nothing is written
+        return;
+    }
+    if (num_points == 0) {  // empty polyline: every requested output point is filled by the NaN helper
+        dtype* result_sample_i = result_sample;
+        for (int distance_idx = 0; distance_idx < num_distances;
+             ++distance_idx, result_sample_i += num_dims) {
+            fill_point_with_nan_common<dtype>(result_sample_i, num_dims);
+        }
+        return;
+    }
+    compute_accumulated_distances_cpu<dtype>(points_sample, num_points, num_dims, accum_distances.data());
+    const acc_t total_length = accum_distances[num_points - 1];  // last prefix-sum entry
+    for (int distance_idx = 0; distance_idx < num_distances; ++distance_idx) {
+        const acc_t distance_to_sample =  // relative mode scales the input distance by total_length
+            relative_distances ? static_cast<acc_t>(distances_sample[distance_idx]) * total_length
+                               : static_cast<acc_t>(distances_sample[distance_idx]);
+        sample_at_distance_common<dtype, acc_t>(points_sample, accum_distances.data(), distance_to_sample,
+                                                num_points, num_dims,
+                                                result_sample + distance_idx * num_dims);
+    }
+}
+
+template <typename dtype>
+void polyline_interpolation_cpu(const dtype* points, int num_points, int num_dims, const dtype* distances,
+                                int num_distances, dtype* result_points, int num_samples,
+                                bool relative_distances) {
+    using acc_t = cpu_acc_t<dtype>;
+    const size_t stride_points = static_cast<size_t>(num_points) * static_cast<size_t>(num_dims);
+    const size_t stride_distances = static_cast<size_t>(num_distances);  // per-sample strides, in elements
+    const size_t stride_result = static_cast<size_t>(num_distances) * static_cast<size_t>(num_dims);
+    at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) {  // one call per sample sub-range
+        std::vector<acc_t> accum_distances(num_points);  // scratch reused by every sample of this sub-range
+        for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) {
+            const dtype* points_sample = points + sample_idx * stride_points;
+            const dtype* distances_sample = distances + sample_idx * stride_distances;
+            dtype* result_sample = result_points + sample_idx * stride_result;
+            sample_polyline_cpu<dtype>(points_sample, distances_sample, num_points, num_dims, num_distances,
+                                       result_sample, relative_distances, accum_distances);
+        }
+    });
+}
+
+template <typename dtype>
+void polyline_lengths_cpu(const dtype* points, int num_points, int num_dims, dtype* lengths,
+                          int num_samples) {
+    using acc_t = cpu_acc_t<dtype>;
+    const size_t stride_points = static_cast<size_t>(num_points) * static_cast<size_t>(num_dims);
+    at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) {  // one call per sample sub-range
+        for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) {
+            const dtype* points_sample = points + sample_idx * stride_points;
+            acc_t length = static_cast<acc_t>(0.0);
+            if (num_points == 0) {
+                length = polyline_nan<acc_t>();  // empty polyline: value comes from polyline_nan (not shown)
+            } else {
+                for (int point_idx = 0; point_idx < num_points - 1; ++point_idx) {
+                    length += compute_segment_length_common<dtype, acc_t>(points_sample, point_idx, num_dims);
+                }
+            }
+            lengths[sample_idx] = static_cast<dtype>(length);  // summed in acc_t, converted once on store
+        }
+    });
+}
+
+template <typename dtype, typename sample_size_dtype>
+void polyline_interpolation_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims,
+                                               const dtype* distances, int num_distances,
+                                               dtype* result_points, int num_samples,
+                                               const sample_size_dtype* sample_sizes_points,
+                                               const sample_size_dtype* sample_sizes_distances_to_sample,
+                                               bool relative_distances) {
+    using acc_t = cpu_acc_t<dtype>;
+    at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) {  // one call per sample sub-range
+        std::vector<acc_t> accum_distances(max_num_points);  // assumes each sample size <= max_num_points
+        for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) {  // samples use padded strides
+            const int curr_num_points = static_cast<int>(sample_sizes_points[sample_idx]);
+            const int curr_num_distances = static_cast<int>(sample_sizes_distances_to_sample[sample_idx]);
+            const dtype* points_sample = points + sample_idx * max_num_points * num_dims;
+            const dtype* distances_sample = distances + sample_idx * num_distances;
+            dtype* result_sample = result_points + sample_idx * num_distances * num_dims;
+            sample_polyline_cpu<dtype>(points_sample, distances_sample, curr_num_points, num_dims,
+                                       curr_num_distances, result_sample, relative_distances,
+                                       accum_distances);  // writes only the first curr_num_distances outputs
+        }
+    });
+}
+
+template <typename dtype, typename sample_size_dtype>
+void polyline_lengths_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims,
+                                         dtype* lengths, int num_samples,
+                                         const sample_size_dtype* sample_sizes_points) {
+    using acc_t = cpu_acc_t<dtype>;
+    at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) {  // one call per sample sub-range
+        for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) {  // samples use padded strides
+            const int curr_num_points = static_cast<int>(sample_sizes_points[sample_idx]);
+            const dtype* points_sample = points + sample_idx * max_num_points * num_dims;
+            acc_t length = static_cast<acc_t>(0.0);
+            if (curr_num_points == 0) {
+                length = polyline_nan<acc_t>();  // empty polyline: value comes from polyline_nan (not shown)
+            } else {
+                for (int point_idx = 0; point_idx < curr_num_points - 1; ++point_idx) {
+                    length += compute_segment_length_common<dtype, acc_t>(points_sample, point_idx, num_dims);
+                }
+            }
+            lengths[sample_idx] = static_cast<dtype>(length);  // summed in acc_t, converted once on store
+        }
+    });
+}
+
+#define INSTANTIATE_POLYLINE_INTERPOLATION_CPU(DTYPE)                                                 \
+    template void polyline_interpolation_cpu<DTYPE>(                                                  \
+        const DTYPE* points, int num_points, int num_dims, const DTYPE* distances, int num_distances, \
+        DTYPE* result_points, int num_samples, bool relative_distances);
+
+#define INSTANTIATE_POLYLINE_LENGTHS_CPU(DTYPE)                                                  \
+    template void polyline_lengths_cpu<DTYPE>(const DTYPE* points, int num_points, int num_dims, \
+                                              DTYPE* lengths, int num_samples);
+
+#define INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU(DTYPE, SAMPLE_SIZE_DTYPE)                   \
+    template void polyline_interpolation_var_size_batch_cpu<DTYPE, SAMPLE_SIZE_DTYPE>(                    \
+        const DTYPE* points, int max_num_points, int num_dims, const DTYPE* distances, int num_distances, \
+        DTYPE* result_points, int num_samples, const SAMPLE_SIZE_DTYPE* sample_sizes_points,              \
+        const SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances);
+
+#define INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU(DTYPE, SAMPLE_SIZE_DTYPE)               \
+    template void polyline_lengths_var_size_batch_cpu<DTYPE, SAMPLE_SIZE_DTYPE>(                \
+        const DTYPE* points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \
+        const SAMPLE_SIZE_DTYPE* sample_sizes_points);
+
+#define INSTANTIATE_POLYLINE_CPU_DTYPE(DTYPE)                             \
+    INSTANTIATE_POLYLINE_INTERPOLATION_CPU(DTYPE)                         \
+    INSTANTIATE_POLYLINE_LENGTHS_CPU(DTYPE)                               \
+    INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU(DTYPE, int)     \
+    INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU(DTYPE, int64_t) \
+    INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU(DTYPE, int)           \
+    INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU(DTYPE, int64_t)
+
+INSTANTIATE_POLYLINE_CPU_DTYPE(float)
+INSTANTIATE_POLYLINE_CPU_DTYPE(double)
+
+#undef INSTANTIATE_POLYLINE_CPU_DTYPE
+#undef INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU
+#undef INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU
+#undef INSTANTIATE_POLYLINE_LENGTHS_CPU
+#undef INSTANTIATE_POLYLINE_INTERPOLATION_CPU
+
+}  // namespace polyline
diff --git a/packages/lane_helpers/pyproject.toml b/packages/lane_helpers/pyproject.toml
new file mode 100644
index 0000000..41b2940
--- /dev/null
+++ b/packages/lane_helpers/pyproject.toml
@@ -0,0 +1,35 @@
+[build-system]
+requires = [
+    "setuptools>=64",
+    "wheel",
+    "scikit-build>=0.17.0",
+    "pybind11>=2.10.0",
+    "setuptools-scm>=8",
+    "accvlab-build-config @ file:../../build_config",
+]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "accvlab.lane_helpers"
+dynamic = ["version"]
+description = "Lane helper utilities for ACCV-Lab."
+requires-python = ">=3.8"
+dependencies = [
+    "torch>=2.0.0",
+    "numpy>=1.22.2",
+]
+
+[project.optional-dependencies]
+optional = [
+    "matplotlib",
+    "pytest",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["accvlab.lane_helpers*"]
+
+[tool.setuptools_scm]
+version_scheme = "no-guess-dev"
+fallback_version = "0.0.0"
+root = "../.."
diff --git a/packages/lane_helpers/setup.py b/packages/lane_helpers/setup.py
new file mode 100644
index 0000000..557ed99
--- /dev/null
+++ b/packages/lane_helpers/setup.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from skbuild import setup
+from setuptools import find_namespace_packages
+
+_ACCVLAB_BUILD_CONFIG_IMPORT_ERROR = """
+#########################################################################################
+# Missing build dependency: accvlab-build-config.                                       #
+#                                                                                       #
+# ACCV-Lab package builds normally use --no-build-isolation, so the shared build helper #
+# must already be installed in the active environment. Install it first with:           #
+#                                                                                       #
+#     pip install <ACCV-Lab root>/build_config                                          #
+#                                                                                       #
+# and retry.                                                                            #
+#                                                                                       #
+# Alternatively, use <ACCV-Lab root>/scripts/package_manager.sh to install packages in  #
+# the documented order.                                                                 #
+#########################################################################################
+"""
+
+try:
+    from accvlab_build_config import build_cmake_args
+except ModuleNotFoundError as exc:
+    if exc.name != "accvlab_build_config":
+        raise  # a different module is missing; keep the original error untouched
+    raise RuntimeError(_ACCVLAB_BUILD_CONFIG_IMPORT_ERROR) from exc  # swap in the install instructions
+
+_cmake_args = build_cmake_args()
+
+
+setup(
+    name="accvlab.lane_helpers",
+    description="Lane helper utilities for ACCV-Lab.",
+    packages=find_namespace_packages(include=["accvlab.lane_helpers*"]),
+    include_package_data=True,
+    zip_safe=False,
+    cmake_source_dir="ext_impl",
+    cmake_install_dir="accvlab/lane_helpers",
+    cmake_args=_cmake_args,
+)
diff --git a/packages/lane_helpers/tests/polyline_test_utils.py b/packages/lane_helpers/tests/polyline_test_utils.py
new file mode 100644
index 0000000..8cad98f
--- /dev/null
+++ b/packages/lane_helpers/tests/polyline_test_utils.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from accvlab.batching_helpers import RaggedBatch
+
+DEVICES = ["cpu", "cuda"]  # device names used for the "device" test parametrization
+
+
+def sample_polyline_cpu(points: torch.Tensor, distances: torch.Tensor) -> torch.Tensor:
+    # Reference sampler for a single polyline. For no distances, the result is an empty tensor.
+    if distances.shape[0] == 0:
+        sampled_points = points.new_empty((0, points.shape[1]))
+        return sampled_points
+    # For no points, the result is NaN for every requested point coordinate.
+    if points.shape[0] == 0:
+        sampled_points = points.new_full((distances.shape[0], points.shape[1]), torch.nan)
+        return sampled_points
+    # accum[i] is the length along the polyline from the first point to point i.
+    segment_lengths = torch.linalg.vector_norm(points[1:] - points[:-1], dim=1)
+    accum = torch.cat([segment_lengths.new_zeros(1), torch.cumsum(segment_lengths, dim=0)])
+    total_length = accum[-1]
+
+    out = []
+    for distance in distances:
+        d = torch.clamp(distance, min=0.0, max=total_length)  # out-of-range distances snap to the ends
+        lower_idx = int(torch.nonzero(accum <= d, as_tuple=False)[-1])  # last point at or before d
+        if lower_idx >= points.shape[0] - 1:  # d reaches the final point
+            out.append(points[-1])
+            continue
+
+        upper_idx = lower_idx + 1
+        lower_dist = accum[lower_idx]
+        upper_dist = accum[upper_idx]
+        segment_dist = upper_dist - lower_dist
+        if segment_dist <= torch.finfo(points.dtype).eps:  # degenerate segment: skip the division
+            out.append(points[lower_idx])
+            continue
+        # Linear interpolation between the two points enclosing d.
+        weight_upper = (d - lower_dist) / segment_dist
+        weight_lower = (upper_dist - d) / segment_dist
+        out.append(points[lower_idx] * weight_lower + points[upper_idx] * weight_upper)
+
+    sampled_points = torch.stack(out)
+    return sampled_points
+
+
+def sample_batch_cpu(points: torch.Tensor, distances: torch.Tensor) -> torch.Tensor:
+    sampled_points = torch.stack(  # one reference result per batch entry
+        [
+            sample_polyline_cpu(points_sample, distances_sample)
+            for points_sample, distances_sample in zip(points, distances)  # pairs entries along dim 0
+        ]
+    )
+    return sampled_points
+
+
+def polyline_lengths_cpu(points: torch.Tensor) -> torch.Tensor:
+    # For no points (empty dim 1), the length is undefined: NaN for every batch entry.
+    if points.shape[1] == 0:
+        lengths = points.new_full((points.shape[0],), torch.nan)
+        return lengths
+    # For a single point, the length is 0.
+    if points.shape[1] == 1:
+        lengths = points.new_zeros((points.shape[0],))
+        return lengths
+    # Otherwise, sum the distances between consecutive points (dim 1) of each batch entry.
+    lengths = torch.linalg.vector_norm(points[:, 1:] - points[:, :-1], dim=2).sum(dim=1)
+    return lengths
+
+
+def polyline_lengths_var_size_cpu(points: torch.Tensor, sample_sizes: torch.Tensor) -> torch.Tensor:
+    lengths = []
+    for sample_idx in range(points.shape[0]):
+        num_points = int(sample_sizes[sample_idx].item())  # number of valid points in this row
+        lengths.append(polyline_lengths_cpu(points[sample_idx : sample_idx + 1, :num_points])[0])
+    lengths = torch.stack(lengths)  # one length per row; entries past num_points are ignored
+    return lengths
+
+
+def assert_ragged_matches_cpu(
+    result: RaggedBatch,
+    points: torch.Tensor,
+    distances: torch.Tensor,
+    points_sample_sizes: torch.Tensor,
+    distances_sample_sizes: torch.Tensor,
+    *,
+    atol: float = 1e-5,
+) -> None:
+    assert torch.equal(result.sample_sizes.cpu(), distances_sample_sizes.cpu())
+    # Check each sample against the reference, using only its valid points and distances.
+    for sample_idx in range(points.shape[0]):
+
+        num_points = int(points_sample_sizes[sample_idx].item())
+        num_distances = int(distances_sample_sizes[sample_idx].item())
+        expected = sample_polyline_cpu(
+            points[sample_idx, :num_points].cpu(),
+            distances[sample_idx, :num_distances].cpu(),
+        )
+
+        actual = result.tensor[sample_idx, :num_distances].cpu()  # padded result entries are not checked
+
+        assert torch.allclose(actual, expected, atol=atol, rtol=0.0, equal_nan=True)  # NaN matches NaN
+
+
+def make_random_ragged_polyline_case(
+    *,
+    seed: int,
+    batch_size: int = 7,
+    max_num_points: int = 12,
+    max_num_distances: int = 17,
+    num_dims: int = 3,
+) -> tuple[RaggedBatch, RaggedBatch]:
+    generator = torch.Generator().manual_seed(seed)
+    points_sample_sizes = torch.randint(1, max_num_points + 1, (batch_size,), generator=generator)
+    distances_sample_sizes = torch.randint(0, max_num_distances + 1, (batch_size,), generator=generator)
+    # Pad both tensors to the largest sample size drawn for this batch.
+    max_points_in_batch = int(points_sample_sizes.max().item())
+    max_distances_in_batch = int(distances_sample_sizes.max().item())
+    # Pre-fill with sentinel values; only the valid prefix of each row is overwritten below.
+    points = torch.full((batch_size, max_points_in_batch, num_dims), 9999.0, dtype=torch.float32)
+    distances = torch.full((batch_size, max_distances_in_batch), -9999.0, dtype=torch.float32)
+
+    for sample_idx in range(batch_size):
+        num_points = int(points_sample_sizes[sample_idx].item())
+        num_distances = int(distances_sample_sizes[sample_idx].item())
+        points[sample_idx, :num_points] = torch.rand((num_points, num_dims), generator=generator)
+        total_length = polyline_lengths_cpu(points[sample_idx : sample_idx + 1, :num_points])[0]
+        distances[sample_idx, :num_distances] = (
+            torch.rand((num_distances,), generator=generator) * total_length  # scaled by the length
+        )
+
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+
+    return points_batch, distances_batch
+
+
+def make_padded_ragged_polyline_case(
+    device: str,
+) -> tuple[RaggedBatch, RaggedBatch]:
+    # Points data: the rows hold 5, 3, 1 and 2 valid points; the 9999.0 entries are padding.
+    points = torch.tensor(
+        [
+            [[0.0, 0.0], [1.0, 0.0], [1.0, 2.0], [0.0, 2.0], [0.0, 0.0]],
+            [[3.5, -1.25], [4.5, -1.25], [4.5, 0.75], [9999.0, 9999.0], [9999.0, 9999.0]],
+            [[-2.0, 3.0], [9999.0, 9999.0], [9999.0, 9999.0], [9999.0, 9999.0], [9999.0, 9999.0]],
+            [[10.0, 0.0], [12.0, 0.0], [9999.0, 9999.0], [9999.0, 9999.0], [9999.0, 9999.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    points_sample_sizes = torch.tensor([5, 3, 1, 2], device=device)
+    # Distances data: the rows hold 11, 5, 0 and 3 valid distances; the 9999.0 entries are padding.
+    distances = torch.tensor(
+        [
+            [0.0, 0.5, 1.0, 2.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0],
+            [3.0, 2.0, 1.0, 0.0, -1.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0],
+            [9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0],
+            [-5.0, 1.0, 5.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    distances_sample_sizes = torch.tensor([11, 5, 0, 3], device=device)
+
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+
+    return points_batch, distances_batch
+
+
+def distances_for_mode(
+    points: torch.Tensor, absolute_distances: torch.Tensor, *, relative: bool
+) -> torch.Tensor:
+    # Absolute mode uses the distances as given; relative mode divides them by the polyline length.
+    if not relative:
+        return absolute_distances
+
+    lengths = polyline_lengths_cpu(points.cpu()).to(
+        device=absolute_distances.device, dtype=absolute_distances.dtype
+    )
+
+    # For lengths that are not positive (zero or NaN), use 1.0 to avoid division by zero.
+    safe_lengths = torch.where(lengths > 0, lengths, torch.ones_like(lengths))
+    relative_distances = absolute_distances / safe_lengths[:, None]
+
+    return relative_distances
+
+
+def ragged_distances_for_mode(
+    points: RaggedBatch,
+    absolute_distances: RaggedBatch,
+    *,
+    relative: bool,
+) -> RaggedBatch:
+    if not relative:
+        return absolute_distances  # absolute mode: the input batch is returned as is
+
+    relative_distances = absolute_distances.tensor.clone()  # keep the input batch unmodified
+    lengths = polyline_lengths_var_size_cpu(points.tensor.cpu(), points.sample_sizes.cpu()).to(
+        device=absolute_distances.tensor.device, dtype=absolute_distances.tensor.dtype
+    )
+    for sample_idx in range(points.tensor.shape[0]):
+        num_distances = int(absolute_distances.sample_sizes[sample_idx].item())
+        if num_distances == 0:
+            continue  # no valid distances in this row
+        length = lengths[sample_idx]
+        if length > 0:
+            relative_distances[sample_idx, :num_distances] /= length  # padding stays untouched
+        else:
+            relative_distances[sample_idx, :num_distances] = 0.0  # zero or NaN length
+    relative_distances_batch = absolute_distances.create_with_sample_sizes_like_self(relative_distances)
+    return relative_distances_batch
diff --git a/packages/lane_helpers/tests/test_polyline_fixed_interpolation.py b/packages/lane_helpers/tests/test_polyline_fixed_interpolation.py
new file mode 100644
index 0000000..ffa4946
--- /dev/null
+++ b/packages/lane_helpers/tests/test_polyline_fixed_interpolation.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+from accvlab.lane_helpers import polyline
+
+from polyline_test_utils import DEVICES, distances_for_mode, sample_batch_cpu
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_rectangle_polyline_interpolation(relative: bool, device: str):
+    points = torch.tensor(  # closed 1 x 2 rectangle with perimeter 6.0
+        [
+            [
+                [0.0, 0.0],
+                [1.0, 0.0],
+                [1.0, 2.0],
+                [0.0, 2.0],
+                [0.0, 0.0],
+            ]
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    distances = torch.tensor(
+        [[0.0, 0.5, 1.0, 2.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0]],
+        device=device,
+    )
+
+    distances_input = distances_for_mode(points, distances, relative=relative)
+
+    expected = sample_batch_cpu(points.cpu(), distances.cpu())  # reference uses the absolute distances
+    result = polyline.interpolate(points, distances_input, relative=relative)
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_batched_polyline_interpolation(relative: bool, device: str):
+    base_points = torch.tensor(
+        [
+            [0.0, 0.0],
+            [1.0, 0.0],
+            [1.0, 2.0],
+            [0.0, 2.0],
+            [0.0, 0.0],
+        ],
+        dtype=torch.float32,
+    )
+    offsets = torch.tensor([[0.0, 0.0], [3.5, -1.25]], dtype=torch.float32)
+    points = (base_points.unsqueeze(0) + offsets.unsqueeze(1)).to(device)  # shape (2, 5, 2)
+    distances = torch.tensor(
+        [
+            [0.0, 0.5, 1.0, 3.0, 6.0],
+            [6.0, 5.0, 3.0, 1.0, 0.0],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+
+    distances_input = distances_for_mode(points, distances, relative=relative)
+
+    expected = sample_batch_cpu(points.cpu(), distances.cpu())  # reference uses the absolute distances
+    result = polyline.interpolate(points.contiguous(), distances_input.contiguous(), relative=relative)
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_interpolation_accepts_non_contiguous_inputs(relative: bool, device: str):
+    points_storage = torch.tensor(
+        [
+            [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]],
+            [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    points = points_storage.transpose(1, 2)  # (2, 4, 2) view of the (2, 2, 4) storage
+    distances = torch.tensor(
+        [[0.0, 4.0], [0.5, 2.0], [2.0, 0.5], [4.0, 0.0]],
+        device=device,
+        dtype=torch.float32,
+    ).transpose(0, 1)
+    assert not points.is_contiguous()
+    assert not distances.is_contiguous()
+
+    distances_input = distances_for_mode(points, distances, relative=relative)
+
+    expected = sample_batch_cpu(points.cpu(), distances.cpu())  # reference uses the absolute distances
+    result = polyline.interpolate(points, distances_input, relative=relative)
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_out_of_range_distances_clamp_to_endpoints(relative: bool, device: str):
+    points = torch.tensor([[[0.0, 0.0], [1.0, 0.0], [1.0, 2.0]]], device=device, dtype=torch.float32)
+    distances = torch.tensor([[-4.0, -1.0, 0.0, 3.0, 4.0]], device=device, dtype=torch.float32)
+    # The polyline length is 3.0, so -4.0, -1.0 and 4.0 fall outside [0, 3.0].
+    distances_input = distances_for_mode(points, distances, relative=relative)
+
+    expected = sample_batch_cpu(points.cpu(), distances.cpu())  # reference clamps to the end points
+    result = polyline.interpolate(points, distances_input, relative=relative)
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_single_point_polyline(relative: bool, device: str):
+    points = torch.tensor([[[1.0, 2.0]]], device=device, dtype=torch.float32)
+    distances = torch.tensor([[-1.0, 0.0, 1.0]], device=device, dtype=torch.float32)
+    # The reference returns the single point for every distance.
+    distances_input = distances_for_mode(points, distances, relative=relative)
+
+    expected = sample_batch_cpu(points.cpu(), distances.cpu())
+    result = polyline.interpolate(points, distances_input, relative=relative)
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_zero_point_polyline_returns_nan(relative: bool, device: str):
+    points = torch.empty((2, 0, 3), device=device, dtype=torch.float32)  # two polylines without points
+    distances = torch.tensor([[0.0, 1.0], [-1.0, 2.0]], device=device, dtype=torch.float32)
+    distances_input = distances_for_mode(points, distances, relative=relative)
+
+    result = polyline.interpolate(points, distances_input, relative=relative)
+    # One NaN-filled 3-D point is expected per requested distance.
+    assert result.shape == (2, 2, 3)
+    assert torch.isnan(result).all()
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_zero_point_polyline_with_zero_distances_returns_empty(relative: bool, device: str):
+    points = torch.empty((2, 0, 3), device=device, dtype=torch.float32)  # no points
+    distances = torch.empty((2, 0), device=device, dtype=torch.float32)  # and no distances either
+    distances_input = distances_for_mode(points, distances, relative=relative)
+
+    result = polyline.interpolate(points, distances_input, relative=relative)
+
+    assert result.shape == (2, 0, 3)  # empty along the distance dimension
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_random_polyline_matches_cpu_reference(relative: bool, device: str):
+    num_iters = 100
+    generator = torch.Generator().manual_seed(0)
+    for _ in range(num_iters):
+        num_points = int(torch.randint(15, 61, (), generator=generator).item())
+        num_distances = int(torch.randint(15, 61, (), generator=generator).item())
+        points_cpu = torch.rand((3, num_points, 2), generator=generator, dtype=torch.float32)
+        distances_cpu = torch.rand((3, num_distances), generator=generator, dtype=torch.float32)
+        # Scale the uniform samples so the absolute distances lie within each polyline's length.
+        segment_lengths = torch.linalg.vector_norm(points_cpu[:, 1:] - points_cpu[:, :-1], dim=2)
+        total_lengths = torch.sum(segment_lengths, dim=1)
+        distances_cpu = distances_cpu * total_lengths[:, None]
+
+        distances_input_cpu = distances_for_mode(points_cpu, distances_cpu, relative=relative)
+
+        expected = sample_batch_cpu(points_cpu, distances_cpu)  # reference uses the absolute distances
+        result = polyline.interpolate(
+            points_cpu.to(device), distances_input_cpu.to(device), relative=relative
+        )
+
+        assert torch.allclose(result.cpu(), expected, atol=1e-4, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+def test_large_polyline_interpolation_external_distance_buffer(relative: bool):
+    # Create a large polyline to ensure that the external distance buffer is used.
+    num_points = 200_000
+    x = torch.linspace(0.0, 1.0, num_points, device="cuda", dtype=torch.float32)
+    points = torch.stack((x, torch.zeros_like(x)), dim=1).unsqueeze(0)  # line from (0, 0) to (1, 0)
+    distances = torch.tensor([[0.0, 0.25, 0.5, 1.0, 2.0]], device="cuda", dtype=torch.float32)
+    expected = torch.tensor(
+        [[[0.0, 0.0], [0.25, 0.0], [0.5, 0.0], [1.0, 0.0], [1.0, 0.0]]],
+        device="cuda",
+        dtype=torch.float32,
+    )
+    # The length is 1.0, so the same distances are passed in both modes; run on a separate stream.
+    torch.cuda.synchronize()
+    stream = torch.cuda.Stream()
+    with torch.cuda.stream(stream):
+        result = polyline.interpolate(points, distances, relative=relative)
+    stream.synchronize()
+
+    assert torch.allclose(result, expected, atol=1e-4, rtol=0.0)
diff --git a/packages/lane_helpers/tests/test_polyline_lengths.py b/packages/lane_helpers/tests/test_polyline_lengths.py
new file mode 100644
index 0000000..4b31de2
--- /dev/null
+++ b/packages/lane_helpers/tests/test_polyline_lengths.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+from accvlab.batching_helpers import RaggedBatch
+from accvlab.lane_helpers import polyline
+
+from polyline_test_utils import (
+    DEVICES,
+    make_padded_ragged_polyline_case,
+    make_random_ragged_polyline_case,
+    polyline_lengths_cpu,
+    polyline_lengths_var_size_cpu,
+)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_rectangle_and_single_point(device: str):
+    rectangle = torch.tensor(
+        [
+            [
+                [0.0, 0.0],
+                [1.0, 0.0],
+                [1.0, 2.0],
+                [0.0, 2.0],
+                [0.0, 0.0],
+            ]
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    single_point = torch.tensor([[[1.0, 2.0]]], device=device, dtype=torch.float32)
+    # Rectangle perimeter: 1 + 2 + 1 + 2 = 6; a single point has length 0.
+    assert torch.allclose(polyline.lengths(rectangle).cpu(), torch.tensor([6.0]), atol=1e-5, rtol=0.0)
+    assert torch.allclose(polyline.lengths(single_point).cpu(), torch.tensor([0.0]), atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_zero_point_batch_returns_nan(device: str):
+    points = torch.empty((3, 0, 2), device=device, dtype=torch.float32)  # three polylines without points
+
+    result = polyline.lengths(points)
+
+    assert result.shape == (3,)  # one (NaN) length per polyline
+    assert torch.isnan(result).all()
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_random_nd_matches_cpu_reference(device: str):
+    generator = torch.Generator().manual_seed(1)
+    num_iters = 100
+    for _ in range(num_iters):
+        points_cpu = torch.rand((5, 37, 4), generator=generator, dtype=torch.float32)  # 4-D points
+
+        expected = polyline_lengths_cpu(points_cpu)
+        result = polyline.lengths(points_cpu.to(device))
+
+        assert torch.allclose(result.cpu(), expected, atol=1e-4, rtol=0.0)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_accepts_non_contiguous_points(device: str):
+    points_storage = torch.tensor(
+        [
+            [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]],
+            [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    points = points_storage.transpose(1, 2)  # (2, 4, 2) view of the (2, 2, 4) storage
+    assert not points.is_contiguous()
+
+    result = polyline.lengths(points)
+    expected = polyline_lengths_cpu(points.cpu())
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_random_matches_cpu_reference(device: str):
+    num_iters = 100
+    for i in range(num_iters):
+        points_batch_cpu, _ = make_random_ragged_polyline_case(seed=i * 100)  # distances are unused
+        points_batch = points_batch_cpu.to(device)
+
+        result = polyline.lengths_var_size_batch(points_batch)
+        expected = polyline_lengths_var_size_cpu(points_batch_cpu.tensor, points_batch_cpu.sample_sizes)
+
+        assert result.shape == (points_batch.tensor.shape[0],)  # one length per sample
+        assert torch.allclose(result.cpu(), expected, atol=1e-4, rtol=0.0)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_matches_cpu_reference_and_ignores_padding(device: str):
+    points_batch, _ = make_padded_ragged_polyline_case(device)  # padded entries hold 9999.0
+
+    result = polyline.lengths_var_size_batch(points_batch)
+    expected = polyline_lengths_var_size_cpu(points_batch.tensor.cpu(), points_batch.sample_sizes.cpu())
+
+    assert result.shape == (4,)  # one length per sample
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_zero_point_row_returns_nan(device: str):
+    points = torch.tensor(
+        [
+            [[9999.0, 9999.0], [9999.0, 9999.0]],
+            [[0.0, 0.0], [1.0, 0.0]],
+            [[2.0, 3.0], [9999.0, 9999.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    sample_sizes = torch.tensor([0, 2, 1], device=device, dtype=torch.int32)  # row 0 has no valid points
+
+    result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes))
+    expected = polyline_lengths_var_size_cpu(points.cpu(), sample_sizes.cpu())
+    # equal_nan lets the NaN that the reference yields for the empty row compare equal.
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0, equal_nan=True)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_all_zero_point_rows_return_nan(device: str):
+    points = torch.empty((3, 0, 2), device=device, dtype=torch.float32)
+    sample_sizes = torch.zeros(3, device=device, dtype=torch.int32)  # every row is empty
+
+    result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes))
+
+    assert result.shape == (3,)
+    assert torch.isnan(result).all()
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_accepts_int32_sample_sizes_and_non_contiguous_points(device: str):
+    points_storage = torch.tensor(
+        [
+            [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]],
+            [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    points = points_storage.transpose(1, 2)  # (2, 4, 2) view of the (2, 2, 4) storage
+    sample_sizes = torch.tensor([4, 3], device=device, dtype=torch.int32)  # row 1 uses 3 of 4 points
+    assert not points.is_contiguous()
+
+    result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes))
+    expected = polyline_lengths_var_size_cpu(points.cpu(), sample_sizes.cpu())
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+def test_polyline_lengths_var_size_batch_handles_inactive_cuda_rows():
+    num_samples = 33  # presumably one more than a 32-thread warp - TODO confirm
+    points = torch.empty((num_samples, 2, 2), device="cuda", dtype=torch.float32)
+    points[:, 0, 0] = torch.arange(num_samples, device="cuda", dtype=torch.float32)
+    points[:, 0, 1] = 0.0
+    points[:, 1, 0] = points[:, 0, 0] + 1.0
+    points[:, 1, 1] = 0.0
+    sample_sizes = torch.full((num_samples,), 2, device="cuda")
+
+    result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes))
+    # Every row is a unit-length segment from (i, 0) to (i + 1, 0).
+    assert torch.allclose(result.cpu(), torch.ones(num_samples), atol=1e-5, rtol=0.0)
diff --git a/packages/lane_helpers/tests/test_polyline_validation.py b/packages/lane_helpers/tests/test_polyline_validation.py
new file mode 100644
index 0000000..fe7db26
--- /dev/null
+++ b/packages/lane_helpers/tests/test_polyline_validation.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+from accvlab.batching_helpers import RaggedBatch
+from accvlab.lane_helpers import polyline
+
+from polyline_test_utils import DEVICES
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_rejects_invalid_ragged_layout(device: str):
+    points = torch.randn((2, 3, 4), device=device)
+    distances = torch.randn((2, 4), device=device)
+    points_batch = RaggedBatch(
+        points.transpose(1, 2).contiguous(),  # shape (2, 4, 3)
+        sample_sizes=torch.tensor([3, 2], device=device, dtype=torch.int32),
+        non_uniform_dim=2,
+    )
+    distances_batch = RaggedBatch(
+        distances, sample_sizes=torch.tensor([4, 2], device=device, dtype=torch.int32)
+    )
+
+    # Polyline points must use dimension 1 as the non-uniform point dimension.
+    with pytest.raises(AssertionError, match="points.non_uniform_dim"):
+        polyline.interpolate_var_size_batch(points_batch, distances_batch)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_validates_inputs(device: str):
+    points = torch.randn((1, 3, 2), device=device)
+    distances = torch.randn((1, 4), device=device)
+    points_batch = RaggedBatch(points, sample_sizes=torch.tensor([3], device=device))  # int64 sizes
+    distances_batch = RaggedBatch(distances, sample_sizes=torch.tensor([4], device=device))
+
+    # Points sample sizes must not be negative.
+    bad_points_sizes = RaggedBatch(points, sample_sizes=torch.tensor([-1], device=device))
+    with pytest.raises(RuntimeError, match="points.sample_sizes"):
+        polyline.interpolate_var_size_batch(bad_points_sizes, distances_batch)
+
+    # Distance sample sizes must not exceed the padded distance dimension.
+    bad_distances_sizes = RaggedBatch(distances, sample_sizes=torch.tensor([5], device=device))
+    with pytest.raises(RuntimeError, match="distances.sample_sizes"):
+        polyline.interpolate_var_size_batch(points_batch, bad_distances_sizes)
+
+    # Points and distances must have the same dtype.
+    distances_double = distances_batch.double()
+    with pytest.raises(RuntimeError, match="same dtype"):
+        polyline.interpolate_var_size_batch(points_batch, distances_double)
+
+    # Points and distances must have the same sample size dtype.
+    mismatched_sample_size_dtype = RaggedBatch(
+        distances,
+        sample_sizes=torch.tensor([4], device=device, dtype=torch.int32),  # points_batch uses int64
+    )
+    with pytest.raises(RuntimeError, match="same dtype"):
+        polyline.interpolate_var_size_batch(points_batch, mismatched_sample_size_dtype)
+
+
+def test_polyline_functions_reject_mixed_cpu_cuda_inputs():
+    points_cpu = torch.randn((1, 3, 2), device="cpu")
+    distances_cpu = torch.randn((1, 4), device="cpu")
+    points_cuda = points_cpu.cuda()  # CUDA copies of the same data
+    distances_cuda = distances_cpu.cuda()
+
+    # Fixed-size points and distances must live on the same device.
+    with pytest.raises(RuntimeError, match="same device"):
+        polyline.interpolate(points_cpu, distances_cuda)
+
+    # Ragged points and distances must live on the same device.
+    with pytest.raises(RuntimeError, match="same device"):
+        polyline.interpolate_var_size_batch(
+            RaggedBatch(points_cpu, sample_sizes=torch.tensor([3], device="cpu")),
+            RaggedBatch(distances_cuda, sample_sizes=torch.tensor([4], device="cuda")),
+        )
+
+    # Ragged sample sizes must live on the same device as their data tensor.
+    with pytest.raises(RuntimeError, match="same device"):
+        polyline.interpolate_var_size_batch(
+            RaggedBatch(points_cuda, sample_sizes=torch.tensor([3], device="cpu")),
+            RaggedBatch(distances_cuda, sample_sizes=torch.tensor([4], device="cuda")),
+        )
+
+    # Lengths use only points, but points.sample_sizes must still match the points device.
+    with pytest.raises(RuntimeError, match="same device"):
+        polyline.lengths_var_size_batch(
+            RaggedBatch(points_cuda, sample_sizes=torch.tensor([3], device="cpu"))
+        )
+
+
+def test_cpu_polyline_functions_reject_low_precision_dtypes():
+    for dtype in (torch.float16, torch.bfloat16):
+        # CPU kernels intentionally support only float32 and float64.
+        points = torch.tensor([[[0.0, 0.0], [1.0, 0.0]]], dtype=dtype)
+        distances = torch.tensor([[0.0, 1.0]], dtype=dtype)
+        points_batch = RaggedBatch(points, sample_sizes=torch.tensor([2]))
+        distances_batch = RaggedBatch(distances, sample_sizes=torch.tensor([2]))
+
+        with pytest.raises(RuntimeError, match="float32 or float64 on CPU"):  # all four entry points
+            polyline.interpolate(points, distances)
+        with pytest.raises(RuntimeError, match="float32 or float64 on CPU"):
+            polyline.lengths(points)
+        with pytest.raises(RuntimeError, match="float32 or float64 on CPU"):
+            polyline.interpolate_var_size_batch(points_batch, distances_batch)
+        with pytest.raises(RuntimeError, match="float32 or float64 on CPU"):
+            polyline.lengths_var_size_batch(points_batch)
+
+
+def test_cuda_polyline_functions_accept_low_precision_dtypes():
+    for dtype in (torch.float16, torch.bfloat16):
+        points = torch.tensor([[[0.0, 0.0], [1.0, 0.0]]], device="cuda", dtype=dtype)
+        distances = torch.tensor([[0.0, 1.0]], device="cuda", dtype=dtype)
+        points_batch = RaggedBatch(points, sample_sizes=torch.tensor([2], device="cuda"))
+        distances_batch = RaggedBatch(distances, sample_sizes=torch.tensor([2], device="cuda"))
+        # Distances 0.0 and 1.0 select the two end points; results are compared for exact equality.
+        expected_points = torch.tensor([[[0.0, 0.0], [1.0, 0.0]]], device="cuda", dtype=dtype)
+        expected_lengths = torch.tensor([1.0], device="cuda", dtype=dtype)
+
+        assert torch.equal(polyline.interpolate(points, distances), expected_points)
+        assert torch.equal(polyline.lengths(points), expected_lengths)
+        assert torch.equal(
+            polyline.interpolate_var_size_batch(points_batch, distances_batch).tensor, expected_points
+        )
+        assert torch.equal(polyline.lengths_var_size_batch(points_batch), expected_lengths)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_rejects_invalid_ragged_layout(device: str):
+    points = torch.randn((2, 3, 4), device=device)
+    points_batch = RaggedBatch(
+        points.transpose(1, 2).contiguous(),  # shape (2, 4, 3)
+        sample_sizes=torch.tensor([3, 2], device=device, dtype=torch.int32),
+        non_uniform_dim=2,
+    )
+
+    # Polyline points must use dimension 1 as the non-uniform point dimension.
+    with pytest.raises(AssertionError, match="points.non_uniform_dim"):
+        polyline.lengths_var_size_batch(points_batch)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_validates_sample_sizes(device: str):
+    points = torch.randn((1, 3, 2), device=device)  # padded point dimension has size 3
+
+    # Length sample sizes must not be negative.
+    bad_small = RaggedBatch(points, sample_sizes=torch.tensor([-1], device=device))
+    with pytest.raises(RuntimeError, match="points.sample_sizes"):
+        polyline.lengths_var_size_batch(bad_small)
+
+    # Length sample sizes must not exceed the padded point dimension.
+    bad_large = RaggedBatch(points, sample_sizes=torch.tensor([4], device=device))
+    with pytest.raises(RuntimeError, match="points.sample_sizes"):
+        polyline.lengths_var_size_batch(bad_large)
diff --git a/packages/lane_helpers/tests/test_polyline_var_size_interpolation.py b/packages/lane_helpers/tests/test_polyline_var_size_interpolation.py
new file mode 100644
index 0000000..7234ef0
--- /dev/null
+++ b/packages/lane_helpers/tests/test_polyline_var_size_interpolation.py
@@ -0,0 +1,314 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+from accvlab.batching_helpers import RaggedBatch
+from accvlab.lane_helpers import polyline
+
+from polyline_test_utils import (
+    DEVICES,
+    assert_ragged_matches_cpu,
+    make_padded_ragged_polyline_case,
+    make_random_ragged_polyline_case,
+    ragged_distances_for_mode,
+)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_matches_cpu_reference(relative: bool, device: str):
+    points_batch, distances_batch = make_padded_ragged_polyline_case(device)  # fixed fixture
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(points_batch, distances_input_batch, relative=relative)
+
+    assert isinstance(result, RaggedBatch)
+    assert result.tensor.shape == (4, 11, 2)  # padded output shape expected for this fixture
+    assert result.non_uniform_dim == 1
+    assert_ragged_matches_cpu(
+        result,
+        points_batch.tensor,
+        distances_batch.tensor,  # note: distances_batch, not distances_input_batch
+        points_batch.sample_sizes,
+        distances_batch.sample_sizes,
+    )
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_random_matches_cpu_reference(relative: bool, device: str):
+    num_iters = 100  # one generated case per seed in range(num_iters)
+    for i in range(num_iters):
+        points_batch_cpu, distances_batch_cpu = make_random_ragged_polyline_case(seed=i)
+        points_batch = points_batch_cpu.to(device)  # move the generated case to the device under test
+        distances_batch = distances_batch_cpu.to(device)
+        distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+        result = polyline.interpolate_var_size_batch(
+            points_batch,
+            distances_input_batch,
+            relative=relative,
+        )
+
+        assert isinstance(result, RaggedBatch)
+        assert result.tensor.shape == (  # dims 0 and 2 follow the points, dim 1 follows the distances
+            points_batch.tensor.shape[0],
+            distances_batch.tensor.shape[1],
+            points_batch.tensor.shape[2],
+        )
+        assert_ragged_matches_cpu(
+            result,
+            points_batch.tensor,
+            distances_batch.tensor,  # note: distances_batch, not distances_input_batch
+            points_batch.sample_sizes,
+            distances_batch.sample_sizes,
+            atol=1e-4,  # explicit absolute tolerance for the randomized cases
+        )
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_matches_fixed_size_when_uniform(relative: bool, device: str):
+    points = torch.tensor(
+        [
+            [[0.0, 0.0], [1.0, 0.0], [1.0, 2.0], [0.0, 2.0]],  # segment lengths 1, 2, 1
+            [[2.0, 2.0], [3.0, 2.0], [3.0, 4.0], [2.0, 4.0]],  # same shape, shifted by (2, 2)
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    distances = torch.tensor(
+        [[0.0, 0.5, 2.0, 4.0], [4.0, 2.0, 0.5, 0.0]],  # second row is the first one reversed
+        device=device,
+        dtype=torch.float32,
+    )
+    sample_sizes = torch.tensor([points.shape[1], points.shape[1]], device=device, dtype=torch.int32)
+    distances_sample_sizes = torch.tensor(
+        [distances.shape[1], distances.shape[1]], device=device, dtype=torch.int32
+    )
+    points_batch = RaggedBatch(points, sample_sizes=sample_sizes)  # every sample uses the full dim 1
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)  # likewise full size
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(
+        points_batch,
+        distances_input_batch,
+        relative=relative,
+    )
+    expected = polyline.interpolate(  # fixed-size API on the same tensors serves as the reference
+        points.contiguous(), distances_input_batch.tensor.contiguous(), relative=relative
+    )
+
+    assert torch.allclose(result.tensor, expected, atol=1e-5, rtol=0.0)  # absolute tolerance only
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_accepts_non_contiguous_inputs(relative: bool, device: str):
+    points_storage = torch.tensor(  # stored as (2, 2, 4); transposed below
+        [
+            [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]],
+            [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    points = points_storage.transpose(1, 2)  # (2, 4, 2) strided view of points_storage
+    distances = torch.tensor(
+        [[0.0, 4.0], [0.5, 2.0], [2.0, 0.5], [4.0, 0.0]],
+        device=device,
+        dtype=torch.float32,
+    ).transpose(0, 1)  # (2, 4) strided view
+    assert not points.is_contiguous()  # guard: the test only makes sense for non-contiguous inputs
+    assert not distances.is_contiguous()
+
+    points_sample_sizes = torch.tensor([4, 3], device=device, dtype=torch.int32)  # 2nd sample shorter
+    distances_sample_sizes = torch.tensor([4, 2], device=device, dtype=torch.int32)  # 2nd sample shorter
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(
+        points_batch,
+        distances_input_batch,
+        relative=relative,
+    )
+
+    assert_ragged_matches_cpu(
+        result,
+        points,
+        distances,  # note: the original distances, not distances_input_batch
+        points_sample_sizes,
+        distances_sample_sizes,
+    )
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_zero_point_row_returns_nan(relative: bool, device: str):
+    points = torch.tensor(
+        [
+            [[9999.0, 9999.0], [9999.0, 9999.0]],  # sample 0: sample size 0, both entries are filler
+            [[0.0, 0.0], [1.0, 0.0]],  # sample 1: two valid points
+            [[2.0, 3.0], [9999.0, 9999.0]],  # sample 2: one valid point, then filler
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    distances = torch.tensor(
+        [[0.0, 1.0], [0.0, 0.5], [-1.0, 2.0]],
+        device=device,
+        dtype=torch.float32,
+    )
+    points_sample_sizes = torch.tensor([0, 2, 1], device=device, dtype=torch.int32)
+    distances_sample_sizes = torch.tensor([2, 2, 2], device=device, dtype=torch.int32)
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(
+        points_batch,
+        distances_input_batch,
+        relative=relative,
+    )
+
+    assert_ragged_matches_cpu(
+        result,
+        points,
+        distances,  # note: the original distances, not distances_input_batch
+        points_sample_sizes,
+        distances_sample_sizes,
+    )
+    assert torch.isnan(result.tensor[0, :2]).all()  # the zero-point sample must yield NaN outputs
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_zero_max_distances_returns_empty(relative: bool, device: str):
+    points = torch.tensor(
+        [
+            [[0.0, 0.0], [1.0, 0.0]],
+            [[2.0, 3.0], [9999.0, 9999.0]],  # second entry lies beyond this sample's size of 1
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    distances = torch.empty((2, 0), device=device, dtype=torch.float32)  # no distance slots at all
+    points_sample_sizes = torch.tensor([2, 1], device=device, dtype=torch.int32)
+    distances_sample_sizes = torch.tensor([0, 0], device=device, dtype=torch.int32)
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(
+        points_batch,
+        distances_input_batch,
+        relative=relative,
+    )
+
+    assert isinstance(result, RaggedBatch)
+    assert result.tensor.shape == (2, 0, 2)  # dim 1 is empty, matching the empty distances
+    assert torch.equal(result.sample_sizes.cpu(), torch.zeros(2, dtype=torch.int32))
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_all_zero_point_rows_return_nan(relative: bool, device: str):
+    points = torch.empty((2, 0, 2), device=device, dtype=torch.float32)  # no point slots in any sample
+    distances = torch.tensor([[0.0, 1.0, 2.0], [-1.0, 0.5, 3.0]], device=device, dtype=torch.float32)
+    points_sample_sizes = torch.zeros(2, device=device, dtype=torch.int32)
+    distances_sample_sizes = torch.full((2,), 3, device=device, dtype=torch.int32)
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(
+        points_batch,
+        distances_input_batch,
+        relative=relative,
+    )
+
+    assert result.tensor.shape == (2, 3, 2)  # output is still sized by the distances
+    assert torch.isnan(result.tensor).all()  # with no points at all, every output is NaN
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+def test_variable_size_large_polyline_interpolation_external_distance_buffer(relative: bool):
+    # Create a large polyline to ensure that the external distance buffer is used.
+    num_points = 200_000
+    x = torch.linspace(0.0, 1.0, num_points, device="cuda", dtype=torch.float32)
+    first_polyline = torch.stack((x, torch.zeros_like(x)), dim=1)  # straight line at y = 0
+    second_polyline = torch.stack((x, torch.ones_like(x)), dim=1)  # straight line at y = 1
+    points = torch.stack((first_polyline, second_polyline), dim=0)
+    distances = torch.tensor(
+        # Note that 9999.0 is a filler and is not part of the distances used for interpolation (due to `distances_sample_sizes`).
+        [[0.0, 0.25, 0.5, 1.0, 2.0], [1.0, 0.5, 0.0, -1.0, 9999.0]],
+        device="cuda",
+        dtype=torch.float32,
+    )
+    points_sample_sizes = torch.full((2,), num_points, device="cuda", dtype=torch.int32)
+    distances_sample_sizes = torch.tensor([5, 4], device="cuda", dtype=torch.int32)
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    expected = torch.tensor(
+        [
+            [[0.0, 0.0], [0.25, 0.0], [0.5, 0.0], [1.0, 0.0], [1.0, 0.0]],  # 2.0 maps to the end point
+            # Note that 9999.0 is a filler and is not checked for equality in the test.
+            [[1.0, 1.0], [0.5, 1.0], [0.0, 1.0], [0.0, 1.0], [9999.0, 9999.0]],  # -1.0 maps to the start
+        ],
+        device="cuda",
+        dtype=torch.float32,
+    )
+
+    torch.cuda.synchronize()  # finish pending work (input creation) before using another stream
+    stream = torch.cuda.Stream()
+    with torch.cuda.stream(stream):  # run the interpolation on a non-default stream
+        result = polyline.interpolate_var_size_batch(points_batch, distances_batch, relative=relative)
+    stream.synchronize()  # wait for that stream before reading the results
+
+    assert torch.equal(result.sample_sizes.cpu(), distances_sample_sizes.cpu())
+    assert torch.allclose(result.tensor[0, :5], expected[0, :5], atol=1e-4, rtol=0.0)
+    assert torch.allclose(result.tensor[1, :4], expected[1, :4], atol=1e-4, rtol=0.0)  # valid entries only
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+def test_variable_size_polyline_interpolation_handles_inactive_cuda_rows(relative: bool):
+    num_samples = 33  # NOTE(review): presumably chosen to be one more than a 32-wide CUDA group; confirm
+    points = torch.empty((num_samples, 2, 2), device="cuda", dtype=torch.float32)  # fully filled below
+    points[:, 0, 0] = torch.arange(num_samples, device="cuda", dtype=torch.float32)  # sample i starts at x = i
+    points[:, 0, 1] = 0.0
+    points[:, 1, 0] = points[:, 0, 0] + 1.0  # ... and ends at x = i + 1
+    points[:, 1, 1] = 0.0
+    distances = (
+        torch.tensor([[0.0, 0.25, 1.0]], device="cuda", dtype=torch.float32).expand(num_samples, -1).clone()
+    )
+    points_sample_sizes = torch.full((num_samples,), 2, device="cuda")  # dtype inferred from the fill value
+    distances_sample_sizes = torch.full((num_samples,), 3, device="cuda")  # dtype inferred likewise
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(
+        points_batch,
+        distances_input_batch,
+        relative=relative,
+    )
+
+    assert_ragged_matches_cpu(result, points, distances, points_sample_sizes, distances_sample_sizes)
+
+
+if __name__ == "__main__":  # allow running this test file directly as a script
+    pytest.main([__file__])