From e99dbff93750a96f6e2c9a97a7c3f28188eb204d Mon Sep 17 00:00:00 2001
From: Roman Schaffert <rschaffert@nvidia.com>
Date: Wed, 10 Jun 2026 14:18:40 +0800
Subject: [PATCH] Added polyline interpolation (in the new `lane_helpers`
 sub-package)

- Added polyline interpolation
- Cleaned up error handling in torch extensions
- Added functionality to generate plots during docs generation

Signed-off-by: Roman Schaffert <rschaffert@nvidia.com>
---
 .gitignore                                    |   1 -
 docker/Dockerfile                             |   4 +-
 docs/Makefile                                 |   5 +-
 docs/generate_package_docs_assets.py          | 176 +++++
 docs/guides/DEVELOPMENT_GUIDE.md              |  59 +-
 docs/guides/DOCUMENTATION_SETUP_GUIDE.md      | 127 +++-
 docs/guides/INSTALLATION_GUIDE.md             |   9 +-
 docs/index.rst                                |   1 +
 docs/spelling_wordlist.txt                    |   4 +
 namespace_packages_config.py                  |   1 +
 .../batched_indexing_access_helpers.h         |  98 +--
 .../draw_heatmap/csrc/draw_heatmap_cuda.cu    |  48 +-
 .../docs/_on_doc_generation.py                |  59 ++
 packages/example_package/docs/intro.rst       |  15 +
 .../evaluation_results/simple_plot.csv        |   6 +
 packages/example_package/pyproject.toml       |   1 +
 .../accvlab/lane_helpers/__init__.py          |  28 +
 .../accvlab/lane_helpers/polyline/__init__.py |  27 +
 .../lane_helpers/polyline/functions.py        | 113 +++
 .../lane_helpers/docs/_on_doc_generation.py   |  91 +++
 packages/lane_helpers/docs/api.rst            |   9 +
 packages/lane_helpers/docs/example.rst        |  16 +
 .../images/polyline_sampling_illustration.png | Bin 0 -> 31019 bytes
 packages/lane_helpers/docs/index.rst          |  11 +
 packages/lane_helpers/docs/introduction.rst   | 113 +++
 .../lane_helpers/docu_referenced_dirs.txt     |   1 +
 .../evaluation/_shapely_evaluation_outputs.py | 157 ++++
 .../evaluation/plot_shapely_evaluation.py     | 595 +++++++++++++++
 .../evaluation/shapely_evaluation.py          | 681 ++++++++++++++++++
 .../batch_1_runtime_cpu.md                    |  13 +
 .../batch_1_runtime_cuda.md                   |  13 +
 .../batch_1_runtime_shapely.md                |  13 +
 .../batch_1_speedup_cpu_vs_shapely.md         |  13 +
 .../batch_1_speedup_cuda_vs_cpu.md            |  13 +
 .../batch_1_speedup_cuda_vs_shapely.md        |  13 +
 .../batch_64_runtime_cpu.md                   |  13 +
 .../batch_64_runtime_cuda.md                  |  13 +
 .../batch_64_runtime_shapely.md               |  13 +
 .../batch_64_speedup_cpu_vs_shapely.md        |  13 +
 .../batch_64_speedup_cuda_vs_cpu.md           |  13 +
 .../batch_64_speedup_cuda_vs_shapely.md       |  13 +
 packages/lane_helpers/examples/basic_usage.py |  53 ++
 packages/lane_helpers/ext_impl/CMakeLists.txt |  55 ++
 .../ext_impl/polyline/CMakeLists.txt          |  50 ++
 .../polyline/include/helper_macros.cuh        |  25 +
 .../ext_impl/polyline/include/polyline.cuh    | 178 +++++
 .../polyline/include/polyline_common.cuh      | 169 +++++
 .../include/polyline_dtype_compat.cuh         | 127 ++++
 .../polyline/include/polyline_kernels.cuh     | 466 ++++++++++++
 .../include/polyline_shared_memory_config.cuh | 120 +++
 .../ext_impl/polyline/src/polyline.cpp        | 399 ++++++++++
 .../ext_impl/polyline/src/polyline.cu         | 270 +++++++
 .../ext_impl/polyline/src/polyline_cpu.cpp    | 193 +++++
 packages/lane_helpers/pyproject.toml          |  35 +
 packages/lane_helpers/setup.py                |  53 ++
 .../lane_helpers/tests/polyline_test_utils.py | 226 ++++++
 .../test_polyline_fixed_interpolation.py      | 209 ++++++
 .../tests/test_polyline_lengths.py            | 179 +++++
 .../tests/test_polyline_validation.py         | 166 +++++
 .../test_polyline_var_size_interpolation.py   | 314 ++++++++
 60 files changed, 5781 insertions(+), 118 deletions(-)
 create mode 100644 docs/generate_package_docs_assets.py
 create mode 100644 packages/example_package/docs/_on_doc_generation.py
 create mode 100644 packages/example_package/evaluation_results/simple_plot.csv
 create mode 100644 packages/lane_helpers/accvlab/lane_helpers/__init__.py
 create mode 100644 packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py
 create mode 100644 packages/lane_helpers/accvlab/lane_helpers/polyline/functions.py
 create mode 100644 packages/lane_helpers/docs/_on_doc_generation.py
 create mode 100644 packages/lane_helpers/docs/api.rst
 create mode 100644 packages/lane_helpers/docs/example.rst
 create mode 100644 packages/lane_helpers/docs/images/polyline_sampling_illustration.png
 create mode 100644 packages/lane_helpers/docs/index.rst
 create mode 100644 packages/lane_helpers/docs/introduction.rst
 create mode 100644 packages/lane_helpers/docu_referenced_dirs.txt
 create mode 100644 packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py
 create mode 100644 packages/lane_helpers/evaluation/plot_shapely_evaluation.py
 create mode 100644 packages/lane_helpers/evaluation/shapely_evaluation.py
 create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cpu.md
 create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md
 create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md
 create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md
 create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md
 create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md
 create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md
 create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md
 create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md
 create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md
 create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md
 create mode 100644 packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md
 create mode 100644 packages/lane_helpers/examples/basic_usage.py
 create mode 100644 packages/lane_helpers/ext_impl/CMakeLists.txt
 create mode 100644 packages/lane_helpers/ext_impl/polyline/CMakeLists.txt
 create mode 100644 packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh
 create mode 100644 packages/lane_helpers/ext_impl/polyline/include/polyline.cuh
 create mode 100644 packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh
 create mode 100644 packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh
 create mode 100644 packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh
 create mode 100644 packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh
 create mode 100644 packages/lane_helpers/ext_impl/polyline/src/polyline.cpp
 create mode 100644 packages/lane_helpers/ext_impl/polyline/src/polyline.cu
 create mode 100644 packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp
 create mode 100644 packages/lane_helpers/pyproject.toml
 create mode 100644 packages/lane_helpers/setup.py
 create mode 100644 packages/lane_helpers/tests/polyline_test_utils.py
 create mode 100644 packages/lane_helpers/tests/test_polyline_fixed_interpolation.py
 create mode 100644 packages/lane_helpers/tests/test_polyline_lengths.py
 create mode 100644 packages/lane_helpers/tests/test_polyline_validation.py
 create mode 100644 packages/lane_helpers/tests/test_polyline_var_size_interpolation.py

diff --git a/.gitignore b/.gitignore
index 31a607c..7a579ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,4 +18,3 @@ __pycache__/
 *.whl
 
 *.log
-
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 6ae40b1..82c44b8 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -113,8 +113,8 @@ RUN pip install pandas==1.5.3 \
     numba==0.59 \
     pyquaternion==0.9.9
 
-RUN pip install nuscenes-devkit && \
-    pip install shapely tqdm pillow networkx fire
+RUN pip install nuscenes-devkit==1.2.0 && \
+    pip install shapely==2.0.7 tqdm==4.67.3 pillow==12.2.0 networkx==3.4.2 fire==0.7.1
 
 RUN pip install pytest pytest-timeout
 RUN pip install pynvml
diff --git a/docs/Makefile b/docs/Makefile
index 51667be..987e408 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -23,8 +23,9 @@ help:
 
 # Generate namespace package documentation before building
 generate:
-	python3 mirror_referenced_dirs.py
 	python3 generate_new_namespace_package_docs.py
+	python3 generate_package_docs_assets.py
+	python3 mirror_referenced_dirs.py
 	python3 update_docs_index.py
 
 # Sync the root README into the docs tree before building
@@ -41,7 +42,7 @@ clean:
 	@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 	rm -rf $(BUILDDIR)/
 	rm -rf api/generated/
-	rm -rf ../packages/*/docs/generated/
+	rm -rf ../packages/*/docs/_generated/
 
 # Auto-build documentation (watches for changes)
 livehtml: sync-readme generate
diff --git a/docs/generate_package_docs_assets.py b/docs/generate_package_docs_assets.py
new file mode 100644
index 0000000..e205e28
--- /dev/null
+++ b/docs/generate_package_docs_assets.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from dataclasses import dataclass
+import importlib.util
+from pathlib import Path
+import sys
+from types import ModuleType
+from typing import Callable
+
+
+@dataclass(frozen=True)  # read-only bundle of names and paths passed to a package's docs asset hook
+class PackageDocsContext:
+    project_root: Path  # repository root (parent of the top-level docs/ directory)
+    namespace_package: str  # entry taken from NAMESPACE_PACKAGES
+    package_name: str  # last dotted component of namespace_package
+    package_root: Path  # project_root / "packages" / package_name
+    docs_root: Path  # package_root / "docs"
+    generated_dir: Path  # docs_root / "_generated"; created before the hook is called
+
+
+HookFunction = Callable[[PackageDocsContext], None]  # signature expected of generate_docs_assets
+_GENERATED_ASSET_GITIGNORE = "*\n"  # .gitignore content written into each generated_dir
+
+
+def _load_hook_module(hook_path: Path, package_name: str) -> ModuleType:
+    # Module name derived from the package name, so each package's hook gets its own name.
+    module_name = f"_accvlab_docs_assets_{package_name}"
+
+    # Execute the hook file straight from its path. NOTE(review): it is not added to sys.modules here.
+    spec = importlib.util.spec_from_file_location(module_name, hook_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Could not create import spec for docs asset hook: {hook_path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    return module
+
+
+def _get_hook_function(module: ModuleType, hook_path: Path) -> HookFunction:
+    hook_function = getattr(module, "generate_docs_assets", None)  # None if the hook does not define it
+    if not callable(hook_function):  # covers both a missing attribute and a non-callable one
+        raise AttributeError(
+            f"Docs asset hook must define a callable generate_docs_assets(context): {hook_path}"
+        )
+    return hook_function
+
+
+def _prepare_generated_dir(context: PackageDocsContext) -> None:
+    """Create the package's generated docs asset directory and keep it untracked."""
+    context.generated_dir.mkdir(parents=True, exist_ok=True)  # no error if it already exists
+    (context.generated_dir / ".gitignore").write_text(_GENERATED_ASSET_GITIGNORE, encoding="utf-8")
+
+
+def _build_context(project_root: Path, namespace_package: str) -> PackageDocsContext:
+    package_name = namespace_package.split(".")[-1]  # last dotted component, e.g. "a.b" -> "b"
+    package_root = project_root / "packages" / package_name
+    docs_root = package_root / "docs"
+    generated_dir = docs_root / "_generated"  # paths are only computed here; nothing is created
+    ctx = PackageDocsContext(
+        project_root=project_root,
+        namespace_package=namespace_package,
+        package_name=package_name,
+        package_root=package_root,
+        docs_root=docs_root,
+        generated_dir=generated_dir,
+    )
+    return ctx
+
+
+def _generate_assets_for_package(
+    *,
+    project_root: Path,
+    namespace_package: str,
+    verbose: bool,
+) -> bool:  # True if a hook file was found and run, False if the package has no hook file
+    context = _build_context(project_root, namespace_package)
+    hook_path = context.docs_root / "_on_doc_generation.py"
+    if not hook_path.exists():
+        if verbose:
+            print(f"No docs asset hook for {context.package_name}")
+        return False  # a missing hook file is not treated as an error
+
+    if verbose:
+        print(f"Running docs asset hook for {context.package_name}: {hook_path}")
+    module = _load_hook_module(hook_path, context.package_name)
+    _prepare_generated_dir(context)  # runs before the callable check on the next line
+    hook_function = _get_hook_function(module, hook_path)
+    hook_function(context)  # exceptions raised by the hook are not caught in this function
+    print(f"Generated docs assets for {context.package_name}")  # printed regardless of verbose
+    return True
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Run optional package-local documentation asset generation hooks.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose output.",
+    )
+    parser.add_argument(
+        "--package",
+        dest="package_names",  # read by main() as args.package_names
+        action="append",  # each --package occurrence adds one entry to the list
+        help="Package name to process, such as lane_helpers. Can be passed more than once.",
+    )
+    return parser.parse_args()  # parses sys.argv
+
+
+def main() -> int:
+    args = _parse_args()
+    docs_dir = Path(__file__).resolve().parent
+    project_root = docs_dir.parent
+    sys.path.insert(0, str(project_root))  # so namespace_packages_config can be imported below
+
+    try:
+        from namespace_packages_config import NAMESPACE_PACKAGES
+    except ImportError as exc:
+        print(
+            f"Error: Could not import NAMESPACE_PACKAGES from namespace_packages_config.py: {exc}",
+            file=sys.stderr,
+        )
+        return 1
+
+    package_filter = set(args.package_names or [])  # empty set means no filtering
+    namespace_packages = [
+        namespace_package
+        for namespace_package in NAMESPACE_PACKAGES
+        if not package_filter or namespace_package.split(".")[-1] in package_filter
+    ]
+    if package_filter and len(namespace_packages) != len(package_filter):  # counts differ -> error
+        found_package_names = {namespace_package.split(".")[-1] for namespace_package in namespace_packages}
+        missing_package_names = sorted(package_filter - found_package_names)
+        print(f"Error: Unknown namespace package(s): {', '.join(missing_package_names)}", file=sys.stderr)
+        return 1
+
+    hook_count = 0  # number of packages whose hook actually ran
+    for namespace_package in namespace_packages:
+        package_name = namespace_package.split(".")[-1]
+        try:
+            hook_ran = _generate_assets_for_package(
+                project_root=project_root,
+                namespace_package=namespace_package,
+                verbose=args.verbose,
+            )
+        except Exception as exc:  # any hook failure is reported and stops at the first failing package
+            print(f"Error: docs asset generation failed for {package_name}: {exc}", file=sys.stderr)
+            return 1
+        if hook_ran:
+            hook_count += 1
+
+    if args.verbose:
+        print(f"Ran {hook_count} package docs asset hook(s).")
+    return 0  # 0 on success; every error path above returns 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main()'s return value becomes the process exit status
diff --git a/docs/guides/DEVELOPMENT_GUIDE.md b/docs/guides/DEVELOPMENT_GUIDE.md
index ffb1f5a..08077d4 100644
--- a/docs/guides/DEVELOPMENT_GUIDE.md
+++ b/docs/guides/DEVELOPMENT_GUIDE.md
@@ -48,7 +48,8 @@ There are two example projects which showcase how a namespace package is structu
 - `packages/example_package`: Showcases a package containing PyTorch extensions built using
   `CppExtension` and `CUDAExtension` provided by PyTorch as well as an external implementation (see
   [External Implementations](#external-implementations) section for more details on external implementations)
-  as described below.
+  as described below. It also includes a package-local documentation asset hook that generates a simple plot
+  from committed CSV data under `evaluation_results/` during the docs build.
 - `packages/example_skbuild_package`: Showcases a package using `scikit-build` for C++/CUDA implementation 
   (see the [Alternative: SKBuild-Based Packages](#alternative-skbuild-based-packages) section for more 
   details on this approach).
@@ -70,6 +71,8 @@ To add a new namespace package (e.g., `example_package`), you need to create:
 | **Setup** | `packages/example_package/setup.py` | Package build configuration |
 | **Project Config** | `packages/example_package/pyproject.toml` | Modern Python project configuration and authoritative dependency definition |
 | **Documentation include list (optional)** | `packages/example_package/docu_referenced_dirs.txt` | List additional directories referenced by the docs (besides `docs/`). See [Documentation Setup Guide](DOCUMENTATION_SETUP_GUIDE.md) for more details.|
+| **Documentation asset hook (optional)** | `packages/example_package/docs/_on_doc_generation.py` | Generate package-owned docs assets such as plots from committed evaluation data. See [Documentation Setup Guide](DOCUMENTATION_SETUP_GUIDE.md#package-local-generated-assets). |
+| **Evaluation results (optional)** | `packages/example_package/evaluation_results/` | Package-owned committed inputs for generating docs assets, such as data to plot. |
 
 > **ℹ️ Note**: Apart from the above, further folders/files can be included (and made use of manually or added to the 
 > documentation) if needed. A typical use case is to include e.g. an `examples` directory which is:
@@ -84,26 +87,29 @@ The following diagram shows the relevant project structure containing the folder
 
 ```
 accvlab/
-├── packages/                        # Namespace packages directory
+├── packages/                         # Namespace packages directory
 │   ├── optim_test_tools/...
 │   ├── batching_helpers/...
-│   └── example_package/             # ← New namespace package
-│       ├── accvlab/                 # ← Namespace root
-│       │   └── example_package/     # ← Implementation for "example_package" package
+│   └── example_package/              # ← New namespace package
+│       ├── accvlab/                  # ← Namespace root
+│       │   └── example_package/      # ← Implementation for "example_package" package
 │       │       ├── __init__.py
-│       │       ├── csrc/            # ← C++/CUDA sources
-│       │       └── include/         # ← Headers
-│       ├── ext_impl/                # ← Optional: external implementation
+│       │       ├── csrc/             # ← C++/CUDA sources
+│       │       └── include/          # ← Headers
+│       ├── ext_impl/                 # ← Optional: external implementation
 │       │   ├── build_and_copy.sh
 │       │   └── ...
-│       ├── tests/                   # ← Tests for "example_package" package
-│       ├── docs/                    # ← Documentation for "example_package" package
-│       ├── setup.py                 # ← Package build configuration
-│       ├── pyproject.toml           # ← Project configuration (including dependencies)
-│       └── docu_referenced_dirs.txt # ← Optional: list additional directories referenced by the docs (besides `docs/`)
-├── build_config/                    # Shared build utilities
-├── docs/                            # Main documentation
-└── namespace_packages_config.py     # ← Namespace package needs to be listed here
+│       ├── tests/                    # ← Tests for "example_package" package
+│       ├── evaluation_results/       # ← Optional committed inputs for generated docs assets
+│       ├── docs/                     # ← Documentation for "example_package" package
+│       │   ├── _on_doc_generation.py # ← Optional docs asset hook
+│       │   └── ...
+│       ├── setup.py                  # ← Package build configuration
+│       ├── pyproject.toml            # ← Project configuration (including dependencies)
+│       └── docu_referenced_dirs.txt  # ← Optional: list additional directories referenced by the docs (besides `docs/`)
+├── build_config/                     # Shared build utilities
+├── docs/                             # Main documentation
+└── namespace_packages_config.py      # ← Namespace package needs to be listed here
 ```
 
 Note that inside the package, there is the directory structure `accvlab/example_package`. This is where the 
@@ -238,6 +244,11 @@ root = "../.."
 
 Use this pattern for your own namespace package, adapting the dependency names as needed.
 
+Use `[project.optional-dependencies].optional` for dependencies needed by tests, examples, or package-local
+documentation asset hooks, but not by the core package at runtime. For example, if a docs hook generates plots
+from committed data, put the plotting library in the package's optional dependencies rather than in the base
+`[project].dependencies`.
+
 > **ℹ️ Note**: The `accvlab-build-config @ file:../../build_config` build dependency is intentionally a
 > local path reference. From a package under `packages/<package_name>/`, it resolves to the repository's `build_config/` package 
 > so isolated pip builds use the local helper package. See
@@ -317,6 +328,18 @@ Most of the contained packages extend this basic structure considerably to provi
 documentation. Please see the [Documentation Setup Guide](DOCUMENTATION_SETUP_GUIDE.md) for more details on 
 the documentation system and how to set it up.
 
+If your package needs generated docs assets, add `packages/<package_name>/docs/_on_doc_generation.py`. The
+documentation build creates `packages/<package_name>/docs/_generated/`, keeps it untracked, and passes that
+directory to the hook. Keep user-facing `.rst`/`.md` files static and reference generated assets with relative
+paths such as `_generated/<asset_name>.png`. The hook should generate those assets from committed inputs and
+fail clearly if required inputs are missing. Store committed plot or evaluation inputs outside the package
+`docs/` folder, for example under `packages/<package_name>/evaluation_results/`, so Sphinx does not discover
+data tables as standalone documentation pages.
+
+> **⚠️ Important**: Documentation asset hooks must not run evaluations, benchmarks, or other measurement
+> workflows. They should only regenerate documentation assets, such as plots, from data that is already
+> available in the repository.
+
 #### 8. Test Your Package
 
 ```bash
@@ -352,6 +375,10 @@ When adding a new namespace package, ensure you have:
 - [ ] **Documentation**: Generated with docs scripts and customized intro
 - [ ] **Documentation include list (optional)**: `docu_referenced_dirs.txt` created and populated if extra 
   folders (e.g. `examples/`) are referenced and are needed to build the documentation
+- [ ] **Documentation asset hook (optional)**: `_on_doc_generation.py` added if the package needs generated
+  documentation assets
+- [ ] **Evaluation results (optional)**: `packages/<package_name>/evaluation_results/` contains committed
+  inputs for generated docs assets if needed
 - [ ] **Examples (optional)**: `packages/<package_name>/examples/` created and referenced from docs if used
 - [ ] **Dependencies**: Declared runtime and optional dependencies in `pyproject.toml`
 - [ ] **External implementation**: (Optional) `packages/<package_name>/ext_impl/` for external builds
diff --git a/docs/guides/DOCUMENTATION_SETUP_GUIDE.md b/docs/guides/DOCUMENTATION_SETUP_GUIDE.md
index 521725f..667ce0e 100644
--- a/docs/guides/DOCUMENTATION_SETUP_GUIDE.md
+++ b/docs/guides/DOCUMENTATION_SETUP_GUIDE.md
@@ -9,6 +9,7 @@ The documentation system provides:
 
 - **Explicit namespace package configuration** through `namespace_packages_config.py`
 - **Dynamic documentation generation** for each configured namespace package
+- **Optional package-local asset generation** for generated documentation assets such as plots
 - **Comprehensive API reference** with auto-generated content (extracted from docstrings)
 - **Referenced directories mirroring** to access files from the individual namespace packages in the 
   documentation by
@@ -43,6 +44,10 @@ The documentation generation makes use of multiple scripts:
   - **Template-based**: Uses consistent templates for all namespace packages (but generated files may be 
     modified as needed)
   - **Safe regeneration**: Only creates missing files if no `index.rst` is present for the namespace package
+- **`generate_package_docs_assets.py`**: Runs optional package-local documentation asset hooks
+  - **Package-owned**: Each package can decide whether it needs generated assets and how to create them
+  - **Format-agnostic**: The hook can read any package-owned input files and write any output files in the output folder;
+    the core docs system does not prescribe a data format
 - **`update_docs_index.py`**: Updates main index file by including references to newly added namespace 
   packages
 - **`mirror_referenced_dirs.py`**: Mirrors (symlinks by default) the `docs` directory and other needed 
@@ -64,27 +69,28 @@ The documentation generation makes use of multiple scripts:
 #### Main Documentation Directory (`docs/`)
 ```
 docs/
-├── conf.py                        # Sphinx configuration using namespace_packages_config
-├── index.rst                      # Main documentation index
+├── conf.py                         # Sphinx configuration using namespace_packages_config
+├── index.rst                       # Main documentation index
 ├── generate_new_namespace_package_docs.py   # Creates structure for new namespace packages
-├── update_docs_index.py           # Updates navigation and indices
-├── mirror_referenced_dirs.py      # Mirrors referenced directories (symlinks by default)
-├── sync_root_readme_for_docs.py   # Syncs project root README into docs/project_overview
-├── Makefile                       # Build commands
-├── requirements.txt               # Documentation dependencies
-├── project_overview/              # Synced copy of the project root README used as docs overview
-├── contained_package_docs_mirror/ # Mirrored package documentation via symlinks (or copies)
-│   ├── example_package/           # Example namespace package docs (representative)
-│   │   ├── docs/                  # Documentation files
-│   │   │   ├── index.rst          # Namespace package overview
-│   │   │   ├── intro.rst          # Introduction (manual content)
-│   │   │   └── api.rst            # API reference (auto-generated)
-│   │   └── examples/              # Additional mirrored directory (referenced in docs)
-│   └── [other_packages]/          # Other configured namespace packages
-├── common/                        # Shared documentation resources
+├── generate_package_docs_assets.py # Runs optional package-local docs asset hooks
+├── update_docs_index.py            # Updates navigation and indices
+├── mirror_referenced_dirs.py       # Mirrors referenced directories (symlinks by default)
+├── sync_root_readme_for_docs.py    # Syncs project root README into docs/project_overview
+├── Makefile                        # Build commands
+├── requirements.txt                # Documentation dependencies
+├── project_overview/               # Synced copy of the project root README used as docs overview
+├── contained_package_docs_mirror/  # Mirrored package documentation via symlinks (or copies)
+│   ├── example_package/            # Example namespace package docs (representative)
+│   │   ├── docs/                   # Documentation files
+│   │   │   ├── index.rst           # Namespace package overview
+│   │   │   ├── intro.rst           # Introduction (manual content)
+│   │   │   └── api.rst             # API reference (auto-generated)
+│   │   └── examples/               # Additional mirrored directory (referenced in docs)
+│   └── [other_packages]/           # Other configured namespace packages
+├── common/                         # Shared documentation resources
 ├── _static/css/
-│           └── custom.css         # Custom styling
-└── _build/                        # Built documentation output
+│           └── custom.css          # Custom styling
+└── _build/                         # Built documentation output
 ```
 
 **Notes**:
@@ -103,7 +109,10 @@ packages/
     ├── docs/                      # Source documentation files
     │   ├── index.rst              # Namespace package overview
     │   ├── intro.rst              # Introduction (manual content)
-    │   └── api.rst                # API reference (auto-generated)
+    │   ├── api.rst                # API reference (auto-generated)
+    │   ├── _on_doc_generation.py  # Optional package-local docs asset hook
+    │   └── _generated/            # Generated assets created at docs build time
+    ├── evaluation_results/        # Optional committed inputs for generated docs assets
     ├── docu_referenced_dirs.txt   # List of additional directories to copy
     ├── examples/                  # Example code (mirrored and referenced by docs)
     └── [other_dirs]/              # Other package directories
@@ -112,6 +121,9 @@ packages/
 
 **Notes**:
 - The `packages/example_package/` structure shows the source documentation that gets mirrored during build
+- The `example_package` includes a small generated plot example: committed CSV data under
+  `packages/example_package/evaluation_results/` is converted into an image under
+  `packages/example_package/docs/_generated/` during the docs build
 - **⚠️ Important**: Content should be edited in the source locations (`packages/<package_name>/docs/`), not in 
   the mirrored locations
 - In case of the `example_package`, the `examples/` directory is mirrored to maintain documentation references 
@@ -172,6 +184,65 @@ etc.) can still be found after the documentation is mirrored to the build locati
 - Only list additional directories that are referenced by your documentation. Note that the API documentation
   does not rely on this mirroring, but is extracted from the installed packages.
 
+### Package-Local Generated Assets
+
+Packages can generate documentation assets during the docs build by adding an optional hook:
+
+```text
+packages/<package_name>/docs/_on_doc_generation.py
+```
+
+If present, `generate_package_docs_assets.py` imports the hook and calls:
+
+```python
+def generate_docs_assets(context):
+    ...
+```
+
+The hook receives a context with package and documentation paths, including:
+
+- `context.project_root`
+- `context.package_root`
+- `context.docs_root`
+- `context.generated_dir`
+
+The docs asset generator creates `context.generated_dir` before calling the hook. This directory is always:
+
+```text
+packages/<package_name>/docs/_generated/
+```
+
+It also writes a local `.gitignore` file there so generated assets remain untracked. The hook should write
+generated images or other generated files directly into `context.generated_dir`, or into subdirectories below
+it if a package needs additional structure.
+
+Source documentation files remain static. For example, an `.rst` file can reference a generated image with a
+normal relative path:
+
+```rst
+.. figure:: _generated/runtime_plot.png
+   :alt: Runtime plot
+```
+
+Packages own the input data and generation logic. For example, a package can commit benchmark result tables
+under `packages/<package_name>/evaluation_results/` and generate plots from those tables during the docs
+build. If a generated asset is required by the static docs, the hook should fail with a clear error when the
+required input data is missing or malformed.
+
+> **⚠️ Important**: Documentation asset hooks must not run evaluations, benchmarks, or other measurement
+> workflows. They should only regenerate documentation assets, such as plots, from data that is already
+> available in the repository. It is recommended to store results in simple formats such as `.csv` or `.md`,
+> and use those as the source of truth for the plots.
+>
+> Keep committed plot or evaluation inputs outside the package `docs/` folder, for example under
+> `packages/<package_name>/evaluation_results/`. This prevents Sphinx from discovering e.g. `.md` data tables as
+> standalone documentation pages while keeping the inputs package-local.
+
+Package-specific dependencies needed only by the hook should be declared in that package's optional
+dependencies in `pyproject.toml`. The default local installation path (`./scripts/install_local.sh`) installs
+optional package dependencies. If you build docs after installing packages without optional dependencies,
+package-local asset hooks may fail when their optional plotting or parsing dependencies are missing.
+
 ### Building Documentation Locally
 
 **Quick build using the script** (can be run from any directory, example shows running from the project 
@@ -208,6 +279,9 @@ make livehtml
   in sequence
 - The `html` target ensures all scripts run before building
 - The `livehtml` target also runs the scripts for development builds
+- Package-local docs asset hooks run before package docs are mirrored, so generated assets under
+  `packages/<package_name>/docs/_generated/` are available from both the package docs source tree and the
+  mirrored docs tree.
 - When running spelling via the script, the generation scripts are executed first to ensure mirrored package 
   docs are up to date. Spelling findings are written to `docs/_build/spelling/output.txt`.
 
@@ -220,6 +294,9 @@ make livehtml
 >  - It does **not** reinstall or rebuild packages for you. This means that if you change the docstrings in 
 >    the source tree of a package, you need to reinstall the package (for example via 
 >    `./scripts/install_local.sh`) and then restart `make livehtml` to see updated docstrings.
+>  - It does **not** rerun package-local docs asset hooks for you after startup. This means that if you change
+>    committed plot data or hook code, you need to restart `make livehtml` (or run `make generate`) to
+>    regenerate plots and other generated docs assets.
 
 ### Spell-checking
 
@@ -456,6 +533,10 @@ the per-package runtime dependencies defined in each package's `pyproject.toml`)
 - Theme packages
 - Other documentation-specific dependencies
 
+Package-specific docs asset dependencies belong to the corresponding package's optional dependencies. This keeps the 
+global documentation requirements focused on the Sphinx build itself while allowing package-owned hooks to declare their 
+own plotting or data-processing dependencies.
+
 ### File Descriptions
 
 #### Core Configuration Files
@@ -478,6 +559,12 @@ automatically as part of the docs build; you normally do not need to run them ma
 - **`packages/<package_name>/docs/index.rst`**: Namespace package overview (source)
 - **`packages/<package_name>/docs/intro.rst`**: Manual introduction content (source)
 - **`packages/<package_name>/docs/api.rst`**: Auto-generated API reference (source)
+- **`packages/<package_name>/docs/_on_doc_generation.py`**: Optional hook for package-local generated docs
+  assets
+- **`packages/<package_name>/docs/_generated/`**: Generated documentation assets created during docs
+  generation and ignored by Git
+- **`packages/<package_name>/evaluation_results/`**: Optional package-owned committed inputs for generated
+  docs assets, such as benchmark tables used for plots
 - **`packages/<package_name>/docu_referenced_dirs.txt`**: List of directories containing files used in the 
   documentation in addition to `docs` (to mirror into the documentation source directory).
 - **`docs/contained_package_docs_mirror/<package_name>/docs/`**: Mirrored documentation (symlink to the 
diff --git a/docs/guides/INSTALLATION_GUIDE.md b/docs/guides/INSTALLATION_GUIDE.md
index df0fe2e..b9fe12f 100644
--- a/docs/guides/INSTALLATION_GUIDE.md
+++ b/docs/guides/INSTALLATION_GUIDE.md
@@ -53,10 +53,11 @@ dependencies (needed for some tests and examples), pass the `--optional` flag ex
 ./scripts/package_manager.sh install -e --optional
 ```
 
-> **⚠️ Important**: Installing with optional dependencies is required if you plan to run the contained
-> tests, as they rely on optional dependencies such as `pytest` (and possibly other dependencies). It may be 
-> also required for the contained examples, as they may use additional packages which are otherwise 
-> not used in the core library.
+> **⚠️ Important**: Installing with optional dependencies is required for workflows that rely on packages
+> outside the core library, including contained tests, contained examples, and documentation generation.
+> Documentation generation may run package-local asset hooks, for example to regenerate plots from committed
+> data, and those hooks can require plotting or data-processing packages. Tests commonly require tools such as
+> `pytest` and may require further packages.
 
 The package manager script:
 - Automatically installs the required `accvlab_build_config` helper package (see the `build_config` directory
diff --git a/docs/index.rst b/docs/index.rst
index 273cc02..6492146 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -71,6 +71,7 @@ Please see the documentation of each namespace package for usage instructions (a
    contained_package_docs_mirror/batching_helpers/docs/index
    contained_package_docs_mirror/multi_tensor_copier/docs/index
    contained_package_docs_mirror/dali_pipeline_framework/docs/index
+   contained_package_docs_mirror/lane_helpers/docs/index
    contained_package_docs_mirror/draw_heatmap/docs/index
    contained_package_docs_mirror/optim_test_tools/docs/index
 
diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt
index bee6c51..bda1b02 100644
--- a/docs/spelling_wordlist.txt
+++ b/docs/spelling_wordlist.txt
@@ -212,3 +212,7 @@ ABI
 aggregator
 multimodal
 cubin
+Polyline
+polyline
+Polylines
+polylines
diff --git a/namespace_packages_config.py b/namespace_packages_config.py
index a94ab18..aaf9d3d 100644
--- a/namespace_packages_config.py
+++ b/namespace_packages_config.py
@@ -27,6 +27,7 @@
     'accvlab.batching_helpers',
     'accvlab.multi_tensor_copier',
     'accvlab.dali_pipeline_framework',
+    'accvlab.lane_helpers',
     'accvlab.draw_heatmap',
     'accvlab.optim_test_tools',
     # Add new namespace packages in the same way as above
diff --git a/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h b/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h
index 9e66681..4df9745 100644
--- a/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h
+++ b/packages/batching_helpers/accvlab/batching_helpers/cpp_impl/batched_indexing_access_helpers.h
@@ -64,48 +64,48 @@
 #define DISPATCH_INDEX_TYPES(TYPE, NAME, ...) \
     AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_INDEX_TYPES(__VA_ARGS__))
 
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CPU(x) AT_ASSERTM(x.is_cpu(), #x " must be a CPU tensor")
-#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_SAME_CUDA_DEVICE(tensors_list...)                                                        \
-    {                                                                                                  \
-        const std::vector<torch::Tensor> tensors = {tensors_list};                                     \
-        CHECK_CUDA(tensors[0]);                                                                        \
-        const auto& device = tensors[0].device();                                                      \
-        for (size_t i = 1; i < tensors.size(); ++i) {                                                  \
-            AT_ASSERTM(tensors[i].device() == device, "All input tensors must be on the same device"); \
-        }                                                                                              \
+#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CPU(x) TORCH_CHECK(x.is_cpu(), #x " must be a CPU tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_SAME_CUDA_DEVICE(tensors_list...)                                                         \
+    {                                                                                                   \
+        const std::vector<torch::Tensor> tensors = {tensors_list};                                      \
+        CHECK_CUDA(tensors[0]);                                                                         \
+        const auto& device = tensors[0].device();                                                       \
+        for (size_t i = 1; i < tensors.size(); ++i) {                                                   \
+            TORCH_CHECK(tensors[i].device() == device, "All input tensors must be on the same device"); \
+        }                                                                                               \
     }
-#define CHECK_SAME_DTYPE(error_msg, tensors_list...)                                     \
-    {                                                                                    \
-        const std::vector<torch::Tensor> tensors = {tensors_list};                       \
-        for (size_t i = 1; i < tensors.size(); ++i) {                                    \
-            AT_ASSERTM(tensors[i].scalar_type() == tensors[0].scalar_type(), error_msg); \
-        }                                                                                \
+#define CHECK_SAME_DTYPE(error_msg, tensors_list...)                                      \
+    {                                                                                     \
+        const std::vector<torch::Tensor> tensors = {tensors_list};                        \
+        for (size_t i = 1; i < tensors.size(); ++i) {                                     \
+            TORCH_CHECK(tensors[i].scalar_type() == tensors[0].scalar_type(), error_msg); \
+        }                                                                                 \
     }
 
-#define CHECK_SIZE_MATCH(tensor1, tensor2)                                                    \
-    {                                                                                         \
-        /* If the tensors are empty, the actual sizes are not relevant */                     \
-        if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) {                            \
-            AT_ASSERTM((tensor1).dim() == (tensor2).dim(),                                    \
-                       #tensor1 " and " #tensor2 " must have the same number of dimensions"); \
-            for (size_t i = 0; i < (tensor1).dim(); ++i) {                                    \
-                AT_ASSERTM((tensor1).size(i) == (tensor2).size(i),                            \
-                           #tensor1 " and " #tensor2 " must have the same size");             \
-            }                                                                                 \
-        }                                                                                     \
+#define CHECK_SIZE_MATCH(tensor1, tensor2)                                                     \
+    {                                                                                          \
+        /* If the tensors are empty, the actual sizes are not relevant */                      \
+        if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) {                             \
+            TORCH_CHECK((tensor1).dim() == (tensor2).dim(),                                    \
+                        #tensor1 " and " #tensor2 " must have the same number of dimensions"); \
+            for (size_t i = 0; i < (tensor1).dim(); ++i) {                                     \
+                TORCH_CHECK((tensor1).size(i) == (tensor2).size(i),                            \
+                            #tensor1 " and " #tensor2 " must have the same size");             \
+            }                                                                                  \
+        }                                                                                      \
     }
 
 #define CHECK_SIZE_MATCH_FIRST_DIMS(tensor1, tensor2, num_dims_to_check)                                     \
     {                                                                                                        \
         /* If the tensors are empty, the actual sizes are not relevant */                                    \
         if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) {                                           \
-            AT_ASSERTM((tensor1).dim() >= (num_dims_to_check) && (tensor2).dim() >= (num_dims_to_check),     \
-                       #tensor1 " and " #tensor2 " must have at least " +                                    \
-                           std::to_string(num_dims_to_check) + " dimensions");                               \
+            TORCH_CHECK((tensor1).dim() >= (num_dims_to_check) && (tensor2).dim() >= (num_dims_to_check),    \
+                        #tensor1 " and " #tensor2 " must have at least " +                                   \
+                            std::to_string(num_dims_to_check) + " dimensions");                              \
             for (size_t i = 0; i < (num_dims_to_check); ++i) {                                               \
-                AT_ASSERTM(                                                                                  \
+                TORCH_CHECK(                                                                                 \
                     (tensor1).size(i) == (tensor2).size(i),                                                  \
                     #tensor1 " and " #tensor2 " must have the same size in dimension " + std::to_string(i)); \
             }                                                                                                \
@@ -116,35 +116,35 @@
     {                                                                                                        \
         /* If the tensors are empty, the actual sizes are not relevant */                                    \
         if (!((tensor1).numel() == 0 && (tensor2).numel() == 0)) {                                           \
-            AT_ASSERTM((tensor1).dim() == (tensor2).dim(),                                                   \
-                       #tensor1 " and " #tensor2 " must have the same number of dimensions");                \
+            TORCH_CHECK((tensor1).dim() == (tensor2).dim(),                                                  \
+                        #tensor1 " and " #tensor2 " must have the same number of dimensions");               \
             for (size_t i = 0; i < (tensor1).dim(); ++i) {                                                   \
                 if (i == (dim_to_allow_mismatch)) {                                                          \
                     continue;                                                                                \
                 }                                                                                            \
-                AT_ASSERTM(                                                                                  \
+                TORCH_CHECK(                                                                                 \
                     (tensor1).size(i) == (tensor2).size(i),                                                  \
                     #tensor1 " and " #tensor2 " must have the same size in dimension " + std::to_string(i)); \
             }                                                                                                \
         }                                                                                                    \
     }
 
-#define CHECK_NUM_DIMS(tensor, num_dims)                                                  \
-    {                                                                                     \
-        /* If the tensor is empty, the number of dimensions is not relevant */            \
-        if (!((tensor).numel() == 0)) {                                                   \
-            AT_ASSERTM((tensor).dim() == (num_dims),                                      \
-                       #tensor " must have " + std::to_string(num_dims) + " dimensions"); \
-        }                                                                                 \
+#define CHECK_NUM_DIMS(tensor, num_dims)                                                   \
+    {                                                                                      \
+        /* If the tensor is empty, the number of dimensions is not relevant */             \
+        if (!((tensor).numel() == 0)) {                                                    \
+            TORCH_CHECK((tensor).dim() == (num_dims),                                      \
+                        #tensor " must have " + std::to_string(num_dims) + " dimensions"); \
+        }                                                                                  \
     }
 
-#define CHECK_NUM_DIMS_AT_LEAST(tensor, num_dims)                                                  \
-    {                                                                                              \
-        /* If the tensor is empty, the number of dimensions is not relevant */                     \
-        if (!((tensor).numel() == 0)) {                                                            \
-            AT_ASSERTM((tensor).dim() >= (num_dims),                                               \
-                       #tensor " must have at least " + std::to_string(num_dims) + " dimensions"); \
-        }                                                                                          \
+#define CHECK_NUM_DIMS_AT_LEAST(tensor, num_dims)                                                   \
+    {                                                                                               \
+        /* If the tensor is empty, the number of dimensions is not relevant */                      \
+        if (!((tensor).numel() == 0)) {                                                             \
+            TORCH_CHECK((tensor).dim() >= (num_dims),                                               \
+                        #tensor " must have at least " + std::to_string(num_dims) + " dimensions"); \
+        }                                                                                           \
     }
 
 static inline int64_t get_number_data_elements_per_index(const torch::Tensor& input_data,
diff --git a/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu b/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu
index ed5e9c4..85d5d38 100644
--- a/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu
+++ b/packages/draw_heatmap/accvlab/draw_heatmap/csrc/draw_heatmap_cuda.cu
@@ -20,8 +20,8 @@
 #include <cuda_runtime.h>
 #include <torch/extension.h>
 
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
 #define CHECK_INPUT(x) \
     CHECK_CUDA(x);     \
     CHECK_CONTIGUOUS(x);
@@ -69,11 +69,11 @@ void draw_heatmap_launcher(at::Tensor& heatmap, const at::Tensor& centers, const
     CHECK_INPUT(radii);
     CHECK_INPUT(heatmap_idxes);
 
-    AT_ASSERTM(centers.size(0) == radii.size(0), "centers and radii must have the same size at dim0");
-    AT_ASSERTM(centers.size(0) == heatmap_idxes.size(0),
-               "centers and heatmap_idxes must have the same size at dim0");
-    AT_ASSERTM(heatmap.dim() == 3, "heatmap must be of shape [num_heatmaps, height, width]");
-    AT_ASSERTM(centers.dim() == 2 && centers.size(1) == 2, "centers must be of shape [num_targets, 2]");
+    TORCH_CHECK(centers.size(0) == radii.size(0), "centers and radii must have the same size at dim0");
+    TORCH_CHECK(centers.size(0) == heatmap_idxes.size(0),
+                "centers and heatmap_idxes must have the same size at dim0");
+    TORCH_CHECK(heatmap.dim() == 3, "heatmap must be of shape [num_heatmaps, height, width]");
+    TORCH_CHECK(centers.dim() == 2 && centers.size(1) == 2, "centers must be of shape [num_targets, 2]");
 
     const int num_targets = centers.size(0);
     const int num_heatmaps = heatmap.size(0);
@@ -101,15 +101,15 @@ void draw_heatmap_batched_launcher(at::Tensor& heatmap, const at::Tensor& center
 
     const int batch_size = heatmap.size(0);
     const int num_targets = radii.size(1);
-    AT_ASSERTM(
+    TORCH_CHECK(
         batch_size == radii.size(0) && batch_size == centers.size(0) && batch_size == nums_targets.size(0),
         "batch_size (dim 0) need to be the same for all inputs");
-    AT_ASSERTM(num_targets == centers.size(1),
-               "maximum number of targets (dim 1) need to be the same centers and radii");
-    AT_ASSERTM(heatmap.dim() == 3, "heatmap must be of shape [batch_size, height, width]");
-    AT_ASSERTM(centers.dim() == 3 && centers.size(2) == 2,
-               "centers must be of shape [batch_size, num_targets, 2]");
-    AT_ASSERTM(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]");
+    TORCH_CHECK(num_targets == centers.size(1),
+                "maximum number of targets (dim 1) need to be the same centers and radii");
+    TORCH_CHECK(heatmap.dim() == 3, "heatmap must be of shape [batch_size, height, width]");
+    TORCH_CHECK(centers.dim() == 3 && centers.size(2) == 2,
+                "centers must be of shape [batch_size, num_targets, 2]");
+    TORCH_CHECK(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]");
 
     const int height = heatmap.size(1);
     const int width = heatmap.size(2);
@@ -138,23 +138,23 @@ void draw_heatmap_batched_classwise_launcher(at::Tensor& heatmap, const at::Tens
 
     const int batch_size = heatmap.size(0);
     const int num_targets = radii.size(1);
-    AT_ASSERTM(
+    TORCH_CHECK(
         batch_size == radii.size(0) && batch_size == centers.size(0) && batch_size == nums_targets.size(0),
         "batch_size (dim 0) need to be the same for all inputs");
-    AT_ASSERTM(num_targets == centers.size(1),
-               "maximum number of targets (dim 1) need to be the same centers and radii");
-    AT_ASSERTM(heatmap.dim() == 4, "heatmap must be of shape [batch_size, max_num_classes, height, width]");
-    AT_ASSERTM(centers.dim() == 3 && centers.size(2) == 2,
-               "centers must be of shape [batch_size, num_targets, 2]");
-    AT_ASSERTM(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]");
+    TORCH_CHECK(num_targets == centers.size(1),
+                "maximum number of targets (dim 1) need to be the same centers and radii");
+    TORCH_CHECK(heatmap.dim() == 4, "heatmap must be of shape [batch_size, max_num_classes, height, width]");
+    TORCH_CHECK(centers.dim() == 3 && centers.size(2) == 2,
+                "centers must be of shape [batch_size, num_targets, 2]");
+    TORCH_CHECK(radii.dim() == 2, "radii must be of shape [batch_size, num_targets]");
 
     const int height = heatmap.size(2);
     const int width = heatmap.size(3);
     const int max_num_classes = heatmap.size(1);
     // Validate labels tensor shape and range before launching the kernel
-    AT_ASSERTM(labels.dim() == 2, "labels must be of shape [batch_size, radii.size(1)]");
-    AT_ASSERTM(labels.size(0) == batch_size && labels.size(1) == num_targets,
-               "labels shape must be [batch_size, radii.size(1)]");
+    TORCH_CHECK(labels.dim() == 2, "labels must be of shape [batch_size, radii.size(1)]");
+    TORCH_CHECK(labels.size(0) == batch_size && labels.size(1) == num_targets,
+                "labels shape must be [batch_size, radii.size(1)]");
     AT_DISPATCH_FLOATING_TYPES(
         heatmap.scalar_type(), "draw_heatmap_cuda_batched", ([&] {
             draw_heatmap_batched_cuda(
diff --git a/packages/example_package/docs/_on_doc_generation.py b/packages/example_package/docs/_on_doc_generation.py
new file mode 100644
index 0000000..28bb331
--- /dev/null
+++ b/packages/example_package/docs/_on_doc_generation.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+from pathlib import Path
+from typing import Any
+
+import matplotlib
+
+matplotlib.use("Agg")  # select the non-interactive Agg backend before pyplot is imported
+from matplotlib import pyplot as plt
+
+_DATA_FILE = Path("evaluation_results") / "simple_plot.csv"  # joined onto context.package_root
+_OUTPUT_FILE = "simple_plot.png"  # file name joined onto context.generated_dir
+
+
+def _read_plot_data(input_file: Path) -> tuple[list[float], list[float]]:
+    if not input_file.exists():
+        raise FileNotFoundError(f"Required example plot input data is missing: {input_file}")
+
+    with input_file.open("r", encoding="utf-8", newline="") as csv_file:
+        reader = csv.DictReader(csv_file)
+        if reader.fieldnames != ["x", "y"]:
+            raise ValueError(f"Expected CSV columns 'x,y' in {input_file}")
+        x_values: list[float] = []
+        y_values: list[float] = []
+        for row in reader:
+            x_values.append(float(row["x"]))
+            y_values.append(float(row["y"]))
+
+    if not x_values:
+        raise ValueError(f"Expected at least one data row in {input_file}")
+    return x_values, y_values
+
+
+def generate_docs_assets(context: Any) -> None:
+    input_file = context.package_root / _DATA_FILE
+    output_file = context.generated_dir / _OUTPUT_FILE
+    x_values, y_values = _read_plot_data(input_file)
+
+    figure, axis = plt.subplots(figsize=(5.0, 3.2), constrained_layout=True)
+    axis.plot(x_values, y_values, marker="o")
+    axis.set_title("Generated Example Plot")
+    axis.set_xlabel("x")
+    axis.set_ylabel("y")
+    axis.grid(True)
+    figure.savefig(output_file)
+    plt.close(figure)
diff --git a/packages/example_package/docs/intro.rst b/packages/example_package/docs/intro.rst
index 104d8ec..237fba1 100644
--- a/packages/example_package/docs/intro.rst
+++ b/packages/example_package/docs/intro.rst
@@ -56,6 +56,21 @@ Examples
 For examples, see :doc:`examples`. The example makes use of ``note-literalinclude`` to include the 
 example code in the documentation and highlight notes in the code (comment blocks starting with ``# @NOTE``).
 
+Generated Documentation Assets
+------------------------------
+
+This package also demonstrates package-local documentation asset generation. The docs build reads committed
+data from ``evaluation_results/simple_plot.csv`` and writes the generated plot to
+``docs/_generated/simple_plot.png``. The source documentation remains static and references the generated
+image using a normal relative path.
+
+.. figure:: _generated/simple_plot.png
+   :alt: Simple generated plot from committed CSV data
+   :align: center
+   :width: 70%
+
+   Example plot generated from committed CSV data during documentation generation.
+
 .. toctree::
    :maxdepth: 2
    :caption: Examples
diff --git a/packages/example_package/evaluation_results/simple_plot.csv b/packages/example_package/evaluation_results/simple_plot.csv
new file mode 100644
index 0000000..9675110
--- /dev/null
+++ b/packages/example_package/evaluation_results/simple_plot.csv
@@ -0,0 +1,6 @@
+x,y
+0,0
+1,1
+2,4
+3,9
+4,16
diff --git a/packages/example_package/pyproject.toml b/packages/example_package/pyproject.toml
index ef14420..cbe6014 100644
--- a/packages/example_package/pyproject.toml
+++ b/packages/example_package/pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
 
 [project.optional-dependencies]
 optional = [
+    "matplotlib",
     "pytest",
 ]
 
diff --git a/packages/lane_helpers/accvlab/lane_helpers/__init__.py b/packages/lane_helpers/accvlab/lane_helpers/__init__.py
new file mode 100644
index 0000000..e5b5150
--- /dev/null
+++ b/packages/lane_helpers/accvlab/lane_helpers/__init__.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from importlib.metadata import PackageNotFoundError, version
+
+from . import polyline
+
+try:
+    __version__ = version("accvlab.lane_helpers")
+except PackageNotFoundError:  # no installed distribution metadata for this package
+    __version__ = "0.0.0"
+
+
+__all__ = [
+    "__version__",
+    "polyline",
+]
diff --git a/packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py b/packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py
new file mode 100644
index 0000000..e1ec3a1
--- /dev/null
+++ b/packages/lane_helpers/accvlab/lane_helpers/polyline/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .functions import (
+    interpolate,
+    interpolate_var_size_batch,
+    lengths,
+    lengths_var_size_batch,
+)
+
+__all__ = [
+    "interpolate",
+    "interpolate_var_size_batch",
+    "lengths",
+    "lengths_var_size_batch",
+]
diff --git a/packages/lane_helpers/accvlab/lane_helpers/polyline/functions.py b/packages/lane_helpers/accvlab/lane_helpers/polyline/functions.py
new file mode 100644
index 0000000..417575b
--- /dev/null
+++ b/packages/lane_helpers/accvlab/lane_helpers/polyline/functions.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from .. import _polyline_sampling
+
+if TYPE_CHECKING:
+    from accvlab.batching_helpers import RaggedBatch
+
+
+def _require(condition: bool, message: str) -> None:
+    """Raise ``AssertionError(message)`` unless ``condition`` holds.
+
+    Used instead of a bare ``assert`` so that argument validation is not stripped when Python runs
+    with ``-O``; the exception type and messages match what an ``assert`` would have produced.
+    """
+    if not condition:
+        raise AssertionError(message)
+
+
+def interpolate(points: torch.Tensor, distances: torch.Tensor, *, relative: bool = False) -> torch.Tensor:
+    """Interpolate batched polylines at requested distances.
+
+    Args:
+        points: CPU or CUDA tensor with shape ``(batch, num_points, num_dims)``.
+        distances: Tensor with shape ``(batch, num_distances)`` on the same device as ``points``.
+            Distances below zero are clamped to the first point of the polyline. Distances beyond the
+            total polyline length are clamped to the last point. When ``relative=True``, this corresponds
+            to clamping values below ``0`` and above ``1``.
+        relative: If ``True``, interpret ``distances`` as fractions of each polyline's total length.
+            If ``False``, interpret them as absolute distances from the start of each polyline.
+
+    Returns:
+        Tensor with shape ``(batch, num_distances, num_dims)`` on the same device as ``points``.
+    """
+    # NOTE(review): this is the only backend entry point called without a leading underscore
+    # (compare ``_polyline_lengths`` and the ``*_var_size_batch`` functions) -- confirm the name
+    # matches what ``_polyline_sampling`` actually exports.
+    return _polyline_sampling.polyline_interpolation(points, distances, relative=relative)
+
+
+def lengths(points: torch.Tensor) -> torch.Tensor:
+    """Compute the total length of each polyline in a fixed-size batch.
+
+    Args:
+        points: CPU or CUDA tensor with shape ``(batch, num_points, num_dims)``.
+
+    Returns:
+        Tensor with shape ``(batch,)`` on the same device as ``points``.
+    """
+    return _polyline_sampling._polyline_lengths(points)
+
+
+def interpolate_var_size_batch(
+    points: RaggedBatch, distances: RaggedBatch, *, relative: bool = False
+) -> RaggedBatch:
+    """Interpolate variable-length batched polylines at requested distances.
+
+    Args:
+        points: RaggedBatch-like object with tensor data on CPU or CUDA and shape
+            ``(batch, max_num_points, num_dims)``.
+        distances: RaggedBatch-like object with shape ``(batch, max_num_distances)`` and tensor data
+            on the same device as ``points``. Distances below zero are clamped to the first point of the
+            polyline. Distances beyond the total polyline length are clamped to the last point. When
+            ``relative=True``, this corresponds to clamping values below ``0`` and above ``1``.
+        relative: If ``True``, interpret ``distances`` as fractions of each polyline's total length.
+            If ``False``, interpret them as absolute distances from the start of each polyline.
+
+    Returns:
+        RaggedBatch-like object with shape ``(batch, max_num_distances, num_dims)`` and tensor data
+        on the same device as ``points``.
+
+    Raises:
+        AssertionError: If ``points`` or ``distances`` does not have exactly one batch dimension or
+            is not non-uniform along dimension ``1``.
+    """
+    _require(points.num_batch_dims == 1, "points must have exactly one batch dimension")
+    _require(distances.num_batch_dims == 1, "distances must have exactly one batch dimension")
+    _require(
+        points.non_uniform_dim == 1,
+        "points.non_uniform_dim must be 1 for shape (batch, max_num_points, num_dims)",
+    )
+    _require(
+        distances.non_uniform_dim == 1,
+        "distances.non_uniform_dim must be 1 for shape (batch, max_num_distances)",
+    )
+
+    result = _polyline_sampling._polyline_interpolation_var_size_batch(
+        points.tensor,
+        distances.tensor,
+        points.sample_sizes,
+        distances.sample_sizes,
+        relative=relative,
+    )
+    # The output has one sample per requested distance, so it reuses the sample sizes of ``distances``.
+    return distances.create_with_sample_sizes_like_self(result)
+
+
+def lengths_var_size_batch(points: RaggedBatch) -> torch.Tensor:
+    """Compute the total length of each polyline in a variable-size batch.
+
+    Args:
+        points: RaggedBatch-like object with tensor data on CPU or CUDA and shape
+            ``(batch, max_num_points, num_dims)``.
+
+    Returns:
+        Tensor with shape ``(batch,)`` on the same device as ``points``.
+
+    Raises:
+        AssertionError: If ``points`` does not have exactly one batch dimension or is not
+            non-uniform along dimension ``1``.
+    """
+    _require(points.num_batch_dims == 1, "points must have exactly one batch dimension")
+    _require(
+        points.non_uniform_dim == 1,
+        "points.non_uniform_dim must be 1 for shape (batch, max_num_points, num_dims)",
+    )
+    return _polyline_sampling._polyline_lengths_var_size_batch(points.tensor, points.sample_sizes)
diff --git a/packages/lane_helpers/docs/_on_doc_generation.py b/packages/lane_helpers/docs/_on_doc_generation.py
new file mode 100644
index 0000000..1d6c43d
--- /dev/null
+++ b/packages/lane_helpers/docs/_on_doc_generation.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+import sys
+from typing import Any
+
+# Committed benchmark results (Markdown tables), relative to the package root.
+_RESULTS_SUBDIR = Path("evaluation_results") / "polyline_runtime_evaluation"
+# Sub-directory of the generated-docs directory that receives the rendered plots.
+_GENERATED_IMAGE_SUBDIR = Path("polyline_runtime_evaluation")
+# Batch sizes for which inputs are required and plots are generated.
+_DOC_BATCH_SIZES = [1, 64]
+# One ``batch_<size>_<metric>.md`` input file is required per batch size and metric.
+_DOC_REQUIRED_MARKDOWN_METRICS = (
+    "runtime_shapely",
+    "runtime_cpu",
+    "runtime_cuda",
+    "speedup_cpu_vs_shapely",
+    "speedup_cuda_vs_shapely",
+    "speedup_cuda_vs_cpu",
+)
+# Images that must exist after generation (referenced by ``introduction.rst``).
+_DOC_REQUIRED_IMAGE_NAMES = tuple(
+    f"batch_{batch_size}_{plot_kind}_comparison.png"
+    for batch_size in _DOC_BATCH_SIZES
+    for plot_kind in ("runtime", "speedup")
+)
+
+
+def _required_markdown_paths(input_dir: Path) -> list[Path]:
+    """Return the Markdown input files expected in ``input_dir``, one per batch size and metric."""
+    return [
+        input_dir / f"batch_{batch_size}_{metric_name}.md"
+        for batch_size in _DOC_BATCH_SIZES
+        for metric_name in _DOC_REQUIRED_MARKDOWN_METRICS
+    ]
+
+
+def _validate_required_markdown_inputs(input_dir: Path) -> None:
+    """Check that ``input_dir`` is a directory containing every required Markdown input file.
+
+    Raises:
+        FileNotFoundError: If ``input_dir`` is not an existing directory or a required file is missing.
+    """
+    # ``is_dir`` (rather than ``exists``) also rejects a regular file sitting at the directory path.
+    if not input_dir.is_dir():
+        raise FileNotFoundError(
+            "Required committed Markdown input directory is missing for lane_helpers docs asset generation: "
+            f"{input_dir}."
+        )
+
+    missing_inputs = [path for path in _required_markdown_paths(input_dir) if not path.exists()]
+    if missing_inputs:
+        missing_list = "\n".join(f"  - {path}" for path in missing_inputs)
+        raise FileNotFoundError(
+            "Missing required committed Markdown input file(s) for lane_helpers docs asset generation:\n"
+            f"{missing_list}"
+        )
+
+
+def _validate_required_images(output_dir: Path) -> None:
+    """Check that every expected plot image exists in ``output_dir``.
+
+    Raises:
+        FileNotFoundError: If at least one expected image is missing.
+    """
+    missing_outputs = [
+        output_dir / image_name
+        for image_name in _DOC_REQUIRED_IMAGE_NAMES
+        if not (output_dir / image_name).exists()
+    ]
+    if missing_outputs:
+        missing_list = "\n".join(f"  - {path}" for path in missing_outputs)
+        raise FileNotFoundError(
+            "Polyline runtime docs asset generation did not produce all images referenced by introduction.rst:\n"
+            f"{missing_list}"
+        )
+
+
+def generate_docs_assets(context: Any) -> None:
+    """Render the polyline runtime evaluation plots used by the package documentation.
+
+    Args:
+        context: Object providing ``package_root`` and ``generated_dir`` path attributes.
+
+    Raises:
+        FileNotFoundError: If required Markdown inputs are missing or expected images were not produced.
+    """
+    input_dir = context.package_root / _RESULTS_SUBDIR
+    output_dir = context.generated_dir / _GENERATED_IMAGE_SUBDIR
+
+    _validate_required_markdown_inputs(input_dir)
+
+    # The plotting script is imported by bare module name from the package's ``evaluation`` directory,
+    # so that directory is put on ``sys.path`` for the import and the plotting call. It is removed
+    # again afterwards because ``sys.path`` is process-global and the entry would otherwise leak into
+    # whatever runs next in the same interpreter.
+    evaluation_dir = str(context.package_root / "evaluation")
+    sys.path.insert(0, evaluation_dir)
+    try:
+        import plot_shapely_evaluation
+
+        plot_shapely_evaluation.plot_from_markdown_directory(
+            input_dir=input_dir,
+            output_dir=output_dir,
+            batch_sizes=_DOC_BATCH_SIZES,
+            annotate_plots=True,
+        )
+    finally:
+        try:
+            sys.path.remove(evaluation_dir)
+        except ValueError:
+            # The entry is already gone; nothing left to restore.
+            pass
+    _validate_required_images(output_dir)
diff --git a/packages/lane_helpers/docs/api.rst b/packages/lane_helpers/docs/api.rst
new file mode 100644
index 0000000..072f978
--- /dev/null
+++ b/packages/lane_helpers/docs/api.rst
@@ -0,0 +1,9 @@
+API Reference
+=============
+
+.. automodule:: accvlab.lane_helpers
+
+polyline
+--------
+
+.. automodule:: accvlab.lane_helpers.polyline
diff --git a/packages/lane_helpers/docs/example.rst b/packages/lane_helpers/docs/example.rst
new file mode 100644
index 0000000..4f74766
--- /dev/null
+++ b/packages/lane_helpers/docs/example.rst
@@ -0,0 +1,16 @@
+Example
+=======
+
+Polyline Interpolation
+----------------------
+
+The example below samples a rectangle-shaped polyline at a handful of distances.
+
+.. important::
+
+   You can run the example using the script ``packages/lane_helpers/examples/basic_usage.py``.
+
+.. note-literalinclude:: ../examples/basic_usage.py
+   :language: python
+   :caption: packages/lane_helpers/examples/basic_usage.py
+   :linenos:
diff --git a/packages/lane_helpers/docs/images/polyline_sampling_illustration.png b/packages/lane_helpers/docs/images/polyline_sampling_illustration.png
new file mode 100644
index 0000000000000000000000000000000000000000..1caaf7819fe08ee86754fc0588ce8b08b3b31d6c
GIT binary patch
literal 31019
zcmYg&bzD>b_x~L!?LZ_29i_A&oytHYB}PaprF3@;i_wloQYGY~bPNy}DqSj$j!8-*
z-TTeY=l%YCe}8bd#~$~ddtT>t>h(P55Pe_oE*&*HH2?r~_cY;#06<3k3K_dVLHuC7
zbngrCgUVCW+y?*@ga7_PMp}YD5`Sd#RX6iB@^JJGc<Ai_1Ox<#K5=*Pv3uz0AnM`m
zl(nJA4ggnyd+^)Ff!S+Q0oj&_qr*Rgl9CfLS#{UqFWq6h@LcsmGb{Yn3`^TnTgV@p
zS<}-;ZBJunO)Fzj%$J0(Usi>;KfSa}Vt9v&H}d)OOFv}7PHL78I^~ilg)NFE$1<nF
zrprfKHYPJe29>hb1_pvQ<OX@bn42A}c5%eZ&<uQj4vWBgfyb7pb!A8m>E?ahm4~$o
z-_*6HApg0Acu`A4SyG$=7~#IQ)&Oa7x<FMpyPA6lwhBQ0LntGh50*}9N}67};`gw=
z9CV2sioAh6gGorNxzNHNX#;;>_Ltl)3sjEbP-IIIaps9G_v73GwrD*;u#R~rcwvA9
z_=hdK;}@uG>Pad^itM*wNjPTlBr*$WVZ1l_$fi8o9KimUc{`o&(OtZF^Q&}y2;AOT
z;5tVnO=PU`UdMT(dqd^_@%EMxt%Zl;ujBi{U>hzsV^S3gjgl1wAH{llvkN+82?*Sh
zPxHC0?L8p=A1*IVq+-8gbsOl<)$O06LaEJYLoas8m9~O)$38-l7VrY)5oroi8sHy3
z;%=}*<!CXqqZFZ{y*|^tFdDc$PX(te<9ne3Zt(b`C<`_cn@;n@CBKUAe@l$H1AmJh
z!dl`VA<oRxmUKzz0cnU>sq-DU^=G<iZUfv=(7u!K7WI9)J_B6(&J(BnDz^W7cmXzv
zCotpF$8rn6@{*M>@c1pr|9K@2bSEjJ5cE7EdwJ>q*rUop-2!ZOnrzk+lOoKIMJph(
z>hNjnm4RJ>|GVkNOYO)oBs2CS-$t)HI6jcuL@<r+dd0@Vkm>RBUx5=Yq90-a06T(^
zGVeL}oeR!q%b@!7@oU(!%w<6QpWS1si;et_=_AyqV8qJbffHlN$@p!<{X&)hZ|5GL
zrW|XZZfqLYR;QdTx9gwoUx&(-6(}tW{C{+cB#~#Qb2h0}Jk16;Q>4JzMzPjd`uQUW
z@ULb*Z2abtN8CT{@#L1NwR#U`^mo4Xdny(GxB2%`U7GmT(+zIzv2@`n5?&-V{tDlQ
z_V@pf7qbBh@N`Z=dkpRR-kE6&GZPkY<KzdoFkt(S*76G%pRBVZ!YT`tx~H4NPc)!x
z7C&+?Ws?6-tuN`o1(C^T8!ANpIH%_74e;xQYnqk+-%D5%NZC*4D7D7WoJy3g`-wAV
zJqeex{*S`*Up3e{0j=R!&Jt(#z`o4^F$bF%k;@u9Fc_BA_UX*<QJs;hPzAFYgX=Z{
z^Iu74+}4C^K$yJdo<78^t=!1JGD`n}RQkO$XrFVdJ&6kVR0OVSjVDhkJu4*<*MmEP
zD^-R8?^(C$0hA`4C#^VxY=)ANQt%KzOdIJG*=bzMH?zv|U(sp}mY?KOa+3E^RFxD?
z-+*aHE=Ar&__;uv9v???b91c(5R7_S=9e?Eb4N&W0D#*oib{Bs+-ELL>S+vnYX{zi
zYnTrARA@5$q2C*$EUaG*r|>i8)bLHz7TVnnc9}@~`93g<3u1X4rcV-KfJ3|qH~nW(
zZ@2fR2g#<%e%=*|#z}Rsm)_N27jkhh`|vtA+VC38fJ|3F3LP_)>pGH_u((@S*I#0g
zBO*^JYXi^ijHkJrzMCcSA|qb=Ev0CM5iN5#M~}|F=u-%CE;5hh7dv12o=P^+8kR@}
zqAzkN-;k~h5B1f;iL>!M)r7xyi@PtwucaP;@#Aw}M#l9AxH>+>%y`Olak#5JyQH0O
zT~U3Vt5*4RCBON{dkHLP0i;doO4{47;(u6w8m-CfN$*K<|GW4#Y4_3RzkjYkCG?)7
zQ1>MT?@TE}1)X`mf|Ho5V<o9>7DIj?vy9^X{&fBPe$uH~vpSI0zbvI-A|PaxBX0`7
zOxk)MCslYRs>?-Brx;%Htpm<WLVM8k#VcT;`0z{)woPvfd6kK1QA0@0td=+@qdt5Q
zv5}LV@$bp8-|^sub&_)5VU_lN;JSi}{!aQ9VQ7K7`0(?u)HR>`<UGagC=<;@w*sRI
zJHMnNKB{36)KH6(y5PN~GQSds2@_tE$1&1)88BEsG5#Oh*V2H~Qz%o!u@QylHJ|eP
zgIWBoug=f^L?I;n{rTP%ccKsnilX0ctvyIDSYF^=&08)AQlr*IyH2NASMH@ARvBm0
z<HfLUX#%$Ri)ij&;nD!TF|LrjuAY0#8DBycLUDrM)+Y0KJW}Fp5jhgd8u+YLPwg7y
zg*VOxP$l|Gb?g(|ySHCD2V(k5-={+9PC7EewWQ|gKf<?5c=9Wb3dwpm-JDK(c)RJ{
z+=JzgTwRQug+xzg{lM|enR@&G+A>uZDx3UMeU;uy=RjGu1@5Yy%iX(U1YO)(h%X=7
zZ1CCnw5v^~{ZXs@=qFx1!Lln8CIy!_LPZ+r%}6ZYfeN2bm36$xC<dt}Z50h{L!gs^
zwS!k*4XUAqeX*i284{M`sWXZ>!#yriAl!MdI5LRmAaPyBxpg$V=U8(WE9sIEs>3{g
zaB?QkksDh*FXmRWsku5(Tf%2yttU*LMlw^w8~I;lN=f5Jg<X{W`0$Zy=p3XnsM5J4
z`_CeKQFF;_&6@JdRn>k$N$T~rKdNr1_b%?1W?gQ#`F3eFvK(9~@+^B4qYi(K)Ua&;
z2d?`?Y+r+!0yYOt9tOCGv?`VMZ*Bng?)@@=eryTfGe*Y*lf0{2YP&)1XUs6Tloqi$
z`?2TqOnp_5S9aaUWho+~Dy2Qtc<mQ@b3_+!BoAY(+%E56mGnIFo=FD;5?`C#bepgF
z{pL%D`ob2?Dx0)&0XjlezC`%khr@+ywV>KOYgmON0>(U|i)Ascy|GL4`u-s`km)s{
z1|D}9?hDw!wz=?^TTkvG&Tij0F`_M}0S;}p^+`JuvW$Yv&PKax!V<wp10gG~wjw$Y
z?guV=xP3ZSrCguzooaMa$h%|yJob<;Hqe3Sx~^^c<t1WMpR!ZN0V|PuwpoM+7ZlyP
zbK-6~+FnE-KsFy|8{7>0`tx`8cS#}H*|*2mBZpBB_ul0QH#VmRY6nj=7SARYC4J~1
zhq%7<WMf31MJ|LNI3kqFQ_i~6QGJwL+f6eDxWUv41@-S%%*c93XF?|8W+~A@jL*-N
z%Fbsuek(dX(6k+%aE?aM^q!9?drfBde32H76WUV7kEpja-t5RwjPAOHSG3%>DPb;Q
zgT~XBKLxeWAQ`U$V#30xWcUh5NxBr%<$@IfkL~X%c*NZ-x(oBB-AaHa+a+u0L=Uh@
zmOkZCHp@GBK(N%2dlO?;yLGzxKtngf)NgmZSl$Q>)q%Gtu`*TDkuZ@Yj--BIc`?Br
zs90v-eEn@R383ulvr$3yy^@aU9c6+-P12at!56%WZO0QS-wi_(mGR1%_h*kL>OSh^
zUW19fU#UJDp&^Pk42qVYeEp3{=#96tdV9^y@4uh3vyQFj#hb-poL5b7bft{ZjhAtQ
zF<pju0kE;o=0A*^hMSD=;dlxj{)`*_^`-c0#zEvc+o_X|;@hM^Z4vGBAJTPy@?*?(
zW0|P4=_fPYIADehoXB2QoAvrV%yX~Sy-r%vgm$_|rO1A4HP|Ts)8nxx4FyoQA833;
zlUB$*2?1302oaAEhLVfO7@n05_`)}z)$QJGoBU^kpu<Qb@A9pvTIs?%-@4!Jux+nW
zT4GdRxDD?hwYiuf+x+$sA?)3EQoH+d*DlY!ilHv*d_<cFf1e0{Z0cYt-|y1R8%tc(
zLihZx7rLuPcX_w8moYj<59blvb%5B*G>otzTC0E4{pSzo98I|p2<~XE@P|s@d%`~n
z{az0+lv1$Kexo12^=&$3da_Dx&l-A^2)g%HMay9X?W{hspqC=WNS#OwY+1x5e2}P5
zKqv<W=XhoJ1}iXXUS)mBT__)8fqy0eT;!Z?g=XAzG-+ScBuDtn*qH9tDk-6_bMChU
zKSD505e@l^=NRwc^?Oc4N4M5KvT)|lr=$E9Tp2C-{yUqk(rlrSjxMz9P2`Y^bB4J=
z+vwvIc_=#%w5a`2UwPRg&zF~fBECfV*dq}3p}Z1jzk7O;@1o+!MOo0v+Govdc5x7>
z+y&V(FUR&>qT}yVE>0Ua;KK<0?k9d*`2~HF&8DF9Tz;~u+VvxaD5X1=ZcU;C7z8Ad
zJCSHmzDI>%6|u&Ncf?<YmWZAzu>I5^<6;=Cif{3KSUq!vm`YsIru9$X4|KwLvYAY<
zW=b4k1T}P)MU_+uC2n!zXDQBUQ#j?i@Vws{Y5gUV>hpSt_3KYMiDrF)2@S<A;9&80
zO-yplIHGAoLUieNN~;N$w*(n~oi@rRlz_9QA|ifKrilKCo9IJ*Bf-6g(6ItAHum5O
zF}QVq9{a9U!YX|;82dYuSx5F$RY4BXbgQSWdv{u&JYlW0aoyfGO%{mhavMvbFvj^|
z5dh4aT^!eH3^|1cMtFw40WFY&C{0EdSs+m6BIPvqxJ)V^G?OIsf{fZV*9b#gv6{2c
z;qCl}&UX|q`#$X4X4~zK+fF3wh|e|gQmwgTPqe(kwITPD9OM3;C<V5gBL1qqi^fdu
zX`ihMH_~r*(5fq&osM0P%ds%<(h1|0SbMg&*9X4uDs5(~WX*GbjBi|Aj8Vg9kT375
zH5rRBp?i(}u9kD(>d1g-!}&?060$E(s4y1JB8PhIEXDt9X_=e~X;m39j8;lCJV>nv
zWEMwtMmVCmHy0LXO(PkAlf$ze_4xoW*dt*?_@^13PIZ(3&B6Mpnn94ZgmE?JYU7u1
z0AKJp)%a&z!kWTEYd+i;ZJBm~>n0+9{n5ff-KLS0<x9VfpGRhYk84^Bym;hTYO2o)
z587bq=kTJ^;UMdkXiu*&rJ0G^Qg1zng`0_f{>u5DqFZy~@~&|=`JDu=byZZN&oqdk
zX+FF#hxt<$wkEmM{P1MB@%Yd8u-7BSdKb}I#!?8?QYCt0M6DJ4oHkT;YquCl48F?u
z&+p|U5anX|Hf`aG<k_qm7V4Kdh5shlo?p-Ez=G!MNhs7$AWCM9W5V=#SAkv6>Pfb*
zFY|?4I&teXeHZIZh=(2>mA^;FDW13AU*1G_{hR|q*&@O0Ru;Q=2Ga=ByYZS|&Y78w
zB9Fp*-Vwd}c|hpd=<kfjg+JFqbe<HS2S}Y)XgxxFuK4r48|!K7j}{xNHC_)@@gLWP
z)3c!KW4c%(Sv-$O6GXjRdkn;ZSHk9EpV3qR{B$YrHXjKm+1C{f^$t^pL~+paZK$=I
z-Sd~xH=Q*!B+XfuQfkzQ5%^XPaBBe7?r0j@wJh~(0Iv9{Dyh4YHTms#^PBaeJ0IUz
z6J@WH2e&8rW;VilMF9Hn^v#0QezXa-9X*&I9B9w^^!};W?UCSp9JWAoFo!;y_5)?O
z@%(v8H9iG<$I&6>8J_m#_A%duNE+aRp`4CJVy?k2YOa*bj?9~a&aBXnDb$wO&%}2S
z<Ux%BXH3^9QAVtnsRBwefMVRwi_gG6m7`|1xu{D#ztBVx<z6AXuXKk!N<Sv;1I)~W
z-O1!)w4s5dpnUdhBF%VXkb9u#T|{dcn0Ft?;l(%q62#{>eA>9aBsxmCZTeD~WX%F0
zP>a6K%u|%I<n$43WQV$n`@%zqZ7JMHB$^2y%#gf~jXOSp<<%F#SSDH(@yqYDo)fn&
z(3aKH)<Be)1QrRL#9Xa(esbMUy&^X*d*v80boEQn+0<bL-R5G0vJeBUL0jauYO70b
zX8)^g2MA><tN5F$Yx1=D;jTdxz{<YY8OX{1<v|v!&FluNUEh64s`u<{(Qze-@C-(w
zfvC?J7bs1Pza{B8Nr>uGe&Ed~Vjg*aQoj`FJ=a{^31}#}!UTylbF*i?&jaHzt~K*p
zD)t|nd6~5`;{l&1!xyW%b?F8Grll+8YR(?w<bVXXFGaA_Gxv7nLuQ2NmxqW<d%~um
zQL5_mC#vsIr<XO;Ym5q+ubs*e(VFFMO+51Yt%L-4OrxQz&MhsSf(Llk7dK!(Hl@<z
ziLO#XmNE4^faB@$I94J0o%bJNFMiZ<lK3tSv{M7V{Vjw5#QG*#fDkfkIb8bAwQ;I)
zva4%WkG%3J0g32@U_q!{emj!=Eh8xkk-8)nTx5gJ`05$PHGmZpdl+|<RGm+?P}4rJ
z%Z1S7d@&tY@_=`|O#YjSDp5?TOz~eWv+FH8$?-`LjkigeMOK<7<G-*28`fK?ozq-_
zg7l&9kS+`A2pM2zWpj1$T4256q)ywJ7?UeojU><`@n^>R`SvE!&S)X+;X-xw!FF*s
zDj{1M!oS+zdR#)+mpHFluQ&15$W#P8X-X_v5qe`4wDp8qk?V^8+&N+1*mAb>7gh_;
z4L0)4uNqe^5yC7o+|pw&e0xKs3n`d=FuUO^Hdd1VO3&GGoS-Tu@^?Y^IIotyp3W5A
zSSif`uYW6f{Yd|cH8B-B<M~K+>2zsRB6aQlty%>+i)%wQi4^A|uwr|`K~$F|UKH<k
zo?(I~1&^yq$|Ntj(kjwR0asy5ka7|h(o$q6G81bEJ<3G=d=c1yH;kpbR%apoNYzL1
zSGbqFe^IXl)OJbqAkrumhxNusXpdjBi(`9G1TihZZ+%?e4J-_lLDdx)7UWCP4MOVY
zl%KQN#i%W}^kuR&Dkp2LQenGu6nZ6rS9*fcCVZ)*S9sk;^Bdkp4w4XnAF#J;-PN^R
z80N!Bu*C!`^)luy833P`z&lUU#<`G+O$otTJIF~FQC;02d&kMxD0Z&)T8*jyBkaq8
z#h<jO?~QW0bA5&Z%X<%`n>fy+FHg~b0zs4TFRXUQ(Wo_&2X&?H?5TQ*Hd#6nB^Ocw
zLu{3OlGU;=lW5kK&we;Dmp!`+H|Y#m=6vDnVgw3TKdX1Y_YV9z@uMHpM3|s??4AQ!
zUxjf)7D?h;nrp#E*z*l~zBWMBj+O41`hqEh!JqAvG#~5@MdC*=SKIF1!K{77&V_$G
z{AL5gjt5rPIlrDyz@6p%lu%GGDZa0+;9nE~S=E~2CdyFDxb{k*mz_j|b>xfa*?C*5
zT+MH3dP6CC<(*ThHH=6iAlf<%Ch66=XdfleShmh8_zAcJHe1Hz2sZ@oE@F=!5jILe
z2%^$UXDD?&L16wkd-RIAcl47qn8EiMe!W(M0b5Ce<KI76)PFL}jozyl)qb`1&8k~3
zo>cq4EIiG1H(GyeGIjd+F6GA;5!1mrO$zdww?m0cQ3ckK6{ZnWn)enY-ztr={6FZS
z*RHoZ02b+d-!FC7xs_ZQ#pDw)D#?(iz!0^YKHL$~c{J4-+~7_UNR1+Sm_na>F;MZk
zhvHq&R@st$x8?XN6GJH$HRWLzudlcy)p(5b)g-ZKkH3N3f!`5py#~W&lHKUYxV8kf
z?cBEa_R11GsEM**TY3?gZOTM7Tx6&{Cweq9qt;A>c{5x;kk-|^-E;d{Q}x6kXo^!u
zAXPVZ&Ua16V_VLy*Uj?&l!L-gdlHFcb^r7yeIp8I(&{q-vyv`*Gh?d;V^jmM>-Bl%
z_u<C1<YpKbEANkYg4dH5QhkHgpX%Kotrh*lUSWu!TriFkKl0P*hk0vt=@KAjFqn~=
z`v|MiqH!%L#?3{rHq1i4c@dpJpBo#sbMVMQzMp%1{I{{2Ia>1rz-5G6cyr60m^-+H
z%b(4Oa2RD7QOR3}R8{}98t?63VdR}iU5sCrQl;6tQ*y!!V~OdagKb;j*$KPmux%N_
zslN7DYj_$J@D+KFyv{FW#@)!2?Y&IqpI^ezC)c+ct55wYD~hmnJ%;=Zooe0w>uaOA
z)`W4R9A9ac`aYvNLoWy1mmJ@^f_i`Ao@2ihZWuK%q4gmnBsTYPCwJV>LoIKl+=bg~
zbh&ZZ^~{7}dst@wEImL3O#T?}>9K!U-Qiz2WH~e0H{dIObVu<V-V>JJB$qPHPRvm|
z)xgKvM$Wil2P*#-6aMOF%qEjFw*|}A`#F-cJlkfKi?5@)IHBfOVKm4RE9>yCR)1@V
zQV8`S&lU|{1PjHKtj=r?&RDBXkm`Eg>)0ikkRSD^cl2`Qf9T2UY28A<HNo<^OUxia
z0)iAk7JV!)N-LV{*^Xc~)RR;!<yz*QPIBx7fA(UY-)Bf^)L+nnE62wwT9<$`R83)s
zsTU!3+W2X48tWONmC1&ZorAotVai+)Z{>2_Tk9G!1r&Cg)M}(tXG5C&D{rQ1HD_xo
zHmLatY^n905?w;!JFuK2wq8tMbmUfdA%Qla+q6Zx<)YKbIIqG;VfvGpOP)?#{fdz^
zj!rGdjn$I^2&)g^!O?u=AX&@#TwZJIHy7HgZS&1OG_FF)n&MduY;8B9=eC>kQE{Xi
zhf_&|qBqi5F${h%@4KCKKM6H%u`PASGdaop+(zvcRD~8BQ!PsULI-t2-#_T~a~NhJ
zBS2@DZh7NH_`Id%QLN4T71J2MD~PG9uwsb*?21QjKFDz*KgcR6`<@8ASx$k+g#xpG
zPL@nexO{wET;@K*pGvwovvRMWY-;$(_bXkwqBuNu&q;o&@_J_IBW=N2X3RYV%Oye+
z5l<P4QsGnB!&|Kym1{mX=dg*B+<4WK<j2o!dYModZ(LkQ#G}l}70W<#zQ;y57SDhM
z-9Jk4Lj7iReZBS2yk6ntLE)^D<+jYLFH)}3o@XoCs46&!n9c0iV8MQEMzJEs*MS={
zA_cR2ulMq|KKFm*Rk$^^Yj_%DsWhOVHI%t60NaV=@G|JvjtNt0l|GolJTb11(L5w?
zi@)ihjj*98`)zDR@O4^x=VDZnZXMT|Y`Q+#UEANUIQhM8+&E^@mz`YRcq~P$@+#L7
z@thUvx1`6l{QFyxv2(xCpF>g$!xCML0wIl<@Z0quK~b{HuI6cGX)8a|SUlX`**Ptz
zT@Y(*j!$rVQuisz^8JnR$eGj+l0|owZN3eK*T2fwM&P}qt8YE+G~Np|o=Kor41bIC
z?-}2S%*rF*inWglckBHz|5Wq29m`o|yxQPL(&ZVw=F<cJI$`aKFvQA>ft>NUrcx`e
zoy4BFJkXoQJ^E|I*oPM@Me&a>cT4TR1rMUTOe4zyQxcoLy~d7=NNt2^OxKd(7CN0V
zdbh{kdx4na4=K$4i7w}5^i#@7e|os@8~kO(*!if=*jYf-XR{VO)*|dp(=sd5?8zWG
zdMeB6QP~XnV5^9qP26&am{j{(d~CA5eumIa-ifZu{BOcX0mjvt$hOx6->Us?z1K2@
z=_lrG^(8Bd;p@-4+rMC=vBprgkKovcUz>umIpSH|7@~gMao2I|p)W(=+MBuLkn4L4
z1)x}T*Lu@1-(U$FTad%LQH~_Y4&w}T{LQ=&T(kX)p6DX<CWd#3V7q`d`FSDuOq#^#
zLd4RU^dx&`J-7mL4k$2SE*wa{x`fuS%q(|7_!*>G9rAn1Zv{b*>cQ#cxq90;HmtE9
z<rW<@bLvg-`E1}PQnOok3LXaDl7IW4pz%(DVBzcE_uV`YZf;tIb>N9wGqb7r{B49t
zVqdib91g?U+g_@Gj*jllbhmoQlsX3$E{igSQ!l$=g%)HF56;OZ1z^d>ds&4WaW`*+
zjplIo$1|@ReJ8ZcuYaCgikRq`B$VBSh2?R-)Qx2nZu}TOkhVMAuRR;<=`iMM;u#de
z8en%HDFZ}wzl+Dmce@T3P<GY?;mURk2PCU6Q|x6FHC^a-!=4l2(P$GBlX6UXn4#bs
z&eMePw}WJ6;U1LQgfWdy(Wy@lAD*(=%DFo?El)3CE=;VC>E-Nkvl1Y26w}3^{_#>2
z#0JP^M(e*{(LZ+9MkK?$AzQ**9MDV`2ljZj;LvYdKCmh=&!6<TAO+$X3<H<XY~>We
z+$PXq;=08qu!KFh{jH7Wv(QP(d^*fkk7m>)NvH+F^ayCPA!cTYsbK4Z7M$!*N;K%F
z12Ov0d$NjF{$wTi#e<i#$1D&N@Od|E1iauLBEtG{R>$*u2yQ4kbHbFDAY^g_cQG$j
zuh#RB+)q7^s$wHkmL_6G>#bC?%UN}Fr&K#Xp7NQL<jK`-*>4FJUkHxdGys|84<{zz
z0#~w!^vOG<4pm#oj;oz&pcSBs7=mqE;e}T7VtWteQo9ih0c#_5eEn~ha0ccq=VBqC
zQP7_{>0y!I1$I&HW(Sr_Mz0@<Y|$a;iofk%<$+O>m;(J+fojEKl0yY?O4(bo3nqvx
zuYm0+$jC7REMl!x7ZGL-FQaH7I|Tww2WE)*&i(tiTFi~Jh?i5}cOR;{DEB76Bo<UJ
zrQ?EWJkvmBu0Xd-Ti5Upp+~G0V(#GE1>5E6rO3I+v1??n697ljC=#YKpCg?o>uLHQ
zf<(9SQJ+451o~1pyF1=I;%}D=$qMeh`7@SC5byuR^3zB`{<H*m{j`wa>ePO={?%fv
z<$UaLu0(H&IutjkDCy8gDZNQDK%&=M9ZwG(d%^L_)ZR;LD?{YSlF4+L)^aQCzN!1G
z+^3IWhEYv4Uf(iE1s$bzVz?|NVKB<(cWHlozVwoe?|MH~k8pkW@kV<Y-LgYg`d5~B
zA4potz==Ce->%MXx+=z;Y#ls7oLQ$Wg)ts-xr^Lfli2#V7hparMAngZgY=Z%_NCa_
zh-s4)*3g#G^EF7n@;>us$8%jcKS1NT+1$CiJ1qpG{PmQ~4k?f4MTB8y<mCZWS0$zu
zO@#WJ)LakFu(}v?a-J^(y1cL%$N+@C>eJn#$-{+r+@yM}H|3<C?8c2w#yp6=;;A8G
zlu%CXb`Lu+lJqIif<s}1_sK+BxZBX@L!T1ftVm*!`Sstrf-Zdeg~cMJyJDqcqI4Qi
zjpRp66(`!Y#`WH#@Js=fr+R-W)&3DK^Zhjk28Wg{G4Lu;0JaBL<`Myc;rQqTE+5;q
zEwfr$j@k%vNxjmdKS);+iK&SeiSEDgw}zLP)3{k!-&30Tq}%`vq(5v)JzdRgHQEZn
zOHGO)@>W&Oe7{R!npiy6-MWz|#S7VFmTy6o{wjS!e3p9ynQY(2KAWA+K74)z8$s#>
zuBaT#?gvh?_g=pYjK=2qZ7WvXL+DdAtaAhDBn+i1_a4@*PjEv%QJ-oe%4wUz?5Gz!
zRC6dlH9KQ&#VNgiWbF`U|FeSXW>MaQ{^t#;L1HZKV=0f0ny-9_;MWr@`LkZm&*Em2
zYWd<M2cw19X)blqQiVH5>Z&Sj$sx}4xZF<Z4|dWhKx3(~-pUbDmbr~-nfJ~X2_vtO
zO%Ors-70fxrB{3Gz-ZX3uB=EJUakK3%gjSj`lx}pukSxh2=sjA5gKzcA*W-qYradk
z^jfCqaqx|{MshFzkJ@i<Fg8gSnYdaI!>QH}$RzdTLzj@?;RW}QVU>KCw?s%$wGIM@
zYv#;4;_HzoStO+${FTOH34$s;i;BcNZ3SKcdHgxdlPy;lK)~H8ren#0eZ~tKt=HB{
zc41W)4`eok+S&7CyJWn2kLe9@sqL7TH(xNKc|2t<aA@eo>t0co3%32B<y;zdH$%KU
zsWQq~;*nOICbv-iL|ws?V7azqr&(8vOx2+Ri>x1|#DM%M#dU<9csQ2vH+o{b9po}i
zDr|xq3(Ah5^30jZAtqJ9a{FS~iAZuV*q&>o@lt(?+(tR`xJ0>k3IYK91rnPs%JQ@G
zNQXMNryN&)?@q0aBSRr*_U7LH++^Z2;eiW1_cgp%bHmNz$1M*n^kcu?t)%tF(uNz&
zj-5U8J;s(y*gKXbD%w{^qOc6VDQpR@#Z)m}GM@L&ftBFcIP#+J_R4XNQXFdVY0A^f
z_0rxk{j=g787)UwPn58Zh$~qOasso0^=80TRruvO<q6J9-c9d3Z@<3aSp_PyI-W$X
zu0?4hPs0!**Mr{NZ0CjTVH>a|u&*!m6V@$Ml;sOSYj*Wlj|=asNNY`HTwwNtK0<_B
zek-f-&>yP0slFsCBdY1=Y|%Ts`#MJ`S!m8}JfUfoK!dP)@Y&=0B~AD_w<wBK^k9@m
zE_Vi%wQfnr$=w$_)L$7J`3yM-uHb@prI_%|x`>&a)gw~fVF+?c0JWB=z2eDx@gtbt
zI8H3gJHNcE(dqmcg;00q(<;31HpV#yPBgh_f47a%LJNzmB;6c^S~gRPl_2~e*vPxP
z>3We7?L0~OEP6gL-3q?(HFeF!6gRG%aK?>y3O^u2^b3Bev5~0aG;Lg&1CQriPlyEw
zwbXKQ0>|V`I~r`1jP-eov0a@I2Jon^?^q@*^TYI?Oz3J)zLQxD;azr7`eNY6PUJF{
zSNpkZPYy9-pN<qdFflh5!qvuqzbVDT6JN>BcDXd^rDhy;G^QO=!dv2d$0BKc{KxmV
z6GTOmsrS%zU_24|;t&@x?nufI8s6PSgW&~<7@E`0<2j8<y5B;GJs~AdNj5X`n%OiB
zkG?<bcY(}o$eTefp}TTU4Ewdd<@+wUgiY7)Jn|Mvl<rZk5~TcIt&>E=q;5}9PJvtI
z(4$?~tFPU-bzQQATa*gX5pHk(NDILP+g8J~<C9_AL({{n#9{)(Qi{dXg+T=fZ7pXq
zmg0jA&ON2*Xux`G3g4L!I9g|F=R6GwfY^`@U`>CD7g-*)4eP7V^W!s!%@E8s$%=uU
z?AAL`w`NHmur+xey~Tc94`-HPI)vKX>!4)>L6|q)mJl8WJ<{T;e%AV&Mm)?69{x=Y
zc`Au%%`;t;KRgiKop_=4aEl?R5BOaN4Oj|b6o(wdS~D2`l31ww)pOGo7`9KY81NMn
zS6bjlG+*uXsR!Og?t$`^iFPw>xeQm*OID=hES<<Ipa%p(nX48RyB!53wB+-d2rIp7
zNApZUBj*s%2c&_{+Ju9uQ&d+AhW8_hYbU~JS@KyW>JxcDv8xZdN%ktLOMY@DH|);!
zvys|$7O#NH77(<duU>aA^Aj#h*cU_WJ!}7zcC#J7b7wu4gC+ZUmK<k0-JvODgmg6$
zcXsi<B;R}|s(dz_Yg?;K0OkR>cP_9ii20C?4T>LL*ixJ9ThR8-k%Pf_yi}6uuuX`)
zI?EH8i!TgSMg^7vaS@rYi|8Xvn+itrLD|ZgG;Y<PT&1X`^IhW`P~p`3Sv69hl-O(J
zcyZ_1`A*~k>D}rB*9t$_HjO34l1@ufmS=SEG(a1z8DeAX_YpNonN1x>#<f*l5<YA(
zCWH{_$k~A0gBQ(Fa0DzU93Bj*s!(%M*N~P*4jy$<o_smBYiQrWuq6xZf2ob5${}2_
zB>nJ*6l-f(mu~ELf5c2C<Z!HQgk+|ppT6bIYJ8;dcpx0U2+WY=GNDhrB$xEQHml?n
zgY}DJ?TK>~1Z~gTFRxuI0CS4C?Td<qJNd$U5d5Rby7t0{v6=lp63W6i>+i;PeNY^b
z>7XT6r+6za9LjAlcu|T*{;tH>ViUb0Ec2k~=$3@z3GJbYsspykR&ES-l^N>{4MM=f
zd0;HDU9J(GUp`G!g=hKbU4xcD(W^)T{=9?h9oYFRJbV+Fj>*dTzMObNkXzV-qHlM#
zl!z!_J-I2GFMlP_Nty2biUy+jVg{9Ed~cS|Ymh5z6F)Pkd5W1iYprlfNUgr_2xQD|
z7ZN16UlWXNAEmfHRc`w%a&#&&8J*Q3&nQcI=*;CtJR)Y~6@_xYx+geJ)@O=KCy7$x
zgVoPIO*@#~_?DJpQ*<lwQz_!WITaKgT!JI4ekViDbDK%aPmaoW&d!}%i?#(Crq$Wz
zCE6BEz2&+`z!s(q220{utee;jm3ZRUud=JJl<KySbMeDxhZ^OJ^&bXU?I?H=ZcOMw
zpdK>Ah;ChM;+<@z6r(S3;f82H7)n-N_0&ItAR~X#PQHH7rP5$PB2zKANKj-2x*GOD
z&_AYY3d4@uY$tXv(h9;PXwwCF%?#$>Sy2#x_bblOBk`@l^A#KJbIL<K^msU#%&Wdw
za9R{r>Pb2<7cA>ay~TqUfo&&PQWn6xFL+)Mr#H1}D(DV8JMN|NVx~)*b(UR9R`h>8
zvu`HQNZH!?(cs+AbBlXRVV~C_^m=0Wf>1K8TWfer88I`|LlCaR4oEyYL~48FTg`5w
zx|FdFm<5wOVFw$nwlsj2matGi(G_w0kl)3zX5?MA?W_DjZ(}9QhzG;&s(;+gXPt!@
zh<%51u(?7p7*JXKRWA=-pas`sLKB$L2eDmkVR@-nY`6nuMOV1@M>y1e&yXOLxB`|A
zzF_+?_U2YO8uU0@_`Hsdz|jF_%$%mHOi-t?7B5H|k6nl|y9RSw`aZzh+Ap7f4aPF(
zIMVLra@NFTH|UDC9wEeb5nBK!2$L<^aO%uINk_{3EZ<(<>_|!0pLG`wzX$I~!-$RG
z&)p;LBJ#w%Y>sotfu3xe&7jva%}vOv;LID(Yn!<iev)x5Wmz3t@6U5|xilx<EOKow
z?LB{H{vjs>mi1D7{+2DdC%<T=tE6WtO7S}EWMOj1{r!q4V#@k?`L|&PaK#YOd>b*i
zlWO6JS5|Chm0h!fc?a9LMR!qQ#>W+qJjQ$Sh_C`q=RRVqCW6}#$4%nn72$J^uU1}q
zlriI&YG4UpL6)CPDFl}kyt?;wWc4@#dUR`p;K=WO7EZUt06l6z&8O&$&@4V9rWO!=
zL^^&tfaCfvS^yfv<jLiJqwQ68p(_Nr9dIFJxL?^!RiXE^PBfN*r%fS5?qhM^ecT>q
z-nODRyy~VSl8G3aPYl1U7M%yY8ytfxmt$>^<?mOBNHrJYj7F5Tf;!OtJ-<2;7zK|>
z@t*h&taiGi*YSi`b!)f<{j>AjLvP!WU&>kY@-$bOFa!H1Cs)f{Z|>$E$yFE3mE`uM
zMt9wf^dKoNU6DcTg(I4&OqXXb;LBTtABak5E39}c{snc$M!aOG277(#OmCkucC32c
z9`=cH4hp6LQd$Wtf!Vk#HC$wa>n`K{fvxe?2UA6s4qBmC)|uFdfu&_*TE+j;`1`zd
zUlI599kFNSM_-F=zT%+gkKT+3%Cy}19Os&_SB({Q)W09l!qBtJzZ2Y|-FQ6Y-+E=3
z=+M*ep+-kq)jQ<Ev&hbIy<Wr!t2WjAdjuWMfl)`WP}msZE?bU)$&mq0W94yu2WTHI
zE@IutG`wi?#OvrmI1Dy-BFpEh(uUwUM1d{e21i6WE(6yiJoYE7<_)H=<8!fdEgSg`
zj+~P=w)SgmtshAe1oN-h=^XtQUAB~NP(SwO+}PNEx>59_bVYvBu!yFLb&LL1F$s%(
zNCg8)=+LGKgemr0aU{P_KzLyJ-O7+-!p-mlImEg&k`ETI%1?mnuM)(tQ-u*=)JWdm
zMC{3mDNE)iPbnWAvA%o%{R+Qj(v|@>PrU$CDO=rX`NV+HLqy=XM}$pH$n)cl6dv6@
zo+_p`w*kRe_4(VDNnjAxu9-8J?E^u1W;JM#jBBtskxYxIXb)*bFI>#VNy<O>mu%28
z+4Jr*oDAvygFQR4gu@+4isvW>9%{`|Bp)nzxu;BO^HPkwV|iEbOSKy3aY75=87Ul}
znl$%+2qE2=k=7erB}i89n(S>IYgwKn=hO@I5Ej+HxiL~z7eB8N@#-5g;BkeG@*b0=
zLzd;U0y!u#5>qP>0A+(d7dkScggu5vcdlKHqfMRpOb!fuIjd?;KxJRhY44XlREr$3
zcxqv1q&vv>kywL6fko=`5WGlaadpY=_YvVp9qUN9X3peHuJEhGf-I8BlWpb;J#gKV
z<&{1F;ANrE*2!xenP1E9HI_5Jl7W_dox)v4OkEBPSN!${oCr~D@HIJdQ}dt<;vq>4
zu~^)1+#|pkNn6T6?wxXnaQAwDB{o%r5$s1?uZ;K=v(YK9Dc0rs(kjJiVxYK$Xr5l;
zYkB~eE?pro&&k6iI54CvTVVkZDjX#pQRC~?NE55?RpEpXXX&U@*24^x<p)rP%!UR1
zz|!{vxor72n~MnD!z!a4<TL6gy{twi;ce3pq!GLTOypC$yZhbI@rj|wrb1@l?;!mk
zQyj;bLbP=QvBx@kjr79MSzYJ6Z^K12wJnqAYUqoIV+v>xm5hV8bhuR`%9V7ZBUQi0
zU)91@=*)jcIDw9Scx2CevLyWEKEA>3Pd90p_|;_1iu$~XzRh-c6$4;DZk4m}hp1=x
z_bUPF1<V)|_-n+T!oxZ-*pOFmz6~&NVQzE5)>aPavDhkm&;|iE4e>_U1Nc0#9Sw<!
zyNs>}^k+A6vpFB>_A!Sym80w>rH1}iYxd~(^|Gattcy0Iy97ohGgj^u!JmNXb`qo|
zu05Iba1|&T8pch5*-Hjg`e#S`kS+Dky>CqRRV<MVTG5759dfvXC)co1p$bbR`mtRE
z`>`2UhvSUZs+WBaPDra{{pCWmCjAv8iEZpOecMfKt*uV8MXx?jt3*8{Ob&cLv!EU0
zMRa7mxs8a3Dec>S0_}Fxcqr<Q75Q~6mY0FEg&@6IvLRdEtaRW`*yL0Z4x3J#FsLW7
z0US$Kh8|k>hTuHNW!y%H2>++7=5+orZMYsi*HxGSw1L)iv%b7!{fEOuj^^b~TKf~y
zfZbGPb2vSzGPNSfWRiMQ=54RUXnTazN6?g{l}_=D-Sa&du8ZI{!Ks7CK9=GFFKC2M
zjp2H~3(KKUxxS^EZjHr+(+}2(!OfkXL9FzvtE#PivrFLzU1jAb?BBWV2aiug&X1@q
z_mHhl^mN(76H6to;dReOJihj$f;u)#*z8`CAkI?31Ud!DR<PwBVzBPuN3%JQ|H(la
zaAOGRF9gesa4yDsmIx&^gqR7gwbXf(FUo|Bi+gg;fan|ATLrE}HMvYiH#G&7qb5aR
zc`WEnAU3l6*V9;r{qOdRi9R0MLlWh5!`ILGFj6sMF<lEV3Aw|!+&X#j7W@^L>eDJa
z6$W99#4KuVxQg%qQC-5zMEjEn#JSi6AXj3sd8h7|WVG`I^057avIXgOPu;}G+$j<m
z(L(CHrPNHy$pjj#E0k?)<PS!WTAwuCUQ0YiGoHaLj5?5`OqdTF9v;HC<%F18ICw^N
z7QE`@NTE@uQlE*8;Tw?ZU}cj3YkGYTI;MxOr#0CES4-tbh)Diz!~|l`(DL(-L`X!(
zRD&W%{qyAj>^T_Bys_cxA${cLAl7N$LvUMJ$cVfi-}LOv2hq$aOP)a^d>JhT+s;QR
zl6`1j5W_%aZzE*<M>0^H#Nv3Tt{=f>a%wI1&c;r1WarM=)Sr#X7c?ZoJ#W?zejT68
zU?reOsi;raVt3rz!^-4J%1O_r@{_%egwOe5`uJ`3HIji{BlYsH<TA>?Y&@`hulj1x
z_WD11rjxWB!`)>Lymh{Dob4|PC(0083#%OO0lg_Uln<5yD7}HJhY&x@#Jg;E6v7Cr
z#3qYiH=EA=M^N`0h8_!B{1gdSbT%LxgDJs#a%R&aFdRl<Y0m4}U@evj9RF^oJ@5$c
zgq6Tc;6sU}e$x<z8+Y3>A*q<YV_ho-LtJ{7QdIvrMme#gU;IdBjBl`sN@i}J6E7Ba
zWsUME{iJ-uTK{OjWmsD>)^mZR>8xRo5cSS$_2|U1M*s%J3*(n7ia!%w|INR~b@@~r
z>Kt7hUdOjJO3aHhawp?01jQTD?fB=43{-~2lvV_kkw2z(u3(<5G}P78$b_8|;a#2z
ztV%BPBgO)Q$WE{Ql*eSMa1d#SlmZ)*gpU}XAIwqo?-=_%sapb9G@wTUo<%5HBiy3v
zpa^yf1E}h&)F*qo5V(zS!(0he!94E!HXgSri45_3$caHNqKVxr>hOU}Xdn2z8qB){
z70U0a^A450kVnWyg{s06cF}IhIkteUcH-UCR=2gxV;AC_iH<&3JDq<b3;J~#QR&}c
z4qoN$8MuEVJM<$2;3-l&UyS;CYMm>!RaFwve|P!wb1V@ODB~Q@N#xK`dDV+o!BbEA
zo^q8MK3`)|l`MO17oc}A!R<6QUF=B}>>*h3w|!KnemfH)p4r^1;=l`^x3>Iw>}#nz
z>4zza&1CT5P@iwYCL*MAJ8PqoY%l+|QsP1^{sk5kyc|CN9Z^|M13^?MZ6I(6bH7Tt
z)*=+j?5=epqc!veB2ulVlr<lCn=-ZwQ`SHc%KWtY%w1qB!fN1zY706k5m`Q)G(MHK
zASGG2UAGr(%wX(SiL#^#HH3#7uJsLq<Gipu@-im$5MnPcbA;GMwsp@EC>rFWf=MtO
zis6+m&r1;d!G(_|(KB(eA0l2T)w;EjGv#*Z1}M-xzK=We8X{9k$b!kzABt`f!-Z)6
zjN0vr%ew<fdoa#S2Jd*j^%pmbY@#)5QGGk4SuaJ!xbZewX+d`od1TWhpYFoX7o8HV
zNE8!j1oJ>Jh>n)-)yd2$6_$v$C_^)^!cL;QD7_FD1U((L2lg{hBB$D3QzG`HEE|aR
zvz(ya+VU!Na|5Hy&RQ>d4O$A(%w?MP#}u}0MA-BrcQ@&siS!9uZsR&~rW}DpAaJhF
z`2Oz2fOjq=A5`$_esK{^pyXV)VZ?J|RD7>Q&M6?dV>LN3(<)(#o=vE1_H5}<?$EFW
zp;4nW0i6gs4FZkvl|V&XCm=90FDe7|_4tG^$!a7q<heT7lw9)V{I2c6B`-vnCNbH%
z`>W_xJ!I*`-n%G5>1mTs7~j?ec1JtXhD2rUbt`so<D_M#nJhi9qRIT3`a7@*J}-<9
zhaM$^$~cCR(-PDvMSb5R5>e>fAF%OSFTXMsEv252A<N(Kwfrq!q*d|zu#)t(uPUAW
zWU^hGdC!IZ?DUeHyefS#`Jk(@g}%?=X>&4+Sr9`}R~|RGAc~oM_qxsE%LHq6eLqHc
zH`CsA9dl6KXS_?jKfuqhHqdg;O>odEHbO)Ir&!aXCqSKJa^JubB#_4Q1u;9br>KA#
zE$5FK?>&JYUEFel$_)={Uf-9p=;j%{2J`4R%k4XmF(<YqXS}6qqE=0G89t>%s^HV;
z?Jc!rAON(4Qu%q-jeJZ==mF3=`1aJG+}MwNYyZypZ-KVgpQHb3%U_`8f+0=>D^<7=
zbt4<x!0H|i4oA$Tah~?*dl0D3LIW%i9?Z}-a>(?2DNhPBG?Ru{UNXwP$3PTHM)q!i
z$$8mA9Q~hr-u^pAyLse`0P*Nn9mz14bV1}IdRhxk$KS>WkcRbyii;cLh(VFnF6P=o
z{n?RB1g|J`iyl9LnQtANJY%R$1cPa|ZtwH52+Kf;MHef}l>{R0$HsoF0DQChI5$O=
zFM$rW{Lw^%@DoDnAMZjYbLLcz<|vwM6P|`=N|?jj01b!kU}Afj+K^yt&h!uj#>kB3
zKps%!Dr>^sFQVsmFpH*<Szf(_4Fg=OLy&b*Vz~u-k{H5jv^ME>!+o5Sv7b-L36X%~
z?+YPRsy5?ILGEVwKWx~WSUxMs#bZW;dVEn&GxMR#_h56br$gtl=!yyz@`wS7U}aJw
ztB=DeLun6FP>LXU5a1JsPZF)0W0ZW_n<`0{o-Ha6-gvgqkj(?@gyd1003(aRx<nt^
zvHP)=6yPCtXd2=Fgq-@TGb}j`cjolt82~cM9@4Y2AaUfwzjJxZcUw%(zGI6cUDONc
z%r2bffn__Lypcx7O&+i+hAlta%kHnw>xorzujUq_x=OO{1^~7X($DDkX;<1l?oj@|
zVX>*zsw6!_kLr4ez4EDr-|vllqzht?<62$%%?uJ?s-J}ToDm!xf8N9s(SU+`2JH1-
zn+s*yGS9RB_IFrGeRu|fJhIx~97theL<`y`MYI&(ezbUWgZyW*O@g*&%1%m$^4G|d
zJFfp}u;imlwH8z-K6pbpFU}*R<OJd##NB{k`8&?21Nlr3+#)IWJ$PDvjZ;T7g`|aZ
zl>4Q{vkOX2&4aXDc9@c<a{5qX+-3Z+<)Bbx9d_<DS0X#BM74-~)1v=h5`X88hE#}T
zPA%bj2pnCTKo}jKWAvo#L;$h(0vlejW~Kk^tF3MRYef=(R8w4N>vshQ3!2*UN4uKW
z!s}rm!W8H5&?Z)?fxGU~8@K<^BVNoeG|}P%@l?>DR@5z&<_i=j%_QBYUhW}kOX5*+
z!`oe4&ax(IKkCPWZQJg*_elYw)3lXx*XW4wEPS1WeFocRfU>d7*1lD3o|*bKQTY$2
zb}<5LY(Vj>#I6nL<j;j#Q=|$+Wkp^M>(VxLlx}^T8l>DlZ8T5$&erewHxeRIFMJf%
z0Swl~7YeA)JK_!SUQjs`gpA*Wrh7+))057|i{q02&`yr3Z~|_XuCO30vA40^SWDXv
z;D9l1@Id_VE<yoW8y;ZHur71FUJAaZuAEu8E=!tU+4;`<Ds0EIEt4F3(Act34JU|I
zaC_RZj*#(Fa9R9s5?G6Ojd=Expe<d|29NVW<#I6VM41K|ak4(oWj&}!c>GXFIHHBu
z&@t`H^H;CW<lQf~$1Cy88Q~~B7cT3wpqbP#j^f17|L;I&5j=(wxQ^`)5AkFu=U%C=
z-&eEgX)|oi;fMWLMTcEPM`N?jHs-=ObA!&F-YHuqf5zexqS*7_-L}&)NVNd)dDt&l
zb(kS{=xt&_Fcx`&*qb)=Eh9@ibQ5tmSHCvhmP^=dDVi-mn5buA4+I>^xLUvc$MO8t
z7`qto0|^lR;8?+nYHNcL9RxGH0gL+el$2x4NZ0my;Ek=o2a#@9e%0Twwcze_Bmv$b
z&uBhvM22VCZ*F$}%ZEx&12qz*L%wm)WHnMWBL<_w2P2y0^XN-fOl;)0T=6^@vOtT-
zRpdm-gC=E75h}&p+cQQ7AwI8WHtyqmxjQMU34;`AES^(30Nx?*_xFDZQ_UMuh(kWD
zaEnn$Jd=lL9Cn6HJ={p;&YxaveOaX*+36D$_N3i1?qDMDawmVIoR|~LkWS}Mll0ll
zKP0rSF9Arfj8sqLj*SircpQUr#nOFVVji8;Y+Ya7P+7YIuv%fI7%1PK|Jfn0`B&|W
zR2@=DvA#k^?0-C=-A)}E8ig}ORkG4!{Xs{v(p#8U3pD>~GPOMsL_4UM8Lo0t3R9K{
zn62(!;Z`hqmH*{tcoR(^h$P~j|GoE87YVSi`HMKpN?w!Pu*1E+cU?gC_aMKMsy?wD
z?!;K?`zrchsxdvpdp)Qs+El`QAnP?JZqfm}&u%~cBtb++Be5;v2f3l+|C&7$7x7pk
zXS2WM-%6YrF6pW+=?~A+1+E6_80C&A@N6C01l1AU_<u<A^Iis6S<mtHUq+3us}29#
z3!qSGEjoO>juK9ylBVD)@7)O|jv_N6tTLTA{^jV-f52%k+uDdwlUz69t%(cbMTvHu
zlw3=eE&A}6qymRiGYi3go$X&CW^9=Qw!l^|a6TU}ljS6b?IcO;-1l0OZMAv>a_nG+
z%~i?su_8g)#J0TuGG<R?Og8)l$Nf#x?AM(~0g$ZIxw8}5eW0~jDsdIC{9PYVx&kUr
z{BPAT8i4j9*oaaZ+nq`8Jj%h+g&)-9CNQBj@YnO14%N#>plG(W%<~AVvZa5<jpJ_6
z6CV;DIb2szx4*-+WJC9fI$Ho{6e7DQk<WOjksGinMn-0qkmdL9Om-2i8F3v3xD(_O
zQV{<xD0}8<?*M^)%M_ZK8Ket`!L|+XlGwpWkNNQy)K3zYqbZM4XZaE7|Lu7cIY5!S
zdAuH_PMqdXM)=(jXE>zvtVd1WfVEOslTA~7(nhGHEaA_-U^vNLkTZ*3{}Uk$nr=Hy
zdkMw}$psR~M2Sc1W7)#;x()~(_JNAp)tNfbtCyyOl;!=3?opk?f8I!;43&c(!to{T
z#9l$eJ)(t50u-DPPUnnQ=`63o-qAeKMvRoM^uC}yZ)TXKAPhbVApG;+2#VJJuGA1m
zj=8<BnLlqpYz>2iLRnG{3F}h<pBxZj1j5MD@UkQ=sgdgaV5Ss7v&WC3HR2yXqS1&K
zspLpywToBLUJG@rd=#xDeM#H+k~r!D>Ic~kdIx0Y!<C$dh3vu2#O87z%)t86q|d|F
z+{8}B{pqE(=HbTlUhdUq)1VR`0^hwP-Y5Gv0nC9s(<s>|Ss1;mkHs{VFy%l`pG{qZ
zND}Y8q=FoA0F&O+Cv_=y!;dsC1~uSc!vjm8%w;%*9@hKU-2QvhEoD+<NM+wC<JJ@W
z8*Hyr<GJ@ltN#CM@6G?AY~Q%?8Dpu0CPLY|M<v-}ER(Tx*PsnaWnUsAiLo1znQ37n
zwBa6Gi=wh6JA>)Al^LY6WSc?8K86Wnn|Utj^Zot@&+GZ&{^|01&E-1J>s*fGeY}tN
zd0e17lyC6h>2noZ_h<`)mZP;^_x?L)3L7E7otGUbFDQ9%P7mC<0(iy&pWQn_tTCMu
z350^IKpi~O1$;;u)(*-NjWUFr8cBwp*=#AN9mFu%4=;jb2?N@jUC_aTd@Wd_kjah=
zQS$rbP~b7AAEz|{_rT!w5=N4UJ=`0p(x3HyRil(;0+mqTMtgDjyYl93*9!~7434)w
zA-N?H{U|*I9uez`MwhfE5o_^EAUg;yndFpqseH3%Z_DJt0ZLCubVwG+ND%S%a}!^L
z?aR<6RaJfVI=k7V<0mL+`ESySC*z^?(@66Y>eJ2tYujl7I<@)pMX)ev^LpSC9*p0-
zp56oS8JpJ&LbCte@_%t$EH9vF3B)J;gNf$Q`7gv?uhKa&(C-;ZyG!id!&N<+JLtFf
zg#_0DHGuPW3v0V(`}@}tbCe+aDq|;L>|`n-Ts`(Yjr@i@r_1Q2MMc*189JdFqZl}X
z^;Kq-t-2?e9pG-@9u8!s{zgap8v0RbRM-fe@C?;BPm1s!UEW#^j#vCwisarvf|Xyw
z;l8teDKTb^yBTVfE0k%1HzXZ@iPC_VFl#iU)Pt2Eb>DJw+V^?6FYIMhkX9b~NG%31
z_>d*Rl;mfFh&CorHGrN#^u%M#8&yd^OxInmqTy2yfv#V?0_}tt$s$rlEEgd7jeY(<
z1lPqY_f^VPBH(jrlVR_6L0u?G_&bDA9ga1!DpwLAiWPV-!Y0t7#vbU0q}(UUfH8vN
zZM2&7^FtThT_RQ}wDpY;sycO6SQElifzuJ|Rm#7I(9!`F0?Gy9MAT<*7bd&7P-Rg0
zGq(>8LNm)~tNUqt@sRGFVj6vqBV^e#M6N2ki{?!4R(hG11<BY+4)D&z;c5A1jc4%?
zRRR1NHJP4BId(HTgm*SJwWS8HmVWUN>6k1O1FDmUKI@=OQ$WXo{qt4Itnmm#kK%e3
ziIkc9))mTrOdJDef;dP+`(1zI4yEgicf(Uno|kZlex!8&yspCfz-w>RhM~~FFf{Ld
zt*PZ0`MFVacz(K0i6XQ}Skv#;6LhVCzy>8Z-JW@RY<~R5`tS-}xKM;AiQT=mSJQ42
z5dzpyOylE|H8n~0eVu{z$Dn~1ttBL8Ca>@dHLI0b^R(Href4tkKMgiE;+f|1RTkdk
z(LLf~F45ivOSg%=a@?oBf~Bs<Tt?q41W~ce-pn!R{B(Eu^{{wi6Gc=t=kH%h;rs&`
z7Zq_sjXK~@v<F0Mfw_3%h7RW@1hi>5p6rHN2zYD8*+-$QJLWlthSDba6{!{-nbh0R
zOrE(K66x^c$7i2lCOe$~Zl9cX`Yt!m?ov>&b44Pyy*~;Y>bY(3qB{2=g^hdDfaj-E
z;HJXNihQ#YI{dJ7+4_Z&9S?}dD1l>R5p74p?-qgPAUrVkq(%IZN5G;AQIdj2f{N_r
zi>WrXwJKVy343mmsC2+8Il?oHH)kUp5MnJ+<U;-jQd8qu)ZX48&SII!BEZ~!X??eQ
zVwB?%ym#A8g7=;u>)hKR>`$f9^@?eugk!o9JX|)7tiaW1N|U?t+d}%BXiXmNm`Ugz
zBW7jAs!vT!L@&NHlkXcee!ajnW52v78gbrQA~GaQk!)wzlSQ$mP72xj&<BVJJ(#Nk
zbdE^GB@tDgo*a^!i@bm$d!0$BllVjfGj1;|sE*CO3(jb12M-PQrL<X%jo?~9rs?K;
zYs?yzVI%*<M00`pm(fsM3^Hj7yCd>v^53+8@)Uc&1^*mKOcd%MnLZoK99@)a>E3SS
z%ZplKO?mJO3$vLPv}~-?vEjNiC|VLR98>|}i;`nKI!i1r%>0NBMy*>$%n>Lugfkd#
z3vKnNUJ@k~I-f&C9E0&Z-r&$n-povR57}Yt#c`3Q^k1j%6CtpXbg4|XEl`4qpFrF*
zvklblYX!Fmb%=<%``e|w3N&kUX8iKX%siWl$~W|CmlywMB%S}LTA91H!kLVlFnV6I
zw#yyEfJ)xz*&xu*hj_?948RhfX?Nfpl=~jGHbV|NFTg0FkaKj`UDe_BB2mwWvqeA{
z;piPVh{Sm15o2iJ_x7x!e>|z9T~8to!?I2kNB+B?&7eiuqO(pHCo{%>X!GkQML~%H
zmm-YX%|M&@!m@EY<>f|owK$31`SEQqQ@4ygh(sQ%t4I|s$IyLk8q3mklo}8ib7c=O
z`y>OukX=B@j#)N}9cj~JryMqyjv~KJ$a+QO?r-Y<iFOn<3X41W$<3^|V9@vQP=Lhr
zOWB3!o$Pl+t~`5$HlY7wM@+7CX>fQfaY8lM_*_@?*nHyY?<=S{AqWm)1c!^n)>CQE
z%o=?Ou5tY6`e)|-helEsxS*{~B^gK?APxyZYZU1HRPuIBuEE}18mZx4e?`l?!a`$x
zF>(JyqKgiPrL7nthBV}ZlrS^vN$ksU$zQ^solA##ur3<bWg|o?3r>59)r7v|v{1*}
z)sfte9NEXw2bKpkO<LzJqZ&uy!Pm$^R2ztvu%e)|NQT(D`w_p4LgIvQdmQl&UWj5x
zDA6gw>~0#UH}X$&&s@&sl8^-4jhS_W$~ccRZ5=8L^rH1+L9lv$rak1qmN#j()qEM=
z*xKHn3=YITiQy%c=4aoz<l=Ippa)Q230T<PFs&%j7h;p4N;tpL5Aa}L)Ve%&3e|TZ
zQ4_1p?k4uC0S@6)u;*u*61=tJn-wWy58Dt=v_GYNX80=O^9*)v;~L{NWv%6i8Z7I~
zJaesta+*~2TCX7MpAVdk6L=y3w)EMJL~P>$!*6G)qxM%WW=q(dvsj_i<Y#AjonwKt
zVg8t-Y_5RPYwjQpd)tp&0vLRh>HB!!-xz3LhyU@8mx+n5YoRlPUDeA(k&saBqHNF8
zQYXJI%Z_&630v5}#+MH2bf=DK3b-W@IjYi<Q3sp!{fs@0!`&W}(%~Et(n>XL;Mq%#
zkx>-CY%G1EcwMdO4I#CxhGTjG%@gDvXj+PtNN>z;@HM|7Ki3>265o=V$~SH`4h9x4
zpKQa5xP{6%Ta2e83Aac?^eak7>06Eln<DTFMH_sbeTON*qrXr$Wt#sIsCf`L{SMl5
z%To58DTu7dou)-?YdU;)E#vTmJ%#HIXr2sPX>4pqSctPliD0kp=f9TVlgtIgi)XmI
z_tz&)OUPQP-MPJ_QB@|t2vR6NmQ|vfBOwoty8d`}fTUSDenJ8;^^Z&8XF)fghuc{A
z*6fO>E{unT(5}pY+<&rXede;WNM$s#3EbdZMZSaov8E-W<f5WGH#~?T$ij?eEw$;8
zOn(j^^_&-E85bD2CX=!YmXDGfnmXVmbtQ5-oNgL9bhn@UrDF;1;4kC*e#6veIOsez
zU(Cv{w(7p!#T2)^9$zW7?@8#N0lXEcwhFwW;+@~OOYMK-b}c(!#Tf3cJO;1V`8;0D
zr9!2$?H*ShglT5m%RCQ$96BhIuay+P7pH>1)jDeCIS_my!U!{EnfCOa6rn|UYtxN7
zNE`+A8<ZkIz4j^fV&&S+FMUZXEyT)49{klSu@IRgo2VLOk+ra*S+Io%UlmqocrOzq
z{($I3stUWkOm|SkC~^w7#}N-o1y7GdrgegERlOhG5vQsW?#0aXOxC{R|5CQxl(`MR
zV7d11!a{iV06bRV+u`UT_KJhl^r6xb-1~!L^N0TQrb_O)(yD&<^bP7&f6;QVpuKIL
z&7F@tkHw|M(E*+fV^FNe`s!2A^MRyj9npF{7c?&zIj!^}-Xrx+g(aJJ=kN)VXpU1L
zYM?FAN|l0yd6+T+XY2!#h?U}iz>%L*x#3bWa9(jvWc_C8Z_weZD%~dU$K~Ib5<Oc8
ziI&&}jVEL~eqV`IaB>@(_%kCH>kifh`xS&)7(dAq4lgfQ`PGHb6A}4gUnC(>Qg=PW
zqWz`)!CSqiW{q(+tXrl{ipzb&i%{*Mm(i$38w!-70t8}+ijEf<Iyy1qAi7T_&(8g!
zN8HCK_k(OrcU&CbpRQH$@<z<C&%q~?q7vb&5^0p$2F&7k;yMh*w9TK82sqiW&Tj$J
zpP?cTO@CRy7H_$o``kr^vObrSN3oL>>_*Mos+$21>wxpGQ;YqFrCsIVpu^*9rizuN
z>({oMc?Z&mBVis9sHj8_WX;;@Nc58~ktkJImTsRIv{;I(8=%Z?F5Usa993yq`QWF=
z(b4gd?KplBEeefx%^=+(RULwN2*&j2BgDAN+Ed1V!4#2~GsR-~IaXPW4WRy4DC5oc
zfwiH9Aq)LzS*R#MOMtFn+UFdtD1}RROW*7{aRE#C)AP#1f6e9hV8}u4`1IH`Bn(Li
zuJH3b8ocIr@?-@h_1x9o@T#w?+bJZ<EaeWtS9#<yu~M)y*&`D_sv7G&-Ql|3NC<0-
z0tSeu3QJ5RCPRxcC?g53SX1Bj+y~{z6DV3dQRTvo$Z$&s*3|)%)^TsW48HBL1W#|C
z5<5xq?O}LF_CHJPUqr4uDnEk)pC-@+k`5zhJO?A8Y#qiNY~(dG@D$2Ojw{^M&o{8W
z3yQU9)TTJ$&%#TRovci5dz|?<`H%HaCfR}QTn!M|*<CxMN{W&kC8_*+UbO$-eIwRo
zeV0x)?OMQ`d+E19TG4d3jsr1v#?^i^-6o$xlIWnFZ8?(dQEs#IBR^^-@>raQ#yw>M
z|D~)=ph;8K(wuN2jg}sE`TPoV!-J7d*~fT9s;WLcOU@aNQeAdt7k9q(a#A?}r*9)m
zVZrQJ;y<4>cX&gnc+W%Ud9kWP)mmWJi_z<iK)fWs#5bT&3Ax}Na8cH^eib!Xm{$Kn
z6)nGJ+jGFoy}G|TaGd#<ftc7von+!V+D*xtWPn-EZ7#lfjfh!VEPn0kh8z9zrHx_@
z5B{l-8$E)w18lc$9L?FH*nNZ@s({E{H~8_PU8r3{W`4~mzzk)i57+WJVY_DTJoif|
z44qi*)Oc#_@NF%yHON&Us``p1&%W;XyfA-GHZu4Au)a&fVNrE7T8ull;Ff2c=^@zi
z<IBLT3edL$gRx%BMT&&K?7r&B_M00qoC?hNfM5SV?O(_8Vq>`66b-=9R`A0D*mm2)
zZ9=|;XkFm>WmwI3ubs3!vP6mDh)MO&g8{+<I$ur%NC|TEZZ8vI>=E`y+WJpHB){Rs
z=W#-P5^*xu75+S<knPN<@!J0f0$(tJtn=as1hwv-=?b2EzrIP0muUZ?syhw>GRw=H
zk8SA=Zh3z79@i%Rt@B7+snH4cRNH4F#G#JXzHI36AG^7;0<@MGKDX_x`O12P*tk*$
zt+`9Z^py52Ba_mvJo0{fcj)m5GMZKs#-bk{$}tV!(P7m>hPK?lX%w)H9WyMEA+X6-
z`24F=A`P88js1oZ7M>Cb3s-swfyBBR=t||bks3}$Mo#Ya%q$h;A#@v$-MTdZT*_%;
zsbV{wOUowKar@yI*~Q_#yP!(h|4i2!1ca>2TPu&~xFgxDLt>cvqBGAUA9t(=qXq<{
z5O5Ldl~pZa-rIFvu2lRu_`Lrj;vp&3ypc*MkgAb7*Zj(^m`1CCjfhFbABqmtv6VQa
z2IGObYVI;|Lh4|kXoHn-H)PzK>5y$?g$jXSM24P`?ABK}8CNf;Y=%$J&a=f2`{rxH
z!dM@0c0E@_mBXgv7!;QO(%h8CX$z5fL&M&b&p9_UoqYA8O^;GyZf5S=IpCPLF}(Wm
z<FxPll#w?6qQ<px`M5n_z|tQ^4_^D==|r&*3!hT<z{FLM_1tY&Hsr%*rlT#;{(fP|
zIW^dbu={DWrowogoxq%$R)^3Mh-K0Fh%RHlGd~CWjsLMT45%rnMsPzjIV!z%!#6uV
z(K@VulAs;oAUBb2Z|2=F#ed?6_QL8%v<z(4jl<5by8<~|rh=yt-<)A1Rb#Lo@$>GV
zNZb*<WYHK&Xvxp(;)S^sge}Llw-2U<hp~zaT-_qZD3pwPtngGie9qCSh+cj(fSp3{
zDC2WfFCWHq^?%=oT(YN|&r2)CReTq1<%Eiv_OxkSk2MbuMAsdIiG!EG!UO<1R2M5A
zp~onlD4vEui;uz$Dn7g*IV2I`cv0n%71GLm>8_F#n5>-7pp{8YED@{AmS=FG-wy5E
zA`2{~8}4@i7!I#e@V3j!!8UJjZ@@|hG1(Zp)_#9No|Z3x_Mf7zyA(`3l(>-pou5q0
znE5jMxjb+e_KMo{`QWEcmacmDEwUt>(@6EI4fqCC^PQy^(f;>|76SD9#A!IkyPz=m
zWoy~4A#g?-$=5s@vlUP}>IBQNUlX}H@7@0G?ANjGO{u!-IiFK&UW^!6CeR_N$sH4Q
z-q<&*=j?@?cSG~=GB)HzMQhu>%Y`EBQ(?4@XBuH(E46S9E>N;;6P~5Ll!pc=1h}9E
zPIUe)7v_cL*+n|$><y88Z;i_D{ULV^PsExAI5$`%B@vS;RGktWaO7A7E7asD#Rr_4
zBo@tIK+L{X9CO^oM-B$1slq#yrX#oD$b}N}9w2TGLam=f50yTNlm^v9G=-*)Yc-|B
zXJeF#CC7d}O11tu6f|GwZF^%l93VA3bV3L1{;-N^FnEWS3@7q0_FH1_&Zr~s`u>29
zyO(D$F~kY5i9kN68<dn~8NB43_S+{Oc<a-eDdkFMg@K;tF)D7<)J3L~95a6Nw+G4|
zZQb)9cljTu`yY}0AAA2_Rs+8Y4FBIYhkb`&io2mnKH<n5Wti1{q9l;{nE&G(`e3ju
z5oeJ6Aglb`w>nA(Y~<3c|0nzEzQNxN^MyrJqbMOsV2(!PUki`SQHGmBp=yL>^F|ga
z0$tNz3g2>nulDYNcDx1vo-nsU-ym02hf@G87R0)s@~Jfbjqr4H9nfYX;y~L!n;1YF
zkd}x7#qbASp;u+xaSRt|panX*a8T^}&#h4R)m|tRsLKHHhh&jRC;j{{*m|KAu)+ai
z?9W83eR_I`#GhtU;)$C~*)a-SCwOCL5AFsEl{fvF)<*e;Hw659Z$Ve6)zdAP<#Q-X
z03nODMe~AkjwBG9p!2E>JIV*r8#ufOG$b&yJ;PY?V*~2XxHukxWtq=kUO&Px)+o$V
zgRx|wNpZv^0Rw>!jry!_t<@G-1G5RXJ!1uKKlf$}&Yaeegi`Sm0^7+4;37`;01>_4
zESGOg0je|<yu+xI93ZaewD&MFORgNjgB8siHGyvoJl*_8!0$EmHkl{ctH3>h=H)kq
z4hqjaCb^Izu;@DR%BrTOJmNF8WKfL!Y`D~sOB8(zOpMC?Y~87Wv#cXV+>cf*5E
z=1r4c^+22m<7SP;vp0x;9Qq{_ROj6K3G`4etGN`w5IZu=<|Dbskurk1ThpTFf->5N
zJ$xQA)X8L)_Q&F*Hx)>;*SBcp18?{6YK?f|XxE<Zlu`%uka$!vIV!ZKZ!0u8ef=E~
zE8EnM!zY!OXq0@aQ4JHsVo^q~6vTkihnusm*H3L7Eu)yt-uQIm5vfODJT<lGx`o`I
zQL~f;@Rdq(>N6VchIb^Ir1i8##i9{{t`7PGf8fTV6T&h0j*2RclC~cF>~R(L15)9@
zZ%Xd*_j4Ubk^qpe;OF;Q{Rv5qFqfK&Gx)YBx0O;&gTcgw6zyH3aru#IGyP-pgt0l_
z=Kmq1v0|}tx}TF8x<U!uFb$(k9R5QBUnnuEHga_<yjcX*2^I@5i&%L|%CGS~r1k`O
zpAh;U=pbc?a)fc!T`R7sefI>B*aDm}#I@4I{CwXy0%QqTLQ4zR*Iqs#y1oq|fIz@5
zdolgsHgA7V$Sc)&q8X6a^CUnBSqphyly`}R#C$Kbjs!Nk0XVgrYNIZ-+_Kld+;Q~4
z^9rN-Hwm+q2Ha!pw1xFyiXHmpg{^UUA`OiMTMJu(3?pS4*CT4eS^D9pTRyO=ir<tN
zNQZ>5CNNhslNUb@1P36URt<XQSWZs#(CNOujZ-{Bn7=S^ZqW<f-TzD3XH{`9fR|TZ
zHZnB)IEG`fm6rA?zfU|!p+i*6Rb0-p=w&_s4kDLW4qu9;p8Hz8zjjH6lM?!+=FOdU
z#W!M<^SI{n#YG{f?(fpS3i1l<$=>51%RaJZZ~0E=84Tzm>TzdTVn%u}6B4Q3cUt1@
zTf-Md{-8o1?~~{n8d(|UpZ$+!@W?r!Ao%YX^8{=}kYOe$n!@U@?HDmeiqFg>9^-_~
zUQP16X&-HG5Wh45Yj(IEwbpFtTeSCQH+?6~VLA52hD`oMFQ&hGb*ijKyj@{}YGBc5
zO|V9z`RCT1)9Q#5f-x1PHBl)EKPUVVb2psScwY}vB=IL<9zN(oOz6EoG>Q~dff7t;
zX?N(W!mj+~ViDSraPaIB%cI_+&1>bw;4VomO#6uVl2Ku*ankf_j7z2|@2XQiubxoL
zWPL53VwrXoI^#C>Q^3&q#Gs~-wrCMhFqM<n_v4{CG;lxrQ%q786spup)0n+(dxr`n
zIzi%K&1<3e{2_;d&Ej)$td&;;R;;0#xGd90Ke3)LHKv_|8sSXN7duUll`BuC*OPQH
zJ>rVo3Qh!fTDo|eH|TO!=}SaoR?9C%`VUD7(2yc@O?T45##f;;V5obT51n5l_DVq=
z%3l}zVMO#T8qcHhlZbYuxyc}BdO-B2a~CJJ{kHeWOXAZ!8OFq*_>77BylZBx#$ax`
zh-suraEk7-%gDcbGUfRn?l~`K`#FcRmXeo|<>QV0nhKQx&$Z}MPz*p4r8~Yz^XnD_
zQQV5Ni)!}D?wj9;3Y)pyxo~CK^V-112!r$nsEB-M>URoJOA@LDwi?>_lGPwxFokgM
zdSom=pDz?rij`hWQxmS7nJL(L<1MCc>@mS@rysSS0EY*=`1*EVG;0he0G$2WRc6BM
zjgH#CQR*Twxs~e(xKh`YVdih5Vp0p$=tKETU78!hbiPZ3EFp4IMV@u8wwqGo8UXo`
zkmy7fy*QxjQLPF}rUqQ3ZgJpRSYMvep#UeOToC3FDIC<`3zgTE4BhwJlvKwOfofgh
zybtZK<LBrRy&oVo$3`YOxq3ag4=D=0ZP1Pk>kgwZhrnwc!%`<-SwE|izV>+OaFXv}
z?Uf7u8cAx@CVi*Ytvtf1j{BNSR@kYY;^5Sf5F_pbq0-U8HARcX_sb3AM{&!EBI<qF
zz6aSqbuzxF?}pY=zAh|u7qhzDWV5=z3eEIQwQkwY4F{CGkbG4?!B`}8v6<ZPO#fMy
z5S%-OyL&QAH#LH4=unLrhjuo}GaQ~on8278jfK!+Yn0JZ7|V2IZIId_5l<XU@BMYe
z;dA2rU*Y8?e=VQzm20;vt=8GHF$q?4X>zrqmqTVMM%apB$jX|6gvp5=;>NcWU23&4
zlO}^}pceB+*HL2`sFHdGq5nkZ=T+qZAvTlP3x^F!LJdISPoMF{E0tk9`{;@fp{8B8
z+Ox#Q<@&ngB6FMXX>18E+8z*ll@I}&j|&g^3LJDCFRwWbnu$JTu(|Jm+1to|V#NrQ
z#Lk#AatAnKYLo_oFToq+4N(D02mqW9<h=y&t$|e{*4#E4zgE}$Vp^{McMG8BW6I~N
zo9oR0kaoc=L+WoDz1DMXP<#qK{O(7|1g|c){YpdI$zWjlIaUQOe%TJ4|4fsYpXJfk
zL2LpMA;dl1V7CUM3%7rwS&BX5>%;1{XGqA4(;#)SK$#}D8$4^>IwC?4@sPRPuMZyg
zM!P*}^*tA-pKvoZ4DH*nvVPfa%U$0mZewuo<yEJIhjrGB7uU}Ee>I+owwD;>7}~!l
zbE4(!BnHpSck_BAvA+tTJ$Zy&mbwwpD0DuGVJD`<w?k#5l4k7(SaPNQk(Id+H~$L`
z2{?5pi`uhD_?G~a+^*!kzZdFl52fWMz*yQtGe0gg^dXmVuM({`ttP~2s>b5Wqt6hM
zJ?QAyv_NxlXL)EnIPr*Snh04K+tkiVB8sFew?q6zG-))n#@%q)_is*n>FxVOLx+8r
zvj9C<t?9DT69#Em4{oi$zgKc1fTTk?b;u=TsT4ED@0_W|$y7KG(Dg7q?FXHDy2bsk
zTr%~x0D;&jb3qWaSc`o+?ytG}oC$I*2yp;bCrCa5mm3eac0_mMYjSOycltyH8-Fjq
zJb*kV@LSvQpdW5HIgueLDl?}S$>bS0ehBZpHR&J~UtZ{y6cim@{`{TkgUUJ_N}Wn8
z0~*-D6=&b3kps9nM64v+HSRrQf{2K2G=wZ)O28C~fez)?50l5h_?q8iC1C%fWIxkl
zmuT3tOfZG)f+TLc$qZG0H4(!4`tn^^`Qr_Lv2o0+-Yd@o3qU$0;u+fqK=K!|?EbDD
zjtb+=(_9m7Eu_S%Y&i~)bjWhZFM(g0S($@(6APb5Mt}KXX}!|<uk@v|k`0Lp-3-&+
z*zN4{M+RPK(*Uc8l~+syzsuL7;s@bVE`6kCo0Wd=4?3ISM3<gHb$mrdW=W-JkDdy#
zUDowQfVaXTy4SBU)HJ}NVbPCLN4fS*!e1+in)8iF#KH=G2mvJO;vv?JFTH`8E_oL1
z(FyfhTbmdTvzTpfhdJaHzb?2wZ=+m@OCcr1ml=Z*i5~kX?eg}D7KI0oI%pl9_E~!P
zpn#%xyhH$G0XpY0I!E2IR}UugB=!V9j4kgNI`CZo;rHodMs_#*rQ&3R6K|r_1r^~V
zqe;51SDC_$Sb&LaTv*t!z2@d=oRr?qfxjsp?OQ0AVR`PWh{$NAb6(NfKO1jSX6QcV
z3%|u#Ann)Q4g#LK^G1QK95*;N_=wsJpe>0_oZs-R^v<2>C#?*ge##e)YuaA_h(!PA
z#2PC>UpGU`-^&cDtp#(ebL1iXt}%6Eo9oL8EY-tvx7H?jpu1jgP3JA9D!ZFiKJ&f1
zX6oV5?{;*7U&)ve6<@^wSCmo6PZ{X^3C5zB(uO>ii4sjDwm|p-Q5c<$L6LkP1ikFX
zJ&VRG(E0hz#GMi&7aHh=B5T7^bl)ZpZ8}uGB(x)uD1t&^R$S~*^1GlN_lZgZZocN3
zubY?XCQ~k!BT~QpDk$%C&^0IjzJPD%Z4EEAg)JvEed7Dc1w-E7jB8p4SMEU6&(K^>
z74uiK3E8oE0z;x?HMmNto)ao#Thw0eQFbdWBUAN^udXoYn<F>i6%h*s3a8zo+<^z5
zU0ERjr!vMZ07(I-;#>Z^SP#4V^{RZao_bOwOFvMkiGHR;*WW9`Ns>CKx+O-u{nD1U
zln~LA<8*!JQd^x5Xu44Dm8++F|6rNr)7i}138WPR%DmJSijPz<uLn?`Wr1gsNtkD-
zasCxb1s>N%OTCoOkqD4MF1^R}Z3DPXou*Y$ekRFVJe&vC;W#n^k1HSyV3+mthM+M>
z{`M#^LL1)mAdr7`w)1>66#^`;O#%%D??E-XIL+k`)&+RA@G9I)D6T5Bti^%r8=o`n
z#6H|QdCRg&lu|mxMKg==Zk825q`2Fxg*~Uku~4py3`W;QREN^SHW}wK%8oQ$f&DP)
z^C5sri*tC<pGf2jwhIgiWpv(sX9|(uk$<Gf4l)k$_GVr@nOnTGr^E#AR40nxV#~#9
z+^LymWhWt3iz)GIE%lMYgJC~FWrVAo*)mfbb;F*x_TE=dGx-t%xwEf0^}DfGk$qi<
z3{qQMsF}NRrhfqCoR>)v-(;6>)fkx=!!GN-7)teK29hG`7wFpoJ;LP>p7P6B1;S}6
zop)y|3Q%Z=1W{)f(^++a#6?`jmu=kcv)obZa(M4EC;g(ggF+c})|Dl%aeg+%mk<Gn
z%U0&@60tkkhZs{~<$*DJy!<uY7vy0xDJ`Yx9GbJ(jTf?mykCdDBcCh@zBH=8a_Gob
zbSHQmJtW|KbJ_Ba!S`5M@=}Ih6KF=?=~kdAF!03@p|g&U|7K)_mFLc^KZD2v#rrB>
z4b$%d^{qtT`h(ujh3tp}qLq`aRHO>~6MJeU*D7~NAMs{5iuOF)!5Ok?q%UL_^#~(^
zzG;-@A*&JVzySW>R4x?=9qmnZ=4ATq^ru>e({+ETFuZ|mJ$W1kWJiX())Khaqfzpg
zvfQ4KY=1;<XiyX+D>3GVp(Uu%T`QA`AnY#W(1WBkdYT7!oj23lG(7ZD8}tuLQ&`gD
zca$*Mp~I1Y2J)(>MUYIbx!&nk%rA`}V_+A+#g~XB5Y*~QlX-26&XH|jUXy---AIgY
zMIx^6pL9sPz}iYQS%G9Ms~t!xR!gKEl^=%J89pgpviomjP&THiqYFg)FO^Od0B3IC
zV+G<>PY~U%GF`rDgx`h8wQpgTvZfy-qibz@ATo0iA#<NovuexMxT%`2e61evuS&Y`
zM`!b^Jn#t>-GxKd<>htP?1qd#w$yK1&oQE(Dt3{l$h-STM0;`^ogxe%;kGlrxbKMN
z#N0sb`upmr&Pv6`pg28n&UG(TRLscSPYWs2Z>p4W<&k)buTF4-ZYw6_RKx&n=Z41r
zz-drqS;?xw*mRGpL5Qk?eB|fc@<wef`{Ulrw7Z}@t*P<8x#i+pCx_~m(mwhU_*9yb
z^{*A$I#6k~yX<^IYXL|L#M_6^g)!00a#iVy=fh;cd?9sX1mkn;$KPWFfGowhUewYG
z5zzPCVsHUz9b836=;}pir<lBhD^DNpeYH$X2E3_x{w1Tsxxi4{1T;67#meY~HDM3q
zO+VcGSl7L5>(BiWR>)nr3<RV-gEt-cO$Q8YX0{jom>nN07DYtu8w)YWu+|4nkUTOs
zjw`WepqV&OtDj;$Nd63T+c|^nk-Lr-8S#JUQqzuZc{gM-$qf=+$&2|Y*A2Y;=W03O
zX&G|3q?}y^I+Egu3tEvW4_d5P?XA<EBhMN@WSF~r@%&G;x|ECW(1u0B>%Bftzx7sS
zoqAR_pQRc~g9n3pHQ-u%N6Y7)wtT~@oO_)mEa8LT5C8v7daekv2gM8|HYVHYV!R+u
znj`<-9GzA+ej{7|3%L^%?yC1mYpC|VlNQUHGrKtLUt7ulw-00h#(bhJjv5y^3<tSm
zqt{ix9nRIhlJ;rpNrG*^NRH#A;E5d0zIHdGuA66CQd<%zML7=LcdLXq>7iFP$-z%d
z<)GnyuC~Z*ob856jlx&X^JyH}uS-K_nS*~a{(gCTZOD?<aFg%9ebd5Kyq10-CjL8S
zwFEP`crw+mt{_aL>D>gte#EiT_L3apiO)c}kX#fh94&73;!QK3DLGl9A0ue{qdV{f
zpr1kQJbw-j)UNvVR9WQEL71VcITW;h0PX~Tg;v?}unB7LdKoBAc7mEpzj>O!bWaVL
zI`_eP`7b+M9wyZ^3MZ!~K6hp1%AopSQT))oYzf`M9(Mq24}>o6uW&IuPVd{dxDNsd
za$V|Ks~@L>h_e8dhiU)*@3rgiiY^F(;_;(+(VX+mX)&Ck?E`ddN2cwYBx0{juMVMU
zLmyIeVdg;bj!K^$T?!)VY8~D;xTagYPTF%CSrY~*GkgH!t>RZ#8`g4rPiBplV^>wr
zmnGh%3fHf6p-t*yg=#qIA{DYET*kY4ZU3Z`Q!d6YGhb$7m7uY>=)`I@^;`eOcy=AK
z)8<D1GKog6y=D43XHq_{g3TdVz{@}kW(;RgC4j%MQ29u~cGn?wn5%dcNR~)cK{F+q
zD{Hl%rXi<XJlFCT>_zGfnsjm+x8T&><dfC)(oj>7VuhFGY^jFt^>A~QOiuW;vtf8@
zqL(peTC^hntp?Gm8d?1k>FYo>JXr=_6<!TG%Wqy?tt%_50scldDY8<}ljrLT3rsU|
zG~jEnks5&Z%r$G2%RTyd)FZAw-YP7F)$-wkk*i^ojk2{yk}<^=Uo3#XCpK;?_D<1F
z5tl})uqu9_s{Na$)Ab=$?FF)oQG2lGyAe03X?+l&7X@jQ(OZ;csrcp%9k@Z+hk~>~
zw-?($OOV7D6JM<D(SR6|O30Q~0UfRwl~X(zF!ajFvG=lKY;W;be*2W;r^HENJ1^!%
zb2<4P@;0u~(H}ajQ~$7FRGMLc&q@-}{<yZm&4Wi{J&HJgyCdI=SBh2UMOiId(jk{2
z%aE+k2O#*bDu)H|cUr3E_Qx44gWQIG09v8l*aHF4n+Z{;=<NgB$Qp3FB%2;>v2i!~
zT$k(!{|K@icU6k|&xZgPu}w^U0lumD47X8l4X^)Pn4$#bs2YydfB{KIa`wQoA)tad
zMq24Xu0*Q_=rtWCBzQSgJg6M!QLORG&3&OTn2CguXyqQnwGYUPD3pxZ(GLRL4qDXg
z)htX*XX^OxvmQRzP0TxohZL}@SC!gn1r(q*>0sn^J4+S0T=X29&RejFyLl9uvrS!C
z@xzm9e*qJbC^Z=K!G)ety!ZKYGKgDo{kA+kvvC}8WEnoXwaHFfUMiesPcAIDwN|Y@
z(HNyt;t4>+ZK3Qj&klzM?h4RsDqU*vVv1!;1#d@&?WvSR;GV7x2|Gj9HDOjs#Gwya
z2cdz7*rpT>%6-yG-e;crYrCf{T8W)QDM357nCD5rBQqT1?r{_WSKP-j0n+<CAq14G
z{C-=n5O*K@1&th>F)jmX0m^qU`vO_F)Y<PAcqZ<uO&-Z^0w#}1{ms~@{2=RB_OIRx
z%cqfgtX^kK{$c#fP$)74A<2zEXMMlBAPs@o&JwL(W{Jl=yyA%#*Ly)$vQ<ZM@6)0_
zhEUUkkbhpf;5=(bpcrrj;T3H?EInO!6gGljz}(>h$I*p8c1~Nc<MH?1V;&LSDXDV`
zSx#<y#!lT{PEN)6c#kXn?<tFB3;}nyIJ1p+KUXN*DMfgR(kcxr@NZJO>@rIJTrmtM
zU~!eH1f(K>RKggg3;1m7CmOlB%`!-EYi!C|+vj}oD->_)i}~eHr5yWH?BtRcoc|>t
zNb@#~Jb=r!10JlgXOqnmsmmZK`G!Zv9245aMvrxsVIG=IH$kzGTrXzCte*|X3{d0U
zqs$Q`0sU=y`Y-SQrBEcHkpMoer2weWHqz<4(Pl7f`dm2U3+oc!;?G?9G+$-5QGAXo
za05~K*Mgq_{I)b?T)>#(>FxCYy*FGPUi7+GvFHg*(7um($nq=-pew}GMXWS%bq+0C
zytyuL*2A13wg2#CTD?J}()-<5EZVh;)%<|T7nv9cxt&9Q^d9)oG3d0J6{_%r`|bY&
DFOY3-

literal 0
HcmV?d00001

diff --git a/packages/lane_helpers/docs/index.rst b/packages/lane_helpers/docs/index.rst
new file mode 100644
index 0000000..40095e5
--- /dev/null
+++ b/packages/lane_helpers/docs/index.rst
@@ -0,0 +1,11 @@
+Lane Helpers
+============
+
+This is the documentation for the ``accvlab.lane_helpers`` package.
+
+.. toctree::
+   :maxdepth: 1
+
+   introduction
+   api
+   example
diff --git a/packages/lane_helpers/docs/introduction.rst b/packages/lane_helpers/docs/introduction.rst
new file mode 100644
index 0000000..9bf3eac
--- /dev/null
+++ b/packages/lane_helpers/docs/introduction.rst
@@ -0,0 +1,113 @@
+Introduction
+============
+
+Polyline Sampling
+-----------------
+
+Functionality
+^^^^^^^^^^^^^
+
+The ``lane_helpers`` package provides utilities for lane-processing workloads.
+
+The main functionality is batched polyline interpolation. A polyline is a sequence of points in the
+space :math:`\mathbb{R}^D`, written as :math:`\mathbf{p}_i`, where each pair of consecutive points defines
+one line segment.
+
+Given sampling distances :math:`d_j` measured from the first point :math:`\mathbf{p}_0` along the
+polyline, the sampling function :func:`~accvlab.lane_helpers.polyline.interpolate` returns the
+corresponding sampled points :math:`\mathbf{q}_j`.
+
+.. figure:: images/polyline_sampling_illustration.png
+   :alt: Illustration of polyline sampling
+   :align: center
+   :scale: 45%
+
+   Two-segment polyline sampled at two distances. The input points are shown as green circles, and the
+   sampled points are shown as red circles.
+
+Sampling distances do not need to be sorted. Distances can be provided either as absolute distances along
+the polyline or as fractions of each polyline's total length.
+
+Point coordinates are not limited to 2D. The coordinate dimension is the last tensor dimension, and 2D,
+3D, and higher-dimensional coordinates are supported.
+
+For batches with variable numbers of points or distances, use
+:func:`~accvlab.lane_helpers.polyline.interpolate_var_size_batch` with
+:class:`~accvlab.batching_helpers.RaggedBatch` inputs.
+
+Functionality to compute the total length of each polyline is also provided (through
+:func:`~accvlab.lane_helpers.polyline.lengths` and :func:`~accvlab.lane_helpers.polyline.lengths_var_size_batch`).
+
+Runtime Evaluation
+^^^^^^^^^^^^^^^^^^
+
+The runtime evaluation compares batched interpolation for both CPU and CUDA against a Shapely LineString
+reference over a grid of point counts, numbers of sampled distances, and batch sizes. Runtime plots report
+milliseconds per interpolation call, while speedup plots report the x-fold improvement over the Shapely
+reference.
+
+.. seealso::
+
+   The evaluation script is available at ``packages/lane_helpers/evaluation/shapely_evaluation.py``. It can be
+   used to run the benchmark sweep for different problem sizes on your target system.
+
+Performance depends on the batch size for both CPU and CUDA execution. CUDA parallelism scales with the number
+of polylines in the batch, so very small batch sizes may not fully utilize the GPU.
+
+For practical problem sizes, it is recommended to choose the implementation based primarily on where the
+tensors already live: CPU inputs should generally stay on CPU, and CUDA inputs should generally stay on CUDA.
+Moving tensors only to use a different implementation can dominate the interpolation cost.
+
+The plots below focus on batch sizes 1 and 64 as examples. The evaluation script runs for more batch sizes by
+default, and other batch sizes can be easily added.
+
+.. note::
+
+   The following measurements are intended as directional guidance. Exact runtimes depend on the system used,
+   with performance primarily influenced by the CPU and GPU.
+
+   The plots shown here were generated on a system with an ``NVIDIA RTX 5000 Ada Generation`` GPU and an
+   ``AMD Ryzen 9 7950X`` 16-Core Processor.
+
+.. note::
+
+   In the following runtime plots, markers highlight the smallest measured problem size, the largest measured
+   problem size, and the 100-point/100-distance cell.
+
+   In the speedup plots, markers highlight the smallest measured problem size and the largest speedup. If the
+   speedup is not above 1x everywhere, they also mark the first configuration with equal point and distance
+   counts where the speedup exceeds 1x, and representative cells near it.
+
+Batch size 1 shows behavior for the smallest batch configuration in the benchmark:
+
+.. figure:: _generated/polyline_runtime_evaluation/batch_1_runtime_comparison.png
+   :alt: Runtime comparison heatmaps for batch size 1
+   :align: center
+   :width: 100%
+
+   Runtime comparison for batch size 1. Rows vary the number of polyline points, and columns vary the number
+   of sampled distances.
+
+.. figure:: _generated/polyline_runtime_evaluation/batch_1_speedup_comparison.png
+   :alt: Speedup comparison heatmaps for batch size 1
+   :align: center
+   :width: 100%
+
+   Speedup comparison for batch size 1.
+
+For larger batch sizes, CUDA can expose more parallel work and its speedup over the other methods typically
+becomes more pronounced. Batch size 64 shows this behavior:
+
+.. figure:: _generated/polyline_runtime_evaluation/batch_64_runtime_comparison.png
+   :alt: Runtime comparison heatmaps for batch size 64
+   :align: center
+   :width: 100%
+
+   Runtime comparison for batch size 64.
+
+.. figure:: _generated/polyline_runtime_evaluation/batch_64_speedup_comparison.png
+   :alt: Speedup comparison heatmaps for batch size 64
+   :align: center
+   :width: 100%
+
+   Speedup comparison for batch size 64.
diff --git a/packages/lane_helpers/docu_referenced_dirs.txt b/packages/lane_helpers/docu_referenced_dirs.txt
new file mode 100644
index 0000000..1e107f5
--- /dev/null
+++ b/packages/lane_helpers/docu_referenced_dirs.txt
@@ -0,0 +1 @@
+examples
diff --git a/packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py b/packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py
new file mode 100644
index 0000000..2cf0d9f
--- /dev/null
+++ b/packages/lane_helpers/evaluation/_shapely_evaluation_outputs.py
@@ -0,0 +1,157 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+import numpy as np
+
+_LARGE_SPEEDUP_THRESHOLD = 1_000.0  # magnitudes at or above this are written in scientific notation
+
+
+# Format a speedup with two decimals: fixed-point below the threshold, scientific notation otherwise.
+def _format_speedup_value(value: float) -> str:
+    if abs(value) >= _LARGE_SPEEDUP_THRESHOLD:
+        return f"{value:.2e}"
+    return f"{value:.2f}"
+
+
+# Render a 2D metric array as a Markdown table: one row per point count, one column per distance count.
+def _format_table(
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    scientific: bool,
+) -> str:
+    rows = [
+        "| # Points (down) / # Distances (right) | " + " | ".join(str(item) for item in nums_distances) + " |"
+    ]
+    rows.append("| :----- |" + " :-----: |" * len(nums_distances))  # separator row, one entry per column
+    for points_idx, num_points_current in enumerate(nums_points):
+        values = []
+        for distances_idx in range(len(nums_distances)):
+            value = data[points_idx, distances_idx]  # data is indexed as [points, distances]
+            if scientific:
+                values.append(np.format_float_scientific(value, precision=3))
+            else:
+                values.append(_format_speedup_value(value))
+        rows.append(f"| {num_points_current} | " + " | ".join(values) + " |")
+    table = "\n".join(rows)
+    return table
+
+
+# Format one metric table and write it to `filename` as UTF-8 text followed by a trailing newline.
+def _write_markdown(
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    filename: Path,
+    scientific: bool,
+) -> None:
+    table = _format_table(data, nums_points, nums_distances, scientific=scientific)
+    filename.write_text(table + "\n", encoding="utf-8")
+
+
+# Write the Markdown table for one metric to `filename_stem` with its suffix set to ".md".
+def _write_metric_outputs(
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    filename_stem: Path,
+    scientific: bool,
+) -> None:
+    _write_markdown(
+        data,
+        nums_points,
+        nums_distances,
+        filename=filename_stem.with_suffix(".md"),
+        scientific=scientific,
+    )
+
+
+# Entry point: write one Markdown table per metric for one batch size, named `batch_<size>_<metric>.md`.
+def write_batch_results(
+    output_dir: Path,
+    batch_size: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+    shapely_runtime_ms: np.ndarray | None,
+    cpu_runtime_ms: np.ndarray,
+    cuda_runtime_ms: np.ndarray,
+    skip_shapely: bool,
+    assert_results: bool,
+    max_abs_diff_cpu: np.ndarray | None,
+    max_abs_diff_cuda: np.ndarray | None,
+    max_abs_diff_cuda_vs_cpu: np.ndarray | None,
+) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    cuda_speedup_over_cpu = cpu_runtime_ms / cuda_runtime_ms  # ratio of runtimes; > 1 means CUDA was faster
+    prefix = f"batch_{batch_size}"
+
+    def write_metric(
+        metric_name: str,
+        data: np.ndarray,
+        *,
+        scientific: bool,
+    ) -> None:
+        _write_metric_outputs(
+            data,
+            nums_points,
+            nums_distances,
+            filename_stem=output_dir / f"{prefix}_{metric_name}",
+            scientific=scientific,
+        )
+
+    if not skip_shapely:  # NOTE(review): assumes shapely_runtime_ms is not None in this branch
+        cuda_speedup_over_shapely = shapely_runtime_ms / cuda_runtime_ms
+        cpu_speedup_over_shapely = shapely_runtime_ms / cpu_runtime_ms
+        write_metric(
+            "runtime_shapely",
+            shapely_runtime_ms,
+            scientific=True,
+        )
+    # CPU and CUDA outputs are always available; Shapely-related metrics are optional.
+    write_metric("runtime_cuda", cuda_runtime_ms, scientific=True)
+    write_metric(
+        "runtime_cpu",
+        cpu_runtime_ms,
+        scientific=True,
+    )
+    if not skip_shapely:
+        write_metric(
+            "speedup_cuda_vs_shapely",
+            cuda_speedup_over_shapely,
+            scientific=False,
+        )
+        write_metric(
+            "speedup_cpu_vs_shapely",
+            cpu_speedup_over_shapely,
+            scientific=False,
+        )
+    write_metric(
+        "speedup_cuda_vs_cpu",
+        cuda_speedup_over_cpu,
+        scientific=False,
+    )
+    if assert_results:  # NOTE(review): assumes the max_abs_diff_* arrays used below are not None
+        write_metric(
+            "max_abs_diff_cuda_vs_cpu",
+            max_abs_diff_cuda_vs_cpu,
+            scientific=True,
+        )
+    if assert_results and not skip_shapely:
+        write_metric("max_abs_diff", max_abs_diff_cuda, scientific=True)  # CUDA data, no device suffix
+        write_metric("max_abs_diff_cpu", max_abs_diff_cpu, scientific=True)
diff --git a/packages/lane_helpers/evaluation/plot_shapely_evaluation.py b/packages/lane_helpers/evaluation/plot_shapely_evaluation.py
new file mode 100644
index 0000000..34acf06
--- /dev/null
+++ b/packages/lane_helpers/evaluation/plot_shapely_evaluation.py
@@ -0,0 +1,595 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from dataclasses import dataclass
+from pathlib import Path
+
+import matplotlib
+
+matplotlib.use("Agg")
+
+import matplotlib.colors as colors
+from matplotlib.axes import Axes
+from matplotlib import pyplot as plt
+import numpy as np
+
+DEFAULT_ANNOTATE_PLOTS = True
+_LARGE_SPEEDUP_THRESHOLD = 1_000.0  # magnitudes at or above this are formatted in scientific notation
+_PLOT_FIGSIZE = (6.4, 5.2)
+_COMPARISON_SUBPLOT_WIDTH = 6.6
+_PLOT_SUPTITLE_FONT_SIZE = 22
+_PLOT_TITLE_FONT_SIZE = 18
+_PLOT_AXIS_LABEL_FONT_SIZE = 16
+_PLOT_TICK_LABEL_FONT_SIZE = 14
+_PLOT_COLORBAR_TICK_LABEL_FONT_SIZE = 14
+_PLOT_ANNOTATION_FONT_SIZE = 16
+_PLOT_ANNOTATION_MARKER_SIZE = 52
+_PLOT_ANNOTATION_X_OFFSET = 0.25
+_PLOT_COLORBAR_FRACTION = 0.046
+_PLOT_COLORBAR_PAD = 0.02
+
+
+@dataclass(frozen=True)
+class _MetricPlotConfig:
+    title: str  # human-readable metric name, e.g. "CUDA vs. Shapely"
+    annotation: str | None = None  # "runtime", "speedup", or None in _METRIC_PLOT_CONFIGS
+
+
+_METRIC_PLOT_CONFIGS = {  # keys match the metric names used by write_batch_results for its table files
+    "runtime_shapely": _MetricPlotConfig("Shapely", annotation="runtime"),
+    "runtime_cuda": _MetricPlotConfig("CUDA", annotation="runtime"),
+    "runtime_cpu": _MetricPlotConfig("CPU", annotation="runtime"),
+    "speedup_cuda_vs_shapely": _MetricPlotConfig("CUDA vs. Shapely", annotation="speedup"),
+    "speedup_cpu_vs_shapely": _MetricPlotConfig("CPU vs. Shapely", annotation="speedup"),
+    "speedup_cuda_vs_cpu": _MetricPlotConfig("CUDA vs. CPU", annotation="speedup"),
+    "max_abs_diff_cuda_vs_cpu": _MetricPlotConfig("CUDA max abs. difference to CPU"),
+    "max_abs_diff": _MetricPlotConfig("CUDA max abs. difference to Shapely"),
+    "max_abs_diff_cpu": _MetricPlotConfig("CPU max abs. difference to Shapely"),
+}
+_SHAPELY_DEPENDENT_METRICS = frozenset(  # metrics that write_batch_results only emits without skip_shapely
+    {
+        "runtime_shapely",
+        "speedup_cuda_vs_shapely",
+        "speedup_cpu_vs_shapely",
+        "max_abs_diff",
+        "max_abs_diff_cpu",
+    }
+)
+_RUNTIME_METRICS_WITH_SHAPELY = ("runtime_shapely", "runtime_cpu", "runtime_cuda")
+_RUNTIME_METRICS_WITHOUT_SHAPELY = ("runtime_cpu", "runtime_cuda")
+_SPEEDUP_METRICS_WITH_SHAPELY = (
+    "speedup_cpu_vs_shapely",
+    "speedup_cuda_vs_shapely",
+    "speedup_cuda_vs_cpu",
+)
+_SPEEDUP_METRICS_WITHOUT_SHAPELY = ("speedup_cuda_vs_cpu",)
+
+
+# Format a speedup with two decimals: fixed-point below the threshold, scientific notation otherwise.
+def _format_speedup_value(value: float) -> str:
+    if abs(value) >= _LARGE_SPEEDUP_THRESHOLD:
+        return f"{value:.2e}"
+    return f"{value:.2f}"
+
+
+# Split one Markdown table row ("| a | b |") into stripped cells; raises ValueError without edge pipes.
+def _split_markdown_table_row(row: str) -> list[str]:
+    row = row.strip()
+    if not row.startswith("|") or not row.endswith("|"):
+        raise ValueError(f"Expected Markdown table row, got: {row}")
+    cells = [cell.strip() for cell in row.strip("|").split("|")]  # NOTE(review): "||1|" loses its empty cell
+    return cells
+
+
+# Load one metric table written by `_write_markdown`; returns (point counts, distance counts, values).
+def _read_metric_table(filename: Path) -> tuple[list[int], list[int], np.ndarray]:
+    table_rows = [
+        line.strip()
+        for line in filename.read_text(encoding="utf-8").splitlines()
+        if line.strip().startswith("|")  # lines that do not start with a pipe are ignored
+    ]
+    if len(table_rows) < 3:
+        raise ValueError(f"Expected a Markdown header, separator, and at least one data row in {filename}")
+
+    header_cells = _split_markdown_table_row(table_rows[0])
+    if not header_cells or not header_cells[0].startswith("# Points"):
+        raise ValueError(f"Expected first Markdown header cell to describe point counts in {filename}")
+    nums_distances = [int(cell) for cell in header_cells[1:]]  # remaining header cells: distance counts
+    nums_points: list[int] = []
+    values: list[list[float]] = []
+
+    for row in table_rows[2:]:  # rows 0 and 1 are treated as header and separator
+        row_cells = _split_markdown_table_row(row)
+        if len(row_cells) != len(nums_distances) + 1:
+            raise ValueError(f"Expected {len(nums_distances) + 1} cells in {filename}, got {len(row_cells)}")
+        nums_points.append(int(row_cells[0]))  # first cell of each data row is the point count
+        values.append([float(cell) for cell in row_cells[1:]])
+
+    data = np.asarray(values, dtype=np.float64)
+    return nums_points, nums_distances, data
+
+
+# Choose the speedup heatmap cells to label; returns unique (points_idx, distances_idx) pairs in order.
+def _selected_speedup_annotation_cells(
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+) -> list[tuple[int, int]]:
+    def find_value_index(values: list[int], value: int) -> int | None:
+        try:
+            index = values.index(value)
+        except ValueError:
+            return None
+        return index
+
+    def add_unique_cell(cells: list[tuple[int, int]], cell: tuple[int, int]) -> None:
+        if cell not in cells:
+            cells.append(cell)
+
+    def find_first_faster_distance_idx(points_idx: int) -> int | None:
+        for distances_idx in range(len(nums_distances)):  # NOTE(review): >= 1.0 here, > 1.0 in main loop
+            if np.isfinite(data[points_idx, distances_idx]) and data[points_idx, distances_idx] >= 1.0:
+                return distances_idx
+        return None
+
+    def find_first_faster_points_idx(distances_idx: int) -> int | None:
+        for points_idx in range(len(nums_points)):  # NOTE(review): >= 1.0 here, > 1.0 in main loop
+            if np.isfinite(data[points_idx, distances_idx]) and data[points_idx, distances_idx] >= 1.0:
+                return points_idx
+        return None
+
+    cells: list[tuple[int, int]] = []
+
+    points_idx = find_value_index(nums_points, 2)  # cell for 2 points / 1 distance, if both were measured
+    distances_idx = find_value_index(nums_distances, 1)
+    if points_idx is not None and distances_idx is not None:
+        add_unique_cell(cells, (points_idx, distances_idx))
+
+    finite_mask = np.isfinite(data)
+    if np.any(finite_mask):
+        finite_data = np.where(finite_mask, data, -np.inf)  # non-finite cells cannot win the argmax
+        points_idx, distances_idx = np.unravel_index(np.argmax(finite_data), data.shape)
+        add_unique_cell(cells, (int(points_idx), int(distances_idx)))
+
+    finite_values = data[finite_mask]
+    if finite_values.size > 0 and not np.all(finite_values > 1.0):  # some finite cell is at or below 1x
+        distances_idx_by_value = {value: idx for idx, value in enumerate(nums_distances)}
+        for points_idx, num_points_current in enumerate(nums_points):
+            distances_idx = distances_idx_by_value.get(num_points_current)  # equal point/distance counts
+            if distances_idx is None:
+                continue
+            if np.isfinite(data[points_idx, distances_idx]) and data[points_idx, distances_idx] > 1.0:
+                add_unique_cell(cells, (points_idx, distances_idx))
+                if points_idx > 0:  # also label a cell in the previous point-count row
+                    faster_distances_idx = find_first_faster_distance_idx(points_idx - 1)
+                    if faster_distances_idx is not None:
+                        add_unique_cell(cells, (points_idx - 1, faster_distances_idx))
+                if distances_idx > 0:  # also label a cell in the previous distance-count column
+                    faster_points_idx = find_first_faster_points_idx(distances_idx - 1)
+                    if faster_points_idx is not None:
+                        add_unique_cell(cells, (faster_points_idx, distances_idx - 1))
+                break  # only the first equal-count cell above 1x is handled
+
+    return cells
+
+
+# Helper function for choosing which runtime heatmap cells should get marker labels.
+def _selected_runtime_annotation_cells(
+    nums_points: list[int],
+    nums_distances: list[int],
+) -> list[tuple[int, int]]:
+    def find_value_index(values: list[int], value: int) -> int | None:
+        try:
+            index = values.index(value)
+        except ValueError:
+            return None
+        return index
+
+    def add_unique_cell(cells: list[tuple[int, int]], cell: tuple[int, int]) -> None:
+        if cell not in cells:
+            cells.append(cell)
+
+    cells: list[tuple[int, int]] = []
+    if nums_points and nums_distances:
+        add_unique_cell(cells, (0, 0))
+        add_unique_cell(cells, (len(nums_points) - 1, len(nums_distances) - 1))
+
+    points_idx = find_value_index(nums_points, 100)
+    distances_idx = find_value_index(nums_distances, 100)
+    if points_idx is not None and distances_idx is not None:
+        add_unique_cell(cells, (points_idx, distances_idx))
+
+    return cells
+
+
+# Helper function for placing numeric labels on selected speedup heatmap cells.
+def _speedup_annotation_text_position(
+    points_idx: int,
+    distances_idx: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+    selected_cells: list[tuple[int, int]],
+    data: np.ndarray,
+    max_speedup_cell: tuple[int, int] | None,
+) -> tuple[float, str]:
+    if max_speedup_cell == (points_idx, distances_idx) and distances_idx > 0:
+        return distances_idx - _PLOT_ANNOTATION_X_OFFSET, "right"
+
+    is_left_of_value_diagonal = nums_distances[distances_idx] < nums_points[points_idx]
+    has_adjacent_above_one_annotation = any(
+        (other_points_idx, other_distances_idx) != (points_idx, distances_idx)
+        and abs(other_points_idx - points_idx) + abs(other_distances_idx - distances_idx) == 1
+        and np.isfinite(data[other_points_idx, other_distances_idx])
+        and data[other_points_idx, other_distances_idx] >= 1.0
+        for other_points_idx, other_distances_idx in selected_cells
+    )
+    should_place_left = distances_idx == len(nums_distances) - 1 or (
+        distances_idx > 0 and is_left_of_value_diagonal and has_adjacent_above_one_annotation
+    )
+    if should_place_left:
+        return distances_idx - _PLOT_ANNOTATION_X_OFFSET, "right"
+    return distances_idx + _PLOT_ANNOTATION_X_OFFSET, "left"
+
+
+# Helper function for drawing optional numeric labels on selected speedup heatmap cells.
+def _annotate_speedup_heatmap(
+    ax: Axes,
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+) -> None:
+    selected_cells = _selected_speedup_annotation_cells(data, nums_points, nums_distances)
+    finite_mask = np.isfinite(data)
+    max_speedup_cell = None
+    if np.any(finite_mask):
+        finite_data = np.where(finite_mask, data, -np.inf)
+        points_idx, distances_idx = np.unravel_index(np.argmax(finite_data), data.shape)
+        max_speedup_cell = (int(points_idx), int(distances_idx))
+
+    for points_idx, distances_idx in selected_cells:
+        value = data[points_idx, distances_idx]
+        if not np.isfinite(value):
+            continue
+
+        ax.scatter(
+            [distances_idx],
+            [points_idx],
+            marker="o",
+            s=_PLOT_ANNOTATION_MARKER_SIZE,
+            c="black",
+            edgecolors="white",
+            linewidths=0.8,
+            zorder=3,
+        )
+
+        text_x, horizontal_alignment = _speedup_annotation_text_position(
+            points_idx,
+            distances_idx,
+            nums_points,
+            nums_distances,
+            selected_cells,
+            data,
+            max_speedup_cell,
+        )
+        ax.text(
+            text_x,
+            points_idx,
+            _format_speedup_value(value),
+            ha=horizontal_alignment,
+            va="center",
+            fontsize=_PLOT_ANNOTATION_FONT_SIZE,
+            color="black",
+            bbox={"boxstyle": "round,pad=0.12", "facecolor": "white", "edgecolor": "none", "alpha": 0.75},
+            zorder=4,
+        )
+
+
+# Helper function for drawing fixed reference markers on runtime heatmap cells.
+def _annotate_runtime_heatmap(
+    ax: Axes,
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+) -> None:
+    for points_idx, distances_idx in _selected_runtime_annotation_cells(nums_points, nums_distances):
+        value = data[points_idx, distances_idx]
+        if not np.isfinite(value):
+            continue
+
+        ax.scatter(
+            [distances_idx],
+            [points_idx],
+            marker="o",
+            s=_PLOT_ANNOTATION_MARKER_SIZE,
+            c="black",
+            edgecolors="white",
+            linewidths=0.8,
+            zorder=3,
+        )
+
+        if distances_idx == len(nums_distances) - 1:
+            text_x = distances_idx - 0.15
+            horizontal_alignment = "right"
+        else:
+            text_x = distances_idx + 0.15
+            horizontal_alignment = "left"
+        ax.text(
+            text_x,
+            points_idx,
+            f"{value:.1e}",
+            ha=horizontal_alignment,
+            va="center",
+            fontsize=_PLOT_ANNOTATION_FONT_SIZE,
+            color="black",
+            bbox={"boxstyle": "round,pad=0.12", "facecolor": "white", "edgecolor": "none", "alpha": 0.75},
+            zorder=4,
+        )
+
+
+# Helper function for drawing one heatmap into an existing subplot.
+def _draw_heatmap(
+    ax: Axes,
+    data: np.ndarray,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    title: str,
+    log_scale: bool,
+    annotate_speedup: bool = False,
+    annotate_runtime: bool = False,
+) -> None:
+    """Draw ``data`` on ``ax`` as a heatmap with tick labels, title, colorbar and optional annotations.
+
+    With ``log_scale``, color limits span the finite positive cells; without any, no norm is set.
+    """
+    norm = None
+    if log_scale:
+        # Exclude NaN/inf like the annotation helpers do: an infinite limit makes LogNorm unusable.
+        positive_values = data[np.isfinite(data) & (data > 0)]
+        if positive_values.size > 0:
+            norm = colors.LogNorm(vmin=positive_values.min(), vmax=positive_values.max())
+
+    image = ax.imshow(data, norm=norm)
+    ax.set_yticks(list(range(len(nums_points))), labels=nums_points, fontsize=_PLOT_TICK_LABEL_FONT_SIZE)
+    ax.set_ylabel("Number of points", fontsize=_PLOT_AXIS_LABEL_FONT_SIZE)
+    ax.set_xticks(
+        list(range(len(nums_distances))),
+        labels=nums_distances,
+        rotation=45,
+        fontsize=_PLOT_TICK_LABEL_FONT_SIZE,
+    )
+    ax.set_xlabel("Number of distances", fontsize=_PLOT_AXIS_LABEL_FONT_SIZE)
+    ax.set_title(title, fontsize=_PLOT_TITLE_FONT_SIZE, pad=12)
+    colorbar = ax.figure.colorbar(image, ax=ax, fraction=_PLOT_COLORBAR_FRACTION, pad=_PLOT_COLORBAR_PAD)
+    colorbar.ax.tick_params(labelsize=_PLOT_COLORBAR_TICK_LABEL_FONT_SIZE)
+    colorbar.ax.yaxis.offsetText.set_fontsize(_PLOT_COLORBAR_TICK_LABEL_FONT_SIZE)
+    if annotate_speedup:
+        _annotate_speedup_heatmap(ax, data, nums_points, nums_distances)
+    if annotate_runtime:
+        _annotate_runtime_heatmap(ax, data, nums_points, nums_distances)
+
+
+# Helper function for writing a multi-subplot comparison plot for one metric group.
+def _plot_metric_comparison(
+    metric_names: tuple[str, ...],
+    metric_data: dict[str, np.ndarray],
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    batch_size: int,
+    figure_title: str,
+    filename: Path,
+    annotate_plots: bool,
+) -> None:
+    available_metric_names = tuple(metric_name for metric_name in metric_names if metric_name in metric_data)
+    if not available_metric_names:
+        return
+
+    subplot_width = _PLOT_FIGSIZE[0] if len(available_metric_names) == 1 else _COMPARISON_SUBPLOT_WIDTH
+    fig, axes = plt.subplots(
+        1,
+        len(available_metric_names),
+        figsize=(subplot_width * len(available_metric_names), _PLOT_FIGSIZE[1]),
+        constrained_layout=True,
+    )
+    fig.suptitle(f"{figure_title} (Batch Size {batch_size})", fontsize=_PLOT_SUPTITLE_FONT_SIZE)
+    axes = np.atleast_1d(axes).tolist()
+
+    for ax, metric_name in zip(axes, available_metric_names):
+        metric_config = _METRIC_PLOT_CONFIGS[metric_name]
+        _draw_heatmap(
+            ax,
+            metric_data[metric_name],
+            nums_points,
+            nums_distances,
+            title=metric_config.title,
+            log_scale=True,
+            annotate_speedup=metric_config.annotation == "speedup" and annotate_plots,
+            annotate_runtime=metric_config.annotation == "runtime" and annotate_plots,
+        )
+    fig.savefig(filename)
+    plt.close(fig)
+
+
+# Helper function for writing comparison plots whose subplot counts depend on Shapely availability.
+def _write_comparison_outputs(
+    output_dir: Path,
+    batch_size: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+    metric_data: dict[str, np.ndarray],
+    *,
+    has_shapely_results: bool,
+    annotate_plots: bool,
+) -> list[Path]:
+    """Write the runtime and speedup comparison plots of one batch size into ``output_dir``.
+
+    Returns the paths of the plots that were written. A comparison without any available metric in
+    ``metric_data`` is skipped, because ``_plot_metric_comparison`` would not create its file.
+    """
+    runtime_metric_names = (
+        _RUNTIME_METRICS_WITH_SHAPELY if has_shapely_results else _RUNTIME_METRICS_WITHOUT_SHAPELY
+    )
+    speedup_metric_names = (
+        _SPEEDUP_METRICS_WITH_SHAPELY if has_shapely_results else _SPEEDUP_METRICS_WITHOUT_SHAPELY
+    )
+    prefix = f"batch_{batch_size}"
+    comparisons = (
+        (runtime_metric_names, "Runtime [ms]", output_dir / f"{prefix}_runtime_comparison.png"),
+        (speedup_metric_names, "Speedup [x-fold]", output_dir / f"{prefix}_speedup_comparison.png"),
+    )
+    comparison_files: list[Path] = []
+    for metric_names, figure_title, filename in comparisons:
+        if not any(metric_name in metric_data for metric_name in metric_names):
+            continue
+        _plot_metric_comparison(
+            metric_names,
+            metric_data,
+            nums_points,
+            nums_distances,
+            batch_size=batch_size,
+            figure_title=figure_title,
+            filename=filename,
+            annotate_plots=annotate_plots,
+        )
+        comparison_files.append(filename)
+    return comparison_files
+
+
+# Helper function for parsing comma-separated integer lists.
+def _parse_int_list(value: str) -> list[int]:
+    """Parse comma-separated integers, skipping empty or whitespace-only items (e.g. a trailing comma)."""
+    return [int(item) for item in value.split(",") if item.strip()]
+
+
+def plot_batch_results_from_markdown(
+    input_dir: Path,
+    output_dir: Path,
+    batch_size: int,
+    annotate_plots: bool,
+) -> list[Path]:
+    """Plot the known metric tables of one batch size; raise if none exist or their axes disagree."""
+    prefix = f"batch_{batch_size}_"
+    markdown_files = sorted(input_dir.glob(f"{prefix}*.md"))
+    if not markdown_files:
+        raise FileNotFoundError(f"No Markdown result tables found for batch={batch_size} in {input_dir}")
+    available_metric_names = {markdown_file.stem[len(prefix) :] for markdown_file in markdown_files}
+    has_shapely_results = "runtime_shapely" in available_metric_names
+
+    metric_data: dict[str, np.ndarray] = {}
+    comparison_axes: tuple[list[int], list[int]] | None = None
+    for markdown_file in markdown_files:
+        metric_name = markdown_file.stem[len(prefix) :]
+        if metric_name not in _METRIC_PLOT_CONFIGS:
+            continue
+        if metric_name in _SHAPELY_DEPENDENT_METRICS and not has_shapely_results:
+            continue
+
+        nums_points, nums_distances, data = _read_metric_table(markdown_file)
+        # All heatmaps of a batch share one set of tick labels, so every table must use the same axes.
+        if comparison_axes is not None and comparison_axes != (nums_points, nums_distances):
+            raise ValueError(f"Axes in {markdown_file} differ from the other tables for batch={batch_size}")
+        comparison_axes = (nums_points, nums_distances)
+        metric_data[metric_name] = data
+
+    plotted_files: list[Path] = []
+    if comparison_axes is not None:
+        output_dir.mkdir(parents=True, exist_ok=True)
+        plotted_files = _write_comparison_outputs(
+            output_dir,
+            batch_size,
+            *comparison_axes,
+            metric_data,
+            has_shapely_results=has_shapely_results,
+            annotate_plots=annotate_plots,
+        )
+
+    if not plotted_files:
+        raise FileNotFoundError(
+            f"No known Markdown result tables found for batch={batch_size} in {input_dir}"
+        )
+    return plotted_files
+
+
+def plot_from_markdown_directory(
+    *,
+    input_dir: Path,
+    output_dir: Path,
+    batch_sizes: list[int],
+    annotate_plots: bool = DEFAULT_ANNOTATE_PLOTS,
+) -> list[Path]:
+    if not input_dir.exists():
+        raise FileNotFoundError(f"Markdown input directory does not exist: {input_dir}")
+
+    plotted_files: list[Path] = []
+    for batch_size in batch_sizes:
+        batch_plotted_files = plot_batch_results_from_markdown(
+            input_dir=input_dir,
+            output_dir=output_dir,
+            batch_size=batch_size,
+            annotate_plots=annotate_plots,
+        )
+        plotted_files.extend(batch_plotted_files)
+    return plotted_files
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Generate polyline runtime plot images from Markdown result tables.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--input-dir",
+        type=Path,
+        required=True,
+        help="Directory containing Markdown result tables.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        required=True,
+        help="Directory where plot images should be written.",
+    )
+    parser.add_argument(
+        "--batch-sizes",
+        default="1,4,16,64",
+        help="Comma-separated batch sizes to plot.",
+    )
+    no_annotate_plots_action = parser.add_argument(
+        "--no-annotate-plots",
+        dest="annotate_plots",
+        action="store_false",
+        help="Disable annotations in generated heatmaps.",
+    )
+    parser.set_defaults(annotate_plots=DEFAULT_ANNOTATE_PLOTS)
+    no_annotate_plots_action.default = argparse.SUPPRESS
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = _parse_args()
+    batch_sizes = _parse_int_list(args.batch_sizes)
+    plotted_files = plot_from_markdown_directory(
+        input_dir=args.input_dir,
+        output_dir=args.output_dir,
+        batch_sizes=batch_sizes,
+        annotate_plots=args.annotate_plots,
+    )
+    for plotted_file in plotted_files:
+        print(f"Generated plot: {plotted_file}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/lane_helpers/evaluation/shapely_evaluation.py b/packages/lane_helpers/evaluation/shapely_evaluation.py
new file mode 100644
index 0000000..1048191
--- /dev/null
+++ b/packages/lane_helpers/evaluation/shapely_evaluation.py
@@ -0,0 +1,681 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from collections.abc import Callable, Iterator
+import gc
+import sys
+import time
+from pathlib import Path
+from types import ModuleType
+
+import numpy as np
+from shapely import get_coordinates, line_interpolate_point, linestrings
+import torch
+
+# Import helpers for outputting results and plots
+SCRIPT_DIR = Path(__file__).resolve().parent
+if str(SCRIPT_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPT_DIR))
+import plot_shapely_evaluation
+import _shapely_evaluation_outputs as shapely_evaluation_outputs
+
+# ==================== Default configuration for the evaluation ====================
+
+# These constants are convenient local configuration knobs. The same settings can also be
+# overridden with CLI arguments.
+# When changing these constants, check the CLI arguments further below in the script,
+# because some flags only override the default in one direction.
+
+# Sweep values for the heatmap axes and the batch-size examples.
+DEFAULT_NUMS_POINTS = [2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]
+DEFAULT_NUMS_DISTANCES = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]
+DEFAULT_BATCH_SIZES = [1, 4, 16, 64]
+# Keep the measured work roughly constant across batch sizes.
+DEFAULT_NUM_POLYLINES_PER_MEASUREMENT = 64 * 10
+# Warm up a representative mid-sized configuration before timing the sweep.
+DEFAULT_NUM_WARMUP_RUNS = 3
+DEFAULT_WARMUP_NUM_POINTS = 100
+DEFAULT_WARMUP_NUM_DISTANCES = 100
+# Shapely can be skipped for faster CPU/CUDA-only benchmark sweeps.
+DEFAULT_SKIP_SHAPELY = False
+# Result checks are optional because they add Shapely reference work to each configuration.
+DEFAULT_ASSERT_RESULTS = False
+DEFAULT_ASSERT_ATOL = 1e-3
+DEFAULT_ASSERT_RTOL = 0.0
+# Plot annotations call out representative cells in generated heatmaps.
+DEFAULT_ANNOTATE_PLOTS = True
+# ================== End: Default configuration for the evaluation =================
+
+
+# ================== Constants for the evaluation ==================
+DEVICE = "cuda"
+DTYPE_NP = np.float32
+DTYPE_TORCH = torch.float32
+_POLYLINE_MODULE: ModuleType | None = None
+# ================ End: Constants for the evaluation ===============
+
+
+# Helper function for lazily importing the compiled polyline module outside plotting-only mode.
+def _get_polyline_module() -> ModuleType:
+    global _POLYLINE_MODULE
+    if _POLYLINE_MODULE is None:
+        from accvlab.lane_helpers import polyline as polyline_module
+
+        _POLYLINE_MODULE = polyline_module
+    return _POLYLINE_MODULE
+
+
+# Helper function for config parsing
+def _parse_int_list(value: str) -> list[int]:
+    """Parse comma-separated integers, skipping empty or whitespace-only items (e.g. a trailing comma)."""
+    return [int(item) for item in value.split(",") if item.strip()]
+
+
+# Helper function for computing the batched Shapely reference.
+def _compute_batched_shapely_reference(points: np.ndarray, distances: np.ndarray) -> np.ndarray:
+    line_strings = linestrings(points)
+    interpolated_points = line_interpolate_point(line_strings[:, None], distances)
+    batched_reference = (
+        get_coordinates(interpolated_points)
+        .reshape(
+            points.shape[0],
+            distances.shape[1],
+            points.shape[2],
+        )
+        .astype(DTYPE_NP)
+    )
+    return batched_reference
+
+
+# Helper function for computing per-polyline lengths in NumPy.
+def _compute_polyline_lengths_np(points: np.ndarray) -> np.ndarray:
+    if points.shape[1] <= 1:
+        lengths = np.zeros((points.shape[0],), dtype=DTYPE_NP)
+        return lengths
+    lengths = np.linalg.norm(points[:, 1:] - points[:, :-1], axis=2).sum(axis=1).astype(DTYPE_NP)
+    return lengths
+
+
+# Helper function for comparing CPU and CUDA outputs against Shapely when requested.
+def _assert_matches_shapely(
+    shapely_result: np.ndarray,
+    result: np.ndarray,
+    *,
+    implementation_name: str,
+    batch_size: int,
+    num_points: int,
+    num_distances: int,
+    atol: float,
+    rtol: float,
+) -> None:
+    try:
+        np.testing.assert_allclose(result, shapely_result, atol=atol, rtol=rtol)
+    except AssertionError as exc:
+        max_abs_diff = np.abs(shapely_result - result).max()
+        raise AssertionError(
+            f"{implementation_name} result differs from Shapely for "
+            f"batch={batch_size}, points={num_points}, distances={num_distances}; "
+            f"max_abs_diff={max_abs_diff}, atol={atol}, rtol={rtol}"
+        ) from exc
+
+
+# Helper function for constructing one deterministic benchmark input configuration.
+def _make_evaluation_case(
+    batch_size: int,
+    num_points: int,
+    num_distances: int,
+    *,
+    seed: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    generator = np.random.default_rng(seed=seed)
+    # Set up the polylines
+    points = generator.uniform(0.0, 1.0, size=(batch_size, num_points, 2)).astype(DTYPE_NP)
+    lengths = _compute_polyline_lengths_np(points)
+    # Set up the distances to sample the polyline at
+    distances_normalized = generator.uniform(0.0, 1.0, size=(batch_size, num_distances)).astype(DTYPE_NP)
+    distances = distances_normalized * lengths[:, None]
+    return points, distances
+
+
+# Helper function for iterating over deterministic benchmark configurations.
+def _iter_evaluation_cases(
+    batch_size: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+) -> Iterator[tuple[int, int, int, int, int]]:
+    for points_idx, num_points_current in enumerate(nums_points):
+        for distances_idx, num_distances_current in enumerate(nums_distances):
+            seed = batch_size * 1_000_000 + num_points_current * 1_000 + num_distances_current
+            yield points_idx, distances_idx, num_points_current, num_distances_current, seed
+
+
+# Helper function for placing the same NumPy inputs on CUDA and CPU.
+def _make_torch_tensors(
+    *arrays: np.ndarray,
+) -> tuple[torch.Tensor, ...]:
+    tensors_gpu = [torch.tensor(array, device=DEVICE, dtype=DTYPE_TORCH) for array in arrays]
+    tensors_cpu = [torch.tensor(array, device="cpu", dtype=DTYPE_TORCH) for array in arrays]
+    return *tensors_gpu, *tensors_cpu
+
+
+# Helper function for placing NumPy inputs on one target device.
+def _make_torch_tensors_on_device(
+    *arrays: np.ndarray,
+    device: str,
+) -> tuple[torch.Tensor, ...]:
+    tensors = tuple(torch.tensor(array, device=device, dtype=DTYPE_TORCH) for array in arrays)
+    return tensors
+
+
+# Helper function for timing repeated calls and synchronizing CUDA work when needed.
+def _time_call(
+    function: Callable[[], object],
+    *,
+    num_runs: int,
+    synchronize_cuda: bool = False,
+) -> float:
+    """Return the mean wall-clock seconds per call of ``function`` over ``num_runs`` calls."""
+    if num_runs <= 0:
+        raise ValueError(f"num_runs must be positive, got {num_runs}")
+    if synchronize_cuda:
+        torch.cuda.synchronize()  # Finish previously queued work before the clock starts.
+    start = time.perf_counter()
+    for _ in range(num_runs):
+        function()
+    if synchronize_cuda:
+        torch.cuda.synchronize()  # Wait for the timed work itself before the clock stops.
+    return (time.perf_counter() - start) / num_runs
+
+
+# Helper function for reducing cross-implementation timing interference.
+def _cleanup_between_implementation_sweeps() -> None:
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        torch.cuda.empty_cache()
+
+
+# Helper function for timing the Shapely reference implementation.
+def _time_shapely(
+    points: np.ndarray,
+    distances: np.ndarray,
+    *,
+    num_runs: int,
+) -> float:
+    compute_function = lambda: _compute_batched_shapely_reference(points, distances)
+    runtime = _time_call(
+        compute_function,
+        num_runs=num_runs,
+    )
+    return runtime
+
+
+# Helper function for timing the CUDA implementation.
+def _time_cuda(
+    points: torch.Tensor,
+    distances: torch.Tensor,
+    *,
+    num_runs: int,
+) -> float:
+    polyline_module = _get_polyline_module()
+    compute_function = lambda: polyline_module.interpolate(points, distances)
+    runtime = _time_call(
+        compute_function,
+        num_runs=num_runs,
+        synchronize_cuda=True,
+    )
+    return runtime
+
+
+# Helper function for timing the CPU implementation.
+def _time_cpu(
+    points: torch.Tensor,
+    distances: torch.Tensor,
+    *,
+    num_runs: int,
+) -> float:
+    polyline_module = _get_polyline_module()
+    compute_function = lambda: polyline_module.interpolate(points, distances)
+    runtime = _time_call(compute_function, num_runs=num_runs)
+    return runtime
+
+
+# Helper function for warming up all selected implementations once before measured runs.
+def _run_warmup(
+    *,
+    batch_size: int,
+    num_points: int,
+    num_distances: int,
+    num_warmup_runs: int,
+    skip_shapely: bool,
+) -> None:
+    if num_warmup_runs <= 0:
+        return
+
+    points_np, distances_np = _make_evaluation_case(
+        batch_size,
+        num_points,
+        num_distances,
+        seed=0,
+    )
+    points_gpu, distances_gpu, points_cpu, distances_cpu = _make_torch_tensors(points_np, distances_np)
+    polyline_module = _get_polyline_module()
+
+    for _ in range(num_warmup_runs):
+        if not skip_shapely:
+            _compute_batched_shapely_reference(points_np, distances_np)
+        polyline_module.interpolate(points_cpu, distances_cpu)
+        polyline_module.interpolate(points_gpu, distances_gpu)
+
+    torch.cuda.synchronize()
+
+
+# Helper to (optionally) validate the results against the Shapely reference.
+def _run_validation_sweep(
+    batch_size: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    assert_atol: float,
+    assert_rtol: float,
+    max_abs_diff_cpu: np.ndarray,
+    max_abs_diff_cuda: np.ndarray,
+    max_abs_diff_cuda_vs_cpu: np.ndarray,
+) -> None:
+    print(f"Running validation sweep for batch={batch_size}")
+    polyline_module = _get_polyline_module()
+    for points_idx, distances_idx, num_points_current, num_distances_current, seed in _iter_evaluation_cases(
+        batch_size, nums_points, nums_distances
+    ):
+        print(
+            "Running validation "
+            f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}"
+        )
+        points_np, distances_np = _make_evaluation_case(
+            batch_size,
+            num_points_current,
+            num_distances_current,
+            seed=seed,
+        )
+        shapely_result = _compute_batched_shapely_reference(points_np, distances_np)
+        points_gpu, distances_gpu, points_cpu, distances_cpu = _make_torch_tensors(points_np, distances_np)
+        cpu_result = polyline_module.interpolate(points_cpu, distances_cpu).numpy()
+        cuda_result = polyline_module.interpolate(points_gpu, distances_gpu).cpu().numpy()
+
+        max_abs_diff_cpu[points_idx, distances_idx] = np.abs(shapely_result - cpu_result).max()
+        max_abs_diff_cuda[points_idx, distances_idx] = np.abs(shapely_result - cuda_result).max()
+        max_abs_diff_cuda_vs_cpu[points_idx, distances_idx] = np.abs(cpu_result - cuda_result).max()
+
+        _assert_matches_shapely(
+            shapely_result,
+            cpu_result,
+            implementation_name="CPU",
+            batch_size=batch_size,
+            num_points=num_points_current,
+            num_distances=num_distances_current,
+            atol=assert_atol,
+            rtol=assert_rtol,
+        )
+        _assert_matches_shapely(
+            shapely_result,
+            cuda_result,
+            implementation_name="CUDA",
+            batch_size=batch_size,
+            num_points=num_points_current,
+            num_distances=num_distances_current,
+            atol=assert_atol,
+            rtol=assert_rtol,
+        )
+
+
+# Helper function for evaluating every point-count and distance-count pair for one batch size.
+def _evaluate_batch_size(
+    batch_size: int,
+    nums_points: list[int],
+    nums_distances: list[int],
+    *,
+    num_runs: int,
+    assert_results: bool,
+    assert_atol: float,
+    assert_rtol: float,
+    skip_shapely: bool,
+) -> tuple[
+    np.ndarray | None, np.ndarray, np.ndarray, np.ndarray | None, np.ndarray | None, np.ndarray | None
+]:
+    result_shape = (len(nums_points), len(nums_distances))
+
+    shapely_runtime_ms = None if skip_shapely else np.zeros(result_shape, dtype=np.float64)
+    cuda_runtime_ms = np.zeros(result_shape, dtype=np.float64)
+    cpu_runtime_ms = np.zeros(result_shape, dtype=np.float64)
+
+    max_abs_diff_cuda = np.zeros_like(cpu_runtime_ms) if assert_results else None
+    max_abs_diff_cpu = np.zeros_like(cpu_runtime_ms) if assert_results else None
+    max_abs_diff_cuda_vs_cpu = np.zeros_like(cpu_runtime_ms) if assert_results else None
+
+    if not skip_shapely:
+        print(f"Running Shapely sweep for batch={batch_size}, runs={num_runs}")
+        for (
+            points_idx,
+            distances_idx,
+            num_points_current,
+            num_distances_current,
+            seed,
+        ) in _iter_evaluation_cases(batch_size, nums_points, nums_distances):
+            print(
+                "Running Shapely evaluation "
+                f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}, "
+                f"runs={num_runs}"
+            )
+            points_np, distances_np = _make_evaluation_case(
+                batch_size,
+                num_points_current,
+                num_distances_current,
+                seed=seed,
+            )
+
+            shapely_runtime = _time_shapely(
+                points_np,
+                distances_np,
+                num_runs=num_runs,
+            )
+            shapely_runtime_ms[points_idx, distances_idx] = shapely_runtime * 1000
+        _cleanup_between_implementation_sweeps()
+
+    print(f"Running CPU sweep for batch={batch_size}, runs={num_runs}")
+    for points_idx, distances_idx, num_points_current, num_distances_current, seed in _iter_evaluation_cases(
+        batch_size, nums_points, nums_distances
+    ):
+        print(
+            "Running CPU evaluation "
+            f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}, "
+            f"runs={num_runs}"
+        )
+        points_np, distances_np = _make_evaluation_case(
+            batch_size,
+            num_points_current,
+            num_distances_current,
+            seed=seed,
+        )
+        points_cpu, distances_cpu = _make_torch_tensors_on_device(
+            points_np,
+            distances_np,
+            device="cpu",
+        )
+
+        cpu_runtime_ms[points_idx, distances_idx] = (
+            _time_cpu(
+                points_cpu,
+                distances_cpu,
+                num_runs=num_runs,
+            )
+            * 1000
+        )
+    _cleanup_between_implementation_sweeps()
+
+    print(f"Running CUDA sweep for batch={batch_size}, runs={num_runs}")
+    for points_idx, distances_idx, num_points_current, num_distances_current, seed in _iter_evaluation_cases(
+        batch_size, nums_points, nums_distances
+    ):
+        print(
+            "Running CUDA evaluation "
+            f"batch={batch_size}, points={num_points_current}, distances={num_distances_current}, "
+            f"runs={num_runs}"
+        )
+        points_np, distances_np = _make_evaluation_case(
+            batch_size,
+            num_points_current,
+            num_distances_current,
+            seed=seed,
+        )
+        points_gpu, distances_gpu = _make_torch_tensors_on_device(
+            points_np,
+            distances_np,
+            device=DEVICE,
+        )
+
+        cuda_runtime_ms[points_idx, distances_idx] = (
+            _time_cuda(
+                points_gpu,
+                distances_gpu,
+                num_runs=num_runs,
+            )
+            * 1000
+        )
+    _cleanup_between_implementation_sweeps()
+
+    if assert_results:
+        _run_validation_sweep(
+            batch_size,
+            nums_points,
+            nums_distances,
+            assert_atol=assert_atol,
+            assert_rtol=assert_rtol,
+            max_abs_diff_cpu=max_abs_diff_cpu,
+            max_abs_diff_cuda=max_abs_diff_cuda,
+            max_abs_diff_cuda_vs_cpu=max_abs_diff_cuda_vs_cpu,
+        )
+        _cleanup_between_implementation_sweeps()
+
+    return (
+        shapely_runtime_ms,
+        cpu_runtime_ms,
+        cuda_runtime_ms,
+        max_abs_diff_cpu,
+        max_abs_diff_cuda,
+        max_abs_diff_cuda_vs_cpu,
+    )
+
+
+# Define and parse the evaluation script's command-line options; returns the populated namespace.
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Evaluate batched CPU/CUDA polyline interpolation against a Shapely LineString reference "
+            "over point-count, distance-count, and batch-size sweeps."
+        ),
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,  # Append each default to its help text.
+    )
+    parser.add_argument(
+        "--num-points",
+        dest="nums_points",  # Kept as a comma-separated string; main() converts it via _parse_int_list.
+        default=",".join(str(item) for item in DEFAULT_NUMS_POINTS),
+        help="Comma-separated point counts for the polyline-length sweep.",
+    )
+    parser.add_argument(
+        "--num-distances",
+        dest="nums_distances",  # Also a comma-separated string, converted in main().
+        default=",".join(str(item) for item in DEFAULT_NUMS_DISTANCES),
+        help="Comma-separated sample-distance counts for the interpolation sweep.",
+    )
+    parser.add_argument(
+        "--batch-sizes",
+        default=",".join(str(item) for item in DEFAULT_BATCH_SIZES),  # String form; converted in main().
+        help="Comma-separated batch sizes to evaluate.",
+    )
+    parser.add_argument(
+        "--num-polylines-per-measurement",
+        type=int,
+        default=DEFAULT_NUM_POLYLINES_PER_MEASUREMENT,
+        help="Target number of polylines measured per configuration; divided by batch size to get runs.",
+    )
+    parser.add_argument(
+        "--num-warmup-runs",
+        type=int,
+        default=DEFAULT_NUM_WARMUP_RUNS,
+        help="Number of untimed warmup runs before the measured sweep.",
+    )
+    parser.add_argument(
+        "--warmup-num-points",
+        type=int,
+        default=DEFAULT_WARMUP_NUM_POINTS,
+        help="Point count used for warmup inputs.",
+    )
+    parser.add_argument(
+        "--warmup-num-distances",
+        type=int,
+        default=DEFAULT_WARMUP_NUM_DISTANCES,
+        help="Sample-distance count used for warmup inputs.",
+    )
+    parser.add_argument(
+        "--skip-shapely",
+        action="store_true",
+        default=DEFAULT_SKIP_SHAPELY,  # With store_true the flag can only switch this on, never off.
+        help="Skip Shapely reference timing and Shapely-based speedup plots.",
+    )
+    parser.add_argument(
+        "--assert-results",
+        action="store_true",
+        default=DEFAULT_ASSERT_RESULTS,  # main() ignores this when --skip-shapely is in effect.
+        help="Compare CPU and CUDA outputs against Shapely using the configured tolerances.",
+    )
+    parser.add_argument(
+        "--assert-atol",
+        type=float,
+        default=DEFAULT_ASSERT_ATOL,
+        help="Absolute tolerance used when asserting results against Shapely.",
+    )
+    parser.add_argument(
+        "--assert-rtol",
+        type=float,
+        default=DEFAULT_ASSERT_RTOL,
+        help="Relative tolerance used when asserting results against Shapely.",
+    )
+    no_annotate_plots_action = parser.add_argument(  # Keep the action so its help default can be hidden.
+        "--no-annotate-plots",
+        dest="annotate_plots",
+        action="store_false",
+        help="Disable annotations in generated heatmaps.",
+    )
+    parser.set_defaults(annotate_plots=DEFAULT_ANNOTATE_PLOTS)  # Parser-level default for the dest.
+    no_annotate_plots_action.default = argparse.SUPPRESS  # Hide "(default: ...)" in help; value above still applies.
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("polyline_shapely_eval_results"),  # Relative; main() resolves it against SCRIPT_DIR.
+        help="Directory for Markdown result tables and generated plot images.",
+    )
+    args = parser.parse_args()
+    return args
+
+
+# Entry point: parse options, warm up once, evaluate and store results per batch size, then render plots.
+def main() -> None:
+    args = _parse_args()
+    nums_points = _parse_int_list(args.nums_points)
+    nums_distances = _parse_int_list(args.nums_distances)
+    batch_sizes = _parse_int_list(args.batch_sizes)
+    # Make relative output paths independent of the caller's working directory.
+    if not args.output_dir.is_absolute():
+        args.output_dir = SCRIPT_DIR / args.output_dir
+
+    if not torch.cuda.is_available():  # Fail before the output directory is created.
+        raise RuntimeError("This evaluation requires a CUDA-capable PyTorch installation.")
+
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Result assertions require Shapely, so disabling Shapely also disables assertions.
+    assert_results = args.assert_results and not args.skip_shapely
+
+    print("Performing runtime evaluation...")
+    print(f"Numbers of points: {nums_points}")
+    print(f"Numbers of distances: {nums_distances}")
+    print(f"Batch sizes: {batch_sizes}")
+    print(f"Measured polylines per configuration: {args.num_polylines_per_measurement}")
+    print(
+        "Warmup configuration: "
+        f"batch={max(batch_sizes)}, points={args.warmup_num_points}, "
+        f"distances={args.warmup_num_distances}, runs={args.num_warmup_runs}"
+    )
+    print(f"Use Shapely reference: {not args.skip_shapely}")
+    print(f"Assert results against Shapely: {assert_results}")
+    print(f"Annotate plots: {args.annotate_plots}")
+    print(f"Output directory: {args.output_dir}")
+
+    _run_warmup(
+        batch_size=max(batch_sizes),  # Warm up once, using the largest requested batch size.
+        num_points=args.warmup_num_points,
+        num_distances=args.warmup_num_distances,
+        num_warmup_runs=args.num_warmup_runs,
+        skip_shapely=args.skip_shapely,
+    )
+
+    for batch_size in batch_sizes:
+        # Keep (roughly) the same number of measured polylines per configuration across batch sizes.
+        num_runs = max(1, args.num_polylines_per_measurement // batch_size)  # Floor division, at least 1 run.
+        print(f"Using {num_runs} measured runs for batch={batch_size}")
+
+        # Run evaluation & get results for one batch size (number of polylines in single call).
+        (
+            shapely_runtime_ms,
+            cpu_runtime_ms,
+            cuda_runtime_ms,
+            max_abs_diff_cpu,
+            max_abs_diff_cuda,
+            max_abs_diff_cuda_vs_cpu,
+        ) = _evaluate_batch_size(
+            batch_size,
+            nums_points,
+            nums_distances,
+            num_runs=num_runs,
+            assert_results=assert_results,
+            assert_atol=args.assert_atol,
+            assert_rtol=args.assert_rtol,
+            skip_shapely=args.skip_shapely,
+        )
+
+        # Write this batch size's results to the output directory before the next batch size runs.
+        shapely_evaluation_outputs.write_batch_results(
+            args.output_dir,
+            batch_size,
+            nums_points,
+            nums_distances,
+            shapely_runtime_ms,
+            cpu_runtime_ms,
+            cuda_runtime_ms,
+            args.skip_shapely,
+            assert_results,
+            max_abs_diff_cpu,
+            max_abs_diff_cuda,
+            max_abs_diff_cuda_vs_cpu,
+        )
+
+        # Print runtimes, speedups and (when asserted) max absolute differences for this batch size.
+        cuda_speedup_over_cpu = cpu_runtime_ms / cuda_runtime_ms  # Ratio > 1 means CUDA was faster.
+        if not args.skip_shapely:
+            cuda_speedup_over_shapely = shapely_runtime_ms / cuda_runtime_ms
+            cpu_speedup_over_shapely = shapely_runtime_ms / cpu_runtime_ms
+            print(f"Average Shapely runtime [ms], batch={batch_size}:\n{shapely_runtime_ms}")
+        print(f"Average CPU runtime [ms], batch={batch_size}:\n{cpu_runtime_ms}")
+        print(f"Average CUDA runtime [ms], batch={batch_size}:\n{cuda_runtime_ms}")
+        if not args.skip_shapely:
+            print(f"CPU speedup over Shapely, batch={batch_size}:\n{cpu_speedup_over_shapely}")
+            print(f"CUDA speedup over Shapely, batch={batch_size}:\n{cuda_speedup_over_shapely}")
+        print(f"CUDA speedup over CPU, batch={batch_size}:\n{cuda_speedup_over_cpu}")
+        if assert_results:
+            print(f"CUDA max absolute difference to CPU, batch={batch_size}:\n{max_abs_diff_cuda_vs_cpu}")
+            print(f"CPU max absolute difference to Shapely, batch={batch_size}:\n{max_abs_diff_cpu}")
+            print(f"CUDA max absolute difference to Shapely, batch={batch_size}:\n{max_abs_diff_cuda}")
+
+    plotted_files = plot_shapely_evaluation.plot_from_markdown_directory(
+        input_dir=args.output_dir,  # Same directory the per-batch results were written to above.
+        output_dir=args.output_dir,
+        batch_sizes=batch_sizes,
+        annotate_plots=args.annotate_plots,
+    )
+    print(f"Generated {len(plotted_files)} plot image(s) from Markdown results.")
+
+
+if __name__ == "__main__":  # Run the evaluation only when executed as a script, not on import.
+    main()
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cpu.md
new file mode 100644
index 0000000..1ac6974
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cpu.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 1.03e-03 | 9.861e-04 | 1.020e-03 | 1.141e-03 | 1.175e-03 | 1.157e-03 | 1.294e-03 | 1.536e-03 | 2.404e-03 | 3.742e-03 | 6.497e-03 | 1.486e-02 |
+| 5 | 1.006e-03 | 1.011e-03 | 1.022e-03 | 1.044e-03 | 1.109e-03 | 1.292e-03 | 1.406e-03 | 1.77e-03 | 2.851e-03 | 4.664e-03 | 8.391e-03 | 2.134e-02 |
+| 10 | 1.017e-03 | 1.018e-03 | 1.031e-03 | 1.056e-03 | 1.118e-03 | 1.32e-03 | 1.477e-03 | 1.912e-03 | 3.197e-03 | 5.305e-03 | 9.844e-03 | 2.811e-02 |
+| 20 | 1.021e-03 | 1.029e-03 | 1.050e-03 | 1.07e-03 | 1.209e-03 | 1.318e-03 | 1.579e-03 | 2.032e-03 | 3.506e-03 | 6.075e-03 | 1.094e-02 | 2.844e-02 |
+| 50 | 1.105e-03 | 1.113e-03 | 1.134e-03 | 1.157e-03 | 1.246e-03 | 1.509e-03 | 1.846e-03 | 2.308e-03 | 4.140e-03 | 7.052e-03 | 1.320e-02 | 4.408e-02 |
+| 100 | 1.216e-03 | 1.238e-03 | 1.248e-03 | 1.277e-03 | 1.469e-03 | 1.582e-03 | 1.909e-03 | 2.581e-03 | 4.646e-03 | 8.162e-03 | 1.514e-02 | 5.076e-02 |
+| 200 | 1.5e-03 | 1.494e-03 | 1.536e-03 | 1.604e-03 | 1.652e-03 | 1.869e-03 | 2.262e-03 | 3.043e-03 | 5.500e-03 | 9.331e-03 | 1.722e-02 | 7.581e-02 |
+| 500 | 2.238e-03 | 2.232e-03 | 2.302e-03 | 2.347e-03 | 2.408e-03 | 2.654e-03 | 3.131e-03 | 4.002e-03 | 6.978e-03 | 1.108e-02 | 2.022e-02 | 9.638e-02 |
+| 1000 | 3.430e-03 | 3.437e-03 | 3.476e-03 | 3.52e-03 | 3.594e-03 | 3.868e-03 | 4.464e-03 | 5.305e-03 | 8.287e-03 | 1.318e-02 | 2.306e-02 | 1.374e-01 |
+| 2000 | 5.753e-03 | 5.774e-03 | 5.815e-03 | 5.88e-03 | 5.95e-03 | 6.256e-03 | 6.811e-03 | 7.942e-03 | 1.099e-02 | 1.65e-02 | 2.710e-02 | 1.768e-01 |
+| 5000 | 1.279e-02 | 1.289e-02 | 1.285e-02 | 1.336e-02 | 1.37e-02 | 1.370e-02 | 1.45e-02 | 1.527e-02 | 1.876e-02 | 2.451e-02 | 3.842e-02 | 2.290e-01 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md
new file mode 100644
index 0000000..ee2c4c8
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_cuda.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 3.535e-03 | 3.428e-03 | 3.44e-03 | 3.456e-03 | 3.432e-03 | 3.443e-03 | 3.43e-03 | 3.521e-03 | 3.464e-03 | 3.47e-03 | 3.486e-03 | 4.911e-03 |
+| 5 | 3.394e-03 | 3.39e-03 | 3.455e-03 | 3.415e-03 | 3.385e-03 | 3.388e-03 | 3.478e-03 | 3.472e-03 | 3.471e-03 | 3.533e-03 | 3.648e-03 | 5.321e-03 |
+| 10 | 3.437e-03 | 3.397e-03 | 3.396e-03 | 3.392e-03 | 3.454e-03 | 3.384e-03 | 3.467e-03 | 3.482e-03 | 3.480e-03 | 3.476e-03 | 3.841e-03 | 5.798e-03 |
+| 20 | 3.419e-03 | 3.408e-03 | 3.403e-03 | 3.381e-03 | 3.393e-03 | 3.442e-03 | 3.482e-03 | 3.464e-03 | 3.467e-03 | 3.468e-03 | 3.970e-03 | 6.137e-03 |
+| 50 | 3.470e-03 | 3.4e-03 | 3.388e-03 | 3.394e-03 | 3.405e-03 | 3.382e-03 | 3.48e-03 | 3.528e-03 | 3.462e-03 | 3.471e-03 | 4.098e-03 | 6.448e-03 |
+| 100 | 3.400e-03 | 3.453e-03 | 3.415e-03 | 3.401e-03 | 3.412e-03 | 3.411e-03 | 3.491e-03 | 3.417e-03 | 3.481e-03 | 3.535e-03 | 4.291e-03 | 6.711e-03 |
+| 200 | 3.486e-03 | 3.396e-03 | 3.396e-03 | 3.454e-03 | 3.396e-03 | 3.403e-03 | 3.478e-03 | 3.460e-03 | 3.405e-03 | 3.461e-03 | 4.406e-03 | 7.067e-03 |
+| 500 | 3.479e-03 | 3.389e-03 | 3.394e-03 | 3.409e-03 | 3.476e-03 | 3.385e-03 | 3.471e-03 | 3.474e-03 | 3.471e-03 | 3.594e-03 | 4.706e-03 | 7.429e-03 |
+| 1000 | 3.478e-03 | 3.409e-03 | 3.407e-03 | 3.382e-03 | 3.383e-03 | 3.435e-03 | 3.453e-03 | 3.456e-03 | 3.463e-03 | 3.689e-03 | 5.009e-03 | 7.908e-03 |
+| 2000 | 3.765e-03 | 3.787e-03 | 3.771e-03 | 3.777e-03 | 3.785e-03 | 3.795e-03 | 3.822e-03 | 3.857e-03 | 4.169e-03 | 4.674e-03 | 5.979e-03 | 9.683e-03 |
+| 5000 | 6.e-03 | 5.96e-03 | 6.107e-03 | 6.115e-03 | 6.131e-03 | 6.205e-03 | 6.26e-03 | 6.297e-03 | 6.666e-03 | 7.521e-03 | 9.576e-03 | 1.49e-02 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md
new file mode 100644
index 0000000..218e33c
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_runtime_shapely.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 5.920e-03 | 5.929e-03 | 6.514e-03 | 7.182e-03 | 9.292e-03 | 1.258e-02 | 1.976e-02 | 3.39e-02 | 7.530e-02 | 1.522e-01 | 3.996e-01 | 1.684e+00 |
+| 5 | 5.928e-03 | 5.962e-03 | 6.537e-03 | 7.384e-03 | 1.018e-02 | 1.435e-02 | 2.246e-02 | 3.783e-02 | 9.250e-02 | 2.432e-01 | 4.686e-01 | 1.798e+00 |
+| 10 | 6.044e-03 | 6.014e-03 | 6.824e-03 | 7.726e-03 | 9.796e-03 | 1.534e-02 | 2.687e-02 | 4.409e-02 | 1.118e-01 | 2.867e-01 | 5.987e-01 | 2.145e+00 |
+| 20 | 6.032e-03 | 6.105e-03 | 6.865e-03 | 8.794e-03 | 1.120e-02 | 2.134e-02 | 3.391e-02 | 6.222e-02 | 1.556e-01 | 3.645e-01 | 6.786e-01 | 2.637e+00 |
+| 50 | 6.499e-03 | 6.279e-03 | 8.662e-03 | 9.916e-03 | 1.697e-02 | 3.003e-02 | 5.847e-02 | 1.228e-01 | 2.529e-01 | 6.603e-01 | 1.177e+00 | 3.703e+00 |
+| 100 | 6.479e-03 | 7.37e-03 | 1.051e-02 | 1.371e-02 | 2.748e-02 | 4.721e-02 | 1.027e-01 | 1.929e-01 | 5.007e-01 | 9.981e-01 | 2.129e+00 | 6.030e+00 |
+| 200 | 7.691e-03 | 9.454e-03 | 1.444e-02 | 2.263e-02 | 3.906e-02 | 9.756e-02 | 1.886e-01 | 3.467e-01 | 8.815e-01 | 1.792e+00 | 3.648e+00 | 1.008e+01 |
+| 500 | 8.883e-03 | 1.283e-02 | 3.616e-02 | 3.52e-02 | 8.427e-02 | 2.162e-01 | 4.081e-01 | 8.556e-01 | 2.177e+00 | 4.328e+00 | 8.686e+00 | 2.235e+01 |
+| 1000 | 8.225e-03 | 2.181e-02 | 5.647e-02 | 8.705e-02 | 1.809e-01 | 4.044e-01 | 7.995e-01 | 1.644e+00 | 4.172e+00 | 8.583e+00 | 1.69e+01 | 4.323e+01 |
+| 2000 | 3.628e-02 | 4.154e-02 | 6.644e-02 | 1.663e-01 | 4.074e-01 | 8.366e-01 | 1.748e+00 | 3.414e+00 | 8.769e+00 | 1.775e+01 | 3.397e+01 | 8.585e+01 |
+| 5000 | 5.857e-02 | 1.474e-01 | 2.871e-01 | 5.538e-01 | 8.380e-01 | 2.379e+00 | 4.255e+00 | 8.153e+00 | 2.054e+01 | 4.167e+01 | 8.298e+01 | 2.090e+02 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md
new file mode 100644
index 0000000..4d3f267
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cpu_vs_shapely.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 5.75 | 6.01 | 6.38 | 6.30 | 7.91 | 10.87 | 15.27 | 22.07 | 31.33 | 40.68 | 61.51 | 113.29 |
+| 5 | 5.89 | 5.90 | 6.40 | 7.07 | 9.18 | 11.11 | 15.98 | 21.38 | 32.44 | 52.15 | 55.84 | 84.26 |
+| 10 | 5.94 | 5.91 | 6.62 | 7.32 | 8.76 | 11.63 | 18.19 | 23.06 | 34.97 | 54.04 | 60.82 | 76.29 |
+| 20 | 5.91 | 5.93 | 6.54 | 8.22 | 9.26 | 16.20 | 21.48 | 30.62 | 44.39 | 60.01 | 62.01 | 92.75 |
+| 50 | 5.88 | 5.64 | 7.64 | 8.57 | 13.61 | 19.90 | 31.67 | 53.21 | 61.08 | 93.63 | 89.16 | 84.01 |
+| 100 | 5.33 | 5.96 | 8.42 | 10.73 | 18.70 | 29.85 | 53.82 | 74.74 | 107.76 | 122.28 | 140.68 | 118.80 |
+| 200 | 5.13 | 6.33 | 9.40 | 14.11 | 23.64 | 52.19 | 83.38 | 113.93 | 160.26 | 192.08 | 211.85 | 132.97 |
+| 500 | 3.97 | 5.75 | 15.71 | 15.00 | 34.99 | 81.47 | 130.37 | 213.79 | 311.96 | 390.72 | 429.62 | 231.94 |
+| 1000 | 2.40 | 6.35 | 16.25 | 24.73 | 50.34 | 104.54 | 179.10 | 309.90 | 503.46 | 651.02 | 732.57 | 314.65 |
+| 2000 | 6.31 | 7.19 | 11.43 | 28.28 | 68.47 | 133.72 | 256.58 | 429.83 | 797.87 | 1.08e+03 | 1.25e+03 | 485.50 |
+| 5000 | 4.58 | 11.44 | 22.34 | 41.47 | 61.17 | 173.59 | 293.48 | 533.92 | 1.09e+03 | 1.70e+03 | 2.16e+03 | 912.55 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md
new file mode 100644
index 0000000..487290e
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_cpu.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 0.29 | 0.29 | 0.30 | 0.33 | 0.34 | 0.34 | 0.38 | 0.44 | 0.69 | 1.08 | 1.86 | 3.03 |
+| 5 | 0.30 | 0.30 | 0.30 | 0.31 | 0.33 | 0.38 | 0.40 | 0.51 | 0.82 | 1.32 | 2.30 | 4.01 |
+| 10 | 0.30 | 0.30 | 0.30 | 0.31 | 0.32 | 0.39 | 0.43 | 0.55 | 0.92 | 1.53 | 2.56 | 4.85 |
+| 20 | 0.30 | 0.30 | 0.31 | 0.32 | 0.36 | 0.38 | 0.45 | 0.59 | 1.01 | 1.75 | 2.76 | 4.63 |
+| 50 | 0.32 | 0.33 | 0.33 | 0.34 | 0.37 | 0.45 | 0.53 | 0.65 | 1.20 | 2.03 | 3.22 | 6.84 |
+| 100 | 0.36 | 0.36 | 0.37 | 0.38 | 0.43 | 0.46 | 0.55 | 0.76 | 1.33 | 2.31 | 3.53 | 7.56 |
+| 200 | 0.43 | 0.44 | 0.45 | 0.46 | 0.49 | 0.55 | 0.65 | 0.88 | 1.62 | 2.70 | 3.91 | 10.73 |
+| 500 | 0.64 | 0.66 | 0.68 | 0.69 | 0.69 | 0.78 | 0.90 | 1.15 | 2.01 | 3.08 | 4.30 | 12.97 |
+| 1000 | 0.99 | 1.01 | 1.02 | 1.04 | 1.06 | 1.13 | 1.29 | 1.53 | 2.39 | 3.57 | 4.60 | 17.37 |
+| 2000 | 1.53 | 1.52 | 1.54 | 1.56 | 1.57 | 1.65 | 1.78 | 2.06 | 2.64 | 3.53 | 4.53 | 18.26 |
+| 5000 | 2.13 | 2.16 | 2.10 | 2.18 | 2.23 | 2.21 | 2.32 | 2.43 | 2.81 | 3.26 | 4.01 | 15.38 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md
new file mode 100644
index 0000000..1125800
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_1_speedup_cuda_vs_shapely.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 1.67 | 1.73 | 1.89 | 2.08 | 2.71 | 3.65 | 5.76 | 9.63 | 21.74 | 43.87 | 114.64 | 342.86 |
+| 5 | 1.75 | 1.76 | 1.89 | 2.16 | 3.01 | 4.23 | 6.46 | 10.89 | 26.65 | 68.85 | 128.46 | 337.91 |
+| 10 | 1.76 | 1.77 | 2.01 | 2.28 | 2.84 | 4.53 | 7.75 | 12.66 | 32.12 | 82.47 | 155.86 | 369.85 |
+| 20 | 1.76 | 1.79 | 2.02 | 2.60 | 3.30 | 6.20 | 9.74 | 17.96 | 44.88 | 105.11 | 170.94 | 429.75 |
+| 50 | 1.87 | 1.85 | 2.56 | 2.92 | 4.98 | 8.88 | 16.80 | 34.81 | 73.04 | 190.20 | 287.27 | 574.20 |
+| 100 | 1.91 | 2.13 | 3.08 | 4.03 | 8.05 | 13.84 | 29.42 | 56.46 | 143.84 | 282.36 | 496.26 | 898.57 |
+| 200 | 2.21 | 2.78 | 4.25 | 6.55 | 11.50 | 28.67 | 54.23 | 100.20 | 258.88 | 517.84 | 827.99 | 1.43e+03 |
+| 500 | 2.55 | 3.79 | 10.65 | 10.32 | 24.24 | 63.87 | 117.57 | 246.30 | 627.28 | 1.20e+03 | 1.85e+03 | 3.01e+03 |
+| 1000 | 2.36 | 6.40 | 16.58 | 25.74 | 53.47 | 117.71 | 231.55 | 475.60 | 1.20e+03 | 2.33e+03 | 3.37e+03 | 5.47e+03 |
+| 2000 | 9.63 | 10.97 | 17.62 | 44.03 | 107.63 | 220.43 | 457.23 | 885.05 | 2.10e+03 | 3.80e+03 | 5.68e+03 | 8.87e+03 |
+| 5000 | 9.76 | 24.73 | 47.01 | 90.57 | 136.68 | 383.37 | 679.79 | 1.29e+03 | 3.08e+03 | 5.54e+03 | 8.67e+03 | 1.40e+04 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md
new file mode 100644
index 0000000..584833d
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cpu.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 3.839e-03 | 2.263e-03 | 2.751e-03 | 3.603e-03 | 5.367e-03 | 1.043e-02 | 1.963e-02 | 3.62e-02 | 8.759e-02 | 1.75e-01 | 3.531e-01 | 8.816e-01 |
+| 5 | 2.581e-03 | 2.852e-03 | 3.709e-03 | 5.404e-03 | 8.689e-03 | 2.09e-02 | 4.92e-02 | 1.152e-01 | 3.25e-01 | 6.775e-01 | 1.329e+00 | 3.191e+00 |
+| 10 | 3.203e-03 | 3.694e-03 | 4.880e-03 | 6.879e-03 | 1.078e-02 | 2.716e-02 | 6.540e-02 | 1.654e-01 | 4.613e-01 | 9.447e-01 | 1.831e+00 | 4.547e+00 |
+| 20 | 4.909e-03 | 5.348e-03 | 6.714e-03 | 8.853e-03 | 1.4e-02 | 3.155e-02 | 8.694e-02 | 2.126e-01 | 5.852e-01 | 1.196e+00 | 2.377e+00 | 5.819e+00 |
+| 50 | 9.96e-03 | 1.06e-02 | 1.193e-02 | 1.534e-02 | 2.064e-02 | 4.532e-02 | 1.234e-01 | 2.898e-01 | 7.611e-01 | 1.517e+00 | 3.015e+00 | 7.419e+00 |
+| 100 | 1.739e-02 | 1.813e-02 | 2.019e-02 | 2.349e-02 | 2.952e-02 | 5.775e-02 | 1.613e-01 | 3.547e-01 | 8.880e-01 | 1.768e+00 | 3.481e+00 | 8.625e+00 |
+| 200 | 3.283e-02 | 3.274e-02 | 3.517e-02 | 3.895e-02 | 4.640e-02 | 8.618e-02 | 2.034e-01 | 4.200e-01 | 1.036e+00 | 2.037e+00 | 4.028e+00 | 9.911e+00 |
+| 500 | 7.66e-02 | 7.725e-02 | 8.045e-02 | 8.389e-02 | 9.346e-02 | 1.439e-01 | 2.970e-01 | 5.451e-01 | 1.259e+00 | 2.425e+00 | 4.751e+00 | 1.168e+01 |
+| 1000 | 1.509e-01 | 1.580e-01 | 1.543e-01 | 1.593e-01 | 1.691e-01 | 2.349e-01 | 4.054e-01 | 6.792e-01 | 1.476e+00 | 2.797e+00 | 5.417e+00 | 1.32e+01 |
+| 2000 | 3.070e-01 | 3.025e-01 | 3.032e-01 | 3.147e-01 | 3.206e-01 | 4.106e-01 | 5.676e-01 | 9.091e-01 | 1.797e+00 | 3.308e+00 | 6.246e+00 | 1.515e+01 |
+| 5000 | 7.461e-01 | 7.505e-01 | 7.574e-01 | 7.580e-01 | 7.719e-01 | 8.920e-01 | 1.084e+00 | 1.424e+00 | 2.472e+00 | 4.160e+00 | 7.566e+00 | 1.774e+01 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md
new file mode 100644
index 0000000..66190a3
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_cuda.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 8.093e-03 | 4.831e-03 | 4.721e-03 | 4.601e-03 | 4.671e-03 | 4.493e-03 | 4.649e-03 | 4.688e-03 | 4.626e-03 | 4.676e-03 | 5.169e-03 | 8.322e-03 |
+| 5 | 4.795e-03 | 4.643e-03 | 4.625e-03 | 4.689e-03 | 4.613e-03 | 4.672e-03 | 4.744e-03 | 4.554e-03 | 4.539e-03 | 5.416e-03 | 5.288e-03 | 8.217e-03 |
+| 10 | 4.59e-03 | 4.752e-03 | 5.439e-03 | 4.58e-03 | 4.614e-03 | 4.622e-03 | 4.530e-03 | 4.682e-03 | 4.752e-03 | 4.708e-03 | 5.339e-03 | 8.593e-03 |
+| 20 | 4.6e-03 | 4.605e-03 | 4.989e-03 | 4.693e-03 | 4.636e-03 | 5.333e-03 | 4.715e-03 | 4.553e-03 | 4.573e-03 | 4.735e-03 | 5.519e-03 | 9.150e-03 |
+| 50 | 4.689e-03 | 4.66e-03 | 4.779e-03 | 4.624e-03 | 4.725e-03 | 4.518e-03 | 4.727e-03 | 4.716e-03 | 5.022e-03 | 4.643e-03 | 5.667e-03 | 9.064e-03 |
+| 100 | 4.615e-03 | 4.651e-03 | 4.751e-03 | 4.623e-03 | 4.545e-03 | 4.623e-03 | 4.731e-03 | 4.651e-03 | 5.399e-03 | 4.741e-03 | 1.487e-02 | 9.328e-03 |
+| 200 | 4.689e-03 | 5.153e-03 | 4.695e-03 | 4.742e-03 | 4.776e-03 | 4.716e-03 | 5.547e-03 | 4.72e-03 | 4.628e-03 | 4.870e-03 | 5.972e-03 | 9.408e-03 |
+| 500 | 4.613e-03 | 4.76e-03 | 4.683e-03 | 4.732e-03 | 5.307e-03 | 4.693e-03 | 4.659e-03 | 4.737e-03 | 5.204e-03 | 4.969e-03 | 6.236e-03 | 9.575e-03 |
+| 1000 | 4.701e-03 | 4.749e-03 | 4.716e-03 | 4.71e-03 | 4.676e-03 | 4.783e-03 | 4.781e-03 | 4.735e-03 | 4.820e-03 | 5.343e-03 | 6.652e-03 | 1.028e-02 |
+| 2000 | 5.166e-03 | 5.164e-03 | 5.182e-03 | 5.161e-03 | 5.152e-03 | 5.738e-03 | 5.541e-03 | 5.356e-03 | 5.642e-03 | 6.490e-03 | 7.966e-03 | 1.232e-02 |
+| 5000 | 7.627e-03 | 7.593e-03 | 7.564e-03 | 7.658e-03 | 7.698e-03 | 7.712e-03 | 7.797e-03 | 7.8e-03 | 8.731e-03 | 1.037e-02 | 1.327e-02 | 2.230e-02 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md
new file mode 100644
index 0000000..afd2356
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_runtime_shapely.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 2.728e-02 | 3.092e-02 | 5.674e-02 | 1.079e-01 | 1.983e-01 | 4.919e-01 | 9.971e-01 | 7.669e+00 | 1.382e+01 | 3.652e+01 | 7.587e+01 | 2.001e+02 |
+| 5 | 2.908e-02 | 3.597e-02 | 6.759e-02 | 1.316e-01 | 2.390e-01 | 5.987e-01 | 5.551e+00 | 5.53e+00 | 1.544e+01 | 3.595e+01 | 8.467e+01 | 2.072e+02 |
+| 10 | 3.137e-02 | 4.208e-02 | 8.376e-02 | 1.58e-01 | 3.011e-01 | 4.584e+00 | 1.489e+00 | 5.831e+00 | 1.888e+01 | 3.929e+01 | 8.577e+01 | 2.247e+02 |
+| 20 | 3.699e-02 | 5.445e-02 | 1.104e-01 | 2.123e-01 | 4.003e-01 | 1.003e+00 | 5.894e+00 | 6.842e+00 | 2.179e+01 | 4.142e+01 | 1.000e+02 | 2.533e+02 |
+| 50 | 5.549e-02 | 8.991e-02 | 1.946e-01 | 3.639e-01 | 7.197e-01 | 1.785e+00 | 3.579e+00 | 7.337e+00 | 3.250e+01 | 5.705e+01 | 1.321e+02 | 3.331e+02 |
+| 100 | 8.595e-02 | 1.505e-01 | 3.145e-01 | 6.342e-01 | 1.227e+00 | 3.114e+00 | 6.111e+00 | 1.251e+01 | 4.497e+01 | 8.732e+01 | 1.823e+02 | 4.563e+02 |
+| 200 | 1.464e-01 | 2.415e-01 | 6.012e-01 | 1.157e+00 | 2.252e+00 | 5.739e+00 | 1.556e+01 | 2.64e+01 | 6.973e+01 | 1.423e+02 | 2.886e+02 | 7.270e+02 |
+| 500 | 3.363e-01 | 5.55e-01 | 1.525e+00 | 2.669e+00 | 5.563e+00 | 1.352e+01 | 3.084e+01 | 5.781e+01 | 1.467e+02 | 2.977e+02 | 6.112e+02 | 1.519e+03 |
+| 1000 | 6.443e-01 | 1.201e+00 | 2.763e+00 | 5.361e+00 | 1.078e+01 | 2.608e+01 | 5.396e+01 | 1.076e+02 | 2.877e+02 | 5.609e+02 | 1.143e+03 | 2.861e+03 |
+| 2000 | 1.189e+00 | 1.968e+00 | 5.459e+00 | 1.057e+01 | 2.136e+01 | 5.446e+01 | 1.096e+02 | 2.185e+02 | 5.670e+02 | 1.114e+03 | 2.244e+03 | 5.570e+03 |
+| 5000 | 5.404e+00 | 5.521e+00 | 1.321e+01 | 2.739e+01 | 5.495e+01 | 1.321e+02 | 2.654e+02 | 5.364e+02 | 1.375e+03 | 2.726e+03 | 5.435e+03 | 1.369e+04 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md
new file mode 100644
index 0000000..929b485
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cpu_vs_shapely.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 7.11 | 13.66 | 20.62 | 29.95 | 36.94 | 47.14 | 50.78 | 211.88 | 157.80 | 208.71 | 214.88 | 226.96 |
+| 5 | 11.27 | 12.61 | 18.22 | 24.35 | 27.51 | 28.65 | 112.84 | 48.00 | 47.51 | 53.07 | 63.71 | 64.93 |
+| 10 | 9.79 | 11.39 | 17.16 | 22.96 | 27.92 | 168.75 | 22.77 | 35.24 | 40.93 | 41.59 | 46.86 | 49.41 |
+| 20 | 7.53 | 10.18 | 16.45 | 23.99 | 28.60 | 31.78 | 67.79 | 32.19 | 37.23 | 34.62 | 42.08 | 43.52 |
+| 50 | 5.57 | 8.48 | 16.31 | 23.72 | 34.86 | 39.38 | 29.00 | 25.31 | 42.71 | 37.61 | 43.83 | 44.90 |
+| 100 | 4.94 | 8.30 | 15.58 | 27.00 | 41.57 | 53.93 | 37.89 | 35.28 | 50.64 | 49.39 | 52.37 | 52.91 |
+| 200 | 4.46 | 7.38 | 17.09 | 29.70 | 48.53 | 66.59 | 76.49 | 62.85 | 67.33 | 69.88 | 71.66 | 73.36 |
+| 500 | 4.39 | 7.18 | 18.96 | 31.82 | 59.52 | 93.93 | 103.85 | 106.04 | 116.48 | 122.75 | 128.65 | 130.10 |
+| 1000 | 4.27 | 7.60 | 17.91 | 33.65 | 63.73 | 111.04 | 133.13 | 158.49 | 194.95 | 200.54 | 210.95 | 216.73 |
+| 2000 | 3.87 | 6.51 | 18.00 | 33.57 | 66.62 | 132.64 | 193.13 | 240.29 | 315.51 | 336.89 | 359.30 | 367.59 |
+| 5000 | 7.24 | 7.36 | 17.44 | 36.14 | 71.19 | 148.05 | 244.92 | 376.69 | 556.45 | 655.34 | 718.35 | 771.81 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md
new file mode 100644
index 0000000..39bcbd0
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_cpu.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 0.47 | 0.47 | 0.58 | 0.78 | 1.15 | 2.32 | 4.22 | 7.72 | 18.93 | 37.42 | 68.31 | 105.94 |
+| 5 | 0.54 | 0.61 | 0.80 | 1.15 | 1.88 | 4.47 | 10.37 | 25.30 | 71.59 | 125.09 | 251.34 | 388.32 |
+| 10 | 0.70 | 0.78 | 0.90 | 1.50 | 2.34 | 5.88 | 14.44 | 35.34 | 97.07 | 200.67 | 342.88 | 529.19 |
+| 20 | 1.07 | 1.16 | 1.35 | 1.89 | 3.02 | 5.92 | 18.44 | 46.69 | 127.98 | 252.67 | 430.68 | 636.00 |
+| 50 | 2.12 | 2.27 | 2.50 | 3.32 | 4.37 | 10.03 | 26.11 | 61.46 | 151.55 | 326.74 | 532.01 | 818.56 |
+| 100 | 3.77 | 3.90 | 4.25 | 5.08 | 6.49 | 12.49 | 34.09 | 76.28 | 164.48 | 372.91 | 234.16 | 924.55 |
+| 200 | 7.00 | 6.35 | 7.49 | 8.21 | 9.72 | 18.28 | 36.67 | 89.00 | 223.79 | 418.30 | 674.41 | 1.05e+03 |
+| 500 | 16.61 | 16.23 | 17.18 | 17.73 | 17.61 | 30.67 | 63.76 | 115.08 | 242.03 | 488.09 | 761.85 | 1.22e+03 |
+| 1000 | 32.10 | 33.27 | 32.72 | 33.83 | 36.17 | 49.11 | 84.79 | 143.44 | 306.20 | 523.52 | 814.37 | 1.28e+03 |
+| 2000 | 59.44 | 58.58 | 58.52 | 60.99 | 62.24 | 71.56 | 102.44 | 169.74 | 318.56 | 509.73 | 784.05 | 1.23e+03 |
+| 5000 | 97.82 | 98.84 | 100.13 | 98.98 | 100.27 | 115.66 | 139.00 | 182.58 | 283.08 | 401.24 | 570.04 | 795.20 |
diff --git a/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md
new file mode 100644
index 0000000..12e13eb
--- /dev/null
+++ b/packages/lane_helpers/evaluation_results/polyline_runtime_evaluation/batch_64_speedup_cuda_vs_shapely.md
@@ -0,0 +1,13 @@
+| # Points (down) / # Distances (right) | 1 | 2 | 5 | 10 | 20 | 50 | 100 | 200 | 500 | 1000 | 2000 | 5000 |
+| :----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
+| 2 | 3.37 | 6.40 | 12.02 | 23.45 | 42.45 | 109.46 | 214.50 | 1.64e+03 | 2.99e+03 | 7.81e+03 | 1.47e+04 | 2.40e+04 |
+| 5 | 6.06 | 7.75 | 14.61 | 28.06 | 51.83 | 128.15 | 1.17e+03 | 1.21e+03 | 3.40e+03 | 6.64e+03 | 1.60e+04 | 2.52e+04 |
+| 10 | 6.83 | 8.85 | 15.40 | 34.49 | 65.26 | 991.79 | 328.65 | 1.25e+03 | 3.97e+03 | 8.35e+03 | 1.61e+04 | 2.61e+04 |
+| 20 | 8.04 | 11.83 | 22.13 | 45.25 | 86.35 | 188.01 | 1.25e+03 | 1.50e+03 | 4.77e+03 | 8.75e+03 | 1.81e+04 | 2.77e+04 |
+| 50 | 11.84 | 19.30 | 40.72 | 78.71 | 152.33 | 394.99 | 757.11 | 1.56e+03 | 6.47e+03 | 1.23e+04 | 2.33e+04 | 3.68e+04 |
+| 100 | 18.63 | 32.36 | 66.20 | 137.20 | 269.98 | 673.70 | 1.29e+03 | 2.69e+03 | 8.33e+03 | 1.84e+04 | 1.23e+04 | 4.89e+04 |
+| 200 | 31.22 | 46.87 | 128.05 | 243.96 | 471.56 | 1.22e+03 | 2.80e+03 | 5.59e+03 | 1.51e+04 | 2.92e+04 | 4.83e+04 | 7.73e+04 |
+| 500 | 72.90 | 116.59 | 325.71 | 564.06 | 1.05e+03 | 2.88e+03 | 6.62e+03 | 1.22e+04 | 2.82e+04 | 5.99e+04 | 9.80e+04 | 1.59e+05 |
+| 1000 | 137.05 | 252.94 | 585.84 | 1.14e+03 | 2.31e+03 | 5.45e+03 | 1.13e+04 | 2.27e+04 | 5.97e+04 | 1.05e+05 | 1.72e+05 | 2.78e+05 |
+| 2000 | 230.13 | 381.18 | 1.05e+03 | 2.05e+03 | 4.15e+03 | 9.49e+03 | 1.98e+04 | 4.08e+04 | 1.01e+05 | 1.72e+05 | 2.82e+05 | 4.52e+05 |
+| 5000 | 708.50 | 727.06 | 1.75e+03 | 3.58e+03 | 7.14e+03 | 1.71e+04 | 3.40e+04 | 6.88e+04 | 1.58e+05 | 2.63e+05 | 4.09e+05 | 6.14e+05 |
diff --git a/packages/lane_helpers/examples/basic_usage.py b/packages/lane_helpers/examples/basic_usage.py
new file mode 100644
index 0000000..a099fd8
--- /dev/null
+++ b/packages/lane_helpers/examples/basic_usage.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from accvlab.lane_helpers import polyline
+
+
+def main() -> None:
+    if not torch.cuda.is_available():
+        raise RuntimeError("This example requires a CUDA-capable PyTorch installation.")
+
+    # @NOTE Use one closed 1 x 2 rectangle as the polyline; shape is (batch=1, num_points=5, num_dims=2).
+    points = torch.tensor(
+        [
+            [
+                [0.0, 0.0],
+                [1.0, 0.0],
+                [1.0, 2.0],
+                [0.0, 2.0],
+                [0.0, 0.0],
+            ]
+        ],
+        device="cuda",
+        dtype=torch.float32,
+    )
+
+    # @NOTE Distances along the polyline at which to sample; shape is (batch=1, num_distances=5).
+    distances = torch.tensor([[0.0, 0.5, 1.0, 3.0, 6.0]], device="cuda", dtype=torch.float32)
+
+    # @NOTE Interpolate the polyline at the given distances.
+    sampled_points = polyline.interpolate(points, distances)
+    # @NOTE Compute the length of the polyline.
+    line_lengths = polyline.lengths(points)
+
+    # @NOTE Print the results.
+    print(f"Interpolated points:\n{sampled_points}")
+    print(f"Line length(s): {line_lengths}")
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/lane_helpers/ext_impl/CMakeLists.txt b/packages/lane_helpers/ext_impl/CMakeLists.txt
new file mode 100644
index 0000000..bfab1b2
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/CMakeLists.txt
@@ -0,0 +1,55 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.18)
+project(accvlab_lane_helpers_ext LANGUAGES CXX CUDA)
+
+if(NOT DEFINED CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Resolve Python first so the Torch and pybind11 lookups below use the same interpreter.
+find_package(Python COMPONENTS Interpreter Development REQUIRED)
+execute_process(
+    COMMAND "${Python_EXECUTABLE}" -c "import torch; import os; print(os.path.join(os.path.dirname(torch.__file__), 'share', 'cmake'))"
+    OUTPUT_VARIABLE TORCH_CMAKE_PATH
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+list(APPEND CMAKE_PREFIX_PATH "${TORCH_CMAKE_PATH}")
+
+find_package(CUDA REQUIRED)
+find_package(Torch REQUIRED)
+
+execute_process(
+    COMMAND "${Python_EXECUTABLE}" -m pybind11 --cmakedir
+    OUTPUT_VARIABLE pybind11_DIR
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+find_package(pybind11 REQUIRED)
+
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
+    set(CMAKE_CUDA_ARCHITECTURES native)
+  else()
+    set(CMAKE_CUDA_ARCHITECTURES "75;80;86")
+  endif()
+endif()
+
+separate_arguments(TORCH_CXX_FLAGS_LIST NATIVE_COMMAND "${TORCH_CXX_FLAGS}")
+
+find_library(TORCH_PYTHON_LIBRARY torch_python PATHS ${TORCH_INSTALL_PREFIX}/lib)
+
+add_subdirectory(polyline)
diff --git a/packages/lane_helpers/ext_impl/polyline/CMakeLists.txt b/packages/lane_helpers/ext_impl/polyline/CMakeLists.txt
new file mode 100644
index 0000000..207431b
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/CMakeLists.txt
@@ -0,0 +1,50 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+pybind11_add_module(accvlab_lane_helpers_polyline_sampling MODULE
+    src/polyline.cu
+    src/polyline_cpu.cpp
+    src/polyline.cpp
+)
+
+set_target_properties(accvlab_lane_helpers_polyline_sampling PROPERTIES
+    CXX_STANDARD 17
+    CUDA_STANDARD 17
+    OUTPUT_NAME "_polyline_sampling"
+    PREFIX ""
+)
+
+target_compile_definitions(accvlab_lane_helpers_polyline_sampling PRIVATE
+    TORCH_EXTENSION_NAME=_polyline_sampling
+    TORCH_API_INCLUDE_EXTENSION_H
+)
+
+target_compile_options(accvlab_lane_helpers_polyline_sampling PRIVATE ${TORCH_CXX_FLAGS_LIST})
+
+target_link_libraries(accvlab_lane_helpers_polyline_sampling PRIVATE
+    ${TORCH_LIBRARIES}
+    ${CUDA_LIBRARIES}
+    ${TORCH_PYTHON_LIBRARY}
+)
+
+target_include_directories(accvlab_lane_helpers_polyline_sampling PRIVATE
+    ${TORCH_INCLUDE_DIRS}
+    ${CUDA_INCLUDE_DIRS}
+    include/
+)
+
+install(TARGETS accvlab_lane_helpers_polyline_sampling
+    LIBRARY DESTINATION .
+    RUNTIME DESTINATION .
+)
diff --git a/packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh b/packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh
new file mode 100644
index 0000000..441220a
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/include/helper_macros.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef POLYLINE_SAMPLING_CUDA_HELPER_MACROS_CUH
+#define POLYLINE_SAMPLING_CUDA_HELPER_MACROS_CUH
+
+#include <c10/cuda/CUDAException.h>
+// Both macros forward to C10_CUDA_CHECK; CUDA_CHECK_LAST checks cudaGetLastError().
+#define CUDA_CHECK(error_code_or_call) C10_CUDA_CHECK(error_code_or_call)
+#define CUDA_CHECK_LAST() C10_CUDA_CHECK(cudaGetLastError())
+
+#endif  // POLYLINE_SAMPLING_CUDA_HELPER_MACROS_CUH
diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline.cuh
new file mode 100644
index 0000000..01ce298
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/include/polyline.cuh
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Host-visible interface for the polyline interpolation CUDA
+// implementation. This header is intentionally free of CUDA device intrinsics
+// so it can be included from both C++ and CUDA translation units.
+
+#ifndef LANE_HELPERS_POLYLINE_CUH
+#define LANE_HELPERS_POLYLINE_CUH
+
+#include <cstddef>
+#include <cstdint>
+
+#include <cuda_runtime.h>
+#include <c10/util/Half.h>
+#include <c10/util/BFloat16.h>
+
+namespace polyline {
+
+template <typename dtype>
+struct PolylineLaunchConfig {
+    dim3 block_dim;
+    dim3 grid_dim;
+    int num_points_full_blocks;
+    size_t shared_mem_size;
+    size_t distance_buffer_ext_size_elems;
+    bool use_shared_distances;
+    size_t max_shared_full;
+};
+
+template <typename dtype>
+PolylineLaunchConfig<dtype> make_polyline_launch_config(int num_points, int num_samples, int device);
+
+template <typename dtype>
+void polyline_interpolation(dtype* points, int num_points, int num_dims, dtype* distances, int num_distances,
+                            dtype* result_points, int num_samples, bool relative_distances, int device,
+                            const PolylineLaunchConfig<dtype>& cfg, dtype* distance_buffer_ext,
+                            cudaStream_t stream);
+
+template <typename dtype>
+void polyline_lengths(dtype* points, int num_points, int num_dims, dtype* lengths, int num_samples,
+                      cudaStream_t stream);
+
+template <typename dtype, typename sample_size_dtype>
+void polyline_interpolation_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* distances,
+                                           int num_distances, dtype* result_points, int num_samples,
+                                           sample_size_dtype* sample_sizes_points,
+                                           sample_size_dtype* sample_sizes_distances_to_sample,
+                                           bool relative_distances, int device,
+                                           const PolylineLaunchConfig<dtype>& cfg, dtype* distance_buffer_ext,
+                                           cudaStream_t stream);
+
+template <typename dtype, typename sample_size_dtype>
+void polyline_lengths_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* lengths,
+                                     int num_samples, sample_size_dtype* sample_sizes_points,
+                                     cudaStream_t stream);
+
+template <typename dtype>
+void polyline_interpolation_cpu(const dtype* points, int num_points, int num_dims, const dtype* distances,
+                                int num_distances, dtype* result_points, int num_samples,
+                                bool relative_distances);
+
+template <typename dtype>
+void polyline_lengths_cpu(const dtype* points, int num_points, int num_dims, dtype* lengths, int num_samples);
+
+template <typename dtype, typename sample_size_dtype>
+void polyline_interpolation_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims,
+                                               const dtype* distances, int num_distances,
+                                               dtype* result_points, int num_samples,
+                                               const sample_size_dtype* sample_sizes_points,
+                                               const sample_size_dtype* sample_sizes_distances_to_sample,
+                                               bool relative_distances);
+
+template <typename dtype, typename sample_size_dtype>
+void polyline_lengths_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims,
+                                         dtype* lengths, int num_samples,
+                                         const sample_size_dtype* sample_sizes_points);
+
+// Explicit instantiations are provided in polyline.cu and polyline_cpu.cpp.
+#define DECLARE_POLYLINE_LAUNCH_CONFIG_EXTERN(DTYPE)                                \
+    extern template PolylineLaunchConfig<DTYPE> make_polyline_launch_config<DTYPE>( \
+        int num_points, int num_samples, int device);
+
+#define DECLARE_POLYLINE_INTERPOLATION_EXTERN(DTYPE)                                       \
+    extern template void polyline_interpolation<DTYPE>(                                    \
+        DTYPE * points, int num_points, int num_dims, DTYPE* distances, int num_distances, \
+        DTYPE* result_points, int num_samples, bool relative_distances, int device,        \
+        const PolylineLaunchConfig<DTYPE>& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream);
+
+#define DECLARE_POLYLINE_LENGTHS_EXTERN(DTYPE)                                                 \
+    extern template void polyline_lengths<DTYPE>(DTYPE * points, int num_points, int num_dims, \
+                                                 DTYPE* lengths, int num_samples, cudaStream_t stream);
+
+#define DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE)            \
+    extern template void polyline_interpolation_var_size_batch<DTYPE, SAMPLE_SIZE_DTYPE>(         \
+        DTYPE * points, int max_num_points, int num_dims, DTYPE* distances, int num_distances,    \
+        DTYPE* result_points, int num_samples, SAMPLE_SIZE_DTYPE* sample_sizes_points,            \
+        SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances, int device, \
+        const PolylineLaunchConfig<DTYPE>& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream);
+
+#define DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE)           \
+    extern template void polyline_lengths_var_size_batch<DTYPE, SAMPLE_SIZE_DTYPE>(        \
+        DTYPE * points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \
+        SAMPLE_SIZE_DTYPE* sample_sizes_points, cudaStream_t stream);
+
+#define DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(DTYPE)                        \
+    DECLARE_POLYLINE_LAUNCH_CONFIG_EXTERN(DTYPE)                         \
+    DECLARE_POLYLINE_INTERPOLATION_EXTERN(DTYPE)                         \
+    DECLARE_POLYLINE_LENGTHS_EXTERN(DTYPE)                               \
+    DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN(DTYPE, int)     \
+    DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN(DTYPE, int64_t) \
+    DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN(DTYPE, int)           \
+    DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN(DTYPE, int64_t)
+
+DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(float)
+DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(double)
+DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(c10::Half)
+DECLARE_POLYLINE_CUDA_DTYPE_EXTERN(c10::BFloat16)
+
+#undef DECLARE_POLYLINE_CUDA_DTYPE_EXTERN
+#undef DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_EXTERN
+#undef DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_EXTERN
+#undef DECLARE_POLYLINE_LENGTHS_EXTERN
+#undef DECLARE_POLYLINE_INTERPOLATION_EXTERN
+#undef DECLARE_POLYLINE_LAUNCH_CONFIG_EXTERN
+
+#define DECLARE_POLYLINE_INTERPOLATION_CPU_EXTERN(DTYPE)                                              \
+    extern template void polyline_interpolation_cpu<DTYPE>(                                           \
+        const DTYPE* points, int num_points, int num_dims, const DTYPE* distances, int num_distances, \
+        DTYPE* result_points, int num_samples, bool relative_distances);
+
+#define DECLARE_POLYLINE_LENGTHS_CPU_EXTERN(DTYPE)                                                      \
+    extern template void polyline_lengths_cpu<DTYPE>(const DTYPE* points, int num_points, int num_dims, \
+                                                     DTYPE* lengths, int num_samples);
+
+#define DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE)                \
+    extern template void polyline_interpolation_var_size_batch_cpu<DTYPE, SAMPLE_SIZE_DTYPE>(             \
+        const DTYPE* points, int max_num_points, int num_dims, const DTYPE* distances, int num_distances, \
+        DTYPE* result_points, int num_samples, const SAMPLE_SIZE_DTYPE* sample_sizes_points,              \
+        const SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances);
+
+#define DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, SAMPLE_SIZE_DTYPE)            \
+    extern template void polyline_lengths_var_size_batch_cpu<DTYPE, SAMPLE_SIZE_DTYPE>(         \
+        const DTYPE* points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \
+        const SAMPLE_SIZE_DTYPE* sample_sizes_points);
+
+#define DECLARE_POLYLINE_CPU_DTYPE_EXTERN(DTYPE)                             \
+    DECLARE_POLYLINE_INTERPOLATION_CPU_EXTERN(DTYPE)                         \
+    DECLARE_POLYLINE_LENGTHS_CPU_EXTERN(DTYPE)                               \
+    DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int)     \
+    DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int64_t) \
+    DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int)           \
+    DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN(DTYPE, int64_t)
+
+DECLARE_POLYLINE_CPU_DTYPE_EXTERN(float)
+DECLARE_POLYLINE_CPU_DTYPE_EXTERN(double)
+
+#undef DECLARE_POLYLINE_CPU_DTYPE_EXTERN
+#undef DECLARE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU_EXTERN
+#undef DECLARE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU_EXTERN
+#undef DECLARE_POLYLINE_LENGTHS_CPU_EXTERN
+#undef DECLARE_POLYLINE_INTERPOLATION_CPU_EXTERN
+}  // namespace polyline
+
+#endif  // LANE_HELPERS_POLYLINE_CUH
diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh
new file mode 100644
index 0000000..625dacf
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_common.cuh
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LANE_HELPERS_POLYLINE_COMMON_CUH
+#define LANE_HELPERS_POLYLINE_COMMON_CUH
+
+#include <cmath>
+#include <limits>
+
+#include "polyline_dtype_compat.cuh"
+
+#ifdef __CUDACC__
+// Keep scalar helpers callable from both CUDA kernels and CPU translation units.
+#define POLYLINE_HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+#else
+#define POLYLINE_HOST_DEVICE_INLINE inline
+#endif
+
+namespace polyline {
+
+template <typename dtype>
+POLYLINE_HOST_DEVICE_INLINE dtype polyline_nan() {
+    // NAN is the float-typed macro from <cmath>; convert it to the requested dtype.
+    return static_cast<dtype>(NAN);
+}
+
+template <typename point_dtype>
+POLYLINE_HOST_DEVICE_INLINE void fill_point_with_nan_common(point_dtype* res_point, int num_dims) {
+    const point_dtype nan_value = polyline_nan<point_dtype>();
+    for (int d = 0; d < num_dims; ++d) {
+        res_point[d] = nan_value;
+    }
+}
+
+/**
+ * @brief Compute the Euclidean length of one polyline segment.
+ *
+ * @details
+ * `segment_idx` refers to the segment between points `segment_idx` and
+ * `segment_idx + 1`. The point coordinates are laid out consecutively as
+ * `(num_points, num_dims)`.
+ *
+ * The point dtype and accumulation dtype are intentionally separate so the CPU
+ * path can accumulate in a wider type while the CUDA path preserves its
+ * existing dtype behavior.
+ */
+template <typename point_dtype, typename accum_dtype>
+POLYLINE_HOST_DEVICE_INLINE accum_dtype compute_segment_length_common(const point_dtype* points_sample,
+                                                                      int segment_idx, int num_dims) {
+    const point_dtype* first_point = points_sample + segment_idx * num_dims;
+    const point_dtype* second_point = points_sample + (segment_idx + 1) * num_dims;
+    accum_dtype accum_sqr = static_cast<accum_dtype>(0.0);
+    for (int d = 0; d < num_dims; ++d) {
+        const accum_dtype diff =
+            static_cast<accum_dtype>(first_point[d]) - static_cast<accum_dtype>(second_point[d]);
+        accum_sqr += diff * diff;
+    }
+    const accum_dtype segment_length = polyline_sqrt(accum_sqr);
+    return segment_length;
+}
+
+/**
+ * @brief Find the lower endpoint index of the segment that contains `value`.
+ *
+ * @details
+ * The input sequence is expected to be non-empty, monotonically non-decreasing
+ * cumulative distances. The return value is:
+ * - `-1` when `value` lies strictly before the first entry.
+ * - `sequence_length - 1` when `value` lies strictly beyond the last entry.
+ * - Otherwise an index `i` with `sequence[i] <= value` and, unless
+ *   `sequence[i] == value`, also `value <= sequence[i + 1]`.
+ *
+ * With repeated entries equal to `value`, the returned match need not be the last one.
+ */
+template <typename accum_dtype>
+POLYLINE_HOST_DEVICE_INLINE int get_index_of_last_lower_or_equal_to_common(const accum_dtype* sequence,
+                                                                           accum_dtype value,
+                                                                           int sequence_length) {
+    int min_idx = 0;
+    int max_idx = sequence_length - 1;
+
+    if (polyline_value_gt(sequence[0], value)) {
+        return -1;
+    }
+    if (polyline_value_lt(sequence[sequence_length - 1], value)) {
+        return sequence_length - 1;
+    }
+
+    while (max_idx - min_idx > 1) {
+        const int curr_idx = (max_idx + min_idx) >> 1;
+        const accum_dtype curr_val = sequence[curr_idx];
+        if (polyline_value_lt(curr_val, value)) {
+            min_idx = curr_idx;
+        } else if (polyline_value_gt(curr_val, value)) {
+            max_idx = curr_idx;
+        } else {
+            min_idx = curr_idx;
+            max_idx = curr_idx;
+        }
+    }
+    return min_idx;
+}
+
+/**
+ * @brief Sample one point on a polyline at a requested absolute distance and write it to `res_point`.
+ *
+ * @details
+ * `accum_distances` stores the distance from the start of the polyline to each
+ * point. Distances outside the polyline are clamped to the first or last point.
+ * Segments shorter than `accum_dtype`'s machine epsilon return the lower endpoint unchanged.
+ */
+template <typename point_dtype, typename accum_dtype>
+POLYLINE_HOST_DEVICE_INLINE void sample_at_distance_common(const point_dtype* points,
+                                                           const accum_dtype* accum_distances,
+                                                           accum_dtype distance_to_sample_at, int num_points,
+                                                           int num_dims, point_dtype* res_point) {
+    const int index_min = get_index_of_last_lower_or_equal_to_common<accum_dtype>(
+        accum_distances, distance_to_sample_at, num_points);
+    if (index_min >= 0 && index_min < num_points - 1) {
+        const int index_max = index_min + 1;
+        const point_dtype* min_point = points + index_min * num_dims;
+        const point_dtype* max_point = points + index_max * num_dims;
+        const accum_dtype dist_min = accum_distances[index_min];
+        const accum_dtype dist_max = accum_distances[index_max];
+        const accum_dtype dist = dist_max - dist_min;
+        if (polyline_value_ge(dist, static_cast<accum_dtype>(std::numeric_limits<accum_dtype>::epsilon()))) {
+            const accum_dtype weight_max = (distance_to_sample_at - dist_min) / dist;
+            const accum_dtype weight_min = (dist_max - distance_to_sample_at) / dist;
+            for (int d = 0; d < num_dims; ++d) {
+                const accum_dtype interpolated = static_cast<accum_dtype>(min_point[d]) * weight_min +
+                                                 static_cast<accum_dtype>(max_point[d]) * weight_max;
+                res_point[d] = static_cast<point_dtype>(interpolated);
+            }
+        } else {
+            for (int d = 0; d < num_dims; ++d) {
+                res_point[d] = min_point[d];
+            }
+        }
+    } else if (index_min == -1) {
+        for (int d = 0; d < num_dims; ++d) {
+            // Note that we are accessing the first point, so that points[d] corresponds to the element we
+            // want to access, and no offset is needed.
+            res_point[d] = points[d];
+        }
+    } else if (index_min == num_points - 1) {
+        for (int d = 0; d < num_dims; ++d) {
+            res_point[d] = points[(num_points - 1) * num_dims + d];
+        }
+    }
+}
+
+}  // namespace polyline
+
+#undef POLYLINE_HOST_DEVICE_INLINE
+
+#endif  // LANE_HELPERS_POLYLINE_COMMON_CUH
diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh
new file mode 100644
index 0000000..eae81c4
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_dtype_compat.cuh
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LANE_HELPERS_POLYLINE_DTYPE_COMPAT_CUH
+#define LANE_HELPERS_POLYLINE_DTYPE_COMPAT_CUH
+
+#include <cmath>
+
+// CUDA provides native __half/__nv_bfloat16 comparison intrinsics and shuffle
+// overloads, while c10 low-precision wrappers add extra conversion paths,
+// leading to compilation errors. The CUDA-only specializations below route c10 values
+// through the native CUDA operations where available; only scalar math such as sqrt
+// intentionally computes via float. Keeping these variants CUDA-only keeps CPU builds
+// free of these types.
+#ifdef __CUDACC__
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+#define POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+#else
+#define POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE inline
+#endif
+
+namespace polyline {
+
+#ifdef __CUDACC__
+template <typename dtype>
+__device__ __forceinline__ dtype shfl_xor_sync_compat(unsigned mask, dtype val, int laneMask) {
+    return __shfl_xor_sync(mask, val, laneMask);
+}
+
+template <>
+__device__ __forceinline__ c10::Half shfl_xor_sync_compat(unsigned mask, c10::Half val, int laneMask) {
+    return c10::Half(__shfl_xor_sync(mask, static_cast<__half>(val), laneMask));
+}
+
+template <>
+__device__ __forceinline__ c10::BFloat16 shfl_xor_sync_compat(unsigned mask, c10::BFloat16 val,
+                                                              int laneMask) {
+    return c10::BFloat16(__shfl_xor_sync(mask, static_cast<__nv_bfloat16>(val), laneMask));
+}
+#endif
+
+template <typename dtype>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_lt(dtype lhs, dtype rhs) {
+    return lhs < rhs;
+}
+
+template <typename dtype>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_gt(dtype lhs, dtype rhs) {
+    return lhs > rhs;
+}
+
+template <typename dtype>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_ge(dtype lhs, dtype rhs) {
+    return lhs >= rhs;
+}
+
+#ifdef __CUDACC__
+template <>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_lt<c10::Half>(c10::Half lhs, c10::Half rhs) {
+    return __hlt(static_cast<__half>(lhs), static_cast<__half>(rhs));
+}
+
+template <>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_gt<c10::Half>(c10::Half lhs, c10::Half rhs) {
+    return __hgt(static_cast<__half>(lhs), static_cast<__half>(rhs));
+}
+
+template <>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_ge<c10::Half>(c10::Half lhs, c10::Half rhs) {
+    return __hge(static_cast<__half>(lhs), static_cast<__half>(rhs));
+}
+
+template <>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_lt<c10::BFloat16>(c10::BFloat16 lhs,
+                                                                               c10::BFloat16 rhs) {
+    return __hlt(static_cast<__nv_bfloat16>(lhs), static_cast<__nv_bfloat16>(rhs));
+}
+
+template <>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_gt<c10::BFloat16>(c10::BFloat16 lhs,
+                                                                               c10::BFloat16 rhs) {
+    return __hgt(static_cast<__nv_bfloat16>(lhs), static_cast<__nv_bfloat16>(rhs));
+}
+
+template <>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE bool polyline_value_ge<c10::BFloat16>(c10::BFloat16 lhs,
+                                                                               c10::BFloat16 rhs) {
+    return __hge(static_cast<__nv_bfloat16>(lhs), static_cast<__nv_bfloat16>(rhs));
+}
+#endif
+
+template <typename dtype>
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE dtype polyline_sqrt(dtype value) {
+    return sqrt(value);
+}
+
+#ifdef __CUDACC__
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE c10::Half polyline_sqrt(c10::Half value) {
+    return static_cast<c10::Half>(sqrtf(static_cast<float>(value)));
+}
+
+POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE c10::BFloat16 polyline_sqrt(c10::BFloat16 value) {
+    return static_cast<c10::BFloat16>(sqrtf(static_cast<float>(value)));
+}
+#endif
+
+}  // namespace polyline
+
+#undef POLYLINE_DTYPE_COMPAT_HOST_DEVICE_INLINE
+
+#endif  // LANE_HELPERS_POLYLINE_DTYPE_COMPAT_CUH
diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh
new file mode 100644
index 0000000..2354b54
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_kernels.cuh
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LANE_HELPERS_POLYLINE_KERNELS_CUH
+#define LANE_HELPERS_POLYLINE_KERNELS_CUH
+
+#include "polyline_common.cuh"
+#include "polyline_dtype_compat.cuh"
+
+namespace polyline {
+
+template <typename dtype>  // XOR-shuffle (butterfly) scan; uses the full lane mask, so all 32 lanes of the warp must call it.
+__device__ __forceinline__ void prefix_sum_warp(int index, dtype value, int num_values_in_scan,
+                                                dtype& scan_value, dtype& sum_all) {
+    sum_all = value;                       // total of the group of lanes merged so far (initially this lane only)
+    scan_value = static_cast<dtype>(0.0);  // exclusive prefix: accumulates only the totals of groups at lower indices
+    for (int i = 1; i < num_values_in_scan; i <<= 1) {
+        dtype sum_other = shfl_xor_sync_compat<dtype>(0xffffffffu, sum_all, i);  // partner group's total; presumably wraps `__shfl_xor_sync`
+        scan_value += ((static_cast<uint32_t>(index) & static_cast<uint32_t>(i)) > 0) * sum_other;  // bit i set => partner group is the lower one
+        sum_all += sum_other;
+    }
+}
+
+/**
+ * @brief Exclusive prefix sum over the `blockDim.x` values of one sample (one `threadIdx.y` row of the block).
+ *
+ * @details
+ * The buffer `warp_scan_buffer` is used to store the sums of the individual warps, which is then used
+ * to compute the offsets to add to each warp. For that, a prefix sum is performed on the buffer in a second
+ * step (in-place). The size of the buffer is (in elements): `blockDim.y * num_warps_per_sample`.
+ * The function contains `__syncthreads()`, so every thread of the block has to call it.
+ * @tparam dtype The type of the values to prefix sum
+ *
+ * @param value The value contributed by the calling thread
+ * @param num_warps_per_sample The number of warps per sample
+ * @param warp_scan_buffer Scratch buffer for the per-warp sums (overwritten in-place with the per-warp offsets)
+ *
+ * @return The sum of the values of all threads with a smaller `threadIdx.x` in the same sample (exclusive)
+ */
+template <typename dtype>
+__device__ __forceinline__ dtype prefix_sum_block(dtype value, int num_warps_per_sample,
+                                                  dtype* warp_scan_buffer) {
+    // ix corresponds to the thread index inside a single sample
+    const int ix = threadIdx.x;
+    const int iwx = threadIdx.x / 32;                         // index of the warp inside the sample
+    const int iw = threadIdx.y * num_warps_per_sample + iwx;  // index of the warp in the block
+    // NOTE(review): the warp indexing above presumably relies on blockDim.x being a multiple of 32, so
+    // that a hardware warp never straddles two samples -- confirm against the launch configuration.
+
+    // Using warp shuffles iteratively, in two stages
+
+    // First stage: perform warp scans
+    dtype warp_scan1;
+    dtype warp_sum1;
+    prefix_sum_warp(ix, value, 32, warp_scan1, warp_sum1);
+    // If this is the first thread in the warp, it is responsible for storing the partial sum for the warp
+    if (ix % 32 == 0) warp_scan_buffer[iw] = warp_sum1;
+    __syncthreads();
+
+    // Second stage: warp scan over the partial sums to obtain the offsets for each warp
+    // The first warp (i.e. ix < 32) participates in step 2 of the warp scan.
+    // Note that the whole first warp always participates in step 2, even if num_warps_per_sample < 32.
+    // This is to avoid a deadlock without using a complex mask generation method for `__shfl_xor_sync()`.
+    if (ix < 32) {
+        const int wid_shared = threadIdx.y * num_warps_per_sample + ix;
+        const bool is_inside = ix < num_warps_per_sample;
+        const dtype value = is_inside ? warp_scan_buffer[wid_shared] : static_cast<dtype>(0.0);
+        dtype warp_scan2;
+        dtype warp_sum2;
+        prefix_sum_warp(ix, value, num_warps_per_sample, warp_scan2, warp_sum2);
+        if (ix < num_warps_per_sample) {
+            warp_scan_buffer[wid_shared] = warp_scan2;
+        }
+    }
+    __syncthreads();
+
+    // Apply offsets to the partial sums to obtain the final values
+    warp_scan1 += warp_scan_buffer[iw];
+
+    return warp_scan1;
+}
+
+/**
+ * @brief Inclusive prefix sum over one sequence per `threadIdx.y` row, processed `blockDim.x` elements at a time.
+ *
+ * @details
+ * The buffer is split into 2 parts:
+ * - The first part (`blockDim.y` elements) stores the running sums of all
+ *   elements processed so far for each sample in y (`sum_buffer`).
+ * - The second part (`warp_scan_buffer`) is scratch space for the intra-block
+ *   scan performed by `prefix_sum_block`. The size is: `blockDim.y * num_warps_per_sample`
+ *   (see `prefix_sum_block` for more details)
+ * Hence, the total buffer size (in elements) is:
+ * `blockDim.y + blockDim.y * num_warps_per_sample`, or equivalently
+ * `blockDim.y * (1 + num_warps_per_sample)`
+ *
+ * The results are stored in the sequence array, overriding the input values.
+ * Note that the results are the accumulated values including the current value, i.e. the operation can be expressed as:
+ * `sequence[i] = offset + sum(sequence[0:(i+1)])`, where the slicing is defined as in Python.
+ *
+ * The function contains `__syncthreads()`, so every thread of the block has to call it, and
+ * `numel_x_full_blocks` has to be identical for all threads of the block.
+ *
+ * @tparam dtype The type of the values to prefix sum
+ *
+ * @param sequence The sequence belonging to the calling thread's sample, i.e. the caller has already
+ *   applied the per-sample offset. Elements are indexed by `threadIdx.x` plus multiples of `blockDim.x`,
+ *   so all threads sharing a `threadIdx.y` are expected to pass the same pointer.
+ * @param buffer_block Combined temporary storage used by the prefix-sum (layout described above).
+ * @param numel_x The number of valid elements in the sequence; elements beyond it are treated as zero
+ * @param numel_x_full_blocks The number of elements in the x dimension extended to a multiple of blockDim.x
+ * @param numel_y The number of sequences in the y dimension (currently not used by the implementation)
+ * @param offset The initial offset to add to the prefix sum of each sequence
+ */
+template <typename dtype>
+__device__ __forceinline__ void prefix_sum_looped(dtype* sequence, dtype* buffer_block, int numel_x,
+                                                  int numel_x_full_blocks, int numel_y, dtype offset) {
+    const int ix = threadIdx.x;
+    const int iy = threadIdx.y;
+
+    // Buffer for keeping the sums of the iterations so far for each sample
+    dtype* sum_buffer = buffer_block;
+    // Buffer as needed for the prefix sum implementation
+    dtype* warp_scan_buffer = buffer_block + blockDim.y;
+
+    int num_warps_per_sample = (blockDim.x + 31) / 32;
+
+    // Initialize the buffer containing the partial sums of the iterations so far for each sample
+    if (ix == 0) {
+        sum_buffer[iy] = static_cast<dtype>(0.0);
+    }
+    // Compute the sum one chunk of `blockDim.x` elements at a time for each sample
+    for (int i = ix; i < numel_x_full_blocks; i += blockDim.x) {
+        // Make sure that sum_buffer is written to (either initially or in the previous iteration)
+        __syncthreads();
+        const dtype value = i < numel_x ? sequence[i] : static_cast<dtype>(0.0);
+        const dtype value_out =
+            prefix_sum_block<dtype>(value, num_warps_per_sample, warp_scan_buffer) + sum_buffer[iy] + offset;
+        const dtype value_out_incl_current = value_out + value;
+        // Make sure that
+        //   - sequences are not written to before they are read from for the current iteration
+        //   - sum_buffer is not written to before it is read from for the current iteration
+        __syncthreads();
+        if (i < numel_x) {
+            sequence[i] = value_out_incl_current;
+        }
+        // Update the sum buffer for the next iteration to the current value of the last processed sample.
+        // Note that the last thread may be out of bounds and not correspond to the last element. However,
+        // in this case, the value is not needed (and also still is the correct value as the values are
+        // extended with zeros, so that the cumulative sum (computed as including the current value) is
+        // the same as for the last element)
+        if (ix == blockDim.x - 1) {
+            sum_buffer[iy] = value_out_incl_current;
+        }
+        // Offset is only applied in the first iteration. Afterwards, the offset is already included in the
+        // partial sum as stored in `sum_buffer` and the offset must not be applied again.
+        offset = static_cast<dtype>(0.0);
+    }
+    __syncthreads();
+}
+
+template <typename dtype>  // Butterfly (XOR-shuffle) sum; uses the full lane mask, so all 32 lanes of the warp must call it.
+__device__ __forceinline__ dtype warp_reduce_sum(dtype value, int num_vals_per_partial) {
+    // Each step adds the partner lane's value; lanes in the same aligned group of `num_vals_per_partial` (rounded up to a power of two) end with the group total.
+    for (int i = 1; i < num_vals_per_partial; i <<= 1) {
+        const dtype val_other = shfl_xor_sync_compat<dtype>(0xffffffffu, value, i);
+        value += val_other;
+    }
+    return value;
+}
+
+template <typename dtype>  // Sums `value` over all threads of one sample (threads sharing `threadIdx.y`); syncs the block.
+__device__ __forceinline__ dtype sample_reduce_sum(dtype value, int num_warps_per_sample,
+                                                   dtype* warp_temp_and_result_buffer) {
+    const int ix = threadIdx.x;                               // index of the thread inside the sample
+    const int iwx = threadIdx.x / 32;                         // index of the warp in the sample
+    const int iw = threadIdx.y * num_warps_per_sample + iwx;  // index of the warp in the block
+
+    const dtype warp_sum = warp_reduce_sum(value, 32);
+    // The first thread in the warp writes the result for the warp
+    if (ix % 32 == 0) {
+        warp_temp_and_result_buffer[iw] = warp_sum;
+    }
+    // Make sure all warps have written their results
+    __syncthreads();
+
+    dtype sample_sum = static_cast<dtype>(0.0);
+    // The first warp of each sample reduces the results of the first stage
+    // Note that from now on, ix is used as the index of a stage-1 warp inside the sample (previously iwx)
+    if (ix < 32) {
+        const int iw_base = threadIdx.y * num_warps_per_sample;
+        const dtype warp_sum_phase_1 =
+            ix < num_warps_per_sample ? warp_temp_and_result_buffer[iw_base + ix] : static_cast<dtype>(0.0);
+        // Lanes at or beyond `num_warps_per_sample` contribute zero. Threads with ix >= 32 skip this stage and
+        // return zero; the full sum of the sample is available in thread ix == 0.
+        sample_sum = warp_reduce_sum(warp_sum_phase_1, num_warps_per_sample);
+    }
+    __syncthreads();
+    return sample_sum;
+}
+
+template <typename dtype>
+__device__ __forceinline__ void sample_distances(const dtype* points_sample,
+                                                 const dtype* accum_distances_sample,
+                                                 const dtype* distances_to_sample_sample,
+                                                 int num_distances_to_sample, int num_points, int num_dims,
+                                                 dtype* res_points_sample, bool relative_distances) {
+    // Sample one polyline at the requested distances. The threads of the row (threadIdx.x) share
+    // the work: thread x handles the distances x, x + blockDim.x, x + 2 * blockDim.x, ...
+    const int first = threadIdx.x;
+    const int step = blockDim.x;
+
+    // A polyline without points cannot be sampled: every requested output point is filled
+    // by `fill_point_with_nan_common` instead.
+    if (num_points == 0) {
+        for (int k = first; k < num_distances_to_sample; k += step) {
+            fill_point_with_nan_common<dtype>(res_points_sample + k * num_dims, num_dims);
+        }
+        return;
+    }
+
+    // For relative distances, the requested values are scaled by the total length, which is the
+    // last entry of the accumulated distances. For absolute distances, the factor is not used.
+    const dtype total_length =
+        relative_distances ? accum_distances_sample[num_points - 1] : static_cast<dtype>(0.0);
+    for (int k = first; k < num_distances_to_sample; k += step) {
+        const dtype requested = distances_to_sample_sample[k];
+        const dtype distance = relative_distances ? requested * total_length : requested;
+        sample_at_distance_common<dtype, dtype>(points_sample, accum_distances_sample, distance, num_points,
+                                                num_dims, res_points_sample + k * num_dims);
+    }
+}
+
+template <typename dtype>
+__device__ __forceinline__ void compute_distances(dtype* points_sample, int num_points, int num_dims,
+                                                  dtype* distances_sample) {
+    // Fills `distances_sample` so that a subsequent *inclusive* prefix sum turns entry j into
+    // the distance from the start of the polyline to point j:
+    //   entry 0     = 0 (distance from the start to the first point)
+    //   entry s + 1 = value of `compute_segment_length_common` for segment index s
+    if (num_points != 0) {  // nothing is written for an empty polyline
+        const int lane = threadIdx.x;
+        if (lane == 0) {
+            distances_sample[0] = static_cast<dtype>(0.0);
+        }
+        // The segments are distributed over the threads of the row in steps of blockDim.x.
+        const int num_segments = num_points - 1;
+        for (int s = lane; s < num_segments; s += blockDim.x) {
+            distances_sample[s + 1] = compute_segment_length_common<dtype, dtype>(points_sample, s, num_dims);
+        }
+    }
+}
+
+/**
+ * @brief Shared implementation for both fixed-size and variable-size batch kernels.
+ *
+ * @details
+ * This routine implements the common logic used by:
+ *  - `polyline_sampling_fully_shared_kernel` (fixed-size batches), and
+ *  - `polyline_sampling_fully_shared_var_batch_kernel` (variable-size batches).
+ *
+ * The shared memory is split into two parts:
+ * - The first part stores the distances and accumulated distances
+ *   (conversion in-place) for all points and has size (in elements):
+ *   `blockDim.y * max_num_points`.
+ * - The second part stores the temporary buffer used by
+ *   `prefix_sum_looped` and has size (in elements):
+ *   `blockDim.y * (num_warps_per_sample + 1)`.
+ *   (see the documentation of `prefix_sum_looped` for details).
+ * The total shared memory size is therefore (in elements):
+ * `(blockDim.y * max_num_points + blockDim.y * (num_warps_per_sample + 1))`.
+ *
+ * Offsets into global memory are computed in `size_t`, so tensors with more than INT_MAX elements are addressed correctly.
+ * @tparam dtype The type of the points
+ *
+ * @param points The points to sample
+ * @param distances_to_sample The distances to sample at
+ * @param res_points The resulting sampled points
+ * @param max_num_points The maximum number of points per polyline in the batch
+ * @param max_num_points_full_blocks The maximum number of points extended to a multiple of blockDim.x
+ * @param num_dims The number of dimensions of the points
+ * @param max_num_distances_to_sample The maximum number of distances to sample at per polyline
+ * @param num_samples The number of samples (batch size)
+ * @param sample_sizes_points (optional) Per-sample number of points (variable-size batches)
+ * @param sample_sizes_distances_to_sample (optional) Per-sample number of distances (variable-size batches)
+ * @param relative_distances Interpret distances to sample as fractions of each polyline's total length
+ * @param distance_buffer_ext Optional external buffer for distances when shared memory is insufficient
+ */
+template <typename dtype, typename sample_size_dtype, bool use_shared_distances, bool use_variable_size_batch>
+__device__ __forceinline__ void polyline_sampling_fully_shared_common(
+    dtype* points, dtype* distances_to_sample, dtype* res_points, int max_num_points,
+    int max_num_points_full_blocks, int num_dims, int max_num_distances_to_sample, int num_samples,
+    sample_size_dtype* sample_sizes_points, sample_size_dtype* sample_sizes_distances_to_sample,
+    bool relative_distances, dtype* distance_buffer_ext) {
+    extern __shared__ uint8_t shared_mem[];
+    dtype* distances;
+    dtype* buffer;
+    if (use_shared_distances) {
+        // Shared-memory layout per block:
+        //   distances: [blockDim.y][max_num_points]
+        //   buffer   : [blockDim.y * (1 + num_warps_per_sample)]
+        distances = reinterpret_cast<dtype*>(shared_mem);
+        buffer = reinterpret_cast<dtype*>(shared_mem + blockDim.y * max_num_points * sizeof(dtype));
+    } else {
+        // External distances buffer is laid out per block as
+        //   [blockIdx.y][blockDim.y][max_num_points]
+        // so each block gets its own contiguous slice. The scratch `buffer`
+        // always starts at the beginning of this block's shared memory.
+        distances = distance_buffer_ext + static_cast<size_t>(blockIdx.y) * blockDim.y * max_num_points;
+        buffer = reinterpret_cast<dtype*>(shared_mem);
+    }
+
+    const int y = threadIdx.y;
+    const int y_global = blockIdx.y * blockDim.y + y;
+    const bool is_active_sample = (y_global < num_samples);
+
+    // 1) Compute per-point distances only for valid samples. Inactive rows in
+    // the final block still participate in sync-heavy code paths with zero work.
+    int curr_num_points = 0;
+    int curr_num_distances_to_sample = 0;
+    if (is_active_sample) {
+        if (use_variable_size_batch) {
+            curr_num_points = sample_sizes_points[y_global];
+            curr_num_distances_to_sample = sample_sizes_distances_to_sample[y_global];
+        } else {
+            curr_num_points = max_num_points;
+            curr_num_distances_to_sample = max_num_distances_to_sample;
+        }
+
+        // Global index for points in device memory (widened to size_t to avoid 32-bit overflow); distances
+        // remain indexed by the local y within the block because they are stored per block.
+        dtype* points_sample = points + static_cast<size_t>(y_global) * max_num_points * num_dims;
+        dtype* distances_sample = distances + y * max_num_points;
+        if (curr_num_points > 0) {
+            compute_distances<dtype>(points_sample, curr_num_points, num_dims, distances_sample);
+        }
+    }
+
+    // 2) Prefix-sum over distances for all rows in this block-local buffer.
+    //    This operates purely on (shared or external) distances, so it is
+    //    safe even for rows that don't correspond to a real sample; their
+    //    results are never used.
+    // The `distances` are per-block, so we use the local index `y` to access the distances for the current block.
+    dtype* distance = distances + y * max_num_points;
+    prefix_sum_looped<dtype>(distance,                    // sequences
+                             buffer,                      // buffer (sum_buffer + warp_scan_buffer)
+                             curr_num_points,             // numel_x
+                             max_num_points_full_blocks,  // numel_x_full_blocks (extended to full blocks)
+                             blockDim.y,                  // numel_y (number of samples per block)
+                             static_cast<dtype>(0.0)      // offset
+    );
+
+    // 3) Sample only for valid samples, using their (possibly shared or
+    //    external) accumulated distances.
+    if (is_active_sample) {
+        // Get the points for the current sample (use of global offset)
+        const dtype* points_sample = points + static_cast<size_t>(y_global) * max_num_points * num_dims;
+        // Get the distances for the current sample (use of block-local offset, as distances are stored in
+        // shared memory (or in an external buffer with `distances` pointing to the slice of this block))
+        const dtype* distances_sample = distances + y * max_num_points;
+        // Get the distances to sample at for the current sample (use of global offset)
+        const dtype* distances_to_sample_sample =
+            distances_to_sample + static_cast<size_t>(y_global) * max_num_distances_to_sample;
+        sample_distances<dtype>(points_sample, distances_sample, distances_to_sample_sample,
+                                curr_num_distances_to_sample, curr_num_points, num_dims,
+                                res_points + static_cast<size_t>(y_global) * max_num_distances_to_sample * num_dims,
+                                relative_distances);
+    }
+}
+
+/**
+ * @brief Sample the points at the distances (fixed-size batches).
+ * Kernel entry point that forwards to the common implementation with `use_variable_size_batch = false`.
+ * See `polyline_sampling_fully_shared_common` for implementation details.
+ */
+template <typename dtype, bool use_shared_distances>
+__global__ void polyline_sampling_fully_shared_kernel(dtype* points, dtype* distances_to_sample,
+                                                      dtype* res_points, int num_points,
+                                                      int num_points_full_blocks, int num_dims,
+                                                      int num_distances_to_sample, int num_samples,
+                                                      bool relative_distances, dtype* distance_buffer_ext) {
+    polyline_sampling_fully_shared_common<dtype, int, use_shared_distances, false>(
+        points, distances_to_sample, res_points,
+        num_points,              // max_num_points (every polyline has this many points)
+        num_points_full_blocks,  // max_num_points_full_blocks
+        num_dims,
+        num_distances_to_sample,  // max_num_distances_to_sample (same for every polyline)
+        num_samples,
+        /*sample_sizes_points=*/nullptr,
+        /*sample_sizes_distances_to_sample=*/nullptr, relative_distances, distance_buffer_ext);
+}
+
+// Variable-size batch kernel entry point: forwards to `polyline_sampling_fully_shared_common` with per-sample sizes.
+template <typename dtype, typename sample_size_dtype, bool use_shared_distances>
+__global__ void polyline_sampling_fully_shared_var_batch_kernel(
+    dtype* points, dtype* distances_to_sample, dtype* res_points, int max_num_points,
+    int max_num_points_full_blocks, int num_dims, int max_num_distances_to_sample, int num_samples,
+    sample_size_dtype* sample_sizes_points, sample_size_dtype* sample_sizes_distances_to_sample,
+    bool relative_distances, dtype* distance_buffer_ext) {
+    polyline_sampling_fully_shared_common<dtype, sample_size_dtype, use_shared_distances, true>(
+        points, distances_to_sample, res_points, max_num_points, max_num_points_full_blocks, num_dims,
+        max_num_distances_to_sample, num_samples, sample_sizes_points, sample_sizes_distances_to_sample,
+        relative_distances, distance_buffer_ext);
+}
+
+template <typename dtype, typename sample_size_dtype, bool use_variable_size_batch>
+__device__ __forceinline__ void polyline_lengths_common(dtype* points, dtype* lengths, int max_num_points,
+                                                        int num_dims, int num_samples,
+                                                        sample_size_dtype* sample_sizes_points,
+                                                        dtype* reduction_buffer) {
+    const int x = threadIdx.x;
+    const int y_global = blockIdx.y * blockDim.y + threadIdx.y;
+    const bool is_active_sample = (y_global < num_samples);
+    // Per-thread partial length of one polyline; the offset into `points` is widened to size_t to avoid 32-bit overflow.
+    int curr_num_points = 0;
+    dtype local_length = static_cast<dtype>(0.0);
+    if (is_active_sample) {
+        curr_num_points = use_variable_size_batch ? sample_sizes_points[y_global] : max_num_points;
+        const dtype* points_sample = points + static_cast<size_t>(y_global) * max_num_points * num_dims;
+        for (int i = x; i < curr_num_points - 1; i += blockDim.x) {
+            local_length += compute_segment_length_common<dtype, dtype>(points_sample, i, num_dims);
+        }
+    }
+    // Every thread of the block (also rows beyond the batch, with a zero partial sum) must reach the reduction: it syncs the block.
+    const int num_warps_per_sample = (blockDim.x + 31) / 32;
+    const dtype length = sample_reduce_sum<dtype>(local_length, num_warps_per_sample, reduction_buffer);
+    if (is_active_sample && x == 0) {
+        lengths[y_global] = curr_num_points == 0 ? polyline_nan<dtype>() : length;  // empty polyline -> NaN
+    }
+}
+
+template <typename dtype>  // Kernel entry point: one length per polyline for fixed-size batches.
+__global__ void polyline_lengths_kernel(dtype* points, dtype* lengths, int num_points, int num_dims,
+                                        int num_samples) {
+    extern __shared__ uint8_t shared_mem[];  // dynamic shared memory, used as scratch for the reduction
+    dtype* reduction_buffer = reinterpret_cast<dtype*>(shared_mem);
+    polyline_lengths_common<dtype, int, false>(points, lengths, num_points, num_dims, num_samples,
+                                               /*sample_sizes_points=*/nullptr, reduction_buffer);
+}
+
+template <typename dtype, typename sample_size_dtype>  // Kernel entry point: one length per polyline, per-sample point counts.
+__global__ void polyline_lengths_var_batch_kernel(dtype* points, dtype* lengths, int max_num_points,
+                                                  int num_dims, int num_samples,
+                                                  sample_size_dtype* sample_sizes_points) {
+    extern __shared__ uint8_t shared_mem[];  // dynamic shared memory, used as scratch for the reduction
+    dtype* reduction_buffer = reinterpret_cast<dtype*>(shared_mem);
+    polyline_lengths_common<dtype, sample_size_dtype, true>(
+        points, lengths, max_num_points, num_dims, num_samples, sample_sizes_points, reduction_buffer);
+}
+
+}  // namespace polyline
+
+#endif  // LANE_HELPERS_POLYLINE_KERNELS_CUH
diff --git a/packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh b/packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh
new file mode 100644
index 0000000..2668917
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/include/polyline_shared_memory_config.cuh
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LANE_HELPERS_POLYLINE_SHARED_MEMORY_CONFIG_CUH
+#define LANE_HELPERS_POLYLINE_SHARED_MEMORY_CONFIG_CUH
+
+#include <cstddef>
+#include <mutex>
+#include <stdexcept>
+
+#include <cuda_runtime.h>
+
+#include "helper_macros.cuh"
+#include "polyline_kernels.cuh"
+
+namespace polyline {
+
+static constexpr int MAX_CACHED_CUDA_DEVICES = 64;
+
+static void check_non_negative_cuda_device(int device) {
+    // Guard for code that uses the device index as an array index: negative values are rejected.
+    if (device >= 0) return;
+    throw std::runtime_error("CUDA device index must be non-negative.");
+}
+
+static size_t query_polyline_max_shared_full_for_device(int device) {
+    // Largest per-block shared-memory size reported for `device`: the opt-in limit when the
+    // device reports one (non-zero), otherwise the default per-block limit.
+    cudaDeviceProp device_props;
+    CUDA_CHECK(cudaGetDeviceProperties(&device_props, device));
+    const bool has_optin_limit = device_props.sharedMemPerBlockOptin != 0;
+    return static_cast<size_t>(has_optin_limit ? device_props.sharedMemPerBlockOptin
+                                               : device_props.sharedMemPerBlock);
+}
+
+static size_t polyline_max_shared_full_for_device(int device) {
+    // Per-device cache of the queried limit; each slot is filled at most once (std::call_once).
+    static size_t cached_limits[MAX_CACHED_CUDA_DEVICES] = {};
+    static std::once_flag slot_filled[MAX_CACHED_CUDA_DEVICES];
+
+    check_non_negative_cuda_device(device);
+    if (device < MAX_CACHED_CUDA_DEVICES) {
+        size_t& slot = cached_limits[device];
+        std::call_once(slot_filled[device],
+                       [&slot, device]() { slot = query_polyline_max_shared_full_for_device(device); });
+        return slot;
+    }
+
+    // Device index beyond the cache capacity: query the device properties on every call
+    // instead of caching the result.
+    return query_polyline_max_shared_full_for_device(device);
+}
+
+template <typename dtype, bool use_shared_distances>  // Sets the shared-memory attributes of the fixed-size sampling kernel.
+static void configure_polyline_sampling_kernel(size_t max_shared_full) {
+    CUDA_CHECK(cudaFuncSetAttribute(polyline_sampling_fully_shared_kernel<dtype, use_shared_distances>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    static_cast<int>(max_shared_full)));  // attribute value is passed as int
+    CUDA_CHECK(cudaFuncSetAttribute(polyline_sampling_fully_shared_kernel<dtype, use_shared_distances>,
+                                    cudaFuncAttributePreferredSharedMemoryCarveout, 100));  // 100: presumably percent, i.e. prefer max. shared memory
+}
+
+template <typename dtype, bool use_shared_distances>
+static void configure_polyline_sampling_kernel_once(int device, size_t max_shared_full) {
+    static std::once_flag configured_devices[MAX_CACHED_CUDA_DEVICES];
+    // One flag per device index (and per template instantiation): cached devices are configured only once.
+    check_non_negative_cuda_device(device);
+    if (device >= MAX_CACHED_CUDA_DEVICES) {  // no flag available for this index: configure on every call
+        configure_polyline_sampling_kernel<dtype, use_shared_distances>(max_shared_full);
+        return;
+    }
+    // NOTE(review): `device` only selects the flag; presumably it must be the current CUDA device -- confirm at call site.
+    std::call_once(configured_devices[device], [max_shared_full]() {
+        configure_polyline_sampling_kernel<dtype, use_shared_distances>(max_shared_full);
+    });
+}
+
+template <typename dtype, typename sample_size_dtype, bool use_shared_distances>  // Variable-size batch counterpart.
+static void configure_polyline_sampling_var_batch_kernel(size_t max_shared_full) {
+    CUDA_CHECK(cudaFuncSetAttribute(
+        polyline_sampling_fully_shared_var_batch_kernel<dtype, sample_size_dtype, use_shared_distances>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, static_cast<int>(max_shared_full)));  // value is passed as int
+    CUDA_CHECK(cudaFuncSetAttribute(
+        polyline_sampling_fully_shared_var_batch_kernel<dtype, sample_size_dtype, use_shared_distances>,
+        cudaFuncAttributePreferredSharedMemoryCarveout, 100));  // 100: presumably percent, i.e. prefer max. shared memory
+}
+
+template <typename dtype, typename sample_size_dtype, bool use_shared_distances>
+static void configure_polyline_sampling_var_batch_kernel_once(int device, size_t max_shared_full) {
+    static std::once_flag configured_devices[MAX_CACHED_CUDA_DEVICES];
+    // One flag per device index (and per template instantiation): cached devices are configured only once.
+    check_non_negative_cuda_device(device);
+    if (device >= MAX_CACHED_CUDA_DEVICES) {  // no flag available for this index: configure on every call
+        configure_polyline_sampling_var_batch_kernel<dtype, sample_size_dtype, use_shared_distances>(
+            max_shared_full);
+        return;
+    }
+    // NOTE(review): `device` only selects the flag; presumably it must be the current CUDA device -- confirm at call site.
+    std::call_once(configured_devices[device], [max_shared_full]() {
+        configure_polyline_sampling_var_batch_kernel<dtype, sample_size_dtype, use_shared_distances>(
+            max_shared_full);
+    });
+}
+
+}  // namespace polyline
+
+#endif  // LANE_HELPERS_POLYLINE_SHARED_MEMORY_CONFIG_CUH
diff --git a/packages/lane_helpers/ext_impl/polyline/src/polyline.cpp b/packages/lane_helpers/ext_impl/polyline/src/polyline.cpp
new file mode 100644
index 0000000..a0d82a3
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/src/polyline.cpp
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <pybind11/pybind11.h>
+#include <cstdint>
+#include <limits>
+
+#include <c10/util/Half.h>
+#include <c10/util/BFloat16.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/torch.h>
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include "polyline.cuh"
+#include "helper_macros.cuh"
+
+//#define PROFILE_AND_SYNC
+
+#ifdef PROFILE_AND_SYNC
+#include <nvtx3/nvToolsExt.h>
+#endif
+
+namespace polyline {
+
+#define CHECK_DEVICE(x) check_device(x, #x)
+#define CHECK_CONTIGUOUS(x) check_contiguous(x, #x)
+#define CHECK_TYPE(x) check_type(x, #x)
+#define CHECK_INPUT(x)   \
+    CHECK_DEVICE(x);     \
+    CHECK_CONTIGUOUS(x); \
+    CHECK_TYPE(x);
+inline void check_device(const at::Tensor& tensor, const char* description) {
+    TORCH_CHECK(tensor.is_cpu() || tensor.is_cuda(), description, " must be on CPU or CUDA");
+}
+
+inline void check_contiguous(const at::Tensor& tensor, const char* description) {
+    TORCH_CHECK(tensor.is_contiguous(), description, " must be contiguous");
+}
+
+// Require a floating dtype that is accepted for the tensor's device: CUDA tensors may be
+// float16, bfloat16, float32 or float64; all other tensors must be float32 or float64.
+inline void check_type(const at::Tensor& tensor, const char* description) {
+    const auto dtype = tensor.scalar_type();
+    const bool is_single_or_double = dtype == torch::kFloat32 || dtype == torch::kFloat64;
+
+    if (!tensor.is_cuda()) {
+        TORCH_CHECK(is_single_or_double, description, " must have dtype float32 or float64 on CPU");
+        return;
+    }
+
+    const bool is_reduced_precision = dtype == torch::kFloat16 || dtype == torch::kBFloat16;
+    TORCH_CHECK(is_single_or_double || is_reduced_precision, description,
+                " must have dtype float16, float32, float64, or bfloat16 on CUDA");
+}
+
+inline void check_same_device(const at::Tensor& lhs, const at::Tensor& rhs, const char* message) {
+    TORCH_CHECK(lhs.device() == rhs.device(), message);
+}
+
+// Require the per-polyline size tensor to hold int32 or int64 values.
+inline void check_sample_size_type(const at::Tensor& sample_sizes, const char* description) {
+    const auto dtype = sample_sizes.scalar_type();
+    const bool is_supported_size_type = dtype == at::kInt || dtype == at::kLong;
+    TORCH_CHECK(is_supported_size_type, description, " must have dtype int32 or int64");
+}
+
+// Verify that every entry of `sample_sizes` lies in [0, max_size].
+//
+// Both bounds are folded into a single reduction so that only one `item()` call (one
+// device-to-host read for a CUDA tensor) is needed, instead of two for valid input.
+// Fails a TORCH_CHECK, prefixed with `description`, if any entry is out of range.
+inline void check_sample_sizes(const at::Tensor& sample_sizes, int max_size, const char* description) {
+    if (sample_sizes.numel() == 0) {
+        // Nothing to validate.
+        return;
+    }
+    const at::Tensor out_of_range = at::logical_or(sample_sizes < 0, sample_sizes > max_size);
+    TORCH_CHECK(!out_of_range.any().item<bool>(), description, " values must be in [0, ", max_size, "]");
+}
+
+// Allocate the optional external distance scratch buffer as a 1D tensor of `size_elems` elements.
+//
+// The scratch memory is kept as a tensor so that PyTorch's stream-aware allocator owns it; a raw
+// cudaFree could race with the asynchronous custom kernel that uses the buffer.
+// Returns an undefined tensor when `size_elems` is 0, i.e. when no external buffer is needed.
+at::Tensor make_external_distance_buffer(size_t size_elems, const at::TensorOptions& options) {
+    const bool needs_buffer = size_elems != 0;
+    if (!needs_buffer) {
+        return at::Tensor();
+    }
+
+    // Tensor sizes are int64_t, so the element count has to be representable as one.
+    const size_t max_tensor_elems = static_cast<size_t>(std::numeric_limits<int64_t>::max());
+    TORCH_CHECK(size_elems <= max_tensor_elems,
+                "external polyline distance buffer is too large to allocate as a tensor");
+
+    return at::empty({static_cast<int64_t>(size_elems)}, options);
+}
+
+// Interpolate points along every polyline of a fixed-size batch at the given distances.
+//
+// points:             (batch, num_points, num_dims); CPU or CUDA; dtype as accepted by check_type.
+// distances:          (batch, num_distances); same dtype and device as `points`.
+// relative_distances: passed through unchanged to the CPU/CUDA implementation.
+// Returns a tensor of shape (batch, num_distances, num_dims) created with `distances.options()`.
+// Fails a TORCH_CHECK if a device, dtype or shape requirement is not met.
+at::Tensor polyline_interpolation(at::Tensor points, at::Tensor distances, bool relative_distances) {
+    CHECK_DEVICE(points);
+    CHECK_DEVICE(distances);
+    CHECK_TYPE(points);
+    CHECK_TYPE(distances);
+    TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, num_points, num_dims)");
+    TORCH_CHECK(distances.ndimension() == 2, "distances must have shape (batch, num_distances)");
+    TORCH_CHECK(points.size(0) == distances.size(0),
+                "points and distances must contain the same number of polylines");
+    TORCH_CHECK(points.scalar_type() == distances.scalar_type(),
+                "points and distances must have the same dtype");
+    check_same_device(points, distances, "points and distances must be on the same device");
+
+    // NOTE(review): tensor sizes are int64_t and are narrowed to int here; confirm that callers
+    // never pass dimensions beyond the int range.
+    const int num_samples = points.size(0);
+    const int num_points = points.size(1);
+    const int num_distances = distances.size(1);
+    const int num_dims = points.size(2);
+    // Result has shape (batch, num_distances, point_dim) and otherwise
+    // matches `distances` (device, dtype).
+    auto res = at::empty({num_samples, num_distances, num_dims}, distances.options());
+    if (num_samples == 0 || num_distances == 0) {
+        // Nothing to compute. An empty batch must not reach the CUDA launch: the grid is sized
+        // from the number of polylines, and a zero-sized grid is an invalid launch configuration.
+        return res;
+    }
+    const at::Tensor points_contiguous = points.contiguous();
+    const at::Tensor distances_contiguous = distances.contiguous();
+
+    if (points.is_cuda()) {
+        AT_DISPATCH_FLOATING_TYPES_AND2(
+            at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_interpolation", [&] {
+                // Run on the device that owns the inputs, using that device's current stream.
+                const int device = points.get_device();
+                c10::cuda::CUDAGuard device_guard(static_cast<c10::DeviceIndex>(device));
+                const auto stream = at::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(device));
+                at::cuda::CUDAStreamGuard stream_guard(stream);
+                const auto cfg = make_polyline_launch_config<scalar_t>(num_points, num_samples, device);
+                // Allocate under the same stream used for the kernel launch so the caching allocator
+                // does not recycle this temporary scratch buffer before queued kernel work consumes it.
+                const at::Tensor distance_buffer_ext =
+                    make_external_distance_buffer(cfg.distance_buffer_ext_size_elems, points.options());
+                scalar_t* distance_buffer_ext_ptr =
+                    distance_buffer_ext.defined() ? distance_buffer_ext.data_ptr<scalar_t>() : nullptr;
+                polyline_interpolation<scalar_t>(points_contiguous.data_ptr<scalar_t>(), num_points, num_dims,
+                                                 distances_contiguous.data_ptr<scalar_t>(), num_distances,
+                                                 res.data_ptr<scalar_t>(), num_samples, relative_distances,
+                                                 device, cfg, distance_buffer_ext_ptr, stream.stream());
+                CUDA_CHECK_LAST();
+            });
+    } else {
+        AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "polyline_interpolation_cpu", [&] {
+            polyline_interpolation_cpu<scalar_t>(points_contiguous.data_ptr<scalar_t>(), num_points, num_dims,
+                                                 distances_contiguous.data_ptr<scalar_t>(), num_distances,
+                                                 res.data_ptr<scalar_t>(), num_samples, relative_distances);
+        });
+    }
+
+    return res;
+}
+
+// Compute the length of every polyline of a fixed-size batch.
+//
+// points: (batch, num_points, num_dims); CPU or CUDA; dtype as accepted by check_type.
+// Returns a tensor of shape (batch,) created with `points.options()`.
+// Fails a TORCH_CHECK if the device, dtype or shape requirement is not met.
+at::Tensor polyline_lengths(at::Tensor points) {
+    CHECK_DEVICE(points);
+    CHECK_TYPE(points);
+    TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, num_points, num_dims)");
+
+    // NOTE(review): tensor sizes are int64_t and are narrowed to int here; confirm that callers
+    // never pass dimensions beyond the int range.
+    const int num_samples = points.size(0);
+    const int num_points = points.size(1);
+    const int num_dims = points.size(2);
+    auto res = at::empty({num_samples}, points.options());
+    if (num_samples == 0) {
+        // Empty batch: nothing to compute, and a zero-sized CUDA grid would be an invalid launch.
+        return res;
+    }
+    const at::Tensor points_contiguous = points.contiguous();
+
+    if (points.is_cuda()) {
+        AT_DISPATCH_FLOATING_TYPES_AND2(
+            at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_lengths", [&] {
+                // Launch on the device that owns `points` (which need not be the current device)
+                // and on that device's current stream.
+                const auto device = static_cast<c10::DeviceIndex>(points.get_device());
+                c10::cuda::CUDAGuard device_guard(device);
+                const cudaStream_t stream = at::cuda::getCurrentCUDAStream(device);
+                polyline_lengths<scalar_t>(points_contiguous.data_ptr<scalar_t>(),  // points
+                                           num_points,                              // num_points
+                                           num_dims,                                // num_dims
+                                           res.data_ptr<scalar_t>(),                // lengths
+                                           num_samples,                             // num_samples
+                                           stream                                   // stream
+                );
+                CUDA_CHECK_LAST();
+            });
+    } else {
+        AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "polyline_lengths_cpu", [&] {
+            polyline_lengths_cpu<scalar_t>(points_contiguous.data_ptr<scalar_t>(),  // points
+                                           num_points,                              // num_points
+                                           num_dims,                                // num_dims
+                                           res.data_ptr<scalar_t>(),                // lengths
+                                           num_samples                              // num_samples
+            );
+        });
+    }
+
+    return res;
+}
+
+// Interpolate points along every polyline of a padded, variable-length batch.
+//
+// points:                           (batch, max_num_points, num_dims); CPU or CUDA.
+// distances:                        (batch, max_num_distances); same dtype and device as `points`.
+// sample_sizes_points:              1D, one int32/int64 count per polyline, each in
+//                                   [0, max_num_points]; on the same device as `points`.
+// sample_sizes_distances_to_sample: 1D, one count per polyline, each in [0, max_num_distances];
+//                                   same dtype as `sample_sizes_points`, same device as `distances`.
+// relative_distances:               passed through unchanged to the CPU/CUDA implementation.
+// Returns a tensor of shape (batch, max_num_distances, num_dims) created with
+// `distances.options()`. Fails a TORCH_CHECK if any of the requirements above is violated.
+at::Tensor polyline_interpolation_var_size_batch(at::Tensor points, at::Tensor distances,
+                                                 at::Tensor sample_sizes_points,
+                                                 at::Tensor sample_sizes_distances_to_sample,
+                                                 bool relative_distances) {
+    CHECK_DEVICE(points);
+    CHECK_DEVICE(distances);
+    check_device(sample_sizes_points, "points.sample_sizes");
+    check_device(sample_sizes_distances_to_sample, "distances.sample_sizes");
+    CHECK_TYPE(points);
+    CHECK_TYPE(distances);
+    check_sample_size_type(sample_sizes_points, "points.sample_sizes");
+    check_sample_size_type(sample_sizes_distances_to_sample, "distances.sample_sizes");
+
+    TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, max_num_points, num_dims)");
+    TORCH_CHECK(distances.ndimension() == 2, "distances must have shape (batch, max_num_distances)");
+    TORCH_CHECK(points.size(0) == distances.size(0),
+                "points and distances must contain the same number of polylines");
+    TORCH_CHECK(points.scalar_type() == distances.scalar_type(),
+                "points and distances must have the same dtype");
+    check_same_device(points, distances, "points and distances must be on the same device");
+    TORCH_CHECK(sample_sizes_points.scalar_type() == sample_sizes_distances_to_sample.scalar_type(),
+                "points.sample_sizes and distances.sample_sizes must have the same dtype "
+                "(both int32 or both int64)");
+    check_same_device(sample_sizes_points, points,
+                      "points.sample_sizes must be on the same device as points");
+    check_same_device(sample_sizes_distances_to_sample, distances,
+                      "distances.sample_sizes must be on the same device as distances");
+    TORCH_CHECK(sample_sizes_points.ndimension() == 1, "points.sample_sizes must be a 1D tensor");
+    TORCH_CHECK(sample_sizes_distances_to_sample.ndimension() == 1,
+                "distances.sample_sizes must be a 1D tensor");
+
+    // NOTE(review): tensor sizes are int64_t and are narrowed to int here; confirm that callers
+    // never pass dimensions beyond the int range.
+    const int num_samples = points.size(0);
+    const int max_num_points = points.size(1);
+    const int max_num_distances = distances.size(1);
+    const int num_dims = points.size(2);
+    // Result has shape (batch, num_distances, point_dim) and otherwise
+    // matches `distances` (device, dtype).
+    auto res = at::empty({num_samples, max_num_distances, num_dims}, distances.options());
+
+    TORCH_CHECK(sample_sizes_points.size(0) == num_samples,
+                "points.sample_sizes must contain one count per polyline in points");
+    TORCH_CHECK(sample_sizes_distances_to_sample.size(0) == num_samples,
+                "distances.sample_sizes must contain one count per polyline in distances");
+    check_sample_sizes(sample_sizes_points, max_num_points, "points.sample_sizes");
+    check_sample_sizes(sample_sizes_distances_to_sample, max_num_distances, "distances.sample_sizes");
+    if (num_samples == 0 || max_num_distances == 0) {
+        // Nothing to compute. An empty batch must not reach the CUDA launch: the grid is sized
+        // from the number of polylines, and a zero-sized grid is an invalid launch configuration.
+        return res;
+    }
+
+    const at::Tensor points_contiguous = points.contiguous();
+    const at::Tensor distances_contiguous = distances.contiguous();
+    const at::Tensor sample_sizes_points_contiguous = sample_sizes_points.contiguous();
+    const at::Tensor sample_sizes_distances_to_sample_contiguous =
+        sample_sizes_distances_to_sample.contiguous();
+
+    // Generic lambda: the (value-less) tag argument only selects the integer type of the size tensors.
+    auto launch = [&](auto sample_size_type_tag) {
+        using sample_size_t = decltype(sample_size_type_tag);
+        if (points.is_cuda()) {
+            AT_DISPATCH_FLOATING_TYPES_AND2(
+                at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_interpolation_var_size_batch", [&] {
+                    // Run on the device that owns the inputs, using that device's current stream.
+                    const int device = points.get_device();
+                    c10::cuda::CUDAGuard device_guard(static_cast<c10::DeviceIndex>(device));
+                    const auto stream = at::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(device));
+                    at::cuda::CUDAStreamGuard stream_guard(stream);
+                    const auto cfg =
+                        make_polyline_launch_config<scalar_t>(max_num_points, num_samples, device);
+                    // Allocate under the same stream used for the kernel launch so the caching allocator
+                    // does not recycle this temporary scratch buffer before queued kernel work consumes it.
+                    const at::Tensor distance_buffer_ext =
+                        make_external_distance_buffer(cfg.distance_buffer_ext_size_elems, points.options());
+                    scalar_t* distance_buffer_ext_ptr =
+                        distance_buffer_ext.defined() ? distance_buffer_ext.data_ptr<scalar_t>() : nullptr;
+                    polyline_interpolation_var_size_batch<scalar_t, sample_size_t>(
+                        points_contiguous.data_ptr<scalar_t>(),                    // points
+                        max_num_points,                                            // max_num_points
+                        num_dims,                                                  // num_dims
+                        distances_contiguous.data_ptr<scalar_t>(),                 // distances
+                        max_num_distances,                                         // num_distances
+                        res.data_ptr<scalar_t>(),                                  // result_points
+                        num_samples,                                               // num_samples
+                        sample_sizes_points_contiguous.data_ptr<sample_size_t>(),  // sample_sizes_points
+                        sample_sizes_distances_to_sample_contiguous
+                            .data_ptr<sample_size_t>(),  // sample sizes distances
+                        relative_distances,              // relative_distances
+                        device,                          // device
+                        cfg,                             // launch config
+                        distance_buffer_ext_ptr,         // distance_buffer_ext
+                        stream.stream()                  // stream
+                    );
+                    CUDA_CHECK_LAST();
+                });
+        } else {
+            AT_DISPATCH_FLOATING_TYPES(
+                points.scalar_type(), "polyline_interpolation_var_size_batch_cpu", [&] {
+                    polyline_interpolation_var_size_batch_cpu<scalar_t, sample_size_t>(
+                        points_contiguous.data_ptr<scalar_t>(),                    // points
+                        max_num_points,                                            // max_num_points
+                        num_dims,                                                  // num_dims
+                        distances_contiguous.data_ptr<scalar_t>(),                 // distances
+                        max_num_distances,                                         // num_distances
+                        res.data_ptr<scalar_t>(),                                  // result_points
+                        num_samples,                                               // num_samples
+                        sample_sizes_points_contiguous.data_ptr<sample_size_t>(),  // sample_sizes_points
+                        sample_sizes_distances_to_sample_contiguous
+                            .data_ptr<sample_size_t>(),  // sample sizes distances
+                        relative_distances               // relative_distances
+                    );
+                });
+        }
+    };
+    // Both size tensors were checked to share one dtype, so a single test selects the type.
+    if (sample_sizes_points.scalar_type() == at::kInt) {
+        launch(int32_t{});
+    } else {
+        launch(int64_t{});
+    }
+
+    return res;
+}
+
+// Compute the length of every polyline of a padded, variable-length batch.
+//
+// points:              (batch, max_num_points, num_dims); CPU or CUDA; dtype as accepted by
+//                      check_type.
+// sample_sizes_points: 1D, one int32/int64 count per polyline, each in [0, max_num_points];
+//                      on the same device as `points`.
+// Returns a tensor of shape (batch,) created with `points.options()`.
+// Fails a TORCH_CHECK if any of the requirements above is violated.
+at::Tensor polyline_lengths_var_size_batch(at::Tensor points, at::Tensor sample_sizes_points) {
+    CHECK_DEVICE(points);
+    check_device(sample_sizes_points, "points.sample_sizes");
+    CHECK_TYPE(points);
+    check_sample_size_type(sample_sizes_points, "points.sample_sizes");
+
+    TORCH_CHECK(points.ndimension() == 3, "points must have shape (batch, max_num_points, num_dims)");
+    TORCH_CHECK(sample_sizes_points.ndimension() == 1, "points.sample_sizes must be a 1D tensor");
+    check_same_device(sample_sizes_points, points,
+                      "points.sample_sizes must be on the same device as points");
+
+    // NOTE(review): tensor sizes are int64_t and are narrowed to int here; confirm that callers
+    // never pass dimensions beyond the int range.
+    const int num_samples = points.size(0);
+    const int max_num_points = points.size(1);
+    const int num_dims = points.size(2);
+    auto res = at::empty({num_samples}, points.options());
+
+    TORCH_CHECK(sample_sizes_points.size(0) == num_samples,
+                "points.sample_sizes must contain one count per polyline in points");
+    check_sample_sizes(sample_sizes_points, max_num_points, "points.sample_sizes");
+    if (num_samples == 0) {
+        // Empty batch: nothing to compute, and a zero-sized CUDA grid would be an invalid launch.
+        return res;
+    }
+
+    const at::Tensor points_contiguous = points.contiguous();
+    const at::Tensor sample_sizes_points_contiguous = sample_sizes_points.contiguous();
+
+    // Generic lambda: the (value-less) tag argument only selects the integer type of the size tensor.
+    auto launch = [&](auto sample_size_type_tag) {
+        using sample_size_t = decltype(sample_size_type_tag);
+        if (points.is_cuda()) {
+            AT_DISPATCH_FLOATING_TYPES_AND2(
+                at::kHalf, at::kBFloat16, points.scalar_type(), "polyline_lengths_var_size_batch", [&] {
+                    // Launch on the device that owns `points` (which need not be the current
+                    // device) and on that device's current stream.
+                    const auto device = static_cast<c10::DeviceIndex>(points.get_device());
+                    c10::cuda::CUDAGuard device_guard(device);
+                    const cudaStream_t stream = at::cuda::getCurrentCUDAStream(device);
+                    polyline_lengths_var_size_batch<scalar_t, sample_size_t>(
+                        points_contiguous.data_ptr<scalar_t>(),                    // points
+                        max_num_points,                                            // max_num_points
+                        num_dims,                                                  // num_dims
+                        res.data_ptr<scalar_t>(),                                  // lengths
+                        num_samples,                                               // num_samples
+                        sample_sizes_points_contiguous.data_ptr<sample_size_t>(),  // sample_sizes_points
+                        stream                                                     // stream
+                    );
+                    CUDA_CHECK_LAST();
+                });
+        } else {
+            AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "polyline_lengths_var_size_batch_cpu", [&] {
+                polyline_lengths_var_size_batch_cpu<scalar_t, sample_size_t>(
+                    points_contiguous.data_ptr<scalar_t>(),                   // points
+                    max_num_points,                                           // max_num_points
+                    num_dims,                                                 // num_dims
+                    res.data_ptr<scalar_t>(),                                 // lengths
+                    num_samples,                                              // num_samples
+                    sample_sizes_points_contiguous.data_ptr<sample_size_t>()  // sample_sizes_points
+                );
+            });
+        }
+    };
+    if (sample_sizes_points.scalar_type() == at::kInt) {
+        launch(int32_t{});
+    } else {
+        launch(int64_t{});
+    }
+
+    return res;
+}
+
+}  // namespace polyline
+
+namespace py = pybind11;
+using namespace polyline;
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    // The tensor-level entry points share their names with the templated backend launchers, so
+    // each function address is pinned to the tensor overload via an explicit pointer type.
+    using InterpolationFn = at::Tensor (*)(at::Tensor, at::Tensor, bool);
+    using LengthsFn = at::Tensor (*)(at::Tensor);
+    using InterpolationVarFn = at::Tensor (*)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, bool);
+    using LengthsVarFn = at::Tensor (*)(at::Tensor, at::Tensor);
+
+    m.doc() = "Lane helpers polyline interpolation bindings";
+
+    m.def("polyline_interpolation", static_cast<InterpolationFn>(&polyline_interpolation),
+          py::arg("points"), py::arg("distances"), py::arg("relative") = false,
+          "Interpolate points along polylines at given distances.");
+
+    m.def("_polyline_lengths", static_cast<LengthsFn>(&polyline_lengths), py::arg("points"),
+          "Internal tensor-only entry point for fixed-size polyline length computation.");
+
+    m.def("_polyline_interpolation_var_size_batch",
+          static_cast<InterpolationVarFn>(&polyline_interpolation_var_size_batch), py::arg("points"),
+          py::arg("distances"), py::arg("sample_sizes_points"),
+          py::arg("sample_sizes_distances_to_sample"), py::arg("relative") = false,
+          "Internal tensor-only entry point for variable-length polyline interpolation.");
+
+    m.def("_polyline_lengths_var_size_batch",
+          static_cast<LengthsVarFn>(&polyline_lengths_var_size_batch), py::arg("points"),
+          py::arg("sample_sizes_points"),
+          "Internal tensor-only entry point for variable-length polyline length computation.");
+}
\ No newline at end of file
diff --git a/packages/lane_helpers/ext_impl/polyline/src/polyline.cu b/packages/lane_helpers/ext_impl/polyline/src/polyline.cu
new file mode 100644
index 0000000..7d62499
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/src/polyline.cu
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <limits>
+#include <cstdint>
+
+#include <cuda_runtime.h>
+#include <cub/cub.cuh>
+#include <c10/util/Half.h>
+#include <c10/util/BFloat16.h>
+
+#include "helper_macros.cuh"
+
+#include "polyline.cuh"
+#include "polyline_kernels.cuh"
+#include "polyline_shared_memory_config.cuh"
+
+namespace polyline {
+
+// Return the largest power of two that is <= n.
+// Returns 0 for every n <= 0 (and 1 for n == 1).
+static inline int last_power_of_2(int n) {
+    if (n < 1) {
+        return 0;
+    }
+    // Keep doubling while the doubled value still fits into n. Comparing against n / 2
+    // (rather than doubling first and comparing against n) keeps the candidate from overflowing.
+    const int half = n / 2;
+    int candidate = 1;
+    while (candidate <= half) {
+        candidate *= 2;
+    }
+    return candidate;
+}
+
+// Launch parameters for the polyline length kernels.
+// `dtype` does not appear in any member; presumably it only ties a configuration to the element
+// type it was sized for.
+template <typename dtype>
+struct PolylineLengthLaunchConfig {
+    dim3 block_dim;          // threads per block, passed as the block dimension of the launch
+    dim3 grid_dim;           // blocks per grid, passed as the grid dimension of the launch
+    size_t shared_mem_size;  // dynamic shared memory per block, in bytes
+};
+
+// Choose blockDim.x for a block that holds `num_samples_per_block` rows (blockDim.y): the x-share
+// of a 1024-thread budget, rounded down to a multiple of the warp size (32) and then clamped to
+// at least one full warp.
+static int polyline_launch_threads_x(int num_samples_per_block) {
+    const int max_threads_per_block = 1024;
+    const int warp_size = 32;
+
+    const int x_budget = max_threads_per_block / num_samples_per_block;
+    // Drop the remainder so that only whole warps are counted.
+    const int whole_warp_threads = x_budget - (x_budget % warp_size);
+    return whole_warp_threads < warp_size ? warp_size : whole_warp_threads;
+}
+
+// Build the sampling launch configuration for `num_samples_per_block` polylines per block
+// (blockDim.y), sized for the path that keeps the distances in shared memory.
+//
+// The returned configuration has `use_shared_distances = true`, no external distance buffer
+// (`distance_buffer_ext_size_elems = 0`) and `max_shared_full = 0`; it does not check whether
+// `shared_mem_size` actually fits on the device.
+template <typename dtype>
+static PolylineLaunchConfig<dtype> make_polyline_launch_config_for_y(int num_points, int num_samples,
+                                                                     int num_samples_per_block) {
+    const int threads_x = polyline_launch_threads_x(num_samples_per_block);
+
+    const dim3 block_dim(threads_x, num_samples_per_block, 1);
+    // One block column; enough block rows to cover all samples (ceiling division).
+    const dim3 grid_dim(1, (num_samples + block_dim.y - 1) / block_dim.y, 1);
+    // num_points rounded up to the next multiple of blockDim.x.
+    const int num_points_full_blocks = ((num_points + block_dim.x - 1) / block_dim.x) * block_dim.x;
+    const int num_warps_per_sample = (block_dim.x + 31) / 32;
+    // Per block row: one scratch element per warp plus one additional element.
+    const size_t scratch_buffer_size_elems = block_dim.y * (num_warps_per_sample + 1);
+    // Per block row: one distance element per polyline point.
+    const size_t distances_buffer_size_elems_shared = static_cast<size_t>(block_dim.y) * num_points;
+
+    PolylineLaunchConfig<dtype> cfg;
+    cfg.block_dim = block_dim;
+    cfg.grid_dim = grid_dim;
+    cfg.num_points_full_blocks = num_points_full_blocks;
+    // Dynamic shared memory in bytes: distance buffer plus scratch buffer.
+    cfg.shared_mem_size = (distances_buffer_size_elems_shared + scratch_buffer_size_elems) * sizeof(dtype);
+    cfg.distance_buffer_ext_size_elems = 0;
+    cfg.use_shared_distances = true;
+    cfg.max_shared_full = 0;
+    return cfg;
+}
+
+// Number of elements of the external distance buffer: `num_points` entries for every row
+// covered by the launch grid (gridDim.y * blockDim.y rows).
+template <typename dtype>
+static size_t polyline_external_distance_buffer_size_elems(const PolylineLaunchConfig<dtype>& cfg,
+                                                           int num_points) {
+    const size_t rows_covered_by_grid = static_cast<size_t>(cfg.grid_dim.y) * cfg.block_dim.y;
+    return rows_covered_by_grid * num_points;
+}
+
+// Shared-memory size in bytes of the scratch area alone: for each block row (blockDim.y),
+// one element per warp of that row plus one additional element.
+template <typename dtype>
+static size_t polyline_scratch_shared_mem_size(const PolylineLaunchConfig<dtype>& cfg) {
+    const int warps_per_row = (cfg.block_dim.x + 31) / 32;
+    const size_t block_rows = static_cast<size_t>(cfg.block_dim.y);
+    return block_rows * (warps_per_row + 1) * sizeof(dtype);
+}
+
+// Build the launch configuration for polyline sampling on `device`.
+//
+// The shared-memory path is kept whenever its footprint fits within the device limit reported by
+// polyline_max_shared_full_for_device() (the opt-in limit, `sharedMemPerBlockOptin`). Otherwise
+// the configuration falls back to an external distance buffer and only the scratch area stays in
+// shared memory.
+template <typename dtype>
+PolylineLaunchConfig<dtype> make_polyline_launch_config(int num_points, int num_samples, int device) {
+    // A single sample per block (blockDim.y == 1) lets blockDim.x use the whole thread block.
+    const int samples_per_block = 1;
+    PolylineLaunchConfig<dtype> cfg =
+        make_polyline_launch_config_for_y<dtype>(num_points, num_samples, samples_per_block);
+
+    const size_t max_shared_full = polyline_max_shared_full_for_device(device);
+    const bool fits_in_shared = cfg.shared_mem_size <= max_shared_full;
+    cfg.max_shared_full = max_shared_full;
+
+    if (!fits_in_shared) {
+        // Fallback: distances live in the external buffer, shared memory holds the scratch only.
+        cfg.use_shared_distances = false;
+        cfg.shared_mem_size = polyline_scratch_shared_mem_size(cfg);
+        cfg.distance_buffer_ext_size_elems = polyline_external_distance_buffer_size_elems(cfg, num_points);
+    }
+    return cfg;
+}
+
+// Build the launch configuration for the polyline length kernels.
+//
+// blockDim.y is the largest power of two that is <= min(num_samples, 32), but at least 1;
+// blockDim.x is derived from it by polyline_launch_threads_x(). The grid has one block column
+// and enough block rows to cover `num_samples`. Shared memory holds one element per warp of
+// every block row.
+template <typename dtype>
+static PolylineLengthLaunchConfig<dtype> make_polyline_length_launch_config(int num_samples) {
+    // With at least one warp (32 threads) in x, a 1024-thread block has room for 32 rows.
+    const int max_rows_per_block = 1024 / 32;
+    const int rows_wanted = min(num_samples, max_rows_per_block);
+    int rows_per_block = last_power_of_2(rows_wanted);
+    if (rows_per_block < 1) {
+        rows_per_block = 1;
+    }
+    const int threads_x = polyline_launch_threads_x(rows_per_block);
+
+    PolylineLengthLaunchConfig<dtype> cfg;
+    cfg.block_dim = dim3(threads_x, rows_per_block, 1);
+    // Ceiling division over blockDim.y.
+    cfg.grid_dim = dim3(1, (num_samples + cfg.block_dim.y - 1) / cfg.block_dim.y, 1);
+
+    const int warps_per_row = (cfg.block_dim.x + 31) / 32;
+    cfg.shared_mem_size = static_cast<size_t>(cfg.block_dim.y) * warps_per_row * sizeof(dtype);
+    return cfg;
+}
+
+// Launch the fixed-size polyline sampling kernel on `stream` with the precomputed `cfg`.
+//
+// If `cfg.use_shared_distances` is set, the <dtype, true> kernel variant is launched without an
+// external buffer (nullptr) after its shared-memory attributes have been configured for
+// `device`; otherwise the <dtype, false> variant is launched with `distance_buffer_ext`.
+// A failed launch is reported through CUDA_CHECK_LAST().
+template <typename dtype>
+void polyline_interpolation(dtype* points, int num_points, int num_dims, dtype* distances, int num_distances,
+                            dtype* result_points, int num_samples, bool relative_distances, int device,
+                            const PolylineLaunchConfig<dtype>& cfg, dtype* distance_buffer_ext,
+                            cudaStream_t stream) {
+    if (cfg.use_shared_distances) {
+        // Must precede the launch: the requested dynamic shared memory may exceed the default limit.
+        configure_polyline_sampling_kernel_once<dtype, true>(device, cfg.max_shared_full);
+        polyline_sampling_fully_shared_kernel<dtype, true>
+            <<<cfg.grid_dim, cfg.block_dim, cfg.shared_mem_size, stream>>>(
+                points, distances, result_points, num_points, cfg.num_points_full_blocks, num_dims,
+                num_distances, num_samples, relative_distances, nullptr);
+    } else {
+        // External-buffer variant; no attribute configuration is performed on this path.
+        polyline_sampling_fully_shared_kernel<dtype, false>
+            <<<cfg.grid_dim, cfg.block_dim, cfg.shared_mem_size, stream>>>(
+                points, distances, result_points, num_points, cfg.num_points_full_blocks, num_dims,
+                num_distances, num_samples, relative_distances, distance_buffer_ext);
+    }
+    CUDA_CHECK_LAST();
+}
+
+// Launch the fixed-size polyline length kernel on `stream`; the launch geometry is derived from
+// `num_samples`. A failed launch is reported through CUDA_CHECK_LAST().
+template <typename dtype>
+void polyline_lengths(dtype* points, int num_points, int num_dims, dtype* lengths, int num_samples,
+                      cudaStream_t stream) {
+    const PolylineLengthLaunchConfig<dtype> launch_cfg =
+        make_polyline_length_launch_config<dtype>(num_samples);
+    const dim3 grid = launch_cfg.grid_dim;
+    const dim3 block = launch_cfg.block_dim;
+    const size_t shared_bytes = launch_cfg.shared_mem_size;
+
+    polyline_lengths_kernel<dtype>
+        <<<grid, block, shared_bytes, stream>>>(points, lengths, num_points, num_dims, num_samples);
+    CUDA_CHECK_LAST();
+}
+
+// Launch the variable-length polyline sampling kernel on `stream` with the precomputed `cfg`.
+//
+// If `cfg.use_shared_distances` is set, the <..., true> kernel variant is launched without an
+// external buffer (nullptr) after its shared-memory attributes have been configured for
+// `device`; otherwise the <..., false> variant is launched with `distance_buffer_ext`.
+// `sample_sizes_points` and `sample_sizes_distances_to_sample` are forwarded to the kernel as is.
+// A failed launch is reported through CUDA_CHECK_LAST().
+template <typename dtype, typename sample_size_dtype>
+void polyline_interpolation_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* distances,
+                                           int num_distances, dtype* result_points, int num_samples,
+                                           sample_size_dtype* sample_sizes_points,
+                                           sample_size_dtype* sample_sizes_distances_to_sample,
+                                           bool relative_distances, int device,
+                                           const PolylineLaunchConfig<dtype>& cfg, dtype* distance_buffer_ext,
+                                           cudaStream_t stream) {
+    if (cfg.use_shared_distances) {
+        // Must precede the launch: the requested dynamic shared memory may exceed the default limit.
+        configure_polyline_sampling_var_batch_kernel_once<dtype, sample_size_dtype, true>(
+            device, cfg.max_shared_full);
+        polyline_sampling_fully_shared_var_batch_kernel<dtype, sample_size_dtype, true>
+            <<<cfg.grid_dim, cfg.block_dim, cfg.shared_mem_size, stream>>>(
+                points, distances, result_points, max_num_points, cfg.num_points_full_blocks, num_dims,
+                num_distances, num_samples, sample_sizes_points, sample_sizes_distances_to_sample,
+                relative_distances, nullptr);
+    } else {
+        // External-buffer variant; no attribute configuration is performed on this path.
+        polyline_sampling_fully_shared_var_batch_kernel<dtype, sample_size_dtype, false>
+            <<<cfg.grid_dim, cfg.block_dim, cfg.shared_mem_size, stream>>>(
+                points, distances, result_points, max_num_points, cfg.num_points_full_blocks, num_dims,
+                num_distances, num_samples, sample_sizes_points, sample_sizes_distances_to_sample,
+                relative_distances, distance_buffer_ext);
+    }
+    CUDA_CHECK_LAST();
+}
+
+// Launch the variable-length polyline length kernel on `stream`; the launch geometry is derived
+// from `num_samples`. A failed launch is reported through CUDA_CHECK_LAST().
+template <typename dtype, typename sample_size_dtype>
+void polyline_lengths_var_size_batch(dtype* points, int max_num_points, int num_dims, dtype* lengths,
+                                     int num_samples, sample_size_dtype* sample_sizes_points,
+                                     cudaStream_t stream) {
+    const PolylineLengthLaunchConfig<dtype> launch_cfg =
+        make_polyline_length_launch_config<dtype>(num_samples);
+    const dim3 grid = launch_cfg.grid_dim;
+    const dim3 block = launch_cfg.block_dim;
+    const size_t shared_bytes = launch_cfg.shared_mem_size;
+
+    polyline_lengths_var_batch_kernel<dtype, sample_size_dtype><<<grid, block, shared_bytes, stream>>>(
+        points, lengths, max_num_points, num_dims, num_samples, sample_sizes_points);
+    CUDA_CHECK_LAST();
+}
+
+// Explicit template instantiations.
+// Each helper macro below instantiates one of the templates defined in this file for a given
+// element type (and, for the variable-size variants, a given sample-size integer type).
+
+// Instantiates polyline_interpolation<DTYPE>.
+#define INSTANTIATE_POLYLINE_INTERPOLATION(DTYPE)                                          \
+    template void polyline_interpolation<DTYPE>(                                           \
+        DTYPE * points, int num_points, int num_dims, DTYPE* distances, int num_distances, \
+        DTYPE* result_points, int num_samples, bool relative_distances, int device,        \
+        const PolylineLaunchConfig<DTYPE>& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream);
+
+// Instantiates make_polyline_launch_config<DTYPE>.
+#define INSTANTIATE_POLYLINE_LAUNCH_CONFIG(DTYPE)                                                            \
+    template PolylineLaunchConfig<DTYPE> make_polyline_launch_config<DTYPE>(int num_points, int num_samples, \
+                                                                            int device);
+
+// Instantiates polyline_lengths<DTYPE>.
+#define INSTANTIATE_POLYLINE_LENGTHS(DTYPE)                                                             \
+    template void polyline_lengths<DTYPE>(DTYPE * points, int num_points, int num_dims, DTYPE* lengths, \
+                                          int num_samples, cudaStream_t stream);
+
+// Instantiates polyline_interpolation_var_size_batch<DTYPE, SAMPLE_SIZE_DTYPE>.
+#define INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH(DTYPE, SAMPLE_SIZE_DTYPE)               \
+    template void polyline_interpolation_var_size_batch<DTYPE, SAMPLE_SIZE_DTYPE>(                \
+        DTYPE * points, int max_num_points, int num_dims, DTYPE* distances, int num_distances,    \
+        DTYPE* result_points, int num_samples, SAMPLE_SIZE_DTYPE* sample_sizes_points,            \
+        SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances, int device, \
+        const PolylineLaunchConfig<DTYPE>& cfg, DTYPE* distance_buffer_ext, cudaStream_t stream);
+
+// Instantiates polyline_lengths_var_size_batch<DTYPE, SAMPLE_SIZE_DTYPE>.
+#define INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH(DTYPE, SAMPLE_SIZE_DTYPE)              \
+    template void polyline_lengths_var_size_batch<DTYPE, SAMPLE_SIZE_DTYPE>(               \
+        DTYPE * points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \
+        SAMPLE_SIZE_DTYPE* sample_sizes_points, cudaStream_t stream);
+
+// Instantiates every entry point for one element type; the variable-size variants are emitted
+// for both `int` and `int64_t` sample sizes.
+#define INSTANTIATE_POLYLINE_CUDA_DTYPE(DTYPE)                        \
+    INSTANTIATE_POLYLINE_LAUNCH_CONFIG(DTYPE)                         \
+    INSTANTIATE_POLYLINE_INTERPOLATION(DTYPE)                         \
+    INSTANTIATE_POLYLINE_LENGTHS(DTYPE)                               \
+    INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH(DTYPE, int)     \
+    INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH(DTYPE, int64_t) \
+    INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH(DTYPE, int)           \
+    INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH(DTYPE, int64_t)
+
+// Element types for which the CUDA entry points are instantiated.
+INSTANTIATE_POLYLINE_CUDA_DTYPE(float)
+INSTANTIATE_POLYLINE_CUDA_DTYPE(double)
+INSTANTIATE_POLYLINE_CUDA_DTYPE(c10::Half)
+INSTANTIATE_POLYLINE_CUDA_DTYPE(c10::BFloat16)
+
+// The helper macros are only needed for the instantiations above.
+#undef INSTANTIATE_POLYLINE_CUDA_DTYPE
+#undef INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH
+#undef INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH
+#undef INSTANTIATE_POLYLINE_LENGTHS
+#undef INSTANTIATE_POLYLINE_LAUNCH_CONFIG
+#undef INSTANTIATE_POLYLINE_INTERPOLATION
+}  // namespace polyline
\ No newline at end of file
diff --git a/packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp b/packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp
new file mode 100644
index 0000000..f0d7cd0
--- /dev/null
+++ b/packages/lane_helpers/ext_impl/polyline/src/polyline_cpu.cpp
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <vector>
+
+#include <ATen/AccumulateType.h>
+#include <ATen/Parallel.h>
+
+#include "polyline_common.cuh"
+
+namespace polyline {
+
+template <typename dtype>
+using cpu_acc_t = at::acc_type<dtype, false>;  // accumulation type ATen selects for dtype; `false` is the is_cuda argument, i.e. the CPU variant
+
+template <typename dtype>  // Writes the running sum of segment lengths into accum_distances[0 .. num_points - 1].
+static void compute_accumulated_distances_cpu(const dtype* points_sample, int num_points, int num_dims,
+                                              cpu_acc_t<dtype>* accum_distances) {
+    using acc_t = cpu_acc_t<dtype>;
+    accum_distances[0] = static_cast<acc_t>(0.0);  // written unconditionally, so the buffer must hold at least one element
+    for (int point_idx = 0; point_idx < num_points - 1; ++point_idx) {
+        accum_distances[point_idx + 1] =
+            accum_distances[point_idx] +
+            compute_segment_length_common<dtype, acc_t>(points_sample, point_idx, num_dims);  // presumably the length of segment point_idx -> point_idx + 1 (helper not visible here) - TODO confirm
+    }
+}
+
+template <typename dtype>  // Samples one polyline at `num_distances` positions; results are written point by point into `result_sample`.
+static void sample_polyline_cpu(const dtype* points_sample, const dtype* distances_sample, int num_points,
+                                int num_dims, int num_distances, dtype* result_sample,
+                                bool relative_distances, std::vector<cpu_acc_t<dtype>>& accum_distances) {
+    using acc_t = cpu_acc_t<dtype>;
+    if (num_distances == 0) {  // nothing requested: result_sample is left untouched
+        return;
+    }
+    if (num_points == 0) {  // empty polyline: every requested point is filled via fill_point_with_nan_common
+        dtype* result_sample_i = result_sample;
+        for (int distance_idx = 0; distance_idx < num_distances;
+             ++distance_idx, result_sample_i += num_dims) {
+            fill_point_with_nan_common<dtype>(result_sample_i, num_dims);
+        }
+        return;
+    }
+    compute_accumulated_distances_cpu<dtype>(points_sample, num_points, num_dims, accum_distances.data());  // accum_distances is caller-provided scratch and must hold at least num_points elements
+    const acc_t total_length = accum_distances[num_points - 1];  // accumulated distance at the last point
+    for (int distance_idx = 0; distance_idx < num_distances; ++distance_idx) {
+        const acc_t distance_to_sample =
+            relative_distances ? static_cast<acc_t>(distances_sample[distance_idx]) * total_length  // relative input is scaled by the total length
+                               : static_cast<acc_t>(distances_sample[distance_idx]);
+        sample_at_distance_common<dtype, acc_t>(points_sample, accum_distances.data(), distance_to_sample,
+                                                num_points, num_dims,
+                                                result_sample + distance_idx * num_dims);  // output slot of this distance; NOTE(review): out-of-range handling is up to sample_at_distance_common (not visible here)
+    }
+}
+
+template <typename dtype>  // Fixed-size batch: samples each of the `num_samples` polylines (num_points points each) at its own num_distances distances.
+void polyline_interpolation_cpu(const dtype* points, int num_points, int num_dims, const dtype* distances,
+                                int num_distances, dtype* result_points, int num_samples,
+                                bool relative_distances) {
+    using acc_t = cpu_acc_t<dtype>;
+    const size_t stride_points = static_cast<size_t>(num_points) * static_cast<size_t>(num_dims);  // per-sample element strides; the pointer arithmetic below treats all three buffers as densely packed
+    const size_t stride_distances = static_cast<size_t>(num_distances);
+    const size_t stride_result = static_cast<size_t>(num_distances) * static_cast<size_t>(num_dims);
+    at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) {  // each invocation handles the sample range [start, end)
+        std::vector<acc_t> accum_distances(num_points);  // scratch allocated once per invocation and reused for every sample in it
+        for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) {
+            const dtype* points_sample = points + sample_idx * stride_points;
+            const dtype* distances_sample = distances + sample_idx * stride_distances;
+            dtype* result_sample = result_points + sample_idx * stride_result;
+            sample_polyline_cpu<dtype>(points_sample, distances_sample, num_points, num_dims, num_distances,
+                                       result_sample, relative_distances, accum_distances);
+        }
+    });
+}
+
+template <typename dtype>  // Fixed-size batch: writes one total length per sample into `lengths`.
+void polyline_lengths_cpu(const dtype* points, int num_points, int num_dims, dtype* lengths,
+                          int num_samples) {
+    using acc_t = cpu_acc_t<dtype>;
+    const size_t stride_points = static_cast<size_t>(num_points) * static_cast<size_t>(num_dims);  // elements per sample in the packed points buffer
+    at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) {  // each invocation handles the sample range [start, end)
+        for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) {
+            const dtype* points_sample = points + sample_idx * stride_points;
+            acc_t length = static_cast<acc_t>(0.0);
+            if (num_points == 0) {  // no points: the length is reported as polyline_nan<acc_t>()
+                length = polyline_nan<acc_t>();
+            } else {
+                for (int point_idx = 0; point_idx < num_points - 1; ++point_idx) {  // sums the num_points - 1 segments; a single point keeps length 0
+                    length += compute_segment_length_common<dtype, acc_t>(points_sample, point_idx, num_dims);
+                }
+            }
+            lengths[sample_idx] = static_cast<dtype>(length);  // accumulated in acc_t, converted to dtype only on store
+        }
+    });
+}
+
+template <typename dtype, typename sample_size_dtype>  // Variable-size batch: per-sample point/distance counts come from the two sample_sizes arrays; buffers are strided by max_num_points / num_distances.
+void polyline_interpolation_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims,
+                                               const dtype* distances, int num_distances,
+                                               dtype* result_points, int num_samples,
+                                               const sample_size_dtype* sample_sizes_points,
+                                               const sample_size_dtype* sample_sizes_distances_to_sample,
+                                               bool relative_distances) {
+    using acc_t = cpu_acc_t<dtype>;
+    at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) {  // each invocation handles the sample range [start, end)
+        std::vector<acc_t> accum_distances(max_num_points);  // scratch sized for the largest sample; NOTE(review): assumes sample_sizes_points[i] <= max_num_points, which is not checked here
+        for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) {
+            const int curr_num_points = static_cast<int>(sample_sizes_points[sample_idx]);  // narrowing cast from sample_size_dtype to int
+            const int curr_num_distances = static_cast<int>(sample_sizes_distances_to_sample[sample_idx]);
+            const dtype* points_sample = points + sample_idx * max_num_points * num_dims;  // sample_idx is int64_t, so the offset is computed in 64 bit
+            const dtype* distances_sample = distances + sample_idx * num_distances;
+            dtype* result_sample = result_points + sample_idx * num_distances * num_dims;
+            sample_polyline_cpu<dtype>(points_sample, distances_sample, curr_num_points, num_dims,
+                                       curr_num_distances, result_sample, relative_distances,
+                                       accum_distances);  // writes only the first curr_num_distances result points; remaining slots of this sample stay untouched
+        }
+    });
+}
+
+template <typename dtype, typename sample_size_dtype>  // Variable-size batch: length of each sample using only its first sample_sizes_points[i] points.
+void polyline_lengths_var_size_batch_cpu(const dtype* points, int max_num_points, int num_dims,
+                                         dtype* lengths, int num_samples,
+                                         const sample_size_dtype* sample_sizes_points) {
+    using acc_t = cpu_acc_t<dtype>;
+    at::parallel_for(0, num_samples, 0, [&](int64_t start, int64_t end) {  // each invocation handles the sample range [start, end)
+        for (int64_t sample_idx = start; sample_idx < end; ++sample_idx) {
+            const int curr_num_points = static_cast<int>(sample_sizes_points[sample_idx]);  // narrowing cast from sample_size_dtype to int
+            const dtype* points_sample = points + sample_idx * max_num_points * num_dims;  // samples are strided by max_num_points, not by their own size
+            acc_t length = static_cast<acc_t>(0.0);
+            if (curr_num_points == 0) {  // no points: the length is reported as polyline_nan<acc_t>()
+                length = polyline_nan<acc_t>();
+            } else {
+                for (int point_idx = 0; point_idx < curr_num_points - 1; ++point_idx) {  // padding points beyond curr_num_points are never read
+                    length += compute_segment_length_common<dtype, acc_t>(points_sample, point_idx, num_dims);
+                }
+            }
+            lengths[sample_idx] = static_cast<dtype>(length);  // accumulated in acc_t, converted to dtype only on store
+        }
+    });
+}
+
+#define INSTANTIATE_POLYLINE_INTERPOLATION_CPU(DTYPE)                                                 \
+    template void polyline_interpolation_cpu<DTYPE>(                                                  \
+        const DTYPE* points, int num_points, int num_dims, const DTYPE* distances, int num_distances, \
+        DTYPE* result_points, int num_samples, bool relative_distances);
+
+#define INSTANTIATE_POLYLINE_LENGTHS_CPU(DTYPE)                                                  \
+    template void polyline_lengths_cpu<DTYPE>(const DTYPE* points, int num_points, int num_dims, \
+                                              DTYPE* lengths, int num_samples);
+
+#define INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU(DTYPE, SAMPLE_SIZE_DTYPE)                   \
+    template void polyline_interpolation_var_size_batch_cpu<DTYPE, SAMPLE_SIZE_DTYPE>(                    \
+        const DTYPE* points, int max_num_points, int num_dims, const DTYPE* distances, int num_distances, \
+        DTYPE* result_points, int num_samples, const SAMPLE_SIZE_DTYPE* sample_sizes_points,              \
+        const SAMPLE_SIZE_DTYPE* sample_sizes_distances_to_sample, bool relative_distances);
+
+#define INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU(DTYPE, SAMPLE_SIZE_DTYPE)               \
+    template void polyline_lengths_var_size_batch_cpu<DTYPE, SAMPLE_SIZE_DTYPE>(                \
+        const DTYPE* points, int max_num_points, int num_dims, DTYPE* lengths, int num_samples, \
+        const SAMPLE_SIZE_DTYPE* sample_sizes_points);
+
+#define INSTANTIATE_POLYLINE_CPU_DTYPE(DTYPE)                             \
+    INSTANTIATE_POLYLINE_INTERPOLATION_CPU(DTYPE)                         \
+    INSTANTIATE_POLYLINE_LENGTHS_CPU(DTYPE)                               \
+    INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU(DTYPE, int)     \
+    INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU(DTYPE, int64_t) \
+    INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU(DTYPE, int)           \
+    INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU(DTYPE, int64_t)
+
+INSTANTIATE_POLYLINE_CPU_DTYPE(float)
+INSTANTIATE_POLYLINE_CPU_DTYPE(double)
+
+#undef INSTANTIATE_POLYLINE_CPU_DTYPE
+#undef INSTANTIATE_POLYLINE_LENGTHS_VAR_SIZE_BATCH_CPU
+#undef INSTANTIATE_POLYLINE_INTERPOLATION_VAR_SIZE_BATCH_CPU
+#undef INSTANTIATE_POLYLINE_LENGTHS_CPU
+#undef INSTANTIATE_POLYLINE_INTERPOLATION_CPU
+
+}  // namespace polyline
diff --git a/packages/lane_helpers/pyproject.toml b/packages/lane_helpers/pyproject.toml
new file mode 100644
index 0000000..41b2940
--- /dev/null
+++ b/packages/lane_helpers/pyproject.toml
@@ -0,0 +1,35 @@
+[build-system]
+requires = [
+    "setuptools>=64",
+    "wheel",
+    "scikit-build>=0.17.0",
+    "pybind11>=2.10.0",
+    "setuptools-scm>=8",
+    "accvlab-build-config @ file:../../build_config",
+]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "accvlab.lane_helpers"
+dynamic = ["version"]  # version is not hard-coded here; see [tool.setuptools_scm] below
+description = "Lane helper utilities for ACCV-Lab."
+requires-python = ">=3.8"  # NOTE(review): tests/polyline_test_utils.py annotates returns with builtin generics (tuple[...]), which need Python >= 3.9 at import time - confirm the intended minimum
+dependencies = [
+    "torch>=2.0.0",
+    "numpy>=1.22.2",
+]
+
+[project.optional-dependencies]
+optional = [
+    "matplotlib",
+    "pytest",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["accvlab.lane_helpers*"]
+
+[tool.setuptools_scm]
+version_scheme = "no-guess-dev"
+fallback_version = "0.0.0"
+root = "../.."
diff --git a/packages/lane_helpers/setup.py b/packages/lane_helpers/setup.py
new file mode 100644
index 0000000..557ed99
--- /dev/null
+++ b/packages/lane_helpers/setup.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from skbuild import setup
+from setuptools import find_namespace_packages
+
+_ACCVLAB_BUILD_CONFIG_IMPORT_ERROR = """
+#########################################################################################
+# Missing build dependency: accvlab-build-config.                                       #
+#                                                                                       #
+# ACCV-Lab package builds normally use --no-build-isolation, so the shared build helper #
+# must already be installed in the active environment. Install it first with:           #
+#                                                                                       #
+#     pip install <ACCV-Lab root>/build_config                                          #
+#                                                                                       #
+# and retry.                                                                            #
+#                                                                                       #
+# Alternatively, use <ACCV-Lab root>/scripts/package_manager.sh to install packages in  #
+# the documented order.                                                                 #
+#########################################################################################
+"""
+
+try:  # the shared build helper is imported at build time and may be absent from the environment
+    from accvlab_build_config import build_cmake_args
+except ModuleNotFoundError as exc:
+    if exc.name != "accvlab_build_config":  # some other module is missing: propagate that error unchanged
+        raise
+    raise RuntimeError(_ACCVLAB_BUILD_CONFIG_IMPORT_ERROR) from exc  # surface the installation hint, keeping the original error as the cause
+
+_cmake_args = build_cmake_args()
+
+
+setup(
+    name="accvlab.lane_helpers",
+    description="Lane helper utilities for ACCV-Lab.",
+    packages=find_namespace_packages(include=["accvlab.lane_helpers*"]),  # namespace-package discovery limited to this sub-package
+    include_package_data=True,
+    zip_safe=False,
+    cmake_source_dir="ext_impl",  # directory handed to scikit-build as the CMake project root
+    cmake_install_dir="accvlab/lane_helpers",  # CMake install destination inside the Python package tree
+    cmake_args=_cmake_args,
+)
diff --git a/packages/lane_helpers/tests/polyline_test_utils.py b/packages/lane_helpers/tests/polyline_test_utils.py
new file mode 100644
index 0000000..8cad98f
--- /dev/null
+++ b/packages/lane_helpers/tests/polyline_test_utils.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from accvlab.batching_helpers import RaggedBatch
+
+DEVICES = ["cpu", "cuda"]  # devices the tests are parametrized over; NOTE(review): "cuda" is listed unconditionally, so these tests error on hosts without a CUDA device - confirm this is intended
+
+
+def sample_polyline_cpu(points: torch.Tensor, distances: torch.Tensor) -> torch.Tensor:  # Reference sampler for one polyline: `points` is (num_points, num_dims), `distances` is 1-D and absolute.
+    # For no distances, the result is an empty tensor.
+    if distances.shape[0] == 0:
+        sampled_points = points.new_empty((0, points.shape[1]))
+        return sampled_points
+    # For no points, the result is NaN for every requested point coordinate.
+    if points.shape[0] == 0:
+        sampled_points = points.new_full((distances.shape[0], points.shape[1]), torch.nan)
+        return sampled_points
+
+    segment_lengths = torch.linalg.vector_norm(points[1:] - points[:-1], dim=1)  # Euclidean length of each segment
+    accum = torch.cat([segment_lengths.new_zeros(1), torch.cumsum(segment_lengths, dim=0)])  # accum[i] = arc length from the first point to point i
+    total_length = accum[-1]
+
+    out = []
+    for distance in distances:
+        d = torch.clamp(distance, min=0.0, max=total_length)  # distances outside [0, total_length] snap to the endpoints
+        lower_idx = int(torch.nonzero(accum <= d, as_tuple=False)[-1])  # last point whose accumulated distance is <= d
+        if lower_idx >= points.shape[0] - 1:  # d reaches the last point: there is no following segment to interpolate on
+            out.append(points[-1])
+            continue
+
+        upper_idx = lower_idx + 1
+        lower_dist = accum[lower_idx]
+        upper_dist = accum[upper_idx]
+        segment_dist = upper_dist - lower_dist
+        if segment_dist <= torch.finfo(points.dtype).eps:  # (near) zero-length segment: return its start point instead of dividing
+            out.append(points[lower_idx])
+            continue
+
+        weight_upper = (d - lower_dist) / segment_dist  # linear interpolation weights; the two sum to 1
+        weight_lower = (upper_dist - d) / segment_dist
+        out.append(points[lower_idx] * weight_lower + points[upper_idx] * weight_upper)
+
+    sampled_points = torch.stack(out)
+    return sampled_points
+
+
+def sample_batch_cpu(points: torch.Tensor, distances: torch.Tensor) -> torch.Tensor:  # Runs sample_polyline_cpu on each (polyline, distances) pair along dim 0 and stacks the results.
+    sampled_points = torch.stack(
+        [
+            sample_polyline_cpu(points_sample, distances_sample)
+            for points_sample, distances_sample in zip(points, distances)  # zip pairs sample i of both tensors
+        ]
+    )
+    return sampled_points
+
+
+def polyline_lengths_cpu(points: torch.Tensor) -> torch.Tensor:  # Reference lengths for a batch; dim 0 is the sample, dim 1 the point index, dim 2 the coordinate.
+    # For no points, the length is undefined.
+    if points.shape[1] == 0:
+        lengths = points.new_full((points.shape[0],), torch.nan)
+        return lengths
+    # For a single point, the length is 0.
+    if points.shape[1] == 1:
+        lengths = points.new_zeros((points.shape[0],))
+        return lengths
+
+    lengths = torch.linalg.vector_norm(points[:, 1:] - points[:, :-1], dim=2).sum(dim=1)  # per-segment Euclidean norms, summed per sample
+    return lengths
+
+
+def polyline_lengths_var_size_cpu(points: torch.Tensor, sample_sizes: torch.Tensor) -> torch.Tensor:  # Per-sample lengths using only the first sample_sizes[i] points of sample i.
+    lengths = []
+    for sample_idx in range(points.shape[0]):
+        num_points = int(sample_sizes[sample_idx].item())
+        lengths.append(polyline_lengths_cpu(points[sample_idx : sample_idx + 1, :num_points])[0])  # the length-1 slice keeps the batch dim that polyline_lengths_cpu expects
+    lengths = torch.stack(lengths)
+    return lengths
+
+
+def assert_ragged_matches_cpu(  # Checks a ragged interpolation result sample by sample against sample_polyline_cpu.
+    result: RaggedBatch,
+    points: torch.Tensor,
+    distances: torch.Tensor,
+    points_sample_sizes: torch.Tensor,
+    distances_sample_sizes: torch.Tensor,
+    *,
+    atol: float = 1e-5,
+) -> None:
+    assert torch.equal(result.sample_sizes.cpu(), distances_sample_sizes.cpu())  # the result must carry one point per requested distance
+
+    for sample_idx in range(points.shape[0]):
+
+        num_points = int(points_sample_sizes[sample_idx].item())
+        num_distances = int(distances_sample_sizes[sample_idx].item())
+        expected = sample_polyline_cpu(
+            points[sample_idx, :num_points].cpu(),  # only the valid prefix of each padded sample is passed to the reference
+            distances[sample_idx, :num_distances].cpu(),
+        )
+
+        actual = result.tensor[sample_idx, :num_distances].cpu()  # entries beyond num_distances are not compared
+
+        assert torch.allclose(actual, expected, atol=atol, rtol=0.0, equal_nan=True)  # equal_nan: NaN entries in the reference must match NaN in the result
+
+
+def make_random_ragged_polyline_case(  # Builds a seeded random ragged test case; returns (points batch, absolute distances batch) on the CPU.
+    *,
+    seed: int,
+    batch_size: int = 7,
+    max_num_points: int = 12,
+    max_num_distances: int = 17,
+    num_dims: int = 3,
+) -> tuple[RaggedBatch, RaggedBatch]:
+    generator = torch.Generator().manual_seed(seed)
+    points_sample_sizes = torch.randint(1, max_num_points + 1, (batch_size,), generator=generator)  # every sample has at least one point
+    distances_sample_sizes = torch.randint(0, max_num_distances + 1, (batch_size,), generator=generator)  # a sample may request zero distances
+
+    max_points_in_batch = int(points_sample_sizes.max().item())
+    max_distances_in_batch = int(distances_sample_sizes.max().item())
+
+    points = torch.full((batch_size, max_points_in_batch, num_dims), 9999.0, dtype=torch.float32)  # padding value; presumably chosen so accidental reads of padding stand out
+    distances = torch.full((batch_size, max_distances_in_batch), -9999.0, dtype=torch.float32)
+
+    for sample_idx in range(batch_size):
+        num_points = int(points_sample_sizes[sample_idx].item())
+        num_distances = int(distances_sample_sizes[sample_idx].item())
+        points[sample_idx, :num_points] = torch.rand((num_points, num_dims), generator=generator)
+        total_length = polyline_lengths_cpu(points[sample_idx : sample_idx + 1, :num_points])[0]
+        distances[sample_idx, :num_distances] = (
+            torch.rand((num_distances,), generator=generator) * total_length  # uniform in [0, 1) scaled by this sample's length
+        )
+
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+
+    return points_batch, distances_batch
+
+
+def make_padded_ragged_polyline_case(  # Hand-written ragged case with 4 samples; entries beyond each sample size are 9999.0 padding.
+    device: str,
+) -> tuple[RaggedBatch, RaggedBatch]:
+    # Points data
+    points = torch.tensor(
+        [
+            [[0.0, 0.0], [1.0, 0.0], [1.0, 2.0], [0.0, 2.0], [0.0, 0.0]],
+            [[3.5, -1.25], [4.5, -1.25], [4.5, 0.75], [9999.0, 9999.0], [9999.0, 9999.0]],
+            [[-2.0, 3.0], [9999.0, 9999.0], [9999.0, 9999.0], [9999.0, 9999.0], [9999.0, 9999.0]],
+            [[10.0, 0.0], [12.0, 0.0], [9999.0, 9999.0], [9999.0, 9999.0], [9999.0, 9999.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    points_sample_sizes = torch.tensor([5, 3, 1, 2], device=device)  # number of valid points per row above
+    # Distances data
+    distances = torch.tensor(
+        [
+            [0.0, 0.5, 1.0, 2.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0],
+            [3.0, 2.0, 1.0, 0.0, -1.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0],
+            [9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0],
+            [-5.0, 1.0, 5.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0, 9999.0],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    distances_sample_sizes = torch.tensor([11, 5, 0, 3], device=device)  # number of valid distances per row; the third sample requests none
+
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+
+    return points_batch, distances_batch
+
+
+def distances_for_mode(  # Converts absolute distances into the form the mode under test expects.
+    points: torch.Tensor, absolute_distances: torch.Tensor, *, relative: bool
+) -> torch.Tensor:
+
+    if not relative:  # absolute mode: the input is returned as-is (same tensor object)
+        return absolute_distances
+
+    lengths = polyline_lengths_cpu(points.cpu()).to(
+        device=absolute_distances.device, dtype=absolute_distances.dtype
+    )
+
+    # For zero length (or a NaN length from a zero-point polyline), use 1.0 to avoid division by zero.
+    safe_lengths = torch.where(lengths > 0, lengths, torch.ones_like(lengths))
+    relative_distances = absolute_distances / safe_lengths[:, None]  # [:, None] broadcasts one length over all distances of a sample
+
+    return relative_distances
+
+
+def ragged_distances_for_mode(  # Ragged counterpart of distances_for_mode: rescales only the valid distances of each sample.
+    points: RaggedBatch,
+    absolute_distances: RaggedBatch,
+    *,
+    relative: bool,
+) -> RaggedBatch:
+    if not relative:  # absolute mode: the input batch is returned as-is
+        return absolute_distances
+
+    relative_distances = absolute_distances.tensor.clone()  # clone so the caller's absolute distances are not modified
+    lengths = polyline_lengths_var_size_cpu(points.tensor.cpu(), points.sample_sizes.cpu()).to(
+        device=absolute_distances.tensor.device, dtype=absolute_distances.tensor.dtype
+    )
+    for sample_idx in range(points.tensor.shape[0]):
+        num_distances = int(absolute_distances.sample_sizes[sample_idx].item())
+        if num_distances == 0:
+            continue
+        length = lengths[sample_idx]
+        if length > 0:
+            relative_distances[sample_idx, :num_distances] /= length
+        else:  # zero (or NaN) length: relative distances are set to 0 rather than dividing
+            relative_distances[sample_idx, :num_distances] = 0.0
+    relative_distances_batch = absolute_distances.create_with_sample_sizes_like_self(relative_distances)
+    return relative_distances_batch
diff --git a/packages/lane_helpers/tests/test_polyline_fixed_interpolation.py b/packages/lane_helpers/tests/test_polyline_fixed_interpolation.py
new file mode 100644
index 0000000..ffa4946
--- /dev/null
+++ b/packages/lane_helpers/tests/test_polyline_fixed_interpolation.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+from accvlab.lane_helpers import polyline
+
+from polyline_test_utils import DEVICES, distances_for_mode, sample_batch_cpu
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_rectangle_polyline_interpolation(relative: bool, device: str):  # Closed 1 x 2 rectangle (perimeter 6) sampled at corners and inside segments.
+    points = torch.tensor(
+        [
+            [
+                [0.0, 0.0],
+                [1.0, 0.0],
+                [1.0, 2.0],
+                [0.0, 2.0],
+                [0.0, 0.0],
+            ]
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    distances = torch.tensor(
+        [[0.0, 0.5, 1.0, 2.0, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0]],
+        device=device,
+    )
+
+    distances_input = distances_for_mode(points, distances, relative=relative)  # same positions, expressed in the mode under test
+
+    expected = sample_batch_cpu(points.cpu(), distances.cpu())  # the reference always takes the absolute distances
+    result = polyline.interpolate(points, distances_input, relative=relative)
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_batched_polyline_interpolation(relative: bool, device: str):  # Two translated copies of the rectangle, each with its own distance list.
+    base_points = torch.tensor(
+        [
+            [0.0, 0.0],
+            [1.0, 0.0],
+            [1.0, 2.0],
+            [0.0, 2.0],
+            [0.0, 0.0],
+        ],
+        dtype=torch.float32,
+    )
+    offsets = torch.tensor([[0.0, 0.0], [3.5, -1.25]], dtype=torch.float32)
+    points = (base_points.unsqueeze(0) + offsets.unsqueeze(1)).to(device)  # broadcast to shape (2, 5, 2): one shifted rectangle per sample
+    distances = torch.tensor(
+        [
+            [0.0, 0.5, 1.0, 3.0, 6.0],
+            [6.0, 5.0, 3.0, 1.0, 0.0],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+
+    distances_input = distances_for_mode(points, distances, relative=relative)
+
+    expected = sample_batch_cpu(points.cpu(), distances.cpu())  # the reference always takes the absolute distances
+    result = polyline.interpolate(points.contiguous(), distances_input.contiguous(), relative=relative)
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_interpolation_accepts_non_contiguous_inputs(relative: bool, device: str):  # Transposed views are passed directly, without .contiguous().
+    points_storage = torch.tensor(
+        [
+            [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]],
+            [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    points = points_storage.transpose(1, 2)  # (2, 2, 4) storage viewed as (2, 4, 2): stored coordinate-major
+    distances = torch.tensor(
+        [[0.0, 4.0], [0.5, 2.0], [2.0, 0.5], [4.0, 0.0]],
+        device=device,
+        dtype=torch.float32,
+    ).transpose(0, 1)
+    assert not points.is_contiguous()  # guard: the test is only meaningful if both inputs really are non-contiguous
+    assert not distances.is_contiguous()
+
+    distances_input = distances_for_mode(points, distances, relative=relative)
+
+    expected = sample_batch_cpu(points.cpu(), distances.cpu())
+    result = polyline.interpolate(points, distances_input, relative=relative)
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_out_of_range_distances_clamp_to_endpoints(relative: bool, device: str):  # Polyline of length 3 queried below 0, at both ends and beyond the end.
+    points = torch.tensor([[[0.0, 0.0], [1.0, 0.0], [1.0, 2.0]]], device=device, dtype=torch.float32)
+    distances = torch.tensor([[-4.0, -1.0, 0.0, 3.0, 4.0]], device=device, dtype=torch.float32)
+
+    distances_input = distances_for_mode(points, distances, relative=relative)
+
+    expected = sample_batch_cpu(points.cpu(), distances.cpu())  # the reference clamps each distance to [0, total_length]
+    result = polyline.interpolate(points, distances_input, relative=relative)
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_single_point_polyline(relative: bool, device: str):  # A one-point polyline has length 0; the reference returns that point for every distance.
+    points = torch.tensor([[[1.0, 2.0]]], device=device, dtype=torch.float32)
+    distances = torch.tensor([[-1.0, 0.0, 1.0]], device=device, dtype=torch.float32)
+
+    distances_input = distances_for_mode(points, distances, relative=relative)  # zero length: the helper divides by 1.0 instead
+
+    expected = sample_batch_cpu(points.cpu(), distances.cpu())
+    result = polyline.interpolate(points, distances_input, relative=relative)
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_zero_point_polyline_returns_nan(relative: bool, device: str):  # With no points at all, every requested output coordinate must be NaN.
+    points = torch.empty((2, 0, 3), device=device, dtype=torch.float32)
+    distances = torch.tensor([[0.0, 1.0], [-1.0, 2.0]], device=device, dtype=torch.float32)
+    distances_input = distances_for_mode(points, distances, relative=relative)  # NaN lengths: the helper divides by 1.0 instead
+
+    result = polyline.interpolate(points, distances_input, relative=relative)
+
+    assert result.shape == (2, 2, 3)  # (batch, num_distances, num_dims)
+    assert torch.isnan(result).all()
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_zero_point_polyline_with_zero_distances_returns_empty(relative: bool, device: str):  # Both inputs empty: only the output shape is checked.
+    points = torch.empty((2, 0, 3), device=device, dtype=torch.float32)
+    distances = torch.empty((2, 0), device=device, dtype=torch.float32)
+    distances_input = distances_for_mode(points, distances, relative=relative)
+
+    result = polyline.interpolate(points, distances_input, relative=relative)
+
+    assert result.shape == (2, 0, 3)  # batch and coordinate dims are kept, the distance dim is empty
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_random_polyline_matches_cpu_reference(relative: bool, device: str):  # 100 seeded random batches compared against the pure-PyTorch reference.
+    num_iters = 100
+    generator = torch.Generator().manual_seed(0)  # fixed seed keeps the cases reproducible
+    for _ in range(num_iters):
+        num_points = int(torch.randint(15, 61, (), generator=generator).item())  # 15..60 inclusive
+        num_distances = int(torch.randint(15, 61, (), generator=generator).item())
+        points_cpu = torch.rand((3, num_points, 2), generator=generator, dtype=torch.float32)
+        distances_cpu = torch.rand((3, num_distances), generator=generator, dtype=torch.float32)
+
+        segment_lengths = torch.linalg.vector_norm(points_cpu[:, 1:] - points_cpu[:, :-1], dim=2)
+        total_lengths = torch.sum(segment_lengths, dim=1)
+        distances_cpu = distances_cpu * total_lengths[:, None]  # scale the [0, 1) samples to each polyline's length
+
+        distances_input_cpu = distances_for_mode(points_cpu, distances_cpu, relative=relative)
+
+        expected = sample_batch_cpu(points_cpu, distances_cpu)
+        result = polyline.interpolate(
+            points_cpu.to(device), distances_input_cpu.to(device), relative=relative
+        )
+
+        assert torch.allclose(result.cpu(), expected, atol=1e-4, rtol=0.0)  # looser tolerance (1e-4) than the hand-written cases (1e-5)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+def test_large_polyline_interpolation_external_distance_buffer(relative: bool):  # CUDA-only test (not parametrized over DEVICES), run on a non-default stream.
+    # Create a large polyline to ensure that the external distance buffer is used.
+    num_points = 200_000
+    x = torch.linspace(0.0, 1.0, num_points, device="cuda", dtype=torch.float32)
+    points = torch.stack((x, torch.zeros_like(x)), dim=1).unsqueeze(0)  # straight line from (0, 0) to (1, 0), shape (1, num_points, 2)
+    distances = torch.tensor([[0.0, 0.25, 0.5, 1.0, 2.0]], device="cuda", dtype=torch.float32)  # total length is 1, so the same values serve as absolute and relative distances
+    expected = torch.tensor(
+        [[[0.0, 0.0], [0.25, 0.0], [0.5, 0.0], [1.0, 0.0], [1.0, 0.0]]],  # the last entry (distance 2.0) is expected at the end point
+        device="cuda",
+        dtype=torch.float32,
+    )
+
+    torch.cuda.synchronize()  # make sure the inputs created above are complete before switching streams
+    stream = torch.cuda.Stream()
+    with torch.cuda.stream(stream):
+        result = polyline.interpolate(points, distances, relative=relative)
+    stream.synchronize()
+
+    assert torch.allclose(result, expected, atol=1e-4, rtol=0.0)
diff --git a/packages/lane_helpers/tests/test_polyline_lengths.py b/packages/lane_helpers/tests/test_polyline_lengths.py
new file mode 100644
index 0000000..4b31de2
--- /dev/null
+++ b/packages/lane_helpers/tests/test_polyline_lengths.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+from accvlab.batching_helpers import RaggedBatch
+from accvlab.lane_helpers import polyline
+
+from polyline_test_utils import (
+    DEVICES,
+    make_padded_ragged_polyline_case,
+    make_random_ragged_polyline_case,
+    polyline_lengths_cpu,
+    polyline_lengths_var_size_cpu,
+)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_rectangle_and_single_point(device: str):
+    rectangle = torch.tensor(
+        [
+            [
+                [0.0, 0.0],
+                [1.0, 0.0],
+                [1.0, 2.0],
+                [0.0, 2.0],
+                [0.0, 0.0],
+            ]
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    single_point = torch.tensor([[[1.0, 2.0]]], device=device, dtype=torch.float32)
+
+    assert torch.allclose(polyline.lengths(rectangle).cpu(), torch.tensor([6.0]), atol=1e-5, rtol=0.0)
+    assert torch.allclose(polyline.lengths(single_point).cpu(), torch.tensor([0.0]), atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_zero_point_batch_returns_nan(device: str):
+    points = torch.empty((3, 0, 2), device=device, dtype=torch.float32)
+
+    result = polyline.lengths(points)
+
+    assert result.shape == (3,)
+    assert torch.isnan(result).all()
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_random_nd_matches_cpu_reference(device: str):
+    generator = torch.Generator().manual_seed(1)
+    num_iters = 100
+    for _ in range(num_iters):
+        points_cpu = torch.rand((5, 37, 4), generator=generator, dtype=torch.float32)
+
+        expected = polyline_lengths_cpu(points_cpu)
+        result = polyline.lengths(points_cpu.to(device))
+
+        assert torch.allclose(result.cpu(), expected, atol=1e-4, rtol=0.0)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_accepts_non_contiguous_points(device: str):
+    points_storage = torch.tensor(
+        [
+            [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]],
+            [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    points = points_storage.transpose(1, 2)
+    assert not points.is_contiguous()
+
+    result = polyline.lengths(points)
+    expected = polyline_lengths_cpu(points.cpu())
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_random_matches_cpu_reference(device: str):
+    num_iters = 100
+    for i in range(num_iters):
+        points_batch_cpu, _ = make_random_ragged_polyline_case(seed=i * 100)
+        points_batch = points_batch_cpu.to(device)
+
+        result = polyline.lengths_var_size_batch(points_batch)
+        expected = polyline_lengths_var_size_cpu(points_batch_cpu.tensor, points_batch_cpu.sample_sizes)
+
+        assert result.shape == (points_batch.tensor.shape[0],)
+        assert torch.allclose(result.cpu(), expected, atol=1e-4, rtol=0.0)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_matches_cpu_reference_and_ignores_padding(device: str):
+    points_batch, _ = make_padded_ragged_polyline_case(device)
+
+    result = polyline.lengths_var_size_batch(points_batch)
+    expected = polyline_lengths_var_size_cpu(points_batch.tensor.cpu(), points_batch.sample_sizes.cpu())
+
+    assert result.shape == (4,)
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_zero_point_row_returns_nan(device: str):
+    points = torch.tensor(
+        [
+            [[9999.0, 9999.0], [9999.0, 9999.0]],
+            [[0.0, 0.0], [1.0, 0.0]],
+            [[2.0, 3.0], [9999.0, 9999.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    sample_sizes = torch.tensor([0, 2, 1], device=device, dtype=torch.int32)
+
+    result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes))
+    expected = polyline_lengths_var_size_cpu(points.cpu(), sample_sizes.cpu())
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0, equal_nan=True)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_all_zero_point_rows_return_nan(device: str):
+    points = torch.empty((3, 0, 2), device=device, dtype=torch.float32)
+    sample_sizes = torch.zeros(3, device=device, dtype=torch.int32)
+
+    result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes))
+
+    assert result.shape == (3,)
+    assert torch.isnan(result).all()
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_accepts_int32_sample_sizes_and_non_contiguous_points(device: str):
+    points_storage = torch.tensor(
+        [
+            [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]],
+            [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    points = points_storage.transpose(1, 2)
+    sample_sizes = torch.tensor([4, 3], device=device, dtype=torch.int32)
+    assert not points.is_contiguous()
+
+    result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes))
+    expected = polyline_lengths_var_size_cpu(points.cpu(), sample_sizes.cpu())
+
+    assert torch.allclose(result.cpu(), expected, atol=1e-5, rtol=0.0)
+
+
+def test_polyline_lengths_var_size_batch_handles_inactive_cuda_rows():
+    num_samples = 33
+    points = torch.empty((num_samples, 2, 2), device="cuda", dtype=torch.float32)
+    points[:, 0, 0] = torch.arange(num_samples, device="cuda", dtype=torch.float32)
+    points[:, 0, 1] = 0.0
+    points[:, 1, 0] = points[:, 0, 0] + 1.0
+    points[:, 1, 1] = 0.0
+    sample_sizes = torch.full((num_samples,), 2, device="cuda")
+
+    result = polyline.lengths_var_size_batch(RaggedBatch(points, sample_sizes=sample_sizes))
+
+    assert torch.allclose(result.cpu(), torch.ones(num_samples), atol=1e-5, rtol=0.0)
diff --git a/packages/lane_helpers/tests/test_polyline_validation.py b/packages/lane_helpers/tests/test_polyline_validation.py
new file mode 100644
index 0000000..fe7db26
--- /dev/null
+++ b/packages/lane_helpers/tests/test_polyline_validation.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+from accvlab.batching_helpers import RaggedBatch
+from accvlab.lane_helpers import polyline
+
+from polyline_test_utils import DEVICES
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_rejects_invalid_ragged_layout(device: str):
+    points = torch.randn((2, 3, 4), device=device)
+    distances = torch.randn((2, 4), device=device)
+    points_batch = RaggedBatch(
+        points.transpose(1, 2).contiguous(),
+        sample_sizes=torch.tensor([3, 2], device=device, dtype=torch.int32),
+        non_uniform_dim=2,
+    )
+    distances_batch = RaggedBatch(
+        distances, sample_sizes=torch.tensor([4, 2], device=device, dtype=torch.int32)
+    )
+
+    # Polyline points must use dimension 1 as the non-uniform point dimension.
+    with pytest.raises(AssertionError, match="points.non_uniform_dim"):
+        polyline.interpolate_var_size_batch(points_batch, distances_batch)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_validates_inputs(device: str):
+    points = torch.randn((1, 3, 2), device=device)
+    distances = torch.randn((1, 4), device=device)
+    points_batch = RaggedBatch(points, sample_sizes=torch.tensor([3], device=device))
+    distances_batch = RaggedBatch(distances, sample_sizes=torch.tensor([4], device=device))
+
+    # Points sample sizes must not be negative.
+    bad_points_sizes = RaggedBatch(points, sample_sizes=torch.tensor([-1], device=device))
+    with pytest.raises(RuntimeError, match="points.sample_sizes"):
+        polyline.interpolate_var_size_batch(bad_points_sizes, distances_batch)
+
+    # Distance sample sizes must not exceed the padded distance dimension.
+    bad_distances_sizes = RaggedBatch(distances, sample_sizes=torch.tensor([5], device=device))
+    with pytest.raises(RuntimeError, match="distances.sample_sizes"):
+        polyline.interpolate_var_size_batch(points_batch, bad_distances_sizes)
+
+    # Points and distances must have the same dtype.
+    distances_double = distances_batch.double()
+    with pytest.raises(RuntimeError, match="same dtype"):
+        polyline.interpolate_var_size_batch(points_batch, distances_double)
+
+    # Points and distances must have the same sample size dtype.
+    mismatched_sample_size_dtype = RaggedBatch(
+        distances,
+        sample_sizes=torch.tensor([4], device=device, dtype=torch.int32),
+    )
+    with pytest.raises(RuntimeError, match="same dtype"):
+        polyline.interpolate_var_size_batch(points_batch, mismatched_sample_size_dtype)
+
+
+def test_polyline_functions_reject_mixed_cpu_cuda_inputs():
+    points_cpu = torch.randn((1, 3, 2), device="cpu")
+    distances_cpu = torch.randn((1, 4), device="cpu")
+    points_cuda = points_cpu.cuda()
+    distances_cuda = distances_cpu.cuda()
+
+    # Fixed-size points and distances must live on the same device.
+    with pytest.raises(RuntimeError, match="same device"):
+        polyline.interpolate(points_cpu, distances_cuda)
+
+    # Ragged points and distances must live on the same device.
+    with pytest.raises(RuntimeError, match="same device"):
+        polyline.interpolate_var_size_batch(
+            RaggedBatch(points_cpu, sample_sizes=torch.tensor([3], device="cpu")),
+            RaggedBatch(distances_cuda, sample_sizes=torch.tensor([4], device="cuda")),
+        )
+
+    # Ragged sample sizes must live on the same device as their data tensor.
+    with pytest.raises(RuntimeError, match="same device"):
+        polyline.interpolate_var_size_batch(
+            RaggedBatch(points_cuda, sample_sizes=torch.tensor([3], device="cpu")),
+            RaggedBatch(distances_cuda, sample_sizes=torch.tensor([4], device="cuda")),
+        )
+
+    # Lengths use only points, but points.sample_sizes must still match the points device.
+    with pytest.raises(RuntimeError, match="same device"):
+        polyline.lengths_var_size_batch(
+            RaggedBatch(points_cuda, sample_sizes=torch.tensor([3], device="cpu"))
+        )
+
+
+def test_cpu_polyline_functions_reject_low_precision_dtypes():
+    for dtype in (torch.float16, torch.bfloat16):
+        # CPU kernels intentionally support only float32 and float64.
+        points = torch.tensor([[[0.0, 0.0], [1.0, 0.0]]], dtype=dtype)
+        distances = torch.tensor([[0.0, 1.0]], dtype=dtype)
+        points_batch = RaggedBatch(points, sample_sizes=torch.tensor([2]))
+        distances_batch = RaggedBatch(distances, sample_sizes=torch.tensor([2]))
+
+        with pytest.raises(RuntimeError, match="float32 or float64 on CPU"):
+            polyline.interpolate(points, distances)
+        with pytest.raises(RuntimeError, match="float32 or float64 on CPU"):
+            polyline.lengths(points)
+        with pytest.raises(RuntimeError, match="float32 or float64 on CPU"):
+            polyline.interpolate_var_size_batch(points_batch, distances_batch)
+        with pytest.raises(RuntimeError, match="float32 or float64 on CPU"):
+            polyline.lengths_var_size_batch(points_batch)
+
+
+def test_cuda_polyline_functions_accept_low_precision_dtypes():
+    for dtype in (torch.float16, torch.bfloat16):
+        points = torch.tensor([[[0.0, 0.0], [1.0, 0.0]]], device="cuda", dtype=dtype)
+        distances = torch.tensor([[0.0, 1.0]], device="cuda", dtype=dtype)
+        points_batch = RaggedBatch(points, sample_sizes=torch.tensor([2], device="cuda"))
+        distances_batch = RaggedBatch(distances, sample_sizes=torch.tensor([2], device="cuda"))
+
+        expected_points = torch.tensor([[[0.0, 0.0], [1.0, 0.0]]], device="cuda", dtype=dtype)
+        expected_lengths = torch.tensor([1.0], device="cuda", dtype=dtype)
+
+        assert torch.equal(polyline.interpolate(points, distances), expected_points)
+        assert torch.equal(polyline.lengths(points), expected_lengths)
+        assert torch.equal(
+            polyline.interpolate_var_size_batch(points_batch, distances_batch).tensor, expected_points
+        )
+        assert torch.equal(polyline.lengths_var_size_batch(points_batch), expected_lengths)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_rejects_invalid_ragged_layout(device: str):
+    points = torch.randn((2, 3, 4), device=device)
+    points_batch = RaggedBatch(
+        points.transpose(1, 2).contiguous(),
+        sample_sizes=torch.tensor([3, 2], device=device, dtype=torch.int32),
+        non_uniform_dim=2,
+    )
+
+    # Polyline points must use dimension 1 as the non-uniform point dimension.
+    with pytest.raises(AssertionError, match="points.non_uniform_dim"):
+        polyline.lengths_var_size_batch(points_batch)
+
+
+@pytest.mark.parametrize("device", DEVICES)
+def test_polyline_lengths_var_size_batch_validates_sample_sizes(device: str):
+    points = torch.randn((1, 3, 2), device=device)
+
+    # Points sample sizes must not be negative.
+    bad_small = RaggedBatch(points, sample_sizes=torch.tensor([-1], device=device))
+    with pytest.raises(RuntimeError, match="points.sample_sizes"):
+        polyline.lengths_var_size_batch(bad_small)
+
+    # Points sample sizes must not exceed the padded point dimension.
+    bad_large = RaggedBatch(points, sample_sizes=torch.tensor([4], device=device))
+    with pytest.raises(RuntimeError, match="points.sample_sizes"):
+        polyline.lengths_var_size_batch(bad_large)
diff --git a/packages/lane_helpers/tests/test_polyline_var_size_interpolation.py b/packages/lane_helpers/tests/test_polyline_var_size_interpolation.py
new file mode 100644
index 0000000..7234ef0
--- /dev/null
+++ b/packages/lane_helpers/tests/test_polyline_var_size_interpolation.py
@@ -0,0 +1,314 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+from accvlab.batching_helpers import RaggedBatch
+from accvlab.lane_helpers import polyline
+
+from polyline_test_utils import (
+    DEVICES,
+    assert_ragged_matches_cpu,
+    make_padded_ragged_polyline_case,
+    make_random_ragged_polyline_case,
+    ragged_distances_for_mode,
+)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_matches_cpu_reference(relative: bool, device: str):
+    points_batch, distances_batch = make_padded_ragged_polyline_case(device)
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(points_batch, distances_input_batch, relative=relative)
+
+    assert isinstance(result, RaggedBatch)
+    assert result.tensor.shape == (4, 11, 2)
+    assert result.non_uniform_dim == 1
+    assert_ragged_matches_cpu(
+        result,
+        points_batch.tensor,
+        distances_batch.tensor,
+        points_batch.sample_sizes,
+        distances_batch.sample_sizes,
+    )
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_random_matches_cpu_reference(relative: bool, device: str):
+    num_iters = 100
+    for i in range(num_iters):
+        points_batch_cpu, distances_batch_cpu = make_random_ragged_polyline_case(seed=i)
+        points_batch = points_batch_cpu.to(device)
+        distances_batch = distances_batch_cpu.to(device)
+        distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+        result = polyline.interpolate_var_size_batch(
+            points_batch,
+            distances_input_batch,
+            relative=relative,
+        )
+
+        assert isinstance(result, RaggedBatch)
+        assert result.tensor.shape == (
+            points_batch.tensor.shape[0],
+            distances_batch.tensor.shape[1],
+            points_batch.tensor.shape[2],
+        )
+        assert_ragged_matches_cpu(
+            result,
+            points_batch.tensor,
+            distances_batch.tensor,
+            points_batch.sample_sizes,
+            distances_batch.sample_sizes,
+            atol=1e-4,
+        )
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_matches_fixed_size_when_uniform(relative: bool, device: str):
+    points = torch.tensor(
+        [
+            [[0.0, 0.0], [1.0, 0.0], [1.0, 2.0], [0.0, 2.0]],
+            [[2.0, 2.0], [3.0, 2.0], [3.0, 4.0], [2.0, 4.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    distances = torch.tensor(
+        [[0.0, 0.5, 2.0, 4.0], [4.0, 2.0, 0.5, 0.0]],
+        device=device,
+        dtype=torch.float32,
+    )
+    sample_sizes = torch.tensor([points.shape[1], points.shape[1]], device=device, dtype=torch.int32)
+    distances_sample_sizes = torch.tensor(
+        [distances.shape[1], distances.shape[1]], device=device, dtype=torch.int32
+    )
+    points_batch = RaggedBatch(points, sample_sizes=sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(
+        points_batch,
+        distances_input_batch,
+        relative=relative,
+    )
+    expected = polyline.interpolate(
+        points.contiguous(), distances_input_batch.tensor.contiguous(), relative=relative
+    )
+
+    assert torch.allclose(result.tensor, expected, atol=1e-5, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_accepts_non_contiguous_inputs(relative: bool, device: str):
+    points_storage = torch.tensor(
+        [
+            [[0.0, 1.0, 1.0, 0.0], [0.0, 0.0, 2.0, 2.0]],
+            [[2.0, 3.0, 3.0, 2.0], [2.0, 2.0, 4.0, 4.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    points = points_storage.transpose(1, 2)
+    distances = torch.tensor(
+        [[0.0, 4.0], [0.5, 2.0], [2.0, 0.5], [4.0, 0.0]],
+        device=device,
+        dtype=torch.float32,
+    ).transpose(0, 1)
+    assert not points.is_contiguous()
+    assert not distances.is_contiguous()
+
+    points_sample_sizes = torch.tensor([4, 3], device=device, dtype=torch.int32)
+    distances_sample_sizes = torch.tensor([4, 2], device=device, dtype=torch.int32)
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(
+        points_batch,
+        distances_input_batch,
+        relative=relative,
+    )
+
+    assert_ragged_matches_cpu(
+        result,
+        points,
+        distances,
+        points_sample_sizes,
+        distances_sample_sizes,
+    )
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_zero_point_row_returns_nan(relative: bool, device: str):
+    points = torch.tensor(
+        [
+            [[9999.0, 9999.0], [9999.0, 9999.0]],
+            [[0.0, 0.0], [1.0, 0.0]],
+            [[2.0, 3.0], [9999.0, 9999.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    distances = torch.tensor(
+        [[0.0, 1.0], [0.0, 0.5], [-1.0, 2.0]],
+        device=device,
+        dtype=torch.float32,
+    )
+    points_sample_sizes = torch.tensor([0, 2, 1], device=device, dtype=torch.int32)
+    distances_sample_sizes = torch.tensor([2, 2, 2], device=device, dtype=torch.int32)
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(
+        points_batch,
+        distances_input_batch,
+        relative=relative,
+    )
+
+    assert_ragged_matches_cpu(
+        result,
+        points,
+        distances,
+        points_sample_sizes,
+        distances_sample_sizes,
+    )
+    assert torch.isnan(result.tensor[0, :2]).all()
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_zero_max_distances_returns_empty(relative: bool, device: str):
+    points = torch.tensor(
+        [
+            [[0.0, 0.0], [1.0, 0.0]],
+            [[2.0, 3.0], [9999.0, 9999.0]],
+        ],
+        device=device,
+        dtype=torch.float32,
+    )
+    distances = torch.empty((2, 0), device=device, dtype=torch.float32)
+    points_sample_sizes = torch.tensor([2, 1], device=device, dtype=torch.int32)
+    distances_sample_sizes = torch.tensor([0, 0], device=device, dtype=torch.int32)
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(
+        points_batch,
+        distances_input_batch,
+        relative=relative,
+    )
+
+    assert isinstance(result, RaggedBatch)
+    assert result.tensor.shape == (2, 0, 2)
+    assert torch.equal(result.sample_sizes.cpu(), torch.zeros(2, dtype=torch.int32))
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+@pytest.mark.parametrize("device", DEVICES)
+def test_variable_size_polyline_interpolation_all_zero_point_rows_return_nan(relative: bool, device: str):
+    points = torch.empty((2, 0, 2), device=device, dtype=torch.float32)
+    distances = torch.tensor([[0.0, 1.0, 2.0], [-1.0, 0.5, 3.0]], device=device, dtype=torch.float32)
+    points_sample_sizes = torch.zeros(2, device=device, dtype=torch.int32)
+    distances_sample_sizes = torch.full((2,), 3, device=device, dtype=torch.int32)
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(
+        points_batch,
+        distances_input_batch,
+        relative=relative,
+    )
+
+    assert result.tensor.shape == (2, 3, 2)
+    assert torch.isnan(result.tensor).all()
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+def test_variable_size_large_polyline_interpolation_external_distance_buffer(relative: bool):
+    # Create a large polyline to ensure that the external distance buffer is used.
+    num_points = 200_000
+    x = torch.linspace(0.0, 1.0, num_points, device="cuda", dtype=torch.float32)
+    first_polyline = torch.stack((x, torch.zeros_like(x)), dim=1)
+    second_polyline = torch.stack((x, torch.ones_like(x)), dim=1)
+    points = torch.stack((first_polyline, second_polyline), dim=0)
+    distances = torch.tensor(
+        # Note that 9999.0 is a filler and is not part of the distances used for interpolation (due to `distances_sample_sizes`).
+        [[0.0, 0.25, 0.5, 1.0, 2.0], [1.0, 0.5, 0.0, -1.0, 9999.0]],
+        device="cuda",
+        dtype=torch.float32,
+    )
+    points_sample_sizes = torch.full((2,), num_points, device="cuda", dtype=torch.int32)
+    distances_sample_sizes = torch.tensor([5, 4], device="cuda", dtype=torch.int32)
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    expected = torch.tensor(
+        [
+            [[0.0, 0.0], [0.25, 0.0], [0.5, 0.0], [1.0, 0.0], [1.0, 0.0]],
+            # Note that 9999.0 is a filler and is not checked for equality in the test.
+            [[1.0, 1.0], [0.5, 1.0], [0.0, 1.0], [0.0, 1.0], [9999.0, 9999.0]],
+        ],
+        device="cuda",
+        dtype=torch.float32,
+    )
+
+    torch.cuda.synchronize()
+    stream = torch.cuda.Stream()
+    with torch.cuda.stream(stream):
+        result = polyline.interpolate_var_size_batch(points_batch, distances_batch, relative=relative)
+    stream.synchronize()
+
+    assert torch.equal(result.sample_sizes.cpu(), distances_sample_sizes.cpu())
+    assert torch.allclose(result.tensor[0, :5], expected[0, :5], atol=1e-4, rtol=0.0)
+    assert torch.allclose(result.tensor[1, :4], expected[1, :4], atol=1e-4, rtol=0.0)
+
+
+@pytest.mark.parametrize("relative", [False, True], ids=["absolute", "relative"])
+def test_variable_size_polyline_interpolation_handles_inactive_cuda_rows(relative: bool):
+    num_samples = 33
+    points = torch.empty((num_samples, 2, 2), device="cuda", dtype=torch.float32)
+    points[:, 0, 0] = torch.arange(num_samples, device="cuda", dtype=torch.float32)
+    points[:, 0, 1] = 0.0
+    points[:, 1, 0] = points[:, 0, 0] + 1.0
+    points[:, 1, 1] = 0.0
+    distances = (
+        torch.tensor([[0.0, 0.25, 1.0]], device="cuda", dtype=torch.float32).expand(num_samples, -1).clone()
+    )
+    points_sample_sizes = torch.full((num_samples,), 2, device="cuda")
+    distances_sample_sizes = torch.full((num_samples,), 3, device="cuda")
+    points_batch = RaggedBatch(points, sample_sizes=points_sample_sizes)
+    distances_batch = RaggedBatch(distances, sample_sizes=distances_sample_sizes)
+    distances_input_batch = ragged_distances_for_mode(points_batch, distances_batch, relative=relative)
+
+    result = polyline.interpolate_var_size_batch(
+        points_batch,
+        distances_input_batch,
+        relative=relative,
+    )
+
+    assert_ragged_matches_cpu(result, points, distances, points_sample_sizes, distances_sample_sizes)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])