From 6d9542531ebbbcd7a32694429312603dd5269689 Mon Sep 17 00:00:00 2001 From: tordnat Date: Wed, 29 Apr 2026 18:16:08 +0000 Subject: [PATCH 1/8] [Caspar] Fix examples + pybinding PascalCase --- symforce/caspar/code_generation/library.py | 5 ++++- .../caspar/examples/kernel_example/gen_and_run.py | 13 +++++-------- .../examples/multiple_factors/gen_and_run.py | 15 ++++++++------- .../caspar/source/templates/pybinding.cc.jinja | 2 +- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/symforce/caspar/code_generation/library.py b/symforce/caspar/code_generation/library.py index 2af63380..698b3645 100644 --- a/symforce/caspar/code_generation/library.py +++ b/symforce/caspar/code_generation/library.py @@ -14,6 +14,7 @@ from symforce.caspar.memory.dtype import DType from symforce.ops import LieGroupOps as Ops from symforce.python_util import camelcase_to_snakecase +from symforce.python_util import snakecase_to_camelcase from ..code_generation.factor import Factor from ..code_generation.kernel import Kernel @@ -241,7 +242,9 @@ def generate_castype_mappings(self, out_dir: Path) -> None: write_if_different(definition, out_dir.joinpath("caspar_mappings_pybinding.h")) def generate_binding_file(self, out_dir: Path, solver: Solver | None) -> None: - binding = env.get_template("pybinding.cc.jinja").render(caslib=self, solver=solver) + binding = env.get_template("pybinding.cc.jinja").render( + caslib=self, solver=solver, snake_to_camel=snakecase_to_camelcase + ) write_if_different(binding, out_dir.joinpath("pybinding.cc")) def generate_buildfiles(self, out_dir: Path) -> None: diff --git a/symforce/caspar/examples/kernel_example/gen_and_run.py b/symforce/caspar/examples/kernel_example/gen_and_run.py index f53bfe80..6155005d 100644 --- a/symforce/caspar/examples/kernel_example/gen_and_run.py +++ b/symforce/caspar/examples/kernel_example/gen_and_run.py @@ -44,15 +44,12 @@ def example_kernel( caslib.generate(out_dir) caslib.compile(out_dir) - # Can also be imported using: lib = caslib.import_lib(out_dir) - from symforce.caspar.examples.kernel_example.generated import ( # type: ignore[import-not-found] - caspar_lib as lib, - ) + lib = caslib.import_lib(out_dir) N = 100 arg0_stacked = torch.rand(N, sf.V3.storage_dim(), device="cuda") arg0_caspar = torch.empty(mem.caspar_size(sf.V3.storage_dim()), N, device="cuda") - lib.Matrix31_stacked_to_caspar(arg0_stacked, arg0_caspar) + lib.matrix31_stacked_to_caspar(arg0_stacked, arg0_caspar) arg0_indices = torch.randint(0, N, (N,), device="cuda", dtype=torch.int32) arg0_indices_shared = torch.empty(N, 2, device="cuda", dtype=torch.int32) @@ -60,7 +57,7 @@ def example_kernel( arg1_stacked = torch.rand(1, 6, device="cuda") arg1_caspar = torch.empty(mem.caspar_size(6), 1, device="cuda") - lib.Matrix61_stacked_to_caspar(arg1_stacked, arg1_caspar) + lib.matrix61_stacked_to_caspar(arg1_stacked, arg1_caspar) BLOCK_SIZE = 1024 OUT0_IDX_MAX = 10 @@ -87,8 +84,8 @@ def example_kernel( out0_sharedsum = torch.zeros(OUT0_IDX_MAX, 2, device="cuda") out1_indexed = torch.empty(N, 1, device="cuda") - lib.Matrix21_caspar_to_stacked(out0_caspar, out0_sharedsum) - lib.Symbol_caspar_to_stacked(out1_caspar, out1_indexed) + lib.matrix21_caspar_to_stacked(out0_caspar, out0_sharedsum) + lib.symbol_caspar_to_stacked(out1_caspar, out1_indexed) # Check the results sincos = 2 * torch.stack([torch.sin(arg0_stacked[:, 0]), torch.cos(arg0_stacked[:, 0])], dim=1) diff --git a/symforce/caspar/examples/multiple_factors/gen_and_run.py b/symforce/caspar/examples/multiple_factors/gen_and_run.py index 019d5050..d708a2e5 100644 --- a/symforce/caspar/examples/multiple_factors/gen_and_run.py +++ b/symforce/caspar/examples/multiple_factors/gen_and_run.py @@ -123,6 +123,7 @@ def to_tensor(storage: sf.Storage) -> torch.Tensor: caslib.generate(out_dir) # Can be commented out after the first run to avoid regenerating (slow) caslib.compile(out_dir) # Can be commented out after the first run to avoid recompiling (slow) + # Can also be imported using: # lib = caslib.import_lib(out_dir) from generated import caspar_lib as lib # type: ignore[import-not-found, unused-ignore] @@ -181,15 +182,15 @@ def to_tensor(storage: sf.Storage) -> torch.Tensor: # Map the generated Caspar data to regular array of structs (AOS) format. pose_stacked = torch.empty(N_POSE, mem.stacked_size(Pose)) -lib.Pose_caspar_to_stacked(pose_caspar, pose_stacked) +lib.pose_caspar_to_stacked(pose_caspar, pose_stacked) landmarks_stacked = torch.empty(N_LANDMARK, mem.stacked_size(Landmark)) -lib.Landmark_caspar_to_stacked(landmarks_caspar, landmarks_stacked) +lib.landmark_caspar_to_stacked(landmarks_caspar, landmarks_stacked) odometry_stacked = torch.empty(N_POSE - 1, mem.stacked_size(OdometryMeasurement)) -lib.OdometryMeasurement_caspar_to_stacked(odometry_caspar, odometry_stacked) +lib.odometry_measurement_caspar_to_stacked(odometry_caspar, odometry_stacked) pos_meas_stacked = torch.empty(N_GNSS, mem.stacked_size(PositionMeasurement)) -lib.posMeasurement_caspar_to_stacked(pos_meas_caspar, pos_meas_stacked) +lib.position_measurement_caspar_to_stacked(pos_meas_caspar, pos_meas_stacked) landmark_meas_stacked = torch.empty(N_LANDMARK_ERROR, mem.stacked_size(LandmarkMeasurement)) -lib.LandmarkMeasurement_caspar_to_stacked(landmark_meas_caspar, landmark_meas_stacked) +lib.landmark_measurement_caspar_to_stacked(landmark_meas_caspar, landmark_meas_stacked) # Add some noise to the data. @@ -210,7 +211,7 @@ def to_tensor(storage: sf.Storage) -> torch.Tensor: params, Pose_num_max=N_POSE, Landmark_num_max=N_LANDMARK, - posSensorOffset_num_max=1, + PositionSensorOffset_num_max=1, LandmarkSensorOffset_num_max=1, pos_error_num_max=N_GNSS, landmark_error_num_max=N_LANDMARK_ERROR, @@ -218,7 +219,7 @@ def to_tensor(storage: sf.Storage) -> torch.Tensor: ) -solver.set_posSensorOffset_nodes_from_stacked_device(pos_sensor_offset) +solver.set_PositionSensorOffset_nodes_from_stacked_device(pos_sensor_offset) solver.set_LandmarkSensorOffset_nodes_from_stacked_device(landmark_sensor_offset) # To demonstrade how to update the solver dynamically we start by loading and optimizing only half the problem. diff --git a/symforce/caspar/source/templates/pybinding.cc.jinja b/symforce/caspar/source/templates/pybinding.cc.jinja index 91b18196..b7fed7e8 100644 --- a/symforce/caspar/source/templates/pybinding.cc.jinja +++ b/symforce/caspar/source/templates/pybinding.cc.jinja @@ -59,7 +59,7 @@ PYBIND11_MODULE({{caslib.name}}, module) { module.def("shared_indices", &caspar::shared_indices_pybinding); {% for kernel in caslib.kernels %} {% if kernel.expose_to_python %} - module.def("{{kernel.name}}", &{{snake_to_camel(kernel.name)}}_pybinding); + module.def("{{kernel.name}}", &{{snake_to_camel(kernel.name)}}Pybinding); {% endif %} {% endfor %} From efd9d5c403612a2719ffae6eeaf51b687c7b1ba6 Mon Sep 17 00:00:00 2001 From: tordnat Date: Wed, 29 Apr 2026 19:14:14 +0000 Subject: [PATCH 2/8] [Caspar] Fix generate_binding_file to use parts_to_pascal consistent with kernel generation --- symforce/caspar/code_generation/library.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/symforce/caspar/code_generation/library.py b/symforce/caspar/code_generation/library.py index 698b3645..99728ac9 100644 --- a/symforce/caspar/code_generation/library.py +++ b/symforce/caspar/code_generation/library.py @@ -14,7 +14,7 @@ from symforce.caspar.memory.dtype import DType from symforce.ops import LieGroupOps as Ops from symforce.python_util import camelcase_to_snakecase -from symforce.python_util import snakecase_to_camelcase +from symforce.python_util import parts_to_pascal from ..code_generation.factor import Factor from ..code_generation.kernel import Kernel @@ -243,7 +243,7 @@ def generate_castype_mappings(self, out_dir: Path) -> None: def generate_binding_file(self, out_dir: Path, solver: Solver | None) -> None: binding = env.get_template("pybinding.cc.jinja").render( - caslib=self, solver=solver, snake_to_camel=snakecase_to_camelcase + caslib=self, solver=solver, snake_to_camel=parts_to_pascal ) write_if_different(binding, out_dir.joinpath("pybinding.cc")) From de43ffe399ae2ab035e48c47fbd027f3672050dc Mon Sep 17 00:00:00 2001 From: tordnat Date: Sun, 3 May 2026 17:16:43 +0200 Subject: [PATCH 3/8] [Caspar] Add device index selection to solver --- .../caspar/source/templates/lib.pyi.jinja | 1 + .../caspar/source/templates/solver.cc.jinja | 39 +++++++++++++++++-- .../caspar/source/templates/solver.h.jinja | 4 +- .../source/templates/solver_pybinding.h.jinja | 6 ++- 4 files changed, 44 insertions(+), 6 deletions(-) diff --git a/symforce/caspar/source/templates/lib.pyi.jinja b/symforce/caspar/source/templates/lib.pyi.jinja index 2d75b55a..be223997 100644 --- a/symforce/caspar/source/templates/lib.pyi.jinja +++ b/symforce/caspar/source/templates/lib.pyi.jinja @@ -68,6 +68,7 @@ class {{solver.struct_name}}: {% for thing in solver.size_contributors %} {{num_arg_key(thing)}}: int = 0, {% endfor %} + device_id: int = 0 ): ... def set_params(self, params: SolverParams) -> None: diff --git a/symforce/caspar/source/templates/solver.cc.jinja b/symforce/caspar/source/templates/solver.cc.jinja index c693464a..dd59f2e4 100644 --- a/symforce/caspar/source/templates/solver.cc.jinja +++ b/symforce/caspar/source/templates/solver.cc.jinja @@ -68,10 +68,12 @@ namespace caspar { {{ solver.struct_name }}::{{ solver.struct_name }}( const SolverParams ¶ms, {% for thing in solver.size_contributors %} - size_t {{num_arg_key(thing)}}{{ ", " if not loop.last else "" }} + size_t {{num_arg_key(thing)}}{{ ", " }} {% endfor %} + int device_id ) : params_(params), + device_id_(device_id), {% for thing in solver.size_contributors %} {{num_key(thing)}}({{num_arg_key(thing)}}), {{num_max_key(thing)}}({{num_arg_key(thing)}}){{ ", " if not loop.last else "" }} @@ -85,6 +87,22 @@ namespace caspar { throw std::runtime_error("params.diag_init must be positive"); } allocation_size_ = get_nbytes(); + + if (device_id_ != 0){ + if (device_id_ < 0){ + std::runtime_error("Invalid CUDA device id: %d", device_id_); + } + int deviceCount; + cudaGetDeviceCount(&deviceCount); + if(deviceCount < 2){ + std::runtime_error("CUDA detected %d devices, but %d was requested", deviceCount, device_id_); + } + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, device_id); + printf("Non-default CUDA device %d with compute capability %d.%d. selected\n", + device, deviceProp.major, deviceProp.minor); + } + cudaSetDevice(device_id_); cudaMalloc(&origin_ptr_, allocation_size_); size_t offset = 0; @@ -97,6 +115,7 @@ namespace caspar { } {{ solver.struct_name }}::~{{ solver.struct_name }}(){ + cudaSetDevice(device_id_); cudaFree(origin_ptr_); } @@ -110,6 +129,7 @@ size_t {{ solver.struct_name }}::get_allocation_size(){ SolveResult {{ solver.struct_name }}::solve(bool print_progress, bool verbose_logging) { + cudaSetDevice(device_id_); SolveResult result; result.exit_reason = ExitReason::MAX_ITERATIONS; {{solver.linear_t}} score_best; @@ -634,6 +654,7 @@ void {{ solver.struct_name }}::finish_indices() { {% for nodetype in solver.node_types %} void {{ solver.struct_name }}::Set{{nodetype.__name__}}Num(const size_t num) { + cudaSetDevice(device_id_); if (num > {{num_max_key(nodetype)}}) { throw std::runtime_error(std::to_string(num) + " > {{num_max_key(nodetype)}}"); } @@ -642,6 +663,7 @@ void {{ solver.struct_name }}::Set{{nodetype.__name__}}Num(const size_t num) { void {{ solver.struct_name }}::Set{{nodetype.__name__}}NodesFromStackedHost( const {{solver.storage_t}}* const data, const size_t offset, const size_t num) { + cudaSetDevice(device_id_); if (offset + num > {{num_key(nodetype)}}){ throw std::runtime_error(std::to_string(offset + num) + " > {{num_key(nodetype)}}"); } @@ -654,6 +676,7 @@ void {{ solver.struct_name }}::Set{{nodetype.__name__}}NodesFromStackedHost( void {{ solver.struct_name }}::Set{{nodetype.__name__}}NodesFromStackedDevice( const {{solver.storage_t}}* const data, const size_t offset, const size_t num) { + cudaSetDevice(device_id_); if (offset + num > {{num_key(nodetype)}}){ throw std::runtime_error(std::to_string(offset + num) + " > {{num_key(nodetype)}}"); } @@ -663,6 +686,7 @@ void {{ solver.struct_name }}::Set{{nodetype.__name__}}NodesFromStackedDevice( void {{solver.struct_name}}::Get{{nodetype.__name__}}NodesToStackedHost( {{solver.storage_t}}* const data, const size_t offset, const size_t num) { + cudaSetDevice(device_id_); if (offset + num > {{num_key(nodetype)}}){ throw std::runtime_error(std::to_string(offset + num) + " > {{num_key(nodetype)}}"); } @@ -675,6 +699,7 @@ void {{solver.struct_name}}::Get{{nodetype.__name__}}NodesToStackedHost( void {{solver.struct_name}}::Get{{nodetype.__name__}}NodesToStackedDevice( {{solver.storage_t}}* const data, const size_t offset, const size_t num) { + cudaSetDevice(device_id_); if (offset + num > {{num_key(nodetype)}}){ throw std::runtime_error(std::to_string(offset + num) + " > {{num_key(nodetype)}}"); } @@ -686,6 +711,7 @@ void {{solver.struct_name}}::Get{{nodetype.__name__}}NodesToStackedDevice( {% for fac in solver.factors %} void {{ solver.struct_name }}::Set{{snake_to_camel(fac.name)}}Num(const size_t num) { + cudaSetDevice(device_id_); if (num > {{num_max_key(fac)}}){ throw std::runtime_error(std::to_string(num) + " > {{num_max_key(fac)}}"); } @@ -695,6 +721,7 @@ void {{solver.struct_name}}::Get{{nodetype.__name__}}NodesToStackedDevice( {% if fac.isnodeshared[arg] %} void {{ solver.struct_name }}::Set{{snake_to_camel(fac.name)}}{{snake_to_camel(arg)}}IndicesFromHost( const unsigned int* const indices, size_t num) { + cudaSetDevice(device_id_); if (num != {{num_key(fac)}}){ throw std::runtime_error( std::to_string(num) @@ -708,7 +735,8 @@ void {{solver.struct_name}}::Get{{nodetype.__name__}}NodesToStackedDevice( void {{ solver.struct_name }}::Set{{snake_to_camel(fac.name)}}{{snake_to_camel(arg)}}IndicesFromDevice( const unsigned int* const indices, size_t num) { indices_valid_ = false; - + cudaSetDevice(device_id_); + if (num != {{num_key(fac)}}){ throw std::runtime_error( std::to_string(num) @@ -737,6 +765,7 @@ void {{solver.struct_name}}::Get{{nodetype.__name__}}NodesToStackedDevice( const {{solver.storage_t}}* const data, size_t offset, size_t num {% endif %} ) { + cudaSetDevice(device_id_); {% if fac.isconstuniq[arg] %} const size_t offset = 0; const size_t num = 1; @@ -769,6 +798,7 @@ void {{solver.struct_name}}::Get{{nodetype.__name__}}NodesToStackedDevice( {% elif fac.isconstindexed[arg] %} const {{solver.storage_t}}* const data, size_t offset, size_t num {% endif %} ) { + cudaSetDevice(device_id_); {% if fac.isconstuniq[arg] %} const size_t offset = 0; const size_t num = 1; @@ -791,6 +821,7 @@ void {{solver.struct_name}}::Get{{nodetype.__name__}}NodesToStackedDevice( {% if fac.isconstshared[arg] %} void {{ solver.struct_name }}::Set{{snake_to_camel(fac.name)}}{{snake_to_camel(arg)}}IndicesFromHost( const unsigned int* const indices, size_t num) { + cudaSetDevice(device_id_); if (num != {{num_key(fac)}}){ throw std::runtime_error( std::to_string(num) @@ -804,7 +835,7 @@ void {{solver.struct_name}}::Get{{nodetype.__name__}}NodesToStackedDevice( void {{ solver.struct_name }}::Set{{snake_to_camel(fac.name)}}{{snake_to_camel(arg)}}IndicesFromDevice( const unsigned int* const indices, size_t num) { indices_valid_ = false; - + cudaSetDevice(device_id_); if (num != {{num_key(fac)}}){ throw std::runtime_error( std::to_string(num) @@ -824,6 +855,7 @@ void {{solver.struct_name}}::Get{{nodetype.__name__}}NodesToStackedDevice( void {{ solver.struct_name }}::Set{{snake_to_camel(fac.name)}}{{snake_to_camel(arg)}}IndicesFromHost( const unsigned int* const indices, size_t num) { indices_valid_ = false; + cudaSetDevice(device_id_); if (num != {{num_key(fac)}}){ throw std::runtime_error( std::to_string(num) @@ -836,6 +868,7 @@ void {{solver.struct_name}}::Get{{nodetype.__name__}}NodesToStackedDevice( void {{ solver.struct_name }}::Set{{snake_to_camel(fac.name)}}{{snake_to_camel(arg)}}IndicesFromDevice( const unsigned int* const indices, size_t num) { indices_valid_ = false; + cudaSetDevice(device_id_); if (num != {{num_key(fac)}}){ throw std::runtime_error( std::to_string(num) diff --git a/symforce/caspar/source/templates/solver.h.jinja b/symforce/caspar/source/templates/solver.h.jinja index 7f3ed12d..c897ce4b 100644 --- a/symforce/caspar/source/templates/solver.h.jinja +++ b/symforce/caspar/source/templates/solver.h.jinja @@ -55,8 +55,9 @@ class {{ solver.struct_name }} { {{ solver.struct_name }}( const SolverParams ¶ms, {% for thing in solver.size_contributors %} - size_t {{num_arg_key(thing)}}{{ ", " if not loop.last else "" }} + size_t {{num_arg_key(thing)}}{{ ", " }} {% endfor %} + int device_id = 0 ); // This class is managing cuda memory and cannot be copied. @@ -210,6 +211,7 @@ class {{ solver.struct_name }} { private: SolverParams<{{solver.linear_t}}> params_; + int device_id_; uint8_t* origin_ptr_; size_t scratch_inout_size_; size_t allocation_size_; diff --git a/symforce/caspar/source/templates/solver_pybinding.h.jinja b/symforce/caspar/source/templates/solver_pybinding.h.jinja index b5e88035..f0409d80 100644 --- a/symforce/caspar/source/templates/solver_pybinding.h.jinja +++ b/symforce/caspar/source/templates/solver_pybinding.h.jinja @@ -41,12 +41,14 @@ inline void add_solver_pybinding(pybind11::module_ module) { .def(py::init, {% for thing in solver.size_contributors %} size_t{{ ", " if not loop.last else "" }} - {% endfor %}>(), + {% endfor %}, + int>(), py::arg("params"), py::kw_only(), {% for thing in solver.size_contributors %} - py::arg("{{num_arg_key(thing)}}") = 0{{ ", " if not loop.last else "" }} + py::arg("{{num_arg_key(thing)}}") = 0{{ ", " }} {% endfor %} + py::arg("device_id") = 0 ) .def("set_params", &{{solver.struct_name}}::set_params) From 6a23793ffbc95d56d09c904322207a252a61d5c9 Mon Sep 17 00:00:00 2001 From: tordnat Date: Sun, 3 May 2026 16:40:58 +0000 Subject: [PATCH 4/8] [Caspar] Pybind infer CUDA device from data --- symforce/caspar/source/runtime/pybind_array_tools.cc | 8 ++++++++ symforce/caspar/source/runtime/pybind_array_tools.h | 2 ++ .../source/templates/caspar_mappings_pybinding.h.jinja | 3 ++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/symforce/caspar/source/runtime/pybind_array_tools.cc b/symforce/caspar/source/runtime/pybind_array_tools.cc index ec032103..87c3204f 100644 --- a/symforce/caspar/source/runtime/pybind_array_tools.cc +++ b/symforce/caspar/source/runtime/pybind_array_tools.cc @@ -169,6 +169,14 @@ void AssertUint2Vec(const py::object& obj) { Assert2DNxk(obj, 2); } +int GetDeviceId(const py::object& obj) { + py::tuple data = GetInterface(obj)["data"].cast(); + void* ptr = reinterpret_cast(data[0].cast()); + cudaPointerAttributes attrs; + cudaPointerGetAttributes(&attrs, ptr); + return attrs.device; +} + float* AsFloatPtr(const py::object& obj) { AssertFloatVec(obj); py::tuple data = GetInterface(obj)["data"].cast(); diff --git a/symforce/caspar/source/runtime/pybind_array_tools.h b/symforce/caspar/source/runtime/pybind_array_tools.h index c6f77fc5..96445288 100644 --- a/symforce/caspar/source/runtime/pybind_array_tools.h +++ b/symforce/caspar/source/runtime/pybind_array_tools.h @@ -30,6 +30,8 @@ void AssertDeviceMemory(const py::object& obj); void AssertNumRowsEquals(const py::object& obj, size_t n); void AssertNumColsEquals(const py::object& obj, size_t n); +int GetDeviceId(const py::object& obj); + float* AsFloatPtr(const py::object& obj); double* AsDoublePtr(const py::object& obj); int* AsIntPtr(const py::object& obj); diff --git a/symforce/caspar/source/templates/caspar_mappings_pybinding.h.jinja b/symforce/caspar/source/templates/caspar_mappings_pybinding.h.jinja index 4c1e7b11..05f250c9 100644 --- a/symforce/caspar/source/templates/caspar_mappings_pybinding.h.jinja +++ b/symforce/caspar/source/templates/caspar_mappings_pybinding.h.jinja @@ -27,6 +27,7 @@ void add_casmappings_pybindings(pybind11::module_ module) { throw std::runtime_error( "The caspar data must have at least as many columns as stacked_data has rows."); } + cudaSetDevice(GetDeviceId(stacked_data)); {{nodetype.__name__}}StackedToCaspar( As{{caslib.storage_t.capitalize()}}Ptr(stacked_data), As{{caslib.storage_t.capitalize()}}Ptr(cas_data), cas_stride, 0, num_objects); }); @@ -45,7 +46,7 @@ void add_casmappings_pybindings(pybind11::module_ module) { throw std::runtime_error( "The caspar data must have at least as many columns as stacked_data has rows."); } - + cudaSetDevice(GetDeviceId(cas_data)); {{nodetype.__name__}}CasparToStacked( As{{caslib.storage_t.capitalize()}}Ptr(cas_data), As{{caslib.storage_t.capitalize()}}Ptr(stacked_data), cas_stride, 0, num_objects); }); From 4ddf01cff8148712bddb1660ee11d2a688ae3e33 Mon Sep 17 00:00:00 2001 From: tordnat Date: Sun, 3 May 2026 16:41:10 +0000 Subject: [PATCH 5/8] Small bugfixes --- .../caspar/source/templates/solver.cc.jinja | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/symforce/caspar/source/templates/solver.cc.jinja b/symforce/caspar/source/templates/solver.cc.jinja index dd59f2e4..4c596158 100644 --- a/symforce/caspar/source/templates/solver.cc.jinja +++ b/symforce/caspar/source/templates/solver.cc.jinja @@ -88,19 +88,21 @@ namespace caspar { } allocation_size_ = get_nbytes(); - if (device_id_ != 0){ - if (device_id_ < 0){ - std::runtime_error("Invalid CUDA device id: %d", device_id_); - } + if (device_id_ < 0) { + throw std::runtime_error("Invalid CUDA device id: " + std::to_string(device_id_)); + } + if (device_id_ != 0) { int deviceCount; cudaGetDeviceCount(&deviceCount); - if(deviceCount < 2){ - std::runtime_error("CUDA detected %d devices, but %d was requested", deviceCount, device_id_); + if (deviceCount <= device_id_) { + throw std::runtime_error("CUDA detected " + std::to_string(deviceCount) + + " devices, but device " + std::to_string(device_id_) + + " was requested (0-indexed)"); } cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, device_id); - printf("Non-default CUDA device %d with compute capability %d.%d. selected\n", - device, deviceProp.major, deviceProp.minor); + cudaGetDeviceProperties(&deviceProp, device_id_); + printf("Non-default CUDA device %d with compute capability %d.%d selected\n", + device_id_, deviceProp.major, deviceProp.minor); } cudaSetDevice(device_id_); cudaMalloc(&origin_ptr_, allocation_size_); From 9e91dfa0fb55fc4ab1ba9ba75a3ddf7b22b8584f Mon Sep 17 00:00:00 2001 From: tordnat Date: Mon, 4 May 2026 11:02:31 +0000 Subject: [PATCH 6/8] address comments --- symforce/caspar/source/templates/lib.pyi.jinja | 2 +- symforce/caspar/source/templates/solver.cc.jinja | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/symforce/caspar/source/templates/lib.pyi.jinja b/symforce/caspar/source/templates/lib.pyi.jinja index be223997..a208c4de 100644 --- a/symforce/caspar/source/templates/lib.pyi.jinja +++ b/symforce/caspar/source/templates/lib.pyi.jinja @@ -68,7 +68,7 @@ class {{solver.struct_name}}: {% for thing in solver.size_contributors %} {{num_arg_key(thing)}}: int = 0, {% endfor %} - device_id: int = 0 + device_id: int = 0, ): ... def set_params(self, params: SolverParams) -> None: diff --git a/symforce/caspar/source/templates/solver.cc.jinja b/symforce/caspar/source/templates/solver.cc.jinja index 4c596158..2a7e97c5 100644 --- a/symforce/caspar/source/templates/solver.cc.jinja +++ b/symforce/caspar/source/templates/solver.cc.jinja @@ -99,10 +99,6 @@ namespace caspar { " devices, but device " + std::to_string(device_id_) + " was requested (0-indexed)"); } - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, device_id_); - printf("Non-default CUDA device %d with compute capability %d.%d selected\n", - device_id_, deviceProp.major, deviceProp.minor); } cudaSetDevice(device_id_); cudaMalloc(&origin_ptr_, allocation_size_); From f1fd8f5b43fad2379e7e9782111aaa8736c714bc Mon Sep 17 00:00:00 2001 From: tordnat Date: Thu, 7 May 2026 12:37:34 +0200 Subject: [PATCH 7/8] More robust device deduction --- .../source/runtime/pybind_array_tools.cc | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/symforce/caspar/source/runtime/pybind_array_tools.cc b/symforce/caspar/source/runtime/pybind_array_tools.cc index 87c3204f..9e43f198 100644 --- a/symforce/caspar/source/runtime/pybind_array_tools.cc +++ b/symforce/caspar/source/runtime/pybind_array_tools.cc @@ -170,11 +170,20 @@ void AssertUint2Vec(const py::object& obj) { } int GetDeviceId(const py::object& obj) { - py::tuple data = GetInterface(obj)["data"].cast(); - void* ptr = reinterpret_cast(data[0].cast()); - cudaPointerAttributes attrs; - cudaPointerGetAttributes(&attrs, ptr); - return attrs.device; + try { + auto interface = obj.attr("__cuda_array_interface__").cast(); + auto data = interface["data"].cast(); + void* ptr = reinterpret_cast(data[0].cast()); + cudaPointerAttributes attrs; + cudaError_t err = cudaPointerGetAttributes(&attrs, ptr); + if (err != cudaSuccess) { + cudaGetLastError(); + return -1; + } + return attrs.device; + } catch (...) { + return -1; // Fallback if interface or attributes aren't available + } } float* AsFloatPtr(const py::object& obj) { From 92fe0922d71b283d2ae562ba6c5f7bfffc9c4786 Mon Sep 17 00:00:00 2001 From: tordnat Date: Wed, 13 May 2026 16:09:03 +0200 Subject: [PATCH 8/8] Save initial score to initial_score --- symforce/caspar/source/templates/solver.cc.jinja | 1 + 1 file changed, 1 insertion(+) diff --git a/symforce/caspar/source/templates/solver.cc.jinja b/symforce/caspar/source/templates/solver.cc.jinja index 1c847653..4379d6e8 100644 --- a/symforce/caspar/source/templates/solver.cc.jinja +++ b/symforce/caspar/source/templates/solver.cc.jinja @@ -141,6 +141,7 @@ SolveResult {{ solver.struct_name }}::solve(bool print_progress, bool verbose_lo std::chrono::time_point t0 = std::chrono::steady_clock::now(); std::chrono::time_point t_prev = t0; score_best = DoResJacFirst(); + result.initial_score = score_best; if (print_progress) { printf(" score_init: % .6e\n", score_best); }