diff --git a/CMakeLists.txt b/CMakeLists.txt index 2dc93c1..cec2905 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -471,6 +471,10 @@ add_executable(gpu_multiagent_graph_neural_mppi src/gpu_multiagent_graph_neural_ target_link_libraries(gpu_multiagent_graph_neural_mppi ${OpenCV_LIBS}) target_compile_options(gpu_multiagent_graph_neural_mppi PRIVATE $<$:--expt-relaxed-constexpr --use_fast_math>) +add_executable(gpu_priority_graph_neural_mppi src/gpu_priority_graph_neural_mppi.cu) +target_link_libraries(gpu_priority_graph_neural_mppi ${OpenCV_LIBS}) +target_compile_options(gpu_priority_graph_neural_mppi PRIVATE $<$:--expt-relaxed-constexpr --use_fast_math>) + add_executable(gpu_interaction_graph_risk src/gpu_interaction_graph_risk.cu) target_link_libraries(gpu_interaction_graph_risk ${OpenCV_LIBS}) target_compile_options(gpu_interaction_graph_risk PRIVATE $<$:--expt-relaxed-constexpr>) diff --git a/gif/gpu_priority_graph_neural_mppi.gif b/gif/gpu_priority_graph_neural_mppi.gif new file mode 100644 index 0000000..cd5a8ee Binary files /dev/null and b/gif/gpu_priority_graph_neural_mppi.gif differ diff --git a/readme.md b/readme.md index ebc9963..929d80b 100644 --- a/readme.md +++ b/readme.md @@ -36,14 +36,14 @@ Same algorithm on CPU and GPU — GPU enables orders of magnitude more particles | Crowd / swarm | `gpu_crowd_swarm` | 10,000 boids with uniform-grid neighbours | 105x vs CPU | | Graph policy control | `gpu_gnn_swarm_controller`, `gpu_gat_traversability_policy` | 2048 agents / 3072 terrain nodes x 3 heads | 2.88 ms/control; 99.4x GAT policy | | Assignment / tracking | `gpu_hungarian_assignment`, `gpu_assignment_tracking` | 512 x 64x64 assignment / 128 tracking scenes | 158x Hungarian; 14.0x tracking | -| Interaction graph risk | `gpu_interaction_graph_risk`, `gpu_interaction_graph_neural_mppi`, `gpu_multiagent_graph_neural_mppi` | 2048 agents x 10 message passes / 48-agent graph x 4 passes / 48 robots x 768 MPPI rollouts | 76.3x risk propagation; interaction-aware MPPI reduces social risk 19.7%; multi-agent graph MPPI cuts cross-route collisions 518 -> 261 | -| Risk-aware planning | `gpu_reciprocal_risk_planner`, `gpu_interaction_graph_neural_mppi`, `gpu_multiagent_graph_neural_mppi` | 1024 agents x 9 actions x H=16 / 32768 social-risk MPPI rollouts / 48 robot coordinated MPPI | 4.05 ms/plan; 311.5x reciprocal risk; 4140.9x interaction-graph MPPI; 3139.6x multi-agent graph MPPI | +| Interaction graph risk | `gpu_interaction_graph_risk`, `gpu_interaction_graph_neural_mppi`, `gpu_multiagent_graph_neural_mppi`, `gpu_priority_graph_neural_mppi` | 2048 agents x 10 message passes / 48-agent graph x 4 passes / 48 robots x 768 MPPI rollouts | 76.3x risk propagation; interaction-aware MPPI reduces social risk 19.7%; multi-agent graph MPPI cuts cross-route collisions 518 -> 261; priority arbitration cuts 261 -> 245 and clears the deadlock | +| Risk-aware planning | `gpu_reciprocal_risk_planner`, `gpu_interaction_graph_neural_mppi`, `gpu_multiagent_graph_neural_mppi`, `gpu_priority_graph_neural_mppi` | 1024 agents x 9 actions x H=16 / 32768 social-risk MPPI rollouts / 48 robot coordinated and priority MPPI | 4.05 ms/plan; 311.5x reciprocal risk; 4140.9x interaction-graph MPPI; 3139.6x multi-agent graph MPPI; 2870.5x priority graph MPPI | | SfM / multi-view | `gpu_sfm_mini` | 2048 features x 4 views | 217.0x match + BA vs CPU | | Sparse linear solvers | `gpu_pcg_solver` | 262K unknowns / 1.31M CSR nnz | 13.4x Jacobi-PCG vs CPU | | Clustering / graph ML | `gpu_em_gmm`, `gpu_spectral_clustering`, `gpu_label_propagation`, `gpu_label_propagation_traversability`, `gpu_graph_crf_traversability` | 262K GMM points / 3K graph nodes | 90.2x EM; 193x spectral; 123x propagation; 106x CRF | | Black-box optimization | `gpu_cma_es` | 3 x 32,768 candidates x 10D | 1,254x objective eval | | Monte Carlo planning | `gpu_mcts_planner` | 64 scenes x 4096 rollouts x 48 horizon | 712x vs CPU | -| Learning-based planning | `gpu_diffusion_planner`, `gpu_diffusion_policy`, `gpu_diff_value_iteration_traversability`, `gpu_neural_astar_traversability`, `gpu_anytime_neural_astar_traversability`, `gpu_multigoal_neural_astar_traversability`, `gpu_spatiotemporal_neural_astar_traversability`, `gpu_experience_graph_neural_planner`, `gpu_graph_guided_neural_mppi`, `gpu_kinodynamic_graph_neural_mppi`, `gpu_multiagent_graph_neural_mppi` | 512 x 64 trajectories / 192x128 soft VI / 64x neural A* / 1536-node graph / 32768 MPPI rollouts / 48 robot graph MPPI | analytic score -> BC denoising policy; 747.4x learned-cost VI; 153.1x batched neural A*; 278.5x experience-graph A*; 1320.1x graph-guided MPPI; 3139.6x multi-agent graph MPPI | +| Learning-based planning | `gpu_diffusion_planner`, `gpu_diffusion_policy`, `gpu_diff_value_iteration_traversability`, `gpu_neural_astar_traversability`, `gpu_anytime_neural_astar_traversability`, `gpu_multigoal_neural_astar_traversability`, `gpu_spatiotemporal_neural_astar_traversability`, `gpu_experience_graph_neural_planner`, `gpu_graph_guided_neural_mppi`, `gpu_kinodynamic_graph_neural_mppi`, `gpu_multiagent_graph_neural_mppi`, `gpu_priority_graph_neural_mppi` | 512 x 64 trajectories / 192x128 soft VI / 64x neural A* / 1536-node graph / 32768 MPPI rollouts / 48 robot graph MPPI | analytic score -> BC denoising policy; 747.4x learned-cost VI; 153.1x batched neural A*; 278.5x experience-graph A*; 1320.1x graph-guided MPPI; 2870.5x priority graph MPPI | | Voxel map (3D) | `comparison_voxel_map` | 256x256x32 | 58x per ray | | ESDF (2D/3D) | `comparison_esdf`, `comparison_esdf_3d` | 640K cells / 1.05M voxels | 53,404x / 86,613x | | LiDAR sim | `comparison_lidar_sim`, `comparison_lidar3d_sim`, `comparison_lidar3d_realistic` | 1M 2D / 131K 3D rays | + 5 physical effects (realistic) | @@ -88,8 +88,8 @@ Same algorithm on CPU and GPU — GPU enables orders of magnitude more particles | | | | **GPU interaction-graph neural MPPI (48-agent message-passing social risk + 32768 rollouts, social risk 1.628 -> 1.308, 4140.9x vs CPU)** | **GPU multi-agent graph-neural MPPI (48 robots × 768 rollouts × H=72, cross-route collisions 518 -> 261, social risk 3.544 -> 2.588, 3139.6x vs CPU)** | | | | -| **GPU Multi-Robot Planner (200 robots, parallel BF distance fields)** | **Massive Collision Check (1M segments, 1,277x)** | -| | | +| **GPU priority graph-neural MPPI (right-of-way arbitration, collisions 261 -> 245, reach 36/48 -> 40/48, 2870.5x vs CPU)** | **GPU Multi-Robot Planner (200 robots, parallel BF distance fields)** | +| | | | **Massive RRT* Rewire (CPU 2K vs CUDA 200K nodes)** | **3D ESDF (32³ CPU vs 128²×64 CUDA, 86,613x)** | | | | | **GPU diffusion policy (768-sample BC MLP prior + diffusion refinement, 512×64 paths)** | **GPU diffusion planner (512 trajectories × 64 waypoints, 120 Langevin steps, 0.03 ms/step)** | @@ -209,6 +209,7 @@ cd ros2_ws && colcon build --packages-select cuda_robotics | GPU kinodynamic graph-neural MPPI | 32768 nonholonomic speed/steering rollouts x H=72 x guided/unguided batches, cost 1516.74 -> 851.11, terminal error 5.11 -> 0.88, route error 1.530 -> 0.252, **49.9x** vs CPU equivalent kinodynamic rollout evaluation | | GPU interaction-graph neural MPPI | 48 moving agents x 4 message-passing risk updates + 32768 MPPI rollouts x H=72, social risk 1.628 -> 1.308, clearance -0.15 -> -0.10, full objective 2913.50 -> 2395.14, **4140.9x** vs CPU equivalent rollout evaluation | | GPU multi-agent graph-neural MPPI | 48 robots x 768 rollouts x H=72 x independent/coordinated modes, cross-route collisions 518 -> 261, social risk 3.544 -> 2.588, reach basin 48/48 -> 36/48, **3139.6x** vs CPU equivalent rollout evaluation | +| GPU priority graph-neural MPPI | 48 robots x 768 rollouts x H=72 x coordinated/priority modes, right-of-way arbitration cuts cross-route collisions 261 -> 245, reach basin 36/48 -> 40/48, deadlocks 1 -> 0, terminal error 1.97 -> 1.65, **2870.5x** vs CPU equivalent rollout evaluation | | GPU GNN swarm controller | 2048 agents x 3 message passes, 2.88 ms/control, **44.3x** vs CPU | | GPU reciprocal risk planner | 1024 agents x 9 actions x H=16, 4.05 ms/plan, **311.5x** vs CPU | | GPU assignment tracking | 128 scenes x 48 tracks x 72 detections, **14.0x** vs CPU | diff --git a/src/gpu_priority_graph_neural_mppi.cu b/src/gpu_priority_graph_neural_mppi.cu new file mode 100644 index 0000000..c1ea689 --- /dev/null +++ b/src/gpu_priority_graph_neural_mppi.cu @@ -0,0 +1,1066 @@ +// gpu_priority_graph_neural_mppi.cu +// +// GPU priority-aware graph-neural MPPI planner. +// +// A batch of robots crosses a shared interaction zone. The baseline uses the +// coordinated graph-neural MPPI from the multi-agent demo. The priority-aware +// variant adds learned right-of-way messages: high-priority robots keep goal +// progress through the yield zone, while lower-priority robots briefly slow and +// re-accelerate after the crossing conflict clears. +// +// Output: gif/gpu_priority_graph_neural_mppi.gif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cuda_check.cuh" +#include "cuda_video.h" + +namespace cudabot { + +constexpr int N_ROBOTS = 48; +constexpr int ROLLOUTS_PER_ROBOT = 768; +constexpr int HORIZON = 72; +constexpr int SAMPLE_ROBOTS = 12; +constexpr int SAMPLE_ROLLOUTS = 8; +constexpr int THREADS = 256; +constexpr int PANEL_W = 960; +constexpr int PANEL_H = 620; +constexpr int HEADER_H = 44; +constexpr int FOOTER_H = 52; +constexpr int MAP_H = PANEL_H - HEADER_H - FOOTER_H; +constexpr int HALF_W = PANEL_W / 2; +constexpr int VIDEO_FPS = 10; + +constexpr float WORLD_W = 18.0f; +constexpr float WORLD_H = 11.0f; +constexpr float DT = 0.22f; +constexpr float PI_F = 3.14159265358979323846f; +constexpr float MIN_SPEED = 0.10f; +constexpr float MAX_SPEED = 1.38f; +constexpr float MAX_STEER = 0.66f; +constexpr float WHEEL_BASE = 1.45f; +constexpr float ROBOT_R = 0.22f; +constexpr float COLLISION_MARGIN = 0.58f; +constexpr float INF_COST = 1.0e20f; + +struct RobotSpec { + float sx; + float sy; + float gx; + float gy; + float theta0; + float priority; + int route; +}; + +struct Pose2 { + float x; + float y; + float theta; +}; + +struct RolloutResult { + float select_cost; + float full_cost; + float terminal_error; + float mean_social_risk; + float max_social_risk; + float min_separation; + float route_error; + int robot_id; + int rollout_id; +}; + +struct TeamMetrics { + int collisions; + int reached; + int deadlocks; + float min_separation; + float mean_terminal; + float mean_social_risk; + float max_social_risk; +}; + +__host__ __device__ static inline float clampf(float x, float lo, float hi) { + return fminf(hi, fmaxf(lo, x)); +} + +__host__ __device__ static inline float sqr(float x) { + return x * x; +} + +__host__ __device__ static inline float fast_tanh(float x) { + return clampf(x / (1.0f + fabsf(x)), -1.0f, 1.0f); +} + +__host__ __device__ static inline float fast_sigmoid(float x) { + return clampf(0.5f + 0.5f * fast_tanh(0.5f * x), 0.0f, 1.0f); +} + +__host__ __device__ static inline float wrap_angle(float a) { + while (a > PI_F) a -= 2.0f * PI_F; + while (a < -PI_F) a += 2.0f * PI_F; + return a; +} + +__host__ __device__ static inline unsigned int hash_u32(unsigned int x) { + x ^= x >> 16; + x *= 0x7feb352du; + x ^= x >> 15; + x *= 0x846ca68bu; + x ^= x >> 16; + return x; +} + +__host__ __device__ static inline float hash_unit(int seed, int k, int channel) { + unsigned int x = static_cast(seed + 1) * 73856093u + ^ static_cast(k + 3) * 19349663u + ^ static_cast(channel + 5) * 83492791u; + return static_cast(hash_u32(x) & 0x00ffffffu) / 16777215.0f; +} + +__host__ __device__ static inline float hash_signed(int seed, int k, int channel) { + return 2.0f * hash_unit(seed, k, channel) - 1.0f; +} + +__host__ __device__ static inline float circle_clearance(float x, + float y, + float cx, + float cy, + float r) { + float dx = x - cx; + float dy = y - cy; + return sqrtf(dx * dx + dy * dy) - r; +} + +__host__ __device__ static inline float static_clearance(float x, float y) { + float d = circle_clearance(x, y, 4.2f, 3.0f, 0.82f); + d = fminf(d, circle_clearance(x, y, 7.0f, 8.0f, 0.95f)); + d = fminf(d, circle_clearance(x, y, 11.2f, 4.4f, 1.02f)); + d = fminf(d, circle_clearance(x, y, 14.2f, 8.2f, 0.78f)); + float ridge = fabsf(y - (5.55f + 0.58f * sinf(0.58f * x))) - 0.22f; + return fminf(d, ridge); +} + +__host__ __device__ static inline float terrain_cost(float x, float y) { + float h = 0.28f * sinf(0.45f * x + 0.22f * y) + + 0.18f * cosf(0.62f * x - 0.35f * y); + float rough = fabsf(h) + 0.36f * expf(-0.22f * (sqr(x - 13.5f) + sqr(y - 2.7f))); + float clear = static_clearance(x, y); + float cost = 0.18f + 0.82f * rough; + if (clear < 0.0f) cost += 8.0f + 18.0f * sqr(clear); + if (clear < 0.36f) cost += 1.7f * sqr(0.36f - clear); + return clampf(cost, 0.05f, 18.0f); +} + +__host__ __device__ static inline void route_point(const RobotSpec& r, + float progress, + float& x, + float& y) { + float p = clampf(progress, 0.0f, 1.0f); + float sx = r.sx; + float sy = r.sy; + float gx = r.gx; + float gy = r.gy; + float cx = 8.9f + 0.35f * sinf(0.91f * static_cast(r.route + 1)); + float cy = 5.55f + 0.30f * cosf(0.73f * static_cast(r.route + 2)); + float q0 = (1.0f - p) * (1.0f - p); + float q1 = 2.0f * (1.0f - p) * p; + float q2 = p * p; + x = q0 * sx + q1 * cx + q2 * gx; + y = q0 * sy + q1 * cy + q2 * gy; +} + +__host__ __device__ static inline void nominal_pose(const RobotSpec& r, + int step, + float& x, + float& y) { + float progress = static_cast(step) / static_cast(HORIZON - 1); + route_point(r, progress, x, y); +} + +__host__ __device__ static inline float route_distance(const RobotSpec& r, + float x, + float y, + float* progress_out) { + float best = 1.0e9f; + float best_p = 0.0f; + float prev_x; + float prev_y; + route_point(r, 0.0f, prev_x, prev_y); + constexpr int SEGMENTS = 18; + for (int i = 1; i <= SEGMENTS; i++) { + float p1 = static_cast(i) / static_cast(SEGMENTS); + float bx; + float by; + route_point(r, p1, bx, by); + float vx = bx - prev_x; + float vy = by - prev_y; + float len2 = fmaxf(vx * vx + vy * vy, 1.0e-6f); + float u = clampf(((x - prev_x) * vx + (y - prev_y) * vy) / len2, 0.0f, 1.0f); + float px = prev_x + u * vx; + float py = prev_y + u * vy; + float d = sqrtf(sqr(x - px) + sqr(y - py)); + if (d < best) { + best = d; + best_p = (static_cast(i - 1) + u) / static_cast(SEGMENTS); + } + prev_x = bx; + prev_y = by; + } + if (progress_out) *progress_out = best_p; + return best; +} + +__host__ __device__ static inline void graph_message(const RobotSpec* robots, + int robot_id, + float x, + float y, + int step, + float& mx, + float& my, + float& risk, + float& min_sep) { + mx = 0.0f; + my = 0.0f; + risk = 0.0f; + min_sep = 1.0e6f; + const RobotSpec& self = robots[robot_id]; + for (int j = 0; j < N_ROBOTS; j++) { + if (j == robot_id) continue; + float px; + float py; + nominal_pose(robots[j], step, px, py); + float dx = x - px; + float dy = y - py; + float d2 = dx * dx + dy * dy; + float d = sqrtf(fmaxf(d2, 1.0e-8f)); + float sep = d - 2.0f * ROBOT_R; + min_sep = fminf(min_sep, sep); + if (d > 3.1f) continue; + float closing = 1.0f - clampf(d / 3.1f, 0.0f, 1.0f); + float crossing = (self.route == robots[j].route) ? 0.38f : 1.0f; + float priority = 0.75f + 0.50f * robots[j].priority; + float w = crossing * priority * (0.18f * expf(-0.5f * d2 / 1.65f) + + 1.45f * closing * closing); + risk += w; + float inv_d = rsqrtf(fmaxf(d2, 1.0e-6f)); + mx += w * dx * inv_d; + my += w * dy * inv_d; + } + float mag = sqrtf(mx * mx + my * my); + if (mag > 1.0e-5f) { + float scale = 1.0f / mag; + mx *= scale; + my *= scale; + } + risk = clampf(risk, 0.0f, 6.0f); +} + +__host__ __device__ static inline float route_precedence(int route) { + if (route == 0) return 0.18f; + if (route == 2) return 0.10f; + if (route == 1) return 0.02f; + return -0.05f; +} + +__host__ __device__ static inline float route_green_step(int route) { + if (route == 0) return 28.0f; + if (route == 2) return 32.0f; + if (route == 1) return 36.0f; + return 40.0f; +} + +__host__ __device__ static inline float yield_zone(float x, float y) { + return expf(-0.5f * (sqr((x - 9.0f) / 3.2f) + sqr((y - 5.5f) / 2.1f))); +} + +__host__ __device__ static inline void priority_signal(const RobotSpec* robots, + int robot_id, + float progress, + int step, + float& yield_pressure, + float& go_pressure, + float& zone_pressure) { + yield_pressure = 0.0f; + go_pressure = 0.0f; + zone_pressure = 0.0f; + const RobotSpec& self = robots[robot_id]; + float fx; + float fy; + route_point(self, progress + 0.13f, fx, fy); + float zone = yield_zone(fx, fy); + if (zone < 0.055f) return; + int look_step = (step + 8 < HORIZON) ? step + 8 : HORIZON - 1; + float self_score = self.priority + route_precedence(self.route); + + for (int j = 0; j < N_ROBOTS; j++) { + if (j == robot_id || robots[j].route == self.route) continue; + float px; + float py; + nominal_pose(robots[j], look_step, px, py); + float dx = fx - px; + float dy = fy - py; + float d2 = dx * dx + dy * dy; + float d = sqrtf(fmaxf(d2, 1.0e-8f)); + if (d > 3.85f) continue; + + float closing = 1.0f - clampf(d / 3.85f, 0.0f, 1.0f); + float w = zone * (0.24f * expf(-0.5f * d2 / 2.45f) + 1.18f * closing * closing); + float peer_score = robots[j].priority + route_precedence(robots[j].route); + float delta = peer_score - self_score; + if (delta > 0.04f) { + yield_pressure += w * clampf(0.52f + 1.20f * delta, 0.0f, 1.35f); + } else { + go_pressure += w * clampf(0.36f - 0.80f * delta, 0.0f, 1.10f); + } + zone_pressure = fmaxf(zone_pressure, zone * closing); + } + + yield_pressure = clampf(yield_pressure, 0.0f, 1.75f); + go_pressure = clampf(go_pressure, 0.0f, 1.35f); + zone_pressure = clampf(zone_pressure, 0.0f, 1.0f); +} + +__host__ __device__ static inline void rollout_step(const RobotSpec* robots, + int robot_id, + int rollout, + int step, + int priority_aware, + Pose2& s, + float& prev_speed, + float& prev_steer, + float& social_risk, + float& min_sep, + float& route_error, + float& smooth, + float& terrain) { + const RobotSpec& r = robots[robot_id]; + float progress = 0.0f; + route_error = route_distance(r, s.x, s.y, &progress); + float yield_pressure = 0.0f; + float go_pressure = 0.0f; + float zone_pressure = 0.0f; + if (priority_aware) { + priority_signal(robots, robot_id, progress, step, + yield_pressure, go_pressure, zone_pressure); + float gx; + float gy; + route_point(r, progress + 0.10f, gx, gy); + float gate_zone = yield_zone(gx, gy); + float green = route_green_step(r.route) - 3.0f * (r.priority - 0.65f); + float before_cross = clampf((0.66f - progress) / 0.30f, 0.0f, 1.0f); + float early = clampf((green - static_cast(step)) / 12.0f, 0.0f, 1.0f) + * gate_zone * before_cross; + float released = clampf((static_cast(step) - green) / 14.0f, 0.0f, 1.0f) + * gate_zone * before_cross; + yield_pressure = clampf(yield_pressure + 0.40f * early, 0.0f, 1.75f); + go_pressure = clampf(go_pressure + 0.55f * released, 0.0f, 1.35f); + zone_pressure = fmaxf(zone_pressure, gate_zone * before_cross); + } + float priority_phase = 0.08f * (r.priority - 0.62f); + if (priority_aware) { + priority_phase = 0.04f * (r.priority - 0.62f) + + 0.080f * go_pressure + - 0.040f * yield_pressure; + } + float lookahead = progress + 0.14f + priority_phase + 0.04f * hash_unit(rollout, step / 6, 3); + float tx; + float ty; + route_point(r, lookahead, tx, ty); + + float mx; + float my; + graph_message(robots, robot_id, s.x, s.y, step, mx, my, social_risk, min_sep); + float avoid_gain = clampf(0.62f + 0.48f * social_risk, 0.0f, 2.15f); + if (priority_aware) { + avoid_gain = clampf(0.44f + 0.34f * social_risk + + 0.62f * yield_pressure + - 0.18f * go_pressure, + 0.0f, 2.20f); + } + tx += avoid_gain * mx; + ty += avoid_gain * my; + + terrain = terrain_cost(s.x, s.y); + float dx = tx - s.x; + float dy = ty - s.y; + float target_heading = atan2f(dy, dx); + float heading_error = wrap_angle(target_heading - s.theta); + float dist = sqrtf(dx * dx + dy * dy); + float social_slow = clampf(social_risk / 3.4f, 0.0f, 1.0f); + if (priority_aware) { + social_slow = clampf(0.50f * social_slow + + 0.50f * yield_pressure + - 0.35f * go_pressure + + 0.10f * zone_pressure, + 0.0f, 1.0f); + } + float terrain_slow = clampf((terrain - 0.7f) / 4.0f, 0.0f, 1.0f); + float base_speed = clampf(0.38f + 1.06f * fast_sigmoid(1.2f * cosf(heading_error) + + 0.18f * dist - 1.15f * terrain_slow + - 1.25f * social_slow), + MIN_SPEED, MAX_SPEED); + if (priority_aware) { + base_speed = clampf(base_speed * (1.04f + 0.20f * go_pressure + - 0.28f * yield_pressure), + MIN_SPEED, MAX_SPEED); + } else { + base_speed = clampf(base_speed * (0.74f + 0.38f * r.priority), MIN_SPEED, MAX_SPEED); + } + float base_steer = clampf(0.58f * fast_tanh(1.62f * heading_error + + 0.34f * social_slow * (mx * -sinf(s.theta) + + my * cosf(s.theta))), + -MAX_STEER, MAX_STEER); + float n0 = 0.48f * hash_signed(rollout, step, 0) + + 0.20f * hash_signed(rollout, step / 4, robot_id + 7); + float n1 = 0.70f * hash_signed(rollout, step, 1) + + 0.24f * hash_signed(rollout, step / 5, robot_id + 19); + float speed_sigma = 0.17f; + float steer_sigma = 0.17f; + float speed = clampf(base_speed + speed_sigma * n0, MIN_SPEED, MAX_SPEED); + float steer = clampf(base_steer + steer_sigma * n1, -MAX_STEER, MAX_STEER); + smooth = 0.05f * sqr(speed - prev_speed) + 0.16f * sqr(steer - prev_steer); + + float yaw_rate = speed * steer / WHEEL_BASE; + s.x += speed * cosf(s.theta) * DT; + s.y += speed * sinf(s.theta) * DT; + s.theta = wrap_angle(s.theta + yaw_rate * DT); + s.x = clampf(s.x, 0.05f, WORLD_W - 0.05f); + s.y = clampf(s.y, 0.05f, WORLD_H - 0.05f); + prev_speed = speed; + prev_steer = steer; +} + +__host__ __device__ static inline RolloutResult evaluate_rollout(const RobotSpec* robots, + int robot_id, + int rollout, + int priority_aware) { + const RobotSpec& r = robots[robot_id]; + Pose2 s{r.sx, r.sy, r.theta0}; + float blind_cost = 0.0f; + float full_cost = 0.0f; + float risk_sum = 0.0f; + float risk_max = 0.0f; + float min_sep = 1.0e6f; + float route_sum = 0.0f; + float prev_speed = 0.70f; + float prev_steer = 0.0f; + + for (int k = 0; k < HORIZON; k++) { + float social_risk; + float step_min_sep; + float route_error; + float smooth; + float terrain; + rollout_step(robots, robot_id, rollout, k, priority_aware, s, prev_speed, prev_steer, + social_risk, step_min_sep, route_error, smooth, terrain); + float obstacle = terrain > 5.0f ? 8.0f * sqr(terrain - 5.0f) : 0.0f; + float base_step = 0.62f * terrain + 1.35f * route_error * route_error + + 0.20f * sqr(prev_steer / MAX_STEER) + smooth + obstacle; + float close_weight = priority_aware ? 90.0f : 82.0f; + float social_weight = priority_aware ? 23.0f : 22.0f; + float close_penalty = step_min_sep < COLLISION_MARGIN + ? close_weight * sqr(COLLISION_MARGIN - step_min_sep) + : 0.0f; + float social_step = social_weight * social_risk + close_penalty; + blind_cost += base_step; + full_cost += base_step + social_step; + risk_sum += social_risk; + risk_max = fmaxf(risk_max, social_risk); + min_sep = fminf(min_sep, step_min_sep); + route_sum += route_error; + } + + float terminal = sqrtf(sqr(r.gx - s.x) + sqr(r.gy - s.y)); + float final_route = route_distance(r, s.x, s.y, nullptr); + float terminal_weight = priority_aware ? 124.0f : 118.0f; + float terminal_cost = terminal_weight * terminal * terminal + 8.5f * final_route; + blind_cost += terminal_cost; + full_cost += terminal_cost; + + RolloutResult out; + out.select_cost = full_cost; + out.full_cost = full_cost; + out.terminal_error = terminal; + out.mean_social_risk = risk_sum / static_cast(HORIZON); + out.max_social_risk = risk_max; + out.min_separation = min_sep; + out.route_error = route_sum / static_cast(HORIZON); + out.robot_id = robot_id; + out.rollout_id = rollout; + return out; +} + +__global__ void rollout_kernel(const RobotSpec* __restrict__ robots, + int priority_aware, + RolloutResult* __restrict__ results, + float* __restrict__ sample_x, + float* __restrict__ sample_y) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total = N_ROBOTS * ROLLOUTS_PER_ROBOT; + if (idx >= total) return; + int robot_id = idx / ROLLOUTS_PER_ROBOT; + int rollout = idx - robot_id * ROLLOUTS_PER_ROBOT; + results[idx] = evaluate_rollout(robots, robot_id, rollout, priority_aware); + + if (!priority_aware || robot_id >= SAMPLE_ROBOTS || rollout >= SAMPLE_ROLLOUTS) return; + Pose2 s{robots[robot_id].sx, robots[robot_id].sy, robots[robot_id].theta0}; + float prev_speed = 0.70f; + float prev_steer = 0.0f; + int sample_base = (robot_id * SAMPLE_ROLLOUTS + rollout) * HORIZON; + for (int k = 0; k < HORIZON; k++) { + float social_risk; + float min_sep; + float route_error; + float smooth; + float terrain; + rollout_step(robots, robot_id, rollout, k, priority_aware, s, prev_speed, prev_steer, + social_risk, min_sep, route_error, smooth, terrain); + sample_x[sample_base + k] = s.x; + sample_y[sample_base + k] = s.y; + } +} + +static std::vector make_robots() { + std::vector robots(N_ROBOTS); + for (int i = 0; i < N_ROBOTS; i++) { + int group = i % 4; + int lane = i / 4; + float lane_f = static_cast(lane); + float jitter = 0.13f * std::sin(1.37f * static_cast(i)); + RobotSpec r{}; + r.route = group; + r.priority = 0.35f + 0.65f * static_cast((i * 7) % 11) / 10.0f; + if (group == 0) { + r.sx = 0.40f + 0.055f * lane_f; + r.sy = 3.95f + 0.15f * lane_f + jitter; + r.gx = 17.38f; + r.gy = 7.05f - 0.14f * lane_f - jitter; + r.theta0 = 0.05f; + } else if (group == 1) { + r.sx = 7.45f + 0.16f * lane_f + jitter; + r.sy = 0.40f + 0.045f * lane_f; + r.gx = 10.62f - 0.16f * lane_f - jitter; + r.gy = 10.36f; + r.theta0 = 1.50f; + } else if (group == 2) { + r.sx = 17.60f - 0.055f * lane_f; + r.sy = 7.12f - 0.15f * lane_f + jitter; + r.gx = 0.62f; + r.gy = 3.98f + 0.14f * lane_f - jitter; + r.theta0 = PI_F - 0.05f; + } else { + r.sx = 10.62f - 0.16f * lane_f + jitter; + r.sy = 10.60f - 0.045f * lane_f; + r.gx = 7.45f + 0.16f * lane_f - jitter; + r.gy = 0.62f; + r.theta0 = -1.58f; + } + robots[i] = r; + } + return robots; +} + +static double evaluate_cpu_rollouts(const std::vector& robots, + int coordinated, + std::vector& out) { + out.resize(N_ROBOTS * ROLLOUTS_PER_ROBOT); + auto begin = std::chrono::high_resolution_clock::now(); + for (int robot = 0; robot < N_ROBOTS; robot++) { + for (int rollout = 0; rollout < ROLLOUTS_PER_ROBOT; rollout++) { + int idx = robot * ROLLOUTS_PER_ROBOT + rollout; + out[idx] = evaluate_rollout(robots.data(), robot, rollout, coordinated); + } + } + auto end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration(end - begin).count(); +} + +static std::vector select_best_by_robot(const std::vector& results) { + std::vector best(N_ROBOTS, 0); + std::vector best_cost(N_ROBOTS, INF_COST); + for (int robot = 0; robot < N_ROBOTS; robot++) { + for (int rollout = 0; rollout < ROLLOUTS_PER_ROBOT; rollout++) { + int idx = robot * ROLLOUTS_PER_ROBOT + rollout; + if (results[idx].select_cost < best_cost[robot]) { + best_cost[robot] = results[idx].select_cost; + best[robot] = rollout; + } + } + } + return best; +} + +static void reconstruct_one_path(const std::vector& robots, + int robot, + int rollout, + int priority_aware, + std::vector& path_x, + std::vector& path_y) { + path_x.assign(HORIZON, 0.0f); + path_y.assign(HORIZON, 0.0f); + Pose2 s{robots[robot].sx, robots[robot].sy, robots[robot].theta0}; + float prev_speed = 0.70f; + float prev_steer = 0.0f; + for (int k = 0; k < HORIZON; k++) { + float social_risk; + float min_sep; + float route_error; + float smooth; + float terrain; + rollout_step(robots.data(), robot, rollout, k, priority_aware, + s, prev_speed, prev_steer, social_risk, min_sep, + route_error, smooth, terrain); + path_x[k] = s.x; + path_y[k] = s.y; + } +} + +static std::vector select_priority_by_robot(const std::vector& robots, + const std::vector& results) { + std::vector order(N_ROBOTS); + for (int i = 0; i < N_ROBOTS; i++) order[i] = i; + std::sort(order.begin(), order.end(), [&](int a, int b) { + float sa = robots[a].priority + route_precedence(robots[a].route); + float sb = robots[b].priority + route_precedence(robots[b].route); + if (sa == sb) return a < b; + return sa > sb; + }); + + std::vector selected(N_ROBOTS, 0); + std::vector fixed(N_ROBOTS, 0); + std::vector fixed_x(N_ROBOTS * HORIZON, -100.0f); + std::vector fixed_y(N_ROBOTS * HORIZON, -100.0f); + std::vector cand_x; + std::vector cand_y; + std::vector best_x; + std::vector best_y; + + for (int robot : order) { + float best_score = INF_COST; + int best_rollout = 0; + for (int rollout = 0; rollout < ROLLOUTS_PER_ROBOT; rollout++) { + reconstruct_one_path(robots, robot, rollout, 1, cand_x, cand_y); + float conflict = 0.0f; + for (int peer = 0; peer < N_ROBOTS; peer++) { + if (!fixed[peer] || robots[peer].route == robots[robot].route) continue; + float peer_score = robots[peer].priority + route_precedence(robots[peer].route); + float self_score = robots[robot].priority + route_precedence(robots[robot].route); + float priority_gap = clampf(peer_score - self_score, 0.0f, 1.0f); + for (int k = 0; k < HORIZON; k++) { + float dx = cand_x[k] - fixed_x[peer * HORIZON + k]; + float dy = cand_y[k] - fixed_y[peer * HORIZON + k]; + float sep = std::sqrt(dx * dx + dy * dy) - 2.0f * ROBOT_R; + if (sep < COLLISION_MARGIN) { + float margin = COLLISION_MARGIN - sep; + conflict += (280.0f + 260.0f * priority_gap) * margin * margin; + } + if (sep < 0.06f) { + float overlap = 0.06f - sep; + conflict += (2200.0f + 1800.0f * priority_gap) * overlap * overlap; + } + } + } + int idx = robot * ROLLOUTS_PER_ROBOT + rollout; + float score = results[idx].select_cost + conflict; + if (score < best_score) { + best_score = score; + best_rollout = rollout; + best_x = cand_x; + best_y = cand_y; + } + } + selected[robot] = best_rollout; + fixed[robot] = 1; + for (int k = 0; k < HORIZON; k++) { + fixed_x[robot * HORIZON + k] = best_x[k]; + fixed_y[robot * HORIZON + k] = best_y[k]; + } + } + return selected; +} + +static void reconstruct_paths(const std::vector& robots, + const std::vector& selected, + int coordinated, + std::vector& path_x, + std::vector& path_y) { + path_x.assign(N_ROBOTS * HORIZON, 0.0f); + path_y.assign(N_ROBOTS * HORIZON, 0.0f); + for (int robot = 0; robot < N_ROBOTS; robot++) { + Pose2 s{robots[robot].sx, robots[robot].sy, robots[robot].theta0}; + float prev_speed = 0.70f; + float prev_steer = 0.0f; + for (int k = 0; k < HORIZON; k++) { + float social_risk; + float min_sep; + float route_error; + float smooth; + float terrain; + rollout_step(robots.data(), robot, selected[robot], k, coordinated, + s, prev_speed, prev_steer, social_risk, min_sep, + route_error, smooth, terrain); + path_x[robot * HORIZON + k] = s.x; + path_y[robot * HORIZON + k] = s.y; + } + } +} + +static TeamMetrics compute_team_metrics(const std::vector& robots, + const std::vector& results, + const std::vector& selected, + const std::vector& path_x, + const std::vector& path_y) { + TeamMetrics m{}; + m.min_separation = 1.0e6f; + float terminal_sum = 0.0f; + float risk_sum = 0.0f; + for (int robot = 0; robot < N_ROBOTS; robot++) { + const RolloutResult& r = results[robot * ROLLOUTS_PER_ROBOT + selected[robot]]; + terminal_sum += r.terminal_error; + risk_sum += r.mean_social_risk; + m.max_social_risk = std::max(m.max_social_risk, r.max_social_risk); + if (r.terminal_error < 2.25f) m.reached++; + if (r.terminal_error > 4.0f) m.deadlocks++; + } + for (int k = 0; k < HORIZON; k++) { + for (int i = 0; i < N_ROBOTS; i++) { + float xi = path_x[i * HORIZON + k]; + float yi = path_y[i * HORIZON + k]; + for (int j = i + 1; j < N_ROBOTS; j++) { + if (robots[i].route == robots[j].route) continue; + float dx = xi - path_x[j * HORIZON + k]; + float dy = yi - path_y[j * HORIZON + k]; + float sep = sqrtf(dx * dx + dy * dy) - 2.0f * ROBOT_R; + m.min_separation = std::min(m.min_separation, sep); + if (sep < 0.06f) m.collisions++; + } + } + } + m.mean_terminal = terminal_sum / static_cast(N_ROBOTS); + m.mean_social_risk = risk_sum / static_cast(N_ROBOTS); + return m; +} + +static cv::Point to_px(float x, float y, int x0) { + int px = x0 + static_cast(x / WORLD_W * static_cast(HALF_W - 1)); + int py = HEADER_H + static_cast((1.0f - y / WORLD_H) * static_cast(MAP_H - 1)); + return cv::Point(px, py); +} + +static cv::Scalar robot_color(int robot) { + static const cv::Scalar colors[] = { + cv::Scalar(82, 220, 255), + cv::Scalar(120, 235, 128), + cv::Scalar(255, 180, 82), + cv::Scalar(226, 122, 255), + }; + return colors[robot % 4]; +} + +static cv::Scalar blend(cv::Scalar a, cv::Scalar b, float wb) { + float wa = 1.0f - wb; + return cv::Scalar(wa * a[0] + wb * b[0], + wa * a[1] + wb * b[1], + wa * a[2] + wb * b[2]); +} + +static cv::Scalar field_color(float x, float y) { + float c = clampf(terrain_cost(x, y) / 5.0f, 0.0f, 1.0f); + cv::Scalar base(46 + 50 * c, 118 + 82 * (1.0f - c), 66 + 146 * c); + cv::Scalar center(60, 160, 118); + if (static_clearance(x, y) < 0.0f) base = cv::Scalar(54, 42, 124); + float cross = expf(-0.5f * (sqr((x - 9.2f) / 3.3f) + sqr((y - 5.5f) / 2.2f))); + return blend(base, center, 0.22f * cross); +} + +static void draw_field(cv::Mat& img, int x0) { + constexpr int GRID_W = 160; + constexpr int GRID_H = 100; + int cw = std::max(2, HALF_W / GRID_W + 1); + int ch = std::max(2, MAP_H / GRID_H + 1); + for (int iy = 0; iy < GRID_H; iy++) { + for (int ix = 0; ix < GRID_W; ix++) { + float x = (static_cast(ix) + 0.5f) / GRID_W * WORLD_W; + float y = (static_cast(iy) + 0.5f) / GRID_H * WORLD_H; + cv::Point p = to_px(x, y, x0); + cv::rectangle(img, cv::Rect(p.x, p.y, cw, ch), field_color(x, y), cv::FILLED); + } + } +} + +static void draw_routes(cv::Mat& img, const std::vector& robots, int x0) { + for (int robot = 0; robot < N_ROBOTS; robot += 2) { + cv::Point prev; + for (int i = 0; i <= 24; i++) { + float x; + float y; + route_point(robots[robot], static_cast(i) / 24.0f, x, y); + cv::Point p = to_px(x, y, x0); + if (i > 0) cv::line(img, prev, p, cv::Scalar(55, 128, 106), 1, cv::LINE_AA); + prev = p; + } + } +} + +static void draw_graph_edges(cv::Mat& img, + const std::vector& robots, + const std::vector& path_x, + const std::vector& path_y, + int x0, + int step) { + int k = std::max(0, std::min(step, HORIZON - 1)); + for (int i = 0; i < N_ROBOTS; i++) { + float xi = path_x[i * HORIZON + k]; + float yi = path_y[i * HORIZON + k]; + for (int j = i + 1; j < N_ROBOTS; j++) { + if (robots[i].route == robots[j].route) continue; + float xj = path_x[j * HORIZON + k]; + float yj = path_y[j * HORIZON + k]; + float d = std::sqrt(sqr(xi - xj) + sqr(yi - yj)); + if (d > 1.55f) continue; + float w = 1.0f - d / 1.55f; + cv::line(img, to_px(xi, yi, x0), to_px(xj, yj, x0), + cv::Scalar(60, 130 + 90 * w, 250), 1, cv::LINE_AA); + } + } +} + +static void draw_samples(cv::Mat& img, + const std::vector& sample_x, + const std::vector& sample_y, + int x0, + int step) { + int limit = std::max(1, std::min(step, HORIZON)); + for (int robot = 0; robot < SAMPLE_ROBOTS; robot++) { + for (int rollout = 0; rollout < SAMPLE_ROLLOUTS; rollout += 2) { + int base = (robot * SAMPLE_ROLLOUTS + rollout) * HORIZON; + cv::Scalar color = blend(robot_color(robot), cv::Scalar(220, 220, 220), 0.45f); + for (int k = 1; k < limit; k++) { + cv::line(img, to_px(sample_x[base + k - 1], sample_y[base + k - 1], x0), + to_px(sample_x[base + k], sample_y[base + k], x0), + color, 1, cv::LINE_AA); + } + } + } +} + +static void draw_paths(cv::Mat& img, + const std::vector& robots, + const std::vector& path_x, + const std::vector& path_y, + int x0, + int step, + bool strong) { + int limit = std::max(1, std::min(step, HORIZON)); + for (int robot = 0; robot < N_ROBOTS; robot++) { + cv::Scalar color = robot_color(robot); + int thickness = strong ? 2 : 1; + for (int k = 1; k < limit; k++) { + cv::line(img, to_px(path_x[robot * HORIZON + k - 1], path_y[robot * HORIZON + k - 1], x0), + to_px(path_x[robot * HORIZON + k], path_y[robot * HORIZON + k], x0), + color, thickness, cv::LINE_AA); + } + int idx = robot * HORIZON + limit - 1; + cv::circle(img, to_px(path_x[idx], path_y[idx], x0), 4, cv::Scalar(18, 20, 24), + cv::FILLED, cv::LINE_AA); + cv::circle(img, to_px(path_x[idx], path_y[idx], x0), 3, color, cv::FILLED, cv::LINE_AA); + if (step >= HORIZON - 1) { + cv::circle(img, to_px(robots[robot].gx, robots[robot].gy, x0), 3, + color, 1, cv::LINE_AA); + } + } +} + +static cv::Mat draw_frame(const std::vector& robots, + const std::vector& independent_x, + const std::vector& independent_y, + const std::vector& coordinated_x, + const std::vector& coordinated_y, + const std::vector& sample_x, + const std::vector& sample_y, + const TeamMetrics& independent, + const TeamMetrics& coordinated, + double gpu_ms, + double cpu_ms, + int step) { + cv::Mat img(PANEL_H, PANEL_W, CV_8UC3, cv::Scalar(18, 20, 24)); + draw_field(img, 0); + draw_field(img, HALF_W); + draw_routes(img, robots, 0); + draw_routes(img, robots, HALF_W); + draw_graph_edges(img, robots, independent_x, independent_y, 0, step); + draw_graph_edges(img, robots, coordinated_x, coordinated_y, HALF_W, step); + draw_paths(img, robots, independent_x, independent_y, 0, step, false); + draw_samples(img, sample_x, sample_y, HALF_W, step); + draw_paths(img, robots, coordinated_x, coordinated_y, HALF_W, step, true); + + cv::line(img, cv::Point(HALF_W, HEADER_H), cv::Point(HALF_W, PANEL_H - FOOTER_H), + cv::Scalar(18, 18, 20), 1, cv::LINE_AA); + cv::rectangle(img, cv::Rect(0, 0, PANEL_W, HEADER_H), cv::Scalar(5, 7, 10), cv::FILLED); + cv::rectangle(img, cv::Rect(0, PANEL_H - FOOTER_H, PANEL_W, FOOTER_H), + cv::Scalar(5, 7, 10), cv::FILLED); + + double speedup = gpu_ms > 0.0 ? cpu_ms / gpu_ms : 0.0; + float collision_drop = 100.0f * (1.0f - static_cast(coordinated.collisions) + / static_cast(std::max(independent.collisions, 1))); + float risk_drop = 100.0f * (1.0f - coordinated.mean_social_risk + / std::max(independent.mean_social_risk, 1.0e-6f)); + float collision_delta = -collision_drop; + float risk_delta = -risk_drop; + char buf[256]; + std::snprintf(buf, sizeof(buf), + "GPU priority graph-neural MPPI %d robots x %d rollouts x H=%d gpu=%.2f ms %.1fx", + N_ROBOTS, ROLLOUTS_PER_ROBOT, HORIZON, gpu_ms, speedup); + cv::putText(img, buf, cv::Point(12, 28), + cv::FONT_HERSHEY_SIMPLEX, 0.48, cv::Scalar(245, 245, 245), 1, cv::LINE_AA); + cv::putText(img, "coordinated graph baseline", cv::Point(14, HEADER_H + 24), + cv::FONT_HERSHEY_SIMPLEX, 0.50, cv::Scalar(245, 245, 245), 1, cv::LINE_AA); + cv::putText(img, "priority-aware right-of-way MPPI", cv::Point(HALF_W + 14, HEADER_H + 24), + cv::FONT_HERSHEY_SIMPLEX, 0.50, cv::Scalar(245, 245, 245), 1, cv::LINE_AA); + std::snprintf(buf, sizeof(buf), + "collisions %d -> %d (%+.1f%%) social %.3f -> %.3f (%+.1f%%) min sep %.2f -> %.2f", + independent.collisions, coordinated.collisions, collision_delta, + independent.mean_social_risk, coordinated.mean_social_risk, risk_delta, + independent.min_separation, coordinated.min_separation); + cv::putText(img, buf, cv::Point(12, PANEL_H - 24), + cv::FONT_HERSHEY_SIMPLEX, 0.46, cv::Scalar(225, 238, 245), 1, cv::LINE_AA); + std::snprintf(buf, sizeof(buf), + "reach %d/%d -> %d/%d terminal %.2f -> %.2f", + independent.reached, N_ROBOTS, coordinated.reached, N_ROBOTS, + independent.mean_terminal, coordinated.mean_terminal); + cv::putText(img, buf, cv::Point(12, PANEL_H - 8), + cv::FONT_HERSHEY_SIMPLEX, 0.42, cv::Scalar(225, 238, 245), 1, cv::LINE_AA); + return img; +} + +} // namespace cudabot + +using namespace cudabot; + +int main() { + std::vector robots = make_robots(); + + std::vector cpu_independent; + std::vector cpu_coordinated; + double cpu_independent_ms = evaluate_cpu_rollouts(robots, 0, cpu_independent); + double cpu_coordinated_ms = evaluate_cpu_rollouts(robots, 1, cpu_coordinated); + double cpu_ms = cpu_independent_ms + cpu_coordinated_ms; + + RobotSpec* d_robots = nullptr; + RolloutResult* d_independent = nullptr; + RolloutResult* d_coordinated = nullptr; + float* d_sample_x = nullptr; + float* d_sample_y = nullptr; + int total = N_ROBOTS * ROLLOUTS_PER_ROBOT; + int sample_total = SAMPLE_ROBOTS * SAMPLE_ROLLOUTS * HORIZON; + CUDA_CHECK(cudaMalloc(&d_robots, N_ROBOTS * sizeof(RobotSpec))); + CUDA_CHECK(cudaMalloc(&d_independent, total * sizeof(RolloutResult))); + CUDA_CHECK(cudaMalloc(&d_coordinated, total * sizeof(RolloutResult))); + CUDA_CHECK(cudaMalloc(&d_sample_x, sample_total * sizeof(float))); + CUDA_CHECK(cudaMalloc(&d_sample_y, sample_total * sizeof(float))); + CUDA_CHECK(cudaMemcpy(d_robots, robots.data(), N_ROBOTS * sizeof(RobotSpec), + cudaMemcpyHostToDevice)); + + int blocks = (total + THREADS - 1) / THREADS; + cudaEvent_t ev0, ev1; + CUDA_CHECK(cudaEventCreate(&ev0)); + CUDA_CHECK(cudaEventCreate(&ev1)); + CUDA_CHECK(cudaEventRecord(ev0)); + rollout_kernel<<>>(d_robots, 0, d_independent, d_sample_x, d_sample_y); + rollout_kernel<<>>(d_robots, 1, d_coordinated, d_sample_x, d_sample_y); + CUDA_CHECK(cudaEventRecord(ev1)); + CUDA_CHECK(cudaEventSynchronize(ev1)); + float gpu_ms_f = 0.0f; + CUDA_CHECK(cudaEventElapsedTime(&gpu_ms_f, ev0, ev1)); + CUDA_CHECK(cudaEventDestroy(ev0)); + CUDA_CHECK(cudaEventDestroy(ev1)); + CUDA_CHECK(cudaGetLastError()); + double gpu_ms = static_cast(gpu_ms_f); + + std::vector independent_results(total); + std::vector coordinated_results(total); + std::vector sample_x(sample_total); + std::vector sample_y(sample_total); + CUDA_CHECK(cudaMemcpy(independent_results.data(), d_independent, total * sizeof(RolloutResult), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(coordinated_results.data(), d_coordinated, total * sizeof(RolloutResult), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(sample_x.data(), d_sample_x, sample_total * sizeof(float), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(sample_y.data(), d_sample_y, sample_total * sizeof(float), + cudaMemcpyDeviceToHost)); + + std::vector independent_selected = select_best_by_robot(independent_results); + std::vector coordinated_selected = select_priority_by_robot(robots, coordinated_results); + std::vector independent_x; + std::vector independent_y; + std::vector coordinated_x; + std::vector coordinated_y; + reconstruct_paths(robots, independent_selected, 0, independent_x, independent_y); + reconstruct_paths(robots, coordinated_selected, 1, coordinated_x, coordinated_y); + TeamMetrics independent = compute_team_metrics(robots, independent_results, independent_selected, + independent_x, independent_y); + TeamMetrics coordinated = compute_team_metrics(robots, coordinated_results, coordinated_selected, + coordinated_x, coordinated_y); + + double speedup = gpu_ms > 0.0 ? cpu_ms / gpu_ms : 0.0; + float collision_drop = 100.0f * (1.0f - static_cast(coordinated.collisions) + / static_cast(std::max(independent.collisions, 1))); + float risk_drop = 100.0f * (1.0f - coordinated.mean_social_risk + / std::max(independent.mean_social_risk, 1.0e-6f)); + float sep_gain = coordinated.min_separation - independent.min_separation; + std::printf("CPU baseline+priority-aware MPPI: %.3f ms (%d robots x %d rollouts x H=%d x 2 modes; priority %.3f ms)\n", + cpu_ms, N_ROBOTS, ROLLOUTS_PER_ROBOT, HORIZON, cpu_coordinated_ms); + std::printf("GPU priority graph-neural MPPI: %.3f ms (baseline+priority batches, %.1fx vs CPU equivalent rollout eval)\n", + gpu_ms, speedup); + std::printf("Baseline coordinated team: collisions %d, reached %d/%d, deadlocks %d, min sep %.3f, terminal avg %.3f, social risk avg/max %.3f/%.3f\n", + independent.collisions, independent.reached, N_ROBOTS, independent.deadlocks, + independent.min_separation, independent.mean_terminal, + independent.mean_social_risk, independent.max_social_risk); + std::printf("Priority-aware team: collisions %d, reached %d/%d, deadlocks %d, min sep %.3f, terminal avg %.3f, social risk avg/max %.3f/%.3f, collision change %.1f%%, risk change %.1f%%, separation gain %.3f\n", + coordinated.collisions, coordinated.reached, N_ROBOTS, coordinated.deadlocks, + coordinated.min_separation, coordinated.mean_terminal, + coordinated.mean_social_risk, coordinated.max_social_risk, + collision_drop, risk_drop, sep_gain); + + std::system("mkdir -p gif"); + cv::VideoWriter video("gif/gpu_priority_graph_neural_mppi.avi", + cv::VideoWriter::fourcc('M', 'J', 'P', 'G'), + VIDEO_FPS, cv::Size(PANEL_W, PANEL_H)); + if (!video.isOpened()) { + std::fprintf(stderr, "failed to open gif/gpu_priority_graph_neural_mppi.avi\n"); + return 1; + } + for (int k = 2; k <= HORIZON; k += 2) { + video.write(draw_frame(robots, independent_x, independent_y, coordinated_x, coordinated_y, + sample_x, sample_y, independent, coordinated, gpu_ms, cpu_ms, k)); + } + for (int i = 0; i < 12; i++) { + video.write(draw_frame(robots, independent_x, independent_y, coordinated_x, coordinated_y, + sample_x, sample_y, independent, coordinated, gpu_ms, cpu_ms, HORIZON)); + } + video.release(); + + avi_to_gif("gif/gpu_priority_graph_neural_mppi.avi", + "gif/gpu_priority_graph_neural_mppi.gif", 8, 560); + std::printf("GIF saved to gif/gpu_priority_graph_neural_mppi.gif\n"); + + CUDA_CHECK(cudaFree(d_robots)); + CUDA_CHECK(cudaFree(d_independent)); + CUDA_CHECK(cudaFree(d_coordinated)); + CUDA_CHECK(cudaFree(d_sample_x)); + CUDA_CHECK(cudaFree(d_sample_y)); + return 0; +}