SciML · utkarsh530 · Jun 7, 2026 · Apr 2, 2026
diff --git a/Project.toml b/Project.toml
@@ -1,14 +1,17 @@
 name = "ParallelParticleSwarms"
 uuid = "ab63da0c-63b4-40fa-a3b7-d2cba5be6419"
-version = "1.0.0"
 authors = ["Utkarsh <rajpututkarsh530@gmail.com> and contributors"]
+version = "1.0.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+LineSearch = "87fe0de2-c867-4266-b59a-2f0a94fc965b"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+NonlinearSolveBase = "be0214bd-f91f-a760-ac4e-3421ce2b2da0"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
@@ -26,14 +29,16 @@ Enzyme = "0.13"
 ForwardDiff = "0.10, 1.3"
 JET = "0.9, 0.10, 0.11"
 KernelAbstractions = "0.9"
+LineSearch = "0.1"
+NonlinearSolveBase = "2.24"
 Optimization = "4.1, 5.2"
 PrecompileTools = "1"
 QuasiMonteCarlo = "0.3"
 Reexport = "1.2"
 SafeTestsets = "0.1"
 SciMLBase = "2.79"
 Setfield = "1.1"
-SimpleNonlinearSolve = "2.2"
+SimpleNonlinearSolve = "2.12"
 StaticArrays = "1.9"
 julia = "1.10"
 

diff --git a/src/ParallelParticleSwarms.jl b/src/ParallelParticleSwarms.jl
@@ -1,10 +1,11 @@
 module ParallelParticleSwarms
 
 using SciMLBase, StaticArrays, Setfield, KernelAbstractions
-using QuasiMonteCarlo, Optimization, SimpleNonlinearSolve, ForwardDiff
+using QuasiMonteCarlo, Optimization, SimpleNonlinearSolve, ForwardDiff, LineSearch
+using NonlinearSolveBase: ImmutableNonlinearProblem
 import Adapt
 import Adapt: adapt
-import Enzyme: autodiff_deferred, Active, Reverse, Const
+import Enzyme: autodiff, autodiff_deferred, Active, Reverse, Const, Duplicated, make_zero!
 import KernelAbstractions: @atomic, @atomicreplace, @atomicswap
 using QuasiMonteCarlo
 import DiffEqGPU: GPUTsit5, make_prob_compatible, vectorized_solve, vectorized_asolve

diff --git a/src/bfgs.jl b/src/bfgs.jl
@@ -5,20 +5,21 @@ function SciMLBase.__solve(
         abstol = nothing,
         reltol = nothing,
         maxiters = 1000,
+        linesearch = StrongWolfeLineSearch(),
         kwargs...
     )
-    f = Base.Fix2(prob.f.f, prob.p)
-    function _g(θ, _p = nothing)
-        return ForwardDiff.gradient(f, θ)
-    end
+    u0 = as_svector(prob.u0)
+    ∇f = as_svector_grad(instantiate_gradient(prob.f.f, prob.f.adtype))
     t0 = time()
-    nlprob = NonlinearProblem{false}(_g, prob.u0)
+    nlprob = ImmutableNonlinearProblem{false}(∇f, u0, prob.p)
     nlsol = solve(
         nlprob,
-        SimpleLimitedMemoryBroyden(; threshold = opt.threshold, linesearch = Val(true));
+        SimpleLimitedMemoryBroyden(; threshold = opt.threshold, linesearch);
         maxiters,
         abstol,
-        reltol
+        reltol,
+        grad_f = ∇f,
+        kwargs...
     )
     θ = nlsol.u
     t1 = time()
@@ -31,26 +32,30 @@ function SciMLBase.__solve(
     )
 end
 
+# `BFGS` here solves ∇f = 0 via `SimpleBroyden` (secant/quasi-Newton on the gradient).
 function SciMLBase.__solve(
         prob::SciMLBase.OptimizationProblem,
         opt::BFGS,
         args...;
         abstol = nothing,
         reltol = nothing,
         maxiters = 1000,
+        linesearch = StrongWolfeLineSearch(),
         kwargs...
     )
-    f = Base.Fix2(prob.f.f, prob.p)
-    ∇f = instantiate_gradient(f, prob.f.adtype)
+    u0 = as_svector(prob.u0)
+    ∇f = as_svector_grad(instantiate_gradient(prob.f.f, prob.f.adtype))
 
     t0 = time()
-    nlprob = NonlinearProblem{false}(∇f, prob.u0)
+    nlprob = ImmutableNonlinearProblem{false}(∇f, u0, prob.p)
     nlsol = solve(
         nlprob,
-        SimpleBroyden(; linesearch = Val(true));
+        SimpleBroyden(; linesearch);
         maxiters,
         abstol,
-        reltol
+        reltol,
+        grad_f = ∇f,
+        kwargs...
     )
     θ = nlsol.u
     t1 = time()

diff --git a/src/hybrid.jl b/src/hybrid.jl
@@ -1,69 +1,116 @@
-@kernel function simplebfgs_run!(nlprob, x0s, result, opt, maxiters, abstol, reltol)
+using KernelAbstractions
+using SciMLBase
+using Optimization
+using LineSearch
+using SimpleNonlinearSolve
+using NonlinearSolveBase: ImmutableNonlinearProblem
+import SciMLBase: NonlinearFunction
+import NonlinearSolveBase.Utils as NLBUtils
+
+@inline (f::NonlinearFunction{false, G})(u, p) where {G} = f.f(u, p)
+
+@inline NLBUtils.evaluate_f(prob::ImmutableNonlinearProblem, u) =
+    prob.f.f(u, prob.p)
+
+@inline NLBUtils.evaluate_f!!(prob::ImmutableNonlinearProblem, fu, u) =
+    prob.f.f(u, prob.p)
+
+@inline _unwrap_scalar(x::Real) = x
+@inline _unwrap_scalar(x) = x[]
+
+function _hybrid_bounds(prob, ::Val{d}, ::Type{T}) where {d, T}
+    lb = prob.lb === nothing ? nothing : SVector{d, T}(prob.lb)
+    ub = prob.ub === nothing ? nothing : SVector{d, T}(prob.ub)
+    return lb, ub
+end
+
+struct BoundedGrad{G, LB, UB}
+    raw::G
+    lb::LB
+    ub::UB
+end
+
+@inline (bg::BoundedGrad{G, Nothing, Nothing})(θ, p) where {G} = as_svector(bg.raw(θ, p))
+
+@inline function (bg::BoundedGrad)(θ, p)
+    T = eltype(θ)
+    w = bg.ub .- bg.lb
+    in_box = all(isfinite, θ) &&
+        all(θ .>= bg.lb .- T(2) .* w) &&
+        all(θ .<= bg.ub .+ T(2) .* w)
+    g = in_box ? bg.raw(θ, p) : map(_ -> T(1.0e15), θ)
+    return as_svector(g)
+end
+
+@inline function _nlalg(local_opt::LBFGS, linesearch)
+    return SimpleLimitedMemoryBroyden(; threshold = local_opt.threshold, linesearch)
+end
+@inline _nlalg(::BFGS, linesearch) = SimpleBroyden(; linesearch)
+
+@kernel function simplebfgs_run!(
+        grad_f, f_raw, p, x0s, result, result_fx, nlalg, maxiters, abstol, reltol
+    )
     i = @index(Global, Linear)
-    nlcache = remake(nlprob; u0 = x0s[i])
-    sol = solve(nlcache, opt; maxiters, abstol, reltol)
-    @inbounds result[i] = sol.u
+    @inbounds x0 = as_svector(x0s[i])
+    nlprob = ImmutableNonlinearProblem{false}(NonlinearFunction{false}(grad_f), x0, p)
+    sol = SciMLBase.solve(nlprob, nlalg; maxiters, abstol, reltol, grad_f = grad_f)
+    u = as_svector(sol.u)
+    T = eltype(u)
+    v = f_raw(u, p)
+    @inbounds result[i] = u
+    @inbounds result_fx[i] = (isnan(v) | !isfinite(v)) ? T(Inf) : convert(T, v)
 end
 
 function SciMLBase.solve!(
         cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...;
         abstol = nothing,
         reltol = nothing,
-        maxiters = 100, local_maxiters = 10, kwargs...
-    ) where {
-        Backend, LocalOpt <: Union{LBFGS, BFGS},
-    }
-    pso_cache = cache.pso_cache
-
-    sol_pso = solve!(pso_cache)
-    x0s = sol_pso.original
+        maxiters = 100,
+        local_maxiters = 50,
+        linesearch = StrongWolfeLineSearch(),
+        kwargs...
+    ) where {Backend, LocalOpt <: Union{LBFGS, BFGS}}
 
-    backend = opt.backend
+    sol_pso = SciMLBase.solve!(cache.pso_cache; maxiters)
+    best_u = sol_pso.u
+    best_obj = _unwrap_scalar(sol_pso.objective)
 
-    prob = remake(cache.prob, lb = nothing, ub = nothing)
+    prob = cache.prob
+    f_raw = prob.f.f
+    p = prob.p
+    T = eltype(prob.u0)
+    d = length(prob.u0)
+    lb, ub = _hybrid_bounds(prob, Val(d), T)
 
-    result = cache.start_points
-    copyto!(result, x0s)
+    grad_f = as_svector_grad(BoundedGrad(ForwardDiffGradient(f_raw), lb, ub))
 
-    ∇f = instantiate_gradient(prob.f.f, prob.f.adtype)
+    nlalg = _nlalg(opt.local_opt, linesearch)
 
-    kernel = simplebfgs_run!(backend)
-    nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(∇f, prob.u0, prob.p)
-
-    nlalg = opt.local_opt isa LBFGS ?
-        SimpleLimitedMemoryBroyden(;
-            threshold = opt.local_opt.threshold,
-            linesearch = Val(true)
-        ) : SimpleBroyden(; linesearch = Val(true))
+    x0s = sol_pso.original
+    n = length(x0s)
+    result = similar(x0s)
+    result_fx = KernelAbstractions.allocate(opt.backend, T, n)
 
     t0 = time()
-    kernel(
-        nlprob,
-        x0s,
-        result,
-        nlalg,
-        local_maxiters,
-        abstol,
-        reltol;
-        ndrange = length(x0s)
+    simplebfgs_run!(opt.backend)(
+        grad_f, f_raw, p,
+        x0s, result, result_fx,
+        nlalg, local_maxiters, abstol, reltol;
+        ndrange = n,
     )
+    KernelAbstractions.synchronize(opt.backend)
 
-    sol_bfgs = (x -> prob.f(x, prob.p)).(result)
-    sol_bfgs = (x -> isnan(x) ? convert(eltype(prob.u0), Inf) : x).(sol_bfgs)
-
-    minobj, ind = findmin(sol_bfgs)
-    sol_u,
-        sol_obj = minobj > sol_pso.objective ? (sol_pso.u, sol_pso.objective) :
-        (view(result, ind), minobj)
-    t1 = time()
-
-    # @show sol_pso.stats.time
-
-    solve_time = (t1 - t0) + sol_pso.stats.time
+    fx_host = Array(result_fx)
+    minobj, ind = findmin(fx_host)
+    if minobj < best_obj
+        best_obj = minobj
+        best_u = Array(result)[ind]
+    end
 
+    solve_time = (time() - t0) + sol_pso.stats.time
     return SciMLBase.build_solution(
         SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
-        sol_u, sol_obj,
-        stats = Optimization.OptimizationStats(; time = solve_time)
+        best_u, best_obj;
+        stats = Optimization.OptimizationStats(; time = solve_time),
     )
 end
diff --git a/src/utils.jl b/src/utils.jl
@@ -376,12 +376,44 @@ Based on the paper: Particle swarm optimization method for constrained optimizat
     return penalty
 end
 
-#TODO: Possible migration to DifferentiationInterface.jl,
-# however I cannot compile GPU-compatible gradients with Enzyme as Mar 2025
-@inline function instantiate_gradient(f, adtype::AutoForwardDiff)
-    return (θ, p) -> ForwardDiff.gradient(x -> f(x, p), θ)
+@inline function _enzyme_scalar_f(f, θ, p)
+    val = f(θ, p)
+    return val isa Number ? val : first(val)
 end
 
-@inline function instantiate_gradient(f, adtype::AutoEnzyme)
-    return (θ, p) -> autodiff_deferred(Reverse, Const(x -> f(x, p)), Active, Active(θ))[1][1]
+struct ForwardDiffGradient{F}
+    f::F
 end
+@inline function (g::ForwardDiffGradient)(θ, p)
+    return ForwardDiff.gradient(x -> g.f(x, p), θ)
+end
+
+struct EnzymeGradient{F}
+    f::F
+end
+@inline function (g::EnzymeGradient)(θ, p)
+    θd = θ isa SVector ? MVector(θ) : θ
+    res = similar(θd)
+    make_zero!(res)
+    autodiff(
+        Reverse,
+        Const(_enzyme_scalar_f),
+        Active,
+        Const(g.f),
+        Duplicated(θd, res),
+        Const(p),
+    )
+    return θ isa SVector ? SVector(res) : res
+end
+
+@inline instantiate_gradient(f, ::AutoForwardDiff) = ForwardDiffGradient(f)
+@inline instantiate_gradient(f, ::AutoEnzyme) = EnzymeGradient(f)
+
+@inline as_svector(x::SVector) = x
+@inline as_svector(x) = SVector(x)
+
+struct SVectorGradient{G}
+    g::G
+end
+@inline (sg::SVectorGradient)(θ, p) = as_svector(sg.g(θ, p))
+@inline as_svector_grad(g) = SVectorGradient(g)