diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index b7eb25c1a..14c8dcd4d 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -19,6 +19,7 @@ jobs: matrix: group: - Core + - OptimizationBase - OptimizationBBO - OptimizationCMAEvolutionStrategy - OptimizationEvolutionary @@ -66,7 +67,7 @@ jobs: GROUP: ${{ matrix.group }} - uses: julia-actions/julia-processcoverage@v1 with: - directories: src,lib/OptimizationBBO/src,lib/OptimizationCMAEvolutionStrategy/src,lib/OptimizationEvolutionary/src,lib/OptimizationGCMAES/src,lib/OptimizationManopt/src,lib/OptimizationMOI/src,lib/OptimizationMetaheuristics/src,lib/OptimizationMultistartOptimization/src,lib/OptimizationNLopt/src,lib/OptimizationNOMAD/src,lib/OptimizationOptimJL/src,lib/OptimizationOptimisers/src,lib/OptimizationPolyalgorithms/src,lib/OptimizationQuadDIRECT/src,lib/OptimizationSpeedMapping/src + directories: src,lib/OptimizationBase/src,lib/OptimizationBBO/src,lib/OptimizationCMAEvolutionStrategy/src,lib/OptimizationEvolutionary/src,lib/OptimizationGCMAES/src,lib/OptimizationManopt/src,lib/OptimizationMOI/src,lib/OptimizationMetaheuristics/src,lib/OptimizationMultistartOptimization/src,lib/OptimizationNLopt/src,lib/OptimizationNOMAD/src,lib/OptimizationOptimJL/src,lib/OptimizationOptimisers/src,lib/OptimizationPolyalgorithms/src,lib/OptimizationQuadDIRECT/src,lib/OptimizationSpeedMapping/src - uses: codecov/codecov-action@v5 with: file: lcov.info diff --git a/Project.toml b/Project.toml index 67f39e158..c77373cfe 100644 --- a/Project.toml +++ b/Project.toml @@ -6,17 +6,23 @@ version = "4.4.0" ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" ConsoleProgressMonitor = "88cd18e8-d9cc-4ea6-8889-5259c0d15c8b" +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a" LBFGSB = "5be7bae1-8223-5378-bac3-9e7378a2f6e6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36" OptimizationBase = "bca83a33-5cc9-4baa-983d-23429ab6bcbb" +PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProgressLogging = "33c8b6b6-d38a-422a-b730-caa89a2f386c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" +SparseMatrixColorings = "0a514795-09f3-496d-8182-132a7b665d35" TerminalLoggers = "5d786b92-1e48-4d6f-9151-6b4477ca9bed" [compat] @@ -28,8 +34,10 @@ Boltz = "1" ComponentArrays = ">= 0.13.9" ConsoleProgressMonitor = "0.1.1" DiffEqFlux = "2, 3, 4" +DifferentiationInterface = "0.7" DocStringExtensions = "0.9" Enzyme = "0.13" +FastClosures = "0.3" FiniteDiff = "2" Flux = "0.13, 0.14, 0.15, 0.16" ForwardDiff = "0.10, 1" @@ -43,27 +51,29 @@ Lux = "1.12.4" MLUtils = "0.4" ModelingToolkit = "9" Optim = ">= 1.4.1" -OptimizationBase = "2" +Optimisers = ">= 0.2.5" OptimizationMOI = "0.5" OptimizationOptimJL = "0.4" OptimizationOptimisers = "0.3" OrdinaryDiffEqTsit5 = "1" +PDMats = "0.11" Pkg = "1" Printf = "1.10" ProgressLogging = "0.1" -Random = "1.10" +Random = "1.10" Reexport = "1.2" ReverseDiff = "1" SafeTestsets = "0.1" SciMLBase = "2.39.0" SciMLSensitivity = "7" SparseArrays = "1.10" 
+SparseConnectivityTracer = "0.6, 1" SparseDiffTools = "2" +SparseMatrixColorings = "0.4" Symbolics = "6" TerminalLoggers = "0.1" Test = "1.10" Tracker = "0.2" -Optimisers = ">= 0.2.5" Zygote = "0.6, 0.7" julia = "1.10" @@ -101,7 +111,4 @@ Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Aqua", "BenchmarkTools", "Boltz", "ComponentArrays", "DiffEqFlux", "Enzyme", "FiniteDiff", "Flux", "ForwardDiff", - "Ipopt", "IterTools", "Lux", "MLUtils", "ModelingToolkit", "Optim", "OptimizationMOI", "OptimizationOptimJL", "OptimizationOptimisers", - "OrdinaryDiffEqTsit5", "Pkg", "Random", "ReverseDiff", "SafeTestsets", "SciMLSensitivity", "SparseArrays", "SparseDiffTools", - "Symbolics", "Test", "Tracker", "Zygote"] +test = ["Aqua", "BenchmarkTools", "Boltz", "ComponentArrays", "DiffEqFlux", "Enzyme", "FiniteDiff", "Flux", "ForwardDiff", "Ipopt", "IterTools", "Lux", "MLUtils", "ModelingToolkit", "Optim", "OptimizationMOI", "OptimizationOptimJL", "OptimizationOptimisers", "OrdinaryDiffEqTsit5", "Pkg", "Random", "ReverseDiff", "SafeTestsets", "SciMLSensitivity", "SparseArrays", "SparseDiffTools", "Symbolics", "Test", "Tracker", "Zygote"] diff --git a/lib/OptimizationBase/LICENSE b/lib/OptimizationBase/LICENSE new file mode 100644 index 000000000..5056c1c66 --- /dev/null +++ b/lib/OptimizationBase/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Vaibhav Dixit and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
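The dependency changes above add DifferentiationInterface together with SparseConnectivityTracer and SparseMatrixColorings, the sparse-AD stack that the new OptimizationBase code later in this patch builds on. As a rough orientation only (not part of the patch; the dense backend chosen here is an arbitrary assumption for illustration), these packages are typically combined through an ADTypes.AutoSparse wrapper, which OptimizationBase.instantiate_function then hands to DifferentiationInterface's prepare_*/hessian!/jacobian! calls:

# Illustrative sketch, not part of this diff.
using ADTypes, SparseConnectivityTracer, SparseMatrixColorings

# AutoSparse wraps a dense AD backend (AutoForwardDiff is an arbitrary choice here)
# with a sparsity detector and a coloring algorithm; the resulting preparation objects
# expose the coloring_result fields read by the sparse code paths later in this patch.
sparse_ad = AutoSparse(AutoForwardDiff();
    sparsity_detector = TracerSparsityDetector(),
    coloring_algorithm = GreedyColoringAlgorithm())

# sparse_ad can then be passed as the adtype of an OptimizationFunction so that sparse
# Hessians and constraint Jacobians are prepared via DifferentiationInterface.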
diff --git a/lib/OptimizationBase/Project.toml b/lib/OptimizationBase/Project.toml new file mode 100644 index 000000000..5c81f3e6f --- /dev/null +++ b/lib/OptimizationBase/Project.toml @@ -0,0 +1,69 @@ +name = "OptimizationBase" +uuid = "bca83a33-5cc9-4baa-983d-23429ab6bcbb" +authors = ["Vaibhav Dixit and contributors"] +version = "2.10.0" + +[deps] +ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" +ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" +DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150" +Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" +SparseMatrixColorings = "0a514795-09f3-496d-8182-132a7b665d35" + +[weakdeps] +Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" +FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" +MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" +ModelingToolkit = "961ee093-0014-501f-94e3-6117800e7a78" +ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" +SymbolicAnalysis = "4297ee4d-0239-47d8-ba5d-195ecdf594fe" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" + +[extensions] +OptimizationEnzymeExt = "Enzyme" +OptimizationFiniteDiffExt = "FiniteDiff" +OptimizationForwardDiffExt = "ForwardDiff" +OptimizationMLDataDevicesExt = "MLDataDevices" +OptimizationMLUtilsExt = "MLUtils" +OptimizationMTKExt = "ModelingToolkit" +OptimizationReverseDiffExt = "ReverseDiff" +OptimizationSymbolicAnalysisExt = "SymbolicAnalysis" +OptimizationZygoteExt = "Zygote" + +[compat] +ADTypes = "1.9" +ArrayInterface = "7.6" +DifferentiationInterface = "0.7" +DocStringExtensions = "0.9" +Enzyme = "0.13.2" +FastClosures = "0.3" +FiniteDiff = "2.12" +ForwardDiff = "0.10.26, 1" +LinearAlgebra = "1.9, 1.10" +MLDataDevices = "1" +MLUtils = "0.4" +ModelingToolkit = "9, 10" +PDMats = "0.11" +Reexport = "1.2" +ReverseDiff = "1.14" +SciMLBase = "2" +SparseConnectivityTracer = "0.6, 1" +SparseMatrixColorings = "0.4" +SymbolicAnalysis = "0.3" +Zygote = "0.6.67, 0.7" +julia = "1.10" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] diff --git a/lib/OptimizationBase/ext/OptimizationEnzymeExt.jl b/lib/OptimizationBase/ext/OptimizationEnzymeExt.jl new file mode 100644 index 000000000..d53b4495f --- /dev/null +++ b/lib/OptimizationBase/ext/OptimizationEnzymeExt.jl @@ -0,0 +1,731 @@ +module OptimizationEnzymeExt + +import OptimizationBase, OptimizationBase.ArrayInterface +import OptimizationBase.SciMLBase: OptimizationFunction +import OptimizationBase.SciMLBase +import OptimizationBase.LinearAlgebra: I, dot +import OptimizationBase.ADTypes: AutoEnzyme +using Enzyme +using Core: Vararg + +@inline function firstapply(f::F, θ, p) where {F} + res = f(θ, p) + if isa(res, AbstractFloat) + res + else + first(res) + end +end + +function inner_grad(mode::Mode, θ, bθ, f, p) where {Mode} + Enzyme.autodiff(mode, + Const(firstapply), + Active, + Const(f), + Enzyme.Duplicated(θ, bθ), + Const(p) + ) + return nothing +end + +function hv_f2_alloc(mode::Mode, x, f, p) where {Mode} + dx = Enzyme.make_zero(x) + Enzyme.autodiff(mode, + Const(firstapply), + Active, + 
Const(f), + Enzyme.Duplicated(x, dx), + Const(p) + ) + return dx +end + +function inner_cons(x, fcons::Function, p::Union{SciMLBase.NullParameters, Nothing}, + num_cons::Int, i::Int) + res = zeros(eltype(x), num_cons) + fcons(res, x, p) + return res[i] +end + +function cons_f2(mode, x, dx, fcons, p, num_cons, i) + Enzyme.autodiff_deferred( + mode, Const(inner_cons), Active, Enzyme.Duplicated(x, dx), + Const(fcons), Const(p), Const(num_cons), Const(i)) + return nothing +end + +function inner_cons_oop( + x::Vector{T}, fcons::Function, p::Union{SciMLBase.NullParameters, Nothing}, + i::Int) where {T} + return fcons(x, p)[i] +end + +function cons_f2_oop(mode, x, dx, fcons, p, i) + Enzyme.autodiff_deferred( + mode, Const(inner_cons_oop), Active, Enzyme.Duplicated(x, dx), + Const(fcons), Const(p), Const(i)) + return nothing +end + +function lagrangian(x, _f::Function, cons::Function, p, λ, σ = one(eltype(x)))::Float64 + res = zeros(eltype(x), length(λ)) + cons(res, x, p) + return σ * _f(x, p) + dot(λ, res) +end + +function lag_grad(mode, x, dx, lagrangian::Function, _f::Function, cons::Function, p, σ, λ) + Enzyme.autodiff_deferred( + mode, Const(lagrangian), Active, Enzyme.Duplicated(x, dx), + Const(_f), Const(cons), Const(p), Const(λ), Const(σ)) + return nothing +end + +function set_runtime_activity2( + a::Mode1, ::Enzyme.Mode{ABI, Err, RTA}) where {Mode1, ABI, Err, RTA} + Enzyme.set_runtime_activity(a, RTA) +end +function_annotation(::Nothing) = Nothing +function_annotation(::AutoEnzyme{<:Any, A}) where A = A +function OptimizationBase.instantiate_function(f::OptimizationFunction{true}, x, + adtype::AutoEnzyme, p, num_cons = 0; + g = false, h = false, hv = false, fg = false, fgh = false, + cons_j = false, cons_vjp = false, cons_jvp = false, cons_h = false, + lag_h = false) + rmode = if adtype.mode isa Nothing + Enzyme.Reverse + else + set_runtime_activity2(Enzyme.Reverse, adtype.mode) + end + + fmode = if adtype.mode isa Nothing + Enzyme.Forward + else + set_runtime_activity2(Enzyme.Forward, adtype.mode) + end + + func_annot = function_annotation(adtype) + + if g == true && f.grad === nothing + function grad(res, θ, p = p) + Enzyme.make_zero!(res) + Enzyme.autodiff(rmode, + Const(firstapply), + Active, + Const(f.f), + Enzyme.Duplicated(θ, res), + Const(p) + ) + end + elseif g == true + grad = (G, θ, p = p) -> f.grad(G, θ, p) + else + grad = nothing + end + + if fg == true && f.fg === nothing + function fg!(res, θ, p = p) + Enzyme.make_zero!(res) + y = Enzyme.autodiff(WithPrimal(rmode), + Const(firstapply), + Active, + Const(f.f), + Enzyme.Duplicated(θ, res), + Const(p) + )[2] + return y + end + elseif fg == true + fg! = (res, θ, p = p) -> f.fg(res, θ, p) + else + fg! = nothing + end + + if h == true && f.hess === nothing + vdθ = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + bθ = zeros(eltype(x), length(x)) + + if f.hess_prototype === nothing + vdbθ = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + else + #useless right now, looks like there is no way to tell Enzyme the sparsity pattern? 
+ vdbθ = Tuple((copy(r) for r in eachrow(f.hess_prototype))) + end + + function hess(res, θ, p = p) + Enzyme.make_zero!(bθ) + Enzyme.make_zero!.(vdbθ) + + Enzyme.autodiff(fmode, + inner_grad, + Const(rmode), + Enzyme.BatchDuplicated(θ, vdθ), + Enzyme.BatchDuplicatedNoNeed(bθ, vdbθ), + Const(f.f), + Const(p) + ) + + for i in eachindex(θ) + res[i, :] .= vdbθ[i] + end + end + elseif h == true + hess = (H, θ, p = p) -> f.hess(H, θ, p) + else + hess = nothing + end + + if fgh == true && f.fgh === nothing + function fgh!(G, H, θ, p = p) + vdθ = Tuple((Array(r) for r in eachrow(I(length(θ)) * one(eltype(θ))))) + vdbθ = Tuple(zeros(eltype(θ), length(θ)) for i in eachindex(θ)) + + Enzyme.autodiff(fmode, + inner_grad, + Const(rmode), + Enzyme.BatchDuplicated(θ, vdθ), + Enzyme.BatchDuplicatedNoNeed(G, vdbθ), + Const(f.f), + Const(p) + ) + + for i in eachindex(θ) + H[i, :] .= vdbθ[i] + end + end + elseif fgh == true + fgh! = (G, H, θ, p = p) -> f.fgh(G, H, θ, p) + else + fgh! = nothing + end + + if hv == true && f.hv === nothing + function hv!(H, θ, v, p = p) + H .= Enzyme.autodiff( + fmode, hv_f2_alloc, Const(rmode), Duplicated(θ, v), + Const(f.f), Const(p) + )[1] + end + elseif hv == true + hv! = (H, θ, v, p = p) -> f.hv(H, θ, v, p) + else + hv! = nothing + end + + if f.cons === nothing + cons = nothing + else + cons = (res, θ) -> f.cons(res, θ, p) + end + + if cons !== nothing && cons_j == true && f.cons_j === nothing + # if num_cons > length(x) + seeds = Enzyme.onehot(x) + Jaccache = Tuple(zeros(eltype(x), num_cons) for i in 1:length(x)) + basefunc = f.cons + if func_annot <: Enzyme.Const + basefunc = Enzyme.Const(basefunc) + elseif func_annot <: Enzyme.Duplicated || func_annot <: Enzyme.BatchDuplicated + basefunc = Enzyme.BatchDuplicated(basefunc, Tuple(make_zero(basefunc) for i in 1:length(x))) + elseif func_annot <: Enzyme.DuplicatedNoNeed || func_annot <: Enzyme.BatchDuplicatedNoNeed + basefunc = Enzyme.BatchDuplicatedNoNeed(basefunc, Tuple(make_zero(basefunc) for i in 1:length(x))) + end + # else + # seeds = Enzyme.onehot(zeros(eltype(x), num_cons)) + # Jaccache = Tuple(zero(x) for i in 1:num_cons) + # end + + y = zeros(eltype(x), num_cons) + + function cons_j!(J, θ) + for jc in Jaccache + Enzyme.make_zero!(jc) + end + Enzyme.make_zero!(y) + if func_annot <: Enzyme.Duplicated || func_annot <: Enzyme.BatchDuplicated || func_annot <: Enzyme.DuplicatedNoNeed || func_annot <: Enzyme.BatchDuplicatedNoNeed + for bf in basefunc.dval + Enzyme.make_zero!(bf) + end + end + Enzyme.autodiff(fmode, basefunc , BatchDuplicated(y, Jaccache), + BatchDuplicated(θ, seeds), Const(p)) + for i in eachindex(θ) + if J isa Vector + J[i] = Jaccache[i][1] + else + copyto!(@view(J[:, i]), Jaccache[i]) + end + end + # else + # Enzyme.autodiff(Enzyme.Reverse, f.cons, BatchDuplicated(y, seeds), + # BatchDuplicated(θ, Jaccache), Const(p)) + # for i in 1:num_cons + # if J isa Vector + # J .= Jaccache[1] + # else + # J[i, :] = Jaccache[i] + # end + # end + # end + end + elseif cons_j == true && cons !== nothing + cons_j! = (J, θ) -> f.cons_j(J, θ, p) + else + cons_j! = nothing + end + + if cons !== nothing && cons_vjp == true && f.cons_vjp === nothing + cons_res = zeros(eltype(x), num_cons) + function cons_vjp!(res, θ, v) + Enzyme.make_zero!(res) + Enzyme.make_zero!(cons_res) + + Enzyme.autodiff(rmode, + f.cons, + Const, + Duplicated(cons_res, v), + Duplicated(θ, res), + Const(p) + ) + end + elseif cons_vjp == true && cons !== nothing + cons_vjp! = (Jv, θ, σ) -> f.cons_vjp(Jv, θ, σ, p) + else + cons_vjp! 
= nothing + end + + if cons !== nothing && cons_jvp == true && f.cons_jvp === nothing + cons_res = zeros(eltype(x), num_cons) + + function cons_jvp!(res, θ, v) + Enzyme.make_zero!(res) + Enzyme.make_zero!(cons_res) + + Enzyme.autodiff(fmode, + f.cons, + Duplicated(cons_res, res), + Duplicated(θ, v), + Const(p) + ) + end + elseif cons_jvp == true && cons !== nothing + cons_jvp! = (Jv, θ, v) -> f.cons_jvp(Jv, θ, v, p) + else + cons_jvp! = nothing + end + + if cons !== nothing && cons_h == true && f.cons_h === nothing + cons_vdθ = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + cons_bθ = zeros(eltype(x), length(x)) + cons_vdbθ = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + + function cons_h!(res, θ) + for i in 1:num_cons + Enzyme.make_zero!(cons_bθ) + Enzyme.make_zero!.(cons_vdbθ) + Enzyme.autodiff(fmode, + cons_f2, + Const(rmode), + Enzyme.BatchDuplicated(θ, cons_vdθ), + Enzyme.BatchDuplicated(cons_bθ, cons_vdbθ), + Const(f.cons), + Const(p), + Const(num_cons), + Const(i)) + + for j in eachindex(θ) + res[i][j, :] .= cons_vdbθ[j] + end + end + end + elseif cons !== nothing && cons_h == true + cons_h! = (res, θ) -> f.cons_h(res, θ, p) + else + cons_h! = nothing + end + + if lag_h == true && f.lag_h === nothing && cons !== nothing + lag_vdθ = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + lag_bθ = zeros(eltype(x), length(x)) + + if f.hess_prototype === nothing + lag_vdbθ = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + else + #useless right now, looks like there is no way to tell Enzyme the sparsity pattern? + lag_vdbθ = Tuple((copy(r) for r in eachrow(f.hess_prototype))) + end + + function lag_h!(h, θ, σ, μ, p = p) + Enzyme.make_zero!(lag_bθ) + Enzyme.make_zero!.(lag_vdbθ) + + Enzyme.autodiff(fmode, + lag_grad, + Const(rmode), + Enzyme.BatchDuplicated(θ, lag_vdθ), + Enzyme.BatchDuplicatedNoNeed(lag_bθ, lag_vdbθ), + Const(lagrangian), + Const(f.f), + Const(f.cons), + Const(p), + Const(σ), + Const(μ) + ) + k = 0 + + for i in eachindex(θ) + vec_lagv = lag_vdbθ[i] + h[(k + 1):(k + i)] .= @view(vec_lagv[1:i]) + k += i + end + end + + function lag_h!(H::AbstractMatrix, θ, σ, μ, p = p) + Enzyme.make_zero!(H) + Enzyme.make_zero!(lag_bθ) + Enzyme.make_zero!.(lag_vdbθ) + + Enzyme.autodiff(fmode, + lag_grad, + Const(rmode), + Enzyme.BatchDuplicated(θ, lag_vdθ), + Enzyme.BatchDuplicatedNoNeed(lag_bθ, lag_vdbθ), + Const(lagrangian), + Const(f.f), + Const(f.cons), + Const(p), + Const(σ), + Const(μ) + ) + + for i in eachindex(θ) + H[i, :] .= lag_vdbθ[i] + end + end + elseif lag_h == true && cons !== nothing + lag_h! = (θ, σ, μ, p = p) -> f.lag_h(θ, σ, μ, p) + else + lag_h! = nothing + end + + return OptimizationFunction{true}(f.f, adtype; + grad = grad, fg = fg!, fgh = fgh!, + hess = hess, hv = hv!, + cons = cons, cons_j = cons_j!, + cons_jvp = cons_jvp!, cons_vjp = cons_vjp!, + cons_h = cons_h!, + hess_prototype = f.hess_prototype, + cons_jac_prototype = f.cons_jac_prototype, + cons_hess_prototype = f.cons_hess_prototype, + lag_h = lag_h!, + lag_hess_prototype = f.lag_hess_prototype, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) +end + +function OptimizationBase.instantiate_function(f::OptimizationFunction{true}, + cache::OptimizationBase.ReInitCache, + adtype::AutoEnzyme, + num_cons = 0; kwargs...) + p = cache.p + x = cache.u0 + + return OptimizationBase.instantiate_function(f, x, adtype, p, num_cons; kwargs...) 
+end + +function OptimizationBase.instantiate_function(f::OptimizationFunction{false}, x, + adtype::AutoEnzyme, p, num_cons = 0; + g = false, h = false, hv = false, fg = false, fgh = false, + cons_j = false, cons_vjp = false, cons_jvp = false, cons_h = false, + lag_h = false) + rmode = if adtype.mode isa Nothing + Enzyme.Reverse + else + set_runtime_activity2(Enzyme.Reverse, adtype.mode) + end + + fmode = if adtype.mode isa Nothing + Enzyme.Forward + else + set_runtime_activity2(Enzyme.Forward, adtype.mode) + end + + if g == true && f.grad === nothing + res = zeros(eltype(x), size(x)) + function grad(θ, p = p) + Enzyme.make_zero!(res) + Enzyme.autodiff(rmode, + Const(firstapply), + Active, + Const(f.f), + Enzyme.Duplicated(θ, res), + Const(p) + ) + return res + end + elseif fg == true + grad = (θ, p = p) -> f.grad(θ, p) + else + grad = nothing + end + + if fg == true && f.fg === nothing + res_fg = zeros(eltype(x), size(x)) + function fg!(θ, p = p) + Enzyme.make_zero!(res_fg) + y = Enzyme.autodiff(WithPrimal(rmode), + Const(firstapply), + Active, + Const(f.f), + Enzyme.Duplicated(θ, res_fg), + Const(p) + )[2] + return y, res + end + elseif fg == true + fg! = (θ, p = p) -> f.fg(θ, p) + else + fg! = nothing + end + + if h == true && f.hess === nothing + vdθ = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + bθ = zeros(eltype(x), length(x)) + vdbθ = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + + function hess(θ, p = p) + Enzyme.make_zero!(bθ) + Enzyme.make_zero!.(vdbθ) + + Enzyme.autodiff(fmode, + inner_grad, + Const(rmode), + Enzyme.BatchDuplicated(θ, vdθ), + Enzyme.BatchDuplicated(bθ, vdbθ), + Const(f.f), + Const(p) + ) + + return reduce( + vcat, [reshape(vdbθ[i], (1, length(vdbθ[i]))) for i in eachindex(θ)]) + end + elseif h == true + hess = (θ, p = p) -> f.hess(θ, p) + else + hess = nothing + end + + if fgh == true && f.fgh === nothing + vdθ_fgh = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + vdbθ_fgh = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + G_fgh = zeros(eltype(x), length(x)) + H_fgh = zeros(eltype(x), length(x), length(x)) + + function fgh!(θ, p = p) + Enzyme.make_zero!(G_fgh) + Enzyme.make_zero!(H_fgh) + Enzyme.make_zero!.(vdbθ_fgh) + + Enzyme.autodiff(fmode, + inner_grad, + Const(rmode), + Enzyme.BatchDuplicated(θ, vdθ_fgh), + Enzyme.BatchDuplicatedNoNeed(G_fgh, vdbθ_fgh), + Const(f.f), + Const(p) + ) + + for i in eachindex(θ) + H_fgh[i, :] .= vdbθ_fgh[i] + end + return G_fgh, H_fgh + end + elseif fgh == true + fgh! = (θ, p = p) -> f.fgh(θ, p) + else + fgh! = nothing + end + + if hv == true && f.hv === nothing + function hv!(θ, v, p = p) + return Enzyme.autodiff( + fmode, hv_f2_alloc, DuplicatedNoNeed, Const(rmode), Duplicated(θ, v), + Const(_f), Const(f.f), Const(p) + )[1] + end + elseif hv == true + hv! = (θ, v, p = p) -> f.hv(θ, v, p) + else + hv! = f.hv + end + + if f.cons === nothing + cons = nothing + else + function cons(θ) + return f.cons(θ, p) + end + end + + if cons_j == true && cons !== nothing && f.cons_j === nothing + seeds = Enzyme.onehot(x) + Jaccache = Tuple(zeros(eltype(x), num_cons) for i in 1:length(x)) + + function cons_j!(θ) + for i in eachindex(Jaccache) + Enzyme.make_zero!(Jaccache[i]) + end + Jaccache, y = Enzyme.autodiff(WithPrimal(fmode), f.cons, Duplicated, + BatchDuplicated(θ, seeds), Const(p)) + if size(y, 1) == 1 + return reduce(vcat, Jaccache) + else + return reduce(hcat, Jaccache) + end + end + elseif cons_j == true && cons !== nothing + cons_j! 
= (θ) -> f.cons_j(θ, p) + else + cons_j! = nothing + end + + if cons_vjp == true && cons !== nothing && f.cons_vjp == true + res_vjp = zeros(eltype(x), size(x)) + cons_vjp_res = zeros(eltype(x), num_cons) + + function cons_vjp!(θ, v) + Enzyme.make_zero!(res_vjp) + Enzyme.make_zero!(cons_vjp_res) + + Enzyme.autodiff(WithPrimal(rmode), + f.cons, + Const, + Duplicated(cons_vjp_res, v), + Duplicated(θ, res_vjp), + Const(p) + ) + return res_vjp + end + elseif cons_vjp == true && cons !== nothing + cons_vjp! = (θ, v) -> f.cons_vjp(θ, v, p) + else + cons_vjp! = nothing + end + + if cons_jvp == true && cons !== nothing && f.cons_jvp == true + res_jvp = zeros(eltype(x), size(x)) + cons_jvp_res = zeros(eltype(x), num_cons) + + function cons_jvp!(θ, v) + Enzyme.make_zero!(res_jvp) + Enzyme.make_zero!(cons_jvp_res) + + Enzyme.autodiff(fmode, + f.cons, + Duplicated(cons_jvp_res, res_jvp), + Duplicated(θ, v), + Const(p) + ) + return res_jvp + end + elseif cons_jvp == true && cons !== nothing + cons_jvp! = (θ, v) -> f.cons_jvp(θ, v, p) + else + cons_jvp! = nothing + end + + if cons_h == true && cons !== nothing && f.cons_h === nothing + cons_vdθ = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + cons_bθ = zeros(eltype(x), length(x)) + cons_vdbθ = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + + function cons_h!(θ) + return map(1:num_cons) do i + Enzyme.make_zero!(cons_bθ) + Enzyme.make_zero!.(cons_vdbθ) + Enzyme.autodiff(fmode, + cons_f2_oop, + Const(rmode), + Enzyme.BatchDuplicated(θ, cons_vdθ), + Enzyme.BatchDuplicated(cons_bθ, cons_vdbθ), + Const(f.cons), + Const(p), + Const(i)) + + return reduce(hcat, cons_vdbθ) + end + end + elseif cons_h == true && cons !== nothing + cons_h! = (θ) -> f.cons_h(θ, p) + else + cons_h! = nothing + end + + if lag_h == true && f.lag_h === nothing && cons !== nothing + lag_vdθ = Tuple((Array(r) for r in eachrow(I(length(x)) * one(eltype(x))))) + lag_bθ = zeros(eltype(x), length(x)) + if f.hess_prototype === nothing + lag_vdbθ = Tuple(zeros(eltype(x), length(x)) for i in eachindex(x)) + else + lag_vdbθ = Tuple((copy(r) for r in eachrow(f.hess_prototype))) + end + + function lag_h!(θ, σ, μ, p = p) + Enzyme.make_zero!(lag_bθ) + Enzyme.make_zero!.(lag_vdbθ) + + Enzyme.autodiff(fmode, + lag_grad, + Const(rmode), + Enzyme.BatchDuplicated(θ, lag_vdθ), + Enzyme.BatchDuplicatedNoNeed(lag_bθ, lag_vdbθ), + Const(lagrangian), + Const(f.f), + Const(f.cons), + Const(p), + Const(σ), + Const(μ) + ) + + k = 0 + + for i in eachindex(θ) + vec_lagv = lag_vdbθ[i] + res[(k + 1):(k + i), :] .= @view(vec_lagv[1:i]) + k += i + end + return res + end + elseif lag_h == true && cons !== nothing + lag_h! = (θ, σ, μ, p = p) -> f.lag_h(θ, σ, μ, p) + else + lag_h! = nothing + end + + return OptimizationFunction{false}(f.f, adtype; grad = grad, + fg = fg!, fgh = fgh!, + hess = hess, hv = hv!, + cons = cons, cons_j = cons_j!, + cons_jvp = cons_jvp!, cons_vjp = cons_vjp!, + cons_h = cons_h!, + hess_prototype = f.hess_prototype, + cons_jac_prototype = f.cons_jac_prototype, + cons_hess_prototype = f.cons_hess_prototype, + lag_h = lag_h!, + lag_hess_prototype = f.lag_hess_prototype, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) +end + +function OptimizationBase.instantiate_function(f::OptimizationFunction{false}, + cache::OptimizationBase.ReInitCache, + adtype::AutoEnzyme, + num_cons = 0; kwargs...) + p = cache.p + x = cache.u0 + + return OptimizationBase.instantiate_function(f, x, adtype, p, num_cons; kwargs...) 
+end + +end diff --git a/lib/OptimizationBase/ext/OptimizationFiniteDiffExt.jl b/lib/OptimizationBase/ext/OptimizationFiniteDiffExt.jl new file mode 100644 index 000000000..ed95f2a93 --- /dev/null +++ b/lib/OptimizationBase/ext/OptimizationFiniteDiffExt.jl @@ -0,0 +1,5 @@ +module OptimizationFiniteDiffExt + +using DifferentiationInterface, FiniteDiff + +end diff --git a/lib/OptimizationBase/ext/OptimizationForwardDiffExt.jl b/lib/OptimizationBase/ext/OptimizationForwardDiffExt.jl new file mode 100644 index 000000000..0ff3e5ffb --- /dev/null +++ b/lib/OptimizationBase/ext/OptimizationForwardDiffExt.jl @@ -0,0 +1,5 @@ +module OptimizationForwardDiffExt + +using DifferentiationInterface, ForwardDiff + +end diff --git a/lib/OptimizationBase/ext/OptimizationMLDataDevicesExt.jl b/lib/OptimizationBase/ext/OptimizationMLDataDevicesExt.jl new file mode 100644 index 000000000..ae8d5106a --- /dev/null +++ b/lib/OptimizationBase/ext/OptimizationMLDataDevicesExt.jl @@ -0,0 +1,8 @@ +module OptimizationMLDataDevicesExt + +using MLDataDevices +using OptimizationBase + +OptimizationBase.isa_dataiterator(::DeviceIterator) = true + +end diff --git a/lib/OptimizationBase/ext/OptimizationMLUtilsExt.jl b/lib/OptimizationBase/ext/OptimizationMLUtilsExt.jl new file mode 100644 index 000000000..517883129 --- /dev/null +++ b/lib/OptimizationBase/ext/OptimizationMLUtilsExt.jl @@ -0,0 +1,8 @@ +module OptimizationMLUtilsExt + +using MLUtils +using OptimizationBase + +OptimizationBase.isa_dataiterator(::MLUtils.DataLoader) = true + +end diff --git a/lib/OptimizationBase/ext/OptimizationMTKExt.jl b/lib/OptimizationBase/ext/OptimizationMTKExt.jl new file mode 100644 index 000000000..a15ba3171 --- /dev/null +++ b/lib/OptimizationBase/ext/OptimizationMTKExt.jl @@ -0,0 +1,210 @@ +module OptimizationMTKExt + +import OptimizationBase, OptimizationBase.ArrayInterface +import OptimizationBase.SciMLBase +import OptimizationBase.SciMLBase: OptimizationFunction +import OptimizationBase.ADTypes: AutoModelingToolkit, AutoSymbolics, AutoSparse +using ModelingToolkit + +function OptimizationBase.instantiate_function( + f::OptimizationFunction{true}, x, adtype::AutoSparse{<:AutoSymbolics}, p, + num_cons = 0; + g = false, h = false, hv = false, fg = false, fgh = false, + cons_j = false, cons_vjp = false, cons_jvp = false, cons_h = false, + lag_h = false) + p = isnothing(p) ? SciMLBase.NullParameters() : p + + sys = complete(ModelingToolkit.modelingtoolkitize(OptimizationProblem(f, x, p; + lcons = fill(0.0, + num_cons), + ucons = fill(0.0, + num_cons)))) + #sys = ModelingToolkit.structural_simplify(sys) + # don't need to pass `x` or `p` since they're defaults now + f = OptimizationProblem(sys, nothing; grad = g, hess = h, + sparse = true, cons_j = cons_j, cons_h = cons_h, + cons_sparse = true).f + + grad = (G, θ, args...) -> f.grad(G, θ, p, args...) + + hess = (H, θ, args...) -> f.hess(H, θ, p, args...) + + hv = function (H, θ, v, args...) + res = (eltype(θ)).(f.hess_prototype) + hess(res, θ, args...) 
+ H .= res * v + end + + if !isnothing(f.cons) + cons = (res, θ) -> f.cons(res, θ, p) + cons_j = (J, θ) -> f.cons_j(J, θ, p) + cons_h = (res, θ) -> f.cons_h(res, θ, p) + else + cons = nothing + cons_j = nothing + cons_h = nothing + end + + return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, + cons = cons, cons_j = cons_j, cons_h = cons_h, + hess_prototype = f.hess_prototype, + cons_jac_prototype = f.cons_jac_prototype, + cons_hess_prototype = f.cons_hess_prototype, + expr = OptimizationBase.symbolify(f.expr), + cons_expr = OptimizationBase.symbolify.(f.cons_expr), + sys = sys, + observed = f.observed) +end + +function OptimizationBase.instantiate_function( + f::OptimizationFunction{true}, cache::OptimizationBase.ReInitCache, + adtype::AutoSparse{<:AutoSymbolics}, num_cons = 0; + g = false, h = false, hv = false, fg = false, fgh = false, + cons_j = false, cons_vjp = false, cons_jvp = false, cons_h = false, + lag_h = false) + p = isnothing(cache.p) ? SciMLBase.NullParameters() : cache.p + + sys = complete(ModelingToolkit.modelingtoolkitize(OptimizationProblem(f, cache.u0, + cache.p; + lcons = fill(0.0, + num_cons), + ucons = fill(0.0, + num_cons)))) + #sys = ModelingToolkit.structural_simplify(sys) + # don't need to pass `x` or `p` since they're defaults now + f = OptimizationProblem(sys, nothing; grad = g, hess = h, + sparse = true, cons_j = cons_j, cons_h = cons_h, + cons_sparse = true).f + + grad = (G, θ, args...) -> f.grad(G, θ, cache.p, args...) + + hess = (H, θ, args...) -> f.hess(H, θ, cache.p, args...) + + hv = function (H, θ, v, args...) + res = (eltype(θ)).(f.hess_prototype) + hess(res, θ, args...) + H .= res * v + end + + if !isnothing(f.cons) + cons = (res, θ) -> f.cons(res, θ, cache.p) + cons_j = (J, θ) -> f.cons_j(J, θ, cache.p) + cons_h = (res, θ) -> f.cons_h(res, θ, cache.p) + else + cons = nothing + cons_j = nothing + cons_h = nothing + end + + return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, + cons = cons, cons_j = cons_j, cons_h = cons_h, + hess_prototype = f.hess_prototype, + cons_jac_prototype = f.cons_jac_prototype, + cons_hess_prototype = f.cons_hess_prototype, + expr = OptimizationBase.symbolify(f.expr), + cons_expr = OptimizationBase.symbolify.(f.cons_expr), + sys = sys, + observed = f.observed) +end + +function OptimizationBase.instantiate_function( + f::OptimizationFunction{true}, x, adtype::AutoSymbolics, p, + num_cons = 0; g = false, h = false, hv = false, fg = false, fgh = false, + cons_j = false, cons_vjp = false, cons_jvp = false, cons_h = false, + lag_h = false) + p = isnothing(p) ? SciMLBase.NullParameters() : p + + sys = complete(ModelingToolkit.modelingtoolkitize(OptimizationProblem(f, x, p; + lcons = fill(0.0, + num_cons), + ucons = fill(0.0, + num_cons)))) + #sys = ModelingToolkit.structural_simplify(sys) + # don't need to pass `x` or `p` since they're defaults now + f = OptimizationProblem(sys, nothing; grad = g, hess = h, + sparse = false, cons_j = cons_j, cons_h = cons_h, + cons_sparse = false).f + + grad = (G, θ, args...) -> f.grad(G, θ, p, args...) + + hess = (H, θ, args...) -> f.hess(H, θ, p, args...) + + hv = function (H, θ, v, args...) + res = ArrayInterface.zeromatrix(θ) + hess(res, θ, args...) 
+ H .= res * v + end + + if !isnothing(f.cons) + cons = (res, θ) -> f.cons(res, θ, p) + cons_j = (J, θ) -> f.cons_j(J, θ, p) + cons_h = (res, θ) -> f.cons_h(res, θ, p) + else + cons = nothing + cons_j = nothing + cons_h = nothing + end + + return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, + cons = cons, cons_j = cons_j, cons_h = cons_h, + hess_prototype = f.hess_prototype, + cons_jac_prototype = f.cons_jac_prototype, + cons_hess_prototype = f.cons_hess_prototype, + expr = OptimizationBase.symbolify(f.expr), + cons_expr = OptimizationBase.symbolify.(f.cons_expr), + sys = sys, + observed = f.observed) +end + +function OptimizationBase.instantiate_function( + f::OptimizationFunction{true}, cache::OptimizationBase.ReInitCache, + adtype::AutoSymbolics, num_cons = 0; + g = false, h = false, hv = false, fg = false, fgh = false, + cons_j = false, cons_vjp = false, cons_jvp = false, cons_h = false, + lag_h = false) + p = isnothing(cache.p) ? SciMLBase.NullParameters() : cache.p + + sys = complete(ModelingToolkit.modelingtoolkitize(OptimizationProblem(f, cache.u0, + cache.p; + lcons = fill(0.0, + num_cons), + ucons = fill(0.0, + num_cons)))) + #sys = ModelingToolkit.structural_simplify(sys) + # don't need to pass `x` or `p` since they're defaults now + f = OptimizationProblem(sys, nothing; grad = g, hess = h, + sparse = false, cons_j = cons_j, cons_h = cons_h, + cons_sparse = false).f + + grad = (G, θ, args...) -> f.grad(G, θ, cache.p, args...) + + hess = (H, θ, args...) -> f.hess(H, θ, cache.p, args...) + + hv = function (H, θ, v, args...) + res = ArrayInterface.zeromatrix(θ) + hess(res, θ, args...) + H .= res * v + end + + if !isnothing(f.cons) + cons = (res, θ) -> f.cons(res, θ, cache.p) + cons_j = (J, θ) -> f.cons_j(J, θ, cache.p) + cons_h = (res, θ) -> f.cons_h(res, θ, cache.p) + else + cons = nothing + cons_j = nothing + cons_h = nothing + end + + return OptimizationFunction{true}(f.f, adtype; grad = grad, hess = hess, hv = hv, + cons = cons, cons_j = cons_j, cons_h = cons_h, + hess_prototype = f.hess_prototype, + cons_jac_prototype = f.cons_jac_prototype, + cons_hess_prototype = f.cons_hess_prototype, + expr = OptimizationBase.symbolify(f.expr), + cons_expr = OptimizationBase.symbolify.(f.cons_expr), + sys = sys, + observed = f.observed) +end + +end diff --git a/lib/OptimizationBase/ext/OptimizationReverseDiffExt.jl b/lib/OptimizationBase/ext/OptimizationReverseDiffExt.jl new file mode 100644 index 000000000..11e57cf3b --- /dev/null +++ b/lib/OptimizationBase/ext/OptimizationReverseDiffExt.jl @@ -0,0 +1,5 @@ +module OptimizationReverseDiffExt + +using DifferentiationInterface, ReverseDiff + +end diff --git a/lib/OptimizationBase/ext/OptimizationSymbolicAnalysisExt.jl b/lib/OptimizationBase/ext/OptimizationSymbolicAnalysisExt.jl new file mode 100644 index 000000000..ecf690915 --- /dev/null +++ b/lib/OptimizationBase/ext/OptimizationSymbolicAnalysisExt.jl @@ -0,0 +1,118 @@ +module OptimizationSymbolicAnalysisExt + +using OptimizationBase, SciMLBase, SymbolicAnalysis, SymbolicAnalysis.Symbolics, + OptimizationBase.ArrayInterface +using SymbolicAnalysis: AnalysisResult +import SymbolicAnalysis.Symbolics: variable, Equation, Inequality, unwrap, @variables + +function OptimizationBase.symify_cache( + f::OptimizationFunction{iip, AD, F, G, FG, H, FGH, HV, C, CJ, CJV, CVJ, CH, HP, + CJP, CHP, O, EX, CEX, SYS, LH, LHP, HCV, CJCV, CHCV, LHCV}, + prob, num_cons, + manifold) where { + iip, AD, F, G, FG, H, FGH, HV, C, CJ, CJV, CVJ, CH, HP, CJP, CHP, O, + EX <: Nothing, CEX 
<: Nothing, SYS, LH, LHP, HCV, CJCV, CHCV, LHCV} + obj_expr = f.expr + cons_expr = f.cons_expr === nothing ? nothing : getfield.(f.cons_expr, Ref(:lhs)) + + if obj_expr === nothing || cons_expr === nothing + try + vars = if prob.u0 isa Matrix + @variables X[1:size(prob.u0, 1), 1:size(prob.u0, 2)] + else + ArrayInterface.restructure( + prob.u0, [variable(:x, i) for i in eachindex(prob.u0)]) + end + params = if prob.p isa SciMLBase.NullParameters + [] + elseif prob.p isa MTK.MTKParameters + [variable(:α, i) for i in eachindex(vcat(p...))] + else + ArrayInterface.restructure(p, [variable(:α, i) for i in eachindex(p)]) + end + + if prob.u0 isa Matrix + vars = vars[1] + end + + if obj_expr === nothing + obj_expr = f.f(vars, params) + end + + if cons_expr === nothing && SciMLBase.isinplace(prob) && !isnothing(prob.f.cons) + lhs = Array{Symbolics.Num}(undef, num_cons) + f.cons(lhs, vars) + cons = Union{Equation, Inequality}[] + + if !isnothing(prob.lcons) + for i in 1:num_cons + if !isinf(prob.lcons[i]) + if prob.lcons[i] != prob.ucons[i] + push!(cons, prob.lcons[i] ≲ lhs[i]) + else + push!(cons, lhs[i] ~ prob.ucons[i]) + end + end + end + end + + if !isnothing(prob.ucons) + for i in 1:num_cons + if !isinf(prob.ucons[i]) && prob.lcons[i] != prob.ucons[i] + push!(cons, lhs[i] ≲ prob.ucons[i]) + end + end + end + if (isnothing(prob.lcons) || all(isinf, prob.lcons)) && + (isnothing(prob.ucons) || all(isinf, prob.ucons)) + throw(ArgumentError("Constraints passed have no proper bounds defined. + Ensure you pass equal bounds (the scalar that the constraint should evaluate to) for equality constraints + or pass the lower and upper bounds for inequality constraints.")) + end + cons_expr = lhs + elseif cons_expr === nothing && !isnothing(prob.f.cons) + cons_expr = f.cons(vars, params) + end + catch err + throw(ArgumentError("Automatic symbolic expression generation with failed with error: $err. 
+ Try by setting `structural_analysis = false` instead if the solver doesn't require symbolic expressions.")) + end + end + + if obj_expr !== nothing + obj_expr = obj_expr |> Symbolics.unwrap + if manifold === nothing + obj_res = analyze(obj_expr) + else + obj_res = analyze(obj_expr, manifold) + end + @info "Objective Euclidean curvature: $(obj_res.curvature)" + if obj_res.gcurvature !== nothing + @info "Objective Geodesic curvature: $(obj_res.gcurvature)" + end + else + obj_res = nothing + end + + if cons_expr !== nothing + cons_expr = cons_expr .|> Symbolics.unwrap + if manifold === nothing + cons_res = analyze.(cons_expr) + else + cons_res = analyze.(cons_expr, Ref(manifold)) + end + for i in 1:num_cons + @info "Constraints Euclidean curvature: $(cons_res[i].curvature)" + + if cons_res[i].gcurvature !== nothing + @info "Constraints Geodesic curvature: $(cons_res[i].gcurvature)" + end + end + else + cons_res = nothing + end + + return obj_res, cons_res +end + +end diff --git a/lib/OptimizationBase/ext/OptimizationZygoteExt.jl b/lib/OptimizationBase/ext/OptimizationZygoteExt.jl new file mode 100644 index 000000000..7574af77d --- /dev/null +++ b/lib/OptimizationBase/ext/OptimizationZygoteExt.jl @@ -0,0 +1,563 @@ +module OptimizationZygoteExt + +using OptimizationBase, SparseArrays +using OptimizationBase.FastClosures +import OptimizationBase.ArrayInterface +import OptimizationBase.SciMLBase: OptimizationFunction +import OptimizationBase.LinearAlgebra: I, dot +import DifferentiationInterface +import DifferentiationInterface: prepare_gradient, prepare_hessian, prepare_hvp, + prepare_pullback, prepare_pushforward, pullback!, + pushforward!, + pullback, pushforward, + prepare_jacobian, value_and_gradient!, value_and_gradient, + value_derivative_and_second_derivative!, + value_derivative_and_second_derivative, + gradient!, hessian!, hvp!, jacobian!, gradient, hessian, + hvp, jacobian, Constant +using ADTypes, SciMLBase +import Zygote, Zygote.ForwardDiff + +function OptimizationBase.instantiate_function( + f::OptimizationFunction{true}, x, + adtype::Union{ADTypes.AutoZygote, + DifferentiationInterface.SecondOrder{ + <:ADTypes.AbstractADType, <:ADTypes.AutoZygote}}, + p = SciMLBase.NullParameters(), num_cons = 0; + g = false, h = false, hv = false, fg = false, fgh = false, + cons_j = false, cons_vjp = false, cons_jvp = false, cons_h = false, + lag_h = false) + adtype, soadtype = OptimizationBase.generate_adtype(adtype) + + if g == true && f.grad === nothing + prep_grad = prepare_gradient(f.f, adtype, x, Constant(p), strict=Val(false)) + function grad(res, θ) + gradient!(f.f, res, prep_grad, adtype, θ, Constant(p)) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function grad(res, θ, p) + gradient!(f.f, res, prep_grad, adtype, θ, Constant(p)) + end + end + elseif g == true + grad = (G, θ, p = p) -> f.grad(G, θ, p) + else + grad = nothing + end + + if fg == true && f.fg === nothing + if g == false + prep_grad = prepare_gradient(f.f, adtype, x, Constant(p), strict=Val(false)) + end + function fg!(res, θ) + (y, _) = value_and_gradient!(f.f, res, prep_grad, adtype, θ, Constant(p)) + return y + end + if p !== SciMLBase.NullParameters() && p !== nothing + function fg!(res, θ, p) + (y, _) = value_and_gradient!(f.f, res, prep_grad, adtype, θ, Constant(p)) + return y + end + end + elseif fg == true + fg! = (G, θ, p = p) -> f.fg(G, θ, p) + else + fg! 
= nothing + end + + hess_sparsity = f.hess_prototype + hess_colors = f.hess_colorvec + if h == true && f.hess === nothing + prep_hess = prepare_hessian(f.f, soadtype, x, Constant(p), strict=Val(false)) + function hess(res, θ) + hessian!(f.f, res, prep_hess, soadtype, θ, Constant(p)) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function hess(res, θ, p) + hessian!(f.f, res, prep_hess, soadtype, θ, Constant(p)) + end + end + elseif h == true + hess = (H, θ, p = p) -> f.hess(H, θ, p) + else + hess = nothing + end + + if fgh == true && f.fgh === nothing + function fgh!(G, H, θ) + (y, _, _) = value_derivative_and_second_derivative!( + f.f, G, H, prep_hess, soadtype, θ, Constant(p)) + return y + end + if p !== SciMLBase.NullParameters() && p !== nothing + function fgh!(G, H, θ, p) + (y, _, _) = value_derivative_and_second_derivative!( + f.f, G, H, prep_hess, soadtype, θ, Constant(p)) + return y + end + end + elseif fgh == true + fgh! = (G, H, θ, p = p) -> f.fgh(G, H, θ, p) + else + fgh! = nothing + end + + if hv == true && f.hv === nothing + prep_hvp = prepare_hvp(f.f, soadtype, x, (zeros(eltype(x), size(x)),), Constant(p)) + function hv!(H, θ, v) + hvp!(f.f, (H,), prep_hvp, soadtype, θ, (v,), Constant(p)) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function hv!(H, θ, v, p) + hvp!(f.f, (H,), prep_hvp, soadtype, θ, (v,), Constant(p)) + end + end + elseif hv == true + hv! = (H, θ, v, p = p) -> f.hv(H, θ, v, p) + else + hv! = nothing + end + + if f.cons === nothing + cons = nothing + else + cons = (res, θ) -> f.cons(res, θ, p) + + function cons_oop(x) + _res = Zygote.Buffer(x, num_cons) + f.cons(_res, x, p) + return copy(_res) + end + + function cons_oop(x, i) + _res = Zygote.Buffer(x, num_cons) + f.cons(_res, x, p) + return _res[i] + end + + function lagrangian(θ, σ, λ, p) + return σ * f.f(θ, p) + dot(λ, cons_oop(θ)) + end + end + + cons_jac_prototype = f.cons_jac_prototype + cons_jac_colorvec = f.cons_jac_colorvec + if cons !== nothing && cons_j == true && f.cons_j === nothing + prep_jac = prepare_jacobian(cons_oop, adtype, x, strict=Val(false)) + function cons_j!(J, θ) + jacobian!(cons_oop, J, prep_jac, adtype, θ) + if size(J, 1) == 1 + J = vec(J) + end + end + elseif cons !== nothing && cons_j == true + cons_j! = (J, θ) -> f.cons_j(J, θ, p) + else + cons_j! = nothing + end + + if f.cons_vjp === nothing && cons_vjp == true && cons !== nothing + prep_pullback = prepare_pullback(cons_oop, adtype, x, (ones(eltype(x), num_cons),), strict=Val(false)) + function cons_vjp!(J, θ, v) + pullback!(cons_oop, (J,), prep_pullback, adtype, θ, (v,)) + end + elseif cons_vjp == true + cons_vjp! = (J, θ, v) -> f.cons_vjp(J, θ, v, p) + else + cons_vjp! = nothing + end + + if cons !== nothing && f.cons_jvp === nothing && cons_jvp == true + prep_pushforward = prepare_pushforward( + cons_oop, adtype, x, (ones(eltype(x), length(x)),), strict=Val(false)) + function cons_jvp!(J, θ, v) + pushforward!(cons_oop, (J,), prep_pushforward, adtype, θ, (v,)) + end + elseif cons_jvp == true + cons_jvp! = (J, θ, v) -> f.cons_jvp(J, θ, v, p) + else + cons_jvp! 
= nothing + end + + conshess_sparsity = f.cons_hess_prototype + conshess_colors = f.cons_hess_colorvec + if cons !== nothing && cons_h == true && f.cons_h === nothing + prep_cons_hess = [prepare_hessian(cons_oop, soadtype, x, Constant(i), strict=Val(false)) + for i in 1:num_cons] + + function cons_h!(H, θ) + for i in 1:num_cons + hessian!(cons_oop, H[i], prep_cons_hess[i], soadtype, θ, Constant(i)) + end + end + elseif cons !== nothing && cons_h == true + cons_h! = (res, θ) -> f.cons_h(res, θ, p) + else + cons_h! = nothing + end + + lag_hess_prototype = f.lag_hess_prototype + + if f.lag_h === nothing && cons !== nothing && lag_h == true + lag_extras = prepare_hessian( + lagrangian, soadtype, x, Constant(one(eltype(x))), + Constant(ones(eltype(x), num_cons)), Constant(p), strict=Val(false)) + lag_hess_prototype = zeros(Bool, num_cons, length(x)) + + function lag_h!(H::AbstractMatrix, θ, σ, λ) + if σ == zero(eltype(θ)) + cons_h!(H, θ) + H *= λ + else + hessian!(lagrangian, H, lag_extras, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + end + end + + function lag_h!(h::AbstractVector, θ, σ, λ) + H = hessian( + lagrangian, lag_extras, soadtype, θ, Constant(σ), Constant(λ), Constant(p)) + k = 0 + for i in 1:length(θ) + for j in 1:i + k += 1 + h[k] = H[i, j] + end + end + end + + if p !== SciMLBase.NullParameters() && p !== nothing + function lag_h!(H::AbstractMatrix, θ, σ, λ, p) + if σ == zero(eltype(θ)) + cons_h(H, θ) + H *= λ + else + hessian!(lagrangian, H, lag_extras, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + end + end + + function lag_h!(h::AbstractVector, θ, σ, λ, p) + H = hessian(lagrangian, lag_extras, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + k = 0 + for i in 1:length(θ) + for j in 1:i + k += 1 + h[k] = H[i, j] + end + end + end + end + elseif cons !== nothing && lag_h == true + lag_h! = (res, θ, σ, μ, p = p) -> f.lag_h(res, θ, σ, μ, p) + else + lag_h! = nothing + end + + return OptimizationFunction{true}(f.f, adtype; + grad = grad, fg = fg!, hess = hess, hv = hv!, fgh = fgh!, + cons = cons, cons_j = cons_j!, cons_h = cons_h!, + cons_vjp = cons_vjp!, cons_jvp = cons_jvp!, + hess_prototype = hess_sparsity, + hess_colorvec = hess_colors, + cons_jac_prototype = cons_jac_prototype, + cons_jac_colorvec = cons_jac_colorvec, + cons_hess_prototype = conshess_sparsity, + cons_hess_colorvec = conshess_colors, + lag_h = lag_h!, + lag_hess_prototype = lag_hess_prototype, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) +end + +function OptimizationBase.instantiate_function( + f::OptimizationFunction{true}, cache::OptimizationBase.ReInitCache, + adtype::ADTypes.AutoZygote, num_cons = 0; kwargs...) + x = cache.u0 + p = cache.p + + return OptimizationBase.instantiate_function( + f, x, adtype, p, num_cons; kwargs...) 
+end + +function OptimizationBase.instantiate_function( + f::OptimizationFunction{true}, x, + adtype::ADTypes.AutoSparse{<:Union{ADTypes.AutoZygote, + DifferentiationInterface.SecondOrder{ + <:ADTypes.AbstractADType, <:ADTypes.AutoZygote}}}, + p = SciMLBase.NullParameters(), num_cons = 0; + g = false, h = false, hv = false, fg = false, fgh = false, + cons_j = false, cons_vjp = false, cons_jvp = false, cons_h = false, + lag_h = false) + adtype, soadtype = OptimizationBase.generate_sparse_adtype(adtype) + + if g == true && f.grad === nothing + extras_grad = prepare_gradient(f.f, adtype.dense_ad, x, Constant(p), strict=Val(false)) + function grad(res, θ) + gradient!(f.f, res, extras_grad, adtype.dense_ad, θ, Constant(p)) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function grad(res, θ, p) + gradient!(f.f, res, extras_grad, adtype.dense_ad, θ, Constant(p)) + end + end + elseif g == true + grad = (G, θ, p = p) -> f.grad(G, θ, p) + else + grad = nothing + end + + if fg == true && f.fg === nothing + if g == false + extras_grad = prepare_gradient(f.f, adtype.dense_ad, x, Constant(p), strict=Val(false)) + end + function fg!(res, θ) + (y, _) = value_and_gradient!( + f.f, res, extras_grad, adtype.dense_ad, θ, Constant(p)) + return y + end + if p !== SciMLBase.NullParameters() && p !== nothing + function fg!(res, θ, p) + (y, _) = value_and_gradient!( + f.f, res, extras_grad, adtype.dense_ad, θ, Constant(p)) + return y + end + end + elseif fg == true + fg! = (G, θ, p = p) -> f.fg(G, θ, p) + else + fg! = nothing + end + + hess_sparsity = f.hess_prototype + hess_colors = f.hess_colorvec + if h == true && f.hess === nothing + prep_hess = prepare_hessian(f.f, soadtype, x, Constant(p), strict=Val(false)) + function hess(res, θ) + hessian!(f.f, res, prep_hess, soadtype, θ, Constant(p)) + end + hess_sparsity = prep_hess.coloring_result.A + hess_colors = prep_hess.coloring_result.color + + if p !== SciMLBase.NullParameters() && p !== nothing + function hess(res, θ, p) + hessian!(f.f, res, prep_hess, soadtype, θ, Constant(p)) + end + end + elseif h == true + hess = (H, θ, p = p) -> f.hess(H, θ, p) + else + hess = nothing + end + + if fgh == true && f.fgh === nothing + function fgh!(G, H, θ) + (y, _, _) = value_derivative_and_second_derivative!( + f.f, G, H, θ, prep_hess, soadtype, Constant(p)) + return y + end + + if p !== SciMLBase.NullParameters() && p !== nothing + function fgh!(G, H, θ, p) + (y, _, _) = value_derivative_and_second_derivative!( + f.f, G, H, θ, prep_hess, soadtype, Constant(p)) + return y + end + end + elseif fgh == true + fgh! = (G, H, θ, p = p) -> f.fgh(G, H, θ, p) + else + fgh! = nothing + end + + if hv == true && f.hv === nothing + prep_hvp = prepare_hvp( + f.f, soadtype.dense_ad, x, (zeros(eltype(x), size(x)),), Constant(p)) + function hv!(H, θ, v) + hvp!(f.f, (H,), prep_hvp, soadtype.dense_ad, θ, (v,), Constant(p)) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function hv!(H, θ, v, p) + hvp!(f.f, (H,), prep_hvp, soadtype.dense_ad, θ, (v,), Constant(p)) + end + end + elseif hv == true + hv! = (H, θ, v, p = p) -> f.hv(H, θ, v, p) + else + hv! 
= nothing + end + + if f.cons === nothing + cons = nothing + else + cons = (res, θ) -> f.cons(res, θ, p) + + function cons_oop(x) + _res = Zygote.Buffer(x, num_cons) + f.cons(_res, x, p) + return copy(_res) + end + + function cons_oop(x, i) + _res = Zygote.Buffer(x, num_cons) + f.cons(_res, x, p) + return _res[i] + end + + function lagrangian(θ, σ, λ, p) + return σ * f.f(θ, p) + dot(λ, cons_oop(θ)) + end + end + + cons_jac_prototype = f.cons_jac_prototype + cons_jac_colorvec = f.cons_jac_colorvec + if cons !== nothing && cons_j == true && f.cons_j === nothing + prep_jac = prepare_jacobian(cons_oop, adtype, x) + function cons_j!(J, θ) + jacobian!(cons_oop, J, prep_jac, adtype, θ) + if size(J, 1) == 1 + J = vec(J) + end + end + cons_jac_prototype = prep_jac.coloring_result.A + cons_jac_colorvec = prep_jac.coloring_result.color + elseif cons !== nothing && cons_j == true + cons_j! = (J, θ) -> f.cons_j(J, θ, p) + else + cons_j! = nothing + end + + if f.cons_vjp === nothing && cons_vjp == true && cons !== nothing + extras_pullback = prepare_pullback( + cons_oop, adtype.dense_ad, x, (ones(eltype(x), num_cons),)) + function cons_vjp!(J, θ, v) + pullback!( + cons_oop, (J,), extras_pullback, adtype.dense_ad, θ, (v,)) + end + elseif cons_vjp == true + cons_vjp! = (J, θ, v) -> f.cons_vjp(J, θ, v, p) + else + cons_vjp! = nothing + end + + if f.cons_jvp === nothing && cons_jvp == true && cons !== nothing + extras_pushforward = prepare_pushforward( + cons_oop, adtype.dense_ad, x, (ones(eltype(x), length(x)),)) + function cons_jvp!(J, θ, v) + pushforward!( + cons_oop, (J,), extras_pushforward, adtype.dense_ad, θ, (v,)) + end + elseif cons_jvp == true + cons_jvp! = (J, θ, v) -> f.cons_jvp(J, θ, v, p) + else + cons_jvp! = nothing + end + + conshess_sparsity = f.cons_hess_prototype + conshess_colors = f.cons_hess_colorvec + if cons !== nothing && f.cons_h === nothing && cons_h == true + prep_cons_hess = [prepare_hessian(cons_oop, soadtype, x, Constant(i), strict=Val(false)) + for i in 1:num_cons] + colores = getfield.(prep_cons_hess, :coloring_result) + conshess_sparsity = getfield.(colores, :A) + conshess_colors = getfield.(colores, :color) + function cons_h!(H, θ) + for i in 1:num_cons + hessian!(cons_oop, H[i], prep_cons_hess[i], soadtype, θ, Constant(i)) + end + end + elseif cons_h == true + cons_h! = (res, θ) -> f.cons_h(res, θ, p) + else + cons_h! 
= nothing + end + + lag_hess_prototype = f.lag_hess_prototype + lag_hess_colors = f.lag_hess_colorvec + if cons !== nothing && f.lag_h === nothing && lag_h == true + lag_extras = prepare_hessian( + lagrangian, soadtype, x, Constant(one(eltype(x))), + Constant(ones(eltype(x), num_cons)), Constant(p), strict=Val(false)) + lag_hess_prototype = lag_extras.coloring_result.A + lag_hess_colors = lag_extras.coloring_result.color + + function lag_h!(H::AbstractMatrix, θ, σ, λ) + if σ == zero(eltype(θ)) + cons_h!(H, θ) + H *= λ + else + hessian!(lagrangian, H, lag_extras, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + end + end + + function lag_h!(h, θ, σ, λ) + H = hessian( + lagrangian, lag_extras, soadtype, θ, Constant(σ), Constant(λ), Constant(p)) + k = 0 + rows, cols, _ = findnz(H) + for (i, j) in zip(rows, cols) + if i <= j + k += 1 + h[k] = H[i, j] + end + end + end + + if p !== SciMLBase.NullParameters() && p !== nothing + function lag_h!(H::AbstractMatrix, θ, σ, λ, p) + if σ == zero(eltype(θ)) + cons_h!(H, θ) + H *= λ + else + hessian!(lagrangian, H, lag_extras, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + end + end + + function lag_h!(h::AbstractVector, θ, σ, λ, p) + H = hessian(lagrangian, lag_extras, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + k = 0 + for i in 1:length(θ) + for j in 1:i + k += 1 + h[k] = H[i, j] + end + end + end + end + elseif cons !== nothing && cons_h == true + lag_h! = (res, θ, σ, μ, p = p) -> f.lag_h(res, θ, σ, μ, p) + else + lag_h! = nothing + end + return OptimizationFunction{true}(f.f, adtype; + grad = grad, fg = fg!, hess = hess, hv = hv!, fgh = fgh!, + cons = cons, cons_j = cons_j!, cons_h = cons_h!, + hess_prototype = hess_sparsity, + hess_colorvec = hess_colors, + cons_jac_prototype = cons_jac_prototype, + cons_jac_colorvec = cons_jac_colorvec, + cons_hess_prototype = conshess_sparsity, + cons_hess_colorvec = conshess_colors, + lag_h = lag_h!, + lag_hess_prototype = lag_hess_prototype, + lag_hess_colorvec = lag_hess_colors, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) +end + +function OptimizationBase.instantiate_function( + f::OptimizationFunction{true}, cache::OptimizationBase.ReInitCache, + adtype::ADTypes.AutoSparse{<:AutoZygote}, num_cons = 0; kwargs...) + x = cache.u0 + p = cache.p + + return OptimizationBase.instantiate_function(f, x, adtype, p, num_cons; kwargs...) +end + +end diff --git a/lib/OptimizationBase/src/OptimizationBase.jl b/lib/OptimizationBase/src/OptimizationBase.jl new file mode 100644 index 000000000..c849a5dde --- /dev/null +++ b/lib/OptimizationBase/src/OptimizationBase.jl @@ -0,0 +1,33 @@ +module OptimizationBase + +using DocStringExtensions +using Reexport +@reexport using SciMLBase, ADTypes + +using ArrayInterface, Base.Iterators, SparseArrays, LinearAlgebra +import SciMLBase: OptimizationProblem, + OptimizationFunction, ObjSense, + MaxSense, MinSense, OptimizationStats +export ObjSense, MaxSense, MinSense + +using FastClosures + +struct NullCallback end +(x::NullCallback)(args...) 
= false +const DEFAULT_CALLBACK = NullCallback() + +struct NullData end +const DEFAULT_DATA = Iterators.cycle((NullData(),)) +Base.iterate(::NullData, i = 1) = nothing +Base.length(::NullData) = 0 + +include("adtypes.jl") +include("symify.jl") +include("cache.jl") +include("OptimizationDIExt.jl") +include("OptimizationDISparseExt.jl") +include("function.jl") + +export solve, OptimizationCache, DEFAULT_CALLBACK, DEFAULT_DATA + +end diff --git a/lib/OptimizationBase/src/OptimizationDIExt.jl b/lib/OptimizationBase/src/OptimizationDIExt.jl new file mode 100644 index 000000000..f32e6cf33 --- /dev/null +++ b/lib/OptimizationBase/src/OptimizationDIExt.jl @@ -0,0 +1,502 @@ +import ArrayInterface +import SciMLBase: OptimizationFunction +import LinearAlgebra: I +import DifferentiationInterface +import DifferentiationInterface: prepare_gradient, prepare_hessian, prepare_hvp, + prepare_pullback, prepare_pushforward, pullback!, + pushforward!, + pullback, pushforward, + prepare_jacobian, value_and_gradient!, value_and_gradient, + value_derivative_and_second_derivative!, + value_derivative_and_second_derivative, + gradient!, hessian!, hvp!, jacobian!, gradient, hessian, + hvp, jacobian, Constant +using ADTypes, SciMLBase + +function instantiate_function( + f::OptimizationFunction{true}, x, adtype::ADTypes.AbstractADType, + p = SciMLBase.NullParameters(), num_cons = 0; + g = false, h = false, hv = false, fg = false, fgh = false, + cons_j = false, cons_vjp = false, cons_jvp = false, cons_h = false, + lag_h = false) + adtype, soadtype = generate_adtype(adtype) + + if g == true && f.grad === nothing + prep_grad = prepare_gradient(f.f, adtype, x, Constant(p)) + function grad(res, θ) + gradient!(f.f, res, prep_grad, adtype, θ, Constant(p)) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function grad(res, θ, p) + gradient!(f.f, res, prep_grad, adtype, θ, Constant(p)) + end + end + elseif g == true + grad = (G, θ, p = p) -> f.grad(G, θ, p) + else + grad = nothing + end + + if fg == true && f.fg === nothing + if g == false + prep_grad = prepare_gradient(f.f, adtype, x, Constant(p)) + end + function fg!(res, θ) + (y, _) = value_and_gradient!(f.f, res, prep_grad, adtype, θ, Constant(p)) + return y + end + if p !== SciMLBase.NullParameters() && p !== nothing + function fg!(res, θ, p) + (y, _) = value_and_gradient!(f.f, res, prep_grad, adtype, θ, Constant(p)) + return y + end + end + elseif fg == true + fg! = (G, θ, p = p) -> f.fg(G, θ, p) + else + fg! = nothing + end + + hess_sparsity = f.hess_prototype + hess_colors = f.hess_colorvec + if h == true && f.hess === nothing + prep_hess = prepare_hessian(f.f, soadtype, x, Constant(p)) + function hess(res, θ) + hessian!(f.f, res, prep_hess, soadtype, θ, Constant(p)) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function hess(res, θ, p) + hessian!(f.f, res, prep_hess, soadtype, θ, Constant(p)) + end + end + elseif h == true + hess = (H, θ, p = p) -> f.hess(H, θ, p) + else + hess = nothing + end + + if fgh == true && f.fgh === nothing + function fgh!(G, H, θ) + (y, _, _) = value_derivative_and_second_derivative!( + f.f, G, H, prep_hess, soadtype, θ, Constant(p)) + return y + end + if p !== SciMLBase.NullParameters() && p !== nothing + function fgh!(G, H, θ, p) + (y, _, _) = value_derivative_and_second_derivative!( + f.f, G, H, prep_hess, soadtype, θ, Constant(p)) + return y + end + end + elseif fgh == true + fgh! = (G, H, θ, p = p) -> f.fgh(G, H, θ, p) + else + fgh! 
= nothing + end + + if hv == true && f.hv === nothing + prep_hvp = prepare_hvp(f.f, soadtype, x, (zeros(eltype(x), size(x)),), Constant(p)) + function hv!(H, θ, v) + only(hvp!(f.f, (H,), prep_hvp, soadtype, θ, (v,), Constant(p))) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function hv!(H, θ, v, p) + only(hvp!(f.f, (H,), soadtype, θ, (v,), Constant(p))) + end + end + elseif hv == true + hv! = (H, θ, v, p = p) -> f.hv(H, θ, v, p) + else + hv! = nothing + end + + if f.cons === nothing + cons = nothing + else + cons = (res, x) -> f.cons(res, x, p) + function cons_oop(x) + _res = zeros(eltype(x), num_cons) + f.cons(_res, x, p) + return _res + end + + function cons_oop(x, i) + _res = zeros(eltype(x), num_cons) + f.cons(_res, x, p) + return _res[i] + end + + function lagrangian(θ, σ, λ, p) + return σ * f.f(θ, p) + dot(λ, cons_oop(θ)) + end + end + + cons_jac_prototype = f.cons_jac_prototype + cons_jac_colorvec = f.cons_jac_colorvec + if f.cons !== nothing && cons_j == true && f.cons_j === nothing + prep_jac = prepare_jacobian(cons_oop, adtype, x) + function cons_j!(J, θ) + jacobian!(cons_oop, J, prep_jac, adtype, θ) + if size(J, 1) == 1 + J = vec(J) + end + end + elseif cons_j == true && f.cons !== nothing + cons_j! = (J, θ) -> f.cons_j(J, θ, p) + else + cons_j! = nothing + end + + if f.cons_vjp === nothing && cons_vjp == true && f.cons !== nothing + prep_pullback = prepare_pullback(cons_oop, adtype, x, (ones(eltype(x), num_cons),)) + function cons_vjp!(J, θ, v) + only(pullback!(cons_oop, (J,), prep_pullback, adtype, θ, (v,))) + end + elseif cons_vjp == true && f.cons !== nothing + cons_vjp! = (J, θ, v) -> f.cons_vjp(J, θ, v, p) + else + cons_vjp! = nothing + end + + if f.cons_jvp === nothing && cons_jvp == true && f.cons !== nothing + prep_pushforward = prepare_pushforward( + cons_oop, adtype, x, (ones(eltype(x), length(x)),)) + function cons_jvp!(J, θ, v) + only(pushforward!(cons_oop, (J,), prep_pushforward, adtype, θ, (v,))) + end + elseif cons_jvp == true && f.cons !== nothing + cons_jvp! = (J, θ, v) -> f.cons_jvp(J, θ, v, p) + else + cons_jvp! = nothing + end + + conshess_sparsity = f.cons_hess_prototype + conshess_colors = f.cons_hess_colorvec + if f.cons !== nothing && f.cons_h === nothing && cons_h == true + prep_cons_hess = [prepare_hessian(cons_oop, soadtype, x, Constant(i)) + for i in 1:num_cons] + + function cons_h!(H, θ) + for i in 1:num_cons + hessian!(cons_oop, H[i], prep_cons_hess[i], soadtype, θ, Constant(i)) + end + end + elseif cons_h == true && f.cons !== nothing + cons_h! = (res, θ) -> f.cons_h(res, θ, p) + else + cons_h! 
= nothing + end + + lag_hess_prototype = f.lag_hess_prototype + + if f.cons !== nothing && lag_h == true && f.lag_h === nothing + lag_prep = prepare_hessian( + lagrangian, soadtype, x, Constant(one(eltype(x))), + Constant(ones(eltype(x), num_cons)), Constant(p)) + lag_hess_prototype = zeros(Bool, num_cons, length(x)) + + function lag_h!(H::AbstractMatrix, θ, σ, λ) + if σ == zero(eltype(θ)) + cons_h!(H, θ) + H *= λ + else + hessian!(lagrangian, H, lag_prep, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + end + end + + function lag_h!(h::AbstractVector, θ, σ, λ) + H = hessian( + lagrangian, lag_prep, soadtype, θ, Constant(σ), Constant(λ), Constant(p)) + k = 0 + for i in 1:length(θ) + for j in 1:i + k += 1 + h[k] = H[i, j] + end + end + end + + if p !== SciMLBase.NullParameters() && p !== nothing + function lag_h!(H::AbstractMatrix, θ, σ, λ, p) + if σ == zero(eltype(θ)) + cons_h!(H, θ) + H *= λ + else + hessian!(lagrangian, H, lag_prep, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + end + end + + function lag_h!(h::AbstractVector, θ, σ, λ, p) + H = hessian(lagrangian, lag_prep, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + k = 0 + for i in 1:length(θ) + for j in 1:i + k += 1 + h[k] = H[i, j] + end + end + end + end + elseif lag_h == true && f.cons !== nothing + lag_h! = (res, θ, σ, μ, p = p) -> f.lag_h(res, θ, σ, μ, p) + else + lag_h! = nothing + end + + return OptimizationFunction{true}(f.f, adtype; + grad = grad, fg = fg!, hess = hess, hv = hv!, fgh = fgh!, + cons = cons, cons_j = cons_j!, cons_h = cons_h!, + cons_vjp = cons_vjp!, cons_jvp = cons_jvp!, + hess_prototype = hess_sparsity, + hess_colorvec = hess_colors, + cons_jac_prototype = cons_jac_prototype, + cons_jac_colorvec = cons_jac_colorvec, + cons_hess_prototype = conshess_sparsity, + cons_hess_colorvec = conshess_colors, + lag_h = lag_h!, + lag_hess_prototype = lag_hess_prototype, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) +end + +function instantiate_function( + f::OptimizationFunction{true}, cache::OptimizationBase.ReInitCache, + adtype::ADTypes.AbstractADType, num_cons = 0; + kwargs...) + x = cache.u0 + p = cache.p + + return instantiate_function(f, x, adtype, p, num_cons; kwargs...) +end + +function instantiate_function( + f::OptimizationFunction{false}, x, adtype::ADTypes.AbstractADType, + p = SciMLBase.NullParameters(), num_cons = 0; + g = false, h = false, hv = false, fg = false, fgh = false, + cons_j = false, cons_vjp = false, cons_jvp = false, cons_h = false, + lag_h = false) + adtype, soadtype = generate_adtype(adtype) + + if g == true && f.grad === nothing + prep_grad = prepare_gradient(f.f, adtype, x, Constant(p)) + function grad(θ) + gradient(f.f, prep_grad, adtype, θ, Constant(p)) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function grad(θ, p) + gradient(f.f, prep_grad, adtype, θ, Constant(p)) + end + end + elseif g == true + grad = (θ, p = p) -> f.grad(θ, p) + else + grad = nothing + end + + if fg == true && f.fg === nothing + if g == false + prep_grad = prepare_gradient(f.f, adtype, x, Constant(p)) + end + function fg!(θ) + (y, res) = value_and_gradient(f.f, prep_grad, adtype, θ, Constant(p)) + return y, res + end + if p !== SciMLBase.NullParameters() && p !== nothing + function fg!(θ, p) + (y, res) = value_and_gradient(f.f, prep_grad, adtype, θ, Constant(p)) + return y, res + end + end + elseif fg == true + fg! = (θ, p = p) -> f.fg(θ, p) + else + fg! 
= nothing + end + + hess_sparsity = f.hess_prototype + hess_colors = f.hess_colorvec + if h == true && f.hess === nothing + prep_hess = prepare_hessian(f.f, soadtype, x, Constant(p)) + function hess(θ) + hessian(f.f, prep_hess, soadtype, θ, Constant(p)) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function hess(θ, p) + hessian(f.f, prep_hess, soadtype, θ, Constant(p)) + end + end + elseif h == true + hess = (θ, p = p) -> f.hess(θ, p) + else + hess = nothing + end + + if fgh == true && f.fgh === nothing + function fgh!(θ) + (y, G, H) = value_derivative_and_second_derivative( + f.f, prep_hess, adtype, θ, Constant(p)) + return y, G, H + end + if p !== SciMLBase.NullParameters() && p !== nothing + function fgh!(θ, p) + (y, G, H) = value_derivative_and_second_derivative( + f.f, prep_hess, adtype, θ, Constant(p)) + return y, G, H + end + end + elseif fgh == true + fgh! = (θ, p = p) -> f.fgh(θ, p) + else + fgh! = nothing + end + + if hv == true && f.hv === nothing + prep_hvp = prepare_hvp(f.f, soadtype, x, (zeros(eltype(x), size(x)),), Constant(p)) + function hv!(θ, v) + only(hvp(f.f, prep_hvp, soadtype, θ, (v,), Constant(p))) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function hv!(θ, v, p) + only(hvp(f.f, prep_hvp, soadtype, θ, (v,), Constant(p))) + end + end + elseif hv == true + hv! = (θ, v, p = p) -> f.hv(θ, v, p) + else + hv! = nothing + end + + if f.cons === nothing + cons = nothing + else + cons = Base.Fix2(f.cons, p) + + function lagrangian(θ, σ, λ, p) + return σ * f.f(θ, p) + dot(λ, f.cons(θ, p)) + end + end + + cons_jac_prototype = f.cons_jac_prototype + cons_jac_colorvec = f.cons_jac_colorvec + if f.cons !== nothing && cons_j == true && f.cons_j === nothing + prep_jac = prepare_jacobian(f.cons, adtype, x, Constant(p)) + function cons_j!(θ) + J = jacobian(f.cons, prep_jac, adtype, θ, Constant(p)) + if size(J, 1) == 1 + J = vec(J) + end + return J + end + elseif cons_j == true && f.cons !== nothing + cons_j! = (θ) -> f.cons_j(θ, p) + else + cons_j! = nothing + end + + if f.cons_vjp === nothing && cons_vjp == true && f.cons !== nothing + prep_pullback = prepare_pullback( + f.cons, adtype, x, (ones(eltype(x), num_cons),), Constant(p)) + function cons_vjp!(θ, v) + return only(pullback(f.cons, prep_pullback, adtype, θ, (v,), Constant(p))) + end + elseif cons_vjp == true && f.cons !== nothing + cons_vjp! = (θ, v) -> f.cons_vjp(θ, v, p) + else + cons_vjp! = nothing + end + + if f.cons_jvp === nothing && cons_jvp == true && f.cons !== nothing + prep_pushforward = prepare_pushforward( + f.cons, adtype, x, (ones(eltype(x), length(x)),), Constant(p)) + function cons_jvp!(θ, v) + return only(pushforward(f.cons, prep_pushforward, adtype, θ, (v,), Constant(p))) + end + elseif cons_jvp == true && f.cons !== nothing + cons_jvp! = (θ, v) -> f.cons_jvp(θ, v, p) + else + cons_jvp! = nothing + end + + conshess_sparsity = f.cons_hess_prototype + conshess_colors = f.cons_hess_colorvec + if f.cons !== nothing && cons_h == true && f.cons_h === nothing + function cons_i(x, i) + return f.cons(x, p)[i] + end + prep_cons_hess = [prepare_hessian(cons_i, soadtype, x, Constant(i)) + for i in 1:num_cons] + + function cons_h!(θ) + H = map(1:num_cons) do i + hessian(cons_i, prep_cons_hess[i], soadtype, θ, Constant(i)) + end + return H + end + elseif cons_h == true && f.cons !== nothing + cons_h! = (θ) -> f.cons_h(θ, p) + else + cons_h! 
= nothing + end + + lag_hess_prototype = f.lag_hess_prototype + + if f.cons !== nothing && lag_h == true && f.lag_h === nothing + lag_prep = prepare_hessian( + lagrangian, soadtype, x, Constant(one(eltype(x))), + Constant(ones(eltype(x), num_cons)), Constant(p)) + lag_hess_prototype = zeros(Bool, num_cons, length(x)) + + function lag_h!(θ, σ, λ) + if σ == zero(eltype(θ)) + return λ .* cons_h(θ) + else + return hessian(lagrangian, lag_prep, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + end + end + + if p !== SciMLBase.NullParameters() && p !== nothing + function lag_h!(θ, σ, λ, p) + if σ == zero(eltype(θ)) + return λ .* cons_h(θ) + else + return hessian(lagrangian, lag_prep, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + end + end + end + elseif lag_h == true && f.cons !== nothing + lag_h! = (θ, σ, λ, p = p) -> f.lag_h(θ, σ, λ, p) + else + lag_h! = nothing + end + + return OptimizationFunction{false}(f.f, adtype; + grad = grad, fg = fg!, hess = hess, hv = hv!, fgh = fgh!, + cons = cons, cons_j = cons_j!, cons_h = cons_h!, + cons_vjp = cons_vjp!, cons_jvp = cons_jvp!, + hess_prototype = hess_sparsity, + hess_colorvec = hess_colors, + cons_jac_prototype = cons_jac_prototype, + cons_jac_colorvec = cons_jac_colorvec, + cons_hess_prototype = conshess_sparsity, + cons_hess_colorvec = conshess_colors, + lag_h = lag_h!, + lag_hess_prototype = lag_hess_prototype, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) +end + +function instantiate_function( + f::OptimizationFunction{false}, cache::OptimizationBase.ReInitCache, + adtype::ADTypes.AbstractADType, num_cons = 0; kwargs...) + x = cache.u0 + p = cache.p + + return instantiate_function(f, x, adtype, p, num_cons; kwargs...) +end diff --git a/lib/OptimizationBase/src/OptimizationDISparseExt.jl b/lib/OptimizationBase/src/OptimizationDISparseExt.jl new file mode 100644 index 000000000..dd3219fe5 --- /dev/null +++ b/lib/OptimizationBase/src/OptimizationDISparseExt.jl @@ -0,0 +1,532 @@ +import ArrayInterface +import SciMLBase: OptimizationFunction +import LinearAlgebra: I +import DifferentiationInterface +import DifferentiationInterface: prepare_gradient, prepare_hessian, prepare_hvp, + prepare_jacobian, value_and_gradient!, + value_derivative_and_second_derivative!, + value_and_gradient, value_derivative_and_second_derivative, + gradient!, hessian!, hvp!, jacobian!, gradient, hessian, + hvp, jacobian +using ADTypes +using SparseConnectivityTracer, SparseMatrixColorings + +function instantiate_function( + f::OptimizationFunction{true}, x, adtype::ADTypes.AutoSparse{<:AbstractADType}, + p = SciMLBase.NullParameters(), num_cons = 0; + g = false, h = false, hv = false, fg = false, fgh = false, + cons_j = false, cons_vjp = false, cons_jvp = false, cons_h = false, + lag_h = false) + adtype, soadtype = generate_sparse_adtype(adtype) + + if g == true && f.grad === nothing + prep_grad = prepare_gradient(f.f, adtype.dense_ad, x, Constant(p)) + function grad(res, θ) + gradient!(f.f, res, prep_grad, adtype.dense_ad, θ, Constant(p)) + end + if p !== SciMLBase.NullParameters() + function grad(res, θ, p) + gradient!(f.f, res, prep_grad, adtype.dense_ad, θ, Constant(p)) + end + end + elseif g == true + grad = (G, θ, p = p) -> f.grad(G, θ, p) + else + grad = nothing + end + + if fg == true && f.fg === nothing + if g == false + prep_grad = prepare_gradient(f.f, adtype.dense_ad, x, Constant(p)) + end + function fg!(res, θ) + (y, _) = value_and_gradient!( + f.f, res, prep_grad, adtype.dense_ad, θ, Constant(p)) + return y + end + if p !== 
SciMLBase.NullParameters() + function fg!(res, θ, p) + (y, _) = value_and_gradient!( + f.f, res, prep_grad, adtype.dense_ad, θ, Constant(p)) + return y + end + end + elseif fg == true + fg! = (G, θ, p = p) -> f.fg(G, θ, p) + else + fg! = nothing + end + + hess_sparsity = f.hess_prototype + hess_colors = f.hess_colorvec + if f.hess === nothing && h == true + prep_hess = prepare_hessian(f.f, soadtype, x, Constant(p)) + function hess(res, θ) + hessian!(f.f, res, prep_hess, soadtype, θ, Constant(p)) + end + hess_sparsity = prep_hess.coloring_result.A + hess_colors = prep_hess.coloring_result.color + + if p !== SciMLBase.NullParameters() && p !== nothing + function hess(res, θ, p) + hessian!(f.f, res, prep_hess, soadtype, θ, Constant(p)) + end + end + elseif h == true + hess = (H, θ, p = p) -> f.hess(H, θ, p) + else + hess = nothing + end + + if fgh == true && f.fgh === nothing + function fgh!(G, H, θ) + (y, _, _) = value_derivative_and_second_derivative!( + f.f, G, H, prep_hess, soadtype.dense_ad, θ, Constant(p)) + return y + end + if p !== SciMLBase.NullParameters() && p !== nothing + function fgh!(G, H, θ, p) + (y, _, _) = value_derivative_and_second_derivative!( + f.f, G, H, prep_hess, soadtype.dense_ad, θ, Constant(p)) + return y + end + end + elseif fgh == true + fgh! = (G, H, θ, p = p) -> f.fgh(G, H, θ, p) + else + fgh! = nothing + end + + if hv == true && f.hv === nothing + prep_hvp = prepare_hvp( + f.f, soadtype.dense_ad, x, (zeros(eltype(x), size(x)),), Constant(p)) + function hv!(H, θ, v) + only(hvp!(f.f, (H,), prep_hvp, soadtype.dense_ad, θ, (v,), Constant(p))) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function hv!(H, θ, v, p) + only(hvp!(f.f, (H,), prep_hvp, soadtype.dense_ad, θ, (v,), Constant(p))) + end + end + elseif hv == true + hv! = (H, θ, v, p = p) -> f.hv(H, θ, v, p) + else + hv! = nothing + end + + if f.cons === nothing + cons = nothing + else + cons = (res, θ) -> f.cons(res, θ, p) + + function cons_oop(x) + _res = zeros(eltype(x), num_cons) + f.cons(_res, x, p) + return _res + end + + function cons_oop(x, i) + _res = zeros(eltype(x), num_cons) + f.cons(_res, x, p) + return _res[i] + end + + function lagrangian(θ, σ, λ, p) + if eltype(θ) <: SparseConnectivityTracer.AbstractTracer || !iszero(θ) + return σ * f.f(θ, p) + dot(λ, cons_oop(θ)) + else + return dot(λ, cons_oop(θ)) + end + end + end + + cons_jac_prototype = f.cons_jac_prototype + cons_jac_colorvec = f.cons_jac_colorvec + if f.cons !== nothing && cons_j == true && f.cons_j === nothing + prep_jac = prepare_jacobian(cons_oop, adtype, x) + function cons_j!(J, θ) + jacobian!(cons_oop, J, prep_jac, adtype, θ) + if size(J, 1) == 1 + J = vec(J) + end + end + cons_jac_prototype = prep_jac.coloring_result.A + cons_jac_colorvec = prep_jac.coloring_result.color + elseif cons_j === true && f.cons !== nothing + cons_j! = (J, θ) -> f.cons_j(J, θ, p) + else + cons_j! = nothing + end + + if f.cons_vjp === nothing && cons_vjp == true && f.cons !== nothing + prep_pullback = prepare_pullback( + cons_oop, adtype.dense_ad, x, (ones(eltype(x), num_cons),)) + function cons_vjp!(J, θ, v) + only(pullback!(cons_oop, (J,), prep_pullback, adtype.dense_ad, θ, (v,))) + end + elseif cons_vjp === true && f.cons !== nothing + cons_vjp! = (J, θ, v) -> f.cons_vjp(J, θ, v, p) + else + cons_vjp! 
= nothing + end + + if f.cons_jvp === nothing && cons_jvp == true && f.cons !== nothing + prep_pushforward = prepare_pushforward( + cons_oop, adtype.dense_ad, x, (ones(eltype(x), length(x)),)) + function cons_jvp!(J, θ, v) + only(pushforward!(cons_oop, (J,), prep_pushforward, adtype.dense_ad, θ, (v,))) + end + elseif cons_jvp === true && f.cons !== nothing + cons_jvp! = (J, θ, v) -> f.cons_jvp(J, θ, v, p) + else + cons_jvp! = nothing + end + + conshess_sparsity = f.cons_hess_prototype + conshess_colors = f.cons_hess_colorvec + if f.cons !== nothing && f.cons_h === nothing && cons_h == true + prep_cons_hess = [prepare_hessian(cons_oop, soadtype, x, Constant(i)) + for i in 1:num_cons] + colores = getfield.(prep_cons_hess, :coloring_result) + conshess_sparsity = getfield.(colores, :A) + conshess_colors = getfield.(colores, :color) + function cons_h!(H, θ) + for i in 1:num_cons + hessian!(cons_oop, H[i], prep_cons_hess[i], soadtype, θ, Constant(i)) + end + end + elseif cons_h == true && f.cons !== nothing + cons_h! = (res, θ) -> f.cons_h(res, θ, p) + else + cons_h! = nothing + end + + lag_hess_prototype = f.lag_hess_prototype + lag_hess_colors = f.lag_hess_colorvec + if f.cons !== nothing && lag_h == true && f.lag_h === nothing + lag_prep = prepare_hessian( + lagrangian, soadtype, x, Constant(one(eltype(x))), + Constant(ones(eltype(x), num_cons)), Constant(p)) + lag_hess_prototype = lag_prep.coloring_result.A + lag_hess_colors = lag_prep.coloring_result.color + + function lag_h!(H::AbstractMatrix, θ, σ, λ) + if σ == zero(eltype(θ)) + cons_h!(H, θ) + H *= λ + else + hessian!(lagrangian, H, lag_prep, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + end + end + + function lag_h!(h, θ, σ, λ) + H = hessian( + lagrangian, lag_prep, soadtype, θ, Constant(σ), Constant(λ), Constant(p)) + k = 0 + rows, cols, _ = findnz(H) + for (i, j) in zip(rows, cols) + if i <= j + k += 1 + h[k] = H[i, j] + end + end + end + + if p !== SciMLBase.NullParameters() && p !== nothing + function lag_h!(H::AbstractMatrix, θ, σ, λ, p) + if σ == zero(eltype(θ)) + cons_h(H, θ) + H *= λ + else + hessian!(lagrangian, H, lag_prep, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + end + end + + function lag_h!(h, θ, σ, λ, p) + H = hessian(lagrangian, lag_prep, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + k = 0 + rows, cols, _ = findnz(H) + for (i, j) in zip(rows, cols) + if i <= j + k += 1 + h[k] = H[i, j] + end + end + end + end + elseif lag_h == true + lag_h! = (H, θ, σ, λ, p = p) -> f.lag_h(H, θ, σ, λ, p) + else + lag_h! = nothing + end + return OptimizationFunction{true}(f.f, adtype; + grad = grad, fg = fg!, hess = hess, hv = hv!, fgh = fgh!, + cons = cons, cons_j = cons_j!, cons_h = cons_h!, + cons_vjp = cons_vjp!, cons_jvp = cons_jvp!, + hess_prototype = hess_sparsity, + hess_colorvec = hess_colors, + cons_jac_prototype = cons_jac_prototype, + cons_jac_colorvec = cons_jac_colorvec, + cons_hess_prototype = conshess_sparsity, + cons_hess_colorvec = conshess_colors, + lag_h = lag_h!, + lag_hess_prototype = lag_hess_prototype, + lag_hess_colorvec = lag_hess_colors, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) +end + +function instantiate_function( + f::OptimizationFunction{true}, cache::OptimizationBase.ReInitCache, + adtype::ADTypes.AutoSparse{<:AbstractADType}, num_cons = 0; kwargs...) + x = cache.u0 + p = cache.p + + return instantiate_function(f, x, adtype, p, num_cons; kwargs...) 
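+    # This method just unpacks the `ReInitCache` (initial point `u0` and parameters `p`)
+    # and forwards to the positional method above, so re-initialized problems reuse the
+    # same sparse preparation path.
+    #
+    # Hedged usage sketch (hypothetical objective/constraint, not part of this file): with a
+    # plain `AutoSparse` wrapper, `generate_sparse_adtype` fills in the defaults
+    # `TracerSparsityDetector()` and `GreedyColoringAlgorithm()`.
+    #
+    #     rosen(x, p) = (1 - x[1])^2 + 100 * (x[2] - x[1]^2)^2
+    #     con(res, x, p) = (res[1] = x[1]^2 + x[2]^2; nothing)
+    #     optf = OptimizationFunction(rosen, AutoSparse(AutoForwardDiff()); cons = con)
+    #     inst = instantiate_function(optf, zeros(2), AutoSparse(AutoForwardDiff()), nothing, 1;
+    #         g = true, h = true, cons_j = true)
+    #     # `inst.hess_prototype` and `inst.cons_jac_prototype` now carry the detected sparsity patterns.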
+end + +function instantiate_function( + f::OptimizationFunction{false}, x, adtype::ADTypes.AutoSparse{<:AbstractADType}, + p = SciMLBase.NullParameters(), num_cons = 0; + g = false, h = false, hv = false, fg = false, fgh = false, + cons_j = false, cons_vjp = false, cons_jvp = false, cons_h = false, + lag_h = false) + adtype, soadtype = generate_sparse_adtype(adtype) + + if g == true && f.grad === nothing + prep_grad = prepare_gradient(f.f, adtype.dense_ad, x, Constant(p)) + function grad(θ) + gradient(f.f, prep_grad, adtype.dense_ad, θ, Constant(p)) + end + if p !== SciMLBase.NullParameters() && p !== nothing + function grad(θ, p) + gradient(f.f, prep_grad, adtype.dense_ad, θ, Constant(p)) + end + end + elseif g == true + grad = (θ, p = p) -> f.grad(θ, p) + else + grad = nothing + end + + if fg == true && f.fg === nothing + if g == false + prep_grad = prepare_gradient(f.f, adtype.dense_ad, x, Constant(p)) + end + function fg!(θ) + (y, G) = value_and_gradient(f.f, prep_grad, adtype.dense_ad, θ, Constant(p)) + return y, G + end + if p !== SciMLBase.NullParameters() && p !== nothing + function fg!(θ, p) + (y, G) = value_and_gradient(f.f, prep_grad, adtype.dense_ad, θ, Constant(p)) + return y, G + end + end + elseif fg == true + fg! = (θ, p = p) -> f.fg(θ, p) + else + fg! = nothing + end + + if fgh == true && f.fgh === nothing + function fgh!(θ) + (y, G, H) = value_derivative_and_second_derivative( + f.f, prep_hess, soadtype, θ, Constant(p)) + return y, G, H + end + + if p !== SciMLBase.NullParameters() && p !== nothing + function fgh!(θ, p) + (y, G, H) = value_derivative_and_second_derivative( + f.f, prep_hess, soadtype, θ, Constant(p)) + return y, G, H + end + end + elseif fgh == true + fgh! = (θ, p = p) -> f.fgh(θ, p) + else + fgh! = nothing + end + + hess_sparsity = f.hess_prototype + hess_colors = f.hess_colorvec + if h == true && f.hess === nothing + prep_hess = prepare_hessian(f.f, soadtype, x, Constant(p)) + function hess(θ) + hessian(f.f, prep_hess, soadtype, θ, Constant(p)) + end + hess_sparsity = prep_hess.coloring_result.A + hess_colors = prep_hess.coloring_result.color + + if p !== SciMLBase.NullParameters() && p !== nothing + function hess(θ, p) + hessian(f.f, prep_hess, soadtype, θ, Constant(p)) + end + end + elseif h == true + hess = (θ, p = p) -> f.hess(θ, p) + else + hess = nothing + end + + if hv == true && f.hv === nothing + prep_hvp = prepare_hvp( + f.f, soadtype.dense_ad, x, (zeros(eltype(x), size(x)),), Constant(p)) + function hv!(θ, v) + only(hvp(f.f, prep_hvp, soadtype.dense_ad, θ, (v,), Constant(p))) + end + + if p !== SciMLBase.NullParameters() && p !== nothing + function hv!(θ, v, p) + only(hvp(f.f, prep_hvp, soadtype.dense_ad, θ, (v,), Constant(p))) + end + end + elseif hv == true + hv! = (θ, v, p = p) -> f.hv(θ, v, p) + else + hv! = nothing + end + + if f.cons === nothing + cons = nothing + else + cons = Base.Fix2(f.cons, p) + + function lagrangian(θ, σ, λ, p) + return σ * f.f(θ, p) + dot(λ, f.cons(θ, p)) + end + end + + cons_jac_prototype = f.cons_jac_prototype + cons_jac_colorvec = f.cons_jac_colorvec + if f.cons !== nothing && cons_j == true && f.cons_j === nothing + prep_jac = prepare_jacobian(f.cons, adtype, x, Constant(p)) + function cons_j!(θ) + J = jacobian(f.cons, prep_jac, adtype, θ, Constant(p)) + if size(J, 1) == 1 + J = vec(J) + end + return J + end + cons_jac_prototype = prep_jac.coloring_result.A + cons_jac_colorvec = prep_jac.coloring_result.color + elseif cons_j === true && f.cons !== nothing + cons_j! = (θ) -> f.cons_j(θ, p) + else + cons_j! 
= nothing + end + + if f.cons_vjp === nothing && cons_vjp == true && f.cons !== nothing + prep_pullback = prepare_pullback( + f.cons, adtype.dense_ad, x, (ones(eltype(x), num_cons),), Constant(p)) + function cons_vjp!(θ, v) + only(pullback(f.cons, prep_pullback, adtype.dense_ad, θ, (v,), Constant(p))) + end + elseif cons_vjp === true && f.cons !== nothing + cons_vjp! = (θ, v) -> f.cons_vjp(θ, v, p) + else + cons_vjp! = nothing + end + + if f.cons_jvp === nothing && cons_jvp == true && f.cons !== nothing + prep_pushforward = prepare_pushforward( + f.cons, adtype.dense_ad, x, (ones(eltype(x), length(x)),), Constant(p)) + function cons_jvp!(θ, v) + only(pushforward( + f.cons, prep_pushforward, adtype.dense_ad, θ, (v,), Constant(p))) + end + elseif cons_jvp === true && f.cons !== nothing + cons_jvp! = (θ, v) -> f.cons_jvp(θ, v, p) + else + cons_jvp! = nothing + end + + conshess_sparsity = f.cons_hess_prototype + conshess_colors = f.cons_hess_colorvec + if f.cons !== nothing && cons_h == true && f.cons_h === nothing + function cons_i(x, i) + f.cons(x, p)[i] + end + prep_cons_hess = [prepare_hessian(cons_i, soadtype, x, Constant(i)) + for i in 1:num_cons] + + function cons_h!(θ) + H = map(1:num_cons) do i + hessian(cons_i, prep_cons_hess[i], soadtype, θ, Constant(i)) + end + return H + end + colores = getfield.(prep_cons_hess, :coloring_result) + conshess_sparsity = getfield.(colores, :A) + conshess_colors = getfield.(colores, :color) + elseif cons_h == true && f.cons !== nothing + cons_h! = (res, θ) -> f.cons_h(res, θ, p) + else + cons_h! = nothing + end + + lag_hess_prototype = f.lag_hess_prototype + lag_hess_colors = f.lag_hess_colorvec + if f.cons !== nothing && lag_h == true && f.lag_h === nothing + lag_prep = prepare_hessian( + lagrangian, soadtype, x, Constant(one(eltype(x))), + Constant(ones(eltype(x), num_cons)), Constant(p)) + function lag_h!(θ, σ, λ) + if σ == zero(eltype(θ)) + return λ .* cons_h!(θ) + else + hess = hessian(lagrangian, lag_prep, soadtype, θ, + Constant(σ), Constant(λ), Constant(p)) + return hess + end + end + lag_hess_prototype = lag_prep.coloring_result.A + lag_hess_colors = lag_prep.coloring_result.color + + if p !== SciMLBase.NullParameters() && p !== nothing + function lag_h!(θ, σ, λ, p) + if σ == zero(eltype(θ)) + return λ .* cons_h!(θ) + else + hess = hessian( + lagrangian, lag_prep, θ, Constant(σ), Constant(λ), Constant(p)) + return hess + end + end + end + elseif lag_h == true && f.cons !== nothing + lag_h! = (θ, σ, μ, p = p) -> f.lag_h(θ, σ, μ, p) + else + lag_h! = nothing + end + return OptimizationFunction{false}(f.f, adtype; + grad = grad, fg = fg!, hess = hess, hv = hv!, fgh = fgh!, + cons = cons, cons_j = cons_j!, cons_h = cons_h!, + cons_vjp = cons_vjp!, cons_jvp = cons_jvp!, + hess_prototype = hess_sparsity, + hess_colorvec = hess_colors, + cons_jac_prototype = cons_jac_prototype, + cons_jac_colorvec = cons_jac_colorvec, + cons_hess_prototype = conshess_sparsity, + cons_hess_colorvec = conshess_colors, + lag_h = lag_h!, + lag_hess_prototype = lag_hess_prototype, + lag_hess_colorvec = lag_hess_colors, + sys = f.sys, + expr = f.expr, + cons_expr = f.cons_expr) +end + +function instantiate_function( + f::OptimizationFunction{false}, cache::OptimizationBase.ReInitCache, + adtype::ADTypes.AutoSparse{<:AbstractADType}, num_cons = 0; kwargs...) + x = cache.u0 + p = cache.p + + return instantiate_function(f, x, adtype, p, num_cons; kwargs...) 
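+    # Same cache-unpacking forwarder as the in-place case above, but for out-of-place
+    # (`OptimizationFunction{false}`) problems: every generated callable returns its result
+    # (gradient vector, Hessian matrix, Jacobian, ...) instead of writing into a
+    # preallocated buffer, e.g. `inst.grad(θ)` and `inst.hess(θ)` rather than
+    # `inst.grad(G, θ)` / `inst.hess(H, θ)`.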
+end diff --git a/lib/OptimizationBase/src/adtypes.jl b/lib/OptimizationBase/src/adtypes.jl new file mode 100644 index 000000000..071528dc4 --- /dev/null +++ b/lib/OptimizationBase/src/adtypes.jl @@ -0,0 +1,288 @@ +""" + AutoEnzyme <: AbstractADType + +An AbstractADType choice for use in OptimizationFunction for automatically +generating the unspecified derivative functions. Usage: +```julia +OptimizationFunction(f, AutoEnzyme(); kwargs...) +``` +This uses the [Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl) package. Enzyme performs automatic differentiation on the LLVM IR code generated from julia. +It is highly-efficient and its ability perform AD on optimized code allows Enzyme to meet or exceed the performance of state-of-the-art AD tools. + - Compatible with GPUs + - Compatible with Hessian-based optimization + - Compatible with Hv-based optimization + - Compatible with constraints +Note that only the unspecified derivative functions are defined. For example, +if a `hess` function is supplied to the `OptimizationFunction`, then the Hessian +is not defined via Enzyme. +""" +AutoEnzyme + +""" + AutoFiniteDiff{T1,T2,T3} <: AbstractADType + +An AbstractADType choice for use in OptimizationFunction for automatically +generating the unspecified derivative functions. Usage: + +```julia +OptimizationFunction(f, AutoFiniteDiff(); kwargs...) +``` + +This uses [FiniteDiff.jl](https://github.com/JuliaDiff/FiniteDiff.jl). +While not necessarily the most efficient, this is the only +choice that doesn't require the `f` function to be automatically +differentiable, which means it applies to any choice. However, because +it's using finite differencing, one needs to be careful as this procedure +introduces numerical error into the derivative estimates. + + - Compatible with GPUs + - Compatible with Hessian-based optimization + - Compatible with Hv-based optimization + - Compatible with constraint functions + +Note that only the unspecified derivative functions are defined. For example, +if a `hess` function is supplied to the `OptimizationFunction`, then the +Hessian is not defined via FiniteDiff. + +## Constructor + +```julia +AutoFiniteDiff(; fdtype = Val(:forward)fdjtype = fdtype, fdhtype = Val(:hcentral)) +``` + + - `fdtype`: the method used for defining the gradient + - `fdjtype`: the method used for defining the Jacobian of constraints. + - `fdhtype`: the method used for defining the Hessian + +For more information on the derivative type specifiers, see the +[FiniteDiff.jl documentation](https://github.com/JuliaDiff/FiniteDiff.jl). +""" +AutoFiniteDiff + +""" + AutoForwardDiff{chunksize} <: AbstractADType + +An AbstractADType choice for use in OptimizationFunction for automatically +generating the unspecified derivative functions. Usage: + +```julia +OptimizationFunction(f, AutoForwardDiff(); kwargs...) +``` + +This uses the [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl) +package. It is the fastest choice for small systems, especially with +heavy scalar interactions. It is easy to use and compatible with most +Julia functions which have loose type restrictions. However, +because it's forward-mode, it scales poorly in comparison to other AD +choices. Hessian construction is suboptimal as it uses the forward-over-forward +approach. + + - Compatible with GPUs + - Compatible with Hessian-based optimization + - Compatible with Hv-based optimization + - Compatible with constraints + +Note that only the unspecified derivative functions are defined. 
For example, +if a `hess` function is supplied to the `OptimizationFunction`, then the +Hessian is not defined via ForwardDiff. +""" +AutoForwardDiff + +""" + AutoModelingToolkit <: AbstractADType + +An AbstractADType choice for use in OptimizationFunction for automatically +generating the unspecified derivative functions. Usage: + +```julia +OptimizationFunction(f, AutoModelingToolkit(); kwargs...) +``` + +This uses the [ModelingToolkit.jl](https://github.com/SciML/ModelingToolkit.jl) +package's `modelingtookitize` functionality to generate the derivatives and other fields of an `OptimizationFunction`. +This backend creates the symbolic expressions for the objective and its derivatives as well as +the constraints and their derivatives. Through `structural_simplify`, it enforces simplifications +that can reduce the number of operations needed to compute the derivatives of the constraints. This automatically +generates the expression graphs that some solver interfaces through OptimizationMOI like +[AmplNLWriter.jl](https://github.com/jump-dev/AmplNLWriter.jl) require. + + - Compatible with GPUs + - Compatible with Hessian-based optimization + - Compatible with Hv-based optimization + - Compatible with constraints + +Note that only the unspecified derivative functions are defined. For example, +if a `hess` function is supplied to the `OptimizationFunction`, then the +Hessian is not generated via ModelingToolkit. + +## Constructor + +```julia +AutoModelingToolkit(false, false) +``` + + - `obj_sparse`: to indicate whether the objective hessian is sparse. + - `cons_sparse`: to indicate whether the constraints' jacobian and hessian are sparse. + +""" +AutoModelingToolkit + +""" + AutoReverseDiff <: AbstractADType + +An AbstractADType choice for use in OptimizationFunction for automatically +generating the unspecified derivative functions. Usage: + +```julia +OptimizationFunction(f, AutoReverseDiff(); kwargs...) +``` + +This uses the [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl) +package. `AutoReverseDiff` has a default argument, `compile`, which +denotes whether the reverse pass should be compiled. **`compile` should only +be set to `true` if `f` contains no branches (if statements, while loops) +otherwise it can produce incorrect derivatives!** + +`AutoReverseDiff` is generally applicable to many pure Julia codes, +and with `compile=true` it is one of the fastest options on code with +heavy scalar interactions. Hessian calculations are fast by mixing +ForwardDiff with ReverseDiff for forward-over-reverse. However, its +performance can falter when `compile=false`. + + - Not compatible with GPUs + - Compatible with Hessian-based optimization by mixing with ForwardDiff + - Compatible with Hv-based optimization by mixing with ForwardDiff + - Not compatible with constraint functions + +Note that only the unspecified derivative functions are defined. For example, +if a `hess` function is supplied to the `OptimizationFunction`, then the +Hessian is not defined via ReverseDiff. + +## Constructor + +```julia +AutoReverseDiff(; compile = false) +``` + +#### Note: currently, compilation is not defined/used! +""" +AutoReverseDiff + +""" + AutoTracker <: AbstractADType + +An AbstractADType choice for use in OptimizationFunction for automatically +generating the unspecified derivative functions. Usage: + +```julia +OptimizationFunction(f, AutoTracker(); kwargs...) +``` + +This uses the [Tracker.jl](https://github.com/FluxML/Tracker.jl) package. 
+Generally slower than ReverseDiff, it is generally applicable to many +pure Julia codes. + + - Compatible with GPUs + - Not compatible with Hessian-based optimization + - Not compatible with Hv-based optimization + - Not compatible with constraint functions + +Note that only the unspecified derivative functions are defined. For example, +if a `hess` function is supplied to the `OptimizationFunction`, then the +Hessian is not defined via Tracker. +""" +AutoTracker + +""" + AutoZygote <: AbstractADType + +An AbstractADType choice for use in OptimizationFunction for automatically +generating the unspecified derivative functions. Usage: + +```julia +OptimizationFunction(f, AutoZygote(); kwargs...) +``` + +This uses the [Zygote.jl](https://github.com/FluxML/Zygote.jl) package. +This is the staple reverse-mode AD that handles a large portion of +Julia with good efficiency. Hessian construction is fast via +forward-over-reverse mixing ForwardDiff.jl with Zygote.jl + + - Compatible with GPUs + - Compatible with Hessian-based optimization via ForwardDiff + - Compatible with Hv-based optimization via ForwardDiff + - Not compatible with constraint functions + +Note that only the unspecified derivative functions are defined. For example, +if a `hess` function is supplied to the `OptimizationFunction`, then the +Hessian is not defined via Zygote. +""" +AutoZygote + +function generate_adtype(adtype) + if !(adtype isa SciMLBase.NoAD || adtype isa DifferentiationInterface.SecondOrder || + adtype isa AutoZygote) + soadtype = DifferentiationInterface.SecondOrder(adtype, adtype) + elseif adtype isa AutoZygote + soadtype = DifferentiationInterface.SecondOrder(AutoForwardDiff(), adtype) + elseif adtype isa DifferentiationInterface.SecondOrder + soadtype = adtype + adtype = adtype.inner + elseif adtype isa SciMLBase.NoAD + soadtype = adtype + adtype = adtype + end + return adtype, soadtype +end + +function spadtype_to_spsoadtype(adtype) + if !(adtype.dense_ad isa SciMLBase.NoAD || + adtype.dense_ad isa DifferentiationInterface.SecondOrder || + adtype.dense_ad isa AutoZygote) + soadtype = AutoSparse( + DifferentiationInterface.SecondOrder(adtype.dense_ad, adtype.dense_ad), + sparsity_detector = adtype.sparsity_detector, + coloring_algorithm = adtype.coloring_algorithm) + elseif adtype.dense_ad isa AutoZygote + soadtype = AutoSparse( + DifferentiationInterface.SecondOrder(AutoForwardDiff(), adtype.dense_ad), + sparsity_detector = adtype.sparsity_detector, + coloring_algorithm = adtype.coloring_algorithm) + else + soadtype = adtype + end + return soadtype +end + +function filled_spad(adtype) + if adtype.sparsity_detector isa ADTypes.NoSparsityDetector && + adtype.coloring_algorithm isa ADTypes.NoColoringAlgorithm + adtype = AutoSparse(adtype.dense_ad; sparsity_detector = TracerSparsityDetector(), + coloring_algorithm = GreedyColoringAlgorithm()) + elseif adtype.sparsity_detector isa ADTypes.NoSparsityDetector && + !(adtype.coloring_algorithm isa ADTypes.NoColoringAlgorithm) + adtype = AutoSparse(adtype.dense_ad; sparsity_detector = TracerSparsityDetector(), + coloring_algorithm = adtype.coloring_algorithm) + elseif !(adtype.sparsity_detector isa ADTypes.NoSparsityDetector) && + adtype.coloring_algorithm isa ADTypes.NoColoringAlgorithm + adtype = AutoSparse(adtype.dense_ad; sparsity_detector = adtype.sparsity_detector, + coloring_algorithm = GreedyColoringAlgorithm()) + end +end + +function generate_sparse_adtype(adtype) + if !(adtype.dense_ad isa DifferentiationInterface.SecondOrder) + adtype = 
filled_spad(adtype) + soadtype = spadtype_to_spsoadtype(adtype) + else + soadtype = adtype + adtype = AutoSparse( + adtype.dense_ad.inner, + sparsity_detector = soadtype.sparsity_detector, + coloring_algorithm = soadtype.coloring_algorithm) + adtype = filled_spad(adtype) + soadtype = filled_spad(soadtype) + end + + return adtype, soadtype +end diff --git a/lib/OptimizationBase/src/augmented_lagrangian.jl b/lib/OptimizationBase/src/augmented_lagrangian.jl new file mode 100644 index 000000000..879090089 --- /dev/null +++ b/lib/OptimizationBase/src/augmented_lagrangian.jl @@ -0,0 +1,13 @@ +function generate_auglag(θ) + x = cache.f(θ, cache.p) + cons_tmp .= zero(eltype(θ)) + cache.f.cons(cons_tmp, θ) + cons_tmp[eq_inds] .= cons_tmp[eq_inds] - cache.lcons[eq_inds] + cons_tmp[ineq_inds] .= cons_tmp[ineq_inds] .- cache.ucons[ineq_inds] + opt_state = Optimization.OptimizationState(u = θ, objective = x[1]) + if cache.callback(opt_state, x...) + error("Optimization halted by callback.") + end + return x[1] + sum(@. λ * cons_tmp[eq_inds] + ρ / 2 * (cons_tmp[eq_inds] .^ 2)) + + 1 / (2 * ρ) * sum((max.(Ref(0.0), μ .+ (ρ .* cons_tmp[ineq_inds]))) .^ 2) +end diff --git a/lib/OptimizationBase/src/cache.jl b/lib/OptimizationBase/src/cache.jl new file mode 100644 index 000000000..7d529324b --- /dev/null +++ b/lib/OptimizationBase/src/cache.jl @@ -0,0 +1,98 @@ +isa_dataiterator(data) = false + +struct AnalysisResults{O, C} + objective::O + constraints::C +end + +struct OptimizationCache{F, RC, LB, UB, LC, UC, S, O, P, C, M} <: + SciMLBase.AbstractOptimizationCache + f::F + reinit_cache::RC + lb::LB + ub::UB + lcons::LC + ucons::UC + sense::S + opt::O + progress::P + callback::C + manifold::M + analysis_results::AnalysisResults + solver_args::NamedTuple +end + +function OptimizationCache(prob::SciMLBase.OptimizationProblem, opt; + callback = DEFAULT_CALLBACK, + maxiters::Union{Number, Nothing} = nothing, + maxtime::Union{Number, Nothing} = nothing, + abstol::Union{Number, Nothing} = nothing, + reltol::Union{Number, Nothing} = nothing, + progress = false, + structural_analysis = false, + manifold = nothing, + kwargs...) + if isa_dataiterator(prob.p) + reinit_cache = OptimizationBase.ReInitCache(prob.u0, iterate(prob.p)[1]) + reinit_cache_passedon = OptimizationBase.ReInitCache(prob.u0, prob.p) + else + reinit_cache = OptimizationBase.ReInitCache(prob.u0, prob.p) + reinit_cache_passedon = reinit_cache + end + + num_cons = prob.ucons === nothing ? 0 : length(prob.ucons) + + if !(prob.f.adtype isa DifferentiationInterface.SecondOrder || + prob.f.adtype isa AutoZygote) && + (SciMLBase.requireshessian(opt) || SciMLBase.requiresconshess(opt) || + SciMLBase.requireslagh(opt)) + @warn "The selected optimization algorithm requires second order derivatives, but `SecondOrder` ADtype was not provided. + So a `SecondOrder` with $(prob.f.adtype) for both inner and outer will be created, this can be suboptimal and not work in some cases so + an explicit `SecondOrder` ADtype is recommended." + elseif prob.f.adtype isa AutoZygote && + (SciMLBase.requiresconshess(opt) || SciMLBase.requireslagh(opt) || + SciMLBase.requireshessian(opt)) + @warn "The selected optimization algorithm requires second order derivatives, but `AutoZygote` ADtype was provided. + So a `SecondOrder` with `AutoZygote` for inner and `AutoForwardDiff` for outer will be created, for choosing another pair + an explicit `SecondOrder` ADtype is recommended." 
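+        # In both branches the fallback in `generate_adtype` builds the `SecondOrder`
+        # pairing automatically. To choose the pairing explicitly (and silence these
+        # warnings), construct the problem with a `SecondOrder` ADtype up front, e.g.
+        # (sketch, hypothetical objective `f`):
+        #
+        #     using DifferentiationInterface: SecondOrder
+        #     optf = OptimizationFunction(f, SecondOrder(AutoForwardDiff(), AutoZygote()))
+        #     prob = OptimizationProblem(optf, u0, p)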
+ end + + f = OptimizationBase.instantiate_function( + prob.f, reinit_cache, prob.f.adtype, num_cons; + g = SciMLBase.requiresgradient(opt), h = SciMLBase.requireshessian(opt), + hv = SciMLBase.requireshessian(opt), fg = SciMLBase.allowsfg(opt), + fgh = SciMLBase.allowsfgh(opt), cons_j = SciMLBase.requiresconsjac(opt), cons_h = SciMLBase.requiresconshess(opt), + cons_vjp = SciMLBase.allowsconsjvp(opt), cons_jvp = SciMLBase.allowsconsjvp(opt), lag_h = SciMLBase.requireslagh(opt), sense = prob.sense) + + if structural_analysis + obj_res, cons_res = symify_cache(f, prob, num_cons, manifold) + else + obj_res = nothing + cons_res = nothing + end + + return OptimizationCache(f, reinit_cache_passedon, prob.lb, prob.ub, prob.lcons, + prob.ucons, prob.sense, + opt, progress, callback, manifold, AnalysisResults(obj_res, cons_res), + merge((; maxiters, maxtime, abstol, reltol), + NamedTuple(kwargs))) +end + +function SciMLBase.__init(prob::SciMLBase.OptimizationProblem, opt; + callback = DEFAULT_CALLBACK, + maxiters::Union{Number, Nothing} = nothing, + maxtime::Union{Number, Nothing} = nothing, + abstol::Union{Number, Nothing} = nothing, + reltol::Union{Number, Nothing} = nothing, + progress = false, + kwargs...) + return OptimizationCache(prob, opt; maxiters, maxtime, abstol, callback, + reltol, progress, + kwargs...) +end + +# Wrapper for fields that may change in `reinit!(cache)` of a cache. +mutable struct ReInitCache{uType, P} + u0::uType + p::P +end diff --git a/lib/OptimizationBase/src/function.jl b/lib/OptimizationBase/src/function.jl new file mode 100644 index 000000000..822a3587e --- /dev/null +++ b/lib/OptimizationBase/src/function.jl @@ -0,0 +1,242 @@ + +function symbolify(e::Expr) + if !(e.args[1] isa Symbol) + e.args[1] = Symbol(e.args[1]) + end + symbolify.(e.args) + return e +end + +function symbolify(e) + return e +end + +function rep_pars_vals!(e::Expr, p) + rep_pars_vals!.(e.args, Ref(p)) + replace!(e.args, p...) +end + +function rep_pars_vals!(e, p) end + +""" + instantiate_function(f, x, ::AbstractADType, p, num_cons = 0)::OptimizationFunction + +This function is used internally by Optimization.jl to construct +the necessary extra functions (gradients, Hessians, etc.) before +optimization. Each of the ADType dispatches use the supplied automatic +differentiation type in order to specify how the construction process +occurs. + +If no ADType is given, then the default `NoAD` dispatch simply +defines closures on any supplied gradient function to enclose the +parameters to match the interfaces for the specific optimization +libraries (i.e. (G,x)->f.grad(G,x,p)). If a function is not given +and the `NoAD` dispatch is used, or if the AD dispatch is currently +not capable of defining said derivative, then the constructed +`OptimizationFunction` will simply use `nothing` to specify and undefined +function. + +The return of `instantiate_function` is an `OptimizationFunction` which +is then used in the optimization process. If an optimizer requires a +function that is not defined, an error is thrown. + +For more information on the use of automatic differentiation, see the +documentation of the `AbstractADType` types. +""" +function OptimizationBase.instantiate_function( + f::MultiObjectiveOptimizationFunction, x, ::SciMLBase.NoAD, + p, num_cons = 0; kwargs...) + jac = f.jac === nothing ? nothing : (J, x, args...) -> f.jac(J, x, p, args...) + hess = f.hess === nothing ? nothing : + [(H, x, args...) -> h(H, x, p, args...) for h in f.hess] + hv = f.hv === nothing ? 
nothing : (H, x, v, args...) -> f.hv(H, x, v, p, args...) + cons = f.cons === nothing ? nothing : (res, x) -> f.cons(res, x, p) + cons_j = f.cons_j === nothing ? nothing : (res, x) -> f.cons_j(res, x, p) + cons_jvp = f.cons_jvp === nothing ? nothing : (res, x) -> f.cons_jvp(res, x, p) + cons_vjp = f.cons_vjp === nothing ? nothing : (res, x) -> f.cons_vjp(res, x, p) + cons_h = f.cons_h === nothing ? nothing : (res, x) -> f.cons_h(res, x, p) + hess_prototype = f.hess_prototype === nothing ? nothing : + convert.(eltype(x), f.hess_prototype) + cons_jac_prototype = f.cons_jac_prototype === nothing ? nothing : + convert.(eltype(x), f.cons_jac_prototype) + cons_hess_prototype = f.cons_hess_prototype === nothing ? nothing : + [convert.(eltype(x), f.cons_hess_prototype[i]) + for i in 1:num_cons] + expr = symbolify(f.expr) + cons_expr = symbolify.(f.cons_expr) + + return MultiObjectiveOptimizationFunction{true}( + f.f, SciMLBase.NoAD(); jac = jac, hess = hess, + hv = hv, + cons = cons, cons_j = cons_j, cons_jvp = cons_jvp, cons_vjp = cons_vjp, cons_h = cons_h, + hess_prototype = hess_prototype, + cons_jac_prototype = cons_jac_prototype, + cons_hess_prototype = cons_hess_prototype, + expr = expr, cons_expr = cons_expr, + sys = f.sys, + observed = f.observed) +end + +function OptimizationBase.instantiate_function( + f::MultiObjectiveOptimizationFunction, cache::ReInitCache, ::SciMLBase.NoAD, + num_cons = 0; kwargs...) + jac = f.jac === nothing ? nothing : (J, x, args...) -> f.jac(J, x, cache.p, args...) + hess = f.hess === nothing ? nothing : + [(H, x, args...) -> h(H, x, cache.p, args...) for h in f.hess] + hv = f.hv === nothing ? nothing : (H, x, v, args...) -> f.hv(H, x, v, cache.p, args...) + cons = f.cons === nothing ? nothing : (res, x) -> f.cons(res, x, cache.p) + cons_j = f.cons_j === nothing ? nothing : (res, x) -> f.cons_j(res, x, cache.p) + cons_jvp = f.cons_jvp === nothing ? nothing : (res, x) -> f.cons_jvp(res, x, cache.p) + cons_vjp = f.cons_vjp === nothing ? nothing : (res, x) -> f.cons_vjp(res, x, cache.p) + cons_h = f.cons_h === nothing ? nothing : (res, x) -> f.cons_h(res, x, cache.p) + hess_prototype = f.hess_prototype === nothing ? nothing : + convert.(eltype(cache.u0), f.hess_prototype) + cons_jac_prototype = f.cons_jac_prototype === nothing ? nothing : + convert.(eltype(cache.u0), f.cons_jac_prototype) + cons_hess_prototype = f.cons_hess_prototype === nothing ? nothing : + [convert.(eltype(cache.u0), f.cons_hess_prototype[i]) + for i in 1:num_cons] + expr = symbolify(f.expr) + cons_expr = symbolify.(f.cons_expr) + + return MultiObjectiveOptimizationFunction{true}( + f.f, SciMLBase.NoAD(); jac = jac, hess = hess, + hv = hv, + cons = cons, cons_j = cons_j, cons_jvp = cons_jvp, cons_vjp = cons_vjp, cons_h = cons_h, + hess_prototype = hess_prototype, + cons_jac_prototype = cons_jac_prototype, + cons_hess_prototype = cons_hess_prototype, + expr = expr, cons_expr = cons_expr, + sys = f.sys, + observed = f.observed) +end + +function OptimizationBase.instantiate_function( + f::OptimizationFunction{true}, x, ::SciMLBase.NoAD, + p, num_cons = 0; sense, kwargs...) 
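+    # `NoAD` dispatch: no derivatives are generated here. Each user-supplied callable
+    # (`grad`, `fg`, `hess`, `fgh`, `hv`, `cons*`, `lag_h`) is only wrapped in a closure
+    # that fixes the parameters `p`, with an extra `(..., p)` method kept when `p` is not
+    # `NullParameters()`. Anything the user did not supply stays `nothing`, and the
+    # objective is negated below when `sense == MaxSense` so maximization problems can be
+    # handed to minimizers unchanged.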
+ if f.grad === nothing + grad = nothing + else + function grad(G, x) + return f.grad(G, x, p) + end + if p != SciMLBase.NullParameters() + function grad(G, x, p) + return f.grad(G, x, p) + end + end + end + if f.fg === nothing + fg = nothing + else + function fg(G, x) + return f.fg(G, x, p) + end + if p != SciMLBase.NullParameters() + function fg(G, x, p) + return f.fg(G, x, p) + end + end + end + if f.hess === nothing + hess = nothing + else + function hess(H, x) + return f.hess(H, x, p) + end + if p != SciMLBase.NullParameters() + function hess(H, x, p) + return f.hess(H, x, p) + end + end + end + + if f.fgh === nothing + fgh = nothing + else + function fgh(G, H, x) + return f.fgh(G, H, x, p) + end + if p != SciMLBase.NullParameters() + function fgh(G, H, x, p) + return f.fgh(G, H, x, p) + end + end + end + + if f.hv === nothing + hv = nothing + else + function hv(H, x, v) + return f.hv(H, x, v, p) + end + if p != SciMLBase.NullParameters() + function hv(H, x, v, p) + return f.hv(H, x, v, p) + end + end + end + + cons = f.cons === nothing ? nothing : (res, x) -> f.cons(res, x, p) + cons_j = f.cons_j === nothing ? nothing : (res, x) -> f.cons_j(res, x, p) + cons_vjp = f.cons_vjp === nothing ? nothing : (res, x) -> f.cons_vjp(res, x, p) + cons_jvp = f.cons_jvp === nothing ? nothing : (res, x) -> f.cons_jvp(res, x, p) + cons_h = f.cons_h === nothing ? nothing : (res, x) -> f.cons_h(res, x, p) + + if f.lag_h === nothing + lag_h = nothing + else + function lag_h(res, x) + return f.lag_h(res, x, p) + end + if p != SciMLBase.NullParameters() + function lag_h(res, x, p) + return f.lag_h(res, x, p) + end + end + end + hess_prototype = f.hess_prototype === nothing ? nothing : + convert.(eltype(x), f.hess_prototype) + cons_jac_prototype = f.cons_jac_prototype === nothing ? nothing : + convert.(eltype(x), f.cons_jac_prototype) + cons_hess_prototype = f.cons_hess_prototype === nothing ? nothing : + [convert.(eltype(x), f.cons_hess_prototype[i]) + for i in 1:num_cons] + expr = symbolify(f.expr) + cons_expr = symbolify.(f.cons_expr) + + obj_f = (x,p) -> sense == MaxSense ? -1.0* f.f(x,p) : f.f(x,p) + return OptimizationFunction{true}(obj_f, SciMLBase.NoAD(); + grad = grad, fg = fg, hess = hess, fgh = fgh, hv = hv, + cons = cons, cons_j = cons_j, cons_h = cons_h, + cons_vjp = cons_vjp, cons_jvp = cons_jvp, + lag_h = lag_h, + hess_prototype = hess_prototype, + cons_jac_prototype = cons_jac_prototype, + cons_hess_prototype = cons_hess_prototype, + expr = expr, cons_expr = cons_expr, + sys = f.sys, + observed = f.observed) +end + +function OptimizationBase.instantiate_function( + f::OptimizationFunction{true}, cache::ReInitCache, ::SciMLBase.NoAD, + num_cons = 0; kwargs...) + x = cache.u0 + p = cache.p + + return instantiate_function(f, x, SciMLBase.NoAD(), p, num_cons; kwargs...) +end + +function instantiate_function(f::OptimizationFunction, x, adtype::ADTypes.AbstractADType, + p, num_cons = 0; kwargs...) + adtypestr = string(adtype) + _strtind = findfirst('.', adtypestr) + strtind = isnothing(_strtind) ? 5 : _strtind + 5 + open_nrmlbrkt_ind = findfirst('(', adtypestr) + open_squigllybrkt_ind = findfirst('{', adtypestr) + open_brkt_ind = isnothing(open_squigllybrkt_ind) ? open_nrmlbrkt_ind : + min(open_nrmlbrkt_ind, open_squigllybrkt_ind) + adpkg = adtypestr[strtind:(open_brkt_ind - 1)] + throw(ArgumentError("The passed automatic differentiation backend choice is not available. 
Please load the corresponding AD package $adpkg.")) +end diff --git a/lib/OptimizationBase/src/symify.jl b/lib/OptimizationBase/src/symify.jl new file mode 100644 index 000000000..43f353165 --- /dev/null +++ b/lib/OptimizationBase/src/symify.jl @@ -0,0 +1,3 @@ +function symify_cache(f::OptimizationFunction, prob, num_cons, manifold) + throw("Structural analysis requires SymbolicAnalysis.jl to be loaded, either add `using SymbolicAnalysis` to your script or set `structural_analysis = false`.") +end diff --git a/lib/OptimizationBase/test/Project.toml b/lib/OptimizationBase/test/Project.toml new file mode 100644 index 000000000..25549349b --- /dev/null +++ b/lib/OptimizationBase/test/Project.toml @@ -0,0 +1,45 @@ +[deps] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66" +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" +Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" +FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Lux = "b2108857-7c20-44ae-9111-449ecde12c47" +MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" +Manifolds = "1cead3c2-87b3-11e9-0ccd-23c62b72b94e" +ModelingToolkit = "961ee093-0014-501f-94e3-6117800e7a78" +Optim = "429524aa-4258-5aef-a3af-852621145aeb" +Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" +Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" +OptimizationManopt = "e57b7fff-7ee7-4550-b4f0-90e9476e9fb6" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" +SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" +SymbolicAnalysis = "4297ee4d-0239-47d8-ba5d-195ecdf594fe" +Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" + +[compat] +Aqua = "0.8" +ComponentArrays = ">= 0.13.9" +Enzyme = "0.13" +IterTools = ">= 1.3.0" +Lux = "1.12" +Manifolds = "0.9" +Optim = ">= 1.4.1" +Optimisers = ">= 0.2.5" +Optimization = "4" +OptimizationManopt = "0.0.4" +SparseConnectivityTracer = "0.6" +SymbolicAnalysis = "0.3.0" +SafeTestsets = ">= 0.0.1" diff --git a/lib/OptimizationBase/test/adtests.jl b/lib/OptimizationBase/test/adtests.jl new file mode 100644 index 000000000..6fe4eea05 --- /dev/null +++ b/lib/OptimizationBase/test/adtests.jl @@ -0,0 +1,1189 @@ +using OptimizationBase, Test, DifferentiationInterface, SparseArrays, Symbolics +using ForwardDiff, Zygote, ReverseDiff, FiniteDiff, Tracker +using ModelingToolkit, Enzyme, Random + +x0 = zeros(2) +rosenbrock(x, p = nothing) = (1 - x[1])^2 + 100 * (x[2] - x[1]^2)^2 +l1 = rosenbrock(x0) + +function g!(G, x) + G[1] = -2.0 * (1.0 - x[1]) - 400.0 * (x[2] - x[1]^2) * x[1] + G[2] = 200.0 * (x[2] - x[1]^2) +end + +function h!(H, x) + H[1, 1] = 2.0 - 400.0 * x[2] + 1200.0 * x[1]^2 + H[1, 2] = -400.0 * x[1] + H[2, 1] = -400.0 * x[1] + H[2, 2] = 200.0 +end + +G1 = Array{Float64}(undef, 2) +G2 = Array{Float64}(undef, 2) +H1 = Array{Float64}(undef, 2, 2) +H2 = Array{Float64}(undef, 2, 2) + +g!(G1, x0) +h!(H1, x0) + +cons = (res, x, p) -> 
(res[1] = x[1]^2 + x[2]^2; return nothing) +optf = OptimizationFunction(rosenbrock, OptimizationBase.AutoModelingToolkit(), cons = cons) +optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoModelingToolkit(), + nothing, 1, g = true, h = true, cons_j = true, cons_h = true) +optprob.grad(G2, x0) +@test G1 == G2 +optprob.hess(H2, x0) +@test H1 == H2 +res = Array{Float64}(undef, 1) +optprob.cons(res, x0) +@test res == [0.0] +J = Array{Float64}(undef, 2) +optprob.cons_j(J, [5.0, 3.0]) +@test J == [10.0, 6.0] +H3 = [Array{Float64}(undef, 2, 2)] +optprob.cons_h(H3, x0) +@test H3 == [[2.0 0.0; 0.0 2.0]] + +function con2_c(res, x, p) + res[1] = x[1]^2 + x[2]^2 + res[2] = x[2] * sin(x[1]) - x[1] + return nothing +end +optf = OptimizationFunction(rosenbrock, + OptimizationBase.AutoModelingToolkit(), + cons = con2_c) +optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoModelingToolkit(), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true) +optprob.grad(G2, x0) +@test G1 == G2 +optprob.hess(H2, x0) +@test H1 == H2 +res = Array{Float64}(undef, 2) +optprob.cons(res, x0) +@test res == [0.0, 0.0] +J = Array{Float64}(undef, 2, 2) +optprob.cons_j(J, [5.0, 3.0]) +@test all(isapprox(J, [10.0 6.0; -0.149013 -0.958924]; rtol = 1e-3)) +H3 = [Array{Float64}(undef, 2, 2), Array{Float64}(undef, 2, 2)] +optprob.cons_h(H3, x0) +@test H3 == [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + +@testset "one constraint tests" begin + G2 = Array{Float64}(undef, 2) + H2 = Array{Float64}(undef, 2, 2) + optf = OptimizationFunction(rosenbrock, OptimizationBase.AutoEnzyme(), cons = cons) + optprob = OptimizationBase.instantiate_function( + optf, x0, OptimizationBase.AutoEnzyme(), + nothing, 1, g = true, h = true, hv = true, + cons_j = true, cons_h = true, cons_vjp = true, + cons_jvp = true, lag_h = true) + optprob.grad(G2, x0) + @test G1 == G2 + optprob.hess(H2, x0) + @test H1 == H2 + Hv = Array{Float64}(undef, 2) + optprob.hv(Hv, x0, [1.0, 1.0]) + @test Hv == [2.0, 200.0] + res = Array{Float64}(undef, 1) + optprob.cons(res, x0) + @test res == [0.0] + J = Array{Float64}(undef, 2) + optprob.cons_j(J, [5.0, 3.0]) + @test J == [10.0, 6.0] + vJ = Array{Float64}(undef, 2) + optprob.cons_vjp(vJ, [5.0, 3.0], [1.0]) + @test vJ == [10.0, 6.0] + Jv = Array{Float64}(undef, 1) + optprob.cons_jvp(Jv, [5.0, 3.0], [0.5, 0.5]) + @test Jv == [8.0] + H3 = [Array{Float64}(undef, 2, 2)] + optprob.cons_h(H3, x0) + @test H3 == [[2.0 0.0; 0.0 2.0]] + H4 = Array{Float64}(undef, 2, 2) + μ = randn(1) + σ = rand() + optprob.lag_h(H4, x0, σ, μ) + @test H4≈σ * H2 + μ[1] * H3[1] rtol=1e-6 + + G2 = Array{Float64}(undef, 2) + H2 = Array{Float64}(undef, 2, 2) + + optf = OptimizationFunction(rosenbrock, OptimizationBase.AutoForwardDiff(), cons = cons) + optprob = OptimizationBase.instantiate_function( + optf, x0, OptimizationBase.AutoForwardDiff(), + nothing, 1, g = true, h = true, hv = true, + cons_j = true, cons_h = true, cons_vjp = true, + cons_jvp = true, lag_h = true) + optprob.grad(G2, x0) + @test G1 == G2 + optprob.hess(H2, x0) + @test H1 == H2 + Hv = Array{Float64}(undef, 2) + optprob.hv(Hv, x0, [1.0, 1.0]) + @test Hv == [2.0, 200.0] + res = Array{Float64}(undef, 1) + optprob.cons(res, x0) + @test res == [0.0] + J = Array{Float64}(undef, 2) + optprob.cons_j(J, [5.0, 3.0]) + @test J == [10.0, 6.0] + vJ = Array{Float64}(undef, 2) + optprob.cons_vjp(vJ, [5.0, 3.0], [1.0]) + @test vJ == [10.0, 6.0] + Jv = Array{Float64}(undef, 1) + optprob.cons_jvp(Jv, [5.0, 3.0], [0.5, 0.5]) + @test Jv == [8.0] 
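+    # The Lagrangian Hessian tests below rely on L(θ, σ, μ) = σ * f(θ) + μ' * c(θ), so for a
+    # single constraint `lag_h(H, x, σ, μ)` should reproduce σ * ∇²f(x) + μ[1] * ∇²c₁(x),
+    # which is exactly what `@test H4 ≈ σ * H2 + μ[1] * H3[1]` verifies.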
+ H3 = [Array{Float64}(undef, 2, 2)] + optprob.cons_h(H3, x0) + @test H3 == [[2.0 0.0; 0.0 2.0]] + H4 = Array{Float64}(undef, 2, 2) + μ = randn(1) + σ = rand() + optprob.lag_h(H4, x0, σ, μ) + @test H4≈σ * H2 + μ[1] * H3[1] rtol=1e-6 + + G2 = Array{Float64}(undef, 2) + H2 = Array{Float64}(undef, 2, 2) + + optf = OptimizationFunction(rosenbrock, OptimizationBase.AutoReverseDiff(), cons = cons) + optprob = OptimizationBase.instantiate_function( + optf, x0, OptimizationBase.AutoReverseDiff(), + nothing, 1, g = true, h = true, hv = true, + cons_j = true, cons_h = true, cons_vjp = true, + cons_jvp = true, lag_h = true) + optprob.grad(G2, x0) + @test G1 == G2 + optprob.hess(H2, x0) + @test H1 == H2 + Hv = Array{Float64}(undef, 2) + optprob.hv(Hv, x0, [1.0, 1.0]) + @test Hv == [2.0, 200.0] + res = Array{Float64}(undef, 1) + optprob.cons(res, x0) + @test res == [0.0] + J = Array{Float64}(undef, 2) + optprob.cons_j(J, [5.0, 3.0]) + @test J == [10.0, 6.0] + vJ = Array{Float64}(undef, 2) + optprob.cons_vjp(vJ, [5.0, 3.0], [1.0]) + @test vJ == [10.0, 6.0] + Jv = Array{Float64}(undef, 1) + optprob.cons_jvp(Jv, [5.0, 3.0], [0.5, 0.5]) + @test Jv == [8.0] + H3 = [Array{Float64}(undef, 2, 2)] + optprob.cons_h(H3, x0) + @test H3 == [[2.0 0.0; 0.0 2.0]] + H4 = Array{Float64}(undef, 2, 2) + μ = randn(1) + σ = rand() + optprob.lag_h(H4, x0, σ, μ) + @test H4≈σ * H2 + μ[1] * H3[1] rtol=1e-6 + + G2 = Array{Float64}(undef, 2) + H2 = Array{Float64}(undef, 2, 2) + + optf = OptimizationFunction( + rosenbrock, OptimizationBase.AutoReverseDiff(true), cons = cons) + optprob = OptimizationBase.instantiate_function( + optf, x0, OptimizationBase.AutoReverseDiff(true), + nothing, 1, g = true, h = true, hv = true, + cons_j = true, cons_h = true, cons_vjp = true, + cons_jvp = true, lag_h = true) + optprob.grad(G2, x0) + @test G1 == G2 + optprob.hess(H2, x0) + @test H1 == H2 + Hv = Array{Float64}(undef, 2) + optprob.hv(Hv, x0, [1.0, 1.0]) + @test Hv == [2.0, 200.0] + res = Array{Float64}(undef, 1) + optprob.cons(res, x0) + @test res == [0.0] + J = Array{Float64}(undef, 2) + optprob.cons_j(J, [5.0, 3.0]) + @test J == [10.0, 6.0] + vJ = Array{Float64}(undef, 2) + optprob.cons_vjp(vJ, [5.0, 3.0], [1.0]) + @test vJ == [10.0, 6.0] + Jv = Array{Float64}(undef, 1) + optprob.cons_jvp(Jv, [5.0, 3.0], [0.5, 0.5]) + @test Jv == [8.0] + H3 = [Array{Float64}(undef, 2, 2)] + optprob.cons_h(H3, x0) + @test H3 == [[2.0 0.0; 0.0 2.0]] + H4 = Array{Float64}(undef, 2, 2) + μ = randn(1) + σ = rand() + optprob.lag_h(H4, x0, σ, μ) + @test H4≈σ * H2 + μ[1] * H3[1] rtol=1e-6 + + G2 = Array{Float64}(undef, 2) + H2 = Array{Float64}(undef, 2, 2) + + optf = OptimizationFunction( + rosenbrock, AutoZygote(), cons = cons) + optprob = OptimizationBase.instantiate_function( + optf, x0, AutoZygote(), + nothing, 1, g = true, h = true, hv = true, + cons_j = true, cons_h = true, cons_vjp = true, + cons_jvp = true, lag_h = true) + optprob.grad(G2, x0) + @test G1 == G2 + optprob.hess(H2, x0) + @test H1 == H2 + Hv = Array{Float64}(undef, 2) + optprob.hv(Hv, x0, [1.0, 1.0]) + @test Hv == [2.0, 200.0] + res = Array{Float64}(undef, 1) + optprob.cons(res, x0) + @test res == [0.0] + J = Array{Float64}(undef, 2) + optprob.cons_j(J, [5.0, 3.0]) + @test J == [10.0, 6.0] + vJ = Array{Float64}(undef, 2) + optprob.cons_vjp(vJ, [5.0, 3.0], [1.0]) + @test vJ == [10.0, 6.0] + Jv = Array{Float64}(undef, 1) + optprob.cons_jvp(Jv, [5.0, 3.0], [0.5, 0.5]) + @test Jv == [8.0] + H3 = [Array{Float64}(undef, 2, 2)] + optprob.cons_h(H3, x0) + @test H3 == [[2.0 0.0; 0.0 2.0]] + H4 = 
Array{Float64}(undef, 2, 2) + μ = randn(1) + σ = rand() + optprob.lag_h(H4, x0, σ, μ) + @test H4≈σ * H2 + μ[1] * H3[1] rtol=1e-6 + + G2 = Array{Float64}(undef, 2) + H2 = Array{Float64}(undef, 2, 2) + + optf = OptimizationFunction(rosenbrock, + DifferentiationInterface.SecondOrder( + ADTypes.AutoFiniteDiff(), ADTypes.AutoReverseDiff()), + cons = cons) + optprob = OptimizationBase.instantiate_function( + optf, x0, + DifferentiationInterface.SecondOrder( + ADTypes.AutoFiniteDiff(), ADTypes.AutoReverseDiff()), + nothing, 1, g = true, h = true, hv = true, + cons_j = true, cons_h = true, cons_vjp = true, + cons_jvp = true, lag_h = true) + optprob.grad(G2, x0) + @test G1≈G2 rtol=1e-5 + optprob.hess(H2, x0) + @test H1≈H2 rtol=1e-5 + Hv = Array{Float64}(undef, 2) + optprob.hv(Hv, x0, [1.0, 1.0]) + @test Hv≈[2.0, 200.0] rtol=1e-5 + res = Array{Float64}(undef, 1) + optprob.cons(res, x0) + @test res ≈ [0.0] + J = Array{Float64}(undef, 1, 2) + optprob.cons_j(J, [5.0, 3.0]) + @test J≈[10.0 6.0] rtol=1e-5 + vJ = Array{Float64}(undef, 2) + optprob.cons_vjp(vJ, [5.0, 3.0], [1.0]) + @test vJ≈[10.0, 6.0] rtol=1e-5 + Jv = Array{Float64}(undef, 1) + optprob.cons_jvp(Jv, [5.0, 3.0], [0.5, 0.5]) + @test Jv≈[8.0] rtol=1e-5 + H3 = [Array{Float64}(undef, 2, 2)] + optprob.cons_h(H3, x0) + @test H3≈[[2.0 0.0; 0.0 2.0]] rtol=1e-5 + Random.seed!(123) + H4 = Array{Float64}(undef, 2, 2) + μ = randn(1) + σ = rand() + optprob.lag_h(H4, x0, σ, μ) + @test H4≈σ * H2 + μ[1] * H3[1] rtol=1e-6 +end + +@testset "two constraints tests" begin + G2 = Array{Float64}(undef, 2) + H2 = Array{Float64}(undef, 2, 2) + optf = OptimizationFunction(rosenbrock, OptimizationBase.AutoEnzyme(), cons = con2_c) + optprob = OptimizationBase.instantiate_function( + optf, x0, OptimizationBase.AutoEnzyme(), + nothing, 2, g = true, h = true, hv = true, + cons_j = true, cons_h = true, cons_vjp = true, + cons_jvp = true, lag_h = true) + optprob.grad(G2, x0) + @test G1 == G2 + optprob.hess(H2, x0) + @test H1 == H2 + Hv = Array{Float64}(undef, 2) + optprob.hv(Hv, x0, [1.0, 1.0]) + @test Hv == [2.0, 200.0] + res = Array{Float64}(undef, 2) + optprob.cons(res, x0) + @test res == [0.0, 0.0] + J = Array{Float64}(undef, 2, 2) + optprob.cons_j(J, [5.0, 3.0]) + @test all(isapprox(J, [10.0 6.0; -0.149013 -0.958924]; rtol = 1e-3)) + vJ = Array{Float64}(undef, 2) + optprob.cons_vjp(vJ, [5.0, 3.0], [1.0, 1.0]) + @test vJ == sum(J, dims = 1)[:] + Jv = Array{Float64}(undef, 2) + optprob.cons_jvp(Jv, [5.0, 3.0], [0.5, 0.5]) + @test Jv ≈ 0.5 * sum(J, dims = 2)[:] + H3 = [Array{Float64}(undef, 2, 2), Array{Float64}(undef, 2, 2)] + optprob.cons_h(H3, x0) + @test H3 == [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + H4 = Array{Float64}(undef, 2, 2) + μ = randn(2) + σ = rand() + optprob.lag_h(H4, x0, σ, μ) + @test H4≈σ * H1 + sum(μ .* H3) rtol=1e-6 + + G2 = Array{Float64}(undef, 2) + H2 = Array{Float64}(undef, 2, 2) + + optf = OptimizationFunction( + rosenbrock, OptimizationBase.AutoReverseDiff(), cons = con2_c) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoReverseDiff(), + nothing, 2, g = true, h = true, hv = true, + cons_j = true, cons_h = true, cons_vjp = true, + cons_jvp = true, lag_h = true) + optprob.grad(G2, x0) + @test G1 == G2 + optprob.hess(H2, x0) + @test H1 == H2 + Hv = Array{Float64}(undef, 2) + optprob.hv(Hv, x0, [1.0, 1.0]) + @test Hv == [2.0, 200.0] + res = Array{Float64}(undef, 2) + optprob.cons(res, x0) + @test res == [0.0, 0.0] + J = Array{Float64}(undef, 2, 2) + optprob.cons_j(J, [5.0, 3.0]) + @test all(isapprox(J, [10.0 
6.0; -0.149013 -0.958924]; rtol = 1e-3)) + vJ = Array{Float64}(undef, 2) + optprob.cons_vjp(vJ, [5.0, 3.0], [1.0, 1.0]) + @test vJ == sum(J, dims = 1)[:] + Jv = Array{Float64}(undef, 2) + optprob.cons_jvp(Jv, [5.0, 3.0], [0.5, 0.5]) + @test Jv == 0.5 * sum(J, dims = 2)[:] + H3 = [Array{Float64}(undef, 2, 2), Array{Float64}(undef, 2, 2)] + optprob.cons_h(H3, x0) + @test H3 == [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + H4 = Array{Float64}(undef, 2, 2) + μ = randn(2) + σ = rand() + optprob.lag_h(H4, x0, σ, μ) + @test H4≈σ * H1 + sum(μ .* H3) rtol=1e-6 + + G2 = Array{Float64}(undef, 2) + H2 = Array{Float64}(undef, 2, 2) + + optf = OptimizationFunction( + rosenbrock, OptimizationBase.AutoReverseDiff(true), cons = con2_c) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoReverseDiff(true), + nothing, 2, g = true, h = true, hv = true, + cons_j = true, cons_h = true, cons_vjp = true, + cons_jvp = true, lag_h = true) + optprob.grad(G2, x0) + @test G1 == G2 + optprob.hess(H2, x0) + @test H1 == H2 + Hv = Array{Float64}(undef, 2) + optprob.hv(Hv, x0, [1.0, 1.0]) + @test Hv == [2.0, 200.0] + res = Array{Float64}(undef, 2) + optprob.cons(res, x0) + @test res == [0.0, 0.0] + J = Array{Float64}(undef, 2, 2) + optprob.cons_j(J, [5.0, 3.0]) + @test all(isapprox(J, [10.0 6.0; -0.149013 -0.958924]; rtol = 1e-3)) + vJ = Array{Float64}(undef, 2) + optprob.cons_vjp(vJ, [5.0, 3.0], [1.0, 1.0]) + @test vJ == sum(J, dims = 1)[:] + Jv = Array{Float64}(undef, 2) + optprob.cons_jvp(Jv, [5.0, 3.0], [0.5, 0.5]) + @test Jv == 0.5 * sum(J, dims = 2)[:] + H3 = [Array{Float64}(undef, 2, 2), Array{Float64}(undef, 2, 2)] + optprob.cons_h(H3, x0) + @test H3 == [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + H4 = Array{Float64}(undef, 2, 2) + μ = randn(2) + σ = rand() + optprob.lag_h(H4, x0, σ, μ) + @test H4≈σ * H1 + sum(μ .* H3) rtol=1e-6 + + G2 = Array{Float64}(undef, 2) + H2 = Array{Float64}(undef, 2, 2) + + optf = OptimizationFunction( + rosenbrock, OptimizationBase.AutoForwardDiff(), cons = con2_c) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoForwardDiff(), + nothing, 2, g = true, h = true, hv = true, + cons_j = true, cons_h = true, cons_vjp = true, + cons_jvp = true, lag_h = true) + optprob.grad(G2, x0) + @test G1 == G2 + optprob.hess(H2, x0) + @test H1 == H2 + Hv = Array{Float64}(undef, 2) + optprob.hv(Hv, x0, [1.0, 1.0]) + @test Hv == [2.0, 200.0] + res = Array{Float64}(undef, 2) + optprob.cons(res, x0) + @test res == [0.0, 0.0] + J = Array{Float64}(undef, 2, 2) + optprob.cons_j(J, [5.0, 3.0]) + @test all(isapprox(J, [10.0 6.0; -0.149013 -0.958924]; rtol = 1e-3)) + vJ = Array{Float64}(undef, 2) + optprob.cons_vjp(vJ, [5.0, 3.0], [1.0, 1.0]) + @test vJ == sum(J, dims = 1)[:] + Jv = Array{Float64}(undef, 2) + optprob.cons_jvp(Jv, [5.0, 3.0], [0.5, 0.5]) + @test Jv == 0.5 * sum(J, dims = 2)[:] + H3 = [Array{Float64}(undef, 2, 2), Array{Float64}(undef, 2, 2)] + optprob.cons_h(H3, x0) + @test H3 == [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + H4 = Array{Float64}(undef, 2, 2) + μ = randn(2) + σ = rand() + optprob.lag_h(H4, x0, σ, μ) + @test H4≈σ * H1 + sum(μ .* H3) rtol=1e-6 + + G2 = Array{Float64}(undef, 2) + H2 = Array{Float64}(undef, 2, 2) + + optf = OptimizationFunction( + rosenbrock, AutoZygote(), cons = con2_c) + optprob = OptimizationBase.instantiate_function( + optf, x0, AutoZygote(), + nothing, 2, g = true, h = true, hv = true, + cons_j = true, cons_h = true, cons_vjp = true, + cons_jvp = true, lag_h = true) + optprob.grad(G2, x0) + 
@test G1 == G2 + optprob.hess(H2, x0) + @test H1 == H2 + Hv = Array{Float64}(undef, 2) + optprob.hv(Hv, x0, [1.0, 1.0]) + @test Hv == [2.0, 200.0] + res = Array{Float64}(undef, 2) + optprob.cons(res, x0) + @test res == [0.0, 0.0] + J = Array{Float64}(undef, 2, 2) + optprob.cons_j(J, [5.0, 3.0]) + @test all(isapprox(J, [10.0 6.0; -0.149013 -0.958924]; rtol = 1e-3)) + vJ = Array{Float64}(undef, 2) + optprob.cons_vjp(vJ, [5.0, 3.0], [1.0, 1.0]) + @test vJ == sum(J, dims = 1)[:] + Jv = Array{Float64}(undef, 2) + optprob.cons_jvp(Jv, [5.0, 3.0], [0.5, 0.5]) + @test Jv == 0.5 * sum(J, dims = 2)[:] + H3 = [Array{Float64}(undef, 2, 2), Array{Float64}(undef, 2, 2)] + optprob.cons_h(H3, x0) + @test H3 == [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + H4 = Array{Float64}(undef, 2, 2) + μ = randn(2) + σ = rand() + optprob.lag_h(H4, x0, σ, μ) + @test H4≈σ * H1 + sum(μ .* H3) rtol=1e-6 + + G2 = Array{Float64}(undef, 2) + H2 = Array{Float64}(undef, 2, 2) + + optf = OptimizationFunction( + rosenbrock, DifferentiationInterface.SecondOrder( + ADTypes.AutoFiniteDiff(), ADTypes.AutoReverseDiff()), + cons = con2_c) + optprob = OptimizationBase.instantiate_function( + optf, x0, + DifferentiationInterface.SecondOrder( + ADTypes.AutoFiniteDiff(), ADTypes.AutoReverseDiff()), + nothing, 2, g = true, h = true, hv = true, + cons_j = true, cons_h = true, cons_vjp = true, + cons_jvp = true, lag_h = true) + optprob.grad(G2, x0) + @test G1≈G2 rtol=1e-5 + optprob.hess(H2, x0) + @test H1≈H2 rtol=1e-5 + Hv = Array{Float64}(undef, 2) + optprob.hv(Hv, x0, [1.0, 1.0]) + @test Hv≈[2.0, 200.0] rtol=1e-5 + res = Array{Float64}(undef, 2) + optprob.cons(res, x0) + @test res ≈ [0.0, 0.0] + J = Array{Float64}(undef, 2, 2) + optprob.cons_j(J, [5.0, 3.0]) + @test all(isapprox(J, [10.0 6.0; -0.149013 -0.958924]; rtol = 1e-3)) + vJ = Array{Float64}(undef, 2) + optprob.cons_vjp(vJ, [5.0, 3.0], [1.0, 1.0]) + @test vJ≈sum(J, dims = 1)[:] rtol=1e-5 + Jv = Array{Float64}(undef, 2) + optprob.cons_jvp(Jv, [5.0, 3.0], [0.5, 0.5]) + @test Jv≈0.5 * sum(J, dims = 2)[:] rtol=1e-5 + H3 = [Array{Float64}(undef, 2, 2), Array{Float64}(undef, 2, 2)] + optprob.cons_h(H3, x0) + @test H3≈[[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] rtol=1e-5 + H4 = Array{Float64}(undef, 2, 2) + μ = randn(2) + σ = rand() + optprob.lag_h(H4, x0, σ, μ) + @test H4≈σ * H1 + sum(μ .* H3) rtol=1e-6 +end + +@testset "Sparse Tests" begin + # Define a sparse objective function + function sparse_objective(x, p) + return x[1]^2 + 100 * (x[3] - x[2]^2)^2 + end + + # Define sparse constraints + function sparse_constraints(res, x, p) + res[1] = x[1] + x[2] + (x[2] * x[3])^2 - 1 + res[2] = x[1]^2 + x[3]^2 - 1 + end + + # Initial point + x0 = [0.5, 0.5, 0.5] + + # Create OptimizationFunction + optf = OptimizationFunction(sparse_objective, OptimizationBase.AutoSparseForwardDiff(), + cons = sparse_constraints) + + # Instantiate the optimization problem + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoSparseForwardDiff(), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true, lag_h = true) + # Test gradient + G = zeros(3) + optprob.grad(G, x0) + @test G ≈ [1.0, -50.0, 50.0] + + # Test Hessian + H_expected = sparse( + [1, 2, 2, 3, 3], [1, 2, 3, 2, 3], [2.0, 100.0, -200.0, -200.0, 200.0], 3, 3) + H = similar(optprob.hess_prototype, Float64) + optprob.hess(H, x0) + @test H ≈ H_expected + @test nnz(H) == 5 # Check sparsity + + # Test constraints + res = zeros(2) + optprob.cons(res, x0) + @test res ≈ [0.0625, -0.5] + + # Test constraint Jacobian + J_expected = 
sparse([1, 1, 1, 2, 2], [1, 2, 3, 1, 3], [1.0, 1.25, 0.25, 1.0, 1.0], 2, 3) + J = similar(optprob.cons_jac_prototype, Float64) + optprob.cons_j(J, x0) + @test J ≈ J_expected + @test nnz(J) == 5 # Check sparsity + + # Test constraint Hessians + H_cons_expected = [sparse([2, 2, 3, 3], [2, 3, 2, 3], [0.5, 1.0, 1.0, 0.5], 3, 3), + sparse([1, 3], [1, 3], [2.0, 2.0], 3, 3)] + H_cons = [similar(h, Float64) for h in optprob.cons_hess_prototype] + optprob.cons_h(H_cons, x0) + @test all(H_cons .≈ H_cons_expected) + @test all(nnz.(H_cons) .== [4, 2]) # Check sparsity + + lag_H_expected = sparse( + [1, 2, 3, 2, 3], [1, 2, 2, 3, 3], [6.0, 100.5, -199.0, -199.0, 204.5], 3, 3) + σ = 1.0 + λ = [1.0, 2.0] + lag_H = similar(optprob.lag_hess_prototype, Float64) + optprob.lag_h(lag_H, x0, σ, λ) + @test lag_H ≈ lag_H_expected + @test nnz(lag_H) == 5 + + optf = OptimizationFunction(sparse_objective, OptimizationBase.AutoSparseReverseDiff(), + cons = sparse_constraints) + + # Instantiate the optimization problem + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoSparseForwardDiff(), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true, lag_h = true) + # Test gradient + G = zeros(3) + optprob.grad(G, x0) + @test G ≈ [1.0, -50.0, 50.0] + + # Test Hessian + H_expected = sparse( + [1, 2, 2, 3, 3], [1, 2, 3, 2, 3], [2.0, 100.0, -200.0, -200.0, 200.0], 3, 3) + H = similar(optprob.hess_prototype, Float64) + optprob.hess(H, x0) + @test H ≈ H_expected + @test nnz(H) == 5 # Check sparsity + + # Test constraints + res = zeros(2) + optprob.cons(res, x0) + @test res ≈ [0.0625, -0.5] + + # Test constraint Jacobian + J_expected = sparse([1, 1, 1, 2, 2], [1, 2, 3, 1, 3], [1.0, 1.25, 0.25, 1.0, 1.0], 2, 3) + J = similar(optprob.cons_jac_prototype, Float64) + optprob.cons_j(J, x0) + @test J ≈ J_expected + @test nnz(J) == 5 # Check sparsity + + # Test constraint Hessians + H_cons_expected = [sparse([2, 2, 3, 3], [2, 3, 2, 3], [0.5, 1.0, 1.0, 0.5], 3, 3), + sparse([1, 3], [1, 3], [2.0, 2.0], 3, 3)] + H_cons = [similar(h, Float64) for h in optprob.cons_hess_prototype] + optprob.cons_h(H_cons, x0) + @test all(H_cons .≈ H_cons_expected) + @test all(nnz.(H_cons) .== [4, 2]) # Check sparsity + + lag_H_expected = sparse( + [1, 2, 3, 2, 3], [1, 2, 2, 3, 3], [6.0, 100.5, -199.0, -199.0, 204.5], 3, 3) + σ = 1.0 + λ = [1.0, 2.0] + lag_H = similar(optprob.lag_hess_prototype, Float64) + optprob.lag_h(lag_H, x0, σ, λ) + @test lag_H ≈ lag_H_expected + @test nnz(lag_H) == 5 + + optf = OptimizationFunction( + sparse_objective, OptimizationBase.AutoSparseReverseDiff(true), + cons = sparse_constraints) + + # Instantiate the optimization problem + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoSparseForwardDiff(), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true, lag_h = true) + # Test gradient + G = zeros(3) + optprob.grad(G, x0) + @test G ≈ [1.0, -50.0, 50.0] + + # Test Hessian + H_expected = sparse( + [1, 2, 2, 3, 3], [1, 2, 3, 2, 3], [2.0, 100.0, -200.0, -200.0, 200.0], 3, 3) + H = similar(optprob.hess_prototype, Float64) + optprob.hess(H, x0) + @test H ≈ H_expected + @test nnz(H) == 5 # Check sparsity + + # Test constraints + res = zeros(2) + optprob.cons(res, x0) + @test res ≈ [0.0625, -0.5] + + # Test constraint Jacobian + J_expected = sparse([1, 1, 1, 2, 2], [1, 2, 3, 1, 3], [1.0, 1.25, 0.25, 1.0, 1.0], 2, 3) + J = similar(optprob.cons_jac_prototype, Float64) + optprob.cons_j(J, x0) + @test J ≈ J_expected + @test nnz(J) == 5 # Check sparsity + + # Test 
constraint Hessians + H_cons_expected = [sparse([2, 2, 3, 3], [2, 3, 2, 3], [0.5, 1.0, 1.0, 0.5], 3, 3), + sparse([1, 3], [1, 3], [2.0, 2.0], 3, 3)] + H_cons = [similar(h, Float64) for h in optprob.cons_hess_prototype] + optprob.cons_h(H_cons, x0) + @test all(H_cons .≈ H_cons_expected) + @test all(nnz.(H_cons) .== [4, 2]) # Check sparsity + + lag_H_expected = sparse( + [1, 2, 3, 2, 3], [1, 2, 2, 3, 3], [6.0, 100.5, -199.0, -199.0, 204.5], 3, 3) + σ = 1.0 + λ = [1.0, 2.0] + lag_H = similar(optprob.lag_hess_prototype, Float64) + optprob.lag_h(lag_H, x0, σ, λ) + @test lag_H ≈ lag_H_expected + @test nnz(lag_H) == 5 + + optf = OptimizationFunction(sparse_objective, OptimizationBase.AutoSparseFiniteDiff(), + cons = sparse_constraints) + + # Instantiate the optimization problem + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoSparseForwardDiff(), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true, lag_h = true) + # Test gradient + G = zeros(3) + optprob.grad(G, x0) + @test G ≈ [1.0, -50.0, 50.0] + + # Test Hessian + H_expected = sparse( + [1, 2, 2, 3, 3], [1, 2, 3, 2, 3], [2.0, 100.0, -200.0, -200.0, 200.0], 3, 3) + H = similar(optprob.hess_prototype, Float64) + optprob.hess(H, x0) + @test H ≈ H_expected + @test nnz(H) == 5 # Check sparsity + + # Test constraints + res = zeros(2) + optprob.cons(res, x0) + @test res ≈ [0.0625, -0.5] + + # Test constraint Jacobian + J_expected = sparse([1, 1, 1, 2, 2], [1, 2, 3, 1, 3], [1.0, 1.25, 0.25, 1.0, 1.0], 2, 3) + J = similar(optprob.cons_jac_prototype, Float64) + optprob.cons_j(J, x0) + @test J ≈ J_expected + @test nnz(J) == 5 # Check sparsity + + # Test constraint Hessians + H_cons_expected = [sparse([2, 2, 3, 3], [2, 3, 2, 3], [0.5, 1.0, 1.0, 0.5], 3, 3), + sparse([1, 3], [1, 3], [2.0, 2.0], 3, 3)] + H_cons = [similar(h, Float64) for h in optprob.cons_hess_prototype] + optprob.cons_h(H_cons, x0) + @test all(H_cons .≈ H_cons_expected) + @test all(nnz.(H_cons) .== [4, 2]) # Check sparsity + + lag_H_expected = sparse( + [1, 2, 3, 2, 3], [1, 2, 2, 3, 3], [6.0, 100.5, -199.0, -199.0, 204.5], 3, 3) + σ = 1.0 + λ = [1.0, 2.0] + lag_H = similar(optprob.lag_hess_prototype, Float64) + optprob.lag_h(lag_H, x0, σ, λ) + @test lag_H ≈ lag_H_expected + @test nnz(lag_H) == 5 + + optf = OptimizationFunction(sparse_objective, + AutoSparse(DifferentiationInterface.SecondOrder( + ADTypes.AutoForwardDiff(), ADTypes.AutoZygote())), + cons = sparse_constraints) + + # Instantiate the optimization problem + optprob = OptimizationBase.instantiate_function(optf, x0, + AutoSparse(DifferentiationInterface.SecondOrder( + ADTypes.AutoForwardDiff(), ADTypes.AutoZygote())), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true, lag_h = true) + # Test gradient + G = zeros(3) + optprob.grad(G, x0) + @test G ≈ [1.0, -50.0, 50.0] + + # Test Hessian + H_expected = sparse( + [1, 2, 2, 3, 3], [1, 2, 3, 2, 3], [2.0, 100.0, -200.0, -200.0, 200.0], 3, 3) + H = similar(optprob.hess_prototype, Float64) + optprob.hess(H, x0) + @test H ≈ H_expected + @test nnz(H) == 5 # Check sparsity + + # Test constraints + res = zeros(2) + optprob.cons(res, x0) + @test res ≈ [0.0625, -0.5] + + # Test constraint Jacobian + J_expected = sparse([1, 1, 1, 2, 2], [1, 2, 3, 1, 3], [1.0, 1.25, 0.25, 1.0, 1.0], 2, 3) + J = similar(optprob.cons_jac_prototype, Float64) + optprob.cons_j(J, x0) + @test J ≈ J_expected + @test nnz(J) == 5 # Check sparsity + + # Test constraint Hessians + H_cons_expected = [sparse([2, 2, 3, 3], [2, 3, 2, 3], [0.5, 1.0, 1.0, 0.5], 3, 3), + 
sparse([1, 3], [1, 3], [2.0, 2.0], 3, 3)] + H_cons = [similar(h, Float64) for h in optprob.cons_hess_prototype] + optprob.cons_h(H_cons, x0) + @test all(H_cons .≈ H_cons_expected) + @test all(nnz.(H_cons) .== [4, 2]) # Check sparsity + + lag_H_expected = sparse( + [1, 2, 3, 2, 3], [1, 2, 2, 3, 3], [6.0, 100.5, -199.0, -199.0, 204.5], 3, 3) + σ = 1.0 + λ = [1.0, 2.0] + lag_H = similar(optprob.lag_hess_prototype, Float64) + optprob.lag_h(lag_H, x0, σ, λ) + @test lag_H ≈ lag_H_expected + @test nnz(lag_H) == 5 +end + +@testset "OOP" begin + cons = (x, p) -> [x[1]^2 + x[2]^2] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoEnzyme(), + cons = cons) + optprob = OptimizationBase.instantiate_function( + optf, x0, OptimizationBase.AutoEnzyme(), + nothing, 1, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test optprob.hess(x0) == H1 + + @test optprob.cons(x0) == [0.0] + + @test optprob.cons_j([5.0, 3.0]) == [10.0, 6.0] + + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2, x[2] * sin(x[1]) - x[1]] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoEnzyme(), + cons = cons) + optprob = OptimizationBase.instantiate_function( + optf, x0, OptimizationBase.AutoEnzyme(), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test optprob.hess(x0) == H1 + @test optprob.cons(x0) == [0.0, 0.0] + @test optprob.cons_j([5.0, 3.0])≈[10.0 6.0; -0.149013 -0.958924] rtol=1e-6 + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoFiniteDiff(), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoFiniteDiff(), + nothing, 1, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0)≈G1 rtol=1e-6 + @test optprob.hess(x0)≈H1 rtol=1e-6 + + @test optprob.cons(x0) == [0.0] + + @test optprob.cons_j([5.0, 3.0])≈[10.0, 6.0] rtol=1e-6 + + @test optprob.cons_h(x0) ≈ [[2.0 0.0; 0.0 2.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2, x[2] * sin(x[1]) - x[1]] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoFiniteDiff(), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoFiniteDiff(), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0)≈G1 rtol=1e-6 + @test optprob.hess(x0)≈H1 rtol=1e-6 + @test optprob.cons(x0) == [0.0, 0.0] + @test optprob.cons_j([5.0, 3.0])≈[10.0 6.0; -0.149013 -0.958924] rtol=1e-6 + @test optprob.cons_h(x0) ≈ [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoForwardDiff(), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoForwardDiff(), + nothing, 1, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test optprob.hess(x0) == H1 + + @test optprob.cons(x0) == [0.0] + + @test optprob.cons_j([5.0, 3.0]) == [10.0, 6.0] + + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2, x[2] * sin(x[1]) - x[1]] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoForwardDiff(), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoForwardDiff(), + nothing, 2, g = true, h = true, cons_j = true, cons_h 
= true) + + @test optprob.grad(x0) == G1 + @test optprob.hess(x0) == H1 + @test optprob.cons(x0) == [0.0, 0.0] + @test optprob.cons_j([5.0, 3.0])≈[10.0 6.0; -0.149013 -0.958924] rtol=1e-6 + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoReverseDiff(), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoReverseDiff(), + nothing, 1, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test optprob.hess(x0) == H1 + + @test optprob.cons(x0) == [0.0] + + @test optprob.cons_j([5.0, 3.0]) == [10.0, 6.0] + + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2, x[2] * sin(x[1]) - x[1]] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoReverseDiff(), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoReverseDiff(), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test optprob.hess(x0) == H1 + @test optprob.cons(x0) == [0.0, 0.0] + @test optprob.cons_j([5.0, 3.0])≈[10.0 6.0; -0.149013 -0.958924] rtol=1e-6 + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoReverseDiff(true), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoReverseDiff(true), + nothing, 1, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test optprob.hess(x0) == H1 + + @test optprob.cons(x0) == [0.0] + + @test optprob.cons_j([5.0, 3.0]) == [10.0, 6.0] + + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2, x[2] * sin(x[1]) - x[1]] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoReverseDiff(true), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoReverseDiff(true), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test optprob.hess(x0) == H1 + @test optprob.cons(x0) == [0.0, 0.0] + @test optprob.cons_j([5.0, 3.0])≈[10.0 6.0; -0.149013 -0.958924] rtol=1e-6 + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoSparseForwardDiff(), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoSparseForwardDiff(), + nothing, 1, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test Array(optprob.hess(x0)) ≈ H1 + + @test optprob.cons(x0) == [0.0] + + @test optprob.cons_j([5.0, 3.0]) == [10.0, 6.0] + + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2, x[2] * sin(x[1]) - x[1]] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoSparseForwardDiff(), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoSparseForwardDiff(), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test Array(optprob.hess(x0)) ≈ H1 + @test optprob.cons(x0) == [0.0, 0.0] + @test Array(optprob.cons_j([5.0, 3.0]))≈[10.0 6.0; -0.149013 -0.958924] rtol=1e-6 + @test Array.(optprob.cons_h(x0)) ≈ [[2.0 
0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoSparseFiniteDiff(), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoSparseFiniteDiff(), + nothing, 1, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0)≈G1 rtol=1e-4 + @test Array(optprob.hess(x0)) ≈ H1 + + @test optprob.cons(x0) == [0.0] + + @test optprob.cons_j([5.0, 3.0]) ≈ [10.0, 6.0] + + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2, x[2] * sin(x[1]) - x[1]] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoSparseFiniteDiff(), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoSparseForwardDiff(), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test Array(optprob.hess(x0)) ≈ H1 + @test optprob.cons(x0) == [0.0, 0.0] + @test Array(optprob.cons_j([5.0, 3.0]))≈[10.0 6.0; -0.149013 -0.958924] rtol=1e-6 + @test Array.(optprob.cons_h(x0)) ≈ [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoSparseReverseDiff(), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoSparseReverseDiff(), + nothing, 1, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test optprob.hess(x0) == H1 + + @test optprob.cons(x0) == [0.0] + + @test optprob.cons_j([5.0, 3.0]) == [10.0, 6.0] + + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2, x[2] * sin(x[1]) - x[1]] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoSparseReverseDiff(), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoSparseReverseDiff(), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test Array(optprob.hess(x0)) ≈ H1 + @test optprob.cons(x0) == [0.0, 0.0] + @test Array(optprob.cons_j([5.0, 3.0]))≈[10.0 6.0; -0.149013 -0.958924] rtol=1e-6 + @test Array.(optprob.cons_h(x0)) ≈ [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoSparseReverseDiff(true), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoSparseReverseDiff(true), + nothing, 1, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test optprob.hess(x0) == H1 + @test optprob.cons(x0) == [0.0] + + @test optprob.cons_j([5.0, 3.0]) == [10.0, 6.0] + + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2, x[2] * sin(x[1]) - x[1]] + optf = OptimizationFunction{false}(rosenbrock, + OptimizationBase.AutoSparseReverseDiff(true), + cons = cons) + optprob = OptimizationBase.instantiate_function(optf, x0, + OptimizationBase.AutoSparseReverseDiff(true), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test Array(optprob.hess(x0)) ≈ H1 + @test optprob.cons(x0) == [0.0, 0.0] + @test Array(optprob.cons_j([5.0, 3.0]))≈[10.0 6.0; -0.149013 -0.958924] rtol=1e-6 + @test Array.(optprob.cons_h(x0)) ≈ [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2] + optf = OptimizationFunction{false}(rosenbrock, + 
AutoZygote(), + cons = cons) + optprob = OptimizationBase.instantiate_function( + optf, x0, AutoZygote(), + nothing, 1, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test optprob.hess(x0) == H1 + @test optprob.cons(x0) == [0.0] + + @test optprob.cons_j([5.0, 3.0]) == [10.0, 6.0] + + @test optprob.cons_h(x0) == [[2.0 0.0; 0.0 2.0]] + + cons = (x, p) -> [x[1]^2 + x[2]^2, x[2] * sin(x[1]) - x[1]] + optf = OptimizationFunction{false}(rosenbrock, + AutoZygote(), + cons = cons) + optprob = OptimizationBase.instantiate_function( + optf, x0, AutoZygote(), + nothing, 2, g = true, h = true, cons_j = true, cons_h = true) + + @test optprob.grad(x0) == G1 + @test Array(optprob.hess(x0)) ≈ H1 + @test optprob.cons(x0) == [0.0, 0.0] + @test optprob.cons_j([5.0, 3.0])≈[10.0 6.0; -0.149013 -0.958924] rtol=1e-6 + @test Array.(optprob.cons_h(x0)) ≈ [[2.0 0.0; 0.0 2.0], [-0.0 1.0; 1.0 0.0]] +end + +using MLUtils + +@testset "Stochastic gradient" begin + x0 = rand(10000) + y0 = sin.(x0) + data = MLUtils.DataLoader((x0, y0), batchsize = 100) + + function loss(coeffs, data) + ypred = [evalpoly(data[1][i], coeffs) for i in eachindex(data[1])] + return sum(abs2, ypred .- data[2]) + end + + optf = OptimizationFunction(loss, AutoForwardDiff()) + optf = OptimizationBase.instantiate_function( + optf, rand(3), AutoForwardDiff(), iterate(data)[1], g = true, fg = true) + G0 = zeros(3) + optf.grad(G0, ones(3), (x0, y0)) + stochgrads = [] + i = 0 + for (x, y) in data + G = zeros(3) + optf.grad(G, ones(3), (x, y)) + push!(stochgrads, copy(G)) + G1 = zeros(3) + optf.fg(G1, ones(3), (x, y)) + @test G≈G1 rtol=1e-6 + end + @test G0≈sum(stochgrads) rtol=1e-1 + + optf = OptimizationFunction(loss, AutoReverseDiff()) + optf = OptimizationBase.instantiate_function( + optf, rand(3), AutoReverseDiff(), iterate(data)[1], g = true, fg = true) + G0 = zeros(3) + optf.grad(G0, ones(3), (x0, y0)) + stochgrads = [] + for (x, y) in data + G = zeros(3) + optf.grad(G, ones(3), (x, y)) + push!(stochgrads, copy(G)) + G1 = zeros(3) + optf.fg(G1, ones(3), (x, y)) + @test G≈G1 rtol=1e-6 + end + @test G0≈sum(stochgrads) rtol=1e-1 + + optf = OptimizationFunction(loss, AutoZygote()) + optf = OptimizationBase.instantiate_function( + optf, rand(3), AutoZygote(), iterate(data)[1], g = true, fg = true) + G0 = zeros(3) + optf.grad(G0, ones(3), (x0, y0)) + stochgrads = [] + for (x, y) in data + G = zeros(3) + optf.grad(G, ones(3), (x, y)) + push!(stochgrads, copy(G)) + G1 = zeros(3) + optf.fg(G1, ones(3), (x, y)) + @test G≈G1 rtol=1e-6 + end + @test G0≈sum(stochgrads) rtol=1e-1 + + optf = OptimizationFunction(loss, AutoEnzyme()) + optf = OptimizationBase.instantiate_function( + optf, rand(3), AutoEnzyme(mode = set_runtime_activity(Reverse)), + iterate(data)[1], g = true, fg = true) + G0 = zeros(3) + optf.grad(G0, ones(3), (x0, y0)) + stochgrads = [] + for (x, y) in data + G = zeros(3) + optf.grad(G, ones(3), (x, y)) + push!(stochgrads, copy(G)) + G1 = zeros(3) + optf.fg(G1, ones(3), (x, y)) + @test G≈G1 rtol=1e-6 + end + @test G0≈sum(stochgrads) rtol=1e-1 +end diff --git a/lib/OptimizationBase/test/cvxtest.jl b/lib/OptimizationBase/test/cvxtest.jl new file mode 100644 index 000000000..cef7859bb --- /dev/null +++ b/lib/OptimizationBase/test/cvxtest.jl @@ -0,0 +1,52 @@ +using Optimization, OptimizationBase, ForwardDiff, SymbolicAnalysis, LinearAlgebra, + Manifolds, OptimizationManopt + +function f(x, p = nothing) + return exp(x[1]) + x[1]^2 +end + +optf = OptimizationFunction(f, Optimization.AutoForwardDiff()) +prob = 
OptimizationProblem(optf, [0.4], structural_analysis = true) + +@time sol = solve(prob, Optimization.LBFGS(), maxiters = 1000) +@test sol.cache.analysis_results.objective.curvature == SymbolicAnalysis.Convex +@test sol.cache.analysis_results.constraints === nothing + +x0 = zeros(2) +rosenbrock(x, p = nothing) = (1 - x[1])^2 + 100 * (x[2] - x[1]^2)^2 +l1 = rosenbrock(x0) + +optf = OptimizationFunction(rosenbrock, AutoEnzyme()) +prob = OptimizationProblem(optf, x0, structural_analysis = true) +@time res = solve(prob, Optimization.LBFGS(), maxiters = 100) +@test res.cache.analysis_results.objective.curvature == SymbolicAnalysis.UnknownCurvature + +function con2_c(res, x, p) + res .= [x[1]^2 + x[2]^2, (x[2] * sin(x[1]) + x[1]) - 5] +end + +optf = OptimizationFunction(rosenbrock, AutoZygote(), cons = con2_c) +prob = OptimizationProblem(optf, x0, lcons = [1.0, -Inf], ucons = [1.0, 0.0], + lb = [-1.0, -1.0], ub = [1.0, 1.0], structural_analysis = true) +@time res = solve(prob, Optimization.LBFGS(), maxiters = 100) +@test res.cache.analysis_results.objective.curvature == SymbolicAnalysis.UnknownCurvature +@test res.cache.analysis_results.constraints[1].curvature == SymbolicAnalysis.Convex +@test res.cache.analysis_results.constraints[2].curvature == + SymbolicAnalysis.UnknownCurvature + +m = 100 +σ = 0.005 +q = Matrix{Float64}(LinearAlgebra.I(5)) .+ 2.0 + +M = SymmetricPositiveDefinite(5) +data2 = [exp(M, q, σ * rand(M; vector_at = q)) for i in 1:m]; + +f(x, p = nothing) = sum(SymbolicAnalysis.distance(M, data2[i], x)^2 for i in 1:5) +optf = OptimizationFunction(f, Optimization.AutoForwardDiff()) +prob = OptimizationProblem(optf, data2[1]; manifold = M, structural_analysis = true) + +opt = OptimizationManopt.GradientDescentOptimizer() +@time sol = solve(prob, opt, maxiters = 100) +@test sol.minimum < 1e-1 +@test sol.cache.analysis_results.objective.curvature == SymbolicAnalysis.UnknownCurvature +@test sol.cache.analysis_results.objective.gcurvature == SymbolicAnalysis.GConvex diff --git a/lib/OptimizationBase/test/matrixvalued.jl b/lib/OptimizationBase/test/matrixvalued.jl new file mode 100644 index 000000000..7a60823ac --- /dev/null +++ b/lib/OptimizationBase/test/matrixvalued.jl @@ -0,0 +1,81 @@ +using OptimizationBase, LinearAlgebra, ForwardDiff, Zygote, FiniteDiff, + DifferentiationInterface, SparseConnectivityTracer +using Test, ReverseDiff + +@testset "Matrix Valued" begin + for adtype in [AutoForwardDiff(), SecondOrder(AutoForwardDiff(), AutoZygote()), + SecondOrder(AutoForwardDiff(), AutoFiniteDiff()), + AutoSparse(AutoForwardDiff(), sparsity_detector = TracerLocalSparsityDetector()), + AutoSparse(SecondOrder(AutoForwardDiff(), AutoZygote()), + sparsity_detector = TracerLocalSparsityDetector()), + AutoSparse(SecondOrder(AutoForwardDiff(), AutoFiniteDiff()), + sparsity_detector = TracerLocalSparsityDetector())] + # 1. 
Matrix Factorization + @show adtype + function matrix_factorization_objective(X, A) + U, V = @view(X[1:size(A, 1), 1:Int(size(A, 2) / 2)]), + @view(X[1:size(A, 1), (Int(size(A, 2) / 2) + 1):size(A, 2)]) + return norm(A - U * V') + end + function non_negative_constraint(X, A) + U, V = X + return [all(U .>= 0) && all(V .>= 0)] + end + A_mf = rand(4, 4) # Original matrix + U_mf = rand(4, 2) # Factor matrix U + V_mf = rand(4, 2) # Factor matrix V + + optf = OptimizationFunction{false}( + matrix_factorization_objective, adtype, cons = non_negative_constraint) + optf = OptimizationBase.instantiate_function( + optf, hcat(U_mf, V_mf), adtype, A_mf, g = true, h = true, + cons_j = true, cons_h = true) + optf.grad(hcat(U_mf, V_mf)) + optf.hess(hcat(U_mf, V_mf)) + if !(adtype isa ADTypes.AutoSparse) + optf.cons_j(hcat(U_mf, V_mf)) + optf.cons_h(hcat(U_mf, V_mf)) + end + + # 2. Principal Component Analysis (PCA) + function pca_objective(X, A) + return -tr(X' * A * X) # Minimize the negative of the trace for maximization + end + function orthogonality_constraint(X, A) + return [norm(X' * X - I) < 1e-6] + end + A_pca = rand(4, 4) # Covariance matrix (can be symmetric positive definite) + X_pca = rand(4, 2) # Matrix to hold principal components + + optf = OptimizationFunction{false}( + pca_objective, adtype, cons = orthogonality_constraint) + optf = OptimizationBase.instantiate_function( + optf, X_pca, adtype, A_pca, g = true, h = true, + cons_j = true, cons_h = true) + optf.grad(X_pca) + optf.hess(X_pca) + if !(adtype isa ADTypes.AutoSparse) + optf.cons_j(X_pca) + optf.cons_h(X_pca) + end + + # 3. Matrix Completion + function matrix_completion_objective(X, P) + A, Omega = P + return norm(Omega .* (A - X)) + end + # r = 2 # Rank of the matrix to be completed + # function rank_constraint(X, P) + # return [rank(X) <= r] + # end + A_mc = rand(4, 4) # Original matrix with missing entries + Omega_mc = rand(4, 4) .> 0.5 # Mask for observed entries (boolean matrix) + X_mc = rand(4, 4) # Matrix to be completed + optf = OptimizationFunction{false}( + matrix_completion_objective, adtype) + optf = OptimizationBase.instantiate_function( + optf, X_mc, adtype, (A_mc, Omega_mc), g = true, h = true) + optf.grad(X_mc) + optf.hess(X_mc) + end +end diff --git a/lib/OptimizationBase/test/runtests.jl b/lib/OptimizationBase/test/runtests.jl new file mode 100644 index 000000000..fad4b35ef --- /dev/null +++ b/lib/OptimizationBase/test/runtests.jl @@ -0,0 +1,8 @@ +using OptimizationBase +using Test + +@testset "OptimizationBase.jl" begin + include("adtests.jl") + include("cvxtest.jl") + include("matrixvalued.jl") +end diff --git a/src/Optimization.jl b/src/Optimization.jl index 8d0257dd1..5e716c8cf 100644 --- a/src/Optimization.jl +++ b/src/Optimization.jl @@ -5,7 +5,12 @@ module Optimization using DocStringExtensions using Reexport -@reexport using SciMLBase, ADTypes, OptimizationBase +@reexport using SciMLBase, ADTypes + +# Include OptimizationBase as a submodule +include("../lib/OptimizationBase/src/OptimizationBase.jl") +using .OptimizationBase +export OptimizationBase if !isdefined(Base, :get_extension) using Requires @@ -14,7 +19,7 @@ end using Logging, ProgressLogging, ConsoleProgressMonitor, TerminalLoggers, LoggingExtras using ArrayInterface, Base.Iterators, SparseArrays, LinearAlgebra -import OptimizationBase: instantiate_function, OptimizationCache, ReInitCache +import .OptimizationBase: instantiate_function, OptimizationCache, ReInitCache import SciMLBase: OptimizationProblem, OptimizationFunction, ObjSense, MaxSense, 
MinSense, OptimizationStats diff --git a/test/runtests.jl b/test/runtests.jl index 2e1b7f39f..cc49b3f5b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,6 +15,7 @@ function activate_subpkg_env(subpkg) end if GROUP == "All" || GROUP == "Core" + dev_subpkg("OptimizationBase") dev_subpkg("OptimizationOptimJL") dev_subpkg("OptimizationOptimisers") dev_subpkg("OptimizationMOI")
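
Note on the oracle interface exercised throughout the testsets above: every block follows the same pattern, so a condensed usage sketch is collected here for reference. It is distilled from the calls made in the tests and is only an illustration; the variable names are mine, SciMLBase is assumed to provide OptimizationFunction (as in the test files), and nothing beyond what the tests themselves assert is implied.

using SciMLBase, OptimizationBase

rosenbrock(x, p = nothing) = (1 - x[1])^2 + 100 * (x[2] - x[1]^2)^2
cons(res, x, p) = (res[1] = x[1]^2 + x[2]^2; return nothing)

optf = OptimizationFunction(rosenbrock, OptimizationBase.AutoForwardDiff(), cons = cons)
cache = OptimizationBase.instantiate_function(optf, zeros(2), OptimizationBase.AutoForwardDiff(),
    nothing, 1, g = true, h = true, hv = true,
    cons_j = true, cons_h = true, cons_vjp = true, cons_jvp = true, lag_h = true)

G = zeros(2); cache.grad(G, zeros(2))                    # in-place gradient
H = zeros(2, 2); cache.hess(H, zeros(2))                 # in-place Hessian
Hv = zeros(2); cache.hv(Hv, zeros(2), [1.0, 1.0])        # Hessian-vector product
r = zeros(1); cache.cons(r, zeros(2))                    # constraint values
J = zeros(2); cache.cons_j(J, [5.0, 3.0])                # constraint Jacobian (a vector when there is one constraint)
HL = zeros(2, 2); cache.lag_h(HL, zeros(2), 1.0, [1.0])  # Lagrangian Hessian at (σ, μ) = (1.0, [1.0])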
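
The lag_h assertions all check the same identity, namely that the Lagrangian Hessian equals σ * ∇²f(x) + Σᵢ μᵢ * ∇²cᵢ(x). The sketch below builds that reference value by hand with ForwardDiff for the two-constraint case; it illustrates the quantity being tested, not how lag_h is implemented internally.

using ForwardDiff

rosenbrock(x, p = nothing) = (1 - x[1])^2 + 100 * (x[2] - x[1]^2)^2
c1(x) = x[1]^2 + x[2]^2           # first component of con2_c
c2(x) = x[2] * sin(x[1]) - x[1]   # second component of con2_c

function reference_lag_h(x, σ, μ)
    Hf = ForwardDiff.hessian(z -> rosenbrock(z), x)
    Hc = [ForwardDiff.hessian(c1, x), ForwardDiff.hessian(c2, x)]
    return σ * Hf + sum(μ .* Hc)  # the same expression as σ * H1 + sum(μ .* H3) in the tests
end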
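
The nnz checks in the "Sparse Tests" testset (5 stored entries for the objective Hessian and 5 for the constraint Jacobian) come from a sparsity-detection pass over the same functions. The sketch below reproduces those two patterns with the ADTypes detector interface and SparseConnectivityTracer, both of which this patch adds as dependencies; whether AutoSparseForwardDiff uses exactly this detector by default is an assumption made only for illustration.

using ADTypes, SparseConnectivityTracer

sparse_objective(x) = x[1]^2 + 100 * (x[3] - x[2]^2)^2
function sparse_constraints!(res, x)
    res[1] = x[1] + x[2] + (x[2] * x[3])^2 - 1
    res[2] = x[1]^2 + x[3]^2 - 1
    return res
end

detector = TracerLocalSparsityDetector()
x0 = [0.5, 0.5, 0.5]
hess_pattern = ADTypes.hessian_sparsity(sparse_objective, x0, detector)               # 5 stored entries
jac_pattern = ADTypes.jacobian_sparsity(sparse_constraints!, zeros(2), x0, detector)  # 5 stored entries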
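
The "Stochastic gradient" testset relies on the fact that the polynomial-fit loss is a plain sum over data points, so the gradient over the full data set equals the sum of the gradients over any partition into batches; the loose rtol = 1e-1 only has to absorb floating-point accumulation. A standalone restatement of that property, using ForwardDiff and MLUtils:

using ForwardDiff, MLUtils

x0 = rand(10_000)
y0 = sin.(x0)
loss(coeffs, data) = sum(abs2, [evalpoly(xi, coeffs) for xi in data[1]] .- data[2])

full_grad = ForwardDiff.gradient(c -> loss(c, (x0, y0)), ones(3))
batch_grads = [ForwardDiff.gradient(c -> loss(c, (x, y)), ones(3))
               for (x, y) in MLUtils.DataLoader((x0, y0), batchsize = 100)]
# full_grad ≈ sum(batch_grads), because the batches partition (x0, y0) and the loss is additive.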
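
The matrixvalued.jl tests exercise matrix-shaped decision variables: the objective slices the matrix X into factor blocks with views, and the differentiation oracles return arrays of the same shape as X. A minimal standalone illustration with ForwardDiff, hard-coding the 4 x (2 + 2) layout used in that test:

using ForwardDiff, LinearAlgebra

A = rand(4, 4)
function matrix_factorization_objective(X)
    U = @view X[:, 1:2]   # factor U occupies the first block of columns
    V = @view X[:, 3:4]   # factor V occupies the second block
    return norm(A - U * V')
end

X0 = rand(4, 4)
G = ForwardDiff.gradient(matrix_factorization_objective, X0)  # gradient has the same 4×4 shape as X0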