diff --git a/.claude/claude.md b/.claude/claude.md
index 0d49c0c64..2ba1fab29 100644
--- a/.claude/claude.md
+++ b/.claude/claude.md
@@ -75,7 +75,6 @@ Efficient computation of power network matrices for large-scale systems. Support
 ### `ext/`
 
   - **MKLPardisoExt.jl**: MKL-Pardiso sparse solver (Windows/Linux)
-  - **AppleAccelerateExt.jl**: native macOS BLAS acceleration
 
 ### `test/`
 
@@ -112,7 +111,7 @@ Documentation source
 ### Optional Extensions
 
   - **MKL + Pardiso**: high-performance sparse factorization
-  - **AppleAccelerate**: native macOS dense linear algebra
+  - **AppleAccelerate**: native macOS sparse direct solver (libSparse `SparseFactorizationLU`, built-in via `src/AccelerateWrapper/`, macOS 15.5+)
 
 ## Core Abstractions
 
@@ -160,7 +159,7 @@ Documentation source
   - **Indexing**: `matrix[bus_num, branch_tuple]` auto-maps to internal indices
   - **Subnetworks**: `subnetwork_axes` Dict maps reference buses to island components
   - **Caching**: VirtualPTDF/LODF use LRU cache (default 100 MiB) for row storage
-  - **Solvers**: KLU (default), Dense, MKLPardiso, AppleAccelerate via extensions
+  - **Solvers**: KLU, Dense, MKLPardiso (ext), and built-in AppleAccelerateLU (libSparse `SparseFactorizationLU` + Inf-norm equilibration scaling, macOS 15.5+) via `src/AccelerateWrapper/`; KLU is the default on non-Apple / macOS < 15.5; AppleAccelerateLU is the default on macOS ≥ 15.5.
 
 ## Test Patterns
 
diff --git a/Project.toml b/Project.toml
index 725cae6d0..a2f0ea685 100644
--- a/Project.toml
+++ b/Project.toml
@@ -15,15 +15,12 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SuiteSparse_jll = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
 
 [weakdeps]
-AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924"
 Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"
 
 [extensions]
-AppleAccelerateExt = "AppleAccelerate"
 MKLPardisoExt = "Pardiso"
 
 [compat]
-AppleAccelerate = "^0.4"
 DataStructures = "^0.19"
 DocStringExtensions = "~0.8, ~0.9"
 HDF5 = "0.17"
diff --git a/docs/src/reference/internals.md b/docs/src/reference/internals.md
index cf5b63078..e3cc12b85 100644
--- a/docs/src/reference/internals.md
+++ b/docs/src/reference/internals.md
@@ -14,3 +14,14 @@ symbols are exported from `PowerNetworkMatrices`.
 ```@autodocs
 Modules = [PowerNetworkMatrices.KLUWrapper]
 ```
+
+## `AccelerateWrapper`
+
+`PowerNetworkMatrices.AccelerateWrapper` is a thin, allocation-aware wrapper over Apple's
+`libSparse.dylib` (provided by the system Accelerate framework) used internally for sparse
+linear solves on macOS. Non-Apple builds load stub fallbacks that throw on use. None of
+these symbols are exported from `PowerNetworkMatrices`.
+
+```@autodocs
+Modules = [PowerNetworkMatrices.AccelerateWrapper]
+```
diff --git a/ext/AppleAccelerateExt.jl b/ext/AppleAccelerateExt.jl
deleted file mode 100644
index 8b2e48345..000000000
--- a/ext/AppleAccelerateExt.jl
+++ /dev/null
@@ -1,112 +0,0 @@
-module AppleAccelerateExt
-
-import PowerNetworkMatrices as PNM
-using AppleAccelerate
-import SparseArrays
-import LinearAlgebra
-
-# Extend the factorization creation function
-function PNM._create_apple_accelerate_factorization(ABA)
-    K = AppleAccelerate.AAFactorization(ABA)
-    AppleAccelerate.factor!(K, AppleAccelerate.SparseFactorizationLDLT)
-    return K
-end
-
-# Extend the solve function for AppleAccelerate factorizations
-function PNM._solve_factorization(
-    K::AppleAccelerate.AAFactorization{Float64},
-    b::Vector{Float64},
-)
-    return AppleAccelerate.solve(K, b)
-end
-
-"""
-Function for internal use only.
-
-Computes the PTDF matrix by means of AppleAccelerate for sparse matrices.
-
-# Arguments
-- `A::SparseArrays.SparseMatrixCSC{Int8, Int}`:
-        Incidence Matrix
-- `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`:
-        BA matrix
-- `ref_bus_positions::Set{Int}`:
-        vector containing the indexes of the reference slack buses.
-- `dist_slack::Vector{Float64}`:
-        vector containing the weights for the distributed slacks.
-"""
-function PNM._calculate_PTDF_matrix_AppleAccelerate(
-    A::SparseArrays.SparseMatrixCSC{Int8, Int},
-    BA::SparseArrays.SparseMatrixCSC{Float64, Int},
-    ref_bus_positions::Set{Int},
-    dist_slack::Vector{Float64})
-    @warn "AppleAccelerate solver is experimental and may produce unexpected results. If you need high confidence use KLU"
-    linecount = size(BA, 2)
-    buscount = size(BA, 1)
-
-    ABA = PNM.calculate_ABA_matrix(A, BA, ref_bus_positions)
-    K = AppleAccelerate.AAFactorization(ABA)
-    AppleAccelerate.factor!(K, AppleAccelerate.SparseFactorizationLDLT)
-
-    # initialize matrices for evaluation
-    valid_ix = setdiff(1:buscount, ref_bus_positions)
-    PTDFm_t = zeros(buscount, linecount)
-    copyto!(PTDFm_t, BA)
-    if !isempty(dist_slack) && length(ref_bus_positions) != 1
-        error(
-            "Distributed slack is not supported for systems with multiple reference buses.",
-        )
-    elseif isempty(dist_slack) && length(ref_bus_positions) < buscount
-        PTDFm_t[valid_ix, :] = AppleAccelerate.solve(K, PTDFm_t[valid_ix, :])
-        PTDFm_t[collect(ref_bus_positions), :] .= 0.0
-        return PTDFm_t
-    elseif length(dist_slack) == buscount
-        @info "Distributed bus"
-        PTDFm_t[valid_ix, :] = AppleAccelerate.solve(K, PTDFm_t[valid_ix, :])
-        PTDFm_t[collect(ref_bus_positions), :] .= 0.0
-        slack_array = dist_slack / sum(dist_slack)
-        slack_array = reshape(slack_array, 1, buscount)
-        return PTDFm_t .- (slack_array * PTDFm_t)
-    else
-        error("Distributed bus specification doesn't match the number of buses.")
-    end
-
-    return
-end
-
-"""
-Function for internal use only.
-
-Computes the LODF matrix by means of AppleAccelerate for sparse matrices.
-
-# Arguments
-- `a::SparseArrays.SparseMatrixCSC{Int8, Int}`:
-        Incidence Matrix
-- `ptdf::Matrix{Float64}`:
-        PTDF matrix
-"""
-function PNM._calculate_LODF_matrix_AppleAccelerate(
-    a::SparseArrays.SparseMatrixCSC{Int8, Int},
-    ptdf::Matrix{Float64},
-)
-    linecount = size(ptdf, 2)
-    ptdf_denominator_t = a * ptdf
-    m_I = Int[]
-    m_V = Float64[]
-    for iline in 1:linecount
-        if (1.0 - ptdf_denominator_t[iline, iline]) < PNM.LODF_ENTRY_TOLERANCE
-            push!(m_I, iline)
-            push!(m_V, 1.0)
-        else
-            push!(m_I, iline)
-            push!(m_V, 1 - ptdf_denominator_t[iline, iline])
-        end
-    end
-    Dem_LU = AppleAccelerate.AAFactorization(SparseArrays.sparse(m_I, m_I, m_V))
-    lodf_t = AppleAccelerate.solve(Dem_LU, ptdf_denominator_t)
-    lodf_t[LinearAlgebra.diagind(lodf_t)] .= -1.0
-
-    return lodf_t
-end
-
-end # module
diff --git a/ext/MKLPardisoExt.jl b/ext/MKLPardisoExt.jl
index ab9d02e04..1f124a47d 100644
--- a/ext/MKLPardisoExt.jl
+++ b/ext/MKLPardisoExt.jl
@@ -171,28 +171,15 @@ function PNM._calculate_LODF_matrix_MKLPardiso(
     a::SparseArrays.SparseMatrixCSC{Int8, Int},
     ptdf::Matrix{Float64},
 )
+    # The demand matrix `diag(1 - PTDF·A[i,i])` is diagonal, so the
+    # "solve" is a row-wise element-wise division. The Pardiso path that
+    # used to factor it and back-solve via `_pardiso_sequential_LODF!` /
+    # `_pardiso_single_LODF!` is no longer needed for this stage.
     linecount = size(ptdf, 2)
     ptdf_denominator_t = a * ptdf
-    m_I = Int[]
-    m_V = Float64[]
-    for iline in 1:linecount
-        if (1.0 - ptdf_denominator_t[iline, iline]) < PNM.LODF_ENTRY_TOLERANCE
-            push!(m_I, iline)
-            push!(m_V, 1.0)
-        else
-            push!(m_I, iline)
-            push!(m_V, 1 - ptdf_denominator_t[iline, iline])
-        end
-    end
-    lodf_t = zeros(linecount, linecount)
-    A = SparseArrays.sparse(m_I, m_I, m_V)
-    if linecount > PNM.DEFAULT_LODF_CHUNK_SIZE
-        PNM._pardiso_sequential_LODF!(lodf_t, A, ptdf_denominator_t)
-    else
-        PNM._pardiso_single_LODF!(lodf_t, A, ptdf_denominator_t)
-    end
-    lodf_t[LinearAlgebra.diagind(lodf_t)] .= -1.0
-    return lodf_t
+    m_V = PNM._build_lodf_demand(ptdf_denominator_t, linecount)
+    PNM._apply_lodf_demand!(ptdf_denominator_t, m_V)
+    return ptdf_denominator_t
 end
 
 end # module
diff --git a/scripts/benchmarks/benchmark_solvers.jl b/scripts/benchmarks/benchmark_solvers.jl
new file mode 100644
index 000000000..6324106e6
--- /dev/null
+++ b/scripts/benchmarks/benchmark_solvers.jl
@@ -0,0 +1,376 @@
+# benchmark_solvers.jl
+#
+# Purpose:
+#   Benchmark VirtualPTDF and VirtualMODF construction and per-row query
+#   performance for KLU and AppleAccelerateLU solvers on the 10k-bus
+#   matpower_ACTIVSg10k_sys case.
+#
+#   Line outages are injected as PSY.FixedForcedOutage supplemental attributes
+#   on ACTransmission branches above a configurable kV threshold, making
+#   VirtualMODF contingency-row performance measurable.
+#
+#   Every measured quantity is sampled over PASSES (default 10) passes and
+#   reported as `median [min–max]` so the run-to-run range is visible.
+#
+# Run command (from repo root):
+#   julia --project=test scripts/benchmarks/benchmark_solvers.jl
+#
+# Runtime note:
+#   Each row-batch pass builds a *fresh* (cold-cache) Virtual* object so the
+#   row-solve cost is measured without cache hits. With PASSES=10 that means
+#   ~11 cold builds per solver per path; on the 10k case this is minutes, not
+#   seconds. Lower PASSES / N_ROWS to trade range fidelity for speed.
+#
+# Note on Project.toml:
+#   This script activates the repo `test` project rather than providing a
+#   standalone scripts/benchmarks/Project.toml because PowerSystemCaseBuilder
+#   is a test-only dependency. Resolving PSB from the registry in a standalone
+#   project is unreliable without a dedicated manifest; the test environment
+#   already has PSB deved alongside PNM and is guaranteed to be consistent.
+
+import Pkg
+Pkg.activate(joinpath(@__DIR__, "..", "..", "test"))
+
+using PowerNetworkMatrices
+using PowerSystemCaseBuilder
+import PowerSystems as PSY
+import InfrastructureSystems as IS
+import SparseArrays, LinearAlgebra, Random
+using Printf
+
+const PNM = PowerNetworkMatrices
+const PSB = PowerSystemCaseBuilder
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Top-level configuration constants
+# ─────────────────────────────────────────────────────────────────────────────
+
+const SYSTEM_NAME = "matpower_ACTIVSg10k_sys"
+const KV_THRESHOLD = 230.0          # inject outages on arcs above this kV
+const N_ROWS = 200            # arc-rows to time per batch
+const SOLVERS = ["KLU", "AppleAccelerateLU"]
+const PASSES = 10             # timed passes per measurement (range)
+const WARMUP = 1              # untimed warmup passes (excluded)
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Timing helpers
+# ─────────────────────────────────────────────────────────────────────────────
+
+Base.@noinline _no_inline(x) = x  # identity call kept out of line; presumably stops timed work being elided — confirm
+
+"Min/median/max of a sample vector of nanosecond timings."
+struct TimeStats
+    tmin::Float64  # fastest sample
+    tmed::Float64  # median sample
+    tmax::Float64  # slowest sample
+end
+
+const NA_STATS = TimeStats(NaN, NaN, NaN)  # placeholder for a measurement that was never recorded
+
+isna(s::TimeStats) = isnan(s.tmed)  # only the median is inspected to detect the placeholder
+
+"""
+    collect_stats(sample; passes=PASSES, warmup=WARMUP) -> TimeStats
+
+Call `sample()` `warmup + passes` times; each call must return its own elapsed
+time in nanoseconds. The first `warmup` results are discarded and the remaining
+`passes` are reduced to (min, median, max). Throws `ArgumentError` if `passes < 1`.
+"""
+function collect_stats(sample::F; passes::Int = PASSES, warmup::Int = WARMUP) where {F}
+    # Without at least one timed pass there is nothing to index below.
+    passes >= 1 || throw(ArgumentError("passes must be at least 1, got $passes"))
+    for _ in 1:warmup
+        _no_inline(sample())
+    end
+    times = sort!([Float64(_no_inline(sample())) for _ in 1:passes])
+    # True median: the two middle order statistics coincide for odd `passes` and
+    # are averaged for even `passes`, so an even count such as the default 10 is
+    # not biased toward the lower of the two middle samples.
+    lo = (passes + 1) ÷ 2
+    hi = passes ÷ 2 + 1
+    return TimeStats(first(times), (times[lo] + times[hi]) / 2, last(times))
+end
+
+"Run `f()` once, after a forced `GC.gc()`, and return the elapsed nanoseconds as `Float64`."
+function time_call(f::F) where {F}
+    GC.gc()
+    t0 = time_ns()
+    _no_inline(f())
+    return Float64(time_ns() - t0)
+end
+
+"""
+Render a nanosecond duration as text in ns, μs, ms (one decimal) or s (two decimals).
+"""
+function fmt_time(t_ns::Float64)
+    # Guard clauses, smallest unit first; the first bound that holds picks the unit.
+    # NaN fails every comparison and therefore falls through to the seconds format.
+    t_ns < 1e3 && return @sprintf("%.1f ns", t_ns)
+    t_ns < 1e6 && return @sprintf("%.1f μs", t_ns / 1e3)
+    t_ns < 1e9 && return @sprintf("%.1f ms", t_ns / 1e6)
+    return @sprintf("%.2f s", t_ns / 1e9)
+end
+
+"Table cell for `s`: `median [min–max]` with formatted times, or `n/a` when `isna(s)`."
+function cell_str(s::TimeStats)
+    isna(s) && return "n/a"
+    med = fmt_time(s.tmed)
+    lo, hi = fmt_time(s.tmin), fmt_time(s.tmax)
+    return "$(med) [$(lo)–$(hi)]"
+end
+
+"Median-to-median ratio `a / b` formatted like `1.23x`; `n/a` if either side is NA."
+function ratio_str(a::TimeStats, b::TimeStats)
+    # Bail out on a missing measurement before dividing.
+    (isna(a) || isna(b)) && return "n/a"
+    quotient = a.tmed / b.tmed
+    return @sprintf("%.2fx", quotient)
+end
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Spread-out index selection
+# ─────────────────────────────────────────────────────────────────────────────
+
+"""
+    spread_keys(axis, n) -> Vector
+
+Return up to `n` evenly-spread elements of `axis` (all of it if shorter, none if `n <= 0`).
+"""
+function spread_keys(axis::Vector, n::Int)
+    n = clamp(n, 0, length(axis))
+    # `range(1, total; length = 1)` throws unless total == 1, so a single key is the first.
+    n == 1 && return [first(axis)]
+    return [axis[i] for i in unique(round.(Int, range(1, length(axis); length = n)))]
+end
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Outage injection
+# ─────────────────────────────────────────────────────────────────────────────
+
+"""
+    inject_line_outages!(sys; kv_threshold=KV_THRESHOLD) -> Int
+
+Attach a `PSY.FixedForcedOutage(outage_status=1.0)` supplemental attribute to
+every `PSY.Line` whose highest terminal voltage exceeds `kv_threshold` kV.
+
+Only `PSY.Line` components are considered — transformers and
+phase-shifting transformers are intentionally excluded (contingencies are
+registered for lines only).
+
+Returns the number of outages injected. Prints a summary with line count,
+injected count, and three example (name, kV) pairs.
+"""
+function inject_line_outages!(sys::PSY.System; kv_threshold::Float64 = KV_THRESHOLD)::Int
+    lines = collect(PSY.get_components(PSY.Line, sys))
+    n_total = length(lines)
+    injected = 0
+    examples = Tuple{String, Float64}[]
+
+    for line in lines
+        arc = PSY.get_arc(line)
+        # Classify the line by the higher of its two terminal base voltages.
+        kv = max(
+            PSY.get_base_voltage(PSY.get_from(arc)),
+            PSY.get_base_voltage(PSY.get_to(arc)),
+        )
+        kv > kv_threshold || continue
+        PSY.add_supplemental_attribute!(
+            sys, line, PSY.FixedForcedOutage(; outage_status = 1.0),
+        )
+        injected += 1
+        length(examples) < 3 && push!(examples, (PSY.get_name(line), kv))
+    end
+
+    println(
+        "inject_line_outages!: $n_total total PSY.Line components; " *
+        "injected $injected outages (kv_threshold = $kv_threshold kV)",
+    )
+    for (name, kv) in examples
+        println("  example: \"$name\" at $(kv) kV")
+    end
+    return injected
+end
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Load system and inject outages
+# ─────────────────────────────────────────────────────────────────────────────
+
+println("Loading $SYSTEM_NAME …")
+sys = PSB.build_system(PSB.MatpowerTestSystems, SYSTEM_NAME)
+n_buses = length(collect(PSY.get_components(PSY.ACBus, sys)))
+println("System loaded: $n_buses buses")
+
+println("\nInjecting line outages (kv_threshold = $KV_THRESHOLD kV) …")
+n_outages = inject_line_outages!(sys; kv_threshold = KV_THRESHOLD)
+println("Total outages injected: $n_outages\n")
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Result tables
+# ─────────────────────────────────────────────────────────────────────────────
+
+const PATHS = [
+    "VirtualPTDF build",
+    "VirtualPTDF $(N_ROWS)-row batch",
+    "VirtualMODF build",
+    "VirtualMODF $(N_ROWS)-row batch",
+]
+
+results = Dict{String, Dict{String, TimeStats}}()
+for p in PATHS
+    results[p] = Dict{String, TimeStats}(s => NA_STATS for s in SOLVERS)
+end
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Per-solver benchmark loop
+#
+# Build paths : time the whole constructor call, PASSES times.
+# Row batches : each pass builds a FRESH (cold-cache) object UNTIMED, then
+#               times only the row loop — isolating row-solve cost with no
+#               build-time subtraction and no cache hits across passes.
+# ─────────────────────────────────────────────────────────────────────────────
+
+for solver in SOLVERS
+    println("\n" * "="^60)
+    println("Solver: $solver  ($PASSES passes, $WARMUP warmup)")
+    println("="^60)
+
+    # ------------------------------------------------------------------
+    # VirtualPTDF: build time
+    # ------------------------------------------------------------------
+    vptdf_ref = nothing
+    try
+        println("  [VirtualPTDF] timed build …")
+        vptdf_ref = PNM.VirtualPTDF(sys; linear_solver = solver)
+        s = collect_stats(
+            () -> time_call(
+                () -> PNM.VirtualPTDF(sys; linear_solver = solver),
+            ),
+        )
+        results["VirtualPTDF build"][solver] = s
+        println("  VirtualPTDF build: $(cell_str(s))")
+    catch e
+        println("  VirtualPTDF build FAILED: $e")
+    end
+
+    # ------------------------------------------------------------------
+    # VirtualPTDF: N_ROWS-row batch (fresh cold-cache object per pass)
+    # ------------------------------------------------------------------
+    if vptdf_ref !== nothing
+        try
+            println("  [VirtualPTDF] $N_ROWS-row batch …")
+            arc_keys = spread_keys(PNM.get_arc_axis(vptdf_ref), N_ROWS)
+            sample = function ()
+                v = PNM.VirtualPTDF(sys; linear_solver = solver)
+                GC.gc()
+                t = time_ns()
+                for arc in arc_keys
+                    _ = v[arc, :]
+                end
+                return Float64(time_ns() - t)
+            end
+            s = collect_stats(sample)
+            results["VirtualPTDF $(N_ROWS)-row batch"][solver] = s
+            println("  VirtualPTDF $(N_ROWS)-row batch (rows only): $(cell_str(s))")
+        catch e
+            println("  VirtualPTDF row batch FAILED: $e")
+        end
+    end
+
+    # ------------------------------------------------------------------
+    # VirtualMODF: build time
+    # ------------------------------------------------------------------
+    vmodf_ref = nothing
+    try
+        println("  [VirtualMODF] timed build …")
+        vmodf_ref = PNM.VirtualMODF(sys; linear_solver = solver)
+        n_ctg = length(PNM.get_registered_contingencies(vmodf_ref))
+        println("  VirtualMODF: $n_ctg contingencies registered")
+        s = collect_stats(
+            () -> time_call(
+                () -> PNM.VirtualMODF(sys; linear_solver = solver),
+            ),
+        )
+        results["VirtualMODF build"][solver] = s
+        println("  VirtualMODF build: $(cell_str(s))")
+    catch e
+        println("  VirtualMODF build FAILED: $e")
+    end
+
+    # ------------------------------------------------------------------
+    # VirtualMODF: N_ROWS-row batch for one contingency
+    #
+    # Row access: vmodf[arc_idx::Int, contingency::ContingencySpec]
+    # where arc_idx = PNM.get_arc_lookup(vmodf)[arc_tuple]
+    # and contingency is taken from PNM.get_registered_contingencies(vmodf).
+    # ------------------------------------------------------------------
+    if vmodf_ref !== nothing
+        try
+            ctg_dict = PNM.get_registered_contingencies(vmodf_ref)
+            if isempty(ctg_dict)
+                println("  VirtualMODF: no contingencies registered — skipping row batch.")
+                println("  (Re-run inject_line_outages! with a lower kv_threshold.)")
+            else
+                println("  [VirtualMODF] $N_ROWS-row batch …")
+                arc_keys = spread_keys(PNM.get_arc_axis(vmodf_ref), N_ROWS)
+                arc_lookup_ref = PNM.get_arc_lookup(vmodf_ref)
+                arc_indices = [arc_lookup_ref[arc] for arc in arc_keys]
+                sample = function ()
+                    v = PNM.VirtualMODF(sys; linear_solver = solver)
+                    ctg = first(values(PNM.get_registered_contingencies(v)))
+                    GC.gc()
+                    t = time_ns()
+                    for arc_idx in arc_indices
+                        _ = v[arc_idx, ctg]
+                    end
+                    return Float64(time_ns() - t)
+                end
+                s = collect_stats(sample)
+                results["VirtualMODF $(N_ROWS)-row batch"][solver] = s
+                println("  VirtualMODF $(N_ROWS)-row batch (rows only): $(cell_str(s))")
+            end
+        catch e
+            println("  VirtualMODF row batch FAILED: $e")
+        end
+    end
+end
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Results table
+# ─────────────────────────────────────────────────────────────────────────────
+
+println("\n\n" * "="^60)
+println("RESULTS")
+println("="^60)
+
+println()
+println("System:    $SYSTEM_NAME")
+println("Buses:     $n_buses")
+println("Outages:   $n_outages (kv_threshold = $KV_THRESHOLD kV)")
+println("Hardware:  $(Sys.MACHINE)")
+println("Julia:     $(VERSION)")
+println("Passes:    $PASSES timed ($WARMUP warmup discarded); cells are median [min–max]")
+println(
+    "Row batch: $N_ROWS spread-out arcs; fresh cold-cache object per pass, row loop only",
+)
+println()
+
+header = "| Path | KLU | AA_LU | KLU/AA_LU |"
+sep = "|---|---|---|---|"
+println(header)
+println(sep)
+
+for p in PATHS
+    s_klu = results[p]["KLU"]
+    s_lu = results[p]["AppleAccelerateLU"]
+    row = @sprintf(
+        "| %s | %s | %s | %s |",
+        p,
+        cell_str(s_klu),
+        cell_str(s_lu),
+        ratio_str(s_klu, s_lu),
+    )
+    println(row)
+end
+
+println()
+println("Done.")
diff --git a/src/AccelerateWrapper/AccelerateWrapper.jl b/src/AccelerateWrapper/AccelerateWrapper.jl
new file mode 100644
index 000000000..e84f5a1c1
--- /dev/null
+++ b/src/AccelerateWrapper/AccelerateWrapper.jl
@@ -0,0 +1,73 @@
+"""
+    AccelerateWrapper
+
+A small, allocation-aware wrapper over Apple's `libSparse.dylib`
+(`/System/Library/Frameworks/Accelerate.framework/.../libSparse.dylib`)
+designed for the access patterns of `PowerNetworkMatrices`:
+
+- Cache the symbolic and numeric factorizations of a general (unsymmetric)
+  sparse matrix (LU with threshold partial pivoting + Inf-norm equilibration
+  scaling) and reuse them across many solves.
+- Refresh the numeric factor (`numeric_refactor!`) while keeping the symbolic
+  analysis, without re-allocating the structural arrays.
+- Solve dense and **sparse** right-hand sides in place, with the sparse path
+  packing only non-empty RHS columns into a bounded scratch block.
+- Compute `A·X` and `A·x` directly via libSparse's `SparseMultiply`.
+
+This module is intentionally lighter than the upstream `AppleAccelerate.jl`
+package: it owns no high-level Julia wrappers over libSparse, exposes the
+symbolic/numeric split directly, binds only the entry points used by PNM,
+and is compile-gated to macOS so non-Apple builds never codegen the
+`@ccall` sites.
+"""
+module AccelerateWrapper
+
+import SparseArrays
+import SparseArrays: SparseMatrixCSC, getcolptr, rowvals, nonzeros, nzrange
+import LinearAlgebra
+
+export AAFactorCache,
+    aa_factorize,
+    symbolic_factor!,
+    numeric_refactor!,
+    full_factor!,
+    full_refactor!,
+    solve!,
+    solve_sparse!,
+    solve_sparse,
+    is_factored,
+    aa_spmm!,
+    aa_spmv!
+
+@static if Sys.isapple()
+    include("libsparse_bindings.jl")
+    include("aa_cache.jl")
+    include("solve_dense.jl")
+    include("solve_sparse_rhs.jl")
+    include("spmm.jl")
+else
+    # Stub layer: non-Apple builds bind no libSparse symbol, compile no `@ccall`
+    # site and never instantiate `SparseOpaqueFactorization`. Every entry point
+    # below raises the same error, except `is_factored`, which returns `false`.
+    struct AAFactorCache end
+
+    _unavailable() = error(
+        "AccelerateWrapper is macOS-only (Sys.isapple() returned false). " *
+        "Use the KLU backend on non-Apple platforms.",
+    )
+
+    AAFactorCache(args...; kwargs...) = _unavailable()
+    aa_factorize(args...; kwargs...) = _unavailable()
+    symbolic_factor!(args...) = _unavailable()
+    numeric_refactor!(args...) = _unavailable()
+    full_factor!(args...) = _unavailable()
+    full_refactor!(args...) = _unavailable()
+    solve!(args...) = _unavailable()
+    solve_sparse!(args...; kwargs...) = _unavailable()
+    solve_sparse(args...; kwargs...) = _unavailable()
+    is_factored(::AAFactorCache) = false
+    aa_spmm!(args...) = _unavailable()
+    aa_spmv!(args...) = _unavailable()
+end
+
+end # module
diff --git a/src/AccelerateWrapper/aa_cache.jl b/src/AccelerateWrapper/aa_cache.jl
new file mode 100644
index 000000000..033894633
--- /dev/null
+++ b/src/AccelerateWrapper/aa_cache.jl
@@ -0,0 +1,373 @@
+"""
+A cached libSparse linear solver for repeated solves against the same sparse
+matrix structure. `numeric_refactor!` and `solve!` allocate nothing once the
+cache is built.
+
+Cache the symbolic and numeric factorizations of a general (unsymmetric) sparse
+matrix (LU with threshold partial pivoting + Inf-norm equilibration scaling) and
+reuse them across many solves. The full CSC pattern of `A` is stored and the
+libSparse structure view is marked `ATT_ORDINARY`. No symmetry requirement
+(matches KLU's pivoting model).
+
+Float64 only. Requires macOS 15.5+ (enforced by the backend selection in
+`linalg_settings.jl`).
+
+`reuse_symbolic` controls whether refactoring keeps the symbolic analysis;
+`check_pattern` adds a structural-equality check on refactor calls and is
+only consulted when reusing.
+"""
+mutable struct AAFactorCache
+    # Apple-side 0-based, narrower-integer copies of the full input CSC pattern.
+    # Reused as-is across `numeric_refactor!` calls.
+    columnStarts::Vector{Clong}
+    rowIndices::Vector{Cint}
+    nzval::Vector{Cdouble}
+    n::Int  # matrix dimension; the cache is always n×n
+    # Count of stored entries: full `nnz(A)`.
+    nnz::Int
+    symbolic::SparseOpaqueSymbolicFactorization  # null placeholder until `symbolic_factor!` succeeds
+    numeric::SparseOpaqueFactorization_t  # null placeholder at construction and after the handles are freed
+    reuse_symbolic::Bool
+    check_pattern::Bool
+    # Bounded reusable scratch for `solve_sparse!`. Lazy-grown on first call.
+    scratch::Matrix{Cdouble}
+    col_map::Vector{Int}
+    # Reusable libSparse solve workspace. Sized to
+    # `static + nrhs * per_rhs` bytes; supplied to the workspace-aware
+    # `SparseSolve` overloads so libSparse does not malloc/free per call.
+    # Float64 storage chosen for 16-byte alignment; length is in Float64s.
+    solve_workspace::Vector{Float64}
+    # Scaling method passed to libSparse at numeric-factor time.
+    # `SparseScalingEquilibriationInf` reduces fill on symmetric SPD-ish inputs
+    # like ABA (~4× faster multi-RHS solve, residual still O(1e-13)). Set via
+    # the `scaling` kwarg of `AAFactorCache` / `aa_factorize`.
+    scaling::SparseScaling_t
+end
+
+@inline _dim(cache::AAFactorCache) = cache.n
+
+# The constructor only accepts square matrices, so both dimensions equal `n`.
+Base.size(cache::AAFactorCache) = (cache.n, cache.n)
+function Base.size(cache::AAFactorCache, d::Integer)
+    # Follow the convention of Base arrays: a dimension below 1 is an error,
+    # dimensions 1 and 2 are `n`, and every trailing dimension has length 1.
+    d >= 1 || throw(ArgumentError("size: dimension $d out of range"))
+    return d <= 2 ? cache.n : 1
+end
+Base.eltype(::Type{AAFactorCache}) = Cdouble
+
+"""
+    is_factored(cache::AAFactorCache) -> Bool
+
+Report whether `cache` is ready for `solve!` / `solve_sparse!`, i.e. whether both
+its numeric and its symbolic libSparse handle carry status `SparseStatusOk`.
+"""
+function is_factored(cache::AAFactorCache)
+    numeric_ok = cache.numeric.status == SparseStatusOk
+    return numeric_ok && cache.symbolic.status == SparseStatusOk
+end
+
+"""
+    AAFactorCache(A; reuse_symbolic=true, check_pattern=true,
+                  scaling=SparseScalingEquilibriationInf)
+
+Build a cache for the square sparse matrix `A`. Allocates the Apple-side
+structural arrays (`columnStarts`, `rowIndices`, `nzval`) but does **not**
+factorize. Call `full_factor!` (or `symbolic_factor!` followed by
+`numeric_refactor!`) before `solve!`.
+
+The arrays are sized to the full pattern of `A` (general/LU mode). No symmetry
+requirement — matches KLU's pivoting model.
+
+A finalizer frees libSparse handles on GC; call `Base.finalize(cache)` to
+release them eagerly.
+"""
+function AAFactorCache(
+    A::SparseMatrixCSC{Float64, Int};
+    reuse_symbolic::Bool = true,
+    check_pattern::Bool = true,
+    scaling::SparseScaling_t = SparseScalingEquilibriationInf,
+)
+    nrows, ncols = size(A)
+    if nrows != ncols
+        throw(DimensionMismatch("matrix must be square; got $(size(A))"))
+    end
+    entries = SparseArrays.nnz(A)
+    # Positional arguments follow the field order of the struct declaration.
+    cache = AAFactorCache(
+        Vector{Clong}(undef, nrows + 1),  # columnStarts
+        Vector{Cint}(undef, entries),     # rowIndices
+        Vector{Cdouble}(undef, 0),        # nzval (sized later by `_populate_values!`)
+        nrows, entries,
+        _null_symbolic(), _null_factorization(),
+        reuse_symbolic, check_pattern,
+        Matrix{Cdouble}(undef, 0, 0),     # scratch
+        Int[],                            # col_map
+        Float64[],                        # solve_workspace
+        scaling,
+    )
+    _populate_pattern!(cache, A)
+    finalizer(_free_handles!, cache)
+    return cache
+end
+
+# --- general (LU) mode helpers ----------------------------------------------
+
+# Copy A's full CSC pattern into `cache.columnStarts` / `cache.rowIndices`
+# as 0-based narrowed indices. Caller sized these to (n+1) and nnz.
+function _populate_pattern!(
+    cache::AAFactorCache,
+    A::SparseMatrixCSC{Float64, Int},
+)
+    cp = getcolptr(A)
+    rv = rowvals(A)
+    # Verify the caller-sized buffers before filling them: unchecked writes into a
+    # cache sized for another pattern would run past the end of the buffer.
+    (length(cache.columnStarts), length(cache.rowIndices)) == (length(cp), length(rv)) ||
+        throw(DimensionMismatch("cache buffers are not sized for the pattern of A"))
+    map!(p -> Clong(p - 1), cache.columnStarts, cp)  # 0-based column starts
+    map!(r -> Cint(r - 1), cache.rowIndices, rv)     # 0-based row indices
+    return cache
+end
+
+# Snapshot A's stored values into `cache.nzval`, sizing the buffer to `cache.nnz`.
+function _populate_values!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int})
+    stored = SparseArrays.nnz(A)
+    # A source with a different entry count would either overrun the buffer or
+    # leave an uninitialized tail behind after `resize!`, so reject it up front.
+    stored == cache.nnz ||
+        throw(DimensionMismatch("A stores $stored entries; cache expects $(cache.nnz)"))
+    resize!(cache.nzval, stored)
+    copyto!(cache.nzval, 1, nonzeros(A), 1, stored)
+    return cache
+end
+
+# --- pattern guard ----------------------------------------------------------
+
+# Pattern-match guard for `numeric_refactor!`: assert the incoming CSC's
+# stored pattern is identical to what was analyzed (full pattern, LU mode).
+# Apple's arrays are 0-based but we always store them 0-based — no flipping.
+function _check_pattern_match(
+    cache::AAFactorCache,
+    A::SparseMatrixCSC{Float64, Int},
+    op::AbstractString,
+)
+    n = cache.n
+    size(A) == (n, n) ||
+        throw(DimensionMismatch("Cannot $op: cache is $(n)×$(n) but A is $(size(A))."))
+    colptr, rows = getcolptr(A), rowvals(A)
+    # Cheapest test first: entry count, then column starts, then row indices.
+    # `&&` short-circuits, so the scan stops at the first difference. The cached
+    # arrays are already 0-based, hence the `- 1` on the Julia side only.
+    same = length(rows) == cache.nnz &&
+        all(k -> cache.columnStarts[k] == Clong(colptr[k] - 1), eachindex(colptr)) &&
+        all(k -> cache.rowIndices[k] == Cint(rows[k] - 1), eachindex(rows))
+    same || _pattern_mismatch(op)
+    return nothing
+end
+
+# Shared failure path: always throws an `ArgumentError` that names `op`.
+function _pattern_mismatch(op::AbstractString)
+    throw(ArgumentError("Cannot $op: matrix has different sparsity structure."))
+end
+
+"Free whichever libSparse handles of `cache` are live (numeric, then symbolic); safe to repeat."
+function _free_handles!(cache::AAFactorCache)
+    # Live means status `SparseStatusOk`; the Julia-side buffers are left untouched.
+    numeric = cache.numeric
+    if numeric.status == SparseStatusOk
+        _sparse_cleanup_factor!(numeric)
+        cache.numeric = _null_factorization()
+    end
+    symbolic = cache.symbolic
+    if symbolic.status == SparseStatusOk
+        _sparse_cleanup_symbolic!(symbolic)
+        cache.symbolic = _null_symbolic()
+    end
+    return nothing
+end
+
+Base.finalize(cache::AAFactorCache) = _free_handles!(cache)
+
+# Describe the cache-owned pattern arrays to libSparse as an n×n
+# `SparseMatrixStructure` with block size 1. Attributes are always
+# `ATT_ORDINARY`: the LU path assumes no symmetry and `cache.nzval` holds the
+# full matrix. The result carries raw pointers into `cache`'s arrays.
+function _structure_view(cache::AAFactorCache)
+    dim = Cint(cache.n)
+    col_starts = pointer(cache.columnStarts)
+    row_indices = pointer(cache.rowIndices)
+    block_size = UInt8(1)
+    return SparseMatrixStructure(
+        dim, dim, col_starts, row_indices, ATT_ORDINARY, block_size,
+    )
+end
+
+# Structure view plus a raw pointer to `cache.nzval`, as a libSparse matrix.
+_matrix_view(cache::AAFactorCache) =
+    SparseMatrix_t(_structure_view(cache), pointer(cache.nzval))
+
+"""
+    symbolic_factor!(cache, A) -> cache
+
+Free any cached symbolic/numeric factor, replace the structural arrays with
+`A`'s full pattern, and run a fresh LU analysis that later `numeric_refactor!`
+calls reuse. Throws `DimensionMismatch` if `A` is not `cache.n × cache.n`; if
+libSparse rejects the pattern the error is raised and no analysis is stored.
+"""
+function symbolic_factor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int})
+    n = cache.n
+    if size(A, 1) != n || size(A, 2) != n
+        throw(DimensionMismatch("Cannot factor: cache is $(n)×$(n) but A is $(size(A))."))
+    end
+    _free_handles!(cache)
+    # Size from the arrays' real lengths instead of trusting `cache.nnz`, so the
+    # `@inbounds` pattern copy below can never write past a stale buffer.
+    new_nnz = SparseArrays.nnz(A)
+    length(cache.rowIndices) == new_nnz || resize!(cache.rowIndices, new_nnz)
+    length(cache.columnStarts) == n + 1 || resize!(cache.columnStarts, n + 1)
+    cache.nnz = new_nnz
+    _populate_pattern!(cache, A)
+    # The structure view holds raw pointers into `cache`; root it for the call.
+    sym = GC.@preserve cache _sparse_symbolic_factor(
+        SparseFactorizationLU,
+        _structure_view(cache),
+        SparseSymbolicFactorOptions(),
+    )
+    if sym.status != SparseStatusOk
+        # libSparse may have allocated C-side state before deciding to fail —
+        # release it before throwing so we don't leak per failed factor.
+        _sparse_cleanup_symbolic!(sym)
+        _libsparse_throw(sym.status, "symbolic factor")
+    end
+    cache.symbolic = sym
+    return cache
+end
+
+"""
+    numeric_refactor!(cache, A) -> cache
+
+Refresh the numeric factor on top of the existing symbolic analysis; errors if
+`symbolic_factor!` has not run. A failed factorization stores no numeric factor.
+"""
+function numeric_refactor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int})
+    cache.symbolic.status == SparseStatusOk ||
+        error("AAFactorCache: call symbolic_factor! before numeric_refactor!.")
+    cache.check_pattern && _check_pattern_match(cache, A, "numeric_refactor")
+    _populate_values!(cache, A)
+    if cache.numeric.status == SparseStatusOk
+        _sparse_cleanup_factor!(cache.numeric)
+        cache.numeric = _null_factorization()
+    end
+    # The matrix view holds raw pointers into `cache`; root it for the call.
+    num = GC.@preserve cache _sparse_numeric_factor(
+        cache.symbolic,
+        _matrix_view(cache),
+        SparseNumericFactorOptions(cache.scaling),
+    )
+    if num.status != SparseStatusOk
+        _sparse_cleanup_factor!(num)  # may hold C-side state even on failure
+        _libsparse_throw(num.status, "numeric factor")
+    end
+    cache.numeric = num
+    return cache
+end
+
+"""
+    symbolic_refactor!(cache, A) -> cache
+
+Rerun `symbolic_factor!` unless `cache.reuse_symbolic` is set; when reusing,
+only verify `A`'s structure (if `cache.check_pattern`) and keep the analysis.
+"""
+function symbolic_refactor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int})
+    cache.reuse_symbolic || return symbolic_factor!(cache, A)
+    if cache.check_pattern
+        _check_pattern_match(cache, A, "symbolic_refactor")
+    end
+    return cache
+end
+
+"""
+    full_factor!(cache, A) -> cache
+
+Factor `A` from scratch: `symbolic_factor!` (new analysis) and then
+`numeric_refactor!` (numeric LU on top of it). Errors from either propagate.
+"""
+function full_factor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int})
+    analyzed = symbolic_factor!(cache, A)
+    return numeric_refactor!(analyzed, A)
+end
+
+"""
+    full_refactor!(cache, A) -> cache
+
+Refactor `A` on an existing cache. `symbolic_refactor!` runs first — with
+`cache.reuse_symbolic` set it keeps the existing analysis (after the optional
+pattern check), otherwise it re-analyzes — and `numeric_refactor!` follows.
+"""
+function full_refactor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int})
+    # Both steps return the cache itself, so the result is `cache`.
+    refreshed = symbolic_refactor!(cache, A)
+    return numeric_refactor!(refreshed, A)
+end
+
+"""
+    aa_factorize(A; reuse_symbolic=true, check_pattern=true,
+                  scaling=SparseScalingEquilibriationInf) -> AAFactorCache
+
+Construct an `AAFactorCache` for `A`, forwarding the keywords unchanged, and
+return it after a complete `full_factor!` (symbolic analysis + numeric LU).
+"""
+function aa_factorize(
+    A::SparseMatrixCSC{Float64, Int};
+    reuse_symbolic::Bool = true,
+    check_pattern::Bool = true,
+    scaling::SparseScaling_t = SparseScalingEquilibriationInf,
+)
+    options = (
+        reuse_symbolic = reuse_symbolic,
+        check_pattern = check_pattern,
+        scaling = scaling,
+    )
+    fresh = AAFactorCache(A; options...)
+    return full_factor!(fresh, A)
+end
+
+"""
+    _ensure_scratch!(cache, block) -> Nothing
+
+Make sure `cache.scratch` has exactly `cache.n` rows and at least `block`
+columns, and `cache.col_map` at least `block` entries. A scratch that does not
+fit is replaced by a fresh `n × block` matrix. Used by `solve_sparse!`.
+"""
+@inline function _ensure_scratch!(cache::AAFactorCache, block::Int)
+    rows, cols = size(cache.scratch)
+    if !(rows == cache.n && cols >= block)
+        # The replacement is uninitialized: old contents are not carried over.
+        cache.scratch = Matrix{Cdouble}(undef, cache.n, block)
+    end
+    map_len = length(cache.col_map)
+    map_len >= block || resize!(cache.col_map, block)
+    return nothing
+end
+
+"""
+    _ensure_solve_workspace!(cache, nrhs) -> Ptr{Cvoid}
+
+Grow `cache.solve_workspace` (it is never shrunk) until it can hold the bytes
+`_solve_workspace_bytes(cache.numeric, nrhs)` reports for an `nrhs`-column
+libSparse solve, then return a raw pointer to it for the workspace-aware
+ccalls. Once the buffer is large enough the call resizes nothing. The
+original author relies on the `Vector{Float64}` allocator for the 16-byte
+alignment libSparse requires. The pointer is only meaningful while `cache`
+stays rooted and the buffer is not resized again.
+"""
+@inline function _ensure_solve_workspace!(cache::AAFactorCache, nrhs::Integer)
+    buf = cache.solve_workspace
+    # Bytes → whole Float64 slots, at least one, so the pointer is backed by
+    # real storage even when libSparse asks for zero bytes.
+    slots = max(cld(_solve_workspace_bytes(cache.numeric, nrhs), 8), 1)
+    length(buf) < slots && resize!(buf, slots)
+    return convert(Ptr{Cvoid}, pointer(buf))
+end
diff --git a/src/AccelerateWrapper/libsparse_bindings.jl b/src/AccelerateWrapper/libsparse_bindings.jl
new file mode 100644
index 000000000..49f5391e5
--- /dev/null
+++ b/src/AccelerateWrapper/libsparse_bindings.jl
@@ -0,0 +1,373 @@
+# Direct bindings into Apple's libSparse.dylib (part of the Accelerate
+# framework). Only the entry points actually consumed by PowerNetworkMatrices
+# are wrapped — Float64-only, no Float32 mangled aliases, no QR / Cholesky-AtA
+# variants. Mangled names match what AppleAccelerate.jl uses; see
+# `/Library/Developer/CommandLineTools/SDKs/MacOSX*.sdk/System/Library/
+# Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/
+# Versions/A/Headers/Sparse/Solve.h` for the C declarations.
+
+const LIBSPARSE =
+    "/System/Library/Frameworks/Accelerate.framework/Versions/A/" *
+    "Frameworks/vecLib.framework/libSparse.dylib"  # library operand of every `@ccall` in this file
+
+# Only `SparseFactorizationLU` is used by PNM (the documented default,
+# "currently LU with TPP" — threshold partial pivoting, provably stable).
+# The other LU codes are listed for documentation. All LU codes require
+# macOS 15.5+. QR = 40 and CholeskyAtA = 41 are intentionally not wrapped
+# (see file header). The LDLT/Cholesky family (codes 0–4) is not wrapped
+# because all PNM workloads use the general LU path.
+@enum SparseFactorization_t::UInt8 begin
+    # Raw type codes; the value is passed as the first argument of the symbolic `SparseFactor` ccall.
+    SparseFactorizationLU = 80
+    SparseFactorizationLUUnpivoted = 81
+    SparseFactorizationLUSPP = 82
+    SparseFactorizationLUTPP = 83
+end
+
+@enum SparseOrder_t::UInt8 begin  # stored in `SparseSymbolicFactorOptions.orderMethod`
+    SparseOrderDefault = 0  # the only value the default options constructor passes
+    SparseOrderUser = 1
+    SparseOrderAMD = 2
+    SparseOrderMetis = 3
+    SparseOrderCOLAMD = 4
+end
+
+@enum SparseScaling_t::UInt8 begin  # stored in `SparseNumericFactorOptions.scalingMethod`
+    SparseScalingDefault = 0
+    SparseScalingUser = 1
+    SparseScalingEquilibriationInf = 2  # `aa_factorize` default; spelling (sic) presumably mirrors Apple's header — confirm before renaming
+end
+
+@enum SparseStatus_t::Int32 begin  # `status` field of both opaque factor structs
+    SparseStatusOk = 0  # the only status for which the cache keeps / frees a handle
+    SparseStatusFailed = -1
+    SparseMatrixIsSingular = -2  # mapped to `SingularException` by `_libsparse_throw`
+    SparseInternalError = -3
+    SparseParameterError = -4  # mapped to `ArgumentError` by `_libsparse_throw`
+    SparseStatusReleased = -2147483647  # status of the `_null_*` placeholder handles
+end
+
+@enum SparseControl_t::UInt32 begin  # `control` field of both options structs
+    SparseDefaultControl = 0  # the only value defined and used in this file
+end
+
+# `SparseAttributes_t` is a packed bitfield in C; Julia can't express that
+# cleanly, so we model it as `Cuint` and assemble the bits ourselves.
+const att_type = Cuint
+const ATT_TRANSPOSE = att_type(1)
+const ATT_UPPER_TRIANGLE = att_type(0)  # same bits as ATT_ORDINARY (both zero)
+const ATT_LOWER_TRIANGLE = att_type(2)
+const ATT_ORDINARY = att_type(0)  # the value every view and placeholder in this file passes
+const ATT_TRIANGULAR = att_type(4)
+const ATT_UNIT_TRIANGULAR = att_type(8)
+const ATT_SYMMETRIC = att_type(12)  # numerically 4 + 8, i.e. both "kind" bits set
+
+struct SparseMatrixStructure  # passed by value to libSparse; layout presumably mirrors the C struct
+    rowCount::Cint
+    columnCount::Cint
+    columnStarts::Ptr{Clong}  # 0-based column start offsets (see `_csc_to_apple`)
+    rowIndices::Ptr{Cint}  # 0-based row indices
+    attributes::att_type
+    blockSize::UInt8  # every structure built in this file passes 1
+end
+
+struct SparseNumericFactorOptions  # by-value options argument of the numeric `SparseFactor` ccall
+    control::SparseControl_t
+    scalingMethod::SparseScaling_t
+    scaling::Ptr{Cvoid}  # the convenience constructor passes C_NULL
+    pivotTolerance::Float64  # convenience-constructor value: 0.01
+    zeroTolerance::Float64  # convenience-constructor value: eps(Cdouble) * 1e-4
+end
+
+# Convenience constructor: only the scaling method varies. Control is the
+# default, no user scaling array is supplied (`C_NULL`), and the tolerances
+# are fixed — per the original note they match Apple's SolveImplementation.h,
+# and `SparseScalingEquilibriationInf` is a ~4× speedup on the LU path.
+function SparseNumericFactorOptions(scaling::SparseScaling_t)
+    user_scaling = C_NULL
+    pivot_tolerance = 0.01
+    zero_tolerance = eps(Cdouble) * 1e-4
+    return SparseNumericFactorOptions(
+        SparseDefaultControl, scaling, user_scaling, pivot_tolerance, zero_tolerance,
+    )
+end
+
+struct SparseSymbolicFactorOptions  # by-value options argument of the symbolic `SparseFactor` ccall
+    control::SparseControl_t
+    orderMethod::SparseOrder_t
+    order::Ptr{Cvoid}  # C_NULL in the default constructor
+    ignoreRowsAndColumns::Ptr{Cvoid}  # C_NULL in the default constructor
+    malloc::Ptr{Cvoid}  # C function pointer; the default constructor passes libc malloc
+    free::Ptr{Cvoid}  # C function pointer; the default constructor passes libc free
+    reportError::Ptr{Cvoid}  # C callback taking a `Cstring` message
+end
+
+# Default symbolic options: default control and ordering, no user order or
+# ignore list, libc `malloc`/`free` as allocators, and a `reportError` callback
+# that rethrows libSparse's message as a Julia error (AppleAccelerate.jl's
+# pattern; avoids `@error`'s logger overhead when libSparse calls back from its
+# own threads). libc's allocator is passed explicitly because the C_NULL "use
+# Apple defaults" path is documented but unreliable from a non-Obj-C caller.
+function SparseSymbolicFactorOptions()
+    malloc_ptr = @cfunction(Libc.malloc, Ptr{Cvoid}, (Csize_t,))
+    free_ptr = @cfunction(Libc.free, Cvoid, (Ptr{Cvoid},))
+    report_ptr = @cfunction(text -> error(unsafe_string(text)), Cvoid, (Cstring,))
+    return SparseSymbolicFactorOptions(
+        SparseDefaultControl,
+        SparseOrderDefault,
+        C_NULL,  # order
+        C_NULL,  # ignoreRowsAndColumns
+        malloc_ptr, free_ptr, report_ptr,
+    )
+end
+
+struct DenseVector_t  # raw view of a dense Float64 vector (built by `_dense_vector`)
+    count::Cint
+    data::Ptr{Cdouble}
+end
+
+struct DenseMatrix_t  # raw view of a dense Float64 matrix (built by `_dense_matrix`)
+    rowCount::Cint
+    columnCount::Cint
+    columnStride::Cint  # elements between consecutive columns (`stride(B, 2)`)
+    attributes::att_type
+    data::Ptr{Cdouble}
+end
+
+struct SparseMatrix_t  # pattern (`structure`) plus a pointer to the stored Float64 values
+    structure::SparseMatrixStructure
+    data::Ptr{Cdouble}
+end
+
+struct SparseOpaqueSymbolicFactorization  # returned by value from the symbolic `SparseFactor` ccall
+    status::SparseStatus_t  # callers compare against `SparseStatusOk` before using or freeing
+    rowCount::Cint
+    columnCount::Cint
+    attributes::att_type
+    blockSize::UInt8
+    type::SparseFactorization_t
+    factorization::Ptr{Cvoid}  # opaque pointer; C_NULL in the `_null_symbolic` placeholder
+    workspaceSize_Float::Csize_t
+    workspaceSize_Double::Csize_t
+    factorSize_Float::Csize_t
+    factorSize_Double::Csize_t
+end
+
+# Inert symbolic handle used for `AAFactorCache.symbolic` before the first
+# analysis and after a release: status `SparseStatusReleased`, null
+# factorization pointer, all sizes zero — status-guarded cleanup skips it.
+function _null_symbolic()
+    rows = cols = 0
+    no_bytes = 0
+    return SparseOpaqueSymbolicFactorization(
+        SparseStatusReleased,
+        rows,
+        cols,
+        ATT_ORDINARY,
+        0,
+        SparseFactorizationLU,
+        C_NULL,
+        no_bytes, no_bytes, no_bytes, no_bytes,
+    )
+end
+
+struct SparseOpaqueFactorization_t  # returned by value from the numeric `SparseFactor` ccall
+    status::SparseStatus_t  # callers compare against `SparseStatusOk` before using or freeing
+    attributes::att_type
+    symbolicFactorization::SparseOpaqueSymbolicFactorization
+    userFactorStorage::Bool
+    numericFactorization::Ptr{Cvoid}  # opaque pointer; C_NULL in the `_null_factorization` placeholder
+    solveWorkspaceRequiredStatic::Csize_t  # bytes; combined in `_solve_workspace_bytes`
+    solveWorkspaceRequiredPerRHS::Csize_t  # bytes per right-hand side
+end
+
+# Inert numeric handle: status `SparseStatusReleased`, a null symbolic handle,
+# a null numeric pointer and zero workspace requirements. Used wherever the
+# cache has no live numeric factor.
+function _null_factorization()
+    released, user_storage = SparseStatusReleased, false
+    static_bytes = per_rhs_bytes = 0
+    return SparseOpaqueFactorization_t(
+        released, ATT_ORDINARY, _null_symbolic(), user_storage, C_NULL,
+        static_bytes, per_rhs_bytes,
+    )
+end
+
+# Wrap `B` as a libSparse `DenseMatrix_t` without copying: the dimensions and
+# the column stride are narrowed to `Cint` and the attributes are
+# `ATT_ORDINARY`. Unit stride in the first dimension is checked by the caller,
+# not here, and the raw pointer is only valid while the caller roots `B`.
+function _dense_matrix(B::StridedMatrix{Cdouble})
+    nrows = Cint(size(B, 1))
+    ncols = Cint(size(B, 2))
+    col_stride = Cint(stride(B, 2))
+    return DenseMatrix_t(
+        nrows, ncols, col_stride, ATT_ORDINARY, pointer(B),
+    )
+end
+
+# Wrap `b` as a libSparse `DenseVector_t` (length narrowed to `Cint`, no copy).
+_dense_vector(b::StridedVector{Cdouble}) =
+    DenseVector_t(Cint(length(b)), pointer(b))
+
+# --- ccalls -----------------------------------------------------------------
+#
+# Mangled symbol names come from the C++ ABI of libSparse. They are stable on
+# the system framework and match what AppleAccelerate.jl binds. If Apple
+# breaks them in a future macOS release, the failure will be loud (dlopen of
+# a missing symbol at first call), which is the behavior we want.
+
+# Symbolic-only factor: analyzes the sparsity pattern in `structure` and returns
+# an opaque symbolic factor (check `.status`) that can back many numeric factors.
+function _sparse_symbolic_factor(
+    ftype::SparseFactorization_t,
+    structure::SparseMatrixStructure,
+    sym_opts::SparseSymbolicFactorOptions,
+)::SparseOpaqueSymbolicFactorization
+    return @ccall LIBSPARSE._Z12SparseFactorh21SparseMatrixStructure27SparseSymbolicFactorOptions(
+        ftype::Cuint,  # NOTE(review): mangled `h` is `unsigned char`; passed widened as Cuint — confirm ABI
+        structure::SparseMatrixStructure,
+        sym_opts::SparseSymbolicFactorOptions,
+    )::SparseOpaqueSymbolicFactorization
+end
+
+# Numeric factor of `matrix` on top of an existing symbolic factor. The symbolic
+# handle is passed by value and stays usable; check `.status` on the result.
+function _sparse_numeric_factor(
+    symbolic::SparseOpaqueSymbolicFactorization,
+    matrix::SparseMatrix_t,
+    num_opts::SparseNumericFactorOptions,
+)::SparseOpaqueFactorization_t
+    return @ccall LIBSPARSE._Z12SparseFactor33SparseOpaqueSymbolicFactorization19SparseMatrix_Double26SparseNumericFactorOptions(
+        symbolic::SparseOpaqueSymbolicFactorization,
+        matrix::SparseMatrix_t,
+        num_opts::SparseNumericFactorOptions,
+    )::SparseOpaqueFactorization_t
+end
+
+# In-place solve `A · X = B`, where B is a column-major dense matrix and X
+# overwrites B's storage. No workspace is supplied (see the `_ws!` variants).
+function _sparse_solve_matrix!(
+    factor::SparseOpaqueFactorization_t,
+    B::DenseMatrix_t,
+)
+    @ccall LIBSPARSE._Z11SparseSolve32SparseOpaqueFactorization_Double18DenseMatrix_Double(
+        factor::SparseOpaqueFactorization_t,
+        B::DenseMatrix_t,
+    )::Cvoid
+    return nothing
+end
+
+function _sparse_solve_vector!(  # single right-hand-side counterpart of `_sparse_solve_matrix!`
+    factor::SparseOpaqueFactorization_t,
+    b::DenseVector_t,
+)
+    @ccall LIBSPARSE._Z11SparseSolve32SparseOpaqueFactorization_Double18DenseVector_Double(
+        factor::SparseOpaqueFactorization_t,
+        b::DenseVector_t,
+    )::Cvoid
+    return nothing
+end
+
+# Workspace-aware solve overloads (libSparse, macOS 10.13+). The factor
+# exposes `solveWorkspaceRequiredStatic + nrhs * solveWorkspaceRequiredPerRHS`
+# bytes of scratch it needs per call; supplying a reusable buffer eliminates
+# the implicit malloc/free that the no-workspace variants perform internally.
+# On AA_LU at 10k nodes that buffer is ~234 KiB / RHS — substantial per-call
+# churn when the no-workspace path is used in a tight row loop. The workspace
+# pointer must be 16-byte aligned; Julia's `Vector{Float64}` data satisfies
+# this for any non-trivial size.
+function _sparse_solve_matrix_ws!(
+    factor::SparseOpaqueFactorization_t,
+    B::DenseMatrix_t,
+    workspace::Ptr{Cvoid},  # caller-owned scratch, e.g. from `_ensure_solve_workspace!`
+)
+    @ccall LIBSPARSE._Z11SparseSolve32SparseOpaqueFactorization_Double18DenseMatrix_DoublePv(
+        factor::SparseOpaqueFactorization_t,
+        B::DenseMatrix_t,
+        workspace::Ptr{Cvoid},
+    )::Cvoid
+    return nothing
+end
+
+function _sparse_solve_vector_ws!(  # single right-hand-side counterpart of `_sparse_solve_matrix_ws!`
+    factor::SparseOpaqueFactorization_t,
+    b::DenseVector_t,
+    workspace::Ptr{Cvoid},  # caller-owned scratch, e.g. from `_ensure_solve_workspace!`
+)
+    @ccall LIBSPARSE._Z11SparseSolve32SparseOpaqueFactorization_Double18DenseVector_DoublePv(
+        factor::SparseOpaqueFactorization_t,
+        b::DenseVector_t,
+        workspace::Ptr{Cvoid},
+    )::Cvoid
+    return nothing
+end
+
+# Bytes of workspace libSparse needs to solve `nrhs` right-hand sides with
+# `factor`: the factor's static requirement plus `nrhs` times its per-RHS
+# requirement, all converted to `Int`.
+@inline function _solve_workspace_bytes(factor::SparseOpaqueFactorization_t, nrhs::Integer)
+    fixed = Int(factor.solveWorkspaceRequiredStatic)
+    scaled = Int(nrhs) * Int(factor.solveWorkspaceRequiredPerRHS)
+    return fixed + scaled
+end
+
+# `Y = A · X`, dense multi-column. `Y` must be allocated to (rowCount, ncols)
+# by the caller. libSparse overwrites — does not accumulate.
+function _sparse_multiply_matrix!(
+    A::SparseMatrix_t,
+    X::DenseMatrix_t,
+    Y::DenseMatrix_t,
+)
+    @ccall LIBSPARSE._Z14SparseMultiply19SparseMatrix_Double18DenseMatrix_DoubleS0_(
+        A::SparseMatrix_t,
+        X::DenseMatrix_t,
+        Y::DenseMatrix_t,
+    )::Cvoid
+    return nothing
+end
+
+function _sparse_multiply_vector!(  # `y = A · x`, single-vector counterpart of `_sparse_multiply_matrix!`
+    A::SparseMatrix_t,
+    x::DenseVector_t,
+    y::DenseVector_t,
+)
+    @ccall LIBSPARSE._Z14SparseMultiply19SparseMatrix_Double18DenseVector_DoubleS0_(
+        A::SparseMatrix_t,
+        x::DenseVector_t,
+        y::DenseVector_t,
+    )::Cvoid
+    return nothing
+end
+
+# Frees the libSparse-side numeric / symbolic storage attached to an opaque
+# factor. The handle is passed by value, so the caller's copy keeps its old
+# status: replace it with a `_null_*` placeholder afterwards, as `_free_handles!` does.
+function _sparse_cleanup_factor!(factor::SparseOpaqueFactorization_t)
+    @ccall LIBSPARSE._Z13SparseCleanup32SparseOpaqueFactorization_Double(
+        factor::SparseOpaqueFactorization_t,
+    )::Cvoid
+    return nothing
+end
+
+function _sparse_cleanup_symbolic!(symbolic::SparseOpaqueSymbolicFactorization)  # symbolic-handle counterpart
+    @ccall LIBSPARSE._Z13SparseCleanup33SparseOpaqueSymbolicFactorization(
+        symbolic::SparseOpaqueSymbolicFactorization,
+    )::Cvoid
+    return nothing
+end
+
+# Turn a non-OK libSparse status into a Julia exception for operation `op`:
+# singular → `SingularException(0)`, parameter error → `ArgumentError`, else `error`.
+function _libsparse_throw(status::SparseStatus_t, op::AbstractString)
+    if status == SparseMatrixIsSingular
+        throw(LinearAlgebra.SingularException(0))
+    elseif status == SparseParameterError
+        throw(ArgumentError("libSparse $(op) failed: parameter error"))
+    elseif status == SparseInternalError
+        error("libSparse $(op) failed: internal error")
+    elseif status == SparseStatusFailed
+        error("libSparse $(op) failed")
+    end
+    return error("libSparse $(op) failed: status=$(Int(status))")
+end
diff --git a/src/AccelerateWrapper/solve_dense.jl b/src/AccelerateWrapper/solve_dense.jl
new file mode 100644
index 000000000..62c978743
--- /dev/null
+++ b/src/AccelerateWrapper/solve_dense.jl
@@ -0,0 +1,44 @@
+"""
+    solve!(cache, B) -> B
+
+Solve `A · X = B` in place (the solution overwrites `B`), dispatching on `B`:
+
+  - `B::StridedMatrix{Cdouble}`: all right-hand sides in a single libSparse
+    call; `size(B, 1)` must equal `cache.n`. A zero-column `B` returns at once.
+  - `B::StridedVector{Cdouble}`: single right-hand side; `length(B)` must
+    equal `cache.n`.
+
+Throws `DimensionMismatch` on a size mismatch, `ArgumentError` if `B` lacks
+unit stride in its first dimension, and an error if the cache is not factored.
+"""
+function solve!(cache::AAFactorCache, B::StridedMatrix{Cdouble})
+    is_factored(cache) || error("AAFactorCache: not factored yet.")
+    n = cache.n
+    size(B, 1) == n ||
+        throw(DimensionMismatch("size(B, 1) = $(size(B, 1)), cache n = $(n)"))
+    stride(B, 1) == 1 ||
+        throw(ArgumentError("B must have unit stride in the first dimension."))
+    size(B, 2) == 0 && return B
+    # Caller-supplied workspace avoids a malloc/free inside libSparse per call.
+    # `B` is preserved too: the dense view is a raw pointer into its storage.
+    ws = _ensure_solve_workspace!(cache, size(B, 2))
+    GC.@preserve cache B _sparse_solve_matrix_ws!(cache.numeric, _dense_matrix(B), ws)
+    return B
+end
+
+# Single right-hand-side `solve!`; `b` is preserved alongside `cache` because
+# libSparse receives a raw pointer into `b`'s storage.
+function solve!(cache::AAFactorCache, b::StridedVector{Cdouble})
+    is_factored(cache) || error("AAFactorCache: not factored yet.")
+    length(b) == cache.n ||
+        throw(DimensionMismatch("length(b) = $(length(b)), cache n = $(cache.n)"))
+    stride(b, 1) == 1 || throw(ArgumentError("b must have unit stride."))
+    GC.@preserve cache b _sparse_solve_vector_ws!(cache.numeric, _dense_vector(b), _ensure_solve_workspace!(cache, 1))
+    return b
+end
+
+"""
+    \\(cache::AAFactorCache, B) -> X
+
+Allocating solve, mirroring `LinearAlgebra.Factorization`'s API.
+"""
+Base.:\(cache::AAFactorCache, B::StridedVecOrMat{Cdouble}) = solve!(cache, copy(B))  # solves on a copy; `B` itself is not overwritten
diff --git a/src/AccelerateWrapper/solve_sparse_rhs.jl b/src/AccelerateWrapper/solve_sparse_rhs.jl
new file mode 100644
index 000000000..e2ba840f9
--- /dev/null
+++ b/src/AccelerateWrapper/solve_sparse_rhs.jl
@@ -0,0 +1,102 @@
+const SPARSE_RHS_DEFAULT_BLOCK = 64  # default `block` keyword: RHS columns packed per libSparse solve call
+
+"""
+    solve_sparse!(cache, B, out; block=$(SPARSE_RHS_DEFAULT_BLOCK)) -> out
+
+Solve `A · X = B` for a `SparseMatrixCSC` right-hand side, writing the
+result into `out`. Empty columns of `B` are not solved — `out`'s
+corresponding columns are zeroed. Non-empty columns within each chunk of
+`block` consecutive RHS columns are packed into a dense scratch and solved
+in a single libSparse call.
+
+The `block` chunk size bounds the working set so that processing an
+`n × nrhs` sparse RHS requires only `O(n · min(block, nrhs))` extra memory.
+The cache reuses its packing buffer across calls; warm calls allocate
+nothing in the solver.
+
+Not thread-safe (mutates per-cache scratch).
+"""
+function solve_sparse!(
+    cache::AAFactorCache,
+    B::SparseMatrixCSC{<:Number, Int},
+    out::AbstractMatrix{Cdouble};
+    block::Int = SPARSE_RHS_DEFAULT_BLOCK,
+)
+    is_factored(cache) || error("AAFactorCache: not factored yet.")
+    block >= 1 || throw(ArgumentError("block must be >= 1; got $(block)"))
+    n = cache.n
+    size(B, 1) == n || throw(DimensionMismatch(
+        "size(B, 1) = $(size(B, 1)), cache n = $(n)",
+    ))
+    size(out, 1) == n && size(out, 2) == size(B, 2) || throw(DimensionMismatch(
+        "out has size $(size(out)); expected $((n, size(B, 2))).",
+    ))
+
+    nb = size(B, 2)
+    nb == 0 && return out
+    fill!(out, zero(Cdouble))
+
+    Browval = rowvals(B)
+    Bnzval = nonzeros(B)
+
+    # No chunk can hold more than `nb` columns, so never size the buffers for
+    # more: a narrow RHS must not force an `n × block` scratch allocation.
+    chunk = min(block, nb)
+    _ensure_scratch!(cache, chunk)
+    # Workspace sized once for the largest chunk; it only grows, never churns.
+    ws = _ensure_solve_workspace!(cache, chunk)
+    scratch = cache.scratch
+    col_map = cache.col_map
+
+    j_start = 1
+    @inbounds while j_start <= nb
+        j_end = min(j_start + chunk - 1, nb)
+
+        npack = 0
+        for j in j_start:j_end
+            rng = nzrange(B, j)
+            isempty(rng) && continue
+            npack += 1
+            col_map[npack] = j
+            fill!(view(scratch, :, npack), zero(Cdouble))
+            for p in rng
+                scratch[Browval[p], npack] = Bnzval[p]
+            end
+        end
+
+        if npack > 0
+            # Build a DenseMatrix view over scratch[:, 1:npack]. Column stride
+            # is `size(scratch, 1) = n` because scratch is column-major and
+            # full-height.
+            dm = DenseMatrix_t(
+                Cint(n),
+                Cint(npack),
+                Cint(size(scratch, 1)),
+                ATT_ORDINARY,
+                pointer(scratch),
+            )
+            GC.@preserve cache scratch _sparse_solve_matrix_ws!(cache.numeric, dm, ws)
+
+            for k in 1:npack
+                copyto!(view(out, :, col_map[k]), view(scratch, :, k))
+            end
+        end
+
+        j_start = j_end + 1
+    end
+    return out
+end
+
+"""
+    solve_sparse(cache, B; block=SPARSE_RHS_DEFAULT_BLOCK) -> Matrix{Cdouble}
+
+Allocate a `cache.n × size(B, 2)` result and fill it via `solve_sparse!`.
+"""
+function solve_sparse(
+    cache::AAFactorCache,
+    B::SparseMatrixCSC{<:Number, Int};
+    block::Int = SPARSE_RHS_DEFAULT_BLOCK,
+)
+    X = Matrix{Cdouble}(undef, cache.n, size(B, 2))
+    return solve_sparse!(cache, B, X; block = block)
+end
diff --git a/src/AccelerateWrapper/spmm.jl b/src/AccelerateWrapper/spmm.jl
new file mode 100644
index 000000000..e1c5375ca
--- /dev/null
+++ b/src/AccelerateWrapper/spmm.jl
@@ -0,0 +1,105 @@
+# SparseMultiply bindings. These do not require a factorization cache; they
+# take a `SparseMatrixCSC{Float64, Int}` directly and build the Apple-side
+# `SparseMatrix_t` view at the ccall boundary. Per call this allocates two
+# transient arrays (0-based `Clong[]` colptr, narrowed `Cint[]` rowval); if
+# a future hot loop needs allocation-free SpMM, lift these into a dedicated
+# `AASparseView` cache type.
+
+"""
+    aa_spmm!(Y, A, X) -> Y
+
+Overwrite `Y` with `A · X` using libSparse's `SparseMultiply` (no accumulation
+into `Y`). `X` and `Y` need unit stride in dimension 1; throws
+`DimensionMismatch`/`ArgumentError` otherwise. Returns early if `X` has no columns.
+"""
+function aa_spmm!(
+    Y::StridedMatrix{Cdouble},
+    A::SparseMatrixCSC{Float64, Int},
+    X::StridedMatrix{Cdouble},
+)
+    m, k = size(A)
+    if k != size(X, 1)
+        throw(
+            DimensionMismatch(
+                "A is $(size(A)), X is $(size(X)); inner dimensions must match.",
+            ),
+        )
+    end
+    if !(size(Y, 1) == m && size(Y, 2) == size(X, 2))
+        throw(
+            DimensionMismatch(
+                "Y has size $(size(Y)); expected $((size(A, 1), size(X, 2))).",
+            ),
+        )
+    end
+    stride(X, 1) == 1 || throw(ArgumentError("X must have unit stride in dim 1."))
+    stride(Y, 1) == 1 || throw(ArgumentError("Y must have unit stride in dim 1."))
+    size(X, 2) == 0 && return Y
+    # 0-based copies of the pattern; `A`'s values are passed without copying.
+    col_starts, row_indices = _csc_to_apple(A)
+    structure = SparseMatrixStructure(
+        Cint(m), Cint(k),
+        pointer(col_starts), pointer(row_indices),
+        ATT_ORDINARY, UInt8(1),
+    )
+    matrix = SparseMatrix_t(structure, pointer(nonzeros(A)))
+    # Everything the raw pointers refer to must stay rooted across the ccall.
+    GC.@preserve col_starts row_indices A X Y begin
+        _sparse_multiply_matrix!(matrix, _dense_matrix(X), _dense_matrix(Y))
+    end
+    return Y
+end
+
+"""
+    aa_spmv!(y, A, x) -> y
+
+Overwrite `y` with `A · x` via libSparse; `x` and `y` must have unit stride.
+"""
+function aa_spmv!(
+    y::StridedVector{Cdouble},
+    A::SparseMatrixCSC{Float64, Int},
+    x::StridedVector{Cdouble},
+)
+    m, k = size(A)
+    if k != length(x)
+        throw(DimensionMismatch(
+            "A is $(size(A)), length(x) = $(length(x)).",
+        ))
+    end
+    if length(y) != m
+        throw(DimensionMismatch(
+            "length(y) = $(length(y)); expected $(size(A, 1)).",
+        ))
+    end
+    stride(x, 1) == 1 || throw(ArgumentError("x must have unit stride."))
+    stride(y, 1) == 1 || throw(ArgumentError("y must have unit stride."))
+    # 0-based copies of the pattern; `A`'s values are passed without copying.
+    col_starts, row_indices = _csc_to_apple(A)
+    structure = SparseMatrixStructure(
+        Cint(m), Cint(k),
+        pointer(col_starts), pointer(row_indices),
+        ATT_ORDINARY, UInt8(1),
+    )
+    matrix = SparseMatrix_t(structure, pointer(nonzeros(A)))
+    # Everything the raw pointers refer to must stay rooted across the ccall.
+    GC.@preserve col_starts row_indices A x y begin
+        _sparse_multiply_vector!(matrix, _dense_vector(x), _dense_vector(y))
+    end
+    return y
+end
+
+# Build the libSparse form of `A`'s pattern: fresh 0-based copies of the
+# 1-based `colptr` (as `Clong`) and `rowval` (narrowed to `Cint`). Returns
+# `(column_starts, row_indices)`; `A` itself is not modified. A row index that
+# does not fit in `Cint` raises `InexactError`.
+function _csc_to_apple(A::SparseMatrixCSC{Float64, Int})
+    colptr = getcolptr(A)
+    rows = rowvals(A)
+    column_starts = similar(colptr, Clong)
+    row_indices = similar(rows, Cint)
+    # Shift 1-based → 0-based; the destination arrays' element type performs
+    # the conversion as each value is stored.
+    map!(p -> p - 1, column_starts, colptr)
+    map!(r -> r - 1, row_indices, rows)
+    return column_starts, row_indices
+end
diff --git a/src/BA_ABA_matrices.jl b/src/BA_ABA_matrices.jl
index 0cec17048..7902317a3 100644
--- a/src/BA_ABA_matrices.jl
+++ b/src/BA_ABA_matrices.jl
@@ -179,7 +179,7 @@ power flow analysis, sensitivity calculations, and linear power system studies.
 struct ABA_Matrix{
     Ax <: NTuple{2, Vector},
     L <: NTuple{2, Dict},
-    F <: Union{Nothing, KLULinSolveCache{Float64}},
+    F <: Union{Nothing, KLULinSolveCache{Float64, Int64}},
 } <: PowerNetworkMatrix{Float64}
     data::SparseArrays.SparseMatrixCSC{Float64, Int}
     axes::Ax
@@ -358,7 +358,7 @@ Check if an ABA_Matrix has been factorized (i.e., contains LU factorization matr
 """
 is_factorized(ABA::ABA_Matrix{Ax, L, Nothing}) where {Ax, L <: NTuple{2, Dict}} = false
 is_factorized(
-    ABA::ABA_Matrix{Ax, L, <:KLULinSolveCache{Float64}},
+    ABA::ABA_Matrix{Ax, L, <:KLULinSolveCache{Float64, Int64}},
 ) where {Ax, L <: NTuple{2, Dict}} = true
 
 # get_index functions: BA_Matrix stores the transposed matrix, thus get index
diff --git a/src/KLUWrapper/KLUWrapper.jl b/src/KLUWrapper/KLUWrapper.jl
index 5e996dfdf..7d50b5914 100644
--- a/src/KLUWrapper/KLUWrapper.jl
+++ b/src/KLUWrapper/KLUWrapper.jl
@@ -74,8 +74,12 @@ export KLULinSolveCache,
     tsolve!,
     solve_sparse!,
     solve_sparse,
+    sort_factors!,
+    condest!,
+    rcond!,
     n_valid,
-    is_factored
+    is_factored,
+    get_reuse_symbolic
 
 include("klu_jll_bindings.jl")
 include("klu_cache.jl")
diff --git a/src/KLUWrapper/klu_cache.jl b/src/KLUWrapper/klu_cache.jl
index 89d519980..6318e3050 100644
--- a/src/KLUWrapper/klu_cache.jl
+++ b/src/KLUWrapper/klu_cache.jl
@@ -5,37 +5,55 @@ A cached KLU linear solver designed for repeated solves against the same
 sparse matrix structure. `numeric_refactor!` and `solve!` allocate nothing
 once the cache is built.
 
-The type parameter `Tv ∈ {Float64, ComplexF64}` selects the real/complex KLU
-path. Indices are always `Int64` (SuiteSparse_long).
+Type parameters:
+- `Tv ∈ {Float64, ComplexF64}` selects the real/complex KLU path
+  (`klu_*_factor`/`klu_z*_factor`).
+- `Ti ∈ {Int32, Int64}` selects the index-type entry-point family
+  (`klu_*` for `int`/Int32, `klu_l_*` for `SuiteSparse_long`/Int64).
+  The cache's `colptr`/`rowval`/`col_map` are stored in this type.
 
 `reuse_symbolic` controls whether `symbolic_refactor!` keeps the analysis;
 `check_pattern` adds a structural-equality check on refactor calls and is
 only consulted when reusing.
 """
-mutable struct KLULinSolveCache{Tv <: Union{Float64, ComplexF64}}
-    colptr::Vector{Int64}
-    rowval::Vector{Int64}
+mutable struct KLULinSolveCache{
+    Tv <: Union{Float64, ComplexF64},
+    Ti <: Union{Int32, Int64},
+}
+    colptr::Vector{Ti}
+    rowval::Vector{Ti}
     # Copy of the matrix values used in the most recent numeric factorization.
     # Lets `_recover_factorization!` rebuild a corrupted numeric handle without
     # the caller having to re-supply `A`. Empty before the first factor call.
     nzval::Vector{Tv}
-    common::Base.RefValue{KluLCommon}
-    symbolic::SymbolicPtr
-    numeric::NumericPtr
+    # `Base.RefValue{KluLCommon}` for Int64 or `Base.RefValue{KluCommon}` for
+    # Int32 — we keep the field untyped here because Julia's untyped
+    # `Base.RefValue` lookup is fast enough (the dispatch helpers recover the
+    # concrete type at each callsite via `_common_type(Ti)`) and avoiding a
+    # third type parameter keeps the surface tidy.
+    common::Base.RefValue
+    # Opaque Symbolic/Numeric pointers. `Ptr{Cvoid}` is safe: each callsite
+    # threads through the typed `SymbolicPtr` / `SymbolicPtr32` (resp.
+    # Numeric) at the ccall boundary; on the Julia side the values are
+    # treated as black boxes by every consumer.
+    symbolic::Ptr{Cvoid}
+    numeric::Ptr{Cvoid}
     reuse_symbolic::Bool
     check_pattern::Bool
     # Bounded reusable scratch for `solve_sparse!`. Lazy-grown on first call so
     # the wrapper's working set stays O(n*block) instead of O(n*nrhs); see
     # `solve_sparse_rhs.jl`.
     scratch::Matrix{Tv}
-    col_map::Vector{Int64}
+    col_map::Vector{Ti}
 end
 
-@inline _dim(cache::KLULinSolveCache) = Int64(length(cache.colptr) - 1)
+@inline _dim(cache::KLULinSolveCache{Tv, Ti}) where {Tv, Ti} =
+    Ti(length(cache.colptr) - 1)
 
-Base.size(cache::KLULinSolveCache) = (n = _dim(cache); (n, n))
-Base.size(cache::KLULinSolveCache, d::Integer) = d <= 2 ? _dim(cache) : 1
-Base.eltype(::Type{KLULinSolveCache{Tv}}) where {Tv} = Tv
+Base.size(cache::KLULinSolveCache) = (n = Int(_dim(cache)); (n, n))
+Base.size(cache::KLULinSolveCache, d::Integer) =
+    d <= 2 ? Int(_dim(cache)) : 1
+Base.eltype(::Type{KLULinSolveCache{Tv, Ti}}) where {Tv, Ti} = Tv
 get_reuse_symbolic(cache::KLULinSolveCache) = cache.reuse_symbolic
 
 """
@@ -53,37 +71,363 @@ is_factored(cache::KLULinSolveCache) =
 # truthy form. Kept around for tests that want a uniform numeric reading.
 n_valid(cache::KLULinSolveCache) = is_factored(cache) ? 1 : 0
 
-@inline _factor_call(::Type{Float64}, ap, ai, ax, sym, common) =
-    klu_l_factor(ap, ai, ax, sym, common)
-@inline _factor_call(::Type{ComplexF64}, ap, ai, ax, sym, common) =
-    klu_zl_factor(ap, ai, ax, sym, common)
+# ---------------------------------------------------------------------------
+# Type-paired dispatch helpers — (Tv, Ti) → libklu entry point
+# ---------------------------------------------------------------------------
+
+# Map Ti to its concrete `klu_common` struct type.
+@inline _common_type(::Type{Int32}) = KluCommon
+@inline _common_type(::Type{Int64}) = KluLCommon
+
+# `klu_defaults` initializer.
+@inline _defaults!(::Type{Int32}, common::Ref) = klu_defaults!(common)
+@inline _defaults!(::Type{Int64}, common::Ref) = klu_l_defaults!(common)
+
+# `klu_analyze` returns the symbolic handle. `n` is converted to the index
+# width (`Cint` for Int32, `Int64` for Int64) so the C argument width matches.
+@inline _analyze_call(::Type{Int32}, n, ap, ai, common) =
+    klu_analyze(Cint(n), ap, ai, common)
+@inline _analyze_call(::Type{Int64}, n, ap, ai, common) =
+    klu_l_analyze(Int64(n), ap, ai, common)
+
+# `klu_free_symbolic` — takes the opaque pointer ref and the common ref.
+# `sym_ref` is a `Ref{Ptr{Cvoid}}` on the Julia side; its value is copied
+# into a typed `SymbolicPtr` / `SymbolicPtr32` ref so the binding's pointer
+# type matches, and whatever libklu leaves there is written back afterwards.
+@inline function _free_symbolic!(::Type{Int32}, sym_ref::Ref{Ptr{Cvoid}}, common::Ref)
+    typed = Ref(reinterpret(SymbolicPtr32, sym_ref[]))
+    klu_free_symbolic!(typed, common)
+    sym_ref[] = reinterpret(Ptr{Cvoid}, typed[])
+    return nothing
+end
+@inline function _free_symbolic!(::Type{Int64}, sym_ref::Ref{Ptr{Cvoid}}, common::Ref)
+    typed = Ref(reinterpret(SymbolicPtr, sym_ref[]))
+    klu_l_free_symbolic!(typed, common)
+    sym_ref[] = reinterpret(Ptr{Cvoid}, typed[])
+    return nothing
+end
 
-@inline _refactor_call(::Type{Float64}, ap, ai, ax, sym, num, common) =
-    klu_l_refactor(ap, ai, ax, sym, num, common)
-@inline _refactor_call(::Type{ComplexF64}, ap, ai, ax, sym, num, common) =
-    klu_zl_refactor(ap, ai, ax, sym, num, common)
+# `klu_factor` — returns the numeric handle as a typed pointer; we
+# reinterpret to `Ptr{Cvoid}` for storage. Dispatch on both Tv and Ti.
+@inline function _factor_call(::Type{Float64}, ::Type{Int32}, ap, ai, ax, sym, common)
+    return reinterpret(
+        Ptr{Cvoid},
+        klu_factor(ap, ai, ax, reinterpret(SymbolicPtr32, sym), common),
+    )
+end
+@inline function _factor_call(::Type{Float64}, ::Type{Int64}, ap, ai, ax, sym, common)
+    return reinterpret(
+        Ptr{Cvoid},
+        klu_l_factor(ap, ai, ax, reinterpret(SymbolicPtr, sym), common),
+    )
+end
+@inline function _factor_call(
+    ::Type{ComplexF64},
+    ::Type{Int32},
+    ap,
+    ai,
+    ax,
+    sym,
+    common,
+)
+    return reinterpret(
+        Ptr{Cvoid},
+        klu_z_factor(ap, ai, ax, reinterpret(SymbolicPtr32, sym), common),
+    )
+end
+@inline function _factor_call(
+    ::Type{ComplexF64},
+    ::Type{Int64},
+    ap,
+    ai,
+    ax,
+    sym,
+    common,
+)
+    return reinterpret(
+        Ptr{Cvoid},
+        klu_zl_factor(ap, ai, ax, reinterpret(SymbolicPtr, sym), common),
+    )
+end
 
-@inline _solve_call(::Type{Float64}, sym, num, n, nrhs, b, common) =
-    klu_l_solve(sym, num, n, nrhs, b, common)
-@inline _solve_call(::Type{ComplexF64}, sym, num, n, nrhs, b, common) =
-    klu_zl_solve(sym, num, n, nrhs, b, common)
+@inline function _refactor_call(
+    ::Type{Float64},
+    ::Type{Int32},
+    ap,
+    ai,
+    ax,
+    sym,
+    num,
+    common,
+)
+    return klu_refactor(
+        ap, ai, ax,
+        reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num),
+        common,
+    )
+end
+@inline function _refactor_call(
+    ::Type{Float64},
+    ::Type{Int64},
+    ap,
+    ai,
+    ax,
+    sym,
+    num,
+    common,
+)
+    return klu_l_refactor(
+        ap, ai, ax,
+        reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num),
+        common,
+    )
+end
+@inline function _refactor_call(
+    ::Type{ComplexF64},
+    ::Type{Int32},
+    ap,
+    ai,
+    ax,
+    sym,
+    num,
+    common,
+)
+    return klu_z_refactor(
+        ap, ai, ax,
+        reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num),
+        common,
+    )
+end
+@inline function _refactor_call(
+    ::Type{ComplexF64},
+    ::Type{Int64},
+    ap,
+    ai,
+    ax,
+    sym,
+    num,
+    common,
+)
+    return klu_zl_refactor(
+        ap, ai, ax,
+        reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num),
+        common,
+    )
+end
 
-@inline _tsolve_call(::Type{Float64}, sym, num, n, nrhs, b, common; conjugate = false) =
-    klu_l_tsolve(sym, num, n, nrhs, b, common)
-@inline _tsolve_call(::Type{ComplexF64}, sym, num, n, nrhs, b, common; conjugate = false) =
-    klu_zl_tsolve(sym, num, n, nrhs, b, Cint(conjugate), common)
+@inline function _solve_call(
+    ::Type{Float64},
+    ::Type{Int32},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common,
+)
+    return klu_solve(
+        reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num),
+        Cint(n), Cint(nrhs), b, common,
+    )
+end
+@inline function _solve_call(
+    ::Type{Float64},
+    ::Type{Int64},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common,
+)
+    return klu_l_solve(
+        reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num),
+        Int64(n), Int64(nrhs), b, common,
+    )
+end
+@inline function _solve_call(
+    ::Type{ComplexF64},
+    ::Type{Int32},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common,
+)
+    return klu_z_solve(
+        reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num),
+        Cint(n), Cint(nrhs), b, common,
+    )
+end
+@inline function _solve_call(
+    ::Type{ComplexF64},
+    ::Type{Int64},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common,
+)
+    return klu_zl_solve(
+        reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num),
+        Int64(n), Int64(nrhs), b, common,
+    )
+end
 
-@inline _free_numeric!(::Type{Float64}, num_ref, common) =
-    klu_l_free_numeric!(num_ref, common)
-@inline _free_numeric!(::Type{ComplexF64}, num_ref, common) =
-    klu_zl_free_numeric!(num_ref, common)
+@inline function _tsolve_call(
+    ::Type{Float64},
+    ::Type{Int32},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common;
+    conjugate::Bool = false,
+)
+    return klu_tsolve(
+        reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num),
+        Cint(n), Cint(nrhs), b, common,
+    )
+end
+@inline function _tsolve_call(
+    ::Type{Float64},
+    ::Type{Int64},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common;
+    conjugate::Bool = false,
+)
+    return klu_l_tsolve(
+        reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num),
+        Int64(n), Int64(nrhs), b, common,
+    )
+end
+@inline function _tsolve_call(
+    ::Type{ComplexF64},
+    ::Type{Int32},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common;
+    conjugate::Bool = false,
+)
+    return klu_z_tsolve(
+        reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num),
+        Cint(n), Cint(nrhs), b, Cint(conjugate), common,
+    )
+end
+@inline function _tsolve_call(
+    ::Type{ComplexF64},
+    ::Type{Int64},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common;
+    conjugate::Bool = false,
+)
+    return klu_zl_tsolve(
+        reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num),
+        Int64(n), Int64(nrhs), b, Cint(conjugate), common,
+    )
+end
+
+@inline function _free_numeric!(
+    ::Type{Float64},
+    ::Type{Int32},
+    num_ref::Ref{Ptr{Cvoid}},
+    common::Ref,
+)
+    typed = Ref(reinterpret(NumericPtr32, num_ref[]))
+    klu_free_numeric!(typed, common)
+    num_ref[] = reinterpret(Ptr{Cvoid}, typed[])
+    return nothing
+end
+@inline function _free_numeric!(
+    ::Type{Float64},
+    ::Type{Int64},
+    num_ref::Ref{Ptr{Cvoid}},
+    common::Ref,
+)
+    typed = Ref(reinterpret(NumericPtr, num_ref[]))
+    klu_l_free_numeric!(typed, common)
+    num_ref[] = reinterpret(Ptr{Cvoid}, typed[])
+    return nothing
+end
+@inline function _free_numeric!(
+    ::Type{ComplexF64},
+    ::Type{Int32},
+    num_ref::Ref{Ptr{Cvoid}},
+    common::Ref,
+)
+    typed = Ref(reinterpret(NumericPtr32, num_ref[]))
+    klu_z_free_numeric!(typed, common)
+    num_ref[] = reinterpret(Ptr{Cvoid}, typed[])
+    return nothing
+end
+@inline function _free_numeric!(
+    ::Type{ComplexF64},
+    ::Type{Int64},
+    num_ref::Ref{Ptr{Cvoid}},
+    common::Ref,
+)
+    typed = Ref(reinterpret(NumericPtr, num_ref[]))
+    klu_zl_free_numeric!(typed, common)
+    num_ref[] = reinterpret(Ptr{Cvoid}, typed[])
+    return nothing
+end
+
+# Performance-knob dispatchers — Float64 only. libklu exposes these in the
+# ComplexF64 build too; we'll add bindings if a consumer asks.
+@inline _sort_call(::Type{Int32}, sym, num, common) =
+    klu_sort(reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), common)
+@inline _sort_call(::Type{Int64}, sym, num, common) =
+    klu_l_sort(reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), common)
+
+@inline _condest_call(::Type{Int32}, ap, ax, sym, num, common) =
+    klu_condest(
+        ap,
+        ax,
+        reinterpret(SymbolicPtr32, sym),
+        reinterpret(NumericPtr32, num),
+        common,
+    )
+@inline _condest_call(::Type{Int64}, ap, ax, sym, num, common) =
+    klu_l_condest(
+        ap,
+        ax,
+        reinterpret(SymbolicPtr, sym),
+        reinterpret(NumericPtr, num),
+        common,
+    )
+
+@inline _rcond_call(::Type{Int32}, sym, num, common) =
+    klu_rcond(reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), common)
+@inline _rcond_call(::Type{Int64}, sym, num, common) =
+    klu_l_rcond(reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), common)
+
+# ---------------------------------------------------------------------------
+# Constructor
+# ---------------------------------------------------------------------------
 
 """
     KLULinSolveCache(A; reuse_symbolic=true, check_pattern=true)
 
-Build a cache for the sparse matrix `A`. Allocates structural arrays and
-runs `klu_l_defaults`, but does **not** factorize. Call `full_factor!`
-(or `symbolic_factor!` followed by `numeric_refactor!`) before `solve!`.
+Build a cache for the sparse matrix `A`. The cache's index type is taken
+from `A`: `SparseMatrixCSC{Tv, Int32}` ⇒ `KLULinSolveCache{Tv, Int32}`,
+`SparseMatrixCSC{Tv, Int64}` ⇒ `KLULinSolveCache{Tv, Int64}`. Allocates
+structural arrays and runs the corresponding `klu_defaults`/`klu_l_defaults`
+initializer, but does **not** factorize. Call `full_factor!` (or
+`symbolic_factor!` followed by `numeric_refactor!`) before `solve!`.
 
 A finalizer frees libklu handles on GC; call `Base.finalize(cache)` to
 release them eagerly. Releasing the handles leaves Julia-side state intact,
@@ -91,34 +435,30 @@ so the cache can be re-factorized via `symbolic_factor!`/`numeric_refactor!`
 or `full_factor!`.
 """
 function KLULinSolveCache(
-    A::SparseMatrixCSC{Tv, Int};
+    A::SparseMatrixCSC{Tv, Ti};
     reuse_symbolic::Bool = true,
     check_pattern::Bool = true,
-) where {Tv <: Union{Float64, ComplexF64}}
-    Int === Int64 || error(
-        "KLULinSolveCache requires a 64-bit Julia build (SuiteSparse_long " *
-        "bindings need Int64 indices). Got Int = $(Int).",
-    )
+) where {Tv <: Union{Float64, ComplexF64}, Ti <: Union{Int32, Int64}}
     n = size(A, 1)
-    n == size(A, 2) || throw(DimensionMismatch("matrix must be square; got $(size(A))"))
+    n == size(A, 2) ||
+        throw(DimensionMismatch("matrix must be square; got $(size(A))"))
 
-    common = Ref(KluLCommon())
-    klu_l_defaults!(common)
+    common = Ref(_common_type(Ti)())
+    _defaults!(Ti, common)
 
-    colptr = Vector{Int64}(undef, length(getcolptr(A)))
+    colptr = Vector{Ti}(undef, length(getcolptr(A)))
     copyto!(colptr, getcolptr(A))
-    colptr .-= 1
-    rowval = Vector{Int64}(undef, length(rowvals(A)))
+    colptr .-= one(Ti)
+    rowval = Vector{Ti}(undef, length(rowvals(A)))
     copyto!(rowval, rowvals(A))
-    rowval .-= 1
+    rowval .-= one(Ti)
 
-    cache = KLULinSolveCache{Tv}(
+    cache = KLULinSolveCache{Tv, Ti}(
         colptr, rowval, Tv[], common,
-        convert(SymbolicPtr, C_NULL),
-        convert(NumericPtr, C_NULL),
+        Ptr{Cvoid}(C_NULL), Ptr{Cvoid}(C_NULL),
         reuse_symbolic, check_pattern,
         Matrix{Tv}(undef, 0, 0),
-        Int64[],
+        Ti[],
     )
     finalizer(_free_klu_handles!, cache)
     return cache
@@ -131,10 +471,10 @@ Ensure `cache.scratch` is at least `n × block` and `cache.col_map` length
 `block`. Grows in place; reuses across `solve_sparse!` calls.
 """
 @inline function _ensure_scratch!(
-    cache::KLULinSolveCache{Tv},
+    cache::KLULinSolveCache{Tv, Ti},
     block::Int,
-) where {Tv <: Union{Float64, ComplexF64}}
-    n = _dim(cache)
+) where {Tv, Ti}
+    n = Int(_dim(cache))
     s = cache.scratch
     if size(s, 1) != n || size(s, 2) < block
         cache.scratch = Matrix{Tv}(undef, n, block)
@@ -153,16 +493,16 @@ second call hits the `C_NULL` guards. Used both by `symbolic_factor!`
 mid-life (drop old handles before re-analyzing) and by the GC finalizer.
 """
 function _free_klu_handles!(
-    cache::KLULinSolveCache{Tv},
-) where {Tv <: Union{Float64, ComplexF64}}
+    cache::KLULinSolveCache{Tv, Ti},
+) where {Tv, Ti}
     if cache.numeric != C_NULL
         num_ref = Ref(cache.numeric)
-        _free_numeric!(Tv, num_ref, cache.common)
+        _free_numeric!(Tv, Ti, num_ref, cache.common)
         cache.numeric = num_ref[]
     end
     if cache.symbolic != C_NULL
         sym_ref = Ref(cache.symbolic)
-        klu_l_free_symbolic!(sym_ref, cache.common)
+        _free_symbolic!(Ti, sym_ref, cache.common)
         cache.symbolic = sym_ref[]
     end
     return nothing
@@ -180,8 +520,8 @@ scope. Requires that a numeric factor has been built before (so `cache.nzval`
 is populated) and the symbolic factor is still valid.
 """
 function _recover_factorization!(
-    cache::KLULinSolveCache{Tv},
-) where {Tv <: Union{Float64, ComplexF64}}
+    cache::KLULinSolveCache{Tv, Ti},
+) where {Tv, Ti}
     cache.symbolic == C_NULL && error(
         "KLULinSolveCache: cannot recover without a symbolic factor.",
     )
@@ -190,11 +530,12 @@ function _recover_factorization!(
     )
     if cache.numeric != C_NULL
         num_ref = Ref(cache.numeric)
-        _free_numeric!(Tv, num_ref, cache.common)
+        _free_numeric!(Tv, Ti, num_ref, cache.common)
         cache.numeric = num_ref[]
     end
     num = _factor_call(
-        Tv, pointer(cache.colptr), pointer(cache.rowval),
+        Tv, Ti,
+        pointer(cache.colptr), pointer(cache.rowval),
         pointer(cache.nzval), cache.symbolic, cache.common,
     )
     num == C_NULL && klu_throw(cache.common[], "klu_factor (recovery)")
@@ -202,8 +543,11 @@ function _recover_factorization!(
     return cache
 end
 
-@inline function _check_pattern_match(cache::KLULinSolveCache,
-    A::SparseMatrixCSC, op::AbstractString)
+@inline function _check_pattern_match(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseMatrixCSC,
+    op::AbstractString,
+) where {Tv, Ti}
     Acolptr = getcolptr(A)
     Arowval = rowvals(A)
     if length(Acolptr) != length(cache.colptr) ||
@@ -217,13 +561,13 @@ end
     # Increment-compare-decrement: avoids allocating a 1-indexed copy. The
     # `try/finally` restores `colptr`/`rowval` even on InterruptException so
     # the cache is never left with off-by-one structural arrays.
-    cache.colptr .+= 1
-    cache.rowval .+= 1
+    cache.colptr .+= one(Ti)
+    cache.rowval .+= one(Ti)
     bad = try
         (cache.colptr != Acolptr) || (cache.rowval != Arowval)
     finally
-        cache.colptr .-= 1
-        cache.rowval .-= 1
+        cache.colptr .-= one(Ti)
+        cache.rowval .-= one(Ti)
     end
     if bad
         throw(ArgumentError(
@@ -237,15 +581,19 @@ end
     symbolic_factor!(cache, A)
 
 Free any cached symbolic/numeric factor, replace the structural arrays with
-`A`'s pattern, and run `klu_l_analyze`.
+`A`'s pattern, and run `klu_analyze` / `klu_l_analyze`.
 """
-function symbolic_factor!(cache::KLULinSolveCache{Tv},
-    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+function symbolic_factor!(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseMatrixCSC{Tv, Ti},
+) where {Tv, Ti}
     n = _dim(cache)
-    if size(A, 1) != n || size(A, 2) != n
-        throw(DimensionMismatch(
-            "Cannot factor: cache is $(n)×$(n) but A is $(size(A)).",
-        ))
+    if size(A, 1) != Int(n) || size(A, 2) != Int(n)
+        throw(
+            DimensionMismatch(
+                "Cannot factor: cache is $(Int(n))×$(Int(n)) but A is $(size(A)).",
+            ),
+        )
     end
     _free_klu_handles!(cache)
 
@@ -253,16 +601,14 @@ function symbolic_factor!(cache::KLULinSolveCache{Tv},
     Arowval = rowvals(A)
     resize!(cache.colptr, length(Acolptr))
     copyto!(cache.colptr, Acolptr)
-    cache.colptr .-= 1
+    cache.colptr .-= one(Ti)
     resize!(cache.rowval, length(Arowval))
     copyto!(cache.rowval, Arowval)
-    cache.rowval .-= 1
+    cache.rowval .-= one(Ti)
 
-    sym = klu_l_analyze(
-        Int64(n), pointer(cache.colptr), pointer(cache.rowval), cache.common,
-    )
-    sym == C_NULL && klu_throw(cache.common[], "klu_l_analyze")
-    cache.symbolic = sym
+    sym = _analyze_call(Ti, n, pointer(cache.colptr), pointer(cache.rowval), cache.common)
+    sym == C_NULL && klu_throw(cache.common[], "klu_analyze")
+    cache.symbolic = reinterpret(Ptr{Cvoid}, sym)
     return cache
 end
 
@@ -272,17 +618,19 @@ end
 If `cache.reuse_symbolic`, optionally verify the structure matches and reuse
 the existing analysis. Otherwise, rerun `symbolic_factor!`.
 """
-function symbolic_refactor!(cache::KLULinSolveCache{Tv},
-    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+function symbolic_refactor!(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseMatrixCSC{Tv, Ti},
+) where {Tv, Ti}
     if !cache.reuse_symbolic
         return symbolic_factor!(cache, A)
     end
     if cache.check_pattern
         n = _dim(cache)
-        if size(A, 1) != n || size(A, 2) != n
+        if size(A, 1) != Int(n) || size(A, 2) != Int(n)
             throw(
                 DimensionMismatch(
-                    "Cannot refactor: cache is $(n)×$(n) but A is $(size(A)).",
+                    "Cannot refactor: cache is $(Int(n))×$(Int(n)) but A is $(size(A)).",
                 ),
             )
         end
@@ -298,15 +646,18 @@ Compute (or refresh) the numeric factorization. The first call after
 `symbolic_factor!` invokes `klu_*_factor`; subsequent calls invoke
 `klu_*_refactor` and reuse the existing numeric struct.
 """
-function numeric_refactor!(cache::KLULinSolveCache{Tv},
-    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+function numeric_refactor!(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseMatrixCSC{Tv, Ti},
+) where {Tv, Ti}
     cache.symbolic == C_NULL && error(
         "KLULinSolveCache: call symbolic_factor! before numeric_refactor!.",
     )
     Anz = nonzeros(A)
     if cache.numeric == C_NULL
         num = _factor_call(
-            Tv, pointer(cache.colptr), pointer(cache.rowval),
+            Tv, Ti,
+            pointer(cache.colptr), pointer(cache.rowval),
             pointer(Anz), cache.symbolic, cache.common,
         )
         num == C_NULL && klu_throw(cache.common[], "klu_factor")
@@ -314,7 +665,8 @@ function numeric_refactor!(cache::KLULinSolveCache{Tv},
     else
         cache.check_pattern && _check_pattern_match(cache, A, "numeric_refactor")
         ok = _refactor_call(
-            Tv, pointer(cache.colptr), pointer(cache.rowval),
+            Tv, Ti,
+            pointer(cache.colptr), pointer(cache.rowval),
             pointer(Anz), cache.symbolic, cache.numeric, cache.common,
         )
         ok != 1 && klu_throw(cache.common[], "klu_refactor")
@@ -334,8 +686,10 @@ Equivalent to `symbolic_factor!(cache, A); numeric_refactor!(cache, A)`. Use
 this on a freshly constructed cache, or after `_free_klu_handles!` has cleared
 the handles, to bring the cache to a factored state.
 """
-function full_factor!(cache::KLULinSolveCache{Tv},
-    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+function full_factor!(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseMatrixCSC{Tv, Ti},
+) where {Tv, Ti}
     symbolic_factor!(cache, A)
     numeric_refactor!(cache, A)
     return cache
@@ -351,8 +705,10 @@ the matrix values have changed; if the structure has also changed and the
 cache was built with `reuse_symbolic = false`, the symbolic analysis is rerun
 as well.
 """
-function full_refactor!(cache::KLULinSolveCache{Tv},
-    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+function full_refactor!(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseMatrixCSC{Tv, Ti},
+) where {Tv, Ti}
     symbolic_refactor!(cache, A)
     numeric_refactor!(cache, A)
     return cache
@@ -363,11 +719,94 @@ end
 
 Build a cache for `A` and immediately compute the full factorization.
 """
-function klu_factorize(A::SparseMatrixCSC{Tv, Int};
+function klu_factorize(
+    A::SparseMatrixCSC{Tv, Ti};
     reuse_symbolic::Bool = true,
     check_pattern::Bool = true,
-) where {Tv <: Union{Float64, ComplexF64}}
-    cache = KLULinSolveCache(A;
-        reuse_symbolic = reuse_symbolic, check_pattern = check_pattern)
+) where {Tv <: Union{Float64, ComplexF64}, Ti <: Union{Int32, Int64}}
+    cache = KLULinSolveCache(
+        A;
+        reuse_symbolic = reuse_symbolic,
+        check_pattern = check_pattern,
+    )
     return full_factor!(cache, A)
 end
+
+# ---------------------------------------------------------------------------
+# Performance / diagnostic surface
+# ---------------------------------------------------------------------------
+
+"""
+    sort_factors!(cache) -> cache
+
+Sort the columns of the cached L and U factors in place via libklu's
+`klu_sort` / `klu_l_sort`. KLU's numeric phase stores factor columns in
+arbitrary order; sorting once after the first factor improves cache locality
+on every subsequent `solve!` / `tsolve!`. The cost is `O(nnz_factor)` and
+is amortized over many repeated solves — a win whenever the cache is used
+for more than a few solves on the same factorization.
+
+Idempotent and `refactor`-stable: sorting after the initial factor persists
+through `numeric_refactor!` because refactor preserves the column layout.
+Only call this if the cache will be reused for multiple solves; for a
+one-shot solve it is pure overhead.
+
+Float64 only.
+"""
+function sort_factors!(cache::KLULinSolveCache{Float64, Ti}) where {Ti}
+    is_factored(cache) ||
+        error("sort_factors!: cache must be factored before sorting.")
+    ok = _sort_call(Ti, cache.symbolic, cache.numeric, cache.common)
+    ok != 1 && klu_throw(cache.common[], "klu_sort")
+    return cache
+end
+
+"""
+    condest!(cache) -> Float64
+
+Compute the 1-norm condition-number estimate of the cached factorization
+via libklu's `klu_condest`. The result lands in `cache.common[].condest`
+and is also returned. Cost is roughly two extra solves; use sparingly.
+
+Useful when deciding whether iterative refinement is worth running, or for
+flagging near-singular Jacobians in Newton-Raphson loops.
+
+Float64 only.
+"""
+function condest!(
+    cache::KLULinSolveCache{Float64, Ti},
+) where {Ti}
+    is_factored(cache) ||
+        error("condest!: cache must be factored before condest.")
+    isempty(cache.nzval) && error(
+        "condest!: requires a previous numeric_refactor! to have populated nzval.",
+    )
+    ok = _condest_call(
+        Ti,
+        pointer(cache.colptr),
+        pointer(cache.nzval),
+        cache.symbolic,
+        cache.numeric,
+        cache.common,
+    )
+    ok != 1 && klu_throw(cache.common[], "klu_condest")
+    return Float64(cache.common[].condest)
+end
+
+"""
+    rcond!(cache) -> Float64
+
+Compute the cheap reciprocal-condition estimate
+`min(|diag(U)|)/max(|diag(U)|)` via libklu's `klu_rcond`. The result lands
+in `cache.common[].rcond` and is also returned. Faster than `condest!` but
+less reliable as a conditioning indicator.
+
+Float64 only.
+"""
+function rcond!(cache::KLULinSolveCache{Float64, Ti}) where {Ti}
+    is_factored(cache) ||
+        error("rcond!: cache must be factored before rcond.")
+    ok = _rcond_call(Ti, cache.symbolic, cache.numeric, cache.common)
+    ok != 1 && klu_throw(cache.common[], "klu_rcond")
+    return Float64(cache.common[].rcond)
+end
diff --git a/src/KLUWrapper/klu_jll_bindings.jl b/src/KLUWrapper/klu_jll_bindings.jl
index ff0ed0a92..b525a6572 100644
--- a/src/KLUWrapper/klu_jll_bindings.jl
+++ b/src/KLUWrapper/klu_jll_bindings.jl
@@ -1,9 +1,32 @@
-# Bindings into libklu (SuiteSparse_jll), restricted to the SuiteSparse_long
-# (`klu_l_*` and `klu_zl_*`) entry points used by KLULinSolveCache.
+# Bindings into libklu (SuiteSparse_jll), covering both index-type
+# entry-point families:
+#   - `klu_l_*` / `klu_zl_*` for `SuiteSparse_long` (Int64) indices
+#   - `klu_*`   / `klu_z_*`  for `int`              (Int32) indices
+#
+# Each ccall is wrapped in `@klu_lock` so that all libklu activity in the
+# process serializes through `_LIBKLU_LOCK`. This includes finalizer paths
+# (`klu_*_free_*`), which can fire on any thread at any safepoint and would
+# otherwise race against in-flight `solve!` calls on a different cache. See
+# `_LIBKLU_LOCK` in `KLUWrapper.jl` for the empirical evidence (intermittent
+# `KLU_INVALID` return; SEGV at `klu_solve.c:118`).
 
 import LinearAlgebra
 import SuiteSparse_jll: libklu
 
+# ---------------------------------------------------------------------------
+# Status codes (`KLU_*` constants)
+# ---------------------------------------------------------------------------
+
+const KLU_OK = 0
+const KLU_SINGULAR = 1
+const KLU_OUT_OF_MEMORY = -2
+const KLU_INVALID = -3
+const KLU_TOO_LARGE = -4
+
+# ---------------------------------------------------------------------------
+# Int64 (SuiteSparse_long) family
+# ---------------------------------------------------------------------------
+
 # Layout matches `klu_l_common` in upstream `klu.h`. Must stay in sync.
 mutable struct KluLCommon
     tol::Cdouble
@@ -40,19 +63,15 @@ mutable struct KluLNumeric end
 const SymbolicPtr = Ptr{KluLSymbolic}
 const NumericPtr = Ptr{KluLNumeric}
 
-# Each ccall is wrapped in `@klu_lock` so that all libklu activity in the
-# process serializes through `_LIBKLU_LOCK`. This includes finalizer paths
-# (`klu_l_free_*`, `klu_zl_free_*`), which can fire on any thread at any
-# safepoint and would otherwise race against in-flight `solve!` calls on a
-# different cache. See `_LIBKLU_LOCK` in `KLUWrapper.jl` for the
-# empirical evidence (intermittent `KLU_INVALID` return; SEGV at
-# `klu_solve.c:118`).
-
 klu_l_defaults!(common::Ref{KluLCommon}) =
     @klu_lock ccall((:klu_l_defaults, libklu), Cint, (Ptr{KluLCommon},), common)
 
-function klu_l_analyze(n::Int64, ap::Ptr{Int64}, ai::Ptr{Int64},
-    common::Ref{KluLCommon})
+function klu_l_analyze(
+    n::Int64,
+    ap::Ptr{Int64},
+    ai::Ptr{Int64},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_analyze, libklu),
         SymbolicPtr,
@@ -61,8 +80,10 @@ function klu_l_analyze(n::Int64, ap::Ptr{Int64}, ai::Ptr{Int64},
     )
 end
 
-function klu_l_free_symbolic!(symbolic_ref::Ref{SymbolicPtr},
-    common::Ref{KluLCommon})
+function klu_l_free_symbolic!(
+    symbolic_ref::Ref{SymbolicPtr},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_free_symbolic, libklu),
         Cint,
@@ -71,8 +92,13 @@ function klu_l_free_symbolic!(symbolic_ref::Ref{SymbolicPtr},
     )
 end
 
-function klu_l_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble},
-    symbolic::SymbolicPtr, common::Ref{KluLCommon})
+function klu_l_factor(
+    ap::Ptr{Int64},
+    ai::Ptr{Int64},
+    ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr,
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_factor, libklu),
         NumericPtr,
@@ -81,19 +107,37 @@ function klu_l_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble},
     )
 end
 
-function klu_l_refactor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble},
-    symbolic::SymbolicPtr, numeric::NumericPtr, common::Ref{KluLCommon})
+function klu_l_refactor(
+    ap::Ptr{Int64},
+    ai::Ptr{Int64},
+    ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_refactor, libklu),
         Cint,
-        (Ptr{Int64}, Ptr{Int64}, Ptr{Cdouble}, SymbolicPtr, NumericPtr,
-            Ptr{KluLCommon}),
+        (
+            Ptr{Int64},
+            Ptr{Int64},
+            Ptr{Cdouble},
+            SymbolicPtr,
+            NumericPtr,
+            Ptr{KluLCommon},
+        ),
         ap, ai, ax, symbolic, numeric, common,
     )
 end
 
-function klu_l_solve(symbolic::SymbolicPtr, numeric::NumericPtr,
-    ldim::Int64, nrhs::Int64, b::Ptr{Cdouble}, common::Ref{KluLCommon})
+function klu_l_solve(
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    ldim::Int64,
+    nrhs::Int64,
+    b::Ptr{Cdouble},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_solve, libklu),
         Cint,
@@ -102,8 +146,14 @@ function klu_l_solve(symbolic::SymbolicPtr, numeric::NumericPtr,
     )
 end
 
-function klu_l_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr,
-    ldim::Int64, nrhs::Int64, b::Ptr{Cdouble}, common::Ref{KluLCommon})
+function klu_l_tsolve(
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    ldim::Int64,
+    nrhs::Int64,
+    b::Ptr{Cdouble},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_tsolve, libklu),
         Cint,
@@ -112,8 +162,10 @@ function klu_l_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr,
     )
 end
 
-function klu_l_free_numeric!(numeric_ref::Ref{NumericPtr},
-    common::Ref{KluLCommon})
+function klu_l_free_numeric!(
+    numeric_ref::Ref{NumericPtr},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_free_numeric, libklu),
         Cint,
@@ -122,8 +174,69 @@ function klu_l_free_numeric!(numeric_ref::Ref{NumericPtr},
     )
 end
 
-function klu_zl_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64},
-    symbolic::SymbolicPtr, common::Ref{KluLCommon})
+# Performance knobs / diagnostics — Int64 family.
+
+# klu_l_sort sorts the columns of the L and U factors in place. KLU's
+# numeric phase stores L/U columns in arbitrary order; sorting once after
+# the factor improves cache locality on every subsequent solve. Cheap
+# (O(nnz_factor)) and idempotent. Call once after the first numeric
+# factor; refactor preserves the sort.
+function klu_l_sort(
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    common::Ref{KluLCommon},
+)
+    return @klu_lock ccall(
+        (:klu_l_sort, libklu),
+        Cint,
+        (SymbolicPtr, NumericPtr, Ptr{KluLCommon}),
+        symbolic, numeric, common,
+    )
+end
+
+# klu_l_condest computes a 1-norm condition number estimate, populating
+# `common.condest`. Costs roughly two extra solves. Useful for iterative
+# refinement (informs tolerance choice) and as a diagnostic for near-singular
+# matrices. Caller reads result from `cache.common[].condest`.
+function klu_l_condest(
+    ap::Ptr{Int64},
+    ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    common::Ref{KluLCommon},
+)
+    return @klu_lock ccall(
+        (:klu_l_condest, libklu),
+        Cint,
+        (Ptr{Int64}, Ptr{Cdouble}, SymbolicPtr, NumericPtr, Ptr{KluLCommon}),
+        ap, ax, symbolic, numeric, common,
+    )
+end
+
+# klu_l_rcond fills `common.rcond` with the cheap diagonal-ratio
+# reciprocal condition estimate (min(|diag(U)|)/max(|diag(U)|)). Faster
+# than condest but less reliable.
+function klu_l_rcond(
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    common::Ref{KluLCommon},
+)
+    return @klu_lock ccall(
+        (:klu_l_rcond, libklu),
+        Cint,
+        (SymbolicPtr, NumericPtr, Ptr{KluLCommon}),
+        symbolic, numeric, common,
+    )
+end
+
+# Complex / Int64
+function klu_zl_factor(
+    ap::Ptr{Int64},
+    ai::Ptr{Int64},
+    ax::Ptr{ComplexF64},
+    symbolic::SymbolicPtr,
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_zl_factor, libklu),
         NumericPtr,
@@ -132,42 +245,74 @@ function klu_zl_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64},
     )
 end
 
-function klu_zl_refactor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64},
-    symbolic::SymbolicPtr, numeric::NumericPtr, common::Ref{KluLCommon})
+function klu_zl_refactor(
+    ap::Ptr{Int64},
+    ai::Ptr{Int64},
+    ax::Ptr{ComplexF64},
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_zl_refactor, libklu),
         Cint,
-        (Ptr{Int64}, Ptr{Int64}, Ptr{ComplexF64}, SymbolicPtr, NumericPtr,
-            Ptr{KluLCommon}),
+        (
+            Ptr{Int64},
+            Ptr{Int64},
+            Ptr{ComplexF64},
+            SymbolicPtr,
+            NumericPtr,
+            Ptr{KluLCommon},
+        ),
         ap, ai, ax, symbolic, numeric, common,
     )
 end
 
-function klu_zl_solve(symbolic::SymbolicPtr, numeric::NumericPtr,
-    ldim::Int64, nrhs::Int64, b::Ptr{ComplexF64}, common::Ref{KluLCommon})
+function klu_zl_solve(
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    ldim::Int64,
+    nrhs::Int64,
+    b::Ptr{ComplexF64},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_zl_solve, libklu),
         Cint,
-        (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64},
-            Ptr{KluLCommon}),
+        (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64}, Ptr{KluLCommon}),
         symbolic, numeric, ldim, nrhs, b, common,
     )
 end
 
-function klu_zl_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr,
-    ldim::Int64, nrhs::Int64, b::Ptr{ComplexF64}, conj_solve::Cint,
-    common::Ref{KluLCommon})
+function klu_zl_tsolve(
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    ldim::Int64,
+    nrhs::Int64,
+    b::Ptr{ComplexF64},
+    conj_solve::Cint,
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_zl_tsolve, libklu),
         Cint,
-        (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64}, Cint,
-            Ptr{KluLCommon}),
+        (
+            SymbolicPtr,
+            NumericPtr,
+            Int64,
+            Int64,
+            Ptr{ComplexF64},
+            Cint,
+            Ptr{KluLCommon},
+        ),
         symbolic, numeric, ldim, nrhs, b, conj_solve, common,
     )
 end
 
-function klu_zl_free_numeric!(numeric_ref::Ref{NumericPtr},
-    common::Ref{KluLCommon})
+function klu_zl_free_numeric!(
+    numeric_ref::Ref{NumericPtr},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_zl_free_numeric, libklu),
         Cint,
@@ -176,14 +321,305 @@ function klu_zl_free_numeric!(numeric_ref::Ref{NumericPtr},
     )
 end
 
-# Status codes from klu.h.
-const KLU_OK = 0
-const KLU_SINGULAR = 1
-const KLU_OUT_OF_MEMORY = -2
-const KLU_INVALID = -3
-const KLU_TOO_LARGE = -4
+# ---------------------------------------------------------------------------
+# Int32 (int) family — mirror of the Int64 set above
+# ---------------------------------------------------------------------------
+
+# Layout matches `klu_common` (Int32 path) in upstream `klu.h`. Differs from
+# `KluLCommon` only in the four rank/diag fields, which are `int` instead of
+# `SuiteSparse_long`.
+mutable struct KluCommon  # passed to libklu as `Ptr{KluCommon}`; keep field order/types as-is
+    tol::Cdouble
+    memgrow::Cdouble
+    initmem_amd::Cdouble
+    initmem::Cdouble
+    maxwork::Cdouble
+    btf::Cint
+    ordering::Cint
+    scale::Cint
+    user_order::Ptr{Cvoid}
+    user_data::Ptr{Cvoid}
+    halt_if_singular::Cint
+    status::Cint  # inspected by `klu_throw` (e.g. compared against `KLU_SINGULAR`)
+    nrealloc::Cint
+    structural_rank::Cint  # presumably the first of the four `int` rank/diag fields noted above
+    numerical_rank::Cint
+    singular_col::Cint  # `klu_throw` reports this as `singular_col + 1`
+    noffdiag::Cint
+    flops::Cdouble
+    rcond::Cdouble  # presumably written by `klu_rcond`, mirroring `klu_l_rcond` — confirm
+    condest::Cdouble  # presumably written by `klu_condest`, mirroring `klu_l_condest` — confirm
+    rgrowth::Cdouble
+    work::Cdouble
+    memusage::Csize_t
+    mempeak::Csize_t
+    KluCommon() = new()  # fields start uninitialized; presumably `klu_defaults!` fills them
+end
+
+mutable struct KluSymbolic end
+mutable struct KluNumeric end
+
+const SymbolicPtr32 = Ptr{KluSymbolic}
+const NumericPtr32 = Ptr{KluNumeric}
+
+klu_defaults!(common::Ref{KluCommon}) =
+    @klu_lock ccall((:klu_defaults, libklu), Cint, (Ptr{KluCommon},), common)
+
+function klu_analyze(
+    n::Cint,
+    ap::Ptr{Cint},
+    ai::Ptr{Cint},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_analyze, libklu),
+        SymbolicPtr32,
+        (Cint, Ptr{Cint}, Ptr{Cint}, Ptr{KluCommon}),
+        n, ap, ai, common,
+    )
+end
+
+function klu_free_symbolic!(
+    symbolic_ref::Ref{SymbolicPtr32},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_free_symbolic, libklu),
+        Cint,
+        (Ptr{SymbolicPtr32}, Ptr{KluCommon}),
+        symbolic_ref, common,
+    )
+end
+
+function klu_factor(
+    ap::Ptr{Cint},
+    ai::Ptr{Cint},
+    ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_factor, libklu),
+        NumericPtr32,
+        (Ptr{Cint}, Ptr{Cint}, Ptr{Cdouble}, SymbolicPtr32, Ptr{KluCommon}),
+        ap, ai, ax, symbolic, common,
+    )
+end
+
+function klu_refactor(
+    ap::Ptr{Cint},
+    ai::Ptr{Cint},
+    ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_refactor, libklu),
+        Cint,
+        (
+            Ptr{Cint},
+            Ptr{Cint},
+            Ptr{Cdouble},
+            SymbolicPtr32,
+            NumericPtr32,
+            Ptr{KluCommon},
+        ),
+        ap, ai, ax, symbolic, numeric, common,
+    )
+end
+
+function klu_solve(
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    ldim::Cint,
+    nrhs::Cint,
+    b::Ptr{Cdouble},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_solve, libklu),
+        Cint,
+        (SymbolicPtr32, NumericPtr32, Cint, Cint, Ptr{Cdouble}, Ptr{KluCommon}),
+        symbolic, numeric, ldim, nrhs, b, common,
+    )
+end
+
+function klu_tsolve(
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    ldim::Cint,
+    nrhs::Cint,
+    b::Ptr{Cdouble},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_tsolve, libklu),
+        Cint,
+        (SymbolicPtr32, NumericPtr32, Cint, Cint, Ptr{Cdouble}, Ptr{KluCommon}),
+        symbolic, numeric, ldim, nrhs, b, common,
+    )
+end
+
+function klu_free_numeric!(
+    numeric_ref::Ref{NumericPtr32},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_free_numeric, libklu),
+        Cint,
+        (Ptr{NumericPtr32}, Ptr{KluCommon}),
+        numeric_ref, common,
+    )
+end
+
+# Performance knobs / diagnostics — Int32 family.
+
+function klu_sort(
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_sort, libklu),
+        Cint,
+        (SymbolicPtr32, NumericPtr32, Ptr{KluCommon}),
+        symbolic, numeric, common,
+    )
+end
+
+function klu_condest(
+    ap::Ptr{Cint},
+    ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_condest, libklu),
+        Cint,
+        (Ptr{Cint}, Ptr{Cdouble}, SymbolicPtr32, NumericPtr32, Ptr{KluCommon}),
+        ap, ax, symbolic, numeric, common,
+    )
+end
+
+function klu_rcond(
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_rcond, libklu),
+        Cint,
+        (SymbolicPtr32, NumericPtr32, Ptr{KluCommon}),
+        symbolic, numeric, common,
+    )
+end
+
+# Complex / Int32
+function klu_z_factor(
+    ap::Ptr{Cint},
+    ai::Ptr{Cint},
+    ax::Ptr{ComplexF64},
+    symbolic::SymbolicPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_z_factor, libklu),
+        NumericPtr32,
+        (Ptr{Cint}, Ptr{Cint}, Ptr{ComplexF64}, SymbolicPtr32, Ptr{KluCommon}),
+        ap, ai, ax, symbolic, common,
+    )
+end
+
+function klu_z_refactor(
+    ap::Ptr{Cint},
+    ai::Ptr{Cint},
+    ax::Ptr{ComplexF64},
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_z_refactor, libklu),
+        Cint,
+        (
+            Ptr{Cint},
+            Ptr{Cint},
+            Ptr{ComplexF64},
+            SymbolicPtr32,
+            NumericPtr32,
+            Ptr{KluCommon},
+        ),
+        ap, ai, ax, symbolic, numeric, common,
+    )
+end
+
+function klu_z_solve(
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    ldim::Cint,
+    nrhs::Cint,
+    b::Ptr{ComplexF64},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_z_solve, libklu),
+        Cint,
+        (
+            SymbolicPtr32,
+            NumericPtr32,
+            Cint,
+            Cint,
+            Ptr{ComplexF64},
+            Ptr{KluCommon},
+        ),
+        symbolic, numeric, ldim, nrhs, b, common,
+    )
+end
+
+function klu_z_tsolve(
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    ldim::Cint,
+    nrhs::Cint,
+    b::Ptr{ComplexF64},
+    conj_solve::Cint,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_z_tsolve, libklu),
+        Cint,
+        (
+            SymbolicPtr32,
+            NumericPtr32,
+            Cint,
+            Cint,
+            Ptr{ComplexF64},
+            Cint,
+            Ptr{KluCommon},
+        ),
+        symbolic, numeric, ldim, nrhs, b, conj_solve, common,
+    )
+end
+
+function klu_z_free_numeric!(
+    numeric_ref::Ref{NumericPtr32},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_z_free_numeric, libklu),
+        Cint,
+        (Ptr{NumericPtr32}, Ptr{KluCommon}),
+        numeric_ref, common,
+    )
+end
+
+# ---------------------------------------------------------------------------
+# Shared error throw — dispatches on the common-struct width
+# ---------------------------------------------------------------------------
 
-function klu_throw(common::KluLCommon, op::AbstractString)
+function klu_throw(common::Union{KluLCommon, KluCommon}, op::AbstractString)
     s = common.status
     s == KLU_SINGULAR &&
         throw(LinearAlgebra.SingularException(Int(common.singular_col + 1)))
diff --git a/src/KLUWrapper/solve_dense.jl b/src/KLUWrapper/solve_dense.jl
index 7ce29b78c..85d758170 100644
--- a/src/KLUWrapper/solve_dense.jl
+++ b/src/KLUWrapper/solve_dense.jl
@@ -5,17 +5,19 @@ Solve `A · X = B` in place. `B::StridedVecOrMat{Tv}` must have first-dimension
 size equal to `cache.n` and unit stride in the first dimension. Multiple
 columns of `B` are handled in a single libklu call.
 """
-function solve!(cache::KLULinSolveCache{Tv},
-    B::StridedVecOrMat{Tv}) where {Tv <: Union{Float64, ComplexF64}}
+function solve!(
+    cache::KLULinSolveCache{Tv, Ti},
+    B::StridedVecOrMat{Tv},
+) where {Tv, Ti}
     is_factored(cache) || error("KLULinSolveCache: not factored yet.")
     n = _dim(cache)
-    size(B, 1) == n || throw(DimensionMismatch(
-        "size(B, 1) = $(size(B, 1)), cache n = $(n)",
+    size(B, 1) == Int(n) || throw(DimensionMismatch(
+        "size(B, 1) = $(size(B, 1)), cache n = $(Int(n))",
     ))
     stride(B, 1) == 1 || throw(ArgumentError(
         "B must have unit stride in the first dimension.",
     ))
-    nrhs = Int64(size(B, 2))
+    nrhs = size(B, 2)
     nrhs == 0 && return B
     # Snapshot KLU's preconditions plus identity info — gated by
     # `KLU_POOL_DEBUG`. `klu_l_solve` returns FALSE with `KLU_INVALID` when
@@ -31,7 +33,7 @@ function solve!(cache::KLULinSolveCache{Tv},
         pre_b_ptr = pointer(B)
     end
     ok = _solve_call(
-        Tv, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common,
+        Tv, Ti, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common,
     )
     if ok == 0
         @static if KLU_POOL_DEBUG
@@ -57,21 +59,23 @@ In-place solve `Aᵀ · X = B` (or `Aᴴ · X = B` when `conjugate=true` on the
 complex path). Same shape requirements as `solve!`. The `conjugate` keyword
 is ignored on the real path.
 """
-function tsolve!(cache::KLULinSolveCache{Tv},
-    B::StridedVecOrMat{Tv}; conjugate::Bool = false,
-) where {Tv <: Union{Float64, ComplexF64}}
+function tsolve!(
+    cache::KLULinSolveCache{Tv, Ti},
+    B::StridedVecOrMat{Tv};
+    conjugate::Bool = false,
+) where {Tv, Ti}
     is_factored(cache) || error("KLULinSolveCache: not factored yet.")
     n = _dim(cache)
-    size(B, 1) == n || throw(DimensionMismatch(
-        "size(B, 1) = $(size(B, 1)), cache n = $(n)",
+    size(B, 1) == Int(n) || throw(DimensionMismatch(
+        "size(B, 1) = $(size(B, 1)), cache n = $(Int(n))",
     ))
     stride(B, 1) == 1 || throw(ArgumentError(
         "B must have unit stride in the first dimension.",
     ))
-    nrhs = Int64(size(B, 2))
+    nrhs = size(B, 2)
     nrhs == 0 && return B
     ok = _tsolve_call(
-        Tv, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common;
+        Tv, Ti, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common;
         conjugate = conjugate,
     )
     ok == 0 && klu_throw(cache.common[], "klu_tsolve")
@@ -83,7 +87,9 @@ end
 
 Allocating solve, mirroring `LinearAlgebra.Factorization`'s API.
 """
-function Base.:\(cache::KLULinSolveCache{Tv},
-    B::StridedVecOrMat{Tv}) where {Tv <: Union{Float64, ComplexF64}}
+function Base.:\(
+    cache::KLULinSolveCache{Tv, Ti},
+    B::StridedVecOrMat{Tv},
+) where {Tv, Ti}
     return solve!(cache, copy(B))
 end
diff --git a/src/KLUWrapper/solve_sparse_rhs.jl b/src/KLUWrapper/solve_sparse_rhs.jl
index a20679f24..089fa06a0 100644
--- a/src/KLUWrapper/solve_sparse_rhs.jl
+++ b/src/KLUWrapper/solve_sparse_rhs.jl
@@ -20,20 +20,23 @@ Not thread-safe (mutates per-cache scratch). Callers should serialize
 access through a per-cache lock if invoked from multiple threads.
 """
 function solve_sparse!(
-    cache::KLULinSolveCache{Tv},
-    B::SparseMatrixCSC{Tb, Int},
+    cache::KLULinSolveCache{Tv, Ti},
+    B::SparseMatrixCSC{Tb, <:Integer},
     out::AbstractMatrix{Tv};
     block::Int = SPARSE_RHS_DEFAULT_BLOCK,
-) where {Tv <: Union{Float64, ComplexF64}, Tb <: Number}
+) where {Tv, Ti, Tb <: Number}
     is_factored(cache) || error("KLULinSolveCache: not factored yet.")
     block >= 1 || throw(ArgumentError("block must be >= 1; got $(block)"))
     n = _dim(cache)
-    size(B, 1) == n || throw(DimensionMismatch(
-        "size(B, 1) = $(size(B, 1)), cache n = $(n)",
-    ))
-    size(out, 1) == n && size(out, 2) == size(B, 2) || throw(DimensionMismatch(
-        "out has size $(size(out)); expected $((n, size(B, 2))).",
+    size(B, 1) == Int(n) || throw(DimensionMismatch(
+        "size(B, 1) = $(size(B, 1)), cache n = $(Int(n))",
     ))
+    size(out, 1) == Int(n) && size(out, 2) == size(B, 2) ||
+        throw(
+            DimensionMismatch(
+                "out has size $(size(out)); expected $((Int(n), size(B, 2))).",
+            ),
+        )
 
     nb = size(B, 2)
     nb == 0 && return out
@@ -71,7 +74,7 @@ function solve_sparse!(
             # helper) to keep the inner-loop closure from capturing the per-chunk
             # `npack`, which would force Julia to box it.
             ok = _solve_call(
-                Tv, cache.symbolic, cache.numeric, n, Int64(npack),
+                Tv, Ti, cache.symbolic, cache.numeric, n, npack,
                 pointer(scratch), cache.common,
             )
             if ok == 0 && cache.common[].status == KLU_INVALID
@@ -79,7 +82,7 @@ function solve_sparse!(
                     objectid(cache)
                 _recover_factorization!(cache)
                 ok = _solve_call(
-                    Tv, cache.symbolic, cache.numeric, n, Int64(npack),
+                    Tv, Ti, cache.symbolic, cache.numeric, n, npack,
                     pointer(scratch), cache.common,
                 )
             end
@@ -96,12 +99,13 @@ function solve_sparse!(
 end
 
 """Allocating wrapper around `solve_sparse!`."""
-function solve_sparse(cache::KLULinSolveCache{Tv},
-    B::SparseMatrixCSC{<:Number, Int};
+function solve_sparse(
+    cache::KLULinSolveCache{Tv, Ti},
+    B::SparseMatrixCSC{<:Number, <:Integer};
     block::Int = SPARSE_RHS_DEFAULT_BLOCK,
-) where {Tv <: Union{Float64, ComplexF64}}
+) where {Tv, Ti}
     return solve_sparse!(
-        cache, B, Matrix{Tv}(undef, _dim(cache), size(B, 2));
+        cache, B, Matrix{Tv}(undef, Int(_dim(cache)), size(B, 2));
         block = block,
     )
 end
diff --git a/src/PowerNetworkMatrices.jl b/src/PowerNetworkMatrices.jl
index 5239f5859..23ce6e4c5 100644
--- a/src/PowerNetworkMatrices.jl
+++ b/src/PowerNetworkMatrices.jl
@@ -89,8 +89,12 @@ import .KLUWrapper:
     n_valid,
     is_factored
 
+include("AccelerateWrapper/AccelerateWrapper.jl")
+import .AccelerateWrapper: AAFactorCache, aa_factorize, aa_spmm!, aa_spmv!
+
 include("linalg_settings.jl")
 include("solver_dispatch.jl")
+include("iterative_refinement.jl")
 
 function __init__()
     something(get_linalg_backend_check(), false) && check_linalg_backend()
@@ -137,14 +141,12 @@ include("virtual_modf_calculations.jl")
 include("system_utils.jl")
 include("serialization.jl")
 
-# Declare functions that will be defined by extensions
-# These need to be declared so extensions can extend them
+# Forward declarations for symbols still defined inside package extensions.
+# AppleAccelerate-related functions live in `src/` now (no extension needed)
+# and are not redeclared here.
 function _calculate_PTDF_matrix_MKLPardiso end
-function _calculate_PTDF_matrix_AppleAccelerate end
 function _calculate_LODF_matrix_MKLPardiso end
-function _calculate_LODF_matrix_AppleAccelerate end
 function _pardiso_sequential_LODF! end
 function _pardiso_single_LODF! end
-function _create_apple_accelerate_factorization end
 
 end
diff --git a/src/PowerflowMatrixTypes.jl b/src/PowerflowMatrixTypes.jl
index 98d103289..cd1aa2ab9 100644
--- a/src/PowerflowMatrixTypes.jl
+++ b/src/PowerflowMatrixTypes.jl
@@ -1,7 +1,7 @@
 const DC_ABA_Matrix_Factorized = ABA_Matrix{
     Tuple{Vector{Int64}, Vector{Int64}},
     Tuple{Dict{Int64, Int64}, Dict{Int64, Int64}},
-    KLULinSolveCache{Float64},
+    KLULinSolveCache{Float64, Int64},
 }
 const DC_ABA_Matrix_Unfactorized = ABA_Matrix{
     Tuple{Vector{Int64}, Vector{Int64}},
diff --git a/src/common.jl b/src/common.jl
index 70a1b50ff..c506170ad 100644
--- a/src/common.jl
+++ b/src/common.jl
@@ -6,6 +6,23 @@ function _add_to_collection!(
     return
 end
 
+"""
+    _build_bus_to_valid_idx(n_buses, valid_ix) -> Vector{Int}
+
+Build the inverse of `valid_ix`: a length-`n_buses` vector where entry `b`
+is the position of bus `b` inside `valid_ix`, or 0 if `b` is a reference
+bus. Used by the Virtual\\* row-computation hot path so it can iterate the
+nonzeros of a sparse `BA` column directly (O(nnz_col)) instead of scanning
+the full bus axis (O(n_buses)) and bisecting the CSC for each entry.
+"""
+function _build_bus_to_valid_idx(n_buses::Int, valid_ix::Vector{Int})
+    bus_to_valid_idx = zeros(Int, n_buses)
+    for (pos, bus) in enumerate(valid_ix)  # bounds-checked: `bus` is data, not an axis index
+        bus_to_valid_idx[bus] = pos
+    end
+    return bus_to_valid_idx
+end
+
 function _add_to_collection!(
     collection_tr3w::Vector{PSY.ThreeWindingTransformer},
     transformer_tr3w::PSY.ThreeWindingTransformer,
diff --git a/src/definitions.jl b/src/definitions.jl
index 7b1bc991a..95e7d4b64 100644
--- a/src/definitions.jl
+++ b/src/definitions.jl
@@ -22,14 +22,16 @@ abstract type LinearSolverType end
 struct KLUSolver <: LinearSolverType end
 struct DenseSolver <: LinearSolverType end
 struct MKLPardisoSolver <: LinearSolverType end
-struct AppleAccelerateSolver <: LinearSolverType end
+struct AppleAccelerateLUSolver <: LinearSolverType end
 
-const SUPPORTED_LINEAR_SOLVERS = ("KLU", "MKLPardiso", "AppleAccelerate", "Dense")
+const SUPPORTED_LINEAR_SOLVERS =
+    ("KLU", "MKLPardiso", "AppleAccelerateLU", "Dense")
 
 @inline function resolve_linear_solver(s::String)
     s == "KLU" && return KLUSolver()
     s == "Dense" && return DenseSolver()
     s == "MKLPardiso" && return MKLPardisoSolver()
-    s == "AppleAccelerate" && return AppleAccelerateSolver()
+    s == "AppleAccelerateLU" && return AppleAccelerateLUSolver()
+    s == "AppleAccelerate" && return AppleAccelerateLUSolver()
     error("Unsupported linear solver: $s. Supported: $SUPPORTED_LINEAR_SOLVERS")
 end
diff --git a/src/iterative_refinement.jl b/src/iterative_refinement.jl
new file mode 100644
index 000000000..50e01f86e
--- /dev/null
+++ b/src/iterative_refinement.jl
@@ -0,0 +1,153 @@
+import LinearAlgebra: norm, mul!
+
+"""
+    DEFAULT_REFINEMENT_MAX_ITER :: Int
+
+Default iteration cap for `solve_w_refinement!`.
+"""
+const DEFAULT_REFINEMENT_MAX_ITER = 25
+
+# --- backend dispatch bridge ---------------------------------------------------
+#
+# The refinement body uses three primitives — in-place `solve!`, `is_factored`,
+# and `_dim` — each of which is defined inside `KLUWrapper` and
+# `AccelerateWrapper` as a *separate* function. The bridge below lets the
+# shared body reach the right backend by multiple dispatch instead of an `isa`
+# branch, and keeps the body itself backend-agnostic.
+
+_refine_solve!(K::KLULinSolveCache, r::StridedVecOrMat) = solve!(K, r)
+_refine_solve!(K::AAFactorCache, r::StridedVecOrMat) = AccelerateWrapper.solve!(K, r)
+
+_refine_is_factored(K::KLULinSolveCache) = is_factored(K)
+_refine_is_factored(K::AAFactorCache) = AccelerateWrapper.is_factored(K)
+
+_refine_dim(K::KLULinSolveCache) = Int(KLUWrapper._dim(K))
+_refine_dim(K::AAFactorCache) = AccelerateWrapper._dim(K)
+
+"""
+    solve_w_refinement!(cache, A, X, B; tol=…, max_iters=…) -> X
+
+Solve `A · X = B` in place using `cache` and apply iterative refinement
+until `norm(B − A·X, 1) < norm(B, 1) * tol`, or until refinement stops
+improving. `X` must be pre-allocated by the caller with the same shape as
+`B` and must not alias it: `X` is zeroed first, then overwritten with the solution.
+
+This is the non-allocating variant. For a one-shot allocating variant that
+returns a fresh `X`, see `solve_w_refinement`.
+
+Supports `cache::KLULinSolveCache` (KLU backend, any `{Tv, Ti}`) and
+`cache::AAFactorCache` (Apple Accelerate backend, `Cdouble` only). The
+cache must already be factored (`is_factored(cache) == true`). The function
+does not mutate `cache`'s factor; it only triggers in-place `solve!` calls
+on a residual buffer.
+
+`max_iters` caps the refinement loop (default
+`DEFAULT_REFINEMENT_MAX_ITER`). The default `tol` is `sqrt(eps(real(Tv)))`,
+which is conservative for power-flow Newton-Raphson Jacobians.
+
+Useful when the cached factor is of an ill-conditioned matrix — e.g.,
+a Newton-Raphson Jacobian near a saddle point or a network reduction
+interface — where a single back-solve carries enough error to slow NR
+convergence. Cost per refinement iteration: one sparse matrix-vector
+product plus one `solve!` against the cached factor.
+"""
+function solve_w_refinement!(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseArrays.SparseMatrixCSC{Tv, Ti},
+    X::StridedVecOrMat{Tv},
+    B::StridedVecOrMat{Tv};
+    tol::Real = sqrt(eps(real(Tv))),
+    max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER,
+) where {Tv, Ti}
+    return _solve_w_refinement_body!(cache, A, X, B, tol, max_iters)
+end
+
+function solve_w_refinement!(
+    cache::AAFactorCache,
+    A::SparseArrays.SparseMatrixCSC{Cdouble, <:Integer},
+    X::StridedVecOrMat{Cdouble},
+    B::StridedVecOrMat{Cdouble};
+    tol::Real = sqrt(eps(Cdouble)),
+    max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER,
+)
+    return _solve_w_refinement_body!(cache, A, X, B, tol, max_iters)
+end
+
+function _solve_w_refinement_body!(
+    cache,
+    A::SparseArrays.SparseMatrixCSC,
+    X::StridedVecOrMat,
+    B::StridedVecOrMat,
+    tol::Real,
+    max_iters::Int,
+)
+    _refine_is_factored(cache) || error(
+        "solve_w_refinement!: cache must be factored. " *
+        "Call `full_factor!(cache, A)` first.",
+    )
+    size(X) == size(B) ||
+        throw(DimensionMismatch("X is $(size(X)); B is $(size(B))."))
+    size(B, 1) == _refine_dim(cache) || throw(
+        DimensionMismatch(
+            "size(B, 1) = $(size(B, 1)); cache n = $(_refine_dim(cache)).",
+        ),
+    )
+    # Zeroing `X` below would also wipe an aliased `B` and yield an all-zero "solution".
+    X === B && throw(ArgumentError("X must not alias B."))
+    fill!(X, zero(eltype(X)))
+    # `X = 0` ⇒ `r = B`; later residuals are rebuilt in `r` via `mul!`, no temporaries.
+    r = copy(B)
+    target = norm(B, 1) * tol
+    err = norm(r, 1)  # cached so each residual norm is computed exactly once
+    iters = 0
+    # `err > 0`: an exactly-zero residual (e.g. `B == 0`) needs no further solves.
+    while iters < max_iters && err > 0 && err >= target
+        prev_err = err
+        _refine_solve!(cache, r)  # in-place solve: `r` now holds the correction
+        X .+= r
+        mul!(r, A, X)
+        @. r = B - r
+        iters += 1
+        err = norm(r, 1)
+        if err > prev_err
+            @debug "Iterative refinement diverging; stopping early." iters
+            return X
+        end
+    end
+    @debug "Iterative refinement finished." iters converged = (err < target)
+    return X
+end
+
+"""
+    solve_w_refinement(cache, A, B; tol=…, max_iters=…) -> X
+
+Allocating wrapper around `solve_w_refinement!`. Allocates `X` matching
+`B`'s shape, then refines.
+"""
+function solve_w_refinement(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseArrays.SparseMatrixCSC{Tv, Ti},
+    B::StridedVecOrMat{Tv};
+    tol::Real = sqrt(eps(real(Tv))),
+    max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER,
+) where {Tv, Ti}
+    # `similar` is enough: the in-place variant zeroes its output before using it.
+    return solve_w_refinement!(
+        cache, A, similar(B), B;
+        tol = tol, max_iters = max_iters,
+    )
+end
+
+function solve_w_refinement(
+    cache::AAFactorCache,
+    A::SparseArrays.SparseMatrixCSC{Cdouble, <:Integer},
+    B::StridedVecOrMat{Cdouble};
+    tol::Real = sqrt(eps(Cdouble)),
+    max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER,
+)
+    # Allocating counterpart of `solve_w_refinement!` for the Accelerate cache.
+    return solve_w_refinement!(
+        cache, A, similar(B), B;
+        tol = tol, max_iters = max_iters,
+    )
+end
diff --git a/src/linalg_settings.jl b/src/linalg_settings.jl
index c79962189..4d2efb265 100644
--- a/src/linalg_settings.jl
+++ b/src/linalg_settings.jl
@@ -1,7 +1,9 @@
 
-# Extensions are loaded when trigger packages (Pardiso, AppleAccelerate) are loaded
+# The MKL/Pardiso path still uses the package-extension mechanism (Pardiso.jl
+# is the only consumer-facing way to access the MKL Pardiso solver). The
+# Apple Accelerate path no longer does — `AccelerateWrapper` is built in via
+# a `@static if Sys.isapple()` gate.
 
-# Check if MKL/Pardiso extension is available at runtime
 function _has_mkl_pardiso_ext()
     ext = Base.get_extension(@__MODULE__, :MKLPardisoExt)
     return !isnothing(ext)
@@ -12,24 +14,66 @@ _mkl_pardiso_install_error() =
     Install the Pardiso package:
     julia> using Pkg; Pkg.add(\"Pardiso\")"""
 
-# Check if AppleAccelerate extension is available at runtime
-function _has_apple_accelerate_ext()
-    ext = Base.get_extension(@__MODULE__, :AppleAccelerateExt)
-    return !isnothing(ext)
+# Minimum macOS for the AppleAccelerate (libSparse LU) backend. The
+# `SparseFactorizationLU` code is API_AVAILABLE(macos(15.5)); older
+# libSparse rejects factorization type 80.
+const _AA_MIN_MACOS = v"15.5"
+
+# Query the running macOS product version via the `kern.osproductversion`
+# sysctl (libc, no subprocess). Returns a `VersionNumber`; falls back to v"0"
+# (treated as "too old") off macOS, on sysctl failure, or on an unparsable string.
+function _macos_product_version()
+    Sys.isapple() || return v"0"
+    buf = Vector{UInt8}(undef, 64)
+    len = Ref{Csize_t}(length(buf))  # in: buffer capacity; out: bytes written
+    rc = ccall(
+        :sysctlbyname, Cint,
+        (Cstring, Ptr{UInt8}, Ptr{Csize_t}, Ptr{Cvoid}, Csize_t),
+        "kern.osproductversion", buf, len, C_NULL, 0,
+    )
+    rc == 0 || return v"0"
+    s = String(buf[1:max(Int(len[]) - 1, 0)])  # drop NUL; `Int` avoids Csize_t wrap at 0
+    try
+        return VersionNumber(s)
+    catch
+        return v"0"
+    end
+ end
 
-_apple_accelerate_install_error() =
-    """The AppleAccelerate extension is not available.
-    This solver is only available on macOS.
-    Install AppleAccelerate:
-    julia> using Pkg; Pkg.add(\"AppleAccelerate\")"""
+_macos_at_least(v::VersionNumber) = _macos_product_version() >= v
 
-# _create_apple_accelerate_factorization is defined in ext/AppleAccelerateExt.jl
-# when AppleAccelerate package is loaded
+_has_apple_accelerate_backend() = Sys.isapple() && _macos_at_least(_AA_MIN_MACOS)
+
+function _apple_accelerate_unavailable_error()
+    if Sys.isapple()
+        return """The Apple Accelerate sparse backend requires macOS $(_AA_MIN_MACOS.major).$(_AA_MIN_MACOS.minor) or newer \
+        (libSparse LU / SparseFactorizationLU is API_AVAILABLE(macos(15.5))); \
+        detected macOS $(_macos_product_version()). Use the KLU solver (the default fallback)."""
+    end
+    return """The Apple Accelerate sparse backend is macOS-only (Sys.isapple() returned false).
+    Use the KLU solver (the default) on non-Apple platforms."""
+end
+
+"""
+    _default_linear_solver() -> String
+
+Default sparse linear solver name. Returns "AppleAccelerateLU" on macOS
+$(_AA_MIN_MACOS.major).$(_AA_MIN_MACOS.minor)+ (Apple's built-in libSparse LU via `AccelerateWrapper`)
+and "KLU" elsewhere (non-Apple, or macOS older than $(_AA_MIN_MACOS.major).$(_AA_MIN_MACOS.minor)).
+Used as the default for the `linear_solver` keyword on PTDF / LODF /
+VirtualPTDF / VirtualLODF / VirtualMODF constructors.
+"""
+function _default_linear_solver()
+    # Reuse the availability predicate that `check_linalg_backend` relies on, so
+    # the default can never name a backend that the check reports as unusable.
+    _has_apple_accelerate_backend() && return "AppleAccelerateLU"
+    return "KLU"
+end
 
 "Set a preference of the backend library for sparse linear algebra operations."
 function set_linalg_backend_preference(linalglib::Union{String, Nothing})
-    if !isnothing(linalglib) && !(linalglib in ["MKLPardiso", "AppleAccelerate"])
+    if !isnothing(linalglib) &&
+       !(linalglib in ["MKLPardiso", "AppleAccelerateLU", "AppleAccelerate"])
         throw(
             ArgumentError(
                 "Unsupported sparse linear algebra backend requested: $(linalglib)",
@@ -79,7 +123,7 @@ function check_linalg_backend()
         @info """The sparse linear algebra solver preference has been set to $(user_linalg_backend).
                 To change this for your active project, call the function
                 PowerNetworkMatrices.set_linalg_backend_preference()
-                with one of "MKLPardiso" or "AppleAccelerate", or `nothing` to turn off.
+                with one of "MKLPardiso", "AppleAccelerateLU", or `nothing` to turn off.
               """
     end
 
@@ -101,7 +145,10 @@ function check_linalg_backend()
             @info no_msg("Pardiso")
             @info "See https://github.com/JuliaSparse/Pardiso.jl for more details."
         end
-        if user_linalg_backend == "AppleAccelerate"
+        if (
+            user_linalg_backend !== nothing &&
+            startswith(user_linalg_backend, "AppleAccelerate")
+        )
             @warn "AppleAccelerate is not supported on non-Apple systems."
         end
     end
@@ -125,14 +172,17 @@ function check_linalg_backend()
             end
         end
 
-        if _has_apple_accelerate_ext()
+        if _has_apple_accelerate_backend()
             @info go_msg("AppleAccelerate")
         else
-            if user_linalg_backend == "AppleAccelerate"
-                @warn yo_msg("AppleAccelerate")
+            if (
+                user_linalg_backend !== nothing &&
+                startswith(user_linalg_backend, "AppleAccelerate")
+            )
+                @warn """AppleAccelerate was requested but is unavailable: it requires \
+                macOS $(_AA_MIN_MACOS.major).$(_AA_MIN_MACOS.minor)+ (detected macOS $(_macos_product_version())). \
+                Falling back to KLU."""
             end
-            @info no_msg("AppleAccelerate")
-            @info "See https://github.com/JuliaLinearAlgebra/AppleAccelerate.jl"
         end
     end
 end
diff --git a/src/lodf_calculations.jl b/src/lodf_calculations.jl
index 78dae430c..282938676 100644
--- a/src/lodf_calculations.jl
+++ b/src/lodf_calculations.jl
@@ -64,6 +64,40 @@ get_network_reduction_data(M::LODF) = M.network_reduction_data
 get_arc_lookup(M::LODF) = M.lookup[1]
 stores_transpose(::LODF) = true
 
+# --- Demand-matrix short-circuit ---------------------------------------------
+#
+# The LODF computation builds a *diagonal* "demand" matrix `D = diag(m_V)`
+# where `m_V[i] = 1 - (PTDF·A)[i, i]` (reset to 1.0 when below `LODF_ENTRY_TOLERANCE`
+# to avoid divide-by-zero when an outage islands the line). The original
+# code factored `D` and ran a triangular solve `D · X = ptdf_denominator`;
+# that's a `factor + back-solve` over a diagonal, which collapses to
+# element-wise row scaling. KLU's BTF short-circuits this internally so the
+# overhead was modest; AA's libSparse and LAPACK's `getrf!`/`getrs!` do
+# not, so the previous code was 3–5× slower on AA and order-of-magnitude
+# slower on DENSE than necessary. Replace both with a direct row scaling.
+
+function _build_lodf_demand(ptdf_denominator::AbstractMatrix{Float64}, linecount::Int)
+    m_V = Vector{Float64}(undef, linecount)
+    @inbounds for i in 1:linecount
+        d = 1.0 - ptdf_denominator[i, i]
+        m_V[i] = d < LODF_ENTRY_TOLERANCE ? 1.0 : d
+    end
+    return m_V
+end
+
+function _apply_lodf_demand!(M::AbstractMatrix{Float64}, m_V::Vector{Float64})
+    IS.@assert_op size(M, 1) == length(m_V)
+    IS.@assert_op size(M, 1) == size(M, 2)
+    # `inv_dem .* M` mirrors what the triangular solve did internally —
+    # one reciprocal per row, then a row-wise multiply. The broadcast
+    # `M .*= inv_dem` scales each row `i` by `inv_dem[i]` because the
+    # length-n vector broadcasts down the first dimension.
+    inv_dem = 1.0 ./ m_V
+    M .*= inv_dem
+    M[SparseArrays.diagind(M)] .= -1.0
+    return M
+end
+
 function _buildlodf(
     a::SparseArrays.SparseMatrixCSC{Int8, Int},
     ptdf::Matrix{Float64},
@@ -92,9 +126,9 @@ end
 function _buildlodf(
     a::SparseArrays.SparseMatrixCSC{Int8, Int},
     ptdf::Matrix{Float64},
-    ::AppleAccelerateSolver,
+    ::AppleAccelerateLUSolver,
 )
-    _has_apple_accelerate_ext() || error(_apple_accelerate_install_error())
+    _has_apple_accelerate_backend() || error(_apple_accelerate_unavailable_error())
     return _calculate_LODF_matrix_AppleAccelerate(a, ptdf)
 end
 
@@ -131,20 +165,8 @@ function _calculate_LODF_matrix_KLU(
     solve_sparse!(k, a_t_valid, view(first_, valid_ix, :))
     ptdf_denominator = first_' * ba
 
-    m_I = Int[]
-    m_V = Float64[]
-    for iline in 1:linecount
-        if (1.0 - ptdf_denominator[iline, iline]) < LODF_ENTRY_TOLERANCE
-            push!(m_I, iline)
-            push!(m_V, 1.0)
-        else
-            push!(m_I, iline)
-            push!(m_V, 1 - ptdf_denominator[iline, iline])
-        end
-    end
-    Dem_cache = klu_factorize(SparseArrays.sparse(m_I, m_I, m_V))
-    solve!(Dem_cache, ptdf_denominator)
-    ptdf_denominator[SparseArrays.diagind(ptdf_denominator)] .= -1.0
+    m_V = _build_lodf_demand(ptdf_denominator, linecount)
+    _apply_lodf_demand!(ptdf_denominator, m_V)
     return ptdf_denominator
 end
 
@@ -154,22 +176,9 @@ function _calculate_LODF_matrix_KLU(
 )
     linecount = size(ptdf, 2)
     ptdf_denominator_t = a * ptdf
-    m_I = Int[]
-    m_V = Float64[]
-    for iline in 1:linecount
-        if (1.0 - ptdf_denominator_t[iline, iline]) < LODF_ENTRY_TOLERANCE
-            push!(m_I, iline)
-            push!(m_V, 1.0)
-        else
-            push!(m_I, iline)
-            push!(m_V, 1 - ptdf_denominator_t[iline, iline])
-        end
-    end
-    Dem_cache = klu_factorize(SparseArrays.sparse(m_I, m_I, m_V))
+    m_V = _build_lodf_demand(ptdf_denominator_t, linecount)
     lodf_t = copy(ptdf_denominator_t)
-    solve!(Dem_cache, lodf_t)
-    lodf_t[SparseArrays.diagind(lodf_t)] .= -1.0
-
+    _apply_lodf_demand!(lodf_t, m_V)
     return lodf_t
 end
 
@@ -179,29 +188,41 @@ function _calculate_LODF_matrix_DENSE(
 )
     linecount = size(ptdf, 2)
     ptdf_denominator_t = a * ptdf
-    m_V = Float64[]
-    for iline in 1:linecount
-        if (1.0 - ptdf_denominator_t[iline, iline]) < LODF_ENTRY_TOLERANCE
-            push!(m_V, 1.0)
-        else
-            push!(m_V, 1.0 - ptdf_denominator_t[iline, iline])
-        end
-    end
-    (mV, bipiv, binfo) = getrf!(Matrix(LinearAlgebra.diagm(m_V)))
-    _binfo_check(binfo)
-    getrs!('N', mV, bipiv, ptdf_denominator_t)
-    ptdf_denominator_t[LinearAlgebra.diagind(ptdf_denominator_t)] .= -1.0
+    m_V = _build_lodf_demand(ptdf_denominator_t, linecount)
+    _apply_lodf_demand!(ptdf_denominator_t, m_V)
     return ptdf_denominator_t
 end
 
 # _pardiso_sequential_LODF!, _pardiso_single_LODF!, _calculate_LODF_matrix_MKLPardiso
 # are defined in ext/MKLPardisoExt.jl when the Pardiso package is loaded
 
-# _calculate_LODF_matrix_AppleAccelerate is defined in ext/AppleAccelerateExt.jl
-# when the AppleAccelerate package is loaded
+@static if Sys.isapple()
+    """
+    Function for internal use only.
+
+    Computes the LODF matrix for the Apple Accelerate solver selection. Defined
+    only on macOS. Mirrors `_calculate_LODF_matrix_KLU(a, ptdf)`: form `a · ptdf`,
+    divide each row by its "demand" entry `1 - diag(a · ptdf)` (direct row scaling,
+    no factorization is performed), then set the diagonal to -1.
+
+    # Arguments
+    - `a::SparseArrays.SparseMatrixCSC{Int8, Int}`: Incidence Matrix
+    - `ptdf::Matrix{Float64}`: PTDF matrix
+    """
+    function _calculate_LODF_matrix_AppleAccelerate(
+        a::SparseArrays.SparseMatrixCSC{Int8, Int},
+        ptdf::Matrix{Float64},
+    )
+        linecount = size(ptdf, 2)
+        ptdf_denominator_t = a * ptdf
+        m_V = _build_lodf_demand(ptdf_denominator_t, linecount)
+        _apply_lodf_demand!(ptdf_denominator_t, m_V)
+        return ptdf_denominator_t
+    end
+end
 
 """
-    LODF(sys::PSY.System; linear_solver::String = "KLU", tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...)
+    LODF(sys::PSY.System; linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...)
 
 Construct a Line Outage Distribution Factor (LODF) matrix from a PowerSystems.System by computing
 the sensitivity of line flows to single line outages. This is the primary constructor for LODF
@@ -211,7 +232,7 @@ analysis starting from system data.
 - `sys::PSY.System`: The power system from which to construct the LODF matrix
 
 # Keyword Arguments
-- `linear_solver::String = "KLU"`:
+- `linear_solver::String = _default_linear_solver()`:
         Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso"
 - `tol::Float64 = eps()`:
         Sparsification tolerance for dropping small matrix elements to reduce memory usage
@@ -258,16 +279,12 @@ where A is the incidence matrix and PTDF is the power transfer distribution fact
 """
 function LODF(
     sys::PSY.System;
-    linear_solver::String = "KLU",
+    linear_solver::String = _default_linear_solver(),
     tol::Float64 = eps(),
     network_reductions::Vector{NetworkReduction} = NetworkReduction[],
     kwargs...,
 )
-    Ymatrix = Ybus(
-        sys;
-        network_reductions = network_reductions,
-        kwargs...,
-    )
+    Ymatrix = Ybus(sys; network_reductions = network_reductions, kwargs...)
     A = IncidenceMatrix(Ymatrix)
     BA = BA_Matrix(Ymatrix)
     ptdf = PTDF(A, BA)
@@ -275,7 +292,7 @@ function LODF(
 end
 
 """
-    LODF(A::IncidenceMatrix, PTDFm::PTDF; linear_solver::String = "KLU", tol::Float64 = eps())
+    LODF(A::IncidenceMatrix, PTDFm::PTDF; linear_solver::String = _default_linear_solver(), tol::Float64 = eps())
 
 Construct a Line Outage Distribution Factor (LODF) matrix from existing incidence and PTDF matrices.
 This constructor is more efficient when the prerequisite matrices are already available.
@@ -285,7 +302,7 @@ This constructor is more efficient when the prerequisite matrices are already av
 - `PTDFm::PTDF`: The power transfer distribution factor matrix (should be non-sparsified for accuracy)
 
 # Keyword Arguments
-- `linear_solver::String = "KLU"`:
+- `linear_solver::String = _default_linear_solver()`:
         Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso"
 - `tol::Float64 = eps()`:
         Sparsification tolerance for the LODF matrix (not applied to input PTDF)
@@ -327,7 +344,7 @@ where:
 function LODF(
     A::IncidenceMatrix,
     PTDFm::PTDF;
-    linear_solver::String = "KLU",
+    linear_solver::String = _default_linear_solver(),
     tol::Float64 = eps(),
 )
     solver = resolve_linear_solver(linear_solver)
@@ -384,7 +401,9 @@ efficient when the prerequisite matrices with factorization are already availabl
 
 # Keyword Arguments
 - `linear_solver::String = "KLU"`:
-        Linear solver algorithm for matrix computations. Currently only "KLU" is supported
+        This constructor is intentionally KLU-only because `ABA.K` is always a
+        KLU factorization. The keyword is kept for API consistency; passing any
+        other value will error.
 - `tol::Float64 = eps()`:
         Sparsification tolerance for dropping small matrix elements
 
@@ -431,6 +450,10 @@ function LODF(
     linear_solver::String = "KLU",
     tol::Float64 = eps(),
 )
+    # NOTE: ABA.K is always a KLU factorization, so this constructor is
+    # KLU-only regardless of the `linear_solver` argument. The kwarg is kept
+    # for API consistency; passing anything other than "KLU" will error in
+    # `_buildlodf`.
     if !(
         isequal(A.network_reduction_data, BA.network_reduction_data) &&
         isequal(BA.network_reduction_data, ABA.network_reduction_data)
@@ -443,8 +466,7 @@ function LODF(
     subnetwork_axes = make_arc_arc_subnetwork_axes(A)
     ax_ref = make_ax_ref(get_arc_axis(A))
     if tol > eps()
-        lodf_t =
-            _buildlodf(A.data, ABA.K, BA.data, Set(get_ref_bus_position(A)), solver)
+        lodf_t = _buildlodf(A.data, ABA.K, BA.data, Set(get_ref_bus_position(A)), solver)
         return LODF(
             sparsify(lodf_t, tol),
             (get_arc_axis(A), get_arc_axis(A)),
diff --git a/src/ptdf_calculations.jl b/src/ptdf_calculations.jl
index 132fe16b4..979a4fc67 100644
--- a/src/ptdf_calculations.jl
+++ b/src/ptdf_calculations.jl
@@ -110,8 +110,8 @@ function _buildptdf_from_matrices(
     BA::SparseArrays.SparseMatrixCSC{T, Int} where {T <: Union{Float32, Float64}},
     ref_bus_positions::Set{Int},
     dist_slack::Vector{Float64},
-    ::AppleAccelerateSolver)
-    _has_apple_accelerate_ext() || error(_apple_accelerate_install_error())
+    ::AppleAccelerateLUSolver)
+    _has_apple_accelerate_backend() || error(_apple_accelerate_unavailable_error())
     return _calculate_PTDF_matrix_AppleAccelerate(A, BA, ref_bus_positions, dist_slack)
 end
 
@@ -230,11 +230,62 @@ end
 # _calculate_PTDF_matrix_MKLPardiso is defined in ext/MKLPardisoExt.jl
 # when Pardiso package is loaded
 
-# _calculate_PTDF_matrix_AppleAccelerate is defined in ext/AppleAccelerateExt.jl
-# when AppleAccelerate package is loaded
+@static if Sys.isapple()
+    """
+    Function for internal use only.
+
+    Computes the PTDF matrix using the internal Apple Accelerate backend
+    (`AccelerateWrapper`). Available only on macOS — non-Apple callers are
+    rejected by `_buildptdf_from_matrices` before reaching this entry. Shape
+    mirrors `_calculate_PTDF_matrix_KLU`: factor ABA via LU, then solve
+    `ABA · X = BA[valid_ix, :]` via the block-packed `solve_sparse!`.
+
+    # Arguments
+    - `A::SparseArrays.SparseMatrixCSC{Int8, Int}`: Incidence Matrix
+    - `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`: BA matrix
+    - `ref_bus_positions::Set{Int}`: indexes of reference slack buses
+    - `dist_slack::Vector{Float64}`: distributed-slack weights
+    """
+    function _calculate_PTDF_matrix_AppleAccelerate(
+        A::SparseArrays.SparseMatrixCSC{Int8, Int},
+        BA::SparseArrays.SparseMatrixCSC{Float64, Int},
+        ref_bus_positions::Set{Int},
+        dist_slack::Vector{Float64},
+    )
+        linecount = size(BA, 2)
+        buscount = size(BA, 1)
+        if !isempty(dist_slack) && length(ref_bus_positions) != 1
+            error(
+                "Distributed slack is not supported for systems with multiple reference buses.",
+            )
+        end
+        if !isempty(dist_slack) && length(dist_slack) != buscount
+            error("Distributed bus specification doesn't match the number of buses.")
+        end
+        length(ref_bus_positions) < buscount || error(
+            "All buses are reference buses; PTDF is not defined.",
+        )
+
+        ABA = calculate_ABA_matrix(A, BA, ref_bus_positions)
+        cache = AccelerateWrapper.aa_factorize(ABA)
+        valid_ix = setdiff(1:buscount, ref_bus_positions)
+        PTDFm_t = zeros(buscount, linecount)
+        AccelerateWrapper.solve_sparse!(
+            cache,
+            BA[valid_ix, :],
+            view(PTDFm_t, valid_ix, :),
+        )
+
+        isempty(dist_slack) && return PTDFm_t
+
+        @info "Distributed bus"
+        slack_array = reshape(dist_slack ./ sum(dist_slack), 1, buscount)
+        return PTDFm_t .- (slack_array * PTDFm_t)
+    end
+end
 
 """
-    PTDF(sys::PSY.System; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = "KLU", tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...)
+    PTDF(sys::PSY.System; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...)
 
 Construct a Power Transfer Distribution Factor (PTDF) matrix from a PowerSystems.System by computing
 the sensitivity of transmission line flows to bus power injections. This is the primary constructor
@@ -247,7 +298,7 @@ for PTDF analysis starting from system data.
 - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`:
         Dictionary mapping bus numbers to distributed slack weights for realistic slack modeling.
         Empty dictionary uses single slack bus (default behavior)
-- `linear_solver::String = "KLU"`:
+- `linear_solver::String = _default_linear_solver()`:
         Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso"
 - `tol::Float64 = eps()`:
         Sparsification tolerance for dropping small matrix elements to reduce memory usage
@@ -300,7 +351,7 @@ where A is the incidence matrix and B is the susceptance matrix.
 """
 function PTDF(sys::PSY.System;
     dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(),
-    linear_solver = "KLU",
+    linear_solver = _default_linear_solver(),
     tol::Float64 = eps(),
     kwargs...,
 )
@@ -312,7 +363,7 @@ function PTDF(sys::PSY.System;
 end
 
 """
-    PTDF(ybus::Ybus; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = "KLU", tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...)
+    PTDF(ybus::Ybus; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...)
 
 Construct a Power Transfer Distribution Factor (PTDF) matrix from existing Ybus matrix.
 This constructor is more efficient when the prerequisite matrices are already available and provides
@@ -325,7 +376,7 @@ direct control over the underlying matrix computations.
 - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`:
         Dictionary mapping bus numbers to distributed slack weights for realistic slack modeling.
         Empty dictionary uses single slack bus (default behavior)
-- `linear_solver::String = "KLU"`:
+- `linear_solver::String = _default_linear_solver()`:
         Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso"
 - `tol::Float64 = eps()`:
         Sparsification tolerance for dropping small matrix elements to reduce memory usage
@@ -370,7 +421,7 @@ where A is the incidence matrix and B is the susceptance matrix.
 """
 function PTDF(ybus::Ybus;
     dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(),
-    linear_solver = "KLU",
+    linear_solver = _default_linear_solver(),
     tol::Float64 = eps(),
 )
     A = IncidenceMatrix(ybus)
@@ -385,7 +436,7 @@ function PTDF(ybus::Ybus;
 end
 
 """
-    PTDF(A::IncidenceMatrix, BA::BA_Matrix; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = "KLU", tol::Float64 = eps())
+    PTDF(A::IncidenceMatrix, BA::BA_Matrix; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = _default_linear_solver(), tol::Float64 = eps())
 
 Construct a Power Transfer Distribution Factor (PTDF) matrix from existing incidence and BA matrices.
 This constructor is more efficient when the prerequisite matrices are already available and provides
@@ -399,7 +450,7 @@ direct control over the underlying matrix computations.
 - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`:
         Dictionary mapping bus numbers to distributed slack participation factors.
         Empty dictionary uses single slack bus (reference bus from matrices)
-- `linear_solver::String = "KLU"`:
+- `linear_solver::String = _default_linear_solver()`:
         Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso"
 - `tol::Float64 = eps()`:
         Sparsification tolerance for dropping small matrix elements to reduce memory usage
@@ -451,7 +502,7 @@ function PTDF(
     A::IncidenceMatrix,
     BA::BA_Matrix;
     dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(),
-    linear_solver = "KLU",
+    linear_solver = _default_linear_solver(),
     tol::Float64 = eps(),
 )
     dist_slack_vector = if !(isempty(dist_slack))
diff --git a/src/solver_dispatch.jl b/src/solver_dispatch.jl
index 7a31ba86c..4998d6c97 100644
--- a/src/solver_dispatch.jl
+++ b/src/solver_dispatch.jl
@@ -10,17 +10,20 @@
 # the callback. One factor + one cache per Virtual matrix is what
 # remains — same throughput as the pool variant once the global lock
 # is in place.
+#
+# Apple's libSparse has no documented cross-handle corruption issue like
+# the one `_LIBKLU_LOCK` guards against for libklu; the per-cache
+# `solver_lock` acquired here is sufficient for the `AAFactorCache` backend.
 
 """
     with_solver(f, K, work_ba_col, temp_data, solver_lock) -> result
 
 Acquire `solver_lock`, then invoke `f(K, work_ba_col[1], temp_data[1])`.
-Two overloads: one specialized on `KLULinSolveCache{Float64}` (the KLU
-backend), one generic for any other factorization (`AppleAccelerate.AAFactorization`,
-etc.). Both serialize through `solver_lock`; the per-cache scratch
-slot at index 1 is the only slot — `work_ba_col` and `temp_data` are
-single-element vectors, kept as `Vector{Vector{Float64}}` because
-`_solve_factorization` and the AppleAccelerate extension are typed on
+Three overloads: `KLULinSolveCache{Float64}` (KLU backend), `AAFactorCache`
+(Apple Accelerate backend), and a generic fallback. All serialize through
+`solver_lock`; the per-cache scratch slot at index 1 is the only slot —
+`work_ba_col` and `temp_data` are single-element vectors, kept as
+`Vector{Vector{Float64}}` because `_solve_factorization` is typed on
 `Vector{Float64}` and the two buffers have different lengths
 (`n_buses` vs. `n_buses - n_ref_buses`).
 """
@@ -34,6 +37,16 @@ function with_solver(
     return @lock solver_lock f(K, work_ba_col[1], temp_data[1])
 end
 
+function with_solver(
+    f::F,
+    K::AAFactorCache,
+    work_ba_col::Vector{Vector{Float64}},
+    temp_data::Vector{Vector{Float64}},
+    solver_lock::ReentrantLock,
+) where {F}
+    return @lock solver_lock f(K, work_ba_col[1], temp_data[1])
+end
+
 function with_solver(
     f::F,
     K::KT,
diff --git a/src/virtual_lodf_calculations.jl b/src/virtual_lodf_calculations.jl
index 322c146e2..3417393a4 100644
--- a/src/virtual_lodf_calculations.jl
+++ b/src/virtual_lodf_calculations.jl
@@ -53,6 +53,10 @@ callers can issue requests concurrently; the libklu work runs one at a time.
 - `valid_ix::Vector{Int}`:
         Vector containing the row/columns indices of matrices related the buses
         which are not slack ones.
+- `bus_to_valid_idx::Vector{Int}`:
+        Inverse of `valid_ix`: `bus_to_valid_idx[b]` is the position of bus
+        `b` inside `valid_ix`, or 0 if `b` is a reference bus. Lets the hot
+        path iterate the nonzeros of a `BA` column directly.
 - `temp_data::Vector{Vector{Float64}}`:
         Single-element scratch vector kept as a `Vector{Vector{Float64}}` for
         uniform `with_solver` callback signatures.
@@ -82,6 +86,7 @@ struct VirtualLODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}, K} <:
     axes::Ax
     lookup::L
     valid_ix::Vector{Int}
+    bus_to_valid_idx::Vector{Int}
     temp_data::Vector{Vector{Float64}}
     cache::RowCache
     cache_lock::ReentrantLock
@@ -110,15 +115,6 @@ function Base.show(io::IO, ::MIME{Symbol("text/plain")}, array::VirtualLODF)
     return
 end
 
-# Map bus index (1:n_buses) to its position in `valid_ix`; ref buses map to 0.
-function _build_bus_to_valid_idx(n_buses::Int, valid_ix::Vector{Int})::Vector{Int}
-    bus_to_valid = zeros(Int, n_buses)
-    for (vi, bus_ix) in enumerate(valid_ix)
-        bus_to_valid[bus_ix] = vi
-    end
-    return bus_to_valid
-end
-
 """
     _get_PTDF_A_diag(K, BA, A, ref_bus_positions) -> Vector{Float64}
 
@@ -175,18 +171,21 @@ function _get_PTDF_A_diag(
             ba_col[valid_i] = ba_nz[k]
         end
 
-        _solve_factorization(K, ba_col)
+        # Read PTDF row from the returned buffer — backend-agnostic
+        # (KLU mutates `ba_col` and returns it; other backends may
+        # return a fresh vector, so capture the return value).
+        lin_solve = _solve_factorization(K, ba_col)
 
-        # ba_col is now PTDF row i in valid-index space; H[e,e] = ptdf[from] - ptdf[to].
+        # H[e,e] = ptdf[from] - ptdf[to]; ref-bus entries are 0.
         f = arc_from_valid[i]
         t = arc_to_valid[i]
         v_f = if f > 0
-            ba_col[f]
+            lin_solve[f]
         else
             0.0
         end
         v_t = if t > 0
-            ba_col[t]
+            lin_solve[t]
         else
             0.0
         end
@@ -265,6 +264,9 @@ struct with an empty cache.
         PSY system for which the matrix is constructed
 
 # Keyword Arguments
+- `linear_solver::String = _default_linear_solver()`: Linear solver for the
+        ABA factorization. Options: "KLU", "AppleAccelerateLU". Defaults to
+        "AppleAccelerateLU" on macOS 15.5+ and "KLU" elsewhere.
 - `network_reduction::NetworkReduction`:
         Structure containing the details of the network reduction applied when computing the matrix
 - `kwargs...`:
@@ -273,6 +275,7 @@ struct with an empty cache.
 function VirtualLODF(
     sys::PSY.System;
     dist_slack::Vector{Float64} = Float64[],
+    linear_solver::String = _default_linear_solver(),
     tol::Float64 = eps(),
     max_cache_size::Int = MAX_CACHE_SIZE_MiB,
     persistent_arcs::Vector{Tuple{Int, Int}} = Vector{Tuple{Int, Int}}(),
@@ -282,6 +285,7 @@ function VirtualLODF(
     if length(dist_slack) != 0
         @info "Distributed bus"
     end
+    solver = resolve_linear_solver(linear_solver)
     Ymatrix = Ybus(
         sys;
         network_reductions = network_reductions,
@@ -296,7 +300,7 @@ function VirtualLODF(
     subnetwork_axes = make_arc_arc_subnetwork_axes(A)
     BA = BA_Matrix(Ymatrix)
     ABA = calculate_ABA_matrix(A.data, BA.data, Set(ref_bus_positions))
-    K = klu_factorize(ABA)
+    K = _create_factorization(solver, ABA)
     bus_ax = get_bus_axis(A)
 
     valid_ix = setdiff(1:length(bus_ax), ref_bus_positions)
@@ -324,6 +328,7 @@ function VirtualLODF(
     # Single scratch slot — solves serialize via `solver_lock` + `_LIBKLU_LOCK`.
     temp_data = [zeros(length(bus_ax))]
     work_ba_col = [zeros(length(valid_ix))]
+    bus_to_valid_idx = _build_bus_to_valid_idx(length(bus_ax), valid_ix)
 
     return VirtualLODF(
         K,
@@ -337,6 +342,7 @@ function VirtualLODF(
         axes,
         look_up,
         valid_ix,
+        bus_to_valid_idx,
         temp_data,
         empty_cache,
         ReentrantLock(),
@@ -384,8 +390,17 @@ function _compute_lodf_row(vlodf::VirtualLODF, row::Int)::Vector{Float64}
     return with_solver(
         vlodf.K, vlodf.work_ba_col, vlodf.temp_data, vlodf.solver_lock,
     ) do K_solver, work_ba_col, temp_data
-        @inbounds for i in eachindex(vlodf.valid_ix)
-            work_ba_col[i] = vlodf.BA[vlodf.valid_ix[i], row]
+        # Sparse-only extraction: iterate BA[:, row] non-zeros (typically
+        # 2 per arc) instead of scanning the full bus axis.
+        fill!(work_ba_col, 0.0)
+        BA = vlodf.BA
+        bus_to_valid_idx = vlodf.bus_to_valid_idx
+        ba_rv = SparseArrays.rowvals(BA)
+        ba_nz = SparseArrays.nonzeros(BA)
+        @inbounds for k in SparseArrays.nzrange(BA, row)
+            valid_i = bus_to_valid_idx[ba_rv[k]]
+            valid_i > 0 || continue
+            work_ba_col[valid_i] = ba_nz[k]
         end
         lin_solve = _solve_factorization(K_solver, work_ba_col)
 
@@ -515,9 +530,17 @@ function _getindex_partial(
     return with_solver(
         vlodf.K, vlodf.work_ba_col, vlodf.temp_data, vlodf.solver_lock,
     ) do K_solver, work_ba_col, temp_data
-        # Steps 1-2: Compute B⁻¹(b_e · ν_e) via KLU solve.
-        @inbounds for i in eachindex(vlodf.valid_ix)
-            work_ba_col[i] = vlodf.BA[vlodf.valid_ix[i], arc_idx]
+        # Steps 1-2: Compute B⁻¹(b_e · ν_e) via sparse-only BA-column
+        # extraction + solve.
+        fill!(work_ba_col, 0.0)
+        BA = vlodf.BA
+        bus_to_valid_idx = vlodf.bus_to_valid_idx
+        ba_rv = SparseArrays.rowvals(BA)
+        ba_nz = SparseArrays.nonzeros(BA)
+        @inbounds for k in SparseArrays.nzrange(BA, arc_idx)
+            valid_i = bus_to_valid_idx[ba_rv[k]]
+            valid_i > 0 || continue
+            work_ba_col[valid_i] = ba_nz[k]
         end
         lin_solve = _solve_factorization(K_solver, work_ba_col)
 
diff --git a/src/virtual_modf_calculations.jl b/src/virtual_modf_calculations.jl
index 29c2ba7d5..f11879d11 100644
--- a/src/virtual_modf_calculations.jl
+++ b/src/virtual_modf_calculations.jl
@@ -46,6 +46,10 @@ cache and skips the recomputation.
         Tuple of lookup dictionaries for indexing.
 - `valid_ix::Vector{Int}`:
         Indices of non-reference buses.
+- `bus_to_valid_idx::Vector{Int}`:
+        Inverse of `valid_ix`: `bus_to_valid_idx[b]` is the position of bus
+        `b` inside `valid_ix`, or 0 if `b` is a reference bus. Lets the
+        Woodbury kernel iterate the nonzeros of a `BA` column directly.
 - `contingency_cache::Dict{Base.UUID, ContingencySpec}`:
         Resolved contingencies keyed by outage UUID.
 - `woodbury_cache::Dict{NetworkModification, WoodburyFactors}`:
@@ -86,6 +90,7 @@ struct VirtualMODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}, K} <:
     axes::Ax
     lookup::L
     valid_ix::Vector{Int}
+    bus_to_valid_idx::Vector{Int}
     contingency_cache::Dict{Base.UUID, ContingencySpec}
     woodbury_cache::Dict{NetworkModification, WoodburyFactors}
     row_caches::Dict{NetworkModification, RowCache}
@@ -178,6 +183,7 @@ function _compute_woodbury_factors(
             mat.BA,
             mat.arc_susceptances,
             mat.valid_ix,
+            mat.bus_to_valid_idx,
             modifications,
         )
     end
@@ -201,6 +207,7 @@ function _apply_woodbury_correction(
             mat.BA,
             mat.arc_susceptances,
             mat.valid_ix,
+            mat.bus_to_valid_idx,
             monitored_idx,
             wf,
         )
@@ -250,6 +257,9 @@ Outage supplemental attributes found in the system.
 
 # Keyword Arguments
 - `dist_slack::Vector{Float64}`: Distributed slack weights (default: empty)
+- `linear_solver::String = _default_linear_solver()`: Linear solver for the
+        ABA factorization. Options: "KLU", "AppleAccelerateLU". Defaults to
+        "AppleAccelerateLU" on macOS 15.5+ and "KLU" elsewhere.
 - `tol::Float64`: Tolerance for row sparsification (default: eps())
 - `max_cache_size::Int`: Max cache size in MiB per contingency (default: MAX_CACHE_SIZE_MiB)
 - `network_reductions::Vector{NetworkReduction}`: Network reductions to apply
@@ -257,6 +267,7 @@ Outage supplemental attributes found in the system.
 function VirtualMODF(
     sys::PSY.System;
     dist_slack::Vector{Float64} = Float64[],
+    linear_solver::String = _default_linear_solver(),
     tol::Float64 = eps(),
     max_cache_size::Int = MAX_CACHE_SIZE_MiB,
     network_reductions::Vector{NetworkReduction} = NetworkReduction[],
@@ -266,6 +277,7 @@ function VirtualMODF(
     if length(dist_slack) != 0
         @info "Distributed bus"
     end
+    solver = resolve_linear_solver(linear_solver)
 
     # Build network matrices (same path as VirtualLODF)
     Ymatrix = Ybus(sys; network_reductions = network_reductions, kwargs...)
@@ -282,9 +294,10 @@ function VirtualMODF(
 
     BA = BA_Matrix(Ymatrix)
     ABA = calculate_ABA_matrix(A.data, BA.data, Set(ref_bus_positions))
-    K = klu_factorize(ABA)
+    K = _create_factorization(solver, ABA)
 
     valid_ix = setdiff(1:length(bus_ax), ref_bus_positions)
+    bus_to_valid_idx = _build_bus_to_valid_idx(length(bus_ax), valid_ix)
 
     # Empty: populated lazily on first read of `vmodf.PTDF_A_diag`.
     PTDF_A_diag = Float64[]
@@ -308,6 +321,7 @@ function VirtualMODF(
         axes,
         look_up,
         valid_ix,
+        bus_to_valid_idx,
         Dict{Base.UUID, ContingencySpec}(),
         Dict{NetworkModification, WoodburyFactors}(),
         Dict{NetworkModification, RowCache}(),
diff --git a/src/virtual_ptdf_calculations.jl b/src/virtual_ptdf_calculations.jl
index 89b15df62..d8b3f09d4 100644
--- a/src/virtual_ptdf_calculations.jl
+++ b/src/virtual_ptdf_calculations.jl
@@ -21,8 +21,8 @@ JuMP-side work (in callers) parallelizes freely.
 # Arguments
 - `K`:
         LU factorization of the ABA matrix. A `KLULinSolveCache{Float64}` for
-        the default KLU solver, or an `AppleAccelerate.AAFactorization{Float64}`
-        when the AppleAccelerate extension is loaded.
+        the default KLU solver, or an `AccelerateWrapper.AAFactorCache` when the
+        AppleAccelerate backend is selected on macOS.
 - `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`:
         BA matrix
 - `ref_bus_positions::Set{Int}`:
@@ -48,6 +48,11 @@ JuMP-side work (in callers) parallelizes freely.
 - `valid_ix::Vector{Int}`:
         Vector containing the row/columns indices of matrices related the buses
         which are not slack ones.
+- `bus_to_valid_idx::Vector{Int}`:
+        Inverse of `valid_ix`: `bus_to_valid_idx[b]` is the position of bus
+        `b` inside `valid_ix`, or 0 if `b` is a reference bus. Lets the hot
+        path iterate the nonzeros of a `BA` column instead of scanning the
+        full bus axis.
 - `cache::RowCache`:
         Cache where PTDF rows are stored.
 - `cache_lock::ReentrantLock`:
@@ -80,6 +85,7 @@ struct VirtualPTDF{Ax, L <: NTuple{2, Dict}, K} <:
     lookup::L
     temp_data::Vector{Vector{Float64}}
     valid_ix::Vector{Int}
+    bus_to_valid_idx::Vector{Int}
     cache::RowCache
     cache_lock::ReentrantLock
     subnetwork_axes::Dict{Int, Ax}
@@ -120,8 +126,9 @@ struct with an empty cache.
 - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`:
         Dictionary of weights to be used as distributed slack bus.
         The distributed slack dictionary must have the same number of entries as the number of buses.
-- `linear_solver::String = "KLU"`:
-        Linear solver to use for factorization. Options: "KLU", "AppleAccelerate"
+- `linear_solver::String = _default_linear_solver()`:
+        Linear solver to use for factorization. Options: "KLU", "AppleAccelerateLU".
+        Defaults to "AppleAccelerateLU" on macOS 15.5+ and "KLU" elsewhere.
 - `tol::Float64 = eps()`:
         Tolerance related to sparsification and values to drop.
 - `max_cache_size::Int`:
@@ -136,7 +143,7 @@ struct with an empty cache.
 function VirtualPTDF(
     sys::PSY.System;
     dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(),
-    linear_solver::String = "KLU",
+    linear_solver::String = _default_linear_solver(),
     tol::Float64 = eps(),
     max_cache_size::Int = MAX_CACHE_SIZE_MiB,
     persistent_arcs::Vector{Tuple{Int, Int}} = Vector{Tuple{Int, Int}}(),
@@ -169,11 +176,11 @@ function _create_factorization(
 end
 
 function _create_factorization(
-    ::AppleAccelerateSolver,
+    ::AppleAccelerateLUSolver,
     ABA::SparseArrays.SparseMatrixCSC{Float64, Int},
 )
-    _has_apple_accelerate_ext() || error(_apple_accelerate_install_error())
-    return _create_apple_accelerate_factorization(ABA)
+    _has_apple_accelerate_backend() || error(_apple_accelerate_unavailable_error())
+    return AccelerateWrapper.aa_factorize(ABA)
 end
 
 function _create_factorization(
@@ -181,7 +188,7 @@ function _create_factorization(
     ::SparseArrays.SparseMatrixCSC{Float64, Int},
 )
     return error(
-        "Only KLU and AppleAccelerate solvers are supported for VirtualPTDF factorization.",
+        "Only KLU and AppleAccelerateLU solvers are supported for VirtualPTDF factorization.",
     )
 end
 
@@ -197,8 +204,9 @@ The return is a VirtualPTDF struct with an empty cache.
 - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`:
         Dictionary of weights to be used as distributed slack bus.
         The distributed slack dictionary must have the same number of entries as the number of buses.
-- `linear_solver::String = "KLU"`:
-        Linear solver to use for factorization. Options: "KLU", "AppleAccelerate"
+- `linear_solver::String = _default_linear_solver()`:
+        Linear solver to use for factorization. Options: "KLU", "AppleAccelerateLU".
+        Defaults to "AppleAccelerateLU" on macOS 15.5+ and "KLU" elsewhere.
 - `tol::Float64 = eps()`:
         Tolerance related to sparsification and values to drop.
 - `max_cache_size::Int`:
@@ -209,7 +217,7 @@ The return is a VirtualPTDF struct with an empty cache.
 function VirtualPTDF(
     ybus::Ybus;
     dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(),
-    linear_solver::String = "KLU",
+    linear_solver::String = _default_linear_solver(),
     tol::Float64 = eps(),
     max_cache_size::Int = MAX_CACHE_SIZE_MiB,
     persistent_arcs::Vector{Tuple{Int, Int}} = Vector{Tuple{Int, Int}}(),
@@ -261,6 +269,7 @@ function VirtualPTDF(
     # `Vector{Vector{Float64}}` so `with_solver`'s callback signature
     # stays uniform across solver backends.
     valid_ix = setdiff(1:length(bus_ax), ref_bus_positions)
+    bus_to_valid_idx = _build_bus_to_valid_idx(length(bus_ax), valid_ix)
     temp_data = [zeros(length(bus_ax))]
     work_ba_col = [zeros(length(valid_ix))]
 
@@ -277,6 +286,7 @@ function VirtualPTDF(
         look_up,
         temp_data,
         valid_ix,
+        bus_to_valid_idx,
         empty_cache,
         ReentrantLock(),
         subnetwork_axes,
@@ -321,17 +331,18 @@ if isdefined(Base, :print_array) # 0.7 and later
     Base.print_array(io::IO, X::VirtualPTDF) = "VirtualPTDF"
 end
 
-# Helper function to solve with different factorization types. The
-# `KLULinSolveCache` overload solves in place (zero-allocation hot path);
-# the generic fallback delegates to `\` and is extended by the
-# AppleAccelerate extension for `AAFactorization`.
+# Helper function to solve with different factorization types. Both
+# overloads solve in place (zero-allocation hot path). The KLU and Apple
+# Accelerate backends are the only solvers supported here; adding a new
+# backend requires adding a `_solve_factorization` method for its cache type.
 function _solve_factorization(K::KLULinSolveCache{Float64}, b::Vector{Float64})
     solve!(K, b)
     return b
 end
 
-function _solve_factorization(K, b::Vector{Float64})
-    return K \ b
+function _solve_factorization(K::AAFactorCache, b::Vector{Float64})
+    AccelerateWrapper.solve!(K, b)
+    return b
 end
 
 function _compute_ptdf_row(vptdf::VirtualPTDF, row::Int)::Vector{Float64}
@@ -350,12 +361,23 @@ function _compute_ptdf_row(vptdf::VirtualPTDF, row::Int)::Vector{Float64}
     return with_solver(
         vptdf.K, vptdf.work_ba_col, vptdf.temp_data, vptdf.solver_lock,
     ) do K_solver, work_ba_col, temp_data
-        valid_ix = vptdf.valid_ix
-        @inbounds for i in eachindex(valid_ix)
-            work_ba_col[i] = vptdf.BA[valid_ix[i], row]
+        # Extract BA[:, row] non-zeros into work_ba_col at non-ref-bus
+        # positions. Iterates only the nonzeros of the BA column (typically
+        # 2 per arc) instead of scanning the full bus axis and bisecting
+        # the CSC for each entry.
+        fill!(work_ba_col, 0.0)
+        BA = vptdf.BA
+        bus_to_valid_idx = vptdf.bus_to_valid_idx
+        ba_rv = SparseArrays.rowvals(BA)
+        ba_nz = SparseArrays.nonzeros(BA)
+        @inbounds for k in SparseArrays.nzrange(BA, row)
+            valid_i = bus_to_valid_idx[ba_rv[k]]
+            valid_i > 0 || continue
+            work_ba_col[valid_i] = ba_nz[k]
         end
         lin_solve = _solve_factorization(K_solver, work_ba_col)
         fill!(temp_data, 0.0)
+        valid_ix = vptdf.valid_ix
         @inbounds for i in eachindex(valid_ix)
             temp_data[valid_ix[i]] = lin_solve[i]
         end
diff --git a/src/woodbury_kernel.jl b/src/woodbury_kernel.jl
index ad29ba3b4..7f183e125 100644
--- a/src/woodbury_kernel.jl
+++ b/src/woodbury_kernel.jl
@@ -83,6 +83,7 @@ function _compute_woodbury_factors_impl(
     BA::SparseArrays.SparseMatrixCSC{Float64, Int},
     arc_sus::Vector{Float64},
     valid_ix::Vector{Int},
+    bus_to_valid_idx::Vector{Int},
     modifications::Tuple{Vararg{ArcModification}},
 )::WoodburyFactors
     M = length(modifications)
@@ -97,13 +98,19 @@ function _compute_woodbury_factors_impl(
 
     # Compute Z[:,j] = B⁻¹ν_j for each modified arc
     Z = Matrix{Float64}(undef, n_bus, M)
+    ba_rv_outer = SparseArrays.rowvals(BA)
+    ba_nz_outer = SparseArrays.nonzeros(BA)
 
     for (j, mod) in enumerate(modifications)
         e = mod.arc_index
         b_e = arc_sus[e]
 
-        @inbounds for i in eachindex(valid_ix)
-            work_ba_col[i] = BA[valid_ix[i], e]
+        # Sparse-only extraction of BA[:, e] into work_ba_col.
+        fill!(work_ba_col, 0.0)
+        @inbounds for k in SparseArrays.nzrange(BA, e)
+            valid_i = bus_to_valid_idx[ba_rv_outer[k]]
+            valid_i > 0 || continue
+            work_ba_col[valid_i] = ba_nz_outer[k]
         end
         lin_solve = _solve_factorization(K, work_ba_col)
 
@@ -159,6 +166,7 @@ function _apply_woodbury_correction_impl(
     BA::SparseArrays.SparseMatrixCSC{Float64, Int},
     arc_sus::Vector{Float64},
     valid_ix::Vector{Int},
+    bus_to_valid_idx::Vector{Int},
     monitored_idx::Int,
     wf::WoodburyFactors,
 )::Vector{Float64}
@@ -177,10 +185,15 @@ function _apply_woodbury_correction_impl(
         return zeros(n_bus)
     end
 
-    # z_m = B⁻¹ν_m / b_mon_pre via KLU solve on BA column
+    # z_m = B⁻¹ν_m / b_mon_pre via sparse-only BA-column extraction + solve.
     b_mon_pre = arc_sus[monitored_idx]
-    @inbounds for i in eachindex(valid_ix)
-        work_ba_col[i] = BA[valid_ix[i], monitored_idx]
+    fill!(work_ba_col, 0.0)
+    ba_rv_mon = SparseArrays.rowvals(BA)
+    ba_nz_mon = SparseArrays.nonzeros(BA)
+    @inbounds for k in SparseArrays.nzrange(BA, monitored_idx)
+        valid_i = bus_to_valid_idx[ba_rv_mon[k]]
+        valid_i > 0 || continue
+        work_ba_col[valid_i] = ba_nz_mon[k]
     end
     lin_solve = _solve_factorization(K, work_ba_col)
 
@@ -226,7 +239,8 @@ function _compute_woodbury_factors(
     ) do K_solver, work_ba_col, temp_data
         _compute_woodbury_factors_impl(
             K_solver, work_ba_col, temp_data,
-            mat.BA, mat.arc_susceptances, mat.valid_ix, modifications,
+            mat.BA, mat.arc_susceptances, mat.valid_ix, mat.bus_to_valid_idx,
+            modifications,
         )
     end
 end
@@ -241,7 +255,8 @@ function _apply_woodbury_correction(
     ) do K_solver, work_ba_col, temp_data
         _apply_woodbury_correction_impl(
             K_solver, work_ba_col, temp_data,
-            mat.BA, mat.arc_susceptances, mat.valid_ix, monitored_idx, wf,
+            mat.BA, mat.arc_susceptances, mat.valid_ix, mat.bus_to_valid_idx,
+            monitored_idx, wf,
         )
     end
 end
diff --git a/test/PowerNetworkMatricesTests.jl b/test/PowerNetworkMatricesTests.jl
index b3857fe30..d3036822c 100644
--- a/test/PowerNetworkMatricesTests.jl
+++ b/test/PowerNetworkMatricesTests.jl
@@ -21,7 +21,7 @@ import Aqua
 Aqua.test_unbound_args(PowerNetworkMatrices)
 Aqua.test_undefined_exports(PowerNetworkMatrices)
 Aqua.test_ambiguities(PowerNetworkMatrices)
-Aqua.test_stale_deps(PowerNetworkMatrices; ignore = [:AppleAccelerate, :Pardiso])
+Aqua.test_stale_deps(PowerNetworkMatrices; ignore = [:Pardiso])
 Aqua.test_deps_compat(PowerNetworkMatrices)
 Aqua.find_persistent_tasks_deps(PowerNetworkMatrices)
 Aqua.test_persistent_tasks(PowerNetworkMatrices)
diff --git a/test/runtests.jl b/test/runtests.jl
index 1afa42b7c..ae21ff04f 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,12 +1,11 @@
-# load system-specific packages before PNM: ensures relevant PNM extensions will be loaded.
+# Load system-specific packages before PNM so the relevant PNM extensions
+# are loaded. The Apple Accelerate backend is now built into PNM and does
+# not need a trigger package; only the MKL/Pardiso extension is gated on a
+# `using` call.
 @static if (Sys.ARCH === :x86_64 || Sys.ARCH === :i686) && !Sys.isapple()
     import Pkg
     Pkg.add("Pardiso")
     using Pardiso
-else
-    import Pkg
-    Pkg.add("AppleAccelerate")
-    using AppleAccelerate
 end
 
 using PowerNetworkMatrices
diff --git a/test/test_accelerate_wrapper.jl b/test/test_accelerate_wrapper.jl
new file mode 100644
index 000000000..d4376eb6a
--- /dev/null
+++ b/test/test_accelerate_wrapper.jl
@@ -0,0 +1,245 @@
+import LinearAlgebra
+import SparseArrays
+import Random
+
+const _AA_TEST_SEED = 0xABCDEF
+
+function _random_spd(n::Int; density::Float64 = 0.05, scale::Real = 4)
+    M = SparseArrays.sprandn(n, n, density)
+    return SparseArrays.sparse(M + M' + scale * n * LinearAlgebra.I)
+end
+
+# A general (non-symmetric) real sparse system: diagonally dominant so it
+# is nonsingular but structurally and numerically unsymmetric.
+function _random_unsymmetric(n::Int; density::Float64 = 0.04)
+    M = SparseArrays.sprandn(n, n, density)
+    return SparseArrays.sparse(M + (2 * n) * LinearAlgebra.I)
+end
+
+@testset "AccelerateWrapper: smoke" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 200
+    ABA = _random_spd(n)
+
+    cache = PNM.AccelerateWrapper.aa_factorize(ABA)
+    @test PNM.AccelerateWrapper.is_factored(cache)
+
+    b = randn(n)
+    x = copy(b)
+    PNM.AccelerateWrapper.solve!(cache, x)
+    @test isapprox(ABA * x, b; atol = 1e-9)
+
+    B = SparseArrays.sprandn(n, 120, 0.03)
+    for j in 1:5:120
+        B[:, j] .= 0
+    end
+    SparseArrays.dropzeros!(B)
+    out = zeros(n, 120)
+    PNM.AccelerateWrapper.solve_sparse!(cache, B, out)
+    @test isapprox(out, ABA \ Matrix(B); atol = 1e-9)
+end
+
+@testset "AccelerateWrapper: _create_factorization dispatch" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 200
+    ABA = _random_spd(n)
+
+    K_klu = PNM._create_factorization(PNM.KLUSolver(), ABA)
+    K_aa = PNM._create_factorization(PNM.AppleAccelerateLUSolver(), ABA)
+    @test K_klu isa PNM.KLULinSolveCache{Float64}
+    @test K_aa isa PNM.AAFactorCache
+
+    b_klu = randn(n)
+    b_aa = copy(b_klu)
+    PNM._solve_factorization(K_klu, b_klu)
+    PNM._solve_factorization(K_aa, b_aa)
+    @test isapprox(b_klu, b_aa; atol = 1e-9)
+end
+
+@testset "AccelerateWrapper: with_solver resolves to concrete method" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 200
+    ABA = _random_spd(n)
+    K_aa = PNM._create_factorization(PNM.AppleAccelerateLUSolver(), ABA)
+
+    work = [zeros(n)]
+    tmp = [zeros(n)]
+    lk = ReentrantLock()
+    b = randn(n)
+    copyto!(work[1], b)
+
+    result = PNM.with_solver(K_aa, work, tmp, lk) do K, wba, _td
+        PNM._solve_factorization(K, wba)
+        return copy(wba)
+    end
+    @test isapprox(ABA * result, b; atol = 1e-9)
+
+    m = which(
+        PNM.with_solver,
+        (
+            typeof(identity),
+            PNM.AAFactorCache,
+            Vector{Vector{Float64}},
+            Vector{Vector{Float64}},
+            ReentrantLock,
+        ),
+    )
+    @test Base.unwrap_unionall(m.sig).parameters[3] === PNM.AAFactorCache
+end
+
+@testset "AccelerateWrapper: KLU vs AA per-column solve parity" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 200
+    nbranches = 40
+    ABA = _random_spd(n)
+    BA = SparseArrays.sprandn(n, nbranches, 0.03)
+
+    K_klu = PNM._create_factorization(PNM.KLUSolver(), ABA)
+    K_aa = PNM._create_factorization(PNM.AppleAccelerateLUSolver(), ABA)
+
+    work_klu = zeros(n)
+    work_aa = zeros(n)
+    for col in 1:nbranches
+        work_klu .= Vector(BA[:, col])
+        copyto!(work_aa, work_klu)
+        PNM._solve_factorization(K_klu, work_klu)
+        PNM._solve_factorization(K_aa, work_aa)
+        @test isapprox(work_klu, work_aa; atol = 1e-9)
+    end
+end
+
+@testset "AccelerateWrapper: solve_w_refinement matches direct solve on AA cache" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 60
+    ABA = _random_spd(n)
+    cache = PNM.AccelerateWrapper.aa_factorize(ABA)
+
+    x_true = randn(n)
+    b = ABA * x_true
+    X = PNM.solve_w_refinement(cache, ABA, b)
+    @test isapprox(X, x_true; atol = 1e-10)
+end
+
+@testset "AccelerateWrapper: solve_w_refinement! recovers ill-conditioned AA solve" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED + 1)
+    n = 80
+    # Symmetric ill-conditioned tridiagonal — back-solve leaves residual that
+    # refinement closes. Less pathological than the KLU equivalent because
+    # AA's LU (threshold partial pivoting) keeps a higher effective pivot
+    # floor than KLU's BTF + LU on a nearly-defective tridiagonal.
+    A = SparseArrays.spdiagm(
+        0 => fill(1.0, n),
+        1 => fill(1.0 - 1e-6, n - 1),
+        -1 => fill(1.0 - 1e-6, n - 1),
+    )
+    cache = PNM.AccelerateWrapper.aa_factorize(A)
+
+    x_true = randn(n)
+    b = A * x_true
+    X = zeros(n)
+    PNM.solve_w_refinement!(cache, A, X, b; tol = 1e-12)
+    @test isapprox(X, x_true; atol = 1e-8)
+end
+
+@testset "AccelerateWrapper: solve_w_refinement requires a factored AA cache" begin
+    PNM._has_apple_accelerate_backend() || return
+    n = 12
+    A = SparseArrays.sparse(Float64.(LinearAlgebra.I(n)))
+    cache = PNM.AAFactorCache(A)  # not factored
+    b = randn(n)
+    @test_throws ErrorException PNM.solve_w_refinement(cache, A, b)
+end
+
+@testset "AccelerateWrapper: solve_w_refinement KLU vs AA parity" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED + 2)
+    n = 50
+    A = _random_spd(n)
+    x_true = randn(n)
+    b = A * x_true
+
+    K_klu = PNM.klu_factorize(A)
+    K_aa = PNM.AccelerateWrapper.aa_factorize(A)
+
+    X_klu = PNM.solve_w_refinement(K_klu, A, b)
+    X_aa = PNM.solve_w_refinement(K_aa, A, b)
+    @test isapprox(X_klu, X_aa; atol = 1e-10)
+end
+
+@testset "AccelerateWrapper: LU non-symmetric vs KLU parity" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 220
+    A = _random_unsymmetric(n)
+    @test A != permutedims(A)  # genuinely unsymmetric
+
+    K_klu = PNM._create_factorization(PNM.KLUSolver(), A)
+    K_aa = PNM.AccelerateWrapper.aa_factorize(A)
+    @test PNM.AccelerateWrapper.is_factored(K_aa)
+
+    b = randn(n)
+    x_klu = copy(b)
+    x_aa = copy(b)
+    PNM._solve_factorization(K_klu, x_klu)
+    PNM.AccelerateWrapper.solve!(K_aa, x_aa)
+    @test isapprox(x_aa, x_klu; rtol = 1e-9)
+    @test isapprox(A * x_aa, b; rtol = 1e-9)
+end
+
+@testset "AccelerateWrapper: LU symmetric-ABA regression (solve_sparse!)" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 260
+    ABA = _random_spd(n)  # still works; LU has no symmetry requirement
+
+    K_aa = PNM.AccelerateWrapper.aa_factorize(ABA)
+
+    B = SparseArrays.sprandn(n, 64, 0.03)
+    out_aa = zeros(n, 64)
+    PNM.AccelerateWrapper.solve_sparse!(K_aa, B, out_aa)
+    @test isapprox(out_aa, ABA \ Matrix(B); rtol = 1e-8)
+end
+
+@testset "AccelerateWrapper: LU numeric_refactor! reuse parity" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 180
+    A1 = _random_unsymmetric(n)
+    cache = PNM.AccelerateWrapper.aa_factorize(A1)
+
+    # Same pattern, different values.
+    A2 = SparseArrays.SparseMatrixCSC(
+        n, n, copy(SparseArrays.getcolptr(A1)), copy(SparseArrays.rowvals(A1)),
+        SparseArrays.nonzeros(A1) .+ 0.5,
+    )
+    PNM.AccelerateWrapper.full_refactor!(cache, A2)
+    b = randn(n)
+    x = copy(b)
+    PNM.AccelerateWrapper.solve!(cache, x)
+    @test isapprox(A2 * x, b; rtol = 1e-8)
+end
+
+@testset "AccelerateWrapper: LU singular throws on factor (KLU parity)" begin
+    PNM._has_apple_accelerate_backend() || return
+    n = 50
+    S = SparseArrays.sparse(1.0 * LinearAlgebra.I, n, n)
+    S[n, n] = 0.0  # exactly singular
+    SparseArrays.dropzeros!(S)
+    # Both backends detect singularity at factor time (not solve time), which is
+    # the key parity guarantee. The exception TYPES differ by design (accepted
+    # divergence): KLU follows the LAPACK convention and throws
+    # `SingularException`, while libSparse fires its `reportError` callback with
+    # "Matrix is structurally singular." which our binding surfaces as a plain
+    # `ErrorException`. Callers needing uniform handling must catch both.
+    @test_throws LinearAlgebra.SingularException PNM._create_factorization(
+        PNM.KLUSolver(),
+        S,
+    )
+    @test_throws ErrorException PNM.AccelerateWrapper.aa_factorize(S)
+end
diff --git a/test/test_klu_wrapper.jl b/test/test_klu_wrapper.jl
index a4be6d48c..902c4a088 100644
--- a/test/test_klu_wrapper.jl
+++ b/test/test_klu_wrapper.jl
@@ -1,4 +1,6 @@
 import SparseArrays
+import LinearAlgebra
+import Random
 
 @testset "KLU wrapper: real round-trip and refactor" begin
     n = 50
@@ -232,3 +234,168 @@ end
     alloc_warm = @allocated PNM.solve_sparse!(cache, B, out; block = 32)
     @test alloc_warm < n * size(B, 2) * sizeof(Float64) ÷ 4
 end
+
+# ---------------------------------------------------------------------------
+# Int32 index-type path
+# ---------------------------------------------------------------------------
+
+@testset "KLU wrapper: Int32 real round-trip and refactor" begin
+    n = 50
+    rng_vals = collect(1.0:n)
+    A64 = SparseArrays.spdiagm(0 => rng_vals .+ 1.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    A = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64)
+    x = collect(1.0:n)
+    b = A64 * x
+
+    cache = PNM.klu_factorize(A)
+    @test PNM.is_factored(cache)
+    @test cache isa PNM.KLUWrapper.KLULinSolveCache{Float64, Int32}
+    @test size(cache) == (n, n)
+
+    y = copy(b)
+    PNM.solve!(cache, y)
+    @test isapprox(y, x, atol = 1e-10)
+
+    # Refactor with new values, same pattern.
+    A2_64 = SparseArrays.spdiagm(0 => rng_vals .+ 2.0,
+        1 => fill(0.2, n - 1), -1 => fill(0.2, n - 1))
+    A2 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A2_64)
+    x2 = randn(n)
+    b2 = A2_64 * x2
+    PNM.numeric_refactor!(cache, A2)
+    y2 = copy(b2)
+    PNM.solve!(cache, y2)
+    @test isapprox(y2, x2, atol = 1e-9)
+end
+
+@testset "KLU wrapper: Int32 vs Int64 solves agree bit-for-bit" begin
+    # KLU is deterministic; identical entries and identical permutation give
+    # identical solves regardless of the C entry point's index width.
+    Random.seed!(0xC0FFEE)
+    n = 200
+    A64 = SparseArrays.spdiagm(0 => 4.0 .+ rand(n), 1 => 0.1 .* rand(n - 1),
+        -1 => 0.1 .* rand(n - 1))
+    A32 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64)
+
+    c64 = PNM.klu_factorize(A64)
+    c32 = PNM.klu_factorize(A32)
+    @test c64 isa PNM.KLUWrapper.KLULinSolveCache{Float64, Int64}
+    @test c32 isa PNM.KLUWrapper.KLULinSolveCache{Float64, Int32}
+
+    b = randn(n)
+    y64 = copy(b)
+    y32 = copy(b)
+    PNM.solve!(c64, y64)
+    PNM.solve!(c32, y32)
+    @test y64 == y32   # bit-equal
+end
+
+@testset "KLU wrapper: Int32 solve_sparse! agrees with dense reference" begin
+    Random.seed!(0xBEEF)
+    n = 80
+    A_dense = Matrix(SparseArrays.sprandn(n, n, 0.1)) + 4 * LinearAlgebra.I
+    A = SparseArrays.SparseMatrixCSC{Float64, Int32}(SparseArrays.sparse(A_dense))
+    cache = PNM.klu_factorize(A)
+
+    B_dense = SparseArrays.sprandn(n, 20, 0.1)
+    B = SparseArrays.SparseMatrixCSC{Float64, Int32}(B_dense)
+
+    out = PNM.solve_sparse(cache, B)
+    @test isapprox(out, A_dense \ Matrix(B_dense); atol = 1e-9)
+end
+
+@testset "KLU wrapper: Int32 numeric_refactor! is allocation-free" begin
+    n = 100
+    A64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.05, n - 1), -1 => fill(0.05, n - 1))
+    A = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64)
+    cache = PNM.klu_factorize(A)
+
+    A2_64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.5,
+        1 => fill(0.07, n - 1), -1 => fill(0.07, n - 1))
+    A2 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A2_64)
+
+    # Warm up specialization on the Int32 path.
+    PNM.numeric_refactor!(cache, A2)
+    alloc = @allocated PNM.numeric_refactor!(cache, A2)
+    @test alloc == 0
+end
+
+# ---------------------------------------------------------------------------
+# Performance-knob surface
+# ---------------------------------------------------------------------------
+
+@testset "KLU wrapper: sort_factors!/condest!/rcond! work on both index types" begin
+    n = 60
+    A64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    A32 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64)
+
+    for cache in (PNM.klu_factorize(A64), PNM.klu_factorize(A32))
+        PNM.KLUWrapper.sort_factors!(cache)
+        # Subsequent solve still returns the right answer.
+        b = randn(n)
+        y = copy(b)
+        PNM.solve!(cache, y)
+        @test isapprox(y, A64 \ b, atol = 1e-10)
+
+        c = PNM.KLUWrapper.condest!(cache)
+        r = PNM.KLUWrapper.rcond!(cache)
+        @test c > 0
+        @test 0 < r <= 1.0
+    end
+end
+
+# ---------------------------------------------------------------------------
+# Iterative refinement
+# ---------------------------------------------------------------------------
+
+@testset "KLU wrapper: solve_w_refinement matches direct solve on well-conditioned A" begin
+    Random.seed!(2)
+    n = 40
+    A = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    cache = PNM.klu_factorize(A)
+
+    x_true = randn(n)
+    b = A * x_true
+    X = PNM.solve_w_refinement(cache, A, b)
+    @test isapprox(X, x_true, atol = 1e-12)
+end
+
+@testset "KLU wrapper: solve_w_refinement! recovers ill-conditioned solve" begin
+    # Build a noticeably ill-conditioned upper-bidiagonal matrix so a single
+    # back-solve leaves residual headroom that refinement actually closes.
+    Random.seed!(3)
+    n = 50
+    A = SparseArrays.spdiagm(0 => fill(1.0, n), 1 => fill(1.0 - 1e-10, n - 1))
+    cache = PNM.klu_factorize(A)
+
+    x_true = randn(n)
+    b = A * x_true
+    X = zeros(n)
+    PNM.solve_w_refinement!(cache, A, X, b; tol = 1e-12)
+    @test isapprox(X, x_true, atol = 1e-8)
+end
+
+@testset "KLU wrapper: solve_w_refinement requires a factored cache" begin
+    n = 10
+    A = SparseArrays.spdiagm(0 => 1.0:n)
+    cache = PNM.KLUWrapper.KLULinSolveCache(A)  # not factored
+    b = randn(n)
+    @test_throws ErrorException PNM.solve_w_refinement(cache, A, b)
+end
+
+@testset "KLU wrapper: solve_w_refinement works on Int32 cache" begin
+    Random.seed!(4)
+    n = 30
+    A64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.05, n - 1), -1 => fill(0.05, n - 1))
+    A = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64)
+    cache = PNM.klu_factorize(A)
+    x_true = randn(n)
+    b = A64 * x_true
+    X = PNM.solve_w_refinement(cache, A, b)
+    @test isapprox(X, x_true, atol = 1e-10)
+end
diff --git a/test/test_lodf.jl b/test/test_lodf.jl
index a5a52d614..bc7d4a6a0 100644
--- a/test/test_lodf.jl
+++ b/test/test_lodf.jl
@@ -56,7 +56,7 @@
         @test isapprox(sum(total_error3), 0.0, atol = 1e-3)
     end
 
-    if PowerNetworkMatrices._has_apple_accelerate_ext()
+    if PowerNetworkMatrices._has_apple_accelerate_backend()
         L5NS_from_ptdf4 = LODF(A, P5; linear_solver = "AppleAccelerate")
         @test getindex(L5NS_from_ptdf4, "5", "6") - -0.3071 <= 1e-4
         total_error4 = 0.0
diff --git a/test/test_modf_lodf_reductions.jl b/test/test_modf_lodf_reductions.jl
index 68d577df0..c3ce31f1b 100644
--- a/test/test_modf_lodf_reductions.jl
+++ b/test/test_modf_lodf_reductions.jl
@@ -1,3 +1,68 @@
+"""
+    _clone_transformer_with_name(xfmr, new_name) -> TapTransformer
+
+Clone a TapTransformer with a new name, sharing the same buses and identical
+electrical parameters (R, X, tap, primary_shunt, rating, base_power,
+base_voltage_primary, base_voltage_secondary). The returned object has no
+supplemental attributes and is not attached to any system.
+
+Used to create a second circuit on a bus pair so that `BA_Matrix` treats that
+pair as a 2-circuit parallel group, enabling N-2 parallel-circuit tests.
+"""
+function _clone_transformer_with_name(xfmr::PSY.TapTransformer, new_name::String)
+    old_arc = PSY.get_arc(xfmr)
+    new_arc = PSY.Arc(; from = old_arc.from, to = old_arc.to)
+    return PSY.TapTransformer(;
+        name = new_name,
+        available = PSY.get_available(xfmr),
+        active_power_flow = PSY.get_active_power_flow(xfmr),
+        reactive_power_flow = PSY.get_reactive_power_flow(xfmr),
+        arc = new_arc,
+        r = PSY.get_r(xfmr),
+        x = PSY.get_x(xfmr),
+        primary_shunt = PSY.get_primary_shunt(xfmr),
+        tap = PSY.get_tap(xfmr),
+        rating = PSY.get_rating(xfmr),
+        base_power = PSY.get_base_power(xfmr),
+        base_voltage_primary = PSY.get_base_voltage_primary(xfmr),
+        base_voltage_secondary = PSY.get_base_voltage_secondary(xfmr),
+    )
+end
+
+"""
+    _canonical_arc_key(d, k) -> Tuple{Int,Int}
+
+Return `k` if it is a key of `d`, otherwise return the reversed tuple `(k[2], k[1])`.
+Used when an arc may be stored under either orientation in a lookup dictionary.
+"""
+function _canonical_arc_key(d::AbstractDict, k::Tuple{Int, Int})
+    if haskey(d, k)
+        return k
+    else
+        return (k[2], k[1])
+    end
+end
+
+"""
+    _ptdf_post_row_builder(ptdf_post, pre_bus_axis) -> Function
+
+Return a closure that accepts an `arc_key::Tuple{Int,Int}` and returns the
+corresponding row of `ptdf_post` permuted to match `pre_bus_axis`.  The arc key
+is canonicalised via `_canonical_arc_key` so either orientation is accepted.
+"""
+function _ptdf_post_row_builder(ptdf_post, pre_bus_axis)
+    post_bus_axis = PNM.get_bus_axis(ptdf_post)
+    post_bus_lookup = Dict(bn => i for (i, bn) in enumerate(post_bus_axis))
+    bus_perm = [post_bus_lookup[bn] for bn in pre_bus_axis]
+    post_arc_lookup = Dict{Tuple{Int, Int}, Int}(
+        a => i for (i, a) in enumerate(PNM.get_arc_axis(ptdf_post))
+    )
+    return function (arc_key::Tuple{Int, Int})
+        k = _canonical_arc_key(post_arc_lookup, arc_key)
+        return ptdf_post[post_arc_lookup[k], :][bus_perm]
+    end
+end
+
 """
 Helper: verify the N-1 identity for a single contingency.
 
@@ -282,3 +347,224 @@ end
         _find_two_non_islanding_arcs(vmodf, vlodf, nrd.direct_branch_map)
     verify_modf_n2_lodf_identity(vmodf, vlodf, ptdf, arc_idx1, arc_idx2)
 end
+
+@testset "VirtualMODF parallel-group N-k tests on RTS-GMLC" begin
+    # Fixture shared by all child testsets: RTS-GMLC plus "A14_b", a clone of the
+    # A14 transformer (bus 109 → 111), so that arc carries two parallel circuits.
+    sys = PSB.build_system(PSB.PSITestSystems, "test_RTS_GMLC_sys")
+    a14_original = PSY.get_component(PSY.TapTransformer, sys, "A14")
+    a14_twin = _clone_transformer_with_name(a14_original, "A14_b")
+    PSY.add_component!(sys, a14_twin)
+
+    # The parallel-group arc may be keyed as (109, 111) or (111, 109) in the
+    # reduction maps, so its orientation is resolved once up front.  The
+    # VirtualMODF built here is a throwaway used only for that query; every child
+    # testset constructs its own vmodf, keeping contingency caches isolated.
+    probe_nrd = get_network_reduction_data(VirtualMODF(sys))
+    fwd_key = (109, 111)
+    rev_key = (111, 109)
+    par_key = haskey(probe_nrd.parallel_branch_map, fwd_key) ? fwd_key : rev_key
+
+    @testset "N-2 trip of one parallel transformer + a line" begin
+        vmodf = VirtualMODF(sys)
+        nrd = get_network_reduction_data(vmodf)
+
+        @test haskey(nrd.parallel_branch_map, par_key)
+        @test length(nrd.parallel_branch_map[par_key].branches) == 2
+
+        # N-2 partner line: A18 (bus 111 → 113), selected as the line incident on
+        # bus 111 with the largest |LODF| coupling with the parallel-group arc
+        # (|LODF| ≈ 0.428 vs A19's 0.231).  Confirmed non-islanding by building
+        # PTDF(sys_post) without error.
+        #
+        # Selection probe (run once, result baked in):
+        #   vlodf = VirtualLODF(sys)
+        #   |vlodf[(111,113), par_arc_idx]| ≈ 0.4285  <- largest
+        #   |vlodf[(111,114), par_arc_idx]| ≈ 0.2310
+        sel_line_name = "A18"
+
+        A14_b_branch = PSY.get_component(PSY.TapTransformer, sys, "A14_b")
+        line_branch = PSY.get_component(PSY.Line, sys, sel_line_name)
+
+        mod_a14b = NetworkModification(vmodf, A14_b_branch)
+        mod_a18 = NetworkModification(vmodf, line_branch)
+
+        n2_mods = vcat(
+            collect(mod_a14b.arc_modifications),
+            collect(mod_a18.arc_modifications),
+        )
+        n2_mod = NetworkModification("n2_parallel_xfmr_plus_line", n2_mods)
+
+        ctg_uuid = Base.UUID(UInt128(515151))
+        ctg = ContingencySpec(ctg_uuid, n2_mod)
+        vmodf.contingency_cache[ctg_uuid] = ctg
+        # Ground truth: PTDF rebuilt on a copy of the system without both branches.
+        sys_post = deepcopy(sys)
+        PSY.remove_component!(
+            sys_post,
+            PSY.get_component(PSY.TapTransformer, sys_post, "A14_b"),
+        )
+        PSY.remove_component!(
+            sys_post,
+            PSY.get_component(PSY.Line, sys_post, sel_line_name),
+        )
+        ptdf_post = PTDF(sys_post)
+
+        pre_bus_axis = collect(PNM.get_bus_axis(vmodf))
+        ptdf_post_row = _ptdf_post_row_builder(ptdf_post, pre_bus_axis)
+
+        # One nested testset per monitored arc so a failure names the arc.
+        atol = 1e-6
+        for (label, arc_key) in [
+            ("parallel pair (109,111)", par_key),
+            ("A19 other 230kV off bus 111", (111, 114)),
+            ("A12-1 138kV off bus 109", (108, 109)),
+            ("B8 area-200", (204, 209)),
+            ("C5 area-300", (302, 306)),
+        ]
+            @testset "$label" begin
+                mon_idx = PNM.get_arc_lookup(vmodf)[arc_key]
+                modf_row = vmodf[mon_idx, ctg]
+                @test isapprox(modf_row, ptdf_post_row(arc_key); atol = atol)
+            end
+        end
+    end
+
+    @testset "N-3 trip of one parallel transformer + two lines" begin
+        # Outage set {A14_b, A18, A19}: one circuit of the parallel transformer
+        # pair plus the two lines leaving bus 111 (the area-100 hub).
+        #   A18 (bus 111 -> 113): |LODF| ≈ 0.428 with the parallel-group arc.
+        #   A19 (bus 111 -> 114): |LODF| ≈ 0.231 with the parallel-group arc.
+        # PTDF(sys_post) was confirmed non-islanding (size (73, 106)).
+        vmodf = VirtualMODF(sys)
+        nrd = get_network_reduction_data(vmodf)
+
+        @test haskey(nrd.parallel_branch_map, par_key)
+        @test length(nrd.parallel_branch_map[par_key].branches) == 2
+
+        outaged = [
+            (PSY.TapTransformer, "A14_b"),
+            (PSY.Line, "A18"),
+            (PSY.Line, "A19"),
+        ]
+
+        # One single-branch modification per outaged device, merged in list order
+        # into a single N-3 modification.
+        branches = [PSY.get_component(T, sys, branch_name) for (T, branch_name) in outaged]
+        single_mods = [NetworkModification(vmodf, br) for br in branches]
+        n3_mods = vcat((collect(m.arc_modifications) for m in single_mods)...)
+        n3_mod = NetworkModification("n3_parallel_xfmr_plus_two_lines", n3_mods)
+
+        # Store the contingency spec in vmodf's cache under a fixed UUID.
+        ctg_uuid = Base.UUID(UInt128(515251))
+        ctg = ContingencySpec(ctg_uuid, n3_mod)
+        vmodf.contingency_cache[ctg_uuid] = ctg
+
+        # Reference: PTDF of a copy of the system with the three devices removed.
+        sys_post = deepcopy(sys)
+        for (T, branch_name) in outaged
+            PSY.remove_component!(sys_post, PSY.get_component(T, sys_post, branch_name))
+        end
+        ptdf_post = PTDF(sys_post)
+
+        # Row accessor returning post-contingency PTDF rows in vmodf's bus order.
+        pre_bus_axis = collect(PNM.get_bus_axis(vmodf))
+        ptdf_post_row = _ptdf_post_row_builder(ptdf_post, pre_bus_axis)
+        # Arc positions on the post axis; consulted only by the presence guard below.
+        post_arc_pos = Dict{Tuple{Int, Int}, Int}(
+            arc => pos for (pos, arc) in enumerate(PNM.get_arc_axis(ptdf_post))
+        )
+
+        # (111, 114) is A19, which is part of the outage and therefore absent
+        # from sys_post, so it is not monitored here.
+        atol = 1e-6
+        monitored = [
+            ("parallel pair (109,111)", par_key),
+            ("A20 230kV hub (112,113)", (112, 113)),
+            ("A12-1 138kV off bus 109 (108,109)", (108, 109)),
+            ("B8 area-200 (204,209)", (204, 209)),
+            ("C5 area-300 (302,306)", (302, 306)),
+        ]
+        for (label, arc_key) in monitored
+            flipped = (arc_key[2], arc_key[1])
+            if !(haskey(post_arc_pos, arc_key) || haskey(post_arc_pos, flipped))
+                @info "Skipping $label: arc $arc_key not present in sys_post"
+                continue
+            end
+            mon_idx = PNM.get_arc_lookup(vmodf)[arc_key]
+            modf_row = vmodf[mon_idx, ctg]
+            @test isapprox(modf_row, ptdf_post_row(arc_key); atol = atol)
+        end
+    end
+
+    @testset "N-3 trip of three regular lines (Woodbury M=3)" begin
+        # N-3 selection: {A23, B20, C20} — three PSY.Line instances from three
+        # different areas (area 1, 2, 3) so the Woodbury W matrix is genuinely
+        # M=3 with full rank.  None is in a parallel group or incident on a bus
+        # whose removal would island the network.  PTDF(sys_post) confirmed
+        # non-islanding (size (73, 105) after removing three branches).
+        #   A23: area 1, bus 114 -> 116, 230 kV
+        #   B20: area 2, bus 212 -> 213, 230 kV
+        #   C20: area 3, bus 312 -> 313, 230 kV
+        vmodf = VirtualMODF(sys)
+
+        a23_branch = PSY.get_component(PSY.Line, sys, "A23")
+        b20_branch = PSY.get_component(PSY.Line, sys, "B20")
+        c20_branch = PSY.get_component(PSY.Line, sys, "C20")
+
+        mod_a23 = NetworkModification(vmodf, a23_branch)
+        mod_b20 = NetworkModification(vmodf, b20_branch)
+        mod_c20 = NetworkModification(vmodf, c20_branch)
+
+        n3_mods = vcat(
+            collect(mod_a23.arc_modifications),
+            collect(mod_b20.arc_modifications),
+            collect(mod_c20.arc_modifications),
+        )
+        n3_mod = NetworkModification("n3_three_regular_lines", n3_mods)
+
+        ctg_uuid = Base.UUID(UInt128(515252))
+        ctg = ContingencySpec(ctg_uuid, n3_mod)
+        vmodf.contingency_cache[ctg_uuid] = ctg
+
+        # Ground truth: PTDF rebuilt on a copy of the system without the three lines.
+        sys_post = deepcopy(sys)
+        PSY.remove_component!(sys_post, PSY.get_component(PSY.Line, sys_post, "A23"))
+        PSY.remove_component!(sys_post, PSY.get_component(PSY.Line, sys_post, "B20"))
+        PSY.remove_component!(sys_post, PSY.get_component(PSY.Line, sys_post, "C20"))
+        ptdf_post = PTDF(sys_post)
+
+        pre_bus_axis = collect(PNM.get_bus_axis(vmodf))
+        ptdf_post_row = _ptdf_post_row_builder(ptdf_post, pre_bus_axis)
+        post_arc_lookup = Dict{Tuple{Int, Int}, Int}(
+            a => i for (i, a) in enumerate(PNM.get_arc_axis(ptdf_post))
+        )
+
+        # Monitored arcs:
+        #   1. (116, 119) A28: adjacent to A23's to-bus (116), area 1.
+        #   2. (212, 223) B21: adjacent to B20's from-bus (212), area 2.
+        #   3. (312, 323) C21: adjacent to C20's from-bus (312), area 3.
+        #   4. (109, 111) parallel pair: sanity check that the parallel-augmented
+        #      system still behaves correctly when A23/B20/C20 are not part of the N-3.
+        #   5. (204, 209) B8: unrelated area-200 line.
+        #   6. (302, 306) C5: unrelated area-300 line.
+        atol = 1e-6
+        for (label, arc_key) in [
+            ("A28 adj A23 to-bus 116 (116,119)", (116, 119)),
+            ("B21 adj B20 from-bus 212 (212,223)", (212, 223)),
+            ("C21 adj C20 from-bus 312 (312,323)", (312, 323)),
+            ("parallel pair sanity (109,111)", par_key),
+            ("B8 area-200 (204,209)", (204, 209)),
+            ("C5 area-300 (302,306)", (302, 306)),
+        ]
+            if !haskey(post_arc_lookup, arc_key) &&
+               !haskey(post_arc_lookup, (arc_key[2], arc_key[1]))
+                @info "Skipping $label: arc $arc_key not present in sys_post"
+                continue
+            end
+            mon_idx = PNM.get_arc_lookup(vmodf)[arc_key]
+            modf_row = vmodf[mon_idx, ctg]
+            @test isapprox(modf_row, ptdf_post_row(arc_key); atol = atol)
+        end
+    end
+end
diff --git a/test/test_network_modification.jl b/test/test_network_modification.jl
index 7bdb278ad..2b158fd4a 100644
--- a/test/test_network_modification.jl
+++ b/test/test_network_modification.jl
@@ -258,8 +258,8 @@ end
     # serial baseline. The KLU path is exercised by the threaded testsets in
     # `test/test_virtual_modf.jl`; this complements that coverage on the
     # AppleAccelerate path.
-    if !PowerNetworkMatrices._has_apple_accelerate_ext()
-        @info "Skipping: AppleAccelerate extension not loaded."
+    if !PowerNetworkMatrices._has_apple_accelerate_backend()
+        @info "Skipping: AppleAccelerate backend not available on this platform."
         return
     end
     if Threads.nthreads() < 2
diff --git a/test/test_partial_lodf.jl b/test/test_partial_lodf.jl
index 53363593f..e34eb8d0a 100644
--- a/test/test_partial_lodf.jl
+++ b/test/test_partial_lodf.jl
@@ -89,7 +89,9 @@ end
 
 @testset "Partial LODF: half-susceptance matches rebuilt ground truth" begin
     sys5 = PSB.build_system(PSB.PSITestSystems, "c_sys5")
-    vlodf = VirtualLODF(sys5)
+    # Pin KLU here: the ground-truth path below calls `PNM.solve!` directly on
+    # `vlodf.K`, which is dispatched on `KLULinSolveCache` only.
+    vlodf = VirtualLODF(sys5; linear_solver = "KLU")
     e = 1  # test arc index
     b_e = vlodf.arc_susceptances[e]
 
diff --git a/test/test_powerflow_matrix_types.jl b/test/test_powerflow_matrix_types.jl
index 870f650c5..33e634208 100644
--- a/test/test_powerflow_matrix_types.jl
+++ b/test/test_powerflow_matrix_types.jl
@@ -2,7 +2,7 @@
     sys = PSB.build_system(PSB.PSITestSystems, "c_sys5")
 
     @testset "DC_ABA_Matrix_Factorized" begin
-        if PNM._has_apple_accelerate_ext()
+        if PNM._has_apple_accelerate_backend()
             M = ABA_Matrix(sys; factorize = true)
             @test M isa PNM.DC_ABA_Matrix_Factorized
         end
@@ -14,7 +14,7 @@
     end
 
     @testset "DC_PTDF_Matrix" begin
-        if PNM._has_apple_accelerate_ext()
+        if PNM._has_apple_accelerate_backend()
             @test PTDF(sys; linear_solver = "AppleAccelerate") isa PNM.DC_PTDF_Matrix
         end
         @test PTDF(sys; linear_solver = "KLU") isa PNM.DC_PTDF_Matrix
@@ -22,7 +22,7 @@
     end
 
     @testset "DC_vPTDF_Matrix" begin
-        if PNM._has_apple_accelerate_ext()
+        if PNM._has_apple_accelerate_backend()
             @test VirtualPTDF(sys; linear_solver = "AppleAccelerate") isa
                   PNM.DC_vPTDF_Matrix
         end
diff --git a/test/test_ptdf.jl b/test/test_ptdf.jl
index 749cbf7a0..fb427491f 100644
--- a/test/test_ptdf.jl
+++ b/test/test_ptdf.jl
@@ -8,8 +8,8 @@
         @info "Skipped MKLPardiso tests (extension not loaded)"
         continue
     end
-    if !PowerNetworkMatrices._has_apple_accelerate_ext() && solver == "AppleAccelerate"
-        @info "Skipped AppleAccelerate tests (extension not loaded)"
+    if !PowerNetworkMatrices._has_apple_accelerate_backend() && solver == "AppleAccelerate"
+        @info "Skipped AppleAccelerate tests (backend unavailable on this platform)"
         continue
     end
     sys5 = PSB.build_system(PSB.PSITestSystems, "c_sys5")
diff --git a/test/test_virtual_lodf.jl b/test/test_virtual_lodf.jl
index 7301f3b70..7c2a9bd90 100644
--- a/test/test_virtual_lodf.jl
+++ b/test/test_virtual_lodf.jl
@@ -148,8 +148,36 @@ end
     @test test_value
 end
 
+@testset "Test Virtual LODF with Apple Accelerate" begin
+    if PNM._has_apple_accelerate_backend()
+        sys = PSB.build_system(PSB.PSITestSystems, "c_sys14")
+
+        # Same system built once per backend; rows are compared against KLU.
+        vlodf_aa = VirtualLODF(sys; linear_solver = "AppleAccelerate")
+        vlodf_klu = VirtualLODF(sys; linear_solver = "KLU")
+        @test contains(string(typeof(vlodf_aa.K)), "AAFactorCache")
+        @test vlodf_klu.K isa PNM.KLULinSolveCache{Float64}
+
+        # Every LODF row must agree between the two backends.
+        monitored_arcs = PNM.get_arc_axis(vlodf_aa)
+        @test monitored_arcs == PNM.get_arc_axis(vlodf_klu)
+        for arc in monitored_arcs
+            aa_row, klu_row = vlodf_aa[arc, :], vlodf_klu[arc, :]
+            @test isapprox(aa_row, klu_row, atol = 1e-9)
+        end
+
+        # With no `linear_solver` given, the default is expected to resolve to
+        # AppleAccelerate whenever the backend is available.
+        @test contains(string(typeof(VirtualLODF(sys).K)), "AAFactorCache")
+    else
+        @info "Skipped AppleAccelerate VirtualLODF tests (backend unavailable on this platform)"
+    end
+end
+
 @testset "_get_PTDF_A_diag: matches reference implementation" begin
     # Reference: full-bus dot product. Slow, unambiguously correct.
+    # Captures the return value of `_solve_factorization` to stay
+    # backend-agnostic (mirrors the fast path).
     function _reference_ptdf_a_diag(K, BA, A, ref_bus_positions::Set{Int})
         n_branches = size(BA, 2)
         n_buses = size(BA, 1)
@@ -164,10 +192,10 @@ end
                 bus_idx = valid_ix[idx]
                 ba_col[idx] = BA[bus_idx, i]
             end
-            PNM._solve_factorization(K, ba_col)
+            lin_solve = PNM._solve_factorization(K, ba_col)
             fill!(ptdf_row, 0.0)
             for idx in 1:n_valid
-                ptdf_row[valid_ix[idx]] = ba_col[idx]
+                ptdf_row[valid_ix[idx]] = lin_solve[idx]
             end
             for j in 1:n_buses
                 diag_[i] += ptdf_row[j] * A[i, j]
@@ -176,16 +204,24 @@ end
         return diag_
     end
 
+    backends = String["KLU"]
+    if PNM._has_apple_accelerate_backend()
+        push!(backends, "AppleAccelerate")
+    end
+
     for case in ("c_sys5", "c_sys14")
         sys = PSB.build_system(PSB.PSITestSystems, case)
         A = PNM.IncidenceMatrix(sys)
         BA = PNM.BA_Matrix(sys)
         ref_pos = Set(PNM.get_ref_bus_position(A))
         ABA = PNM.calculate_ABA_matrix(A.data, BA.data, ref_pos)
-        K = PNM.klu_factorize(ABA)
 
-        fast = PNM._get_PTDF_A_diag(K, BA.data, A.data, ref_pos)
-        ref = _reference_ptdf_a_diag(K, BA.data, A.data, ref_pos)
-        @test fast ≈ ref atol = 1e-12 rtol = 0
+        for solver_name in backends
+            solver = PNM.resolve_linear_solver(solver_name)
+            K = PNM._create_factorization(solver, ABA)
+            fast = PNM._get_PTDF_A_diag(K, BA.data, A.data, ref_pos)
+            ref = _reference_ptdf_a_diag(K, BA.data, A.data, ref_pos)
+            @test fast ≈ ref atol = 1e-12 rtol = 0
+        end
     end
 end
diff --git a/test/test_virtual_modf.jl b/test/test_virtual_modf.jl
index 40f4dbbfd..3c0dfc254 100644
--- a/test/test_virtual_modf.jl
+++ b/test/test_virtual_modf.jl
@@ -436,6 +436,39 @@ end
     end
 end
 
+@testset "VirtualMODF with Apple Accelerate backend matches KLU" begin
+    if PNM._has_apple_accelerate_backend()
+        sys, _ = _build_c_sys14_with_outages()
+
+        vmodf_aa = VirtualMODF(sys; linear_solver = "AppleAccelerate")
+        vmodf_klu = VirtualMODF(sys; linear_solver = "KLU")
+
+        # Each build must hold the factorization cache of its own backend.
+        @test contains(string(typeof(vmodf_aa.K)), "AAFactorCache")
+        @test vmodf_klu.K isa PNM.KLULinSolveCache{Float64}
+
+        # Both builds must expose the same, non-empty set of contingency UUIDs.
+        ctgs_aa = get_registered_contingencies(vmodf_aa)
+        ctgs_klu = get_registered_contingencies(vmodf_klu)
+        @test !isempty(ctgs_aa)
+        @test keys(ctgs_aa) == keys(ctgs_klu)
+
+        # Row-by-row parity: for every registered contingency, each monitored
+        # arc's post-contingency row must match the KLU build.
+        monitored_arcs = PNM.get_arc_axis(vmodf_aa)
+        @test monitored_arcs == PNM.get_arc_axis(vmodf_klu)
+        for (uuid, ctg_aa) in ctgs_aa
+            ctg_klu = ctgs_klu[uuid]
+            for arc in monitored_arcs
+                aa_row, klu_row = vmodf_aa[arc, ctg_aa], vmodf_klu[arc, ctg_klu]
+                @test isapprox(aa_row, klu_row, atol = 1e-9)
+            end
+        end
+    else
+        @info "Skipped AppleAccelerate VirtualMODF tests (backend unavailable on this platform)"
+    end
+end
+
 @testset "VirtualMODF concurrent getindex on the SAME (arc, ctg) is consistent" begin
     # Complements the previous testset: there, each (arc, ctg) pair appears
     # once in the work list, so only the `woodbury_cache` first-call race is
@@ -646,3 +679,37 @@ end
 
     PNM.clear_caches!(vmodf)
 end
+
+@testset "VirtualMODF: KLU and AppleAccelerate backend parity" begin
+    if !PNM._has_apple_accelerate_backend()
+        @info "Skipped VirtualMODF AA/KLU parity (backend unavailable on this platform)"
+        return
+    end
+
+    sys = PSB.build_system(PSB.PSITestSystems, "c_sys5")
+    vmodf_klu = VirtualMODF(sys; linear_solver = "KLU")
+    vmodf_aa = VirtualMODF(sys; linear_solver = "AppleAccelerate")
+
+    # Each build must hold the factorization cache of its own backend.
+    @test vmodf_klu.K isa PNM.KLULinSolveCache{Float64}
+    @test contains(string(typeof(vmodf_aa.K)), "AAFactorCache")
+
+    # Reading `PTDF_A_diag` triggers its lazy computation on each backend.
+    diag_klu, diag_aa = vmodf_klu.PTDF_A_diag, vmodf_aa.PTDF_A_diag
+    @test length(diag_klu) == length(diag_aa)
+    @test isapprox(diag_aa, diag_klu, atol = 1e-9)
+
+    # Register one N-1 contingency, `ArcModification(1, -b_1)`, on both builds.
+    arc_out = 1
+    b_out = vmodf_klu.arc_susceptances[arc_out]
+    outage = NetworkModification("aa_parity_outage", [ArcModification(arc_out, -b_out)])
+    ctg_uuid = Base.UUID(UInt128(424242))
+    ctg = ContingencySpec(ctg_uuid, outage)
+    for vm in (vmodf_klu, vmodf_aa)
+        vm.contingency_cache[ctg_uuid] = ctg
+    end
+
+    # Compare the MODF row of monitored arc 2 under that contingency.
+    row_klu, row_aa = vmodf_klu[2, ctg], vmodf_aa[2, ctg]
+    @test isapprox(row_aa, row_klu, atol = 1e-9)
+end
diff --git a/test/test_virtual_ptdf.jl b/test/test_virtual_ptdf.jl
index 5993d4d03..83d7ebb9e 100644
--- a/test/test_virtual_ptdf.jl
+++ b/test/test_virtual_ptdf.jl
@@ -1,8 +1,8 @@
 # if it fails, we don't want the terminal to be flooded with errors, therefore failfast=true
 @testset "Test Virtual PTDF matrices" for solver in
                                           ("KLU", "AppleAccelerate")
-    if !PowerNetworkMatrices._has_apple_accelerate_ext() && solver == "AppleAccelerate"
-        @info "Skipped AppleAccelerate tests (extension not loaded)"
+    if !PowerNetworkMatrices._has_apple_accelerate_backend() && solver == "AppleAccelerate"
+        @info "Skipped AppleAccelerate tests (backend unavailable on this platform)"
         continue
     end
     sys = PSB.build_system(
@@ -185,8 +185,8 @@ end
 end
 
 @testset "Test Virtual PTDF with Apple Accelerate" begin
-    if !PowerNetworkMatrices._has_apple_accelerate_ext()
-        @info "Skipped AppleAccelerate tests (extension not loaded)"
+    if !PowerNetworkMatrices._has_apple_accelerate_backend()
+        @info "Skipped AppleAccelerate tests (backend unavailable on this platform)"
     else
         # Test VirtualPTDF with Apple Accelerate solver
         sys = PSB.build_system(PSB.PSITestSystems, "c_sys14")
@@ -194,8 +194,8 @@ end
         # Create VirtualPTDF with AppleAccelerate
         vptdf_aa = VirtualPTDF(sys; linear_solver = "AppleAccelerate")
 
-        # Verify the factorization type is AAFactorization
-        @test contains(string(typeof(vptdf_aa.K)), "AAFactorization")
+        # Verify the factorization type is AAFactorCache
+        @test contains(string(typeof(vptdf_aa.K)), "AAFactorCache")
 
         # Create reference VirtualPTDF with KLU
         vptdf_klu = VirtualPTDF(sys; linear_solver = "KLU")
@@ -209,7 +209,7 @@ end
 
         # Test with tolerance
         vptdf_aa_tol = VirtualPTDF(sys; linear_solver = "AppleAccelerate", tol = 1e-2)
-        @test contains(string(typeof(vptdf_aa_tol.K)), "AAFactorization")
+        @test contains(string(typeof(vptdf_aa_tol.K)), "AAFactorCache")
 
         # Test with distributed slack
         buscount = length(PSY.get_available_components(PSY.ACBus, sys))
@@ -217,7 +217,7 @@ end
         dist_slack = Dict(i => dist_slack_factor for i in 1:buscount)
         vptdf_aa_slack =
             VirtualPTDF(sys; linear_solver = "AppleAccelerate", dist_slack = dist_slack)
-        @test contains(string(typeof(vptdf_aa_slack.K)), "AAFactorization")
+        @test contains(string(typeof(vptdf_aa_slack.K)), "AAFactorCache")
 
         # Compare with KLU for distributed slack case
         vptdf_klu_slack = VirtualPTDF(sys; linear_solver = "KLU", dist_slack = dist_slack)