diff --git a/.claude/Sienna.md b/.claude/Sienna.md
index b3a33b9d6..3afd7658c 100644
--- a/.claude/Sienna.md
+++ b/.claude/Sienna.md
@@ -72,6 +72,10 @@ Key rules:
   - Globals: `UPPER_CASE` for constants
   - Exports: all exports in main module file
   - Comments: complete sentences, describe why not how
+  - Nothing checks: use `isnothing(x)` / `!isnothing(x)`. Do **not** use the older `x === nothing` / `x !== nothing` patterns.
+  - Type checks: prefer multiple dispatch over `isa` (the only acceptable use of `isa` is filtering inside a `catch` block, where dispatch is unavailable).
+  - Conditionals: prefer `if/else` over the ternary `? :` operator for readability, especially in multi-line expressions.
+  - Cache lookups: `get!(dict, key) do ... end` (the closure form is **lazy** — only evaluates on a miss). Never use the 3-arg `get!(dict, key, default)` when `default` is expensive: Julia evaluates function arguments eagerly, so `default` runs on every call (including cache hits) and silently defeats the cache.
 
 ## Documentation Practices and Requirements
 
diff --git a/.github/workflows/main-tests.yml b/.github/workflows/main-tests.yml
index 3c71caf07..3dfd70b4c 100644
--- a/.github/workflows/main-tests.yml
+++ b/.github/workflows/main-tests.yml
@@ -31,6 +31,7 @@ jobs:
         continue-on-error: ${{ matrix.julia-version == 'nightly' }}
         env:
           PYTHON: ""
+          JULIA_NUM_THREADS: "2"
       - uses: julia-actions/julia-processcoverage@v1
         if: matrix.julia-version != 'nightly'
         with:
diff --git a/.github/workflows/pr_testing.yml b/.github/workflows/pr_testing.yml
index ab366d5ee..24b7fb7de 100644
--- a/.github/workflows/pr_testing.yml
+++ b/.github/workflows/pr_testing.yml
@@ -26,6 +26,7 @@ jobs:
       - uses: julia-actions/julia-runtest@latest
         env:
           PYTHON: ""
+          JULIA_NUM_THREADS: "2"
       - uses: julia-actions/julia-processcoverage@v1
         with:
           directories: src,ext
diff --git a/Project.toml b/Project.toml
index ff013f66e..fcd1eee47 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "PowerNetworkMatrices"
 uuid = "bed98974-b02a-5e2f-9fe0-a103f5c450dd"
-version = "0.20.0"
+version = "0.21.0"
 authors = ["SIENNA TEAM"]
 
 [deps]
@@ -8,11 +8,11 @@ DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
 InfrastructureSystems = "2cd47ed4-ca9b-11e9-27f2-ab636a7671f1"
-KLU = "ef3ab10e-7fda-4108-b977-705223b18434"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 PowerSystems = "bcd98974-b02a-5e2f-9ee0-a103f5c450dd"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+SuiteSparse_jll = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
 
 [weakdeps]
 AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924"
@@ -28,10 +28,10 @@ DataStructures = "^0.19"
 DocStringExtensions = "~0.8, ~0.9"
 HDF5 = "0.17"
 InfrastructureSystems = "3"
-KLU = "^0.6"
 LinearAlgebra = "1"
 Pardiso = "1"
-PowerSystems = "^5.7"
+PowerSystems = "^5.10"
 Preferences = "^1.5"
 SparseArrays = "1"
+SuiteSparse_jll = "5, 6, 7"
 julia = "^1.10"
diff --git a/docs/make.jl b/docs/make.jl
index ffbd7cf6f..93ce16d41 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -26,6 +26,7 @@ pages = OrderedDict(
     "Reference" => Any[
         "Matrix Overview" => "reference/network_matrices_overview.md",
         "Public API" => "reference/public.md",
+        "Internals" => "reference/internals.md",
     ],
 )
 
diff --git a/docs/src/reference/internals.md b/docs/src/reference/internals.md
new file mode 100644
index 000000000..cf5b63078
--- /dev/null
+++ b/docs/src/reference/internals.md
@@ -0,0 +1,16 @@
+# Internals
+
+The symbols documented on this page are **internal** to `PowerNetworkMatrices` and are
+not part of the public API. They are documented here so that the published manual covers
+every docstring shipped with the package, but they may change at any time without notice
+and should not be relied on by downstream packages.
+
+## `KLUWrapper`
+
+`PowerNetworkMatrices.KLUWrapper` is a thin, allocation-aware wrapper over `libklu`
+(provided by `SuiteSparse_jll`) used internally for sparse linear solves. None of these
+symbols are exported from `PowerNetworkMatrices`.
+
+```@autodocs
+Modules = [PowerNetworkMatrices.KLUWrapper]
+```
diff --git a/src/BA_ABA_matrices.jl b/src/BA_ABA_matrices.jl
index 0493935eb..0cec17048 100644
--- a/src/BA_ABA_matrices.jl
+++ b/src/BA_ABA_matrices.jl
@@ -159,7 +159,7 @@ power flow analysis, sensitivity calculations, and linear power system studies.
         Mapping from reference bus numbers to their corresponding subnetwork axes
 - `ref_bus_position::Vector{Int}`:
         Vector containing the original indices of reference buses before matrix reduction
-- `K::F <: Union{Nothing, KLU.KLUFactorization{Float64, Int}}`:
+- `K::F <: Union{Nothing, KLULinSolveCache{Float64}}`:
         Optional KLU factorization object for efficient linear system solving. Nothing if unfactorized
 - `network_reduction_data::NetworkReductionData`:
         Container for network reduction information applied during matrix construction
@@ -179,7 +179,7 @@ power flow analysis, sensitivity calculations, and linear power system studies.
 struct ABA_Matrix{
     Ax <: NTuple{2, Vector},
     L <: NTuple{2, Dict},
-    F <: Union{Nothing, KLU.KLUFactorization{Float64, Int}},
+    F <: Union{Nothing, KLULinSolveCache{Float64}},
 } <: PowerNetworkMatrix{Float64}
     data::SparseArrays.SparseMatrixCSC{Float64, Int}
     axes::Ax
@@ -295,7 +295,7 @@ function ABA_Matrix(ybus::Ybus; factorize::Bool = false)
     bus_ax_ref = make_ax_ref(axes[1])
     lookup = (bus_ax_ref, bus_ax_ref)
     if factorize
-        K = klu(ABA)
+        K = klu_factorize(ABA)
     else
         K = nothing
     end
@@ -339,7 +339,7 @@ function factorize(ABA::ABA_Matrix{Ax, L, Nothing}) where {Ax, L <: NTuple{2, Di
         deepcopy(ABA.lookup),
         deepcopy(ABA.subnetwork_axes),
         deepcopy(ABA.ref_bus_position),
-        klu(ABA.data),
+        klu_factorize(ABA.data),
         deepcopy(ABA.network_reduction_data),
     )
     return ABA_lu
@@ -358,7 +358,7 @@ Check if an ABA_Matrix has been factorized (i.e., contains LU factorization matr
 """
 is_factorized(ABA::ABA_Matrix{Ax, L, Nothing}) where {Ax, L <: NTuple{2, Dict}} = false
 is_factorized(
-    ABA::ABA_Matrix{Ax, L, KLU.KLUFactorization{Float64, Int}},
+    ABA::ABA_Matrix{Ax, L, <:KLULinSolveCache{Float64}},
 ) where {Ax, L <: NTuple{2, Dict}} = true
 
 # get_index functions: BA_Matrix stores the transposed matrix, thus get index
diff --git a/src/BranchesParallel.jl b/src/BranchesParallel.jl
index 342a12c5c..cb66e8317 100644
--- a/src/BranchesParallel.jl
+++ b/src/BranchesParallel.jl
@@ -1,23 +1,45 @@
-mutable struct BranchesParallel{T <: PSY.ACTransmission} <: PSY.ACTransmission
+abstract type AbstractBranchesParallel <: PSY.ACTransmission end
+
+mutable struct BranchesParallel{T <: PSY.ACTransmission} <: AbstractBranchesParallel
     branches::Vector{T}
     equivalent_ybus::Union{Matrix{YBUS_ELTYPE}, Nothing}
+
+    function BranchesParallel{T}(
+        branches::Vector{T},
+        equivalent_ybus::Union{Matrix{YBUS_ELTYPE}, Nothing},
+    ) where {T <: PSY.ACTransmission}
+        if !isconcretetype(T)
+            error(
+                "BranchesParallel{T} requires a concrete branch type T. " *
+                "Use MixedBranchesParallel for groups with mixed branch types. Got T=$T.",
+            )
+        end
+        return new{T}(branches, equivalent_ybus)
+    end
 end
 
 function BranchesParallel(branches::Vector{T}) where {T <: PSY.ACTransmission}
-    BranchesParallel(branches, nothing)
+    return BranchesParallel{T}(branches, nothing)
 end
-# Constructor for the mixed types
-function BranchesParallel(branches::Vector{PSY.ACTransmission})
-    return BranchesParallel{PSY.ACTransmission}(branches, nothing)
+
+mutable struct MixedBranchesParallel <: AbstractBranchesParallel
+    branches::Vector{PSY.ACTransmission}
+    equivalent_ybus::Union{Matrix{YBUS_ELTYPE}, Nothing}
+end
+
+function MixedBranchesParallel(branches::Vector{<:PSY.ACTransmission})
+    return MixedBranchesParallel(Vector{PSY.ACTransmission}(branches), nothing)
 end
 
 function add_branch!(bp::BranchesParallel{T}, branch::T) where {T <: PSY.ACTransmission}
     push!(bp.branches, branch)
 end
 
-get_branch_type(::BranchesParallel{T}) where {T <: PSY.ACTransmission} = T
+function add_branch!(mbp::MixedBranchesParallel, branch::PSY.ACTransmission)
+    push!(mbp.branches, branch)
+end
 
-function get_name(bp::BranchesParallel{T}) where {T <: PSY.ACTransmission}
+function get_name(bp::AbstractBranchesParallel)
     base_string = _longest_starting_substring(PSY.get_name.(bp.branches)...)
     if isempty(base_string)
         base_string = join(PSY.get_name.(bp.branches), "_") * "_"
@@ -44,7 +66,7 @@ function _longest_starting_substring(branch_names...)
 end
 
 function compute_parallel_multiplier(
-    parallel_branch_set::BranchesParallel,
+    parallel_branch_set::AbstractBranchesParallel,
     branch_name::String,
 )
     b_total = 0.0
@@ -58,11 +80,11 @@ function compute_parallel_multiplier(
     return b_branch / b_total
 end
 
-function get_series_susceptance(segment::BranchesParallel)
+function get_series_susceptance(segment::AbstractBranchesParallel)
     return sum(get_series_susceptance(branch) for branch in segment.branches)
 end
 
-function get_equivalent_physical_branch_parameters(bp::BranchesParallel)
+function get_equivalent_physical_branch_parameters(bp::AbstractBranchesParallel)
     if isnothing(bp.equivalent_ybus)
         populate_equivalent_ybus!(bp)
     end
@@ -70,33 +92,70 @@ function get_equivalent_physical_branch_parameters(bp::BranchesParallel)
     return _get_equivalent_physical_branch_parameters(equivalent_ybus)
 end
 
-function populate_equivalent_ybus!(bp::BranchesParallel)
+function populate_equivalent_ybus!(bp::AbstractBranchesParallel)
     Y11, Y12, Y21, Y22 = ybus_branch_entries(bp)
     bp.equivalent_ybus = YBUS_ELTYPE[Y11 Y12; Y21 Y22]
     return
 end
 
 """
-    get_equivalent_rating(bp::BranchesParallel)
+    get_sum_of_max_rating(bp::AbstractBranchesParallel)
 
-Calculate the total rating for branches in parallel.
-For parallel circuits, the rating is the sum of individual ratings divided by the number of circuits.
-This provides a conservative estimate that accounts for potential overestimation of total capacity.
+Sum of the individual branch ratings, treating each circuit as independently loadable
+up to its own thermal limit. This is the least conservative aggregate and assumes
+unconstrained flow steering across the parallel group.
+"""
+function get_sum_of_max_rating(bp::AbstractBranchesParallel)
+    return sum(get_equivalent_rating(branch) for branch in bp.branches)
+end
+
+"""
+    get_single_element_contingency_rating(bp::AbstractBranchesParallel)
+
+N-1 rating for the parallel group: the surviving capacity after the largest-rated
+circuit trips, ``\\sum_i S_i - \\max_i S_i``. For a group of one branch this is zero.
 """
-function get_equivalent_rating(bp::BranchesParallel)
-    # Sum of ratings divided by number of circuits
-    return sum(get_equivalent_rating(branch) for branch in bp.branches) /
-           length(bp.branches)
+function get_single_element_contingency_rating(bp::AbstractBranchesParallel)
+    ratings = get_equivalent_rating.(bp.branches)
+    return sum(ratings) - maximum(ratings)
 end
 
 """
-    get_equivalent_emergency_rating(bp::BranchesParallel)
+    get_impedance_averaged_rating(bp::AbstractBranchesParallel)
+
+Susceptance-weighted average of individual branch ratings,
+``\\sum_i f_i \\cdot S_i`` with ``f_i = b_i / \\sum_k b_k``. Reflects how DC flow
+physically splits across a parallel group. Throws `ArgumentError` if the total
+series susceptance is zero or non-finite.
+"""
+function get_impedance_averaged_rating(bp::AbstractBranchesParallel)
+    b_total = sum(PSY.get_series_susceptance, bp.branches)
+    if !isfinite(b_total) || iszero(b_total)
+        throw(
+            ArgumentError(
+                "Cannot compute impedance-averaged rating: total series susceptance across the parallel group must be finite and non-zero.",
+            ),
+        )
+    end
+    return sum(
+        PSY.get_series_susceptance(br) / b_total * get_equivalent_rating(br)
+        for br in bp.branches
+    )
+end
+
+# Series-chain rating contribution for a parallel block: dispatch arm for
+# `get_equivalent_rating(::BranchesSeries)` defined in BranchesSeries.jl.
+_series_member_rating(bp::AbstractBranchesParallel) =
+    get_single_element_contingency_rating(bp)
+
+"""
+    get_equivalent_emergency_rating(bp::AbstractBranchesParallel)
 
 Calculate the total emergency rating for branches in parallel.
 For parallel circuits, the emergency rating is the sum of individual emergency ratings divided by the number of circuits.
 This provides a conservative estimate that accounts for potential overestimation of total capacity.
 """
-function get_equivalent_emergency_rating(bp::BranchesParallel)
+function get_equivalent_emergency_rating(bp::AbstractBranchesParallel)
     equivalent_rating = 0.0
     for branch in bp.branches
         rating_b = get_equivalent_emergency_rating(branch)
@@ -106,36 +165,38 @@ function get_equivalent_emergency_rating(bp::BranchesParallel)
 end
 
 """
-    get_equivalent_available(bp::BranchesParallel)
+    get_equivalent_available(bp::AbstractBranchesParallel)
 
 Get the availability status for parallel branches.
 All branches in parallel must be available for the parallel circuit to be available.
 """
-function get_equivalent_available(bp::BranchesParallel)
+function get_equivalent_available(bp::AbstractBranchesParallel)
     # All branches must be available
     return all(PSY.get_available(branch) for branch in bp.branches)
 end
 
+PSY.get_available(bp::AbstractBranchesParallel) = get_equivalent_available(bp)
+
 """
-    get_equivalent_α(bp::BranchesParallel)
+    get_equivalent_α(bp::AbstractBranchesParallel)
 
 Get the phase angle shift for parallel branches.
 Returns the average phase angle shift across all parallel branches.
 Returns 0.0 if branches don't support phase angle shift (e.g., lines).
 """
-function get_equivalent_α(bp::BranchesParallel)
+function get_equivalent_α(bp::AbstractBranchesParallel)
     # Need to check the PS books
 end
 
-function Base.iterate(bp::BranchesParallel)
+function Base.iterate(bp::AbstractBranchesParallel)
     return iterate(bp.branches)
 end
 
-function Base.iterate(bp::BranchesParallel, state)
+function Base.iterate(bp::AbstractBranchesParallel, state)
     return iterate(bp.branches, state)
 end
 
-function Base.length(bp::BranchesParallel)
+function Base.length(bp::AbstractBranchesParallel)
     return length(bp.branches)
 end
 
@@ -144,33 +205,32 @@ function add_to_map(
     filters::Dict,
 ) where {T <: PSY.ACTransmission}
     isempty(filters) && return true
-    if isabstracttype(T)
-        @warn "Parallel circuit contains mixed branch types, filters might be applied to more components than intended. Use Logging.Debug for additional information."
-        @debug "Parallel circuit branch types: $(typeof.(double_circuit.branches))"
-        @debug "Parallel circuit branch names: $(PSY.get_name.(double_circuit.branches))"
-        for branch in double_circuit.branches
-            filter = get(filters, typeof(branch), x -> true)
-            if !filter(branch)
-                return false
-            end
-        end
+    if !haskey(filters, T)
         return true
     end
+    return any(filters[T](device) for device in double_circuit)
+end
 
-    if !haskey(filters, T)
-        return true
-    else
-        return any([filters[T](device) for device in double_circuit])
+function add_to_map(double_circuit::MixedBranchesParallel, filters::Dict)
+    isempty(filters) && return true
+    @warn "Parallel circuit contains mixed branch types, filters might be applied to more components than intended. Use Logging.Debug for additional information."
+    @debug "Parallel circuit branch types: $(typeof.(double_circuit.branches))"
+    @debug "Parallel circuit branch names: $(PSY.get_name.(double_circuit.branches))"
+    for branch in double_circuit.branches
+        filter = get(filters, typeof(branch), x -> true)
+        if !filter(branch)
+            return false
+        end
     end
-    error("Invalid condition reached in add_to_map for BranchesParallel")
+    return true
 end
 
-function Base.:(==)(a::BranchesParallel, b::BranchesParallel)
+function Base.:(==)(a::AbstractBranchesParallel, b::AbstractBranchesParallel)
     return a.branches == b.branches
 end
 
-function Base.show(io::IO, x::MIME{Symbol("text/plain")}, y::BranchesParallel)
+function Base.show(io::IO, x::MIME{Symbol("text/plain")}, y::AbstractBranchesParallel)
     show(io, x, y.branches)
 end
 
-is_a_reduction(::BranchesParallel) = true
+is_a_reduction(::AbstractBranchesParallel) = true
diff --git a/src/BranchesSeries.jl b/src/BranchesSeries.jl
index 35ae4ed26..0b38a7aec 100644
--- a/src/BranchesSeries.jl
+++ b/src/BranchesSeries.jl
@@ -129,14 +129,16 @@ end
     get_equivalent_rating(bs::BranchesSeries)
 
 Calculate the rating for branches in series.
-Series chains, can be composed of PSY.ACTransmission branches and PNM.BranchesParallel.
-For series circuits, the rating is limited by the weakest link: Rating_total = min(Rating1, Rating2, ..., Ratingn)
+Series chains can be composed of PSY.ACTransmission branches and parallel groups.
+For series circuits, the rating is limited by the weakest link: Rating_total = min(Rating1, Rating2, ..., Ratingn).
+Parallel members contribute their N-1 single-element-contingency rating.
 """
 function get_equivalent_rating(bs::BranchesSeries)
-    # Minimum rating for series branches (weakest link)
-    return minimum(get_equivalent_rating(branch) for branch in bs)
+    return minimum(_series_member_rating(branch) for branch in bs)
 end
 
+_series_member_rating(branch::PSY.ACTransmission) = get_equivalent_rating(branch)
+
 """
     get_equivalent_rating(bs<:PSY.ACTransmission)
 
@@ -201,6 +203,8 @@ function get_equivalent_available(bs::BranchesSeries)
     return all(PSY.get_available(branch) for branch in bs)
 end
 
+PSY.get_available(bs::BranchesSeries) = get_equivalent_available(bs)
+
 """
     get_equivalent_α(bs::BranchesSeries)
 
diff --git a/src/KLUWrapper/KLUWrapper.jl b/src/KLUWrapper/KLUWrapper.jl
new file mode 100644
index 000000000..5e996dfdf
--- /dev/null
+++ b/src/KLUWrapper/KLUWrapper.jl
@@ -0,0 +1,85 @@
+"""
+    KLUWrapper
+
+A small, allocation-aware wrapper over `libklu` (provided by `SuiteSparse_jll`)
+designed for the access patterns of `PowerNetworkMatrices`:
+
+- Cache the symbolic and numeric factorizations of an SPD/asymmetric sparse
+  matrix and reuse them across many solves.
+- Refactor (numeric only, or full) without re-allocating.
+- Solve dense and **sparse** right-hand sides without materializing N×N
+  intermediates when the RHS is structurally sparse.
+
+This module is intentionally lighter than `KLU.jl`: it owns no Julia-side
+copies of the matrix values, exposes the symbolic/numeric split directly, and
+binds only the SuiteSparse_long (`klu_l_*`, `klu_zl_*`) entry points used by
+the package.
+"""
+module KLUWrapper
+
+import LinearAlgebra
+import SparseArrays
+import SparseArrays: SparseMatrixCSC, getcolptr, rowvals, nonzeros, nzrange
+
+"""
+    KLU_POOL_DEBUG :: Bool
+
+Compile-time gate for the KLU wrapper's runtime diagnostics — currently
+just the precondition snapshot in `solve!` that fires when `klu_l_solve`
+returns FALSE. When `false`, the snapshot is folded out by `@static if`
+and contributes zero runtime cost. Flip to `true` when reproducing the
+libklu cross-cache concurrency failure documented on `_LIBKLU_LOCK`.
+
+This is a `const` rather than a `Preferences.@load_preference` flag so
+that toggling it forces a precompile rebuild and the production binary
+never carries the diagnostic code.
+"""
+const KLU_POOL_DEBUG = false
+
+"""
+    _LIBKLU_LOCK :: ReentrantLock
+
+Process-wide lock that serializes every libklu ccall. `libklu` corrupts
+internal state under concurrent access even when the caller hands out
+distinct `Numeric`/`Symbolic`/`Common` triples per thread (i.e. the
+access pattern KLU's user guide implies is supported). The corruption
+manifests two ways: an intermittent `KLU_INVALID` return with all input
+pointers still valid both pre- and post-call, and a `SIGSEGV` inside
+`klu_l_solve` (`klu_solve.c:118` in v7.8.3, the row-permutation read in
+the `nrhs == 1` chunk). The pre-call snapshot dump in `solve!` made
+both modes reproducible on macOS and confirmed the deterministic
+Windows-MinGW failure was the same bug. This lock is the only
+mechanism we have evidence for that prevents both modes.
+"""
+const _LIBKLU_LOCK = ReentrantLock()
+
+"""
+    @klu_lock expr
+
+Evaluate `expr` while holding `_LIBKLU_LOCK`. Wrap every libklu ccall
+so that no two libklu entries can run concurrently in the process.
+"""
+macro klu_lock(expr)
+    return :(@lock _LIBKLU_LOCK $(esc(expr)))
+end
+
+export KLULinSolveCache,
+    klu_factorize,
+    symbolic_factor!,
+    symbolic_refactor!,
+    numeric_refactor!,
+    full_factor!,
+    full_refactor!,
+    solve!,
+    tsolve!,
+    solve_sparse!,
+    solve_sparse,
+    n_valid,
+    is_factored
+
+include("klu_jll_bindings.jl")
+include("klu_cache.jl")
+include("solve_dense.jl")
+include("solve_sparse_rhs.jl")
+
+end # module
diff --git a/src/KLUWrapper/klu_cache.jl b/src/KLUWrapper/klu_cache.jl
new file mode 100644
index 000000000..89d519980
--- /dev/null
+++ b/src/KLUWrapper/klu_cache.jl
@@ -0,0 +1,373 @@
+import SparseArrays: SparseMatrixCSC, getcolptr, rowvals, nonzeros
+
+"""
+A cached KLU linear solver designed for repeated solves against the same
+sparse matrix structure. `numeric_refactor!` and `solve!` allocate nothing
+once the cache is built.
+
+The type parameter `Tv ∈ {Float64, ComplexF64}` selects the real/complex KLU
+path. Indices are always `Int64` (SuiteSparse_long).
+
+`reuse_symbolic` controls whether `symbolic_refactor!` keeps the analysis;
+`check_pattern` adds a structural-equality check on refactor calls and is
+only consulted when reusing.
+"""
+mutable struct KLULinSolveCache{Tv <: Union{Float64, ComplexF64}}
+    colptr::Vector{Int64}
+    rowval::Vector{Int64}
+    # Copy of the matrix values used in the most recent numeric factorization.
+    # Lets `_recover_factorization!` rebuild a corrupted numeric handle without
+    # the caller having to re-supply `A`. Empty before the first factor call.
+    nzval::Vector{Tv}
+    common::Base.RefValue{KluLCommon}
+    symbolic::SymbolicPtr
+    numeric::NumericPtr
+    reuse_symbolic::Bool
+    check_pattern::Bool
+    # Bounded reusable scratch for `solve_sparse!`. Lazy-grown on first call so
+    # the wrapper's working set stays O(n*block) instead of O(n*nrhs); see
+    # `solve_sparse_rhs.jl`.
+    scratch::Matrix{Tv}
+    col_map::Vector{Int64}
+end
+
+@inline _dim(cache::KLULinSolveCache) = Int64(length(cache.colptr) - 1)
+
+Base.size(cache::KLULinSolveCache) = (n = _dim(cache); (n, n))
+Base.size(cache::KLULinSolveCache, d::Integer) = d <= 2 ? _dim(cache) : 1
+Base.eltype(::Type{KLULinSolveCache{Tv}}) where {Tv} = Tv
+get_reuse_symbolic(cache::KLULinSolveCache) = cache.reuse_symbolic
+
+"""
+    is_factored(cache::KLULinSolveCache) -> Bool
+
+Return `true` when `cache` holds both a symbolic and a numeric factorization
+ready for `solve!` / `tsolve!` / `solve_sparse!`. Returns `false` after
+construction (before `full_factor!`) or after the libklu handles have been
+finalized.
+"""
+is_factored(cache::KLULinSolveCache) =
+    cache.symbolic != C_NULL && cache.numeric != C_NULL
+
+# A cache holds at most one valid factorization; `is_factored` is the
+# truthy form. Kept around for tests that want a uniform numeric reading.
+n_valid(cache::KLULinSolveCache) = is_factored(cache) ? 1 : 0
+
+@inline _factor_call(::Type{Float64}, ap, ai, ax, sym, common) =
+    klu_l_factor(ap, ai, ax, sym, common)
+@inline _factor_call(::Type{ComplexF64}, ap, ai, ax, sym, common) =
+    klu_zl_factor(ap, ai, ax, sym, common)
+
+@inline _refactor_call(::Type{Float64}, ap, ai, ax, sym, num, common) =
+    klu_l_refactor(ap, ai, ax, sym, num, common)
+@inline _refactor_call(::Type{ComplexF64}, ap, ai, ax, sym, num, common) =
+    klu_zl_refactor(ap, ai, ax, sym, num, common)
+
+@inline _solve_call(::Type{Float64}, sym, num, n, nrhs, b, common) =
+    klu_l_solve(sym, num, n, nrhs, b, common)
+@inline _solve_call(::Type{ComplexF64}, sym, num, n, nrhs, b, common) =
+    klu_zl_solve(sym, num, n, nrhs, b, common)
+
+@inline _tsolve_call(::Type{Float64}, sym, num, n, nrhs, b, common; conjugate = false) =
+    klu_l_tsolve(sym, num, n, nrhs, b, common)
+@inline _tsolve_call(::Type{ComplexF64}, sym, num, n, nrhs, b, common; conjugate = false) =
+    klu_zl_tsolve(sym, num, n, nrhs, b, Cint(conjugate), common)
+
+@inline _free_numeric!(::Type{Float64}, num_ref, common) =
+    klu_l_free_numeric!(num_ref, common)
+@inline _free_numeric!(::Type{ComplexF64}, num_ref, common) =
+    klu_zl_free_numeric!(num_ref, common)
+
+"""
+    KLULinSolveCache(A; reuse_symbolic=true, check_pattern=true)
+
+Build a cache for the sparse matrix `A`. Allocates structural arrays and
+runs `klu_l_defaults`, but does **not** factorize. Call `full_factor!`
+(or `symbolic_factor!` followed by `numeric_refactor!`) before `solve!`.
+
+A finalizer frees libklu handles on GC; call `Base.finalize(cache)` to
+release them eagerly. Releasing the handles leaves Julia-side state intact,
+so the cache can be re-factorized via `symbolic_factor!`/`numeric_refactor!`
+or `full_factor!`.
+"""
+function KLULinSolveCache(
+    A::SparseMatrixCSC{Tv, Int};
+    reuse_symbolic::Bool = true,
+    check_pattern::Bool = true,
+) where {Tv <: Union{Float64, ComplexF64}}
+    Int === Int64 || error(
+        "KLULinSolveCache requires a 64-bit Julia build (SuiteSparse_long " *
+        "bindings need Int64 indices). Got Int = $(Int).",
+    )
+    n = size(A, 1)
+    n == size(A, 2) || throw(DimensionMismatch("matrix must be square; got $(size(A))"))
+
+    common = Ref(KluLCommon())
+    klu_l_defaults!(common)
+
+    colptr = Vector{Int64}(undef, length(getcolptr(A)))
+    copyto!(colptr, getcolptr(A))
+    colptr .-= 1
+    rowval = Vector{Int64}(undef, length(rowvals(A)))
+    copyto!(rowval, rowvals(A))
+    rowval .-= 1
+
+    cache = KLULinSolveCache{Tv}(
+        colptr, rowval, Tv[], common,
+        convert(SymbolicPtr, C_NULL),
+        convert(NumericPtr, C_NULL),
+        reuse_symbolic, check_pattern,
+        Matrix{Tv}(undef, 0, 0),
+        Int64[],
+    )
+    finalizer(_free_klu_handles!, cache)
+    return cache
+end
+
+"""
+    _ensure_scratch!(cache, block) -> Nothing
+
+Ensure `cache.scratch` is at least `n × block` and `cache.col_map` length
+`block`. Grows in place; reuses across `solve_sparse!` calls.
+"""
+@inline function _ensure_scratch!(
+    cache::KLULinSolveCache{Tv},
+    block::Int,
+) where {Tv <: Union{Float64, ComplexF64}}
+    n = _dim(cache)
+    s = cache.scratch
+    if size(s, 1) != n || size(s, 2) < block
+        cache.scratch = Matrix{Tv}(undef, n, block)
+    end
+    if length(cache.col_map) < block
+        resize!(cache.col_map, block)
+    end
+    return nothing
+end
+
+"""
+Release the libklu numeric and symbolic handles held by `cache`, leaving the
+Julia-side fields (`colptr`, `rowval`, `common`, `scratch`, `col_map`) intact
+so the cache remains structurally valid and re-factorable. Idempotent: a
+second call hits the `C_NULL` guards. Used both by `symbolic_factor!`
+mid-life (drop old handles before re-analyzing) and by the GC finalizer.
+"""
+function _free_klu_handles!(
+    cache::KLULinSolveCache{Tv},
+) where {Tv <: Union{Float64, ComplexF64}}
+    if cache.numeric != C_NULL
+        num_ref = Ref(cache.numeric)
+        _free_numeric!(Tv, num_ref, cache.common)
+        cache.numeric = num_ref[]
+    end
+    if cache.symbolic != C_NULL
+        sym_ref = Ref(cache.symbolic)
+        klu_l_free_symbolic!(sym_ref, cache.common)
+        cache.symbolic = sym_ref[]
+    end
+    return nothing
+end
+
+# Public eager-release alias for `_free_klu_handles!` (the internal helper
+# stays unexported per the KLUWrapper convention).
+Base.finalize(cache::KLULinSolveCache) = _free_klu_handles!(cache)
+
+"""
+Drop a corrupted numeric handle and rebuild it from the values cached in
+`cache.nzval`. Used by `solve_sparse!` on the `KLU_INVALID` retry path, where
+libklu state has been observed to corrupt without the caller having `A` in
+scope. Requires that a numeric factor has been built before (so `cache.nzval`
+is populated) and the symbolic factor is still valid.
+"""
+function _recover_factorization!(
+    cache::KLULinSolveCache{Tv},
+) where {Tv <: Union{Float64, ComplexF64}}
+    cache.symbolic == C_NULL && error(
+        "KLULinSolveCache: cannot recover without a symbolic factor.",
+    )
+    isempty(cache.nzval) && error(
+        "KLULinSolveCache: cannot recover; no cached numerical values yet.",
+    )
+    if cache.numeric != C_NULL
+        num_ref = Ref(cache.numeric)
+        _free_numeric!(Tv, num_ref, cache.common)
+        cache.numeric = num_ref[]
+    end
+    num = _factor_call(
+        Tv, pointer(cache.colptr), pointer(cache.rowval),
+        pointer(cache.nzval), cache.symbolic, cache.common,
+    )
+    num == C_NULL && klu_throw(cache.common[], "klu_factor (recovery)")
+    cache.numeric = num
+    return cache
+end
+
+@inline function _check_pattern_match(cache::KLULinSolveCache,
+    A::SparseMatrixCSC, op::AbstractString)
+    Acolptr = getcolptr(A)
+    Arowval = rowvals(A)
+    if length(Acolptr) != length(cache.colptr) ||
+       length(Arowval) != length(cache.rowval)
+        throw(
+            ArgumentError(
+                "Cannot $op: matrix has different sparsity structure (length).",
+            ),
+        )
+    end
+    # Increment-compare-decrement: avoids allocating a 1-indexed copy. The
+    # `try/finally` restores `colptr`/`rowval` even on InterruptException so
+    # the cache is never left with off-by-one structural arrays.
+    cache.colptr .+= 1
+    cache.rowval .+= 1
+    bad = try
+        (cache.colptr != Acolptr) || (cache.rowval != Arowval)
+    finally
+        cache.colptr .-= 1
+        cache.rowval .-= 1
+    end
+    if bad
+        throw(ArgumentError(
+            "Cannot $op: matrix has different sparsity structure.",
+        ))
+    end
+    return nothing
+end
+
+"""
+    symbolic_factor!(cache, A)
+
+Free any cached symbolic/numeric factor, replace the structural arrays with
+`A`'s pattern, and run `klu_l_analyze`.
+"""
+function symbolic_factor!(cache::KLULinSolveCache{Tv},
+    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+    n = _dim(cache)
+    if size(A, 1) != n || size(A, 2) != n
+        throw(DimensionMismatch(
+            "Cannot factor: cache is $(n)×$(n) but A is $(size(A)).",
+        ))
+    end
+    _free_klu_handles!(cache)
+
+    Acolptr = getcolptr(A)
+    Arowval = rowvals(A)
+    resize!(cache.colptr, length(Acolptr))
+    copyto!(cache.colptr, Acolptr)
+    cache.colptr .-= 1
+    resize!(cache.rowval, length(Arowval))
+    copyto!(cache.rowval, Arowval)
+    cache.rowval .-= 1
+
+    sym = klu_l_analyze(
+        Int64(n), pointer(cache.colptr), pointer(cache.rowval), cache.common,
+    )
+    sym == C_NULL && klu_throw(cache.common[], "klu_l_analyze")
+    cache.symbolic = sym
+    return cache
+end
+
+"""
+    symbolic_refactor!(cache, A)
+
+If `cache.reuse_symbolic`, optionally verify the structure matches and reuse
+the existing analysis. Otherwise, rerun `symbolic_factor!`.
+"""
+function symbolic_refactor!(cache::KLULinSolveCache{Tv},
+    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+    if !cache.reuse_symbolic
+        return symbolic_factor!(cache, A)
+    end
+    if cache.check_pattern
+        n = _dim(cache)
+        if size(A, 1) != n || size(A, 2) != n
+            throw(
+                DimensionMismatch(
+                    "Cannot refactor: cache is $(n)×$(n) but A is $(size(A)).",
+                ),
+            )
+        end
+        _check_pattern_match(cache, A, "symbolic_refactor")
+    end
+    return cache
+end
+
+"""
+    numeric_refactor!(cache, A)
+
+Compute (or refresh) the numeric factorization. The first call after
+`symbolic_factor!` invokes `klu_*_factor`; subsequent calls invoke
+`klu_*_refactor` and reuse the existing numeric struct.
+"""
+function numeric_refactor!(cache::KLULinSolveCache{Tv},
+    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+    cache.symbolic == C_NULL && error(
+        "KLULinSolveCache: call symbolic_factor! before numeric_refactor!.",
+    )
+    Anz = nonzeros(A)
+    if cache.numeric == C_NULL
+        num = _factor_call(
+            Tv, pointer(cache.colptr), pointer(cache.rowval),
+            pointer(Anz), cache.symbolic, cache.common,
+        )
+        num == C_NULL && klu_throw(cache.common[], "klu_factor")
+        cache.numeric = num
+    else
+        cache.check_pattern && _check_pattern_match(cache, A, "numeric_refactor")
+        ok = _refactor_call(
+            Tv, pointer(cache.colptr), pointer(cache.rowval),
+            pointer(Anz), cache.symbolic, cache.numeric, cache.common,
+        )
+        ok != 1 && klu_throw(cache.common[], "klu_refactor")
+    end
+    # Snapshot the values used so `_recover_factorization!` can rebuild the
+    # numeric handle without the caller having to re-supply A.
+    resize!(cache.nzval, length(Anz))
+    copyto!(cache.nzval, Anz)
+    return cache
+end
+
+"""
+    full_factor!(cache, A) -> cache
+
+Run a fresh symbolic analysis followed by a numeric factorization on `A`.
+Equivalent to `symbolic_factor!(cache, A); numeric_refactor!(cache, A)`. Use
+this on a freshly constructed cache, or after `_free_klu_handles!` has cleared
+the handles, to bring the cache to a factored state.
+"""
+function full_factor!(cache::KLULinSolveCache{Tv},
+    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+    symbolic_factor!(cache, A)
+    numeric_refactor!(cache, A)
+    return cache
+end
+
+"""
+    full_refactor!(cache, A) -> cache
+
+Refresh both the symbolic and numeric factorizations on `A`. Defers to
+`symbolic_refactor!` (which reuses the existing analysis when
+`cache.reuse_symbolic` is set) followed by `numeric_refactor!`. Use this when
+the matrix values have changed; if the structure has also changed and the
+cache was built with `reuse_symbolic = false`, the symbolic analysis is rerun
+as well.
+"""
+function full_refactor!(cache::KLULinSolveCache{Tv},
+    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+    symbolic_refactor!(cache, A)
+    numeric_refactor!(cache, A)
+    return cache
+end
+
+"""
+    klu_factorize(A; reuse_symbolic=true, check_pattern=true) -> KLULinSolveCache
+
+Build a cache for `A` and immediately compute the full factorization.
+"""
+function klu_factorize(A::SparseMatrixCSC{Tv, Int};
+    reuse_symbolic::Bool = true,
+    check_pattern::Bool = true,
+) where {Tv <: Union{Float64, ComplexF64}}
+    cache = KLULinSolveCache(A;
+        reuse_symbolic = reuse_symbolic, check_pattern = check_pattern)
+    return full_factor!(cache, A)
+end
diff --git a/src/KLUWrapper/klu_jll_bindings.jl b/src/KLUWrapper/klu_jll_bindings.jl
new file mode 100644
index 000000000..ff0ed0a92
--- /dev/null
+++ b/src/KLUWrapper/klu_jll_bindings.jl
@@ -0,0 +1,194 @@
+# Bindings into libklu (SuiteSparse_jll), restricted to the SuiteSparse_long
+# (`klu_l_*` and `klu_zl_*`) entry points used by KLULinSolveCache.
+
+import LinearAlgebra
+import SuiteSparse_jll: libklu
+
+# Layout matches `klu_l_common` in upstream `klu.h`. Must stay in sync.
+mutable struct KluLCommon
+    tol::Cdouble
+    memgrow::Cdouble
+    initmem_amd::Cdouble
+    initmem::Cdouble
+    maxwork::Cdouble
+    btf::Cint
+    ordering::Cint
+    scale::Cint
+    user_order::Ptr{Cvoid}
+    user_data::Ptr{Cvoid}
+    halt_if_singular::Cint
+    status::Cint
+    nrealloc::Cint
+    structural_rank::Int64
+    numerical_rank::Int64
+    singular_col::Int64
+    noffdiag::Int64
+    flops::Cdouble
+    rcond::Cdouble
+    condest::Cdouble
+    rgrowth::Cdouble
+    work::Cdouble
+    memusage::Csize_t
+    mempeak::Csize_t
+    KluLCommon() = new()
+end
+
+# Opaque handles. Empty structs let the ccall signatures stay explicit.
+mutable struct KluLSymbolic end
+mutable struct KluLNumeric end
+
+const SymbolicPtr = Ptr{KluLSymbolic}
+const NumericPtr = Ptr{KluLNumeric}
+
+# Each ccall is wrapped in `@klu_lock` so that all libklu activity in the
+# process serializes through `_LIBKLU_LOCK`. This includes finalizer paths
+# (`klu_l_free_*`, `klu_zl_free_*`), which can fire on any thread at any
+# safepoint and would otherwise race against in-flight `solve!` calls on a
+# different cache. See `_LIBKLU_LOCK` in `KLUWrapper.jl` for the
+# empirical evidence (intermittent `KLU_INVALID` return; SEGV at
+# `klu_solve.c:118`).
+
+klu_l_defaults!(common::Ref{KluLCommon}) =
+    @klu_lock ccall((:klu_l_defaults, libklu), Cint, (Ptr{KluLCommon},), common)
+
+function klu_l_analyze(n::Int64, ap::Ptr{Int64}, ai::Ptr{Int64},
+    common::Ref{KluLCommon})
+    return @klu_lock ccall(
+        (:klu_l_analyze, libklu),
+        SymbolicPtr,
+        (Int64, Ptr{Int64}, Ptr{Int64}, Ptr{KluLCommon}),
+        n, ap, ai, common,
+    )
+end
+
+function klu_l_free_symbolic!(symbolic_ref::Ref{SymbolicPtr},
+    common::Ref{KluLCommon})
+    return @klu_lock ccall(
+        (:klu_l_free_symbolic, libklu),
+        Cint,
+        (Ptr{SymbolicPtr}, Ptr{KluLCommon}),
+        symbolic_ref, common,
+    )
+end
+
+function klu_l_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr, common::Ref{KluLCommon})
+    return @klu_lock ccall(
+        (:klu_l_factor, libklu),
+        NumericPtr,
+        (Ptr{Int64}, Ptr{Int64}, Ptr{Cdouble}, SymbolicPtr, Ptr{KluLCommon}),
+        ap, ai, ax, symbolic, common,
+    )
+end
+
+function klu_l_refactor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr, numeric::NumericPtr, common::Ref{KluLCommon})
+    return @klu_lock ccall(
+        (:klu_l_refactor, libklu),
+        Cint,
+        (Ptr{Int64}, Ptr{Int64}, Ptr{Cdouble}, SymbolicPtr, NumericPtr,
+            Ptr{KluLCommon}),
+        ap, ai, ax, symbolic, numeric, common,
+    )
+end
+
+function klu_l_solve(symbolic::SymbolicPtr, numeric::NumericPtr,
+    ldim::Int64, nrhs::Int64, b::Ptr{Cdouble}, common::Ref{KluLCommon})
+    return @klu_lock ccall(
+        (:klu_l_solve, libklu),
+        Cint,
+        (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{Cdouble}, Ptr{KluLCommon}),
+        symbolic, numeric, ldim, nrhs, b, common,
+    )
+end
+
+function klu_l_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr,
+    ldim::Int64, nrhs::Int64, b::Ptr{Cdouble}, common::Ref{KluLCommon})
+    return @klu_lock ccall(
+        (:klu_l_tsolve, libklu),
+        Cint,
+        (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{Cdouble}, Ptr{KluLCommon}),
+        symbolic, numeric, ldim, nrhs, b, common,
+    )
+end
+
+function klu_l_free_numeric!(numeric_ref::Ref{NumericPtr},
+    common::Ref{KluLCommon})
+    return @klu_lock ccall(
+        (:klu_l_free_numeric, libklu),
+        Cint,
+        (Ptr{NumericPtr}, Ptr{KluLCommon}),
+        numeric_ref, common,
+    )
+end
+
+function klu_zl_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64},
+    symbolic::SymbolicPtr, common::Ref{KluLCommon})
+    return @klu_lock ccall(
+        (:klu_zl_factor, libklu),
+        NumericPtr,
+        (Ptr{Int64}, Ptr{Int64}, Ptr{ComplexF64}, SymbolicPtr, Ptr{KluLCommon}),
+        ap, ai, ax, symbolic, common,
+    )
+end
+
+function klu_zl_refactor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64},
+    symbolic::SymbolicPtr, numeric::NumericPtr, common::Ref{KluLCommon})
+    return @klu_lock ccall(
+        (:klu_zl_refactor, libklu),
+        Cint,
+        (Ptr{Int64}, Ptr{Int64}, Ptr{ComplexF64}, SymbolicPtr, NumericPtr,
+            Ptr{KluLCommon}),
+        ap, ai, ax, symbolic, numeric, common,
+    )
+end
+
+function klu_zl_solve(symbolic::SymbolicPtr, numeric::NumericPtr,
+    ldim::Int64, nrhs::Int64, b::Ptr{ComplexF64}, common::Ref{KluLCommon})
+    return @klu_lock ccall(
+        (:klu_zl_solve, libklu),
+        Cint,
+        (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64},
+            Ptr{KluLCommon}),
+        symbolic, numeric, ldim, nrhs, b, common,
+    )
+end
+
+function klu_zl_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr,
+    ldim::Int64, nrhs::Int64, b::Ptr{ComplexF64}, conj_solve::Cint,
+    common::Ref{KluLCommon})
+    return @klu_lock ccall(
+        (:klu_zl_tsolve, libklu),
+        Cint,
+        (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64}, Cint,
+            Ptr{KluLCommon}),
+        symbolic, numeric, ldim, nrhs, b, conj_solve, common,
+    )
+end
+
+function klu_zl_free_numeric!(numeric_ref::Ref{NumericPtr},
+    common::Ref{KluLCommon})
+    return @klu_lock ccall(
+        (:klu_zl_free_numeric, libklu),
+        Cint,
+        (Ptr{NumericPtr}, Ptr{KluLCommon}),
+        numeric_ref, common,
+    )
+end
+
+# Status codes from klu.h.
+const KLU_OK = 0
+const KLU_SINGULAR = 1
+const KLU_OUT_OF_MEMORY = -2
+const KLU_INVALID = -3
+const KLU_TOO_LARGE = -4
+
+function klu_throw(common::KluLCommon, op::AbstractString)
+    s = common.status
+    s == KLU_SINGULAR &&
+        throw(LinearAlgebra.SingularException(Int(common.singular_col + 1)))
+    s == KLU_OUT_OF_MEMORY && throw(OutOfMemoryError())
+    s == KLU_INVALID && throw(ArgumentError("KLU $(op) failed: invalid argument"))
+    s == KLU_TOO_LARGE && throw(OverflowError("KLU $(op) failed: integer overflow"))
+    return error("KLU $(op) failed: status=$(s)")
+end
diff --git a/src/KLUWrapper/solve_dense.jl b/src/KLUWrapper/solve_dense.jl
new file mode 100644
index 000000000..7ce29b78c
--- /dev/null
+++ b/src/KLUWrapper/solve_dense.jl
@@ -0,0 +1,89 @@
+"""
+    solve!(cache, B) -> B
+
+Solve `A · X = B` in place. `B::StridedVecOrMat{Tv}` must have first-dimension
+size equal to `cache.n` and unit stride in the first dimension. Multiple
+columns of `B` are handled in a single libklu call.
+"""
+function solve!(cache::KLULinSolveCache{Tv},
+    B::StridedVecOrMat{Tv}) where {Tv <: Union{Float64, ComplexF64}}
+    is_factored(cache) || error("KLULinSolveCache: not factored yet.")
+    n = _dim(cache)
+    size(B, 1) == n || throw(DimensionMismatch(
+        "size(B, 1) = $(size(B, 1)), cache n = $(n)",
+    ))
+    stride(B, 1) == 1 || throw(ArgumentError(
+        "B must have unit stride in the first dimension.",
+    ))
+    nrhs = Int64(size(B, 2))
+    nrhs == 0 && return B
+    # Snapshot KLU's preconditions plus identity info — gated by
+    # `KLU_POOL_DEBUG`. `klu_l_solve` returns FALSE with `KLU_INVALID` when
+    # any of {Numeric, Symbolic, B} is NULL, when `ldim < Numeric->n`, or
+    # when `nrhs < 0`. The extra identity fields (`cache_id`, raw pointer
+    # values) let the reader cross-reference the per-thread `@error` logs:
+    # matching `cache_id` or `numeric_ptr` across threads is the smoking
+    # gun for shared state. Off in production (zero runtime cost via
+    # `@static if`); flip the const in `KLUWrapper.jl` to re-enable.
+    @static if KLU_POOL_DEBUG
+        pre_numeric = cache.numeric
+        pre_symbolic = cache.symbolic
+        pre_b_ptr = pointer(B)
+    end
+    ok = _solve_call(
+        Tv, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common,
+    )
+    if ok == 0
+        @static if KLU_POOL_DEBUG
+            @error "KLU klu_solve precondition snapshot" tid =
+                Threads.threadid() cache_id = objectid(cache) common_addr =
+                UInt(Base.pointer_from_objref(cache.common)) numeric_ptr =
+                UInt(pre_numeric) symbolic_ptr = UInt(pre_symbolic) b_ptr =
+                UInt(pre_b_ptr) ldim_n = Int(n) nrhs = Int(nrhs) pre_numeric_null =
+                (pre_numeric == C_NULL) pre_symbolic_null =
+                (pre_symbolic == C_NULL) pre_b_null = (pre_b_ptr == C_NULL) post_numeric_null =
+                (cache.numeric == C_NULL) post_symbolic_null =
+                (cache.symbolic == C_NULL) status = Int(cache.common[].status)
+        end
+        klu_throw(cache.common[], "klu_solve")
+    end
+    return B
+end
+
+"""
+    tsolve!(cache, B; conjugate=false) -> B
+
+In-place solve `Aᵀ · X = B` (or `Aᴴ · X = B` when `conjugate=true` on the
+complex path). Same shape requirements as `solve!`. The `conjugate` keyword
+is ignored on the real path.
+"""
+function tsolve!(cache::KLULinSolveCache{Tv},
+    B::StridedVecOrMat{Tv}; conjugate::Bool = false,
+) where {Tv <: Union{Float64, ComplexF64}}
+    is_factored(cache) || error("KLULinSolveCache: not factored yet.")
+    n = _dim(cache)
+    size(B, 1) == n || throw(DimensionMismatch(
+        "size(B, 1) = $(size(B, 1)), cache n = $(n)",
+    ))
+    stride(B, 1) == 1 || throw(ArgumentError(
+        "B must have unit stride in the first dimension.",
+    ))
+    nrhs = Int64(size(B, 2))
+    nrhs == 0 && return B
+    ok = _tsolve_call(
+        Tv, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common;
+        conjugate = conjugate,
+    )
+    ok == 0 && klu_throw(cache.common[], "klu_tsolve")
+    return B
+end
+
+"""
+    \\(cache::KLULinSolveCache, B) -> X
+
+Allocating solve, mirroring `LinearAlgebra.Factorization`'s API.
+"""
+function Base.:\(cache::KLULinSolveCache{Tv},
+    B::StridedVecOrMat{Tv}) where {Tv <: Union{Float64, ComplexF64}}
+    return solve!(cache, copy(B))
+end
diff --git a/src/KLUWrapper/solve_sparse_rhs.jl b/src/KLUWrapper/solve_sparse_rhs.jl
new file mode 100644
index 000000000..a20679f24
--- /dev/null
+++ b/src/KLUWrapper/solve_sparse_rhs.jl
@@ -0,0 +1,107 @@
+const SPARSE_RHS_DEFAULT_BLOCK = 64
+
+"""
+    solve_sparse!(cache, B, out; block=$(SPARSE_RHS_DEFAULT_BLOCK)) -> out
+
+Solve `A · X = B` for a `SparseMatrixCSC` right-hand side, writing the result
+into `out`. Empty columns of `B` are not solved — `out`'s corresponding
+columns are filled with zeros. Non-empty columns within each chunk of `block`
+consecutive RHS columns are packed into a dense scratch and solved in a
+single libklu call. `out` may be any `AbstractMatrix{Tv}` with shape
+`(cache.n, size(B,2))`, including a view into a larger matrix. For an
+allocating variant, see `solve_sparse`.
+
+The `block` chunk size bounds the working set so that processing an
+`n × nrhs` sparse RHS requires only `O(n · block)` extra memory regardless of
+`nrhs`. The cache reuses its packing buffer across calls; warm calls
+allocate nothing in the solver.
+
+Not thread-safe (mutates per-cache scratch). Callers should serialize
+access through a per-cache lock if invoked from multiple threads.
+"""
+function solve_sparse!(
+    cache::KLULinSolveCache{Tv},
+    B::SparseMatrixCSC{Tb, Int},
+    out::AbstractMatrix{Tv};
+    block::Int = SPARSE_RHS_DEFAULT_BLOCK,
+) where {Tv <: Union{Float64, ComplexF64}, Tb <: Number}
+    is_factored(cache) || error("KLULinSolveCache: not factored yet.")
+    block >= 1 || throw(ArgumentError("block must be >= 1; got $(block)"))
+    n = _dim(cache)
+    size(B, 1) == n || throw(DimensionMismatch(
+        "size(B, 1) = $(size(B, 1)), cache n = $(n)",
+    ))
+    size(out, 1) == n && size(out, 2) == size(B, 2) || throw(DimensionMismatch(
+        "out has size $(size(out)); expected $((n, size(B, 2))).",
+    ))
+
+    nb = size(B, 2)
+    nb == 0 && return out
+    fill!(out, zero(Tv))
+
+    Browval = rowvals(B)
+    Bnzval = nonzeros(B)
+
+    _ensure_scratch!(cache, block)
+    scratch = cache.scratch
+    col_map = cache.col_map
+
+    j_start = 1
+    @inbounds while j_start <= nb
+        j_end = min(j_start + block - 1, nb)
+
+        # Expand non-empty columns of `B[:, j_start:j_end]` into dense columns
+        # of `scratch`, recording the original column index of each packed
+        # column in `col_map`. Structurally empty columns are skipped (their
+        # solve output is already zero from the `fill!(out, ...)` above).
+        npack = 0
+        for j in j_start:j_end
+            rng = nzrange(B, j)
+            isempty(rng) && continue
+            npack += 1
+            col_map[npack] = j
+            fill!(view(scratch, :, npack), zero(Tv))
+            for p in rng
+                scratch[Browval[p], npack] = Bnzval[p]
+            end
+        end
+
+        if npack > 0
+            # Retry-on-KLU_INVALID is inlined here (rather than factored into a
+            # helper) to keep the inner-loop closure from capturing the per-chunk
+            # `npack`, which would force Julia to box it.
+            ok = _solve_call(
+                Tv, cache.symbolic, cache.numeric, n, Int64(npack),
+                pointer(scratch), cache.common,
+            )
+            if ok == 0 && cache.common[].status == KLU_INVALID
+                @warn "klu_solve (sparse RHS) returned KLU_INVALID; freeing numeric handle and re-factoring" cache_id =
+                    objectid(cache)
+                _recover_factorization!(cache)
+                ok = _solve_call(
+                    Tv, cache.symbolic, cache.numeric, n, Int64(npack),
+                    pointer(scratch), cache.common,
+                )
+            end
+            ok == 0 && klu_throw(cache.common[], "klu_solve (sparse RHS)")
+
+            for k in 1:npack
+                copyto!(view(out, :, col_map[k]), view(scratch, :, k))
+            end
+        end
+
+        j_start = j_end + 1
+    end
+    return out
+end
+
+"""Allocating wrapper around `solve_sparse!`."""
+function solve_sparse(cache::KLULinSolveCache{Tv},
+    B::SparseMatrixCSC{<:Number, Int};
+    block::Int = SPARSE_RHS_DEFAULT_BLOCK,
+) where {Tv <: Union{Float64, ComplexF64}}
+    return solve_sparse!(
+        cache, B, Matrix{Tv}(undef, _dim(cache), size(B, 2));
+        block = block,
+    )
+end
diff --git a/src/NetworkReductionData.jl b/src/NetworkReductionData.jl
index f263c5102..b8a1ec6c4 100644
--- a/src/NetworkReductionData.jl
+++ b/src/NetworkReductionData.jl
@@ -59,11 +59,16 @@ function get_typed_reverse_direct_branch_map(
     return b.reverse_direct_branch_map[T]::Dict{T, Tuple{Int, Int}}
 end
 
+# Per-type bucket is widened to `AbstractBranchesParallel` so that a
+# `MixedBranchesParallel` group is reachable under each of its underlying
+# branch types (e.g. both `parallel_branch_map[Line]` and
+# `parallel_branch_map[MonitoredLine]` see the same group). Pure
+# `BranchesParallel{T}` groups remain assignment-compatible.
 function get_typed_parallel_branch_map(
     b::BranchMapsByType,
     ::Type{T},
 ) where {T <: PSY.ACTransmission}
-    return b.parallel_branch_map[T]::Dict{Tuple{Int, Int}, BranchesParallel{T}}
+    return b.parallel_branch_map[T]::Dict{Tuple{Int, Int}, AbstractBranchesParallel}
 end
 
 function get_typed_reverse_parallel_branch_map(
@@ -110,7 +115,7 @@ network reduction algorithms.
 - `reverse_bus_search_map::Dict{Int, Int}`: Maps eliminated buses to their parent buses
 - `direct_branch_map::Dict{Tuple{Int, Int}, PSY.ACTransmission}`: One-to-one branch mappings
 - `reverse_direct_branch_map::Dict{PSY.ACTransmission, Tuple{Int, Int}}`: Reverse direct mappings
-- `parallel_branch_map::Dict{Tuple{Int, Int}, BranchesParallel}`: Parallel branch combinations
+- `parallel_branch_map::Dict{Tuple{Int, Int}, AbstractBranchesParallel}`: Parallel branch combinations (homogeneous `BranchesParallel{T}` or `MixedBranchesParallel`)
 - `reverse_parallel_branch_map::Dict{PSY.ACTransmission, Tuple{Int, Int}}`: Reverse parallel mappings
 - `series_branch_map::Dict{Tuple{Int, Int}, BranchesSeries}`: Series branch combinations
 - `reverse_series_branch_map::Dict{Any, Tuple{Int, Int}}`: Reverse series mappings
@@ -137,8 +142,8 @@ network reduction algorithms.
         Dict{Tuple{Int, Int}, PSY.ACTransmission}()
     reverse_direct_branch_map::Dict{PSY.ACTransmission, Tuple{Int, Int}} =
         Dict{PSY.ACTransmission, Tuple{Int, Int}}()
-    parallel_branch_map::Dict{Tuple{Int, Int}, BranchesParallel} =
-        Dict{Tuple{Int, Int}, BranchesParallel}()
+    parallel_branch_map::Dict{Tuple{Int, Int}, AbstractBranchesParallel} =
+        Dict{Tuple{Int, Int}, AbstractBranchesParallel}()
     reverse_parallel_branch_map::Dict{<:PSY.ACTransmission, Tuple{Int, Int}} =
         Dict{PSY.ACTransmission, Tuple{Int, Int}}()
     series_branch_map::Dict{Tuple{Int, Int}, BranchesSeries} =
@@ -270,7 +275,7 @@ function populate_branch_maps_by_type!(nrd::NetworkReductionData, filters = Dict
                 map_by_type = get!(
                     all_branch_maps_by_type.parallel_branch_map,
                     concrete_type,
-                    Dict{Tuple{Int, Int}, BranchesParallel{_get_segment_type(v)}}(),
+                    _empty_parallel_branch_map(),
                 )
                 map_by_type[k] = v
                 name_to_arc_map = get!(
@@ -385,16 +390,27 @@ function populate_branch_maps_by_type!(nrd::NetworkReductionData, filters = Dict
 end
 
 _get_segment_components(x::T) where {T <: PSY.ACBranch} = [x]
-_get_segment_components(x::BranchesParallel{T}) where {T <: PSY.ACTransmission} = x.branches
+_get_segment_components(x::AbstractBranchesParallel) = x.branches
 _get_segment_type(::T) where {T <: PSY.ACBranch} = T
 _get_segment_type(::BranchesParallel{T}) where {T <: PSY.ACTransmission} = T
+_get_segment_type(::MixedBranchesParallel) = MixedBranchesParallel
 _get_segment_type(
     ::ThreeWindingTransformerWinding{T},
 ) where {T <: PSY.ThreeWindingTransformer} = T
 
 _get_concrete_types(x::T) where {T <: PSY.ACBranch} = [T]
-_get_concrete_types(x::BranchesParallel{T}) where {T <: PSY.ACTransmission} =
-    unique(_get_segment_type.(x.branches))
+_get_concrete_types(::BranchesParallel{T}) where {T <: PSY.ACTransmission} = [T]
+# A heterogeneous group must be discoverable under each of its members'
+# concrete branch types so that downstream per-type iteration (e.g. PSI's
+# `name_to_arc_map[MonitoredLine]`) can find it.
+_get_concrete_types(bp::MixedBranchesParallel) = unique(typeof.(bp.branches))
+
+# Construct an empty per-slot dict for `BranchMapsByType.parallel_branch_map`.
+# Value type is `AbstractBranchesParallel` so that the same per-type bucket can
+# hold either a homogeneous `BranchesParallel{T}` or a `MixedBranchesParallel`
+# that includes a branch of type `T`.
+_empty_parallel_branch_map() =
+    Dict{Tuple{Int, Int}, AbstractBranchesParallel}()
 
 get_irreducible_buses(rb::NetworkReductionData) = rb.irreducible_buses
 """
diff --git a/src/PowerNetworkMatrices.jl b/src/PowerNetworkMatrices.jl
index 13bc15e94..5239f5859 100644
--- a/src/PowerNetworkMatrices.jl
+++ b/src/PowerNetworkMatrices.jl
@@ -29,6 +29,9 @@ export DC_vPTDF_Matrix
 export DC_BA_Matrix
 export AC_Ybus_Matrix
 export YBUS_ELTYPE
+export get_sum_of_max_rating
+export get_single_element_contingency_rating
+export get_impedance_averaged_rating
 
 export apply_woodbury_correction
 export clear_all_caches!
@@ -64,15 +67,30 @@ import DataStructures: SortedDict
 import SparseArrays
 import SparseArrays: rowvals, nzrange
 import HDF5
-import KLU: klu
-import KLU
 import LinearAlgebra
 import LinearAlgebra: BLAS.gemm
 import LinearAlgebra: ldiv!, mul!, I, dot
 import LinearAlgebra: LAPACK.getrf!, LAPACK.getrs!
 import Preferences
 
+include("KLUWrapper/KLUWrapper.jl")
+import .KLUWrapper:
+    KLULinSolveCache,
+    klu_factorize,
+    symbolic_factor!,
+    symbolic_refactor!,
+    numeric_refactor!,
+    full_factor!,
+    full_refactor!,
+    solve!,
+    tsolve!,
+    solve_sparse!,
+    solve_sparse,
+    n_valid,
+    is_factored
+
 include("linalg_settings.jl")
+include("solver_dispatch.jl")
 
 function __init__()
     something(get_linalg_backend_check(), false) && check_linalg_backend()
diff --git a/src/PowerNetworkMatrix.jl b/src/PowerNetworkMatrix.jl
index d2e205b6a..464846327 100644
--- a/src/PowerNetworkMatrix.jl
+++ b/src/PowerNetworkMatrix.jl
@@ -169,11 +169,13 @@ function Base.eltype(iter::PowerNetworkMatrixKeys)
 end
 function Base.iterate(iter::PowerNetworkMatrixKeys)
     next = iterate(iter.product_iter)
-    return next === nothing ? nothing : (PowerNetworkMatrixKey(next[1]), next[2])
+    isnothing(next) && return nothing
+    return (PowerNetworkMatrixKey(next[1]), next[2])
 end
 function Base.iterate(iter::PowerNetworkMatrixKeys, state)
     next = iterate(iter.product_iter, state)
-    return next === nothing ? nothing : (PowerNetworkMatrixKey(next[1]), next[2])
+    isnothing(next) && return nothing
+    return (PowerNetworkMatrixKey(next[1]), next[2])
 end
 function Base.keys(a::PowerNetworkMatrix)
     return PowerNetworkMatrixKeys(Base.Iterators.product(a.axes...))
@@ -327,7 +329,7 @@ the UUID of `sys`. No-op when the matrix does not track system origin.
 """
 function _validate_system_uuid(mat::PowerNetworkMatrix, sys::PSY.System)
     mat_uuid = get_system_uuid(mat)
-    if mat_uuid !== nothing && mat_uuid != IS.get_uuid(sys)
+    if !isnothing(mat_uuid) && mat_uuid != IS.get_uuid(sys)
         error(
             "System UUID mismatch: the matrix was constructed from a system with " *
             "UUID $mat_uuid, but the provided system has UUID $(IS.get_uuid(sys)). " *
diff --git a/src/PowerflowMatrixTypes.jl b/src/PowerflowMatrixTypes.jl
index 20f19206d..98d103289 100644
--- a/src/PowerflowMatrixTypes.jl
+++ b/src/PowerflowMatrixTypes.jl
@@ -1,19 +1,20 @@
 const DC_ABA_Matrix_Factorized = ABA_Matrix{
     Tuple{Vector{Int64}, Vector{Int64}},
     Tuple{Dict{Int64, Int64}, Dict{Int64, Int64}},
-    KLU.KLUFactorization{Float64, Int64},
+    KLULinSolveCache{Float64},
 }
 const DC_ABA_Matrix_Unfactorized = ABA_Matrix{
     Tuple{Vector{Int64}, Vector{Int64}},
     Tuple{Dict{Int64, Int64}, Dict{Int64, Int64}},
     Nothing,
 }
-# VirtualPTDF can use various factorization types when extensions are loaded
+# VirtualPTDF can use various factorization types when extensions are loaded;
+# the K type parameter is left unconstrained on purpose.
 const DC_vPTDF_Matrix = VirtualPTDF{
     Tuple{Vector{Tuple{Int, Int}}, Vector{Int64}},
     Tuple{Dict{Tuple{Int, Int}, Int64}, Dict{Int64, Int64}},
-    <:LinearAlgebra.Factorization,
-}
+    K,
+} where {K}
 const DC_PTDF_Matrix = PTDF{
     Tuple{Vector{Int64}, Vector{Tuple{Int, Int}}},
     Tuple{Dict{Int64, Int64}, Dict{Tuple{Int, Int}, Int64}},
diff --git a/src/ThreeWindingTransformerWinding.jl b/src/ThreeWindingTransformerWinding.jl
index f63723535..c4a9e290d 100644
--- a/src/ThreeWindingTransformerWinding.jl
+++ b/src/ThreeWindingTransformerWinding.jl
@@ -162,6 +162,8 @@ function get_equivalent_available(tw::ThreeWindingTransformerWinding)
     return winding_status
 end
 
+PSY.get_available(tw::ThreeWindingTransformerWinding) = get_equivalent_available(tw)
+
 function get_arc_tuple(tr::ThreeWindingTransformerWinding)
     t3W = get_transformer(tr)
     arc_number = get_winding_number(tr)
diff --git a/src/Ybus.jl b/src/Ybus.jl
index fe17d043d..7a8ceb527 100644
--- a/src/Ybus.jl
+++ b/src/Ybus.jl
@@ -163,18 +163,65 @@ function get_reduction(
     return get_reduction(A, sys, reduction)
 end
 
+function _make_parallel_branch_pair(
+    a::T,
+    b::T,
+    ::Tuple{Int, Int},
+) where {T <: PSY.ACTransmission}
+    return BranchesParallel(T[a, b])
+end
+
+function _make_parallel_branch_pair(
+    a::PSY.ACTransmission,
+    b::PSY.ACTransmission,
+    arc_tuple::Tuple{Int, Int},
+)
+    @warn "Mismatch in parallel device types for arc $(arc_tuple). This could indicate issues in the network data."
+    return MixedBranchesParallel(PSY.ACTransmission[a, b])
+end
+
 function _push_parallel_branch!(
     parallel_branch_map::Dict,
     arc_tuple::Tuple{Int, Int},
+    br::PSY.ACTransmission,
+)
+    existing = parallel_branch_map[arc_tuple]
+    _push_parallel_branch_dispatch!(parallel_branch_map, arc_tuple, existing, br)
+    return
+end
+
+function _push_parallel_branch_dispatch!(
+    ::Dict,
+    ::Tuple{Int, Int},
+    existing::BranchesParallel{T},
     br::T,
 ) where {T <: PSY.ACTransmission}
-    if get_branch_type(parallel_branch_map[arc_tuple]) == T
-        add_branch!(parallel_branch_map[arc_tuple], br)
-    else
+    add_branch!(existing, br)
+    return
+end
+
+function _push_parallel_branch_dispatch!(
+    parallel_branch_map::Dict,
+    arc_tuple::Tuple{Int, Int},
+    existing::BranchesParallel,
+    br::PSY.ACTransmission,
+)
+    @warn "Mismatch in parallel device types for arc $(arc_tuple). This could indicate issues in the network data."
+    parallel_branch_map[arc_tuple] =
+        MixedBranchesParallel(PSY.ACTransmission[existing.branches..., br])
+    return
+end
+
+function _push_parallel_branch_dispatch!(
+    ::Dict,
+    arc_tuple::Tuple{Int, Int},
+    existing::MixedBranchesParallel,
+    br::PSY.ACTransmission,
+)
+    if !any(typeof(b) === typeof(br) for b in existing.branches)
         @warn "Mismatch in parallel device types for arc $(arc_tuple). This could indicate issues in the network data."
-        parallel_branch_map[arc_tuple] =
-            BranchesParallel([parallel_branch_map[arc_tuple].branches..., br])
     end
+    add_branch!(existing, br)
     return
 end
 
@@ -217,7 +264,8 @@ function add_to_branch_maps!(
         corresponding_branch = direct_branch_map[arc_tuple]
         delete!(direct_branch_map, arc_tuple)
         delete!(reverse_direct_branch_map, corresponding_branch)
-        parallel_branch_map[arc_tuple] = BranchesParallel([corresponding_branch, br])
+        parallel_branch_map[arc_tuple] =
+            _make_parallel_branch_pair(corresponding_branch, br, arc_tuple)
         reverse_parallel_branch_map[corresponding_branch] = arc_tuple
         reverse_parallel_branch_map[br] = arc_tuple
     else
@@ -412,7 +460,7 @@ function ybus_branch_entries(br::PSY.GenericArcImpedance)
     return (Y11, Y12, Y21, Y22)
 end
 
-function ybus_branch_entries(parallel_br::BranchesParallel)
+function ybus_branch_entries(parallel_br::AbstractBranchesParallel)
     arc = get_arc_tuple(first(parallel_br))
     Y11 = Y12 = Y21 = Y22 = zero(YBUS_ELTYPE)
     for br in parallel_br
@@ -1707,7 +1755,7 @@ end
 
 """
     add_segment_to_ybus!(
-        segment::BranchesParallel,
+        segment::AbstractBranchesParallel,
         y11::Vector{YBUS_ELTYPE},
         y12::Vector{YBUS_ELTYPE},
         y21::Vector{YBUS_ELTYPE},
@@ -1725,7 +1773,7 @@ branches between the same pair of buses. Each branch in the set is added to the
 same Y-bus position, effectively combining their admittances.
 
 # Arguments
-- `segment::BranchesParallel`: Set of parallel AC transmission branches
+- `segment::AbstractBranchesParallel`: Set of parallel AC transmission branches
 - `y11::Vector{YBUS_ELTYPE}`: Vector for from-bus self admittances
 - `y12::Vector{YBUS_ELTYPE}`: Vector for from-to mutual admittances
 - `y21::Vector{YBUS_ELTYPE}`: Vector for to-from mutual admittances
@@ -1746,7 +1794,7 @@ same Y-bus position, effectively combining their admittances.
 - [`DegreeTwoReduction`](@ref): Series chain elimination
 """
 function add_segment_to_ybus!(
-    segment::BranchesParallel,
+    segment::AbstractBranchesParallel,
     y11::Vector{YBUS_ELTYPE},
     y12::Vector{YBUS_ELTYPE},
     y21::Vector{YBUS_ELTYPE},
diff --git a/src/common.jl b/src/common.jl
index f50b57c79..70a1b50ff 100644
--- a/src/common.jl
+++ b/src/common.jl
@@ -102,11 +102,11 @@ function get_arc_tuple(br::PSY.ACTransmission, nr::NetworkReductionData)
 end
 
 # Parallel branches: all oriented in same direction, so just take arc of first.
-function get_arc_tuple(br::BranchesParallel, nr::NetworkReductionData)
+function get_arc_tuple(br::AbstractBranchesParallel, nr::NetworkReductionData)
     get_arc_tuple(PSY.get_arc(first(br)), nr)
 end
 
-function get_arc_tuple(br::BranchesParallel)
+function get_arc_tuple(br::AbstractBranchesParallel)
     return get_arc_tuple(PSY.get_arc(first(br)))
 end
 
@@ -288,8 +288,8 @@ end
 """
     _get_equivalent_physical_branch_parameters(equivalent_ybus::Matrix{$YBUS_ELTYPE})
 
-Takes as input a 2x2 Matrix{$YBUS_ELTYPE} representing the Ybus contribution of either a
-BranchesParallel or BranchesSeries object.
+Takes as input a 2x2 Matrix{$YBUS_ELTYPE} representing the Ybus contribution of either an
+AbstractBranchesParallel (homogeneous or mixed) or BranchesSeries object.
 Returns a dictionary of equivalent parameters, matching the PowerModels data format.
 """
 function _get_equivalent_physical_branch_parameters(equivalent_ybus::Matrix{YBUS_ELTYPE})
@@ -345,7 +345,7 @@ function has_time_series(
 end
 
 function has_time_series(
-    branch::BranchesParallel,
+    branch::AbstractBranchesParallel,
     ts_type::Type{T},
     ts_name::String,
 ) where {
@@ -388,7 +388,7 @@ function get_device_with_time_series(
 end
 
 function get_device_with_time_series(
-    branch::BranchesParallel,
+    branch::AbstractBranchesParallel,
     ts_type::Type{T},
     ts_name::String,
 ) where {
@@ -477,7 +477,7 @@ function _segment_susceptance_after_outage(
 end
 
 function _segment_susceptance_after_outage(
-    segment::BranchesParallel,
+    segment::AbstractBranchesParallel,
     tripped_set::Set{<:PSY.ACTransmission},
 )::Float64
     b_remaining = 0.0
diff --git a/src/degree_two_reduction.jl b/src/degree_two_reduction.jl
index feebd504b..1af6785be 100644
--- a/src/degree_two_reduction.jl
+++ b/src/degree_two_reduction.jl
@@ -37,7 +37,7 @@ function get_degree2_reduction(
     bus_lookup::Dict{Int, Int},
     exempt_bus_positions::Set{Int},
     direct_branch_map::Dict{Tuple{Int, Int}, PSY.ACTransmission},
-    parallel_branch_map::Dict{Tuple{Int, Int}, BranchesParallel},
+    parallel_branch_map::Dict{Tuple{Int, Int}, AbstractBranchesParallel},
     transformer3W_map::Dict{Tuple{Int, Int}, ThreeWindingTransformerWinding},
 )
     reverse_bus_lookup = Dict(v => k for (k, v) in bus_lookup)
@@ -76,7 +76,7 @@ end
 function _add_to_reverse_series_branch_map!(
     reverse_series_branch_map::Dict{PSY.ACTransmission, Tuple{Int, Int}},
     composite_arc::Tuple{Int, Int},
-    segment::BranchesParallel,
+    segment::AbstractBranchesParallel,
 )
     for branch in segment.branches
         reverse_series_branch_map[branch] = composite_arc
@@ -113,7 +113,7 @@ end
 
 function _get_branch_map_entry(
     direct_branch_map::Dict{Tuple{Int, Int}, PSY.ACTransmission},
-    parallel_branch_map::Dict{Tuple{Int, Int}, BranchesParallel},
+    parallel_branch_map::Dict{Tuple{Int, Int}, AbstractBranchesParallel},
     transformer3W_map::Dict{Tuple{Int, Int}, ThreeWindingTransformerWinding},
     arc::Tuple{Int, Int},
 )
diff --git a/src/lodf_calculations.jl b/src/lodf_calculations.jl
index 8e39d579b..78dae430c 100644
--- a/src/lodf_calculations.jl
+++ b/src/lodf_calculations.jl
@@ -100,7 +100,7 @@ end
 
 function _buildlodf(
     a::SparseArrays.SparseMatrixCSC{Int8, Int},
-    k::KLU.KLUFactorization{Float64, Int},
+    k::KLULinSolveCache{Float64},
     ba::SparseArrays.SparseMatrixCSC{Float64, Int},
     ref_bus_positions::Set{Int},
     ::KLUSolver,
@@ -110,7 +110,7 @@ end
 
 function _buildlodf(
     a::SparseArrays.SparseMatrixCSC{Int8, Int},
-    k::KLU.KLUFactorization{Float64, Int},
+    k::KLULinSolveCache{Float64},
     ba::SparseArrays.SparseMatrixCSC{Float64, Int},
     ref_bus_positions::Set{Int},
     ::LinearSolverType,
@@ -120,17 +120,15 @@ end
 
 function _calculate_LODF_matrix_KLU(
     a::SparseArrays.SparseMatrixCSC{Int8, Int},
-    k::KLU.KLUFactorization{Float64, Int},
+    k::KLULinSolveCache{Float64},
     ba::SparseArrays.SparseMatrixCSC{Float64, Int},
     ref_bus_positions::Set{Int},
 )
     linecount = size(ba, 2)
-    # get inverse of aba
-    first_ = zeros(size(a, 2), size(a, 1))
     valid_ix = setdiff(1:size(a, 2), ref_bus_positions)
-    copyto!(first_, transpose(a))
-    first_[valid_ix, :] = KLU.solve!(k, first_[valid_ix, :])
-    first_[collect(ref_bus_positions), :] .= 0.0
+    a_t_valid = SparseArrays.SparseMatrixCSC(transpose(a))[valid_ix, :]
+    first_ = zeros(size(a, 2), size(a, 1))
+    solve_sparse!(k, a_t_valid, view(first_, valid_ix, :))
     ptdf_denominator = first_' * ba
 
     m_I = Int[]
@@ -144,8 +142,8 @@ function _calculate_LODF_matrix_KLU(
             push!(m_V, 1 - ptdf_denominator[iline, iline])
         end
     end
-    Dem_LU = klu(SparseArrays.sparse(m_I, m_I, m_V))
-    KLU.solve!(Dem_LU, ptdf_denominator)
+    Dem_cache = klu_factorize(SparseArrays.sparse(m_I, m_I, m_V))
+    solve!(Dem_cache, ptdf_denominator)
     ptdf_denominator[SparseArrays.diagind(ptdf_denominator)] .= -1.0
     return ptdf_denominator
 end
@@ -167,8 +165,9 @@ function _calculate_LODF_matrix_KLU(
             push!(m_V, 1 - ptdf_denominator_t[iline, iline])
         end
     end
-    Dem_LU = klu(SparseArrays.sparse(m_I, m_I, m_V))
-    lodf_t = Dem_LU \ ptdf_denominator_t
+    Dem_cache = klu_factorize(SparseArrays.sparse(m_I, m_I, m_V))
+    lodf_t = copy(ptdf_denominator_t)
+    solve!(Dem_cache, lodf_t)
     lodf_t[SparseArrays.diagind(lodf_t)] .= -1.0
 
     return lodf_t
diff --git a/src/network_modification.jl b/src/network_modification.jl
index a00ea3d6c..08b94b076 100644
--- a/src/network_modification.jl
+++ b/src/network_modification.jl
@@ -5,7 +5,7 @@ Compute the Pi-model Ybus delta for a partial outage on a parallel branch group.
 Finds the circuit whose susceptance matches `delta_b` and returns its negated Pi-model entries.
 """
 function _compute_parallel_partial_ybus_delta(
-    bp::BranchesParallel,
+    bp::AbstractBranchesParallel,
     delta_b::Float64,
 )::NTuple{4, YBUS_ELTYPE}
     for br in bp.branches
diff --git a/src/ptdf_calculations.jl b/src/ptdf_calculations.jl
index e0fd1f700..132fe16b4 100644
--- a/src/ptdf_calculations.jl
+++ b/src/ptdf_calculations.jl
@@ -137,33 +137,29 @@ function _calculate_PTDF_matrix_KLU(
     dist_slack::Vector{Float64})
     linecount = size(BA, 2)
     buscount = size(BA, 1)
-
-    ABA = calculate_ABA_matrix(A, BA, ref_bus_positions)
-    K = klu(ABA)
-    # initialize matrices for evaluation
-    valid_ix = setdiff(1:buscount, ref_bus_positions)
-    PTDFm_t = zeros(buscount, linecount)
-    copyto!(PTDFm_t, BA)
     if !isempty(dist_slack) && length(ref_bus_positions) != 1
         error(
             "Distributed slack is not supported for systems with multiple reference buses.",
         )
-    elseif isempty(dist_slack) && length(ref_bus_positions) < buscount
-        PTDFm_t[valid_ix, :] = KLU.solve!(K, PTDFm_t[valid_ix, :])
-        PTDFm_t[collect(ref_bus_positions), :] .= 0.0
-        return PTDFm_t
-    elseif length(dist_slack) == buscount
-        @info "Distributed bus"
-        PTDFm_t[valid_ix, :] = KLU.solve!(K, PTDFm_t[valid_ix, :])
-        PTDFm_t[collect(ref_bus_positions), :] .= 0.0
-        slack_array = dist_slack / sum(dist_slack)
-        slack_array = reshape(slack_array, 1, buscount)
-        return PTDFm_t .- (slack_array * PTDFm_t)
-    else
+    end
+    if !isempty(dist_slack) && length(dist_slack) != buscount
         error("Distributed bus specification doesn't match the number of buses.")
     end
+    length(ref_bus_positions) < buscount || error(
+        "All buses are reference buses; PTDF is not defined.",
+    )
 
-    return
+    ABA = calculate_ABA_matrix(A, BA, ref_bus_positions)
+    cache = klu_factorize(ABA)
+    valid_ix = setdiff(1:buscount, ref_bus_positions)
+    PTDFm_t = zeros(buscount, linecount)
+    solve_sparse!(cache, BA[valid_ix, :], view(PTDFm_t, valid_ix, :))
+
+    isempty(dist_slack) && return PTDFm_t
+
+    @info "Distributed bus"
+    slack_array = reshape(dist_slack ./ sum(dist_slack), 1, buscount)
+    return PTDFm_t .- (slack_array * PTDFm_t)
 end
 
 function _binfo_check(binfo::Int)
diff --git a/src/radial_reduction.jl b/src/radial_reduction.jl
index 6859a4b90..01c2820a0 100644
--- a/src/radial_reduction.jl
+++ b/src/radial_reduction.jl
@@ -37,9 +37,10 @@ function _build_row_to_cols(A::SparseArrays.SparseMatrixCSC{Int8, Int}, buscount
     n_rows = size(A, 1)
     row_first_col = zeros(Int, n_rows)
     row_to_cols = Vector{Tuple{Int, Int}}(undef, n_rows)
+    Arowval = SparseArrays.rowvals(A)
     for col in 1:buscount
-        for k in A.colptr[col]:(A.colptr[col + 1] - 1)
-            row = A.rowval[k]
+        for k in SparseArrays.nzrange(A, col)
+            row = Arowval[k]
             if row_first_col[row] == 0
                 row_first_col[row] = col
             else
diff --git a/src/row_cache.jl b/src/row_cache.jl
index 2f4493514..2d09ca086 100644
--- a/src/row_cache.jl
+++ b/src/row_cache.jl
@@ -176,3 +176,44 @@ function check_cache_size!(cache::RowCache; new_add::Bool = false)
     end
     return
 end
+
+"""
+    cached_row_lookup(compute_row, cache, cache_lock, row, column, tol) -> value
+
+Shared cache-fast-path / compute / double-checked-insert pattern used by
+`VirtualPTDF._getindex` and `VirtualLODF._getindex`. Acquires `cache_lock`
+to test for a hit, runs `compute_row` outside the lock on a miss
+(KLU solves dominate the cost), then takes the lock again to insert. A
+concurrent producer that wins the insert race wins; the other side
+returns the winner's row.
+
+`compute_row` is the first positional argument so callers can pass the
+miss-path computation as a `do … end` block:
+
+```julia
+return cached_row_lookup(
+    vlodf.cache, vlodf.cache_lock, row, column, get_tol(vlodf),
+) do
+    _compute_lodf_row(vlodf, row)
+end
+```
+"""
+function cached_row_lookup(
+    compute_row,
+    cache::RowCache,
+    cache_lock::ReentrantLock,
+    row::Int,
+    column::Union{Int, Colon},
+    tol::Float64,
+)
+    @lock cache_lock begin
+        haskey(cache, row) && return cache.temp_cache[row][column]
+    end
+    row_data = compute_row()
+    stored = tol > eps() ? sparsify(row_data, tol) : row_data
+    @lock cache_lock begin
+        haskey(cache, row) && return cache.temp_cache[row][column]
+        cache[row] = stored
+        return cache[row][column]
+    end
+end
diff --git a/src/solver_dispatch.jl b/src/solver_dispatch.jl
new file mode 100644
index 000000000..7a31ba86c
--- /dev/null
+++ b/src/solver_dispatch.jl
@@ -0,0 +1,45 @@
+# Shared dispatch for "run a solve under whatever solver we have." Lives
+# outside the Virtual{PTDF, LODF, MODF} files so all three matrices share
+# the same `with_solver` seam and the same KLU/AppleAccelerate factory.
+#
+# All libklu activity in this process serializes through `_LIBKLU_LOCK`
+# (see `KLUWrapper.jl`). A pool-of-independent-caches design was tried
+# and removed: empirically, distinct `Numeric`/`Symbolic`/`Common` per
+# thread does not prevent libklu state corruption, and the per-cache
+# `solver_lock` we hold here further serializes any non-libklu work in
+# the callback. One factor + one cache per Virtual matrix is what
+# remains — same throughput as the pool variant once the global lock
+# is in place.
+
+"""
+    with_solver(f, K, work_ba_col, temp_data, solver_lock) -> result
+
+Acquire `solver_lock`, then invoke `f(K, work_ba_col[1], temp_data[1])`.
+Two overloads: one specialized on `KLULinSolveCache{Float64}` (the KLU
+backend), one generic for any other factorization (`AppleAccelerate.AAFactorization`,
+etc.). Both serialize through `solver_lock`; the per-cache scratch
+slot at index 1 is the only slot — `work_ba_col` and `temp_data` are
+single-element vectors, kept as `Vector{Vector{Float64}}` because
+`_solve_factorization` and the AppleAccelerate extension are typed on
+`Vector{Float64}` and the two buffers have different lengths
+(`n_buses` vs. `n_buses - n_ref_buses`).
+"""
+function with_solver(
+    f::F,
+    K::KLULinSolveCache{Float64},
+    work_ba_col::Vector{Vector{Float64}},
+    temp_data::Vector{Vector{Float64}},
+    solver_lock::ReentrantLock,
+) where {F}
+    return @lock solver_lock f(K, work_ba_col[1], temp_data[1])
+end
+
+function with_solver(
+    f::F,
+    K::KT,
+    work_ba_col::Vector{Vector{Float64}},
+    temp_data::Vector{Vector{Float64}},
+    solver_lock::ReentrantLock,
+) where {F, KT}
+    return @lock solver_lock f(K, work_ba_col[1], temp_data[1])
+end
diff --git a/src/virtual_lodf_calculations.jl b/src/virtual_lodf_calculations.jl
index 0bab28cb2..d6f4be258 100644
--- a/src/virtual_lodf_calculations.jl
+++ b/src/virtual_lodf_calculations.jl
@@ -9,9 +9,16 @@ The VirtualLODF is initialized with no row stored.
 
 The VirtualLODF struct is indexed using branch names.
 
+# Thread-safety
+
+Concurrent `getindex` (and `get_partial_lodf_row`) is safe but serialized:
+every libklu solve runs under `_LIBKLU_LOCK` (process-wide) and the per-cache
+`solver_lock`, and the row cache is guarded by `cache_lock`. Multi-threaded
+callers can issue requests concurrently; the libklu work runs one at a time.
+
 # Arguments
-- `K::KLU.KLUFactorization{Float64, Int}`:
-        LU factorization matrices of the ABA matrix, evaluated by means of KLU.
+- `K::KLULinSolveCache{Float64}`:
+        ABA factorization (single cache; solves serialize via the locks above).
 - `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`:
         BA matrix.
 - `A::SparseArrays.SparseMatrixCSC{Int8, Int}`:
@@ -45,20 +52,25 @@ The VirtualLODF struct is indexed using branch names.
 - `valid_ix::Vector{Int}`:
         Vector containing the row/columns indices of matrices related the buses
         which are not slack ones.
-- `temp_data::Vector{Float64}`:
-        Temporary vector for internal use.
+- `temp_data::Vector{Vector{Float64}}`:
+        Single-element scratch vector kept as a `Vector{Vector{Float64}}` for
+        uniform `with_solver` callback signatures.
 - `cache::RowCache`:
-        Cache were LODF rows are stored.
+        Cache where LODF rows are stored.
+- `cache_lock::ReentrantLock`:
+        Guards `cache` reads/writes for parallel `getindex` callers.
 - `subnetworks::Dict{Int, Set{Int}}`:
         Dictionary containing the subsets of buses defining the different subnetwork of the system.
 - `tol::Base.RefValue{Float64}`:
         Tolerance related to scarification and values to drop.
 - `network_reduction::NetworkReduction`:
         Structure containing the details of the network reduction applied when computing the matrix
+- `work_ba_col::Vector{Vector{Float64}}`:
+        Single-element BA-column scratch buffer.
 """
-struct VirtualLODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}} <:
+struct VirtualLODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}, K} <:
        PowerNetworkMatrix{Float64}
-    K::KLU.KLUFactorization{Float64, Int}
+    K::K
     BA::SparseArrays.SparseMatrixCSC{Float64, Int}
     A::SparseArrays.SparseMatrixCSC{Int8, Int}
     inv_PTDF_A_diag::Vector{Float64}
@@ -69,12 +81,16 @@ struct VirtualLODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}} <:
     axes::Ax
     lookup::L
     valid_ix::Vector{Int}
-    temp_data::Vector{Float64}
+    temp_data::Vector{Vector{Float64}}
     cache::RowCache
+    cache_lock::ReentrantLock
     subnetwork_axes::Dict{Int, Ax}
     tol::Base.RefValue{Float64}
     network_reduction_data::NetworkReductionData
-    work_ba_col::Vector{Float64}
+    work_ba_col::Vector{Vector{Float64}}
+    # Serializes solves on this cache. `with_solver` always acquires it,
+    # for both the KLU and AppleAccelerate backends.
+    solver_lock::ReentrantLock
 end
 
 get_axes(M::VirtualLODF) = M.axes
@@ -94,7 +110,7 @@ function Base.show(io::IO, ::MIME{Symbol("text/plain")}, array::VirtualLODF)
 end
 
 function _get_PTDF_A_diag(
-    K::KLU.KLUFactorization{Float64, Int},
+    K::KLULinSolveCache{Float64},
     BA::SparseArrays.SparseMatrixCSC{Float64, Int},
     A::SparseArrays.SparseMatrixCSC{Int8, Int},
     ref_bus_positions::Set{Int},
@@ -120,13 +136,12 @@ function _get_PTDF_A_diag(
             ba_col[idx] = BA[bus_idx, i]
         end
 
-        # Solve for PTDF row: ptdf_row_valid = ABA^(-1) * ba_col
-        ptdf_row_valid = K \ ba_col
+        solve!(K, ba_col)
 
         # Map back to full bus indices
         fill!(ptdf_row, 0.0)
         for idx in 1:n_valid
-            ptdf_row[valid_ix[idx]] = ptdf_row_valid[idx]
+            ptdf_row[valid_ix[idx]] = ba_col[idx]
         end
 
         # Compute diagonal element: sum of PTDF[i,j] * A[i,j] for all buses j
@@ -238,17 +253,12 @@ function VirtualLODF(
     subnetwork_axes = make_arc_arc_subnetwork_axes(A)
     BA = BA_Matrix(Ymatrix)
     ABA = calculate_ABA_matrix(A.data, BA.data, Set(ref_bus_positions))
-    K = klu(ABA)
+    K = klu_factorize(ABA)
     bus_ax = get_bus_axis(A)
 
-    temp_data = zeros(length(bus_ax))
     valid_ix = setdiff(1:length(bus_ax), ref_bus_positions)
-    PTDF_diag = _get_PTDF_A_diag(
-        K,
-        BA.data,
-        A.data,
-        Set(ref_bus_positions),
-    )
+    # PTDF diagonal precompute runs serially on the dispatcher thread.
+    PTDF_diag = _get_PTDF_A_diag(K, BA.data, A.data, Set(ref_bus_positions))
     PTDF_A_diag_raw = copy(PTDF_diag)
     arc_susceptances = _extract_arc_susceptances(BA.data)
     branch_susceptances_by_arc = _extract_branch_susceptances_by_arc(
@@ -268,8 +278,9 @@ function VirtualLODF(
             )
     end
 
-    # Pre-allocate work array for BA column extraction
-    work_ba_col = zeros(length(valid_ix))
+    # Single scratch slot — solves serialize via `solver_lock` + `_LIBKLU_LOCK`.
+    temp_data = [zeros(length(bus_ax))]
+    work_ba_col = [zeros(length(valid_ix))]
 
     return VirtualLODF(
         K,
@@ -285,10 +296,12 @@ function VirtualLODF(
         valid_ix,
         temp_data,
         empty_cache,
+        ReentrantLock(),
         subnetwork_axes,
         Ref(tol),
         Ymatrix.network_reduction_data,
         work_ba_col,
+        ReentrantLock(),
     )
 end
 
@@ -322,37 +335,37 @@ if isdefined(Base, :print_array) # 0.7 and later
     Base.print_array(io::IO, X::VirtualLODF) = "VirtualLODF"
 end
 
-function _getindex(
-    vlodf::VirtualLODF,
-    row::Int,
-    column::Union{Int, Colon},
-)
-    # check if value is in the cache
-    if haskey(vlodf.cache, row)
-        return vlodf.cache.temp_cache[row][column]
-    else
-        # evaluate the value for the LODF column
-        # Use pre-allocated work array instead of collect() to reduce allocations
+# Compute the LODF row for `row` using exclusive per-worker scratch. Pure
+# computation: no cache reads/writes, no tolerance application.
+function _compute_lodf_row(vlodf::VirtualLODF, row::Int)::Vector{Float64}
+    return with_solver(
+        vlodf.K, vlodf.work_ba_col, vlodf.temp_data, vlodf.solver_lock,
+    ) do K_solver, work_ba_col, temp_data
         @inbounds for i in eachindex(vlodf.valid_ix)
-            vlodf.work_ba_col[i] = vlodf.BA[vlodf.valid_ix[i], row]
+            work_ba_col[i] = vlodf.BA[vlodf.valid_ix[i], row]
         end
-        lin_solve = KLU.solve!(vlodf.K, vlodf.work_ba_col)
+        lin_solve = _solve_factorization(K_solver, work_ba_col)
 
-        # get full lodf row
+        fill!(temp_data, 0.0)
         @inbounds for i in eachindex(vlodf.valid_ix)
-            vlodf.temp_data[vlodf.valid_ix[i]] = lin_solve[i]
+            temp_data[vlodf.valid_ix[i]] = lin_solve[i]
         end
 
-        # now get the LODF row
-        lodf_row = (vlodf.A * vlodf.temp_data) .* vlodf.inv_PTDF_A_diag
+        lodf_row = (vlodf.A * temp_data) .* vlodf.inv_PTDF_A_diag
         lodf_row[row] = -1.0
+        return lodf_row
+    end
+end
 
-        if get_tol(vlodf) > eps()
-            vlodf.cache[row] = sparsify(lodf_row, get_tol(vlodf))
-        else
-            vlodf.cache[row] = copy(lodf_row)
-        end
-        return vlodf.cache[row][column]
+function _getindex(
+    vlodf::VirtualLODF,
+    row::Int,
+    column::Union{Int, Colon},
+)
+    return cached_row_lookup(
+        vlodf.cache, vlodf.cache_lock, row, column, get_tol(vlodf),
+    ) do
+        _compute_lodf_row(vlodf, row)
     end
 end
 
@@ -420,10 +433,7 @@ end
 
 Compute the partial LODF column for a susceptance change `delta_b` on arc `arc_idx`.
 
-!!! warning
-    This function is NOT thread-safe. It mutates `vlodf.work_ba_col` and
-    `vlodf.temp_data` on every call. Do not call concurrently on the same
-    `VirtualLODF` instance from multiple threads.
+Concurrent callers serialize on `vlodf.solver_lock` and `_LIBKLU_LOCK`.
 
 Uses the Sherman-Morrison (matrix inversion lemma) formula derived from DC power flow
 sensitivity analysis. For a change Δb in the susceptance of arc e, the change in flow
@@ -459,42 +469,46 @@ function _getindex_partial(
         return zeros(n_arcs)
     end
 
-    # Steps 1-2: Compute B⁻¹(b_e · ν_e) via KLU solve.
-    @inbounds for i in eachindex(vlodf.valid_ix)
-        vlodf.work_ba_col[i] = vlodf.BA[vlodf.valid_ix[i], arc_idx]
-    end
-    lin_solve = KLU.solve!(vlodf.K, vlodf.work_ba_col)
+    return with_solver(
+        vlodf.K, vlodf.work_ba_col, vlodf.temp_data, vlodf.solver_lock,
+    ) do K_solver, work_ba_col, temp_data
+        # Steps 1-2: Compute B⁻¹(b_e · ν_e) via KLU solve.
+        @inbounds for i in eachindex(vlodf.valid_ix)
+            work_ba_col[i] = vlodf.BA[vlodf.valid_ix[i], arc_idx]
+        end
+        lin_solve = _solve_factorization(K_solver, work_ba_col)
 
-    # Step 3: Map solution back to full bus space.
-    fill!(vlodf.temp_data, 0.0)
-    @inbounds for i in eachindex(vlodf.valid_ix)
-        vlodf.temp_data[vlodf.valid_ix[i]] = lin_solve[i]
-    end
+        # Step 3: Map solution back to full bus space.
+        fill!(temp_data, 0.0)
+        @inbounds for i in eachindex(vlodf.valid_ix)
+            temp_data[vlodf.valid_ix[i]] = lin_solve[i]
+        end
 
-    # Step 4: H_col[ℓ] = b_e · C[e,ℓ] for all monitoring arcs ℓ.
-    H_col = vlodf.A * vlodf.temp_data
-
-    # Step 5: Scalar denominator: 1 - α · H[e,e].
-    # α = -Δb / b_e (positive for outage/decrease, negative for increase).
-    H_ee = vlodf.PTDF_A_diag[arc_idx]
-    alpha = -delta_b / b_arc
-    denom = 1.0 - alpha * H_ee
-
-    # Step 6: Partial LODF column: scale by b_ℓ/b_e to convert from C[e,ℓ] to b_ℓ·C[e,ℓ].
-    # partial_lodf[ℓ] = α · b_ℓ/b_e · H_col[ℓ] / denom
-    #                 = α · b_ℓ · C[e,ℓ] / (1 - α · H_ee)
-    partial_lodf =
-        (alpha / (denom * b_arc)) .* (vlodf.arc_susceptances .* H_col)
-
-    # By convention, the outaged arc's own redistribution factor is -1.0 for a full
-    # outage: the arc carries -100% of its own pre-contingency flow post-outage.
-    # The raw formula gives α·H[e,e]/denom for the self-element, which is
-    # b_e·C[e,e]/(1-b_e·C[e,e]) = H_ee/(1-H_ee) ≠ -1 in general.
-    if abs(delta_b + b_arc) < eps() * b_arc
-        partial_lodf[arc_idx] = -1.0
-    end
+        # Step 4: H_col[ℓ] = b_e · C[e,ℓ] for all monitoring arcs ℓ.
+        H_col = vlodf.A * temp_data
+
+        # Step 5: Scalar denominator: 1 - α · H[e,e].
+        # α = -Δb / b_e (positive for outage/decrease, negative for increase).
+        H_ee = vlodf.PTDF_A_diag[arc_idx]
+        alpha = -delta_b / b_arc
+        denom = 1.0 - alpha * H_ee
+
+        # Step 6: Partial LODF column: scale by b_ℓ/b_e to convert from C[e,ℓ] to b_ℓ·C[e,ℓ].
+        # partial_lodf[ℓ] = α · b_ℓ/b_e · H_col[ℓ] / denom
+        #                 = α · b_ℓ · C[e,ℓ] / (1 - α · H_ee)
+        partial_lodf =
+            (alpha / (denom * b_arc)) .* (vlodf.arc_susceptances .* H_col)
+
+        # By convention, the outaged arc's own redistribution factor is -1.0 for a full
+        # outage: the arc carries -100% of its own pre-contingency flow post-outage.
+        # The raw formula gives α·H[e,e]/denom for the self-element, which is
+        # b_e·C[e,e]/(1-b_e·C[e,e]) = H_ee/(1-H_ee) ≠ -1 in general.
+        if abs(delta_b + b_arc) < eps() * b_arc
+            partial_lodf[arc_idx] = -1.0
+        end
 
-    return partial_lodf
+        return partial_lodf
+    end
 end
 
 """
diff --git a/src/virtual_modf_calculations.jl b/src/virtual_modf_calculations.jl
index ebb6fe2d7..cbe596087 100644
--- a/src/virtual_modf_calculations.jl
+++ b/src/virtual_modf_calculations.jl
@@ -11,9 +11,20 @@ Caching is two-tiered:
 - PTDF rows (1 KLU solve each) are cached per (monitored_arc, contingency) via
   one RowCache per contingency
 
+# Thread-safety
+
+Concurrent `getindex` is safe but serialized: `solver_lock` (a `ReentrantLock`)
+is held for the full body of `getindex`, `clear_caches!`, and `clear_all_caches!`,
+so Dict mutations on the cache structures and the libklu solves it wraps all
+run under a single mutex. libklu activity additionally serializes through the
+process-wide `_LIBKLU_LOCK`. Two threads racing on a first-time query for the
+same modification serialize on `solver_lock`; the second observes the populated
+cache and skips the recomputation.
+
 # Arguments
-- `K::KLU.KLUFactorization{Float64, Int}`:
-        LU factorization of ABA matrix.
+- `K::KLULinSolveCache{Float64}`:
+        ABA factorization (single cache; concurrent `getindex` callers
+        serialize through `solver_lock` and `_LIBKLU_LOCK`).
 - `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`:
         BA matrix.
 - `A::SparseArrays.SparseMatrixCSC{Int8, Int}`:
@@ -39,7 +50,9 @@ Caching is two-tiered:
 - `woodbury_cache::Dict{NetworkModification, WoodburyFactors}`:
         Precomputed Woodbury factors keyed by modification.
 - `row_caches::Dict{NetworkModification, RowCache}`:
-        One RowCache per modification, keyed by modification.
+        One `RowCache` per modification. Mutations are serialized by
+        `solver_lock`, the same mutex that wraps the libklu solve, so no
+        separate cache lock is needed.
 - `subnetwork_axes::Dict{Int, Ax}`:
         Maps reference bus indices to subnetwork axes.
 - `tol::Base.RefValue{Float64}`:
@@ -48,17 +61,21 @@ Caching is two-tiered:
         Max cache size in bytes per contingency.
 - `network_reduction_data::NetworkReductionData`:
         Network reduction mappings for branch resolution.
-- `temp_data::Vector{Float64}`:
-        Scratch vector of size n_buses.
-- `work_ba_col::Vector{Float64}`:
-        Pre-allocated work array for BA column extraction.
+- `temp_data::Vector{Vector{Float64}}`:
+        Single-element scratch vector of size n_buses.
+- `work_ba_col::Vector{Vector{Float64}}`:
+        Single-element work array for BA column extraction.
+- `solver_lock::ReentrantLock`:
+        Reentrant; held for the duration of `getindex` / `clear_*caches!`.
+        Serializes both libklu solves and Dict mutations on the caches;
+        combined with `_LIBKLU_LOCK` ensures one libklu call at a time.
 - `system_uuid::Union{Base.UUID, Nothing}`:
         UUID of the system used to construct the matrix, used to validate that
         modification operations are applied to the correct system.
 """
-struct VirtualMODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}} <:
+struct VirtualMODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}, K} <:
        PowerNetworkMatrix{Float64}
-    K::KLU.KLUFactorization{Float64, Int}
+    K::K
     BA::SparseArrays.SparseMatrixCSC{Float64, Int}
     A::SparseArrays.SparseMatrixCSC{Int8, Int}
     PTDF_A_diag::Vector{Float64}
@@ -75,8 +92,9 @@ struct VirtualMODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}} <:
     tol::Base.RefValue{Float64}
     max_cache_size_bytes::Int
     network_reduction_data::NetworkReductionData
-    temp_data::Vector{Float64}
-    work_ba_col::Vector{Float64}
+    temp_data::Vector{Vector{Float64}}
+    work_ba_col::Vector{Vector{Float64}}
+    solver_lock::ReentrantLock
     system_uuid::Union{Base.UUID, Nothing}
 end
 
@@ -93,13 +111,57 @@ get_bus_axis(mat::VirtualMODF) = mat.axes[2]
 get_tol(mat::VirtualMODF) = mat.tol[]
 get_system_uuid(M::VirtualMODF) = M.system_uuid
 
-# Woodbury kernel accessors
-_get_K(m::VirtualMODF) = m.K
+# Woodbury kernel accessors. The Woodbury kernel always goes through
+# `with_solver` so it picks up the `solver_lock` + `_LIBKLU_LOCK` chain.
 _get_BA(m::VirtualMODF) = m.BA
 _get_arc_susceptances(m::VirtualMODF) = m.arc_susceptances
 _get_valid_ix(m::VirtualMODF) = m.valid_ix
-_get_temp_data(m::VirtualMODF) = m.temp_data
-_get_work_ba_col(m::VirtualMODF) = m.work_ba_col
+
+function _compute_woodbury_factors(
+    mat::VirtualMODF,
+    modifications::Tuple{Vararg{ArcModification}},
+)::WoodburyFactors
+    return with_solver(
+        mat.K,
+        mat.work_ba_col,
+        mat.temp_data,
+        mat.solver_lock,
+    ) do K_solver, work_ba_col, temp_data
+        _compute_woodbury_factors_impl(
+            K_solver,
+            work_ba_col,
+            temp_data,
+            mat.BA,
+            mat.arc_susceptances,
+            mat.valid_ix,
+            modifications,
+        )
+    end
+end
+
+function _apply_woodbury_correction(
+    mat::VirtualMODF,
+    monitored_idx::Int,
+    wf::WoodburyFactors,
+)::Vector{Float64}
+    return with_solver(
+        mat.K,
+        mat.work_ba_col,
+        mat.temp_data,
+        mat.solver_lock,
+    ) do K_solver, work_ba_col, temp_data
+        _apply_woodbury_correction_impl(
+            K_solver,
+            work_ba_col,
+            temp_data,
+            mat.BA,
+            mat.arc_susceptances,
+            mat.valid_ix,
+            monitored_idx,
+            wf,
+        )
+    end
+end
 
 """
     get_registered_contingencies(vmodf::VirtualMODF) -> Dict{Base.UUID, ContingencySpec}
@@ -154,6 +216,7 @@ function VirtualMODF(
     tol::Float64 = eps(),
     max_cache_size::Int = MAX_CACHE_SIZE_MiB,
     network_reductions::Vector{NetworkReduction} = NetworkReduction[],
+    automatically_register_outages::Bool = true,
     kwargs...,
 )
     if length(dist_slack) != 0
@@ -171,23 +234,23 @@ function VirtualMODF(
     arc_ax_ref = make_ax_ref(arc_ax)
     bus_ax_ref = make_ax_ref(bus_ax)
     look_up = (arc_ax_ref, bus_ax_ref)
-    # Use A.subnetwork_axes which has Ax == (arc_ax, bus_ax) matching our axes field type
     subnetwork_axes = A.subnetwork_axes
 
     BA = BA_Matrix(Ymatrix)
     ABA = calculate_ABA_matrix(A.data, BA.data, Set(ref_bus_positions))
-    K = klu(ABA)
+    K = klu_factorize(ABA)
 
     valid_ix = setdiff(1:length(bus_ax), ref_bus_positions)
 
-    # Compute raw PTDF diagonal and arc susceptances
+    # PTDF diagonal precompute runs serially on the dispatcher thread.
     PTDF_A_diag = _get_PTDF_A_diag(K, BA.data, A.data, Set(ref_bus_positions))
     arc_susceptances = _extract_arc_susceptances(BA.data)
-    branch_susceptances_by_arc = _extract_branch_susceptances_by_arc(
-        BA.data, arc_ax, Ymatrix.network_reduction_data)
+    branch_susceptances_by_arc =
+        _extract_branch_susceptances_by_arc(BA.data, arc_ax, Ymatrix.network_reduction_data)
 
-    temp_data = zeros(length(bus_ax))
-    work_ba_col = zeros(length(valid_ix))
+    # Single scratch slot — solves serialize via `solver_lock` + `_LIBKLU_LOCK`.
+    temp_data = [zeros(length(bus_ax))]
+    work_ba_col = [zeros(length(valid_ix))]
     max_cache_bytes = max_cache_size * MiB
 
     vmodf = VirtualMODF(
@@ -210,11 +273,12 @@ function VirtualMODF(
         Ymatrix.network_reduction_data,
         temp_data,
         work_ba_col,
+        ReentrantLock(),
         IS.get_uuid(sys),
     )
 
     # Auto-register all outage attributes from the system
-    _register_outages!(vmodf, sys)
+    automatically_register_outages && _register_all_outages!(vmodf, sys)
 
     return vmodf
 end
@@ -222,7 +286,7 @@ end
 # --- Outage registration ---
 
 """
-    _register_outages!(vmodf, sys)
+    _register_all_outages!(vmodf, sys)
 
 Bulk-register all Outage supplemental attributes in the system.
 Called automatically by the VirtualMODF constructor.
@@ -231,7 +295,7 @@ Uses `PSY.get_supplemental_attributes(PSY.Outage, sys)` which accepts
 the abstract type and iterates over all concrete subtypes
 (PlannedOutage, UnplannedOutage).
 """
-function _register_outages!(vmodf::VirtualMODF, sys::PSY.System)
+function _register_all_outages!(vmodf::VirtualMODF, sys::PSY.System)
     count = 0
     for outage in PSY.get_supplemental_attributes(PSY.Outage, sys)
         try
@@ -258,19 +322,16 @@ end
 Resolve an Outage supplemental attribute to a ContingencySpec and cache it.
 Delegates to `NetworkModification(mat, sys, outage)` for the resolution logic.
 """
-function _register_outage!(
-    vmodf::VirtualMODF,
-    sys::PSY.System,
-    outage::PSY.Outage,
-)
+function _register_outage!(vmodf::VirtualMODF, sys::PSY.System, outage::PSY.Outage)
     outage_uuid = IS.get_uuid(outage)
     if haskey(vmodf.contingency_cache, outage_uuid)
-        return vmodf.contingency_cache[outage_uuid]
+        @warn "Outage with UUID $(outage_uuid) is already registered; skipping."
+        return
     end
     mod = NetworkModification(vmodf, sys, outage)
     ctg = ContingencySpec(outage_uuid, mod)
     vmodf.contingency_cache[outage_uuid] = ctg
-    return ctg
+    return
 end
 
 # --- Woodbury factor computation ---
@@ -278,24 +339,17 @@ end
 """
     _get_woodbury_factors(vmodf, mod) -> WoodburyFactors
 
-Compute and cache the Woodbury factors for a network modification.
-Delegates to the shared Woodbury kernel `_compute_woodbury_factors`.
-Caches by content hash of the modification.
-
-!!! warning
-    This function is NOT thread-safe. It mutates `vmodf.work_ba_col` on
-    every call. Do not call concurrently on the same `VirtualMODF` instance.
+Return cached Woodbury factors for a modification, computing them on a miss.
+Caller holds `solver_lock`; the inner `_compute_woodbury_factors` re-enters
+that same lock (it's a `ReentrantLock`).
 """
-function _get_woodbury_factors(
-    vmodf::VirtualMODF,
-    mod::NetworkModification,
-)
-    if haskey(vmodf.woodbury_cache, mod)
-        return vmodf.woodbury_cache[mod]
+function _get_woodbury_factors(vmodf::VirtualMODF, mod::NetworkModification)
+    # Use the do-block form, NOT `get!(dict, key, default)`: Julia evaluates
+    # function arguments eagerly, so the 3-arg form would run the M KLU solves
+    # on every call (cache hit included), defeating the cache.
+    return get!(vmodf.woodbury_cache, mod) do
+        _compute_woodbury_factors(vmodf, mod.arc_modifications)
     end
-    wf = _compute_woodbury_factors(vmodf, mod.arc_modifications)
-    vmodf.woodbury_cache[mod] = wf
-    return wf
 end
 
 """
@@ -306,9 +360,6 @@ Gets or computes Woodbury factors, then applies the Woodbury correction.
 
 For N-1 contingencies, the result satisfies:
     post_ptdf[mon, :] = pre_ptdf[mon, :] + LODF[mon, e] * pre_ptdf[e, :]
-
-!!! warning
-    Not thread-safe. Mutates scratch vectors in `vmodf`.
 """
 function _compute_modf_entry(
     vmodf::VirtualMODF,
@@ -319,22 +370,6 @@ function _compute_modf_entry(
     return _apply_woodbury_correction(vmodf, monitored_idx, wf)
 end
 
-# --- Row cache management ---
-
-"""
-    _get_or_create_row_cache(vmodf, mod) -> RowCache
-
-Get or create the per-modification RowCache for the given modification.
-"""
-function _get_or_create_row_cache(vmodf::VirtualMODF, mod::NetworkModification)
-    if !haskey(vmodf.row_caches, mod)
-        row_size = length(vmodf.temp_data) * sizeof(Float64)
-        vmodf.row_caches[mod] =
-            RowCache(vmodf.max_cache_size_bytes, Set{Int}(), row_size)
-    end
-    return vmodf.row_caches[mod]
-end
-
 # --- getindex: by integer monitored index + NetworkModification ---
 
 """
@@ -343,26 +378,24 @@ Uses per-modification RowCache for LRU-eviction caching.
 
 $(TYPEDSIGNATURES)
 """
-function Base.getindex(
-    vmodf::VirtualMODF,
-    monitored_idx::Int,
-    mod::NetworkModification,
-)
-    cache = _get_or_create_row_cache(vmodf, mod)
-
-    if haskey(cache, monitored_idx)
-        return copy(cache[monitored_idx])
-    end
-
-    row = _compute_modf_entry(vmodf, monitored_idx, mod)
-
-    if get_tol(vmodf) > eps()
-        cache[monitored_idx] = sparsify(row, get_tol(vmodf))
-    else
-        cache[monitored_idx] = row
+function Base.getindex(vmodf::VirtualMODF, monitored_idx::Int, mod::NetworkModification)
+    return @lock vmodf.solver_lock begin
+        rc = get!(vmodf.row_caches, mod) do
+            row_size = length(vmodf.temp_data[1]) * sizeof(Float64)
+            RowCache(vmodf.max_cache_size_bytes, Set{Int}(), row_size)
+        end
+        if haskey(rc, monitored_idx)
+            return copy(rc[monitored_idx])
+        end
+        row = _compute_modf_entry(vmodf, monitored_idx, mod)
+        if get_tol(vmodf) > eps()
+            stored = sparsify(row, get_tol(vmodf))
+        else
+            stored = row
+        end
+        rc[monitored_idx] = stored
+        copy(stored)
     end
-
-    return copy(cache[monitored_idx])
 end
 
 """
@@ -387,11 +420,7 @@ Delegates to the NetworkModification-based getindex.
 
 $(TYPEDSIGNATURES)
 """
-function Base.getindex(
-    vmodf::VirtualMODF,
-    monitored_idx::Int,
-    contingency::ContingencySpec,
-)
+function Base.getindex(vmodf::VirtualMODF, monitored_idx::Int, contingency::ContingencySpec)
     return vmodf[monitored_idx, contingency.modification]
 end
 
@@ -411,19 +440,19 @@ The outage must have been registered at VirtualMODF construction time.
 
 $(TYPEDSIGNATURES)
 """
-function Base.getindex(
-    vmodf::VirtualMODF,
-    monitored::Int,
-    outage::PSY.Outage,
-)
+function Base.getindex(vmodf::VirtualMODF, monitored::Int, outage::PSY.Outage)
     outage_uuid = IS.get_uuid(outage)
-    if !haskey(vmodf.contingency_cache, outage_uuid)
-        error(
-            "Outage (UUID=$outage_uuid) is not registered. " *
-            "Construct VirtualMODF with the system containing this outage.",
-        )
+    # Pair with the locked `empty!` in `clear_all_caches!`; without it, a
+    # concurrent clear could rehash `contingency_cache` mid-lookup.
+    ctg = @lock vmodf.solver_lock begin
+        if !haskey(vmodf.contingency_cache, outage_uuid)
+            error(
+                "Outage (UUID=$outage_uuid) is not registered. " *
+                "Construct VirtualMODF with the system containing this outage.",
+            )
+        end
+        vmodf.contingency_cache[outage_uuid]
     end
-    ctg = vmodf.contingency_cache[outage_uuid]
     return vmodf[monitored, ctg.modification]
 end
 
@@ -432,11 +461,7 @@ Arc-tuple indexed version of getindex by PSY.Outage.
 
 $(TYPEDSIGNATURES)
 """
-function Base.getindex(
-    vmodf::VirtualMODF,
-    monitored::Tuple{Int, Int},
-    outage::PSY.Outage,
-)
+function Base.getindex(vmodf::VirtualMODF, monitored::Tuple{Int, Int}, outage::PSY.Outage)
     m_idx = vmodf.lookup[1][monitored]
     return vmodf[m_idx, outage]
 end
@@ -448,8 +473,10 @@ Clear Woodbury and row caches. Does NOT clear the contingency registration
 cache — registered outages remain valid and can be queried again.
 """
 function clear_caches!(vmodf::VirtualMODF)
-    empty!(vmodf.woodbury_cache)
-    empty!(vmodf.row_caches)
+    @lock vmodf.solver_lock begin
+        empty!(vmodf.woodbury_cache)
+        empty!(vmodf.row_caches)
+    end
     return
 end
 
@@ -465,8 +492,10 @@ Use `clear_caches!` instead to preserve contingency registrations while
 freeing computation cache memory.
 """
 function clear_all_caches!(vmodf::VirtualMODF)
-    empty!(vmodf.contingency_cache)
-    empty!(vmodf.woodbury_cache)
-    empty!(vmodf.row_caches)
+    @lock vmodf.solver_lock begin
+        empty!(vmodf.contingency_cache)
+        empty!(vmodf.woodbury_cache)
+        empty!(vmodf.row_caches)
+    end
     return
 end
diff --git a/src/virtual_ptdf_calculations.jl b/src/virtual_ptdf_calculations.jl
index f587add28..89b15df62 100644
--- a/src/virtual_ptdf_calculations.jl
+++ b/src/virtual_ptdf_calculations.jl
@@ -10,9 +10,19 @@ The VirtualPTDF is initialized with no row stored.
 The VirtualPTDF is indexed using branch names and bus numbers as for the PTDF
 matrix.
 
+# Thread-safety
+
+Concurrent `getindex` is safe but serialized: every libklu solve is wrapped
+by `_LIBKLU_LOCK` (process-wide) and the per-cache `solver_lock` here, and
+the row cache is guarded by `cache_lock`. Multiple threads can call
+`getindex` simultaneously; their libklu work runs one at a time, while the
+JuMP-side work (in callers) parallelizes freely.
+
 # Arguments
-- `K::Union{KLU.KLUFactorization{Float64, Int}, AppleAccelerate.AAFactorization{Float64}}`:
-        LU factorization matrices of the ABA matrix, evaluated by means of KLU or AppleAccelerate
+- `K`:
+        LU factorization of the ABA matrix. A `KLULinSolveCache{Float64}` for
+        the default KLU solver, or an `AppleAccelerate.AAFactorization{Float64}`
+        when the AppleAccelerate extension is loaded.
 - `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`:
         BA matrix
 - `ref_bus_positions::Set{Int}`:
@@ -31,25 +41,34 @@ matrix.
         and buses with their enumerated indexes. The branch indexes refer to
         the key of the cache dictionary. The bus indexes refer to the position
         of the elements in the PTDF row stored.
-- `temp_data::Vector{Float64}`:
-        Temporary vector for internal use.
+- `temp_data::Vector{Vector{Float64}}`:
+        Single-element vector holding a temporary buffer for internal use.
+        Kept as `Vector{Vector{Float64}}` so the dispatch on
+        `_solve_factorization` stays uniform across backends.
 - `valid_ix::Vector{Int}`:
         Vector containing the row/columns indices of matrices related the buses
         which are not slack ones.
 - `cache::RowCache`:
-        Cache were PTDF rows are stored.
+        Cache where PTDF rows are stored.
+- `cache_lock::ReentrantLock`:
+        Guards `cache` reads/writes for parallel `getindex` callers.
 - `subnetworks::Dict{Int, Set{Int}}`:
         Dictionary containing the subsets of buses defining the different subnetwork of the system.
 - `tol::Base.RefValue{Float64}`:
         Tolerance related to scarification and values to drop.
 - `network_reduction::NetworkReduction`:
         Structure containing the details of the network reduction applied when computing the matrix
+- `work_ba_col::Vector{Vector{Float64}}`:
+        Single-element BA-column scratch buffer.
+- `solver_lock::ReentrantLock`:
+        Serializes solves on this cache. Combined with `_LIBKLU_LOCK` at the
+        libklu boundary, ensures one solve at a time per cache.
 - `system_uuid::Union{Base.UUID, Nothing}`:
         UUID of the system used to construct the matrix, used to validate that
         modification operations are applied to the correct system. `nothing` when
         constructed from a Ybus without an associated system.
 """
-struct VirtualPTDF{Ax, L <: NTuple{2, Dict}, K <: LinearAlgebra.Factorization} <:
+struct VirtualPTDF{Ax, L <: NTuple{2, Dict}, K} <:
        PowerNetworkMatrix{Float64}
     K::K
     BA::SparseArrays.SparseMatrixCSC{Float64, Int}
@@ -59,13 +78,15 @@ struct VirtualPTDF{Ax, L <: NTuple{2, Dict}, K <: LinearAlgebra.Factorization} <
     dist_slack_normalized::Vector{Float64}
     axes::Ax
     lookup::L
-    temp_data::Vector{Float64}
+    temp_data::Vector{Vector{Float64}}
     valid_ix::Vector{Int}
     cache::RowCache
+    cache_lock::ReentrantLock
     subnetwork_axes::Dict{Int, Ax}
     tol::Base.RefValue{Float64}
     network_reduction_data::NetworkReductionData
-    work_ba_col::Vector{Float64}
+    work_ba_col::Vector{Vector{Float64}}
+    solver_lock::ReentrantLock
     system_uuid::Union{Base.UUID, Nothing}
 end
 
@@ -140,8 +161,11 @@ function VirtualPTDF(
 end
 
 # Factorization dispatch methods for VirtualPTDF solver selection.
-function _create_factorization(::KLUSolver, ABA::SparseArrays.SparseMatrixCSC{Float64, Int})
-    return klu(ABA)
+function _create_factorization(
+    ::KLUSolver,
+    ABA::SparseArrays.SparseMatrixCSC{Float64, Int},
+)
+    return klu_factorize(ABA)
 end
 
 function _create_factorization(
@@ -164,7 +188,7 @@ end
 """
 Builds the Virtual PTDF matrix from a Ybus matrix. This constructor is more efficient when the prerequisite Ybus
 matrix is already available and provides direct control over the underlying matrix computations (including network reductions).
-The return is a VirtualPTDF struct with an empty cache. 
+The return is a VirtualPTDF struct with an empty cache.
 
 # Arguments
 - `ybus::Ybus`: Ybus matrix from which the matrix is constructed
@@ -208,7 +232,6 @@ function VirtualPTDF(
     if length(subnetwork_axes) > 1
         @info "Network is not connected, using subnetworks"
     end
-    temp_data = zeros(length(axes[2]))
 
     if isempty(persistent_arcs)
         empty_cache =
@@ -233,9 +256,13 @@ function VirtualPTDF(
         dist_slack_normalized = Float64[]
     end
 
-    # Pre-allocate work array for BA column extraction
-    valid_ix = setdiff(1:length(temp_data), ref_bus_positions)
-    work_ba_col = zeros(length(valid_ix))
+    # Single scratch slot — solves serialize through `solver_lock` +
+    # `_LIBKLU_LOCK`, so per-worker scratch is unnecessary. Kept as a
+    # `Vector{Vector{Float64}}` so `with_solver`'s callback signature
+    # stays uniform across solver backends.
+    valid_ix = setdiff(1:length(bus_ax), ref_bus_positions)
+    temp_data = [zeros(length(bus_ax))]
+    work_ba_col = [zeros(length(valid_ix))]
 
     arc_susceptances = _extract_arc_susceptances(BA.data)
 
@@ -251,10 +278,12 @@ function VirtualPTDF(
         temp_data,
         valid_ix,
         empty_cache,
+        ReentrantLock(),
         subnetwork_axes,
         Ref(tol),
         ybus.network_reduction_data,
         work_ba_col,
+        ReentrantLock(),
         system_uuid,
     )
 end
@@ -292,59 +321,61 @@ if isdefined(Base, :print_array) # 0.7 and later
     Base.print_array(io::IO, X::VirtualPTDF) = "VirtualPTDF"
 end
 
-# Helper function to solve with different factorization types
-function _solve_factorization(K::KLU.KLUFactorization{Float64, Int}, b::Vector{Float64})
-    return KLU.solve!(K, b)
+# Helper function to solve with different factorization types. The
+# `KLULinSolveCache` overload solves in place (zero-allocation hot path);
+# the generic fallback delegates to `\` and is extended by the
+# AppleAccelerate extension for `AAFactorization`.
+function _solve_factorization(K::KLULinSolveCache{Float64}, b::Vector{Float64})
+    solve!(K, b)
+    return b
 end
 
-# Generic fallback for other factorization types (will be extended by extensions)
-function _solve_factorization(K::LinearAlgebra.Factorization, b::Vector{Float64})
+function _solve_factorization(K, b::Vector{Float64})
     return K \ b
 end
 
-function _getindex(
-    vptdf::VirtualPTDF,
-    row::Int,
-    column::Union{Int, Colon},
-)
-    # check if value is in the cache
-    if haskey(vptdf.cache, row)
-        return vptdf.cache.temp_cache[row][column]
-    else
-        # evaluate the value for the PTDF column
+function _compute_ptdf_row(vptdf::VirtualPTDF, row::Int)::Vector{Float64}
+    buscount = size(vptdf, 1)
+    ref_bus_positions = get_ref_bus_position(vptdf)
+    if !isempty(vptdf.dist_slack) && length(ref_bus_positions) != 1
+        error(
+            "Distributed slack is not supported for systems with multiple reference buses.",
+        )
+    end
+    use_dist_slack = length(vptdf.dist_slack) == buscount
+    if !use_dist_slack && !isempty(vptdf.dist_slack)
+        error("Distributed bus specification doesn't match the number of buses.")
+    end
+
+    return with_solver(
+        vptdf.K, vptdf.work_ba_col, vptdf.temp_data, vptdf.solver_lock,
+    ) do K_solver, work_ba_col, temp_data
         valid_ix = vptdf.valid_ix
-        # Use pre-allocated work array instead of collect() to reduce allocations
         @inbounds for i in eachindex(valid_ix)
-            vptdf.work_ba_col[i] = vptdf.BA[valid_ix[i], row]
+            work_ba_col[i] = vptdf.BA[valid_ix[i], row]
         end
-        lin_solve = _solve_factorization(vptdf.K, vptdf.work_ba_col)
-        buscount = size(vptdf, 1)
-        ref_bus_positions = get_ref_bus_position(vptdf)
-        if !isempty(vptdf.dist_slack) && length(ref_bus_positions) != 1
-            error(
-                "Distributed slack is not supported for systems with multiple reference buses.",
-            )
-        elseif isempty(vptdf.dist_slack) && length(ref_bus_positions) < buscount
-            @inbounds for i in eachindex(valid_ix)
-                vptdf.temp_data[valid_ix[i]] = lin_solve[i]
-            end
-            vptdf.cache[row] = copy(vptdf.temp_data)
-        elseif length(vptdf.dist_slack) == buscount
-            @inbounds for i in eachindex(valid_ix)
-                vptdf.temp_data[valid_ix[i]] = lin_solve[i]
-            end
-            # Use pre-computed normalized slack array for efficiency
-            adjustment = dot(vptdf.temp_data, vptdf.dist_slack_normalized)
-            vptdf.cache[row] = vptdf.temp_data .- adjustment
-        else
-            error("Distributed bus specification doesn't match the number of buses.")
+        lin_solve = _solve_factorization(K_solver, work_ba_col)
+        fill!(temp_data, 0.0)
+        @inbounds for i in eachindex(valid_ix)
+            temp_data[valid_ix[i]] = lin_solve[i]
         end
-
-        if get_tol(vptdf) > eps()
-            vptdf.cache[row] = sparsify(vptdf.cache[row], get_tol(vptdf))
+        if use_dist_slack
+            adjustment = dot(temp_data, vptdf.dist_slack_normalized)
+            return temp_data .- adjustment
         end
+        return copy(temp_data)
+    end
+end
 
-        return vptdf.cache[row][column]
+function _getindex(
+    vptdf::VirtualPTDF,
+    row::Int,
+    column::Union{Int, Colon},
+)
+    return cached_row_lookup(
+        vptdf.cache, vptdf.cache_lock, row, column, get_tol(vptdf),
+    ) do
+        _compute_ptdf_row(vptdf, row)
     end
 end
 
diff --git a/src/virtual_ptdf_modification.jl b/src/virtual_ptdf_modification.jl
index f8122cac9..4009b87fc 100644
--- a/src/virtual_ptdf_modification.jl
+++ b/src/virtual_ptdf_modification.jl
@@ -11,9 +11,10 @@ The returned `WoodburyFactors` can be reused across multiple monitored
 arcs, making this the recommended path for optimization loops where
 factors are computed once per modification and many rows are queried.
 
-!!! warning
-    Not thread-safe. Mutates scratch vectors in `vptdf`. Do not call
-    concurrently on the same `VirtualPTDF` instance.
+!!! note
+    Concurrent callers serialize on the per-cache `solver_lock` and
+    `_LIBKLU_LOCK` (KLU backend) or just the per-cache `solver_lock`
+    (AppleAccelerate backend).
 
 $(TYPEDSIGNATURES)
 """
@@ -31,9 +32,10 @@ Compute the post-modification PTDF row for a monitored arc using
 precomputed Woodbury factors. Accepts either an integer arc index
 or a `Tuple{Int, Int}` bus pair.
 
-!!! warning
-    Not thread-safe. Mutates scratch vectors in `vptdf`. Do not call
-    concurrently on the same `VirtualPTDF` instance.
+!!! note
+    Concurrent callers serialize on the per-cache `solver_lock` and
+    `_LIBKLU_LOCK` (KLU backend) or just the per-cache `solver_lock`
+    (AppleAccelerate backend).
 
 $(TYPEDSIGNATURES)
 """
diff --git a/src/ward_reduction.jl b/src/ward_reduction.jl
index a4a058491..2728526da 100644
--- a/src/ward_reduction.jl
+++ b/src/ward_reduction.jl
@@ -59,7 +59,11 @@ function get_ward_reduction(
     all_buses = subnetwork_bus_axis
     subnetwork_bus_indices = [bus_lookup[x] for x in all_buses]
     subnetwork_bus_lookup = Dict(bus => ix for (ix, bus) in enumerate(all_buses))
-    subnetwork_data = data[subnetwork_bus_indices, subnetwork_bus_indices]
+    # Promote to ComplexF64 for the KLU factorizations below; libklu only
+    # exposes `klu_l_*` (double) and `klu_zl_*` (complex double) entry points.
+    subnetwork_data = SparseArrays.SparseMatrixCSC{ComplexF64, Int}(
+        data[subnetwork_bus_indices, subnetwork_bus_indices],
+    )
     boundary_buses = collect(intersect(boundary_buses, Set(all_buses)))
 
     external_buses = setdiff(all_buses, study_buses)
@@ -74,18 +78,18 @@ function get_ward_reduction(
         @error "no boundary buses found; cannot make bus_reduction_map based on impedance based criteria. mapping all external buses to the first reference bus ($first_ref_study_bus)"
         bus_reduction_map_index[first_ref_study_bus] = Set(external_buses)
     else
-        K = klu(subnetwork_data)
+        K = klu_factorize(subnetwork_data)
         boundary_bus_indices = [subnetwork_bus_lookup[x] for x in boundary_buses]
         boundary_bus_numbers = collect(boundary_buses)
         n_boundary = length(boundary_buses)
-        e = zeros(ComplexF64, n_buses)
+        E = SparseArrays.sparse(
+            boundary_bus_indices,
+            collect(1:n_boundary),
+            ones(ComplexF64, n_boundary),
+            n_buses, n_boundary,
+        )
         Z_boundary_cols = Matrix{ComplexF64}(undef, n_buses, n_boundary)
-        # Solve one column of Z per boundary bus
-        for (j, b_idx) in enumerate(boundary_bus_indices)
-            fill!(e, zero(ComplexF64))
-            e[b_idx] = one(ComplexF64)
-            Z_boundary_cols[:, j] = K \ e
-        end
+        solve_sparse!(K, E, Z_boundary_cols)
 
         for b in external_buses
             row_index = subnetwork_bus_lookup[b]
@@ -109,9 +113,9 @@ function get_ward_reduction(
         boundary_bus_indices,
     ]
 
-    # Eq. (2.16) from  https://core.ac.uk/download/pdf/79564835.pdf
-    y_eq =
-        y_be * KLU.solve!(klu(y_ee), Matrix{Complex{Float64}}(y_eb))
+    # Eq. (2.16) from https://core.ac.uk/download/pdf/79564835.pdf.
+    y_ee_cache = klu_factorize(y_ee)
+    y_eq = y_be * solve_sparse(y_ee_cache, y_eb)
     #Loop upper diagonal of Yeq
     for ix in 1:length(boundary_buses)
         for jx in ix:length(boundary_buses)
diff --git a/src/woodbury_kernel.jl b/src/woodbury_kernel.jl
index 5503f1488..ad29ba3b4 100644
--- a/src/woodbury_kernel.jl
+++ b/src/woodbury_kernel.jl
@@ -63,43 +63,29 @@ function _invert_woodbury_W(
     return W_inv, is_island
 end
 
-# --- Accessor functions for Woodbury kernel ---
-# VirtualPTDF accessors (VirtualMODF accessors defined in virtual_modf_calculations.jl)
-
-_get_K(m::VirtualPTDF) = m.K
 _get_BA(m::VirtualPTDF) = m.BA
 _get_arc_susceptances(m::VirtualPTDF) = m.arc_susceptances
 _get_valid_ix(m::VirtualPTDF) = m.valid_ix
-_get_temp_data(m::VirtualPTDF) = m.temp_data
-_get_work_ba_col(m::VirtualPTDF) = m.work_ba_col
 
 """
-    _compute_woodbury_factors(mat, modifications) -> WoodburyFactors
-
-Compute the Woodbury correction factors for a set of arc modifications.
-Implements van Dijk et al. Eq. 29:
-    B_m⁻¹ = B_r⁻¹ - B_r⁻¹U (A⁻¹ + U⊤B_r⁻¹U)⁻¹ U⊤B_r⁻¹
+    _compute_woodbury_factors_impl(K, work_ba_col, temp_data, BA, arc_sus,
+                                   valid_ix, modifications) -> WoodburyFactors
 
-where U = [ν_{e1} ... ν_{eM}] and A = diag(Δb₁, ..., Δb_M).
-
-The expensive part (M KLU solves + M×M factorization) is shared
-across all monitored arcs for a given modification set.
-
-!!! warning
-    Not thread-safe. Mutates scratch vectors in `mat`. Do not call
-    concurrently on the same VirtualPTDF/VirtualMODF instance.
+Pure-data Woodbury factor computation. Mutates `work_ba_col` and
+`temp_data`. The caller is responsible for exclusive access to those
+buffers; in `Virtual{PTDF, MODF}` this is provided by holding
+`solver_lock` via `with_solver` for the duration of the call.
 """
-function _compute_woodbury_factors(
-    mat::PowerNetworkMatrix,
+function _compute_woodbury_factors_impl(
+    K,
+    work_ba_col::Vector{Float64},
+    temp_data::Vector{Float64},
+    BA::SparseArrays.SparseMatrixCSC{Float64, Int},
+    arc_sus::Vector{Float64},
+    valid_ix::Vector{Int},
     modifications::Tuple{Vararg{ArcModification}},
 )::WoodburyFactors
     M = length(modifications)
-    K = _get_K(mat)
-    BA = _get_BA(mat)
-    arc_sus = _get_arc_susceptances(mat)
-    valid_ix = _get_valid_ix(mat)
-    temp_data = _get_temp_data(mat)
-    work_ba_col = _get_work_ba_col(mat)
     n_bus = length(temp_data)
 
     arc_indices = Vector{Int}(undef, M)
@@ -160,29 +146,22 @@ function _compute_woodbury_factors(
 end
 
 """
-    _apply_woodbury_correction(mat, monitored_idx, wf) -> Vector{Float64}
+    _apply_woodbury_correction_impl(K, work_ba_col, temp_data, BA, arc_sus,
+                                    valid_ix, monitored_idx, wf) -> Vector{Float64}
 
-Compute the post-modification PTDF row for a monitored arc using
-precomputed Woodbury factors.
-
-Post-modification PTDF: `PTDF_m[mon,:] = b_mon_post · ν_mon⊤ · B_m⁻¹`
-Computed as: `b_mon_post · (z_m - Z · W⁻¹ · (ν_mon⊤ · Z))`
-
-!!! warning
-    Not thread-safe. Mutates scratch vectors in `mat`. Do not call
-    concurrently on the same VirtualPTDF/VirtualMODF instance.
+Pure-data Woodbury correction. Mutates `work_ba_col` and `temp_data`; the
+caller owns exclusive access to those buffers.
 """
-function _apply_woodbury_correction(
-    mat::PowerNetworkMatrix,
+function _apply_woodbury_correction_impl(
+    K,
+    work_ba_col::Vector{Float64},
+    temp_data::Vector{Float64},
+    BA::SparseArrays.SparseMatrixCSC{Float64, Int},
+    arc_sus::Vector{Float64},
+    valid_ix::Vector{Int},
     monitored_idx::Int,
     wf::WoodburyFactors,
 )::Vector{Float64}
-    K = _get_K(mat)
-    BA = _get_BA(mat)
-    arc_sus = _get_arc_susceptances(mat)
-    valid_ix = _get_valid_ix(mat)
-    temp_data = _get_temp_data(mat)
-    work_ba_col = _get_work_ba_col(mat)
     n_bus = length(temp_data)
 
     M = length(wf.arc_indices)
@@ -232,3 +211,37 @@ function _apply_woodbury_correction(
     temp_data .*= b_mon
     return copy(temp_data)
 end
+
+# Outer dispatchers: VirtualPTDF and VirtualMODF both acquire a solver and
+# matched per-worker scratch via `with_solver` / `with_worker`. The
+# VirtualMODF methods are defined in virtual_modf_calculations.jl alongside
+# the struct.
+
+function _compute_woodbury_factors(
+    mat::VirtualPTDF,
+    modifications::Tuple{Vararg{ArcModification}},
+)::WoodburyFactors
+    return with_solver(
+        mat.K, mat.work_ba_col, mat.temp_data, mat.solver_lock,
+    ) do K_solver, work_ba_col, temp_data
+        _compute_woodbury_factors_impl(
+            K_solver, work_ba_col, temp_data,
+            mat.BA, mat.arc_susceptances, mat.valid_ix, modifications,
+        )
+    end
+end
+
+function _apply_woodbury_correction(
+    mat::VirtualPTDF,
+    monitored_idx::Int,
+    wf::WoodburyFactors,
+)::Vector{Float64}
+    return with_solver(
+        mat.K, mat.work_ba_col, mat.temp_data, mat.solver_lock,
+    ) do K_solver, work_ba_col, temp_data
+        _apply_woodbury_correction_impl(
+            K_solver, work_ba_col, temp_data,
+            mat.BA, mat.arc_susceptances, mat.valid_ix, monitored_idx, wf,
+        )
+    end
+end
diff --git a/test/PowerNetworkMatricesTests.jl b/test/PowerNetworkMatricesTests.jl
index cd3228193..b3857fe30 100644
--- a/test/PowerNetworkMatricesTests.jl
+++ b/test/PowerNetworkMatricesTests.jl
@@ -57,7 +57,7 @@ function run_tests(args...; kwargs...)
     logger = global_logger()
     try
         logging_config_filename = get(ENV, "SIIP_LOGGING_CONFIG", nothing)
-        if logging_config_filename !== nothing
+        if !isnothing(logging_config_filename)
             config = IS.LoggingConfiguration(logging_config_filename)
         else
             config = IS.LoggingConfiguration(;
diff --git a/test/test_arc_classification.jl b/test/test_arc_classification.jl
index a90c0ddca..df4ab9070 100644
--- a/test/test_arc_classification.jl
+++ b/test/test_arc_classification.jl
@@ -48,5 +48,5 @@ end
 
     tag, arc = PNM._resolve_branch_arc(nr2, line)
     @test tag === :not_found
-    @test arc === nothing
+    @test isnothing(arc)
 end
diff --git a/test/test_arc_types_and_reductions.jl b/test/test_arc_types_and_reductions.jl
index 820135c91..c734d29ab 100644
--- a/test/test_arc_types_and_reductions.jl
+++ b/test/test_arc_types_and_reductions.jl
@@ -85,6 +85,97 @@ end
     @test PNM.BranchesParallel{Line} in types_in_series_reduction(nrd)
 end
 
+@testset "MixedBranchesParallel construction and methods" begin
+    bus1 = PSY.ACBus(;
+        number = 101,
+        name = "bus_101",
+        available = true,
+        bustype = PSY.ACBusTypes.PV,
+        angle = 0.0,
+        magnitude = 1.0,
+        voltage_limits = (min = 0.9, max = 1.1),
+        base_voltage = 230.0,
+    )
+    bus2 = PSY.ACBus(;
+        number = 102,
+        name = "bus_102",
+        available = true,
+        bustype = PSY.ACBusTypes.PV,
+        angle = 0.0,
+        magnitude = 1.0,
+        voltage_limits = (min = 0.9, max = 1.1),
+        base_voltage = 230.0,
+    )
+    line = PSY.Line(;
+        name = "mixed_line",
+        available = true,
+        active_power_flow = 0.0,
+        reactive_power_flow = 0.0,
+        arc = PSY.Arc(; from = bus1, to = bus2),
+        r = 0.05,
+        x = 0.10,
+        b = (from = 0.01, to = 0.01),
+        g = (from = 0.0, to = 0.0),
+        rating = 100.0,
+        angle_limits = (min = -π / 2, max = π / 2),
+    )
+    tap = PSY.TapTransformer(;
+        name = "mixed_tap",
+        available = true,
+        active_power_flow = 0.0,
+        reactive_power_flow = 0.0,
+        arc = PSY.Arc(; from = bus1, to = bus2),
+        r = 0.122,
+        x = 0.10,
+        primary_shunt = 0.01 + im * 0.02,
+        tap = 1.0,
+        rating = 80.0,
+        base_power = 100.0,
+        winding_group_number = WindingGroupNumber.GROUP_11,
+    )
+
+    # Homogeneous group dispatches to BranchesParallel{Line}.
+    bp_homog = PNM.BranchesParallel([line])
+    @test bp_homog isa PNM.BranchesParallel{PSY.Line}
+    @test bp_homog isa PNM.AbstractBranchesParallel
+
+    # The {T} parameter must be concrete; abstract T should fail at construction.
+    @test_throws ErrorException PNM.BranchesParallel{PSY.ACTransmission}(
+        PSY.ACTransmission[line, tap],
+        nothing,
+    )
+
+    # MixedBranchesParallel holds heterogeneous branches.
+    mbp = PNM.MixedBranchesParallel([line, tap])
+    @test mbp isa PNM.MixedBranchesParallel
+    @test mbp isa PNM.AbstractBranchesParallel
+    @test length(mbp) == 2
+    @test eltype(mbp.branches) === PSY.ACTransmission
+
+    # ybus_branch_entries on the mixed group should equal the sum of the parts.
+    Y11_l, Y12_l, Y21_l, Y22_l = PNM.ybus_branch_entries(line)
+    Y11_t, Y12_t, Y21_t, Y22_t = PNM.ybus_branch_entries(tap)
+    Y11_m, Y12_m, Y21_m, Y22_m = PNM.ybus_branch_entries(mbp)
+    @test Y11_m ≈ Y11_l + Y11_t
+    @test Y12_m ≈ Y12_l + Y12_t
+    @test Y21_m ≈ Y21_l + Y21_t
+    @test Y22_m ≈ Y22_l + Y22_t
+
+    # Explicit equivalent-rating strategies for parallel groups; emergency rating sums them.
+    @test PNM.get_sum_of_max_rating(mbp) ≈ 100.0 + 80.0
+    @test PNM.get_single_element_contingency_rating(mbp) ≈ 80.0
+    @test PNM.get_equivalent_emergency_rating(mbp) ≈ 100.0 + 80.0
+
+    # add_to_map: empty filters short-circuit (no warning).
+    @test PNM.add_to_map(mbp, Dict{DataType, Function}()) == true
+
+    # add_to_map: non-empty filters trigger the mixed-type warning.
+    filters = Dict{DataType, Function}(PSY.Line => x -> true)
+    @test_logs (:warn, r"mixed branch types") match_mode = :any begin
+        @test PNM.add_to_map(mbp, filters) == true
+    end
+end
+
 @testset "Test Reductions with filters" begin
     sys_rts_da = build_system(PSISystems, "modified_RTS_GMLC_DA_sys")
 
diff --git a/test/test_equivalent_getters.jl b/test/test_equivalent_getters.jl
index a2f615772..059a0686f 100644
--- a/test/test_equivalent_getters.jl
+++ b/test/test_equivalent_getters.jl
@@ -36,9 +36,18 @@
     )
     # Create BranchesParallel
     bp = PNM.BranchesParallel([line1, line2])
-    # Test get_rating: Rating = (Rating1 + Rating2) / n = (100.0 + 150.0) / 2 = 125.0
-    rating_eq = PNM.get_equivalent_rating(bp)
-    @test rating_eq ≈ 125.0 atol = 1e-6
+
+    # Sum of individual thermal limits.
+    @test PNM.get_sum_of_max_rating(bp) ≈ 250.0 atol = 1e-6
+
+    # N-1: capacity remaining after the largest-rated circuit trips.
+    # sum(100, 150) - max(100, 150) = 100.0
+    @test PNM.get_single_element_contingency_rating(bp) ≈ 100.0 atol = 1e-6
+
+    # Susceptance-weighted average.
+    # b1 = x1/(r1²+x1²) = 0.2/0.05 = 4.0, b2 = 0.4/0.20 = 2.0, b_total = 6.0
+    # f1 = 2/3, f2 = 1/3 → (2/3)*100 + (1/3)*150 = 350/3 ≈ 116.667
+    @test PNM.get_impedance_averaged_rating(bp) ≈ 350.0 / 3.0 atol = 1e-6
 
     emergency_rating_eq = PNM.get_equivalent_emergency_rating(bp)
     @test emergency_rating_eq ≈ 250.0 atol = 1e-6
@@ -46,20 +55,20 @@
     bs = PNM.BranchesSeries()
     PNM.add_branch!(bs, line1, :FromTo)
     PNM.add_branch!(bs, line2, :FromTo)
-    # Test get_rating: Rating = minimum rating for series branches (weakest link)
+    # Series weakest-link rule: min(100, 150) = 100.0
     rating_eq = PNM.get_equivalent_rating(bs)
     @test rating_eq ≈ 100.0 atol = 1e-6
 
     emergency_rating_eq = PNM.get_equivalent_emergency_rating(bs)
     @test emergency_rating_eq ≈ 100.0 atol = 1e-6
 
-    #Add test parrallel circuit + line1
+    # Series chain containing a parallel block: the block contributes its N-1
+    # single-element-contingency rating (100.0), so min(100, 150) = 100.0.
     bs = PNM.BranchesSeries()
     PNM.add_branch!(bs, bp, :FromTo)
     PNM.add_branch!(bs, line2, :FromTo)
-    # Test get_rating: Rating = minimum rating for series branches (weakest link)
     rating_eq = PNM.get_equivalent_rating(bs)
-    @test rating_eq ≈ 125.0 atol = 1e-6
+    @test rating_eq ≈ 100.0 atol = 1e-6
 
     emergency_rating_eq = PNM.get_equivalent_emergency_rating(bs)
     @test emergency_rating_eq ≈ 150.0 atol = 1e-6
@@ -99,7 +108,8 @@ function test_ybus_equivalence_branches_parallel(vector_branches)
         voltage_limits = (min = 0.0, max = 1.0),
         base_voltage = 1.0,
         area = nothing,
-        load_zone = nothing)
+        load_zone = nothing,
+    )
     bus2 = ACBus(;
         number = 2,
         name = "bus2",
@@ -110,7 +120,8 @@ function test_ybus_equivalence_branches_parallel(vector_branches)
         voltage_limits = (min = 0.0, max = 1.0),
         base_voltage = 1.0,
         area = nothing,
-        load_zone = nothing)
+        load_zone = nothing,
+    )
 
     add_component!(sys, bus1)
     add_component!(sys, bus2)
@@ -199,7 +210,8 @@ function test_ybus_equivalence_branches_series(vector_branches)
             voltage_limits = (min = 0.0, max = 1.0),
             base_voltage = 1.0,
             area = nothing,
-            load_zone = nothing)
+            load_zone = nothing,
+        )
         add_component!(sys, bus)
     end
     for (ix, br) in enumerate(vector_branches)
diff --git a/test/test_has_time_series.jl b/test/test_has_time_series.jl
index 569ac8e4e..a5e5a396d 100644
--- a/test/test_has_time_series.jl
+++ b/test/test_has_time_series.jl
@@ -70,9 +70,9 @@ end
     tww2 = PNM.ThreeWindingTransformerWinding(trf, 2)
 
     # No time series attached — should return nothing
-    @test PNM.get_device_with_time_series(bp, PSY.SingleTimeSeries, "rating") === nothing
-    @test PNM.get_device_with_time_series(bs, PSY.SingleTimeSeries, "rating") === nothing
-    @test PNM.get_device_with_time_series(tww1, PSY.SingleTimeSeries, "rating") === nothing
+    @test isnothing(PNM.get_device_with_time_series(bp, PSY.SingleTimeSeries, "rating"))
+    @test isnothing(PNM.get_device_with_time_series(bs, PSY.SingleTimeSeries, "rating"))
+    @test isnothing(PNM.get_device_with_time_series(tww1, PSY.SingleTimeSeries, "rating"))
 
     # Add time series to line1 in BranchesParallel
     PSY.add_time_series!(sys, line1, _make_test_time_series("rating"))
diff --git a/test/test_klu_wrapper.jl b/test/test_klu_wrapper.jl
new file mode 100644
index 000000000..a4be6d48c
--- /dev/null
+++ b/test/test_klu_wrapper.jl
@@ -0,0 +1,234 @@
+import SparseArrays
+
+@testset "KLU wrapper: real round-trip and refactor" begin
+    n = 50
+    rng_vals = collect(1.0:n)
+    A = SparseArrays.spdiagm(0 => rng_vals .+ 1.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    x = collect(1.0:n)
+    b = A * x
+
+    cache = PNM.klu_factorize(A)
+    @test PNM.is_factored(cache)
+    @test size(cache) == (n, n)
+
+    y = copy(b)
+    PNM.solve!(cache, y)
+    @test isapprox(y, x, atol = 1e-10)
+
+    # Refactor with new values, same pattern.
+    A2 = SparseArrays.spdiagm(0 => rng_vals .+ 2.0,
+        1 => fill(0.2, n - 1), -1 => fill(0.2, n - 1))
+    x2 = randn(n)
+    b2 = A2 * x2
+    PNM.numeric_refactor!(cache, A2)
+    y2 = copy(b2)
+    PNM.solve!(cache, y2)
+    @test isapprox(y2, x2, atol = 1e-9)
+
+    # Pattern change should be rejected with check_pattern=true.
+    A3 = copy(A2)
+    A3[2, n] = 1.0
+    @test_throws ArgumentError PNM.numeric_refactor!(cache, A3)
+end
+
+@testset "KLU wrapper: tsolve! solves Aᵀ x = b" begin
+    n = 30
+    # Asymmetric matrix so transposing actually exercises the transpose path.
+    A = SparseArrays.spdiagm(
+        0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.5, n - 1),
+        -1 => fill(0.1, n - 1),
+    )
+    cache = PNM.klu_factorize(A)
+
+    # Single RHS.
+    x = randn(n)
+    b = transpose(A) * x
+    y = copy(b)
+    PNM.tsolve!(cache, y)
+    @test isapprox(y, x, atol = 1e-10)
+
+    # Multiple RHS, dense matrix path.
+    X = randn(n, 4)
+    B = transpose(A) * X
+    Y = copy(B)
+    PNM.tsolve!(cache, Y)
+    @test isapprox(Y, X, atol = 1e-10)
+end
+
+@testset "KLU wrapper: reuse_symbolic=false rebuilds analysis on refactor" begin
+    n = 25
+    A = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    x = collect(1.0:n)
+    b = A * x
+
+    cache = PNM.klu_factorize(A; reuse_symbolic = false)
+    @test PNM.is_factored(cache)
+    y = copy(b)
+    PNM.solve!(cache, y)
+    @test isapprox(y, x, atol = 1e-10)
+
+    # A pattern change is allowed because each refactor reruns
+    # `symbolic_factor!` from scratch.
+    A2 = copy(A)
+    A2[1, n] = 0.25
+    x2 = randn(n)
+    b2 = A2 * x2
+    PNM.full_refactor!(cache, A2)
+    y2 = copy(b2)
+    PNM.solve!(cache, y2)
+    @test isapprox(y2, x2, atol = 1e-9)
+end
+
+@testset "KLU wrapper: solve_sparse matches dense path" begin
+    n, m = 30, 40
+    A = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    cache = PNM.klu_factorize(A)
+
+    # Build a deliberately sparse RHS.
+    rows = vcat([1, 5, 12, n], [3, 9, n - 1])
+    cols = vcat(fill(2, 4), fill(7, 3))
+    vals = [1.0, -2.0, 3.5, -0.5, 2.0, -1.0, 1.5]
+    B = SparseArrays.sparse(rows, cols, vals, n, m)
+
+    out = PNM.solve_sparse(cache, B)
+    Bdense = Matrix(B)
+    PNM.solve!(cache, Bdense)
+    @test isapprox(out, Bdense, atol = 1e-10)
+end
+
+@testset "KLU wrapper: solve_sparse zeros empty columns" begin
+    n = 20
+    A = SparseArrays.spdiagm(0 => fill(2.0, n), 1 => fill(-1.0, n - 1),
+        -1 => fill(-1.0, n - 1))
+    cache = PNM.klu_factorize(A)
+    rows = [3, 7]
+    cols = [3, 3]
+    vals = [1.0, 2.0]
+    B = SparseArrays.sparse(rows, cols, vals, n, 5)
+
+    out = PNM.solve_sparse(cache, B)
+    @test all(==(0.0), out[:, 1])
+    @test all(==(0.0), out[:, 2])
+    @test all(==(0.0), out[:, 4])
+    @test all(==(0.0), out[:, 5])
+    @test !all(==(0.0), out[:, 3])
+
+    Bdense = Matrix(B)
+    PNM.solve!(cache, Bdense)
+    @test isapprox(out, Bdense, atol = 1e-12)
+end
+
+@testset "KLU wrapper: backslash" begin
+    n = 25
+    A = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    cache = PNM.klu_factorize(A)
+    x = randn(n)
+    b = A * x
+    @test isapprox(cache \ b, x, atol = 1e-10)
+end
+
+@testset "KLU wrapper: ComplexF64 path" begin
+    n = 12
+    A = SparseArrays.spdiagm(
+        0 => ComplexF64.(collect(1.0:n) .+ 1.0im),
+        1 => fill(0.1 + 0.0im, n - 1),
+        -1 => fill(0.1 + 0.0im, n - 1),
+    )
+    x = ComplexF64.(randn(n) .+ 1im .* randn(n))
+    b = A * x
+
+    cache = PNM.klu_factorize(A)
+    y = copy(b)
+    PNM.solve!(cache, y)
+    @test isapprox(y, x, atol = 1e-10)
+
+    # Sparse RHS path on the complex cache.
+    rows = [1, n]
+    cols = [1, 2]
+    vals = ComplexF64.([1.0 + 0.5im, 2.0 - 0.3im])
+    B = SparseArrays.sparse(rows, cols, vals, n, 3)
+    out = PNM.solve_sparse(cache, B)
+    Bdense = Matrix(B)
+    PNM.solve!(cache, Bdense)
+    @test isapprox(out, Bdense, atol = 1e-10)
+end
+
+@testset "KLU wrapper: solve_sparse! into a view" begin
+    n = 15
+    A = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 5.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    cache = PNM.klu_factorize(A)
+    nrhs = 6
+    B = SparseArrays.sprand(n, nrhs, 0.2)
+
+    full = zeros(n + 4, nrhs)
+    PNM.solve_sparse!(cache, B, view(full, 3:(n + 2), :))
+
+    Bdense = Matrix(B)
+    PNM.solve!(cache, Bdense)
+    @test isapprox(full[3:(n + 2), :], Bdense, atol = 1e-10)
+    @test all(==(0.0), full[1:2, :])
+    @test all(==(0.0), full[(n + 3):end, :])
+end
+
+@testset "KLU wrapper: solve_sparse! block chunking matches monolithic solve" begin
+    # Many sparse columns with small block must give the same answer as a
+    # large block: defends the bottleneck-preservation contract from the
+    # original plan (n × block working set, not n × nrhs).
+    n = 80
+    nrhs = 250
+    A = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 5.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    cache = PNM.klu_factorize(A)
+    B = SparseArrays.sprand(n, nrhs, 0.05)
+
+    Xbig = PNM.solve_sparse(cache, B; block = 256)
+    Xsmall = PNM.solve_sparse(cache, B; block = 7)
+    @test isapprox(Xbig, Xsmall, atol = 1e-12)
+
+    Bdense = Matrix(B)
+    PNM.solve!(cache, Bdense)
+    @test isapprox(Xsmall, Bdense, atol = 1e-10)
+end
+
+@testset "KLU wrapper: warm solve! / numeric_refactor! are non-allocating" begin
+    n = 40
+    A = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    cache = PNM.klu_factorize(A)
+    b = randn(n)
+
+    # Warm-up to compile.
+    y = copy(b)
+    PNM.solve!(cache, y)
+    A2 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 2.0,
+        1 => fill(0.2, n - 1), -1 => fill(0.2, n - 1))
+    PNM.numeric_refactor!(cache, A2)
+
+    # Hot path: solve! mutates the buffer in place; no allocations.
+    y .= b
+    alloc_solve = @allocated PNM.solve!(cache, y)
+    @test alloc_solve == 0
+
+    alloc_refactor = @allocated PNM.numeric_refactor!(cache, A2)
+    @test alloc_refactor == 0
+end
+
+@testset "KLU wrapper: solve_sparse! warm working set is bounded" begin
+    n = 50
+    A = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    cache = PNM.klu_factorize(A)
+    B = SparseArrays.sprand(n, 100, 0.05)
+    out = Matrix{Float64}(undef, n, 100)
+
+    PNM.solve_sparse!(cache, B, out; block = 32)
+    # Warm call must not scale with `n * nrhs`; bound is well below that.
+    alloc_warm = @allocated PNM.solve_sparse!(cache, B, out; block = 32)
+    @test alloc_warm < n * size(B, 2) * sizeof(Float64) ÷ 4
+end
diff --git a/test/test_network_modification.jl b/test/test_network_modification.jl
index f3f714502..7bdb278ad 100644
--- a/test/test_network_modification.jl
+++ b/test/test_network_modification.jl
@@ -31,7 +31,7 @@
                 break
             end
         end
-        @test outaged_line !== nothing
+        @test !isnothing(outaged_line)
         PSY.set_available!(outaged_line, false)
         ptdf_rebuilt = PTDF(sys_mod)
         rebuilt_arc_ax = PNM.get_arc_axis(ptdf_rebuilt)
@@ -250,3 +250,44 @@ end
         end
     end
 end
+
+@testset "Woodbury correction concurrent across arcs (AppleAccelerate backend)" begin
+    # Validates the AA-backend concurrency claim documented in
+    # `src/virtual_ptdf_modification.jl`: many tasks calling
+    # `apply_woodbury_correction` on a single VirtualPTDF should agree with the
+    # serial baseline. The KLU path is exercised by the threaded testsets in
+    # `test/test_virtual_modf.jl`; this complements that coverage on the
+    # AppleAccelerate path.
+    if !PowerNetworkMatrices._has_apple_accelerate_ext()
+        @info "Skipping: AppleAccelerate extension not loaded."
+        return
+    end
+    if Threads.nthreads() < 2
+        @info "Skipping: requires Threads.nthreads() ≥ 2 to exercise concurrent access."
+        return
+    end
+
+    sys = PSB.build_system(PSB.PSITestSystems, "c_sys14")
+    vptdf = VirtualPTDF(sys; linear_solver = "AppleAccelerate")
+
+    arc_ax = PNM.get_arc_axis(vptdf)
+    n_arcs = length(arc_ax)
+    @test n_arcs ≥ 2
+
+    outaged_arc = arc_ax[1]
+    mod = NetworkModification(vptdf, outaged_arc)
+    wf = compute_woodbury_factors(vptdf, mod)
+
+    monitored_indices = collect(2:n_arcs)
+    serial = [apply_woodbury_correction(vptdf, m, wf) for m in monitored_indices]
+
+    for iter in 1:5
+        parallel = Vector{Vector{Float64}}(undef, length(monitored_indices))
+        Threads.@threads :dynamic for i in eachindex(monitored_indices)
+            parallel[i] = apply_woodbury_correction(vptdf, monitored_indices[i], wf)
+        end
+        for i in eachindex(monitored_indices)
+            @test parallel[i] ≈ serial[i]
+        end
+    end
+end
diff --git a/test/test_partial_lodf.jl b/test/test_partial_lodf.jl
index 6eae891a3..53363593f 100644
--- a/test/test_partial_lodf.jl
+++ b/test/test_partial_lodf.jl
@@ -107,8 +107,13 @@ end
     # Extract the BA column for arc e at non-reference bus indices.
     ba_col_e = [vlodf.BA[vlodf.valid_ix[i], e] for i in 1:n_valid]
 
-    # Solve ABA x = ba_col_e using the factorization (fresh copy to avoid aliasing).
-    lin_solve = vlodf.K \ copy(ba_col_e)
+    # Solve ABA x = ba_col_e using the cache's factorization (fresh copy
+    # to avoid aliasing).
+    lin_solve = let
+        b = copy(ba_col_e)
+        PNM.solve!(vlodf.K, b)
+        b
+    end
 
     # Map the solution back to full bus space.
     temp_full = zeros(n_buses)
diff --git a/test/test_system_uuid_tracking.jl b/test/test_system_uuid_tracking.jl
index 46349a657..4aa01c498 100644
--- a/test/test_system_uuid_tracking.jl
+++ b/test/test_system_uuid_tracking.jl
@@ -9,7 +9,7 @@
     # UUID is nothing when constructing from Ybus (no system available)
     ybus = Ybus(sys)
     vptdf_ybus = VirtualPTDF(ybus)
-    @test get_system_uuid(vptdf_ybus) === nothing
+    @test isnothing(get_system_uuid(vptdf_ybus))
 end
 
 @testset "System UUID tracking in VirtualMODF" begin
@@ -24,10 +24,10 @@ end
     sys = PSB.build_system(PSB.PSITestSystems, "c_sys5")
 
     ybus = Ybus(sys)
-    @test get_system_uuid(ybus) === nothing
+    @test isnothing(get_system_uuid(ybus))
 
     ptdf = PTDF(sys)
-    @test get_system_uuid(ptdf) === nothing
+    @test isnothing(get_system_uuid(ptdf))
 end
 
 @testset "System UUID validation in NetworkModification" begin
diff --git a/test/test_virtual_modf.jl b/test/test_virtual_modf.jl
index 21bb6e7e4..4dc9464c8 100644
--- a/test/test_virtual_modf.jl
+++ b/test/test_virtual_modf.jl
@@ -1,3 +1,5 @@
+using Random
+
 @testset "VirtualMODF construction" begin
     # Test construction with a simple system (no outages)
     sys5 = PSB.build_system(PSB.PSITestSystems, "c_sys5")
@@ -261,7 +263,7 @@ end
         collect(keys(nrd_d2.direct_branch_map)),
         collect(keys(nrd_d2.parallel_branch_map)),
     )
-    # Compare results for buses that are present in the reduced system 
+    # Compare results for buses that are present in the reduced system
     buses_to_compare = collect(keys(nrd_d2.bus_reduction_map))
     for branch in valid_outage_branches
         outage = get_supplemental_attributes(branch)[1]
@@ -360,9 +362,7 @@ end
         )
         add_supplemental_attribute!(sys, branch, outage)
     end
-    vmodf = VirtualMODF(sys; network_reductions = NetworkReduction[
-        DegreeTwoReduction()
-    ])
+    vmodf = VirtualMODF(sys; network_reductions = NetworkReduction[DegreeTwoReduction()])
     for branch in get_components(ACTransmission, sys)
         !has_supplemental_attributes(branch) && continue
         outage = get_supplemental_attributes(branch)[1]
@@ -371,3 +371,100 @@ end
         @test ctg.modification.arc_modifications[1].delta_b <= 0.0
     end
 end
+
+# Helper for the parallel-safety testsets below: registers a fixed set of line
+# outages on c_sys14 and returns (sys, line_names).
+function _build_c_sys14_with_outages()
+    sys = PSB.build_system(PSB.PSITestSystems, "c_sys14")
+    line_names = ["Line1", "Line2", "Line9", "Line10", "Line12"]
+    for line_name in line_names
+        line = PSY.get_component(PSY.ACTransmission, sys, line_name)
+        outage = PSY.GeometricDistributionForcedOutage(;
+            mean_time_to_recovery = 10,
+            outage_transition_probability = 0.9999,
+        )
+        PSY.add_supplemental_attribute!(sys, line, outage)
+    end
+    return sys, line_names
+end
+
+@testset "VirtualMODF concurrent getindex across different contingencies matches serial baseline" begin
+    # Mirrors the access pattern PowerSimulations uses in
+    # `add_post_contingency_flow_expressions!`: many concurrent tasks query
+    # `vmodf[arc, contingency_spec]` across DIFFERENT contingencies. With the
+    # single-cache solver + `_LIBKLU_LOCK`, all libklu work serializes; this
+    # test confirms the result is still correct under that serialization,
+    # including the double-checked-insert path on `woodbury_cache` /
+    # `row_caches` when two threads race on a first-time query.
+    # Skipped under JULIA_NUM_THREADS=1 because @threads :dynamic degenerates
+    # to serial there and the test reduces to a tautology.
+    if Threads.nthreads() < 2
+        @info "Skipping: requires Threads.nthreads() ≥ 2 to exercise concurrent getindex."
+        return
+    end
+
+    sys, _ = _build_c_sys14_with_outages()
+    vmodf = PowerNetworkMatrices.VirtualMODF(sys)
+    registered = PowerNetworkMatrices.get_registered_contingencies(vmodf)
+    @test !isempty(registered)
+    ctgs = collect(values(registered))
+    arc_axis = PowerNetworkMatrices.get_arc_axis(vmodf)
+    @test !isempty(arc_axis)
+
+    # Cap arc count so test runtime stays bounded if c_sys14's arc list grows
+    # in the future. With 5 contingencies, 20 arcs gives 100 work items per
+    # iteration — enough parallelism on a 4-core CI without dominating runtime.
+    n_arcs = min(length(arc_axis), 20)
+    work = [(arc, ctg) for arc in arc_axis[1:n_arcs] for ctg in ctgs]
+    Random.shuffle!(Random.MersenneTwister(42), work)
+
+    # Serial baseline.
+    serial = [copy(vmodf[a, c]) for (a, c) in work]
+
+    # Five iterations of parallel access, each starting from a clean cache,
+    # to flush scheduling-dependent races.
+    for iter in 1:5
+        PowerNetworkMatrices.clear_caches!(vmodf)
+        parallel = Vector{Vector{Float64}}(undef, length(work))
+        Threads.@threads :dynamic for i in eachindex(work)
+            a, c = work[i]
+            parallel[i] = copy(vmodf[a, c])
+        end
+        for i in eachindex(work)
+            @test parallel[i] ≈ serial[i]
+        end
+    end
+end
+
+@testset "VirtualMODF concurrent getindex on the SAME (arc, ctg) is consistent" begin
+    # Complements the previous testset: there, each (arc, ctg) pair appears
+    # once in the work list, so only the `woodbury_cache` first-call race is
+    # exercised. Here, many tasks race on the SAME (arc, ctg), which
+    # exercises the row-cache population path serialized by `solver_lock`.
+    if Threads.nthreads() < 2
+        @info "Skipping: requires Threads.nthreads() ≥ 2 to exercise concurrent getindex."
+        return
+    end
+
+    sys, _ = _build_c_sys14_with_outages()
+    vmodf = PowerNetworkMatrices.VirtualMODF(sys)
+    registered = PowerNetworkMatrices.get_registered_contingencies(vmodf)
+    arc_axis = PowerNetworkMatrices.get_arc_axis(vmodf)
+    arc = first(arc_axis)
+    ctg = first(values(registered))
+
+    serial_value = copy(vmodf[arc, ctg])
+
+    # 64 tasks racing on one cache slot.
+    n_tasks = 64
+    for iter in 1:5
+        PowerNetworkMatrices.clear_caches!(vmodf)
+        parallel = Vector{Vector{Float64}}(undef, n_tasks)
+        Threads.@threads :dynamic for i in 1:n_tasks
+            parallel[i] = copy(vmodf[arc, ctg])
+        end
+        for i in 1:n_tasks
+            @test parallel[i] ≈ serial_value
+        end
+    end
+end
diff --git a/test/test_ybus_contingencies.jl b/test/test_ybus_contingencies.jl
index cf23777bf..a41f3e36f 100644
--- a/test/test_ybus_contingencies.jl
+++ b/test/test_ybus_contingencies.jl
@@ -121,7 +121,7 @@ end
         break
     end
 
-    if parallel_branch !== nothing
+    if !isnothing(parallel_branch)
         mod = NetworkModification(vptdf, parallel_branch)
         result = apply_ybus_modification(ybus, mod)
 
@@ -166,7 +166,7 @@ end
         end
     end
 
-    if series_branch !== nothing
+    if !isnothing(series_branch)
         mod = NetworkModification(vptdf, series_branch)
         result = apply_ybus_modification(ybus, mod)