diff --git a/.claude/claude.md b/.claude/claude.md index 0d49c0c64..2ba1fab29 100644 --- a/.claude/claude.md +++ b/.claude/claude.md @@ -75,7 +75,6 @@ Efficient computation of power network matrices for large-scale systems. Support ### `ext/` - **MKLPardisoExt.jl**: MKL-Pardiso sparse solver (Windows/Linux) - - **AppleAccelerateExt.jl**: native macOS BLAS acceleration ### `test/` @@ -112,7 +111,7 @@ Documentation source ### Optional Extensions - **MKL + Pardiso**: high-performance sparse factorization - - **AppleAccelerate**: native macOS dense linear algebra + - **AppleAccelerate**: native macOS sparse direct solver (libSparse `SparseFactorizationLU`, built-in via `src/AccelerateWrapper/`, macOS 15.5+) ## Core Abstractions @@ -160,7 +159,7 @@ Documentation source - **Indexing**: `matrix[bus_num, branch_tuple]` auto-maps to internal indices - **Subnetworks**: `subnetwork_axes` Dict maps reference buses to island components - **Caching**: VirtualPTDF/LODF use LRU cache (default 100 MiB) for row storage - - **Solvers**: KLU (default), Dense, MKLPardiso, AppleAccelerate via extensions + - **Solvers**: KLU, Dense, MKLPardiso (ext), and built-in AppleAccelerateLU (libSparse `SparseFactorizationLU` + Inf-norm equilibration scaling, macOS 15.5+) via `src/AccelerateWrapper/`; KLU is the default on non-Apple / macOS < 15.5; AppleAccelerateLU is the default on macOS ≥ 15.5. ## Test Patterns diff --git a/Project.toml b/Project.toml index 725cae6d0..a2f0ea685 100644 --- a/Project.toml +++ b/Project.toml @@ -15,15 +15,12 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SuiteSparse_jll = "bea87d4a-7f5b-5778-9afe-8cc45184846c" [weakdeps] -AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924" Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2" [extensions] -AppleAccelerateExt = "AppleAccelerate" MKLPardisoExt = "Pardiso" [compat] -AppleAccelerate = "^0.4" DataStructures = "^0.19" DocStringExtensions = "~0.8, ~0.9" HDF5 = "0.17" diff --git a/docs/src/reference/internals.md b/docs/src/reference/internals.md index cf5b63078..e3cc12b85 100644 --- a/docs/src/reference/internals.md +++ b/docs/src/reference/internals.md @@ -14,3 +14,14 @@ symbols are exported from `PowerNetworkMatrices`. ```@autodocs Modules = [PowerNetworkMatrices.KLUWrapper] ``` + +## `AccelerateWrapper` + +`PowerNetworkMatrices.AccelerateWrapper` is a thin, allocation-aware wrapper over Apple's +`libSparse.dylib` (provided by the system Accelerate framework) used internally for sparse +linear solves on macOS. Non-Apple builds load stub fallbacks that throw on use. None of +these symbols are exported from `PowerNetworkMatrices`. + +```@autodocs +Modules = [PowerNetworkMatrices.AccelerateWrapper] +``` diff --git a/ext/AppleAccelerateExt.jl b/ext/AppleAccelerateExt.jl deleted file mode 100644 index 8b2e48345..000000000 --- a/ext/AppleAccelerateExt.jl +++ /dev/null @@ -1,112 +0,0 @@ -module AppleAccelerateExt - -import PowerNetworkMatrices as PNM -using AppleAccelerate -import SparseArrays -import LinearAlgebra - -# Extend the factorization creation function -function PNM._create_apple_accelerate_factorization(ABA) - K = AppleAccelerate.AAFactorization(ABA) - AppleAccelerate.factor!(K, AppleAccelerate.SparseFactorizationLDLT) - return K -end - -# Extend the solve function for AppleAccelerate factorizations -function PNM._solve_factorization( - K::AppleAccelerate.AAFactorization{Float64}, - b::Vector{Float64}, -) - return AppleAccelerate.solve(K, b) -end - -""" -Function for internal use only. - -Computes the PTDF matrix by means of AppleAccelerate for sparse matrices. - -# Arguments -- `A::SparseArrays.SparseMatrixCSC{Int8, Int}`: - Incidence Matrix -- `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`: - BA matrix -- `ref_bus_positions::Set{Int}`: - vector containing the indexes of the reference slack buses. -- `dist_slack::Vector{Float64}`: - vector containing the weights for the distributed slacks. -""" -function PNM._calculate_PTDF_matrix_AppleAccelerate( - A::SparseArrays.SparseMatrixCSC{Int8, Int}, - BA::SparseArrays.SparseMatrixCSC{Float64, Int}, - ref_bus_positions::Set{Int}, - dist_slack::Vector{Float64}) - @warn "AppleAccelerate solver is experimental and may produce unexpected results. If you need high confidence use KLU" - linecount = size(BA, 2) - buscount = size(BA, 1) - - ABA = PNM.calculate_ABA_matrix(A, BA, ref_bus_positions) - K = AppleAccelerate.AAFactorization(ABA) - AppleAccelerate.factor!(K, AppleAccelerate.SparseFactorizationLDLT) - - # initialize matrices for evaluation - valid_ix = setdiff(1:buscount, ref_bus_positions) - PTDFm_t = zeros(buscount, linecount) - copyto!(PTDFm_t, BA) - if !isempty(dist_slack) && length(ref_bus_positions) != 1 - error( - "Distributed slack is not supported for systems with multiple reference buses.", - ) - elseif isempty(dist_slack) && length(ref_bus_positions) < buscount - PTDFm_t[valid_ix, :] = AppleAccelerate.solve(K, PTDFm_t[valid_ix, :]) - PTDFm_t[collect(ref_bus_positions), :] .= 0.0 - return PTDFm_t - elseif length(dist_slack) == buscount - @info "Distributed bus" - PTDFm_t[valid_ix, :] = AppleAccelerate.solve(K, PTDFm_t[valid_ix, :]) - PTDFm_t[collect(ref_bus_positions), :] .= 0.0 - slack_array = dist_slack / sum(dist_slack) - slack_array = reshape(slack_array, 1, buscount) - return PTDFm_t .- (slack_array * PTDFm_t) - else - error("Distributed bus specification doesn't match the number of buses.") - end - - return -end - -""" -Function for internal use only. - -Computes the LODF matrix by means of AppleAccelerate for sparse matrices. - -# Arguments -- `a::SparseArrays.SparseMatrixCSC{Int8, Int}`: - Incidence Matrix -- `ptdf::Matrix{Float64}`: - PTDF matrix -""" -function PNM._calculate_LODF_matrix_AppleAccelerate( - a::SparseArrays.SparseMatrixCSC{Int8, Int}, - ptdf::Matrix{Float64}, -) - linecount = size(ptdf, 2) - ptdf_denominator_t = a * ptdf - m_I = Int[] - m_V = Float64[] - for iline in 1:linecount - if (1.0 - ptdf_denominator_t[iline, iline]) < PNM.LODF_ENTRY_TOLERANCE - push!(m_I, iline) - push!(m_V, 1.0) - else - push!(m_I, iline) - push!(m_V, 1 - ptdf_denominator_t[iline, iline]) - end - end - Dem_LU = AppleAccelerate.AAFactorization(SparseArrays.sparse(m_I, m_I, m_V)) - lodf_t = AppleAccelerate.solve(Dem_LU, ptdf_denominator_t) - lodf_t[LinearAlgebra.diagind(lodf_t)] .= -1.0 - - return lodf_t -end - -end # module diff --git a/ext/MKLPardisoExt.jl b/ext/MKLPardisoExt.jl index ab9d02e04..1f124a47d 100644 --- a/ext/MKLPardisoExt.jl +++ b/ext/MKLPardisoExt.jl @@ -171,28 +171,15 @@ function PNM._calculate_LODF_matrix_MKLPardiso( a::SparseArrays.SparseMatrixCSC{Int8, Int}, ptdf::Matrix{Float64}, ) + # The demand matrix `diag(1 - PTDF·A[i,i])` is diagonal, so the + # "solve" is a row-wise element-wise division. The Pardiso path that + # used to factor it and back-solve via `_pardiso_sequential_LODF!` / + # `_pardiso_single_LODF!` is no longer needed for this stage. linecount = size(ptdf, 2) ptdf_denominator_t = a * ptdf - m_I = Int[] - m_V = Float64[] - for iline in 1:linecount - if (1.0 - ptdf_denominator_t[iline, iline]) < PNM.LODF_ENTRY_TOLERANCE - push!(m_I, iline) - push!(m_V, 1.0) - else - push!(m_I, iline) - push!(m_V, 1 - ptdf_denominator_t[iline, iline]) - end - end - lodf_t = zeros(linecount, linecount) - A = SparseArrays.sparse(m_I, m_I, m_V) - if linecount > PNM.DEFAULT_LODF_CHUNK_SIZE - PNM._pardiso_sequential_LODF!(lodf_t, A, ptdf_denominator_t) - else - PNM._pardiso_single_LODF!(lodf_t, A, ptdf_denominator_t) - end - lodf_t[LinearAlgebra.diagind(lodf_t)] .= -1.0 - return lodf_t + m_V = PNM._build_lodf_demand(ptdf_denominator_t, linecount) + PNM._apply_lodf_demand!(ptdf_denominator_t, m_V) + return ptdf_denominator_t end end # module diff --git a/scripts/benchmarks/benchmark_solvers.jl b/scripts/benchmarks/benchmark_solvers.jl new file mode 100644 index 000000000..6324106e6 --- /dev/null +++ b/scripts/benchmarks/benchmark_solvers.jl @@ -0,0 +1,376 @@ +# benchmark_solvers.jl +# +# Purpose: +# Benchmark VirtualPTDF and VirtualMODF construction and per-row query +# performance for KLU and AppleAccelerateLU solvers on the 10k-bus +# matpower_ACTIVSg10k_sys case. +# +# Line outages are injected as PSY.FixedForcedOutage supplemental attributes +# on ACTransmission branches above a configurable kV threshold, making +# VirtualMODF contingency-row performance measurable. +# +# Every measured quantity is sampled over PASSES (default 10) passes and +# reported as `median [min–max]` so the run-to-run range is visible. +# +# Run command (from repo root): +# julia --project=test scripts/benchmarks/benchmark_solvers.jl +# +# Runtime note: +# Each row-batch pass builds a *fresh* (cold-cache) Virtual* object so the +# row-solve cost is measured without cache hits. With PASSES=10 that means +# ~11 cold builds per solver per path; on the 10k case this is minutes, not +# seconds. Lower PASSES / N_ROWS to trade range fidelity for speed. +# +# Note on Project.toml: +# This script activates the repo `test` project rather than providing a +# standalone scripts/benchmarks/Project.toml because PowerSystemCaseBuilder +# is a test-only dependency. Resolving PSB from the registry in a standalone +# project is unreliable without a dedicated manifest; the test environment +# already has PSB deved alongside PNM and is guaranteed to be consistent. + +import Pkg +Pkg.activate(joinpath(@__DIR__, "..", "..", "test")) + +using PowerNetworkMatrices +using PowerSystemCaseBuilder +import PowerSystems as PSY +import InfrastructureSystems as IS +import SparseArrays, LinearAlgebra, Random +using Printf + +const PNM = PowerNetworkMatrices +const PSB = PowerSystemCaseBuilder + +# ───────────────────────────────────────────────────────────────────────────── +# Top-level configuration constants +# ───────────────────────────────────────────────────────────────────────────── + +const SYSTEM_NAME = "matpower_ACTIVSg10k_sys" +const KV_THRESHOLD = 230.0 # inject outages on arcs above this kV +const N_ROWS = 200 # arc-rows to time per batch +const SOLVERS = ["KLU", "AppleAccelerateLU"] +const PASSES = 10 # timed passes per measurement (range) +const WARMUP = 1 # untimed warmup passes (excluded) + +# ───────────────────────────────────────────────────────────────────────────── +# Timing helpers +# ───────────────────────────────────────────────────────────────────────────── + +Base.@noinline _no_inline(x) = x + +"Min/median/max of a sample vector of nanosecond timings." +struct TimeStats + tmin::Float64 + tmed::Float64 + tmax::Float64 +end + +const NA_STATS = TimeStats(NaN, NaN, NaN) + +isna(s::TimeStats) = isnan(s.tmed) + +""" + collect_stats(sample; passes=PASSES, warmup=WARMUP) -> TimeStats + +Call `sample()` `warmup + passes` times. `sample()` must return its own +elapsed time in nanoseconds (Float64). The first `warmup` results are +discarded; the remaining `passes` are reduced to (min, median, max). +""" +function collect_stats(sample::F; passes::Int = PASSES, warmup::Int = WARMUP) where {F} + for _ in 1:warmup + _no_inline(sample()) + end + times = Vector{Float64}(undef, passes) + for i in 1:passes + times[i] = Float64(_no_inline(sample())) + end + sort!(times) + tmin = times[1] + tmax = times[end] + tmed = times[(passes + 1) ÷ 2] + return TimeStats(tmin, tmed, tmax) +end + +"Time a single call to `f()` (whole call, GC'd first); returns elapsed ns." +function time_call(f::F) where {F} + GC.gc() + t = time_ns() + _no_inline(f()) + return Float64(time_ns() - t) +end + +function fmt_time(t_ns::Float64) + if t_ns < 1e3 + return @sprintf("%.1f ns", t_ns) + elseif t_ns < 1e6 + return @sprintf("%.1f μs", t_ns / 1e3) + elseif t_ns < 1e9 + return @sprintf("%.1f ms", t_ns / 1e6) + else + return @sprintf("%.2f s", t_ns / 1e9) + end +end + +"Markdown cell: `median [min–max]`, or `n/a`." +function cell_str(s::TimeStats) + if isna(s) + return "n/a" + end + return "$(fmt_time(s.tmed)) [$(fmt_time(s.tmin))–$(fmt_time(s.tmax))]" +end + +"Ratio of medians, e.g. KLU/AA." +function ratio_str(a::TimeStats, b::TimeStats) + if isna(a) || isna(b) + return "n/a" + end + return @sprintf("%.2fx", a.tmed / b.tmed) +end + +# ───────────────────────────────────────────────────────────────────────────── +# Spread-out index selection +# ───────────────────────────────────────────────────────────────────────────── + +""" + spread_keys(axis, n) -> Vector + +Return `n` evenly-spread elements from `axis` (or all elements if fewer than `n`). +""" +function spread_keys(axis::Vector, n::Int) + total = length(axis) + n = min(n, total) + indices = unique(round.(Int, range(1, total; length = n))) + return [axis[i] for i in indices] +end + +# ───────────────────────────────────────────────────────────────────────────── +# Outage injection +# ───────────────────────────────────────────────────────────────────────────── + +""" + inject_line_outages!(sys; kv_threshold=KV_THRESHOLD) -> Int + +Attach a `PSY.FixedForcedOutage(outage_status=1.0)` supplemental attribute to +every `PSY.Line` whose highest terminal voltage exceeds `kv_threshold` kV. + +Only `PSY.Line` components are considered — transformers and +phase-shifting transformers are intentionally excluded (contingencies are +registered for lines only). + +Returns the number of outages injected. Prints a summary with line count, +injected count, and three example (name, kV) pairs. +""" +function inject_line_outages!(sys::PSY.System; kv_threshold::Float64 = KV_THRESHOLD)::Int + all_branches = collect(PSY.get_components(PSY.Line, sys)) + n_total = length(all_branches) + injected = 0 + examples = Tuple{String, Float64}[] + + for branch in all_branches + arc = PSY.get_arc(branch) + from_kv = PSY.get_base_voltage(PSY.get_from(arc)) + to_kv = PSY.get_base_voltage(PSY.get_to(arc)) + v = max(from_kv, to_kv) + if v > kv_threshold + outage = PSY.FixedForcedOutage(; outage_status = 1.0) + PSY.add_supplemental_attribute!(sys, branch, outage) + injected += 1 + if length(examples) < 3 + push!(examples, (PSY.get_name(branch), v)) + end + end + end + + println( + "inject_line_outages!: $n_total total PSY.Line components; " * + "injected $injected outages (kv_threshold = $kv_threshold kV)", + ) + for (name, kv) in examples + println(" example: \"$name\" at $(kv) kV") + end + return injected +end + +# ───────────────────────────────────────────────────────────────────────────── +# Load system and inject outages +# ───────────────────────────────────────────────────────────────────────────── + +println("Loading $SYSTEM_NAME …") +sys = PSB.build_system(PSB.MatpowerTestSystems, SYSTEM_NAME) +n_buses = length(collect(PSY.get_components(PSY.ACBus, sys))) +println("System loaded: $n_buses buses") + +println("\nInjecting line outages (kv_threshold = $KV_THRESHOLD kV) …") +n_outages = inject_line_outages!(sys; kv_threshold = KV_THRESHOLD) +println("Total outages injected: $n_outages\n") + +# ───────────────────────────────────────────────────────────────────────────── +# Result tables +# ───────────────────────────────────────────────────────────────────────────── + +const PATHS = [ + "VirtualPTDF build", + "VirtualPTDF $(N_ROWS)-row batch", + "VirtualMODF build", + "VirtualMODF $(N_ROWS)-row batch", +] + +results = Dict{String, Dict{String, TimeStats}}() +for p in PATHS + results[p] = Dict{String, TimeStats}(s => NA_STATS for s in SOLVERS) +end + +# ───────────────────────────────────────────────────────────────────────────── +# Per-solver benchmark loop +# +# Build paths : time the whole constructor call, PASSES times. +# Row batches : each pass builds a FRESH (cold-cache) object UNTIMED, then +# times only the row loop — isolating row-solve cost with no +# build-time subtraction and no cache hits across passes. +# ───────────────────────────────────────────────────────────────────────────── + +for solver in SOLVERS + println("\n" * "="^60) + println("Solver: $solver ($PASSES passes, $WARMUP warmup)") + println("="^60) + + # ------------------------------------------------------------------ + # VirtualPTDF: build time + # ------------------------------------------------------------------ + vptdf_ref = nothing + try + println(" [VirtualPTDF] timed build …") + vptdf_ref = PNM.VirtualPTDF(sys; linear_solver = solver) + s = collect_stats( + () -> time_call( + () -> PNM.VirtualPTDF(sys; linear_solver = solver), + ), + ) + results["VirtualPTDF build"][solver] = s + println(" VirtualPTDF build: $(cell_str(s))") + catch e + println(" VirtualPTDF build FAILED: $e") + end + + # ------------------------------------------------------------------ + # VirtualPTDF: N_ROWS-row batch (fresh cold-cache object per pass) + # ------------------------------------------------------------------ + if vptdf_ref !== nothing + try + println(" [VirtualPTDF] $N_ROWS-row batch …") + arc_keys = spread_keys(PNM.get_arc_axis(vptdf_ref), N_ROWS) + sample = function () + v = PNM.VirtualPTDF(sys; linear_solver = solver) + GC.gc() + t = time_ns() + for arc in arc_keys + _ = v[arc, :] + end + return Float64(time_ns() - t) + end + s = collect_stats(sample) + results["VirtualPTDF $(N_ROWS)-row batch"][solver] = s + println(" VirtualPTDF $(N_ROWS)-row batch (rows only): $(cell_str(s))") + catch e + println(" VirtualPTDF row batch FAILED: $e") + end + end + + # ------------------------------------------------------------------ + # VirtualMODF: build time + # ------------------------------------------------------------------ + vmodf_ref = nothing + try + println(" [VirtualMODF] timed build …") + vmodf_ref = PNM.VirtualMODF(sys; linear_solver = solver) + n_ctg = length(PNM.get_registered_contingencies(vmodf_ref)) + println(" VirtualMODF: $n_ctg contingencies registered") + s = collect_stats( + () -> time_call( + () -> PNM.VirtualMODF(sys; linear_solver = solver), + ), + ) + results["VirtualMODF build"][solver] = s + println(" VirtualMODF build: $(cell_str(s))") + catch e + println(" VirtualMODF build FAILED: $e") + end + + # ------------------------------------------------------------------ + # VirtualMODF: N_ROWS-row batch for one contingency + # + # Row access: vmodf[arc_idx::Int, contingency::ContingencySpec] + # where arc_idx = PNM.get_arc_lookup(vmodf)[arc_tuple] + # and contingency is taken from PNM.get_registered_contingencies(vmodf). + # ------------------------------------------------------------------ + if vmodf_ref !== nothing + try + ctg_dict = PNM.get_registered_contingencies(vmodf_ref) + if isempty(ctg_dict) + println(" VirtualMODF: no contingencies registered — skipping row batch.") + println(" (Re-run inject_line_outages! with a lower kv_threshold.)") + else + println(" [VirtualMODF] $N_ROWS-row batch …") + arc_keys = spread_keys(PNM.get_arc_axis(vmodf_ref), N_ROWS) + arc_lookup_ref = PNM.get_arc_lookup(vmodf_ref) + arc_indices = [arc_lookup_ref[arc] for arc in arc_keys] + sample = function () + v = PNM.VirtualMODF(sys; linear_solver = solver) + ctg = first(values(PNM.get_registered_contingencies(v))) + GC.gc() + t = time_ns() + for arc_idx in arc_indices + _ = v[arc_idx, ctg] + end + return Float64(time_ns() - t) + end + s = collect_stats(sample) + results["VirtualMODF $(N_ROWS)-row batch"][solver] = s + println(" VirtualMODF $(N_ROWS)-row batch (rows only): $(cell_str(s))") + end + catch e + println(" VirtualMODF row batch FAILED: $e") + end + end +end + +# ───────────────────────────────────────────────────────────────────────────── +# Results table +# ───────────────────────────────────────────────────────────────────────────── + +println("\n\n" * "="^60) +println("RESULTS") +println("="^60) + +println() +println("System: $SYSTEM_NAME") +println("Buses: $n_buses") +println("Outages: $n_outages (kv_threshold = $KV_THRESHOLD kV)") +println("Hardware: $(Sys.MACHINE)") +println("Julia: $(VERSION)") +println("Passes: $PASSES timed ($WARMUP warmup discarded); cells are median [min–max]") +println( + "Row batch: $N_ROWS spread-out arcs; fresh cold-cache object per pass, row loop only", +) +println() + +header = "| Path | KLU | AA_LU | KLU/AA_LU |" +sep = "|---|---|---|---|" +println(header) +println(sep) + +for p in PATHS + s_klu = results[p]["KLU"] + s_lu = results[p]["AppleAccelerateLU"] + row = @sprintf( + "| %s | %s | %s | %s |", + p, + cell_str(s_klu), + cell_str(s_lu), + ratio_str(s_klu, s_lu), + ) + println(row) +end + +println() +println("Done.") diff --git a/src/AccelerateWrapper/AccelerateWrapper.jl b/src/AccelerateWrapper/AccelerateWrapper.jl new file mode 100644 index 000000000..e84f5a1c1 --- /dev/null +++ b/src/AccelerateWrapper/AccelerateWrapper.jl @@ -0,0 +1,73 @@ +""" + AccelerateWrapper + +A small, allocation-aware wrapper over Apple's `libSparse.dylib` +(`/System/Library/Frameworks/Accelerate.framework/.../libSparse.dylib`) +designed for the access patterns of `PowerNetworkMatrices`: + +- Cache the symbolic and numeric factorizations of a general (unsymmetric) + sparse matrix (LU with threshold partial pivoting + Inf-norm equilibration + scaling) and reuse them across many solves. +- Refresh the numeric factor (`numeric_refactor!`) while keeping the symbolic + analysis, without re-allocating the structural arrays. +- Solve dense and **sparse** right-hand sides in place, with the sparse path + packing only non-empty RHS columns into a bounded scratch block. +- Compute `A·X` and `A·x` directly via libSparse's `SparseMultiply`. + +This module is intentionally lighter than the upstream `AppleAccelerate.jl` +package: it owns no high-level Julia wrappers over libSparse, exposes the +symbolic/numeric split directly, binds only the entry points used by PNM, +and is compile-gated to macOS so non-Apple builds never codegen the +`@ccall` sites. +""" +module AccelerateWrapper + +import SparseArrays +import SparseArrays: SparseMatrixCSC, getcolptr, rowvals, nonzeros, nzrange +import LinearAlgebra + +export AAFactorCache, + aa_factorize, + symbolic_factor!, + numeric_refactor!, + full_factor!, + full_refactor!, + solve!, + solve_sparse!, + solve_sparse, + is_factored, + aa_spmm!, + aa_spmv! + +@static if Sys.isapple() + include("libsparse_bindings.jl") + include("aa_cache.jl") + include("solve_dense.jl") + include("solve_sparse_rhs.jl") + include("spmm.jl") +else + # Stub layer. Non-Apple builds never bind libSparse symbols, never codegen + # the `@ccall` sites, and never instantiate `SparseOpaqueFactorization`. + # The whole submodule reduces to these short bodies on Linux/Windows. + struct AAFactorCache end + + _unavailable() = error( + "AccelerateWrapper is macOS-only (Sys.isapple() returned false). " * + "Use the KLU backend on non-Apple platforms.", + ) + + AAFactorCache(args...; kwargs...) = _unavailable() + aa_factorize(args...; kwargs...) = _unavailable() + symbolic_factor!(args...) = _unavailable() + numeric_refactor!(args...) = _unavailable() + full_factor!(args...) = _unavailable() + full_refactor!(args...) = _unavailable() + solve!(args...) = _unavailable() + solve_sparse!(args...; kwargs...) = _unavailable() + solve_sparse(args...; kwargs...) = _unavailable() + is_factored(::AAFactorCache) = false + aa_spmm!(args...) = _unavailable() + aa_spmv!(args...) = _unavailable() +end + +end # module diff --git a/src/AccelerateWrapper/aa_cache.jl b/src/AccelerateWrapper/aa_cache.jl new file mode 100644 index 000000000..033894633 --- /dev/null +++ b/src/AccelerateWrapper/aa_cache.jl @@ -0,0 +1,373 @@ +""" +A cached libSparse linear solver for repeated solves against the same sparse +matrix structure. `numeric_refactor!` and `solve!` allocate nothing once the +cache is built. + +Cache the symbolic and numeric factorizations of a general (unsymmetric) sparse +matrix (LU with threshold partial pivoting + Inf-norm equilibration scaling) and +reuse them across many solves. The full CSC pattern of `A` is stored and the +libSparse structure view is marked `ATT_ORDINARY`. No symmetry requirement +(matches KLU's pivoting model). + +Float64 only. Requires macOS 15.5+ (enforced by the backend selection in +`linalg_settings.jl`). + +`reuse_symbolic` controls whether `symbolic_refactor!` keeps the analysis; +`check_pattern` adds a structural-equality check on refactor calls and is +only consulted when reusing. +""" +mutable struct AAFactorCache + # Apple-side 0-based, narrower-integer copies of the full input CSC pattern. + # Reused as-is across `numeric_refactor!` calls. + columnStarts::Vector{Clong} + rowIndices::Vector{Cint} + nzval::Vector{Cdouble} + n::Int + # Count of stored entries: full `nnz(A)`. + nnz::Int + symbolic::SparseOpaqueSymbolicFactorization + numeric::SparseOpaqueFactorization_t + reuse_symbolic::Bool + check_pattern::Bool + # Bounded reusable scratch for `solve_sparse!`. Lazy-grown on first call. + scratch::Matrix{Cdouble} + col_map::Vector{Int} + # Reusable libSparse solve workspace. Sized to + # `static + nrhs * per_rhs` bytes; supplied to the workspace-aware + # `SparseSolve` overloads so libSparse does not malloc/free per call. + # Float64 storage chosen for 16-byte alignment; length is in Float64s. + solve_workspace::Vector{Float64} + # Scaling method passed to libSparse at numeric-factor time. + # `SparseScalingEquilibriationInf` reduces fill on symmetric SPD-ish inputs + # like ABA (~4× faster multi-RHS solve, residual still O(1e-13)). Set via + # the `scaling` kwarg of `AAFactorCache` / `aa_factorize`. + scaling::SparseScaling_t +end + +@inline _dim(cache::AAFactorCache) = cache.n + +Base.size(cache::AAFactorCache) = (cache.n, cache.n) +function Base.size(cache::AAFactorCache, d::Integer) + if d <= 2 + return cache.n + else + return 1 + end +end +Base.eltype(::Type{AAFactorCache}) = Cdouble + +""" + is_factored(cache::AAFactorCache) -> Bool + +`true` when `cache` holds a valid numeric factorization ready for `solve!` / +`solve_sparse!`. `false` after construction (before `full_factor!`) or after +the libSparse handles have been finalized. +""" +function is_factored(cache::AAFactorCache) + return cache.numeric.status == SparseStatusOk && cache.symbolic.status == SparseStatusOk +end + +""" + AAFactorCache(A; reuse_symbolic=true, check_pattern=true, + scaling=SparseScalingEquilibriationInf) + +Build a cache for the square sparse matrix `A`. Allocates the Apple-side +structural arrays (`columnStarts`, `rowIndices`, `nzval`) but does **not** +factorize. Call `full_factor!` (or `symbolic_factor!` followed by +`numeric_refactor!`) before `solve!`. + +The arrays are sized to the full pattern of `A` (general/LU mode). No symmetry +requirement — matches KLU's pivoting model. + +A finalizer frees libSparse handles on GC; call `Base.finalize(cache)` to +release them eagerly. +""" +function AAFactorCache( + A::SparseMatrixCSC{Float64, Int}; + reuse_symbolic::Bool = true, + check_pattern::Bool = true, + scaling::SparseScaling_t = SparseScalingEquilibriationInf, +) + n = size(A, 1) + n == size(A, 2) || throw(DimensionMismatch("matrix must be square; got $(size(A))")) + stored_nnz = SparseArrays.nnz(A) + cache = AAFactorCache( + Vector{Clong}(undef, n + 1), + Vector{Cint}(undef, stored_nnz), + Vector{Cdouble}(undef, 0), + n, + stored_nnz, + _null_symbolic(), + _null_factorization(), + reuse_symbolic, + check_pattern, + Matrix{Cdouble}(undef, 0, 0), + Int[], + Float64[], + scaling, + ) + _populate_pattern!(cache, A) + finalizer(_free_handles!, cache) + return cache +end + +# --- general (LU) mode helpers ---------------------------------------------- + +# Copy A's full CSC pattern into `cache.columnStarts` / `cache.rowIndices` +# as 0-based narrowed indices. Caller sized these to (n+1) and nnz. +function _populate_pattern!( + cache::AAFactorCache, + A::SparseMatrixCSC{Float64, Int}, +) + cp = getcolptr(A) + rv = rowvals(A) + @inbounds for k in eachindex(cp) + cache.columnStarts[k] = Clong(cp[k] - 1) + end + @inbounds for k in eachindex(rv) + cache.rowIndices[k] = Cint(rv[k] - 1) + end + return cache +end + +# Snapshot the full nonzeros into `cache.nzval`, growing if needed. +function _populate_values!( + cache::AAFactorCache, + A::SparseMatrixCSC{Float64, Int}, +) + if length(cache.nzval) != cache.nnz + resize!(cache.nzval, cache.nnz) + end + copyto!(cache.nzval, nonzeros(A)) + return cache +end + +# --- pattern guard ---------------------------------------------------------- + +# Pattern-match guard for `numeric_refactor!`: assert the incoming CSC's +# stored pattern is identical to what was analyzed (full pattern, LU mode). +# Apple's arrays are 0-based but we always store them 0-based — no flipping. +function _check_pattern_match( + cache::AAFactorCache, + A::SparseMatrixCSC{Float64, Int}, + op::AbstractString, +) + n = cache.n + if size(A, 1) != n || size(A, 2) != n + throw(DimensionMismatch("Cannot $op: cache is $(n)×$(n) but A is $(size(A)).")) + end + cp = getcolptr(A) + rv = rowvals(A) + length(rv) == cache.nnz || return _pattern_mismatch(op) + @inbounds for k in eachindex(cp) + cache.columnStarts[k] == Clong(cp[k] - 1) || return _pattern_mismatch(op) + end + @inbounds for k in eachindex(rv) + cache.rowIndices[k] == Cint(rv[k] - 1) || return _pattern_mismatch(op) + end + return nothing +end + +_pattern_mismatch(op::AbstractString) = + throw(ArgumentError("Cannot $op: matrix has different sparsity structure.")) + +""" +Release the libSparse numeric and symbolic handles held by `cache`, leaving +Julia-side state intact. Idempotent. +""" +function _free_handles!(cache::AAFactorCache) + if cache.numeric.status == SparseStatusOk + _sparse_cleanup_factor!(cache.numeric) + cache.numeric = _null_factorization() + end + if cache.symbolic.status == SparseStatusOk + _sparse_cleanup_symbolic!(cache.symbolic) + cache.symbolic = _null_symbolic() + end + return nothing +end + +Base.finalize(cache::AAFactorCache) = _free_handles!(cache) + +# Build the libSparse `SparseMatrixStructure` view that points into the +# cache's owned arrays. Always marked `ATT_ORDINARY` — the LU path treats +# `cache.nzval` as the full matrix with no symmetry assumed. +function _structure_view(cache::AAFactorCache) + return SparseMatrixStructure( + Cint(cache.n), + Cint(cache.n), + pointer(cache.columnStarts), + pointer(cache.rowIndices), + ATT_ORDINARY, + UInt8(1), + ) +end + +function _matrix_view(cache::AAFactorCache) + return SparseMatrix_t(_structure_view(cache), pointer(cache.nzval)) +end + +""" + symbolic_factor!(cache, A) + +Free any cached symbolic/numeric factor, replace the structural arrays with +`A`'s full pattern, and analyze. Subsequent `numeric_refactor!` calls reuse +the analysis. +""" +function symbolic_factor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int}) + n = cache.n + if size(A, 1) != n || size(A, 2) != n + throw(DimensionMismatch("Cannot factor: cache is $(n)×$(n) but A is $(size(A)).")) + end + _free_handles!(cache) + new_nnz = SparseArrays.nnz(A) + if new_nnz != cache.nnz + resize!(cache.rowIndices, new_nnz) + cache.nnz = new_nnz + end + if length(cache.columnStarts) != n + 1 + resize!(cache.columnStarts, n + 1) + end + _populate_pattern!(cache, A) + sym = _sparse_symbolic_factor( + SparseFactorizationLU, + _structure_view(cache), + SparseSymbolicFactorOptions(), + ) + if sym.status != SparseStatusOk + # libSparse may have allocated C-side state before deciding to fail — + # release it before throwing so we don't leak per failed factor. + _sparse_cleanup_symbolic!(sym) + _libsparse_throw(sym.status, "symbolic factor") + end + cache.symbolic = sym + return cache +end + +""" + numeric_refactor!(cache, A) + +Refresh the numeric factor on top of the existing symbolic analysis. Errors +if `symbolic_factor!` has not been called yet. +""" +function numeric_refactor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int}) + cache.symbolic.status == SparseStatusOk || + error("AAFactorCache: call symbolic_factor! before numeric_refactor!.") + cache.check_pattern && _check_pattern_match(cache, A, "numeric_refactor") + _populate_values!(cache, A) + if cache.numeric.status == SparseStatusOk + _sparse_cleanup_factor!(cache.numeric) + cache.numeric = _null_factorization() + end + num = _sparse_numeric_factor( + cache.symbolic, + _matrix_view(cache), + SparseNumericFactorOptions(cache.scaling), + ) + if num.status != SparseStatusOk + # Same rationale as in symbolic_factor!: release before throwing. + _sparse_cleanup_factor!(num) + _libsparse_throw(num.status, "numeric factor") + end + cache.numeric = num + return cache +end + +""" + symbolic_refactor!(cache, A) + +If `cache.reuse_symbolic`, optionally verify the structure matches and reuse +the existing analysis. Otherwise, rerun `symbolic_factor!`. +""" +function symbolic_refactor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int}) + if !cache.reuse_symbolic + return symbolic_factor!(cache, A) + end + cache.check_pattern && _check_pattern_match(cache, A, "symbolic_refactor") + return cache +end + +""" + full_factor!(cache, A) -> cache + +Run a fresh symbolic analysis followed by a numeric factorization on `A`. +""" +function full_factor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int}) + symbolic_factor!(cache, A) + numeric_refactor!(cache, A) + return cache +end + +""" + full_refactor!(cache, A) -> cache + +Refresh both factorizations on `A`. Defers to `symbolic_refactor!` (which +reuses the existing analysis when `cache.reuse_symbolic` is set) followed by +`numeric_refactor!`. +""" +function full_refactor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int}) + symbolic_refactor!(cache, A) + numeric_refactor!(cache, A) + return cache +end + +""" + aa_factorize(A; reuse_symbolic=true, check_pattern=true, + scaling=SparseScalingEquilibriationInf) -> AAFactorCache + +Build a cache for `A` and immediately compute the full LU factorization. See +`AAFactorCache` for the kwarg semantics. +""" +function aa_factorize( + A::SparseMatrixCSC{Float64, Int}; + reuse_symbolic::Bool = true, + check_pattern::Bool = true, + scaling::SparseScaling_t = SparseScalingEquilibriationInf, +) + cache = AAFactorCache( + A; + reuse_symbolic = reuse_symbolic, + check_pattern = check_pattern, + scaling = scaling, + ) + return full_factor!(cache, A) +end + +""" + _ensure_scratch!(cache, block) -> Nothing + +Ensure `cache.scratch` is at least `n × block` and `cache.col_map` length +`block`. Used by `solve_sparse!`. +""" +@inline function _ensure_scratch!(cache::AAFactorCache, block::Int) + n = cache.n + s = cache.scratch + if size(s, 1) != n || size(s, 2) < block + cache.scratch = Matrix{Cdouble}(undef, n, block) + end + if length(cache.col_map) < block + resize!(cache.col_map, block) + end + return nothing +end + +""" + _ensure_solve_workspace!(cache, nrhs) -> Ptr{Cvoid} + +Ensure `cache.solve_workspace` is large enough to back a libSparse +`SparseSolve` call with `nrhs` right-hand sides (`static + nrhs * per_rhs` +bytes per the factor's documented requirements). Grows only when too small +— steady state is no-op. Returns a `Ptr{Cvoid}` to pass to the +workspace-aware ccall (16-byte aligned by virtue of `Vector{Float64}`'s +allocator). +""" +@inline function _ensure_solve_workspace!(cache::AAFactorCache, nrhs::Integer) + nbytes = _solve_workspace_bytes(cache.numeric, nrhs) + # Round up to whole Float64s; min 1 element so `pointer` is well-defined + # when libSparse asks for zero bytes (it may still dereference). + need = max(cld(nbytes, 8), 1) + if length(cache.solve_workspace) < need + resize!(cache.solve_workspace, need) + end + return convert(Ptr{Cvoid}, pointer(cache.solve_workspace)) +end diff --git a/src/AccelerateWrapper/libsparse_bindings.jl b/src/AccelerateWrapper/libsparse_bindings.jl new file mode 100644 index 000000000..49f5391e5 --- /dev/null +++ b/src/AccelerateWrapper/libsparse_bindings.jl @@ -0,0 +1,373 @@ +# Direct bindings into Apple's libSparse.dylib (part of the Accelerate +# framework). Only the entry points actually consumed by PowerNetworkMatrices +# are wrapped — Float64-only, no Float32 mangled aliases, no QR / Cholesky-AtA +# variants. Mangled names match what AppleAccelerate.jl uses; see +# `/Library/Developer/CommandLineTools/SDKs/MacOSX*.sdk/System/Library/ +# Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/ +# Versions/A/Headers/Sparse/Solve.h` for the C declarations. + +const LIBSPARSE = + "/System/Library/Frameworks/Accelerate.framework/Versions/A/" * + "Frameworks/vecLib.framework/libSparse.dylib" + +# Only `SparseFactorizationLU` is used by PNM (the documented default, +# "currently LU with TPP" — threshold partial pivoting, provably stable). +# The other LU codes are listed for documentation. All LU codes require +# macOS 15.5+. QR = 40 and CholeskyAtA = 41 are intentionally not wrapped +# (see file header). The LDLT/Cholesky family (codes 0–4) is not wrapped +# because all PNM workloads use the general LU path. +@enum SparseFactorization_t::UInt8 begin + # QR = 40, CholeskyAtA = 41 — intentionally not wrapped (see file header). + SparseFactorizationLU = 80 + SparseFactorizationLUUnpivoted = 81 + SparseFactorizationLUSPP = 82 + SparseFactorizationLUTPP = 83 +end + +@enum SparseOrder_t::UInt8 begin + SparseOrderDefault = 0 + SparseOrderUser = 1 + SparseOrderAMD = 2 + SparseOrderMetis = 3 + SparseOrderCOLAMD = 4 +end + +@enum SparseScaling_t::UInt8 begin + SparseScalingDefault = 0 + SparseScalingUser = 1 + SparseScalingEquilibriationInf = 2 +end + +@enum SparseStatus_t::Int32 begin + SparseStatusOk = 0 + SparseStatusFailed = -1 + SparseMatrixIsSingular = -2 + SparseInternalError = -3 + SparseParameterError = -4 + SparseStatusReleased = -2147483647 +end + +@enum SparseControl_t::UInt32 begin + SparseDefaultControl = 0 +end + +# `SparseAttributes_t` is a packed bitfield in C; Julia can't express that +# cleanly, so we model it as `Cuint` and assemble the bits ourselves. +const att_type = Cuint +const ATT_TRANSPOSE = att_type(1) +const ATT_UPPER_TRIANGLE = att_type(0) +const ATT_LOWER_TRIANGLE = att_type(2) +const ATT_ORDINARY = att_type(0) +const ATT_TRIANGULAR = att_type(4) +const ATT_UNIT_TRIANGULAR = att_type(8) +const ATT_SYMMETRIC = att_type(12) + +struct SparseMatrixStructure + rowCount::Cint + columnCount::Cint + columnStarts::Ptr{Clong} + rowIndices::Ptr{Cint} + attributes::att_type + blockSize::UInt8 +end + +struct SparseNumericFactorOptions + control::SparseControl_t + scalingMethod::SparseScaling_t + scaling::Ptr{Cvoid} + pivotTolerance::Float64 + zeroTolerance::Float64 +end + +# Defaults match Apple's SolveImplementation.h. The scaling method is the +# meaningful knob — `SparseScalingEquilibriationInf` on the LU path is a ~4× +# speedup with no correctness impact for well-conditioned ABA-like inputs. +function SparseNumericFactorOptions(scaling::SparseScaling_t) + return SparseNumericFactorOptions( + SparseDefaultControl, + scaling, + C_NULL, + 0.01, + eps(Cdouble) * 1e-4, + ) +end + +struct SparseSymbolicFactorOptions + control::SparseControl_t + orderMethod::SparseOrder_t + order::Ptr{Cvoid} + ignoreRowsAndColumns::Ptr{Cvoid} + malloc::Ptr{Cvoid} + free::Ptr{Cvoid} + reportError::Ptr{Cvoid} +end + +# `reportError` is fired by libSparse before it returns a failure status. The +# inline `error(unsafe_string(text))` matches AppleAccelerate.jl's pattern: it +# propagates the libSparse message as a Julia exception that unwinds back +# through the originating ccall. Avoids `@error`'s allocator/logger overhead, +# which is fragile when libSparse invokes the callback from its own threads. +# Passing libc malloc/free explicitly (the C_NULL "use Apple defaults" path is +# documented but unreliable from a non-Obj-C caller). +function SparseSymbolicFactorOptions() + return SparseSymbolicFactorOptions( + SparseDefaultControl, + SparseOrderDefault, + C_NULL, + C_NULL, + @cfunction(Libc.malloc, Ptr{Cvoid}, (Csize_t,)), + @cfunction(Libc.free, Cvoid, (Ptr{Cvoid},)), + @cfunction(text -> error(unsafe_string(text)), Cvoid, (Cstring,)), + ) +end + +struct DenseVector_t + count::Cint + data::Ptr{Cdouble} +end + +struct DenseMatrix_t + rowCount::Cint + columnCount::Cint + columnStride::Cint + attributes::att_type + data::Ptr{Cdouble} +end + +struct SparseMatrix_t + structure::SparseMatrixStructure + data::Ptr{Cdouble} +end + +struct SparseOpaqueSymbolicFactorization + status::SparseStatus_t + rowCount::Cint + columnCount::Cint + attributes::att_type + blockSize::UInt8 + type::SparseFactorization_t + factorization::Ptr{Cvoid} + workspaceSize_Float::Csize_t + workspaceSize_Double::Csize_t + factorSize_Float::Csize_t + factorSize_Double::Csize_t +end + +# Placeholder used to initialize `AAFactorCache.symbolic` before the first +# factor. Marked released so any accidental cleanup is a no-op. +function _null_symbolic() + return SparseOpaqueSymbolicFactorization( + SparseStatusReleased, + 0, + 0, + ATT_ORDINARY, + 0, + SparseFactorizationLU, + C_NULL, + 0, + 0, + 0, + 0, + ) +end + +struct SparseOpaqueFactorization_t + status::SparseStatus_t + attributes::att_type + symbolicFactorization::SparseOpaqueSymbolicFactorization + userFactorStorage::Bool + numericFactorization::Ptr{Cvoid} + solveWorkspaceRequiredStatic::Csize_t + solveWorkspaceRequiredPerRHS::Csize_t +end + +function _null_factorization() + return SparseOpaqueFactorization_t( + SparseStatusReleased, + ATT_ORDINARY, + _null_symbolic(), + false, + C_NULL, + 0, + 0, + ) +end + +# Build the Apple-side dense views at the ccall boundary. `StridedMatrix`'s +# first-dimension stride must be 1 (the libSparse contract); we assert at the +# call site, not here. +function _dense_matrix(B::StridedMatrix{Cdouble}) + return DenseMatrix_t( + Cint(size(B, 1)), + Cint(size(B, 2)), + Cint(stride(B, 2)), + ATT_ORDINARY, + pointer(B), + ) +end + +function _dense_vector(b::StridedVector{Cdouble}) + return DenseVector_t(Cint(length(b)), pointer(b)) +end + +# --- ccalls ----------------------------------------------------------------- +# +# Mangled symbol names come from the C++ ABI of libSparse. They are stable on +# the system framework and match what AppleAccelerate.jl binds. If Apple +# breaks them in a future macOS release, the failure will be loud (dlopen of +# a missing symbol at first call), which is the behavior we want. + +# Symbolic-only factor: analyzes the pattern, returns an opaque symbolic +# factor. Can back many numeric factors on the same pattern. +function _sparse_symbolic_factor( + ftype::SparseFactorization_t, + structure::SparseMatrixStructure, + sym_opts::SparseSymbolicFactorOptions, +)::SparseOpaqueSymbolicFactorization + return @ccall LIBSPARSE._Z12SparseFactorh21SparseMatrixStructure27SparseSymbolicFactorOptions( + ftype::Cuint, + structure::SparseMatrixStructure, + sym_opts::SparseSymbolicFactorOptions, + )::SparseOpaqueSymbolicFactorization +end + +# Numeric factor on top of an existing symbolic factor. Reusable: the +# symbolic handle is not consumed. +function _sparse_numeric_factor( + symbolic::SparseOpaqueSymbolicFactorization, + matrix::SparseMatrix_t, + num_opts::SparseNumericFactorOptions, +)::SparseOpaqueFactorization_t + return @ccall LIBSPARSE._Z12SparseFactor33SparseOpaqueSymbolicFactorization19SparseMatrix_Double26SparseNumericFactorOptions( + symbolic::SparseOpaqueSymbolicFactorization, + matrix::SparseMatrix_t, + num_opts::SparseNumericFactorOptions, + )::SparseOpaqueFactorization_t +end + +# In-place solve `A · X = B`, where B is a column-major dense matrix and X +# overwrites B's storage. +function _sparse_solve_matrix!( + factor::SparseOpaqueFactorization_t, + B::DenseMatrix_t, +) + @ccall LIBSPARSE._Z11SparseSolve32SparseOpaqueFactorization_Double18DenseMatrix_Double( + factor::SparseOpaqueFactorization_t, + B::DenseMatrix_t, + )::Cvoid + return nothing +end + +function _sparse_solve_vector!( + factor::SparseOpaqueFactorization_t, + b::DenseVector_t, +) + @ccall LIBSPARSE._Z11SparseSolve32SparseOpaqueFactorization_Double18DenseVector_Double( + factor::SparseOpaqueFactorization_t, + b::DenseVector_t, + )::Cvoid + return nothing +end + +# Workspace-aware solve overloads (libSparse, macOS 10.13+). The factor +# exposes `solveWorkspaceRequiredStatic + nrhs * solveWorkspaceRequiredPerRHS` +# bytes of scratch it needs per call; supplying a reusable buffer eliminates +# the implicit malloc/free that the no-workspace variants perform internally. +# On AA_LU at 10k nodes that buffer is ~234 KiB / RHS — substantial per-call +# churn when the no-workspace path is used in a tight row loop. The workspace +# pointer must be 16-byte aligned; Julia's `Vector{Float64}` data satisfies +# this for any non-trivial size. +function _sparse_solve_matrix_ws!( + factor::SparseOpaqueFactorization_t, + B::DenseMatrix_t, + workspace::Ptr{Cvoid}, +) + @ccall LIBSPARSE._Z11SparseSolve32SparseOpaqueFactorization_Double18DenseMatrix_DoublePv( + factor::SparseOpaqueFactorization_t, + B::DenseMatrix_t, + workspace::Ptr{Cvoid}, + )::Cvoid + return nothing +end + +function _sparse_solve_vector_ws!( + factor::SparseOpaqueFactorization_t, + b::DenseVector_t, + workspace::Ptr{Cvoid}, +) + @ccall LIBSPARSE._Z11SparseSolve32SparseOpaqueFactorization_Double18DenseVector_DoublePv( + factor::SparseOpaqueFactorization_t, + b::DenseVector_t, + workspace::Ptr{Cvoid}, + )::Cvoid + return nothing +end + +# Bytes of workspace required to solve with `nrhs` right-hand sides. +@inline function _solve_workspace_bytes( + factor::SparseOpaqueFactorization_t, + nrhs::Integer, +) + return Int(factor.solveWorkspaceRequiredStatic) + + Int(nrhs) * Int(factor.solveWorkspaceRequiredPerRHS) +end + +# `Y = A · X`, dense multi-column. `Y` must be allocated to (rowCount, ncols) +# by the caller. libSparse overwrites — does not accumulate. +function _sparse_multiply_matrix!( + A::SparseMatrix_t, + X::DenseMatrix_t, + Y::DenseMatrix_t, +) + @ccall LIBSPARSE._Z14SparseMultiply19SparseMatrix_Double18DenseMatrix_DoubleS0_( + A::SparseMatrix_t, + X::DenseMatrix_t, + Y::DenseMatrix_t, + )::Cvoid + return nothing +end + +function _sparse_multiply_vector!( + A::SparseMatrix_t, + x::DenseVector_t, + y::DenseVector_t, +) + @ccall LIBSPARSE._Z14SparseMultiply19SparseMatrix_Double18DenseVector_DoubleS0_( + A::SparseMatrix_t, + x::DenseVector_t, + y::DenseVector_t, + )::Cvoid + return nothing +end + +# Frees the libSparse-side numeric / symbolic storage attached to an opaque +# factor. Idempotent: a second call with a `SparseStatusReleased` handle is +# a no-op on libSparse's side. +function _sparse_cleanup_factor!(factor::SparseOpaqueFactorization_t) + @ccall LIBSPARSE._Z13SparseCleanup32SparseOpaqueFactorization_Double( + factor::SparseOpaqueFactorization_t, + )::Cvoid + return nothing +end + +function _sparse_cleanup_symbolic!(symbolic::SparseOpaqueSymbolicFactorization) + @ccall LIBSPARSE._Z13SparseCleanup33SparseOpaqueSymbolicFactorization( + symbolic::SparseOpaqueSymbolicFactorization, + )::Cvoid + return nothing +end + +# Translate libSparse status codes into Julia exceptions. Singular and +# parameter-error are the most common; the rest fall through to a generic +# `error`. +function _libsparse_throw(status::SparseStatus_t, op::AbstractString) + status == SparseMatrixIsSingular && + throw(LinearAlgebra.SingularException(0)) + status == SparseParameterError && + throw(ArgumentError("libSparse $(op) failed: parameter error")) + status == SparseInternalError && + error("libSparse $(op) failed: internal error") + status == SparseStatusFailed && + error("libSparse $(op) failed") + return error("libSparse $(op) failed: status=$(Int(status))") +end diff --git a/src/AccelerateWrapper/solve_dense.jl b/src/AccelerateWrapper/solve_dense.jl new file mode 100644 index 000000000..62c978743 --- /dev/null +++ b/src/AccelerateWrapper/solve_dense.jl @@ -0,0 +1,44 @@ +""" + solve!(cache, B) -> B + +Solve `A · X = B` in place, dispatching on the shape of `B`: + + - `B::StridedMatrix{Cdouble}`: multiple right-hand sides handled in a single + libSparse call; `size(B, 1)` must equal `cache.n`. + - `B::StridedVector{Cdouble}`: single right-hand side; `length(B)` must + equal `cache.n`. + +Both overloads require `B` to have unit stride in the first dimension and +the cache to be factored (`is_factored(cache) == true`). +""" +function solve!(cache::AAFactorCache, B::StridedMatrix{Cdouble}) + is_factored(cache) || error("AAFactorCache: not factored yet.") + n = cache.n + size(B, 1) == n || + throw(DimensionMismatch("size(B, 1) = $(size(B, 1)), cache n = $(n)")) + stride(B, 1) == 1 || + throw(ArgumentError("B must have unit stride in the first dimension.")) + size(B, 2) == 0 && return B + # Workspace-aware overload — caller-supplied scratch avoids a per-call + # malloc/free inside libSparse. + ws = _ensure_solve_workspace!(cache, size(B, 2)) + GC.@preserve cache _sparse_solve_matrix_ws!(cache.numeric, _dense_matrix(B), ws) + return B +end + +function solve!(cache::AAFactorCache, b::StridedVector{Cdouble}) + is_factored(cache) || error("AAFactorCache: not factored yet.") + n = cache.n + length(b) == n || throw(DimensionMismatch("length(b) = $(length(b)), cache n = $(n)")) + stride(b, 1) == 1 || throw(ArgumentError("b must have unit stride.")) + ws = _ensure_solve_workspace!(cache, 1) + GC.@preserve cache _sparse_solve_vector_ws!(cache.numeric, _dense_vector(b), ws) + return b +end + +""" + \\(cache::AAFactorCache, B) -> X + +Allocating solve, mirroring `LinearAlgebra.Factorization`'s API. +""" +Base.:\(cache::AAFactorCache, B::StridedVecOrMat{Cdouble}) = solve!(cache, copy(B)) diff --git a/src/AccelerateWrapper/solve_sparse_rhs.jl b/src/AccelerateWrapper/solve_sparse_rhs.jl new file mode 100644 index 000000000..e2ba840f9 --- /dev/null +++ b/src/AccelerateWrapper/solve_sparse_rhs.jl @@ -0,0 +1,102 @@ +const SPARSE_RHS_DEFAULT_BLOCK = 64 + +""" + solve_sparse!(cache, B, out; block=$(SPARSE_RHS_DEFAULT_BLOCK)) -> out + +Solve `A · X = B` for a `SparseMatrixCSC` right-hand side, writing the +result into `out`. Empty columns of `B` are not solved — `out`'s +corresponding columns are zeroed. Non-empty columns within each chunk of +`block` consecutive RHS columns are packed into a dense scratch and solved +in a single libSparse call. + +The `block` chunk size bounds the working set so that processing an +`n × nrhs` sparse RHS requires only `O(n · block)` extra memory regardless +of `nrhs`. The cache reuses its packing buffer across calls; warm calls +allocate nothing in the solver. + +Not thread-safe (mutates per-cache scratch). +""" +function solve_sparse!( + cache::AAFactorCache, + B::SparseMatrixCSC{<:Number, Int}, + out::AbstractMatrix{Cdouble}; + block::Int = SPARSE_RHS_DEFAULT_BLOCK, +) + is_factored(cache) || error("AAFactorCache: not factored yet.") + block >= 1 || throw(ArgumentError("block must be >= 1; got $(block)")) + n = cache.n + size(B, 1) == n || throw(DimensionMismatch( + "size(B, 1) = $(size(B, 1)), cache n = $(n)", + )) + size(out, 1) == n && size(out, 2) == size(B, 2) || throw(DimensionMismatch( + "out has size $(size(out)); expected $((n, size(B, 2))).", + )) + + nb = size(B, 2) + nb == 0 && return out + fill!(out, zero(Cdouble)) + + Browval = rowvals(B) + Bnzval = nonzeros(B) + + _ensure_scratch!(cache, block) + # Size the libSparse solve workspace to the *maximum* chunk size once. + # Sizing it per chunk would `resize!` on every short tail block, churning + # allocations in the hot PTDF loop. At 10k nodes a 64-RHS block needs + # ~15 MiB of scratch; without this, libSparse mallocs+frees it per call. + ws = _ensure_solve_workspace!(cache, block) + scratch = cache.scratch + col_map = cache.col_map + + j_start = 1 + @inbounds while j_start <= nb + j_end = min(j_start + block - 1, nb) + + npack = 0 + for j in j_start:j_end + rng = nzrange(B, j) + isempty(rng) && continue + npack += 1 + col_map[npack] = j + fill!(view(scratch, :, npack), zero(Cdouble)) + for p in rng + scratch[Browval[p], npack] = Bnzval[p] + end + end + + if npack > 0 + # Build a DenseMatrix view over scratch[:, 1:npack]. Column stride + # is `size(scratch, 1) = n` because scratch is column-major and + # full-height. + dm = DenseMatrix_t( + Cint(n), + Cint(npack), + Cint(size(scratch, 1)), + ATT_ORDINARY, + pointer(scratch), + ) + GC.@preserve cache _sparse_solve_matrix_ws!(cache.numeric, dm, ws) + + for k in 1:npack + copyto!(view(out, :, col_map[k]), view(scratch, :, k)) + end + end + + j_start = j_end + 1 + end + return out +end + +"""Allocating wrapper around `solve_sparse!`.""" +function solve_sparse( + cache::AAFactorCache, + B::SparseMatrixCSC{<:Number, Int}; + block::Int = SPARSE_RHS_DEFAULT_BLOCK, +) + return solve_sparse!( + cache, + B, + Matrix{Cdouble}(undef, cache.n, size(B, 2)); + block = block, + ) +end diff --git a/src/AccelerateWrapper/spmm.jl b/src/AccelerateWrapper/spmm.jl new file mode 100644 index 000000000..e1c5375ca --- /dev/null +++ b/src/AccelerateWrapper/spmm.jl @@ -0,0 +1,105 @@ +# SparseMultiply bindings. These do not require a factorization cache; they +# take a `SparseMatrixCSC{Float64, Int}` directly and build the Apple-side +# `SparseMatrix_t` view at the ccall boundary. Per call this allocates two +# transient arrays (0-based `Clong[]` colptr, narrowed `Cint[]` rowval); if +# a future hot loop needs allocation-free SpMM, lift these into a dedicated +# `AASparseView` cache type. + +""" + aa_spmm!(Y, A, X) -> Y + +Compute `Y ← A · X` in place, where `A` is a `SparseMatrixCSC{Float64, Int}`, +`X` and `Y` are `StridedMatrix{Float64}` of compatible shape. libSparse +overwrites `Y` (does not accumulate). +""" +function aa_spmm!( + Y::StridedMatrix{Cdouble}, + A::SparseMatrixCSC{Float64, Int}, + X::StridedMatrix{Cdouble}, +) + size(A, 2) == size(X, 1) || throw( + DimensionMismatch( + "A is $(size(A)), X is $(size(X)); inner dimensions must match.", + ), + ) + size(Y, 1) == size(A, 1) && size(Y, 2) == size(X, 2) || throw( + DimensionMismatch( + "Y has size $(size(Y)); expected $((size(A, 1), size(X, 2))).", + ), + ) + stride(X, 1) == 1 || throw(ArgumentError("X must have unit stride in dim 1.")) + stride(Y, 1) == 1 || throw(ArgumentError("Y must have unit stride in dim 1.")) + size(X, 2) == 0 && return Y + cs, ri = _csc_to_apple(A) + sp = SparseMatrix_t( + SparseMatrixStructure( + Cint(size(A, 1)), + Cint(size(A, 2)), + pointer(cs), + pointer(ri), + ATT_ORDINARY, + UInt8(1), + ), + pointer(nonzeros(A)), + ) + GC.@preserve cs ri A X Y _sparse_multiply_matrix!( + sp, + _dense_matrix(X), + _dense_matrix(Y), + ) + return Y +end + +""" + aa_spmv!(y, A, x) -> y + +Compute `y ← A · x` in place. +""" +function aa_spmv!( + y::StridedVector{Cdouble}, + A::SparseMatrixCSC{Float64, Int}, + x::StridedVector{Cdouble}, +) + size(A, 2) == length(x) || throw(DimensionMismatch( + "A is $(size(A)), length(x) = $(length(x)).", + )) + length(y) == size(A, 1) || throw(DimensionMismatch( + "length(y) = $(length(y)); expected $(size(A, 1)).", + )) + stride(x, 1) == 1 || throw(ArgumentError("x must have unit stride.")) + stride(y, 1) == 1 || throw(ArgumentError("y must have unit stride.")) + cs, ri = _csc_to_apple(A) + sp = SparseMatrix_t( + SparseMatrixStructure( + Cint(size(A, 1)), + Cint(size(A, 2)), + pointer(cs), + pointer(ri), + ATT_ORDINARY, + UInt8(1), + ), + pointer(nonzeros(A)), + ) + GC.@preserve cs ri A x y _sparse_multiply_vector!( + sp, + _dense_vector(x), + _dense_vector(y), + ) + return y +end + +# Convert CSC's 1-based `colptr::Vector{Int}` and `rowval::Vector{Int}` into +# Apple's 0-based `Clong[]` colstarts and narrowed `Cint[]` rowindices. +function _csc_to_apple(A::SparseMatrixCSC{Float64, Int}) + cp = getcolptr(A) + rv = rowvals(A) + cs = Vector{Clong}(undef, length(cp)) + ri = Vector{Cint}(undef, length(rv)) + @inbounds for k in eachindex(cp) + cs[k] = Clong(cp[k] - 1) + end + @inbounds for k in eachindex(rv) + ri[k] = Cint(rv[k] - 1) + end + return cs, ri +end diff --git a/src/BA_ABA_matrices.jl b/src/BA_ABA_matrices.jl index 0cec17048..7902317a3 100644 --- a/src/BA_ABA_matrices.jl +++ b/src/BA_ABA_matrices.jl @@ -179,7 +179,7 @@ power flow analysis, sensitivity calculations, and linear power system studies. struct ABA_Matrix{ Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}, - F <: Union{Nothing, KLULinSolveCache{Float64}}, + F <: Union{Nothing, KLULinSolveCache{Float64, Int64}}, } <: PowerNetworkMatrix{Float64} data::SparseArrays.SparseMatrixCSC{Float64, Int} axes::Ax @@ -358,7 +358,7 @@ Check if an ABA_Matrix has been factorized (i.e., contains LU factorization matr """ is_factorized(ABA::ABA_Matrix{Ax, L, Nothing}) where {Ax, L <: NTuple{2, Dict}} = false is_factorized( - ABA::ABA_Matrix{Ax, L, <:KLULinSolveCache{Float64}}, + ABA::ABA_Matrix{Ax, L, <:KLULinSolveCache{Float64, Int64}}, ) where {Ax, L <: NTuple{2, Dict}} = true # get_index functions: BA_Matrix stores the transposed matrix, thus get index diff --git a/src/KLUWrapper/KLUWrapper.jl b/src/KLUWrapper/KLUWrapper.jl index 5e996dfdf..7d50b5914 100644 --- a/src/KLUWrapper/KLUWrapper.jl +++ b/src/KLUWrapper/KLUWrapper.jl @@ -74,8 +74,12 @@ export KLULinSolveCache, tsolve!, solve_sparse!, solve_sparse, + sort_factors!, + condest!, + rcond!, n_valid, - is_factored + is_factored, + get_reuse_symbolic include("klu_jll_bindings.jl") include("klu_cache.jl") diff --git a/src/KLUWrapper/klu_cache.jl b/src/KLUWrapper/klu_cache.jl index 89d519980..6318e3050 100644 --- a/src/KLUWrapper/klu_cache.jl +++ b/src/KLUWrapper/klu_cache.jl @@ -5,37 +5,55 @@ A cached KLU linear solver designed for repeated solves against the same sparse matrix structure. `numeric_refactor!` and `solve!` allocate nothing once the cache is built. -The type parameter `Tv ∈ {Float64, ComplexF64}` selects the real/complex KLU -path. Indices are always `Int64` (SuiteSparse_long). +Type parameters: +- `Tv ∈ {Float64, ComplexF64}` selects the real/complex KLU path + (`klu_*_factor`/`klu_z*_factor`). +- `Ti ∈ {Int32, Int64}` selects the index-type entry-point family + (`klu_*` for `int`/Int32, `klu_l_*` for `SuiteSparse_long`/Int64). + The cache's `colptr`/`rowval`/`col_map` are stored in this type. `reuse_symbolic` controls whether `symbolic_refactor!` keeps the analysis; `check_pattern` adds a structural-equality check on refactor calls and is only consulted when reusing. """ -mutable struct KLULinSolveCache{Tv <: Union{Float64, ComplexF64}} - colptr::Vector{Int64} - rowval::Vector{Int64} +mutable struct KLULinSolveCache{ + Tv <: Union{Float64, ComplexF64}, + Ti <: Union{Int32, Int64}, +} + colptr::Vector{Ti} + rowval::Vector{Ti} # Copy of the matrix values used in the most recent numeric factorization. # Lets `_recover_factorization!` rebuild a corrupted numeric handle without # the caller having to re-supply `A`. Empty before the first factor call. nzval::Vector{Tv} - common::Base.RefValue{KluLCommon} - symbolic::SymbolicPtr - numeric::NumericPtr + # `Base.RefValue{KluLCommon}` for Int64 or `Base.RefValue{KluCommon}` for + # Int32 — we keep the field untyped here because Julia's untyped + # `Base.RefValue` lookup is fast enough (the dispatch helpers recover the + # concrete type at each callsite via `_common_type(Ti)`) and avoiding a + # third type parameter keeps the surface tidy. + common::Base.RefValue + # Opaque Symbolic/Numeric pointers. `Ptr{Cvoid}` is safe: each callsite + # threads through the typed `SymbolicPtr` / `SymbolicPtr32` (resp. + # Numeric) at the ccall boundary; on the Julia side the values are + # treated as black boxes by every consumer. + symbolic::Ptr{Cvoid} + numeric::Ptr{Cvoid} reuse_symbolic::Bool check_pattern::Bool # Bounded reusable scratch for `solve_sparse!`. Lazy-grown on first call so # the wrapper's working set stays O(n*block) instead of O(n*nrhs); see # `solve_sparse_rhs.jl`. scratch::Matrix{Tv} - col_map::Vector{Int64} + col_map::Vector{Ti} end -@inline _dim(cache::KLULinSolveCache) = Int64(length(cache.colptr) - 1) +@inline _dim(cache::KLULinSolveCache{Tv, Ti}) where {Tv, Ti} = + Ti(length(cache.colptr) - 1) -Base.size(cache::KLULinSolveCache) = (n = _dim(cache); (n, n)) -Base.size(cache::KLULinSolveCache, d::Integer) = d <= 2 ? _dim(cache) : 1 -Base.eltype(::Type{KLULinSolveCache{Tv}}) where {Tv} = Tv +Base.size(cache::KLULinSolveCache) = (n = Int(_dim(cache)); (n, n)) +Base.size(cache::KLULinSolveCache, d::Integer) = + d <= 2 ? Int(_dim(cache)) : 1 +Base.eltype(::Type{KLULinSolveCache{Tv, Ti}}) where {Tv, Ti} = Tv get_reuse_symbolic(cache::KLULinSolveCache) = cache.reuse_symbolic """ @@ -53,37 +71,363 @@ is_factored(cache::KLULinSolveCache) = # truthy form. Kept around for tests that want a uniform numeric reading. n_valid(cache::KLULinSolveCache) = is_factored(cache) ? 1 : 0 -@inline _factor_call(::Type{Float64}, ap, ai, ax, sym, common) = - klu_l_factor(ap, ai, ax, sym, common) -@inline _factor_call(::Type{ComplexF64}, ap, ai, ax, sym, common) = - klu_zl_factor(ap, ai, ax, sym, common) +# --------------------------------------------------------------------------- +# Type-paired dispatch helpers — (Tv, Ti) → libklu entry point +# --------------------------------------------------------------------------- + +# Map Ti to its concrete `klu_common` struct type. +@inline _common_type(::Type{Int32}) = KluCommon +@inline _common_type(::Type{Int64}) = KluLCommon + +# `klu_defaults` initializer. +@inline _defaults!(::Type{Int32}, common::Ref) = klu_defaults!(common) +@inline _defaults!(::Type{Int64}, common::Ref) = klu_l_defaults!(common) + +# `klu_analyze` returns the symbolic handle. n is widened to `Ti` so the +# C argument width matches. +@inline _analyze_call(::Type{Int32}, n, ap, ai, common) = + klu_analyze(Cint(n), ap, ai, common) +@inline _analyze_call(::Type{Int64}, n, ap, ai, common) = + klu_l_analyze(Int64(n), ap, ai, common) + +# `klu_free_symbolic` — takes the opaque pointer ref and the common ref. +# `sym_ref` is a `Ref{Ptr{Cvoid}}` on the Julia side; we reinterpret it +# to the typed `SymbolicPtr` / `SymbolicPtr32` at the ccall site so libklu +# sees the right pointer width. +@inline function _free_symbolic!(::Type{Int32}, sym_ref::Ref{Ptr{Cvoid}}, common::Ref) + typed = Ref(reinterpret(SymbolicPtr32, sym_ref[])) + klu_free_symbolic!(typed, common) + sym_ref[] = reinterpret(Ptr{Cvoid}, typed[]) + return nothing +end +@inline function _free_symbolic!(::Type{Int64}, sym_ref::Ref{Ptr{Cvoid}}, common::Ref) + typed = Ref(reinterpret(SymbolicPtr, sym_ref[])) + klu_l_free_symbolic!(typed, common) + sym_ref[] = reinterpret(Ptr{Cvoid}, typed[]) + return nothing +end -@inline _refactor_call(::Type{Float64}, ap, ai, ax, sym, num, common) = - klu_l_refactor(ap, ai, ax, sym, num, common) -@inline _refactor_call(::Type{ComplexF64}, ap, ai, ax, sym, num, common) = - klu_zl_refactor(ap, ai, ax, sym, num, common) +# `klu_factor` — returns the numeric handle as a typed pointer; we +# reinterpret to `Ptr{Cvoid}` for storage. Dispatch on both Tv and Ti. +@inline function _factor_call(::Type{Float64}, ::Type{Int32}, ap, ai, ax, sym, common) + return reinterpret( + Ptr{Cvoid}, + klu_factor(ap, ai, ax, reinterpret(SymbolicPtr32, sym), common), + ) +end +@inline function _factor_call(::Type{Float64}, ::Type{Int64}, ap, ai, ax, sym, common) + return reinterpret( + Ptr{Cvoid}, + klu_l_factor(ap, ai, ax, reinterpret(SymbolicPtr, sym), common), + ) +end +@inline function _factor_call( + ::Type{ComplexF64}, + ::Type{Int32}, + ap, + ai, + ax, + sym, + common, +) + return reinterpret( + Ptr{Cvoid}, + klu_z_factor(ap, ai, ax, reinterpret(SymbolicPtr32, sym), common), + ) +end +@inline function _factor_call( + ::Type{ComplexF64}, + ::Type{Int64}, + ap, + ai, + ax, + sym, + common, +) + return reinterpret( + Ptr{Cvoid}, + klu_zl_factor(ap, ai, ax, reinterpret(SymbolicPtr, sym), common), + ) +end -@inline _solve_call(::Type{Float64}, sym, num, n, nrhs, b, common) = - klu_l_solve(sym, num, n, nrhs, b, common) -@inline _solve_call(::Type{ComplexF64}, sym, num, n, nrhs, b, common) = - klu_zl_solve(sym, num, n, nrhs, b, common) +@inline function _refactor_call( + ::Type{Float64}, + ::Type{Int32}, + ap, + ai, + ax, + sym, + num, + common, +) + return klu_refactor( + ap, ai, ax, + reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), + common, + ) +end +@inline function _refactor_call( + ::Type{Float64}, + ::Type{Int64}, + ap, + ai, + ax, + sym, + num, + common, +) + return klu_l_refactor( + ap, ai, ax, + reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), + common, + ) +end +@inline function _refactor_call( + ::Type{ComplexF64}, + ::Type{Int32}, + ap, + ai, + ax, + sym, + num, + common, +) + return klu_z_refactor( + ap, ai, ax, + reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), + common, + ) +end +@inline function _refactor_call( + ::Type{ComplexF64}, + ::Type{Int64}, + ap, + ai, + ax, + sym, + num, + common, +) + return klu_zl_refactor( + ap, ai, ax, + reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), + common, + ) +end -@inline _tsolve_call(::Type{Float64}, sym, num, n, nrhs, b, common; conjugate = false) = - klu_l_tsolve(sym, num, n, nrhs, b, common) -@inline _tsolve_call(::Type{ComplexF64}, sym, num, n, nrhs, b, common; conjugate = false) = - klu_zl_tsolve(sym, num, n, nrhs, b, Cint(conjugate), common) +@inline function _solve_call( + ::Type{Float64}, + ::Type{Int32}, + sym, + num, + n, + nrhs, + b, + common, +) + return klu_solve( + reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), + Cint(n), Cint(nrhs), b, common, + ) +end +@inline function _solve_call( + ::Type{Float64}, + ::Type{Int64}, + sym, + num, + n, + nrhs, + b, + common, +) + return klu_l_solve( + reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), + Int64(n), Int64(nrhs), b, common, + ) +end +@inline function _solve_call( + ::Type{ComplexF64}, + ::Type{Int32}, + sym, + num, + n, + nrhs, + b, + common, +) + return klu_z_solve( + reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), + Cint(n), Cint(nrhs), b, common, + ) +end +@inline function _solve_call( + ::Type{ComplexF64}, + ::Type{Int64}, + sym, + num, + n, + nrhs, + b, + common, +) + return klu_zl_solve( + reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), + Int64(n), Int64(nrhs), b, common, + ) +end -@inline _free_numeric!(::Type{Float64}, num_ref, common) = - klu_l_free_numeric!(num_ref, common) -@inline _free_numeric!(::Type{ComplexF64}, num_ref, common) = - klu_zl_free_numeric!(num_ref, common) +@inline function _tsolve_call( + ::Type{Float64}, + ::Type{Int32}, + sym, + num, + n, + nrhs, + b, + common; + conjugate::Bool = false, +) + return klu_tsolve( + reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), + Cint(n), Cint(nrhs), b, common, + ) +end +@inline function _tsolve_call( + ::Type{Float64}, + ::Type{Int64}, + sym, + num, + n, + nrhs, + b, + common; + conjugate::Bool = false, +) + return klu_l_tsolve( + reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), + Int64(n), Int64(nrhs), b, common, + ) +end +@inline function _tsolve_call( + ::Type{ComplexF64}, + ::Type{Int32}, + sym, + num, + n, + nrhs, + b, + common; + conjugate::Bool = false, +) + return klu_z_tsolve( + reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), + Cint(n), Cint(nrhs), b, Cint(conjugate), common, + ) +end +@inline function _tsolve_call( + ::Type{ComplexF64}, + ::Type{Int64}, + sym, + num, + n, + nrhs, + b, + common; + conjugate::Bool = false, +) + return klu_zl_tsolve( + reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), + Int64(n), Int64(nrhs), b, Cint(conjugate), common, + ) +end + +@inline function _free_numeric!( + ::Type{Float64}, + ::Type{Int32}, + num_ref::Ref{Ptr{Cvoid}}, + common::Ref, +) + typed = Ref(reinterpret(NumericPtr32, num_ref[])) + klu_free_numeric!(typed, common) + num_ref[] = reinterpret(Ptr{Cvoid}, typed[]) + return nothing +end +@inline function _free_numeric!( + ::Type{Float64}, + ::Type{Int64}, + num_ref::Ref{Ptr{Cvoid}}, + common::Ref, +) + typed = Ref(reinterpret(NumericPtr, num_ref[])) + klu_l_free_numeric!(typed, common) + num_ref[] = reinterpret(Ptr{Cvoid}, typed[]) + return nothing +end +@inline function _free_numeric!( + ::Type{ComplexF64}, + ::Type{Int32}, + num_ref::Ref{Ptr{Cvoid}}, + common::Ref, +) + typed = Ref(reinterpret(NumericPtr32, num_ref[])) + klu_z_free_numeric!(typed, common) + num_ref[] = reinterpret(Ptr{Cvoid}, typed[]) + return nothing +end +@inline function _free_numeric!( + ::Type{ComplexF64}, + ::Type{Int64}, + num_ref::Ref{Ptr{Cvoid}}, + common::Ref, +) + typed = Ref(reinterpret(NumericPtr, num_ref[])) + klu_zl_free_numeric!(typed, common) + num_ref[] = reinterpret(Ptr{Cvoid}, typed[]) + return nothing +end + +# Performance-knob dispatchers — Float64 only. libklu exposes these in the +# ComplexF64 build too; we'll add bindings if a consumer asks. +@inline _sort_call(::Type{Int32}, sym, num, common) = + klu_sort(reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), common) +@inline _sort_call(::Type{Int64}, sym, num, common) = + klu_l_sort(reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), common) + +@inline _condest_call(::Type{Int32}, ap, ax, sym, num, common) = + klu_condest( + ap, + ax, + reinterpret(SymbolicPtr32, sym), + reinterpret(NumericPtr32, num), + common, + ) +@inline _condest_call(::Type{Int64}, ap, ax, sym, num, common) = + klu_l_condest( + ap, + ax, + reinterpret(SymbolicPtr, sym), + reinterpret(NumericPtr, num), + common, + ) + +@inline _rcond_call(::Type{Int32}, sym, num, common) = + klu_rcond(reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), common) +@inline _rcond_call(::Type{Int64}, sym, num, common) = + klu_l_rcond(reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), common) + +# --------------------------------------------------------------------------- +# Constructor +# --------------------------------------------------------------------------- """ KLULinSolveCache(A; reuse_symbolic=true, check_pattern=true) -Build a cache for the sparse matrix `A`. Allocates structural arrays and -runs `klu_l_defaults`, but does **not** factorize. Call `full_factor!` -(or `symbolic_factor!` followed by `numeric_refactor!`) before `solve!`. +Build a cache for the sparse matrix `A`. The cache's index type is taken +from `A`: `SparseMatrixCSC{Tv, Int32}` ⇒ `KLULinSolveCache{Tv, Int32}`, +`SparseMatrixCSC{Tv, Int64}` ⇒ `KLULinSolveCache{Tv, Int64}`. Allocates +structural arrays and runs the corresponding `klu_defaults`/`klu_l_defaults` +initializer, but does **not** factorize. Call `full_factor!` (or +`symbolic_factor!` followed by `numeric_refactor!`) before `solve!`. A finalizer frees libklu handles on GC; call `Base.finalize(cache)` to release them eagerly. Releasing the handles leaves Julia-side state intact, @@ -91,34 +435,30 @@ so the cache can be re-factorized via `symbolic_factor!`/`numeric_refactor!` or `full_factor!`. """ function KLULinSolveCache( - A::SparseMatrixCSC{Tv, Int}; + A::SparseMatrixCSC{Tv, Ti}; reuse_symbolic::Bool = true, check_pattern::Bool = true, -) where {Tv <: Union{Float64, ComplexF64}} - Int === Int64 || error( - "KLULinSolveCache requires a 64-bit Julia build (SuiteSparse_long " * - "bindings need Int64 indices). Got Int = $(Int).", - ) +) where {Tv <: Union{Float64, ComplexF64}, Ti <: Union{Int32, Int64}} n = size(A, 1) - n == size(A, 2) || throw(DimensionMismatch("matrix must be square; got $(size(A))")) + n == size(A, 2) || + throw(DimensionMismatch("matrix must be square; got $(size(A))")) - common = Ref(KluLCommon()) - klu_l_defaults!(common) + common = Ref(_common_type(Ti)()) + _defaults!(Ti, common) - colptr = Vector{Int64}(undef, length(getcolptr(A))) + colptr = Vector{Ti}(undef, length(getcolptr(A))) copyto!(colptr, getcolptr(A)) - colptr .-= 1 - rowval = Vector{Int64}(undef, length(rowvals(A))) + colptr .-= one(Ti) + rowval = Vector{Ti}(undef, length(rowvals(A))) copyto!(rowval, rowvals(A)) - rowval .-= 1 + rowval .-= one(Ti) - cache = KLULinSolveCache{Tv}( + cache = KLULinSolveCache{Tv, Ti}( colptr, rowval, Tv[], common, - convert(SymbolicPtr, C_NULL), - convert(NumericPtr, C_NULL), + Ptr{Cvoid}(C_NULL), Ptr{Cvoid}(C_NULL), reuse_symbolic, check_pattern, Matrix{Tv}(undef, 0, 0), - Int64[], + Ti[], ) finalizer(_free_klu_handles!, cache) return cache @@ -131,10 +471,10 @@ Ensure `cache.scratch` is at least `n × block` and `cache.col_map` length `block`. Grows in place; reuses across `solve_sparse!` calls. """ @inline function _ensure_scratch!( - cache::KLULinSolveCache{Tv}, + cache::KLULinSolveCache{Tv, Ti}, block::Int, -) where {Tv <: Union{Float64, ComplexF64}} - n = _dim(cache) +) where {Tv, Ti} + n = Int(_dim(cache)) s = cache.scratch if size(s, 1) != n || size(s, 2) < block cache.scratch = Matrix{Tv}(undef, n, block) @@ -153,16 +493,16 @@ second call hits the `C_NULL` guards. Used both by `symbolic_factor!` mid-life (drop old handles before re-analyzing) and by the GC finalizer. """ function _free_klu_handles!( - cache::KLULinSolveCache{Tv}, -) where {Tv <: Union{Float64, ComplexF64}} + cache::KLULinSolveCache{Tv, Ti}, +) where {Tv, Ti} if cache.numeric != C_NULL num_ref = Ref(cache.numeric) - _free_numeric!(Tv, num_ref, cache.common) + _free_numeric!(Tv, Ti, num_ref, cache.common) cache.numeric = num_ref[] end if cache.symbolic != C_NULL sym_ref = Ref(cache.symbolic) - klu_l_free_symbolic!(sym_ref, cache.common) + _free_symbolic!(Ti, sym_ref, cache.common) cache.symbolic = sym_ref[] end return nothing @@ -180,8 +520,8 @@ scope. Requires that a numeric factor has been built before (so `cache.nzval` is populated) and the symbolic factor is still valid. """ function _recover_factorization!( - cache::KLULinSolveCache{Tv}, -) where {Tv <: Union{Float64, ComplexF64}} + cache::KLULinSolveCache{Tv, Ti}, +) where {Tv, Ti} cache.symbolic == C_NULL && error( "KLULinSolveCache: cannot recover without a symbolic factor.", ) @@ -190,11 +530,12 @@ function _recover_factorization!( ) if cache.numeric != C_NULL num_ref = Ref(cache.numeric) - _free_numeric!(Tv, num_ref, cache.common) + _free_numeric!(Tv, Ti, num_ref, cache.common) cache.numeric = num_ref[] end num = _factor_call( - Tv, pointer(cache.colptr), pointer(cache.rowval), + Tv, Ti, + pointer(cache.colptr), pointer(cache.rowval), pointer(cache.nzval), cache.symbolic, cache.common, ) num == C_NULL && klu_throw(cache.common[], "klu_factor (recovery)") @@ -202,8 +543,11 @@ function _recover_factorization!( return cache end -@inline function _check_pattern_match(cache::KLULinSolveCache, - A::SparseMatrixCSC, op::AbstractString) +@inline function _check_pattern_match( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseMatrixCSC, + op::AbstractString, +) where {Tv, Ti} Acolptr = getcolptr(A) Arowval = rowvals(A) if length(Acolptr) != length(cache.colptr) || @@ -217,13 +561,13 @@ end # Increment-compare-decrement: avoids allocating a 1-indexed copy. The # `try/finally` restores `colptr`/`rowval` even on InterruptException so # the cache is never left with off-by-one structural arrays. - cache.colptr .+= 1 - cache.rowval .+= 1 + cache.colptr .+= one(Ti) + cache.rowval .+= one(Ti) bad = try (cache.colptr != Acolptr) || (cache.rowval != Arowval) finally - cache.colptr .-= 1 - cache.rowval .-= 1 + cache.colptr .-= one(Ti) + cache.rowval .-= one(Ti) end if bad throw(ArgumentError( @@ -237,15 +581,19 @@ end symbolic_factor!(cache, A) Free any cached symbolic/numeric factor, replace the structural arrays with -`A`'s pattern, and run `klu_l_analyze`. +`A`'s pattern, and run `klu_analyze` / `klu_l_analyze`. """ -function symbolic_factor!(cache::KLULinSolveCache{Tv}, - A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}} +function symbolic_factor!( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseMatrixCSC{Tv, Ti}, +) where {Tv, Ti} n = _dim(cache) - if size(A, 1) != n || size(A, 2) != n - throw(DimensionMismatch( - "Cannot factor: cache is $(n)×$(n) but A is $(size(A)).", - )) + if size(A, 1) != Int(n) || size(A, 2) != Int(n) + throw( + DimensionMismatch( + "Cannot factor: cache is $(Int(n))×$(Int(n)) but A is $(size(A)).", + ), + ) end _free_klu_handles!(cache) @@ -253,16 +601,14 @@ function symbolic_factor!(cache::KLULinSolveCache{Tv}, Arowval = rowvals(A) resize!(cache.colptr, length(Acolptr)) copyto!(cache.colptr, Acolptr) - cache.colptr .-= 1 + cache.colptr .-= one(Ti) resize!(cache.rowval, length(Arowval)) copyto!(cache.rowval, Arowval) - cache.rowval .-= 1 + cache.rowval .-= one(Ti) - sym = klu_l_analyze( - Int64(n), pointer(cache.colptr), pointer(cache.rowval), cache.common, - ) - sym == C_NULL && klu_throw(cache.common[], "klu_l_analyze") - cache.symbolic = sym + sym = _analyze_call(Ti, n, pointer(cache.colptr), pointer(cache.rowval), cache.common) + sym == C_NULL && klu_throw(cache.common[], "klu_analyze") + cache.symbolic = reinterpret(Ptr{Cvoid}, sym) return cache end @@ -272,17 +618,19 @@ end If `cache.reuse_symbolic`, optionally verify the structure matches and reuse the existing analysis. Otherwise, rerun `symbolic_factor!`. """ -function symbolic_refactor!(cache::KLULinSolveCache{Tv}, - A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}} +function symbolic_refactor!( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseMatrixCSC{Tv, Ti}, +) where {Tv, Ti} if !cache.reuse_symbolic return symbolic_factor!(cache, A) end if cache.check_pattern n = _dim(cache) - if size(A, 1) != n || size(A, 2) != n + if size(A, 1) != Int(n) || size(A, 2) != Int(n) throw( DimensionMismatch( - "Cannot refactor: cache is $(n)×$(n) but A is $(size(A)).", + "Cannot refactor: cache is $(Int(n))×$(Int(n)) but A is $(size(A)).", ), ) end @@ -298,15 +646,18 @@ Compute (or refresh) the numeric factorization. The first call after `symbolic_factor!` invokes `klu_*_factor`; subsequent calls invoke `klu_*_refactor` and reuse the existing numeric struct. """ -function numeric_refactor!(cache::KLULinSolveCache{Tv}, - A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}} +function numeric_refactor!( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseMatrixCSC{Tv, Ti}, +) where {Tv, Ti} cache.symbolic == C_NULL && error( "KLULinSolveCache: call symbolic_factor! before numeric_refactor!.", ) Anz = nonzeros(A) if cache.numeric == C_NULL num = _factor_call( - Tv, pointer(cache.colptr), pointer(cache.rowval), + Tv, Ti, + pointer(cache.colptr), pointer(cache.rowval), pointer(Anz), cache.symbolic, cache.common, ) num == C_NULL && klu_throw(cache.common[], "klu_factor") @@ -314,7 +665,8 @@ function numeric_refactor!(cache::KLULinSolveCache{Tv}, else cache.check_pattern && _check_pattern_match(cache, A, "numeric_refactor") ok = _refactor_call( - Tv, pointer(cache.colptr), pointer(cache.rowval), + Tv, Ti, + pointer(cache.colptr), pointer(cache.rowval), pointer(Anz), cache.symbolic, cache.numeric, cache.common, ) ok != 1 && klu_throw(cache.common[], "klu_refactor") @@ -334,8 +686,10 @@ Equivalent to `symbolic_factor!(cache, A); numeric_refactor!(cache, A)`. Use this on a freshly constructed cache, or after `_free_klu_handles!` has cleared the handles, to bring the cache to a factored state. """ -function full_factor!(cache::KLULinSolveCache{Tv}, - A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}} +function full_factor!( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseMatrixCSC{Tv, Ti}, +) where {Tv, Ti} symbolic_factor!(cache, A) numeric_refactor!(cache, A) return cache @@ -351,8 +705,10 @@ the matrix values have changed; if the structure has also changed and the cache was built with `reuse_symbolic = false`, the symbolic analysis is rerun as well. """ -function full_refactor!(cache::KLULinSolveCache{Tv}, - A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}} +function full_refactor!( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseMatrixCSC{Tv, Ti}, +) where {Tv, Ti} symbolic_refactor!(cache, A) numeric_refactor!(cache, A) return cache @@ -363,11 +719,94 @@ end Build a cache for `A` and immediately compute the full factorization. """ -function klu_factorize(A::SparseMatrixCSC{Tv, Int}; +function klu_factorize( + A::SparseMatrixCSC{Tv, Ti}; reuse_symbolic::Bool = true, check_pattern::Bool = true, -) where {Tv <: Union{Float64, ComplexF64}} - cache = KLULinSolveCache(A; - reuse_symbolic = reuse_symbolic, check_pattern = check_pattern) +) where {Tv <: Union{Float64, ComplexF64}, Ti <: Union{Int32, Int64}} + cache = KLULinSolveCache( + A; + reuse_symbolic = reuse_symbolic, + check_pattern = check_pattern, + ) return full_factor!(cache, A) end + +# --------------------------------------------------------------------------- +# Performance / diagnostic surface +# --------------------------------------------------------------------------- + +""" + sort_factors!(cache) -> cache + +Sort the columns of the cached L and U factors in place via libklu's +`klu_sort` / `klu_l_sort`. KLU's numeric phase stores factor columns in +arbitrary order; sorting once after the first factor improves cache locality +on every subsequent `solve!` / `tsolve!`. The cost is `O(nnz_factor)` and +is amortized over many repeated solves — a win whenever the cache is used +for ≥ a few solves on the same factorization. + +Idempotent and `refactor`-stable: sorting after the initial factor persists +through `numeric_refactor!` because refactor preserves the column layout. +Only call this if the cache will be reused for multiple solves; for a +one-shot solve it is pure overhead. + +Float64 only. +""" +function sort_factors!(cache::KLULinSolveCache{Float64, Ti}) where {Ti} + is_factored(cache) || + error("sort_factors!: cache must be factored before sorting.") + ok = _sort_call(Ti, cache.symbolic, cache.numeric, cache.common) + ok != 1 && klu_throw(cache.common[], "klu_sort") + return cache +end + +""" + condest!(cache) -> Float64 + +Compute the 1-norm condition-number estimate of the cached factorization +via libklu's `klu_condest`. The result lands in `cache.common[].condest` +and is also returned. Cost is roughly two extra solves; use sparingly. + +Useful when deciding whether iterative refinement is worth running, or for +flagging near-singular Jacobians in Newton-Raphson loops. + +Float64 only. +""" +function condest!( + cache::KLULinSolveCache{Float64, Ti}, +) where {Ti} + is_factored(cache) || + error("condest!: cache must be factored before condest.") + isempty(cache.nzval) && error( + "condest!: requires a previous numeric_refactor! to have populated nzval.", + ) + ok = _condest_call( + Ti, + pointer(cache.colptr), + pointer(cache.nzval), + cache.symbolic, + cache.numeric, + cache.common, + ) + ok != 1 && klu_throw(cache.common[], "klu_condest") + return Float64(cache.common[].condest) +end + +""" + rcond!(cache) -> Float64 + +Compute the cheap reciprocal-condition estimate +`min(|diag(U)|)/max(|diag(U)|)` via libklu's `klu_rcond`. The result lands +in `cache.common[].rcond` and is also returned. Faster than `condest!` but +less reliable as a conditioning indicator. + +Float64 only. +""" +function rcond!(cache::KLULinSolveCache{Float64, Ti}) where {Ti} + is_factored(cache) || + error("rcond!: cache must be factored before rcond.") + ok = _rcond_call(Ti, cache.symbolic, cache.numeric, cache.common) + ok != 1 && klu_throw(cache.common[], "klu_rcond") + return Float64(cache.common[].rcond) +end diff --git a/src/KLUWrapper/klu_jll_bindings.jl b/src/KLUWrapper/klu_jll_bindings.jl index ff0ed0a92..b525a6572 100644 --- a/src/KLUWrapper/klu_jll_bindings.jl +++ b/src/KLUWrapper/klu_jll_bindings.jl @@ -1,9 +1,32 @@ -# Bindings into libklu (SuiteSparse_jll), restricted to the SuiteSparse_long -# (`klu_l_*` and `klu_zl_*`) entry points used by KLULinSolveCache. +# Bindings into libklu (SuiteSparse_jll), covering both index-type +# entry-point families: +# - `klu_l_*` / `klu_zl_*` for `SuiteSparse_long` (Int64) indices +# - `klu_*` / `klu_z_*` for `int` (Int32) indices +# +# Each ccall is wrapped in `@klu_lock` so that all libklu activity in the +# process serializes through `_LIBKLU_LOCK`. This includes finalizer paths +# (`klu_*_free_*`), which can fire on any thread at any safepoint and would +# otherwise race against in-flight `solve!` calls on a different cache. See +# `_LIBKLU_LOCK` in `KLUWrapper.jl` for the empirical evidence (intermittent +# `KLU_INVALID` return; SEGV at `klu_solve.c:118`). import LinearAlgebra import SuiteSparse_jll: libklu +# --------------------------------------------------------------------------- +# Status codes and shared error helper +# --------------------------------------------------------------------------- + +const KLU_OK = 0 +const KLU_SINGULAR = 1 +const KLU_OUT_OF_MEMORY = -2 +const KLU_INVALID = -3 +const KLU_TOO_LARGE = -4 + +# --------------------------------------------------------------------------- +# Int64 (SuiteSparse_long) family +# --------------------------------------------------------------------------- + # Layout matches `klu_l_common` in upstream `klu.h`. Must stay in sync. mutable struct KluLCommon tol::Cdouble @@ -40,19 +63,15 @@ mutable struct KluLNumeric end const SymbolicPtr = Ptr{KluLSymbolic} const NumericPtr = Ptr{KluLNumeric} -# Each ccall is wrapped in `@klu_lock` so that all libklu activity in the -# process serializes through `_LIBKLU_LOCK`. This includes finalizer paths -# (`klu_l_free_*`, `klu_zl_free_*`), which can fire on any thread at any -# safepoint and would otherwise race against in-flight `solve!` calls on a -# different cache. See `_LIBKLU_LOCK` in `KLUWrapper.jl` for the -# empirical evidence (intermittent `KLU_INVALID` return; SEGV at -# `klu_solve.c:118`). - klu_l_defaults!(common::Ref{KluLCommon}) = @klu_lock ccall((:klu_l_defaults, libklu), Cint, (Ptr{KluLCommon},), common) -function klu_l_analyze(n::Int64, ap::Ptr{Int64}, ai::Ptr{Int64}, - common::Ref{KluLCommon}) +function klu_l_analyze( + n::Int64, + ap::Ptr{Int64}, + ai::Ptr{Int64}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_analyze, libklu), SymbolicPtr, @@ -61,8 +80,10 @@ function klu_l_analyze(n::Int64, ap::Ptr{Int64}, ai::Ptr{Int64}, ) end -function klu_l_free_symbolic!(symbolic_ref::Ref{SymbolicPtr}, - common::Ref{KluLCommon}) +function klu_l_free_symbolic!( + symbolic_ref::Ref{SymbolicPtr}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_free_symbolic, libklu), Cint, @@ -71,8 +92,13 @@ function klu_l_free_symbolic!(symbolic_ref::Ref{SymbolicPtr}, ) end -function klu_l_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble}, - symbolic::SymbolicPtr, common::Ref{KluLCommon}) +function klu_l_factor( + ap::Ptr{Int64}, + ai::Ptr{Int64}, + ax::Ptr{Cdouble}, + symbolic::SymbolicPtr, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_factor, libklu), NumericPtr, @@ -81,19 +107,37 @@ function klu_l_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble}, ) end -function klu_l_refactor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble}, - symbolic::SymbolicPtr, numeric::NumericPtr, common::Ref{KluLCommon}) +function klu_l_refactor( + ap::Ptr{Int64}, + ai::Ptr{Int64}, + ax::Ptr{Cdouble}, + symbolic::SymbolicPtr, + numeric::NumericPtr, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_refactor, libklu), Cint, - (Ptr{Int64}, Ptr{Int64}, Ptr{Cdouble}, SymbolicPtr, NumericPtr, - Ptr{KluLCommon}), + ( + Ptr{Int64}, + Ptr{Int64}, + Ptr{Cdouble}, + SymbolicPtr, + NumericPtr, + Ptr{KluLCommon}, + ), ap, ai, ax, symbolic, numeric, common, ) end -function klu_l_solve(symbolic::SymbolicPtr, numeric::NumericPtr, - ldim::Int64, nrhs::Int64, b::Ptr{Cdouble}, common::Ref{KluLCommon}) +function klu_l_solve( + symbolic::SymbolicPtr, + numeric::NumericPtr, + ldim::Int64, + nrhs::Int64, + b::Ptr{Cdouble}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_solve, libklu), Cint, @@ -102,8 +146,14 @@ function klu_l_solve(symbolic::SymbolicPtr, numeric::NumericPtr, ) end -function klu_l_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr, - ldim::Int64, nrhs::Int64, b::Ptr{Cdouble}, common::Ref{KluLCommon}) +function klu_l_tsolve( + symbolic::SymbolicPtr, + numeric::NumericPtr, + ldim::Int64, + nrhs::Int64, + b::Ptr{Cdouble}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_tsolve, libklu), Cint, @@ -112,8 +162,10 @@ function klu_l_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr, ) end -function klu_l_free_numeric!(numeric_ref::Ref{NumericPtr}, - common::Ref{KluLCommon}) +function klu_l_free_numeric!( + numeric_ref::Ref{NumericPtr}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_free_numeric, libklu), Cint, @@ -122,8 +174,69 @@ function klu_l_free_numeric!(numeric_ref::Ref{NumericPtr}, ) end -function klu_zl_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64}, - symbolic::SymbolicPtr, common::Ref{KluLCommon}) +# Performance knobs / diagnostics — Int64 family. + +# klu_l_sort sorts the columns of the L and U factors in place. KLU's +# numeric phase stores L/U columns in arbitrary order; sorting once after +# the factor improves cache locality on every subsequent solve. Cheap +# (O(nnz_factor)) and idempotent. Call once after the first numeric +# factor; refactor preserves the sort. +function klu_l_sort( + symbolic::SymbolicPtr, + numeric::NumericPtr, + common::Ref{KluLCommon}, +) + return @klu_lock ccall( + (:klu_l_sort, libklu), + Cint, + (SymbolicPtr, NumericPtr, Ptr{KluLCommon}), + symbolic, numeric, common, + ) +end + +# klu_l_condest computes a 1-norm condition number estimate, populating +# `common.condest`. Costs roughly two extra solves. Useful for iterative +# refinement (informs tolerance choice) and as a diagnostic for near-singular +# matrices. Caller reads result from `cache.common[].condest`. +function klu_l_condest( + ap::Ptr{Int64}, + ax::Ptr{Cdouble}, + symbolic::SymbolicPtr, + numeric::NumericPtr, + common::Ref{KluLCommon}, +) + return @klu_lock ccall( + (:klu_l_condest, libklu), + Cint, + (Ptr{Int64}, Ptr{Cdouble}, SymbolicPtr, NumericPtr, Ptr{KluLCommon}), + ap, ax, symbolic, numeric, common, + ) +end + +# klu_l_rcond fills `common.rcond` with the cheap diagonal-ratio +# reciprocal condition estimate (min(|diag(U)|)/max(|diag(U)|)). Faster +# than condest but less reliable. +function klu_l_rcond( + symbolic::SymbolicPtr, + numeric::NumericPtr, + common::Ref{KluLCommon}, +) + return @klu_lock ccall( + (:klu_l_rcond, libklu), + Cint, + (SymbolicPtr, NumericPtr, Ptr{KluLCommon}), + symbolic, numeric, common, + ) +end + +# Complex / Int64 +function klu_zl_factor( + ap::Ptr{Int64}, + ai::Ptr{Int64}, + ax::Ptr{ComplexF64}, + symbolic::SymbolicPtr, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_zl_factor, libklu), NumericPtr, @@ -132,42 +245,74 @@ function klu_zl_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64}, ) end -function klu_zl_refactor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64}, - symbolic::SymbolicPtr, numeric::NumericPtr, common::Ref{KluLCommon}) +function klu_zl_refactor( + ap::Ptr{Int64}, + ai::Ptr{Int64}, + ax::Ptr{ComplexF64}, + symbolic::SymbolicPtr, + numeric::NumericPtr, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_zl_refactor, libklu), Cint, - (Ptr{Int64}, Ptr{Int64}, Ptr{ComplexF64}, SymbolicPtr, NumericPtr, - Ptr{KluLCommon}), + ( + Ptr{Int64}, + Ptr{Int64}, + Ptr{ComplexF64}, + SymbolicPtr, + NumericPtr, + Ptr{KluLCommon}, + ), ap, ai, ax, symbolic, numeric, common, ) end -function klu_zl_solve(symbolic::SymbolicPtr, numeric::NumericPtr, - ldim::Int64, nrhs::Int64, b::Ptr{ComplexF64}, common::Ref{KluLCommon}) +function klu_zl_solve( + symbolic::SymbolicPtr, + numeric::NumericPtr, + ldim::Int64, + nrhs::Int64, + b::Ptr{ComplexF64}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_zl_solve, libklu), Cint, - (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64}, - Ptr{KluLCommon}), + (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64}, Ptr{KluLCommon}), symbolic, numeric, ldim, nrhs, b, common, ) end -function klu_zl_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr, - ldim::Int64, nrhs::Int64, b::Ptr{ComplexF64}, conj_solve::Cint, - common::Ref{KluLCommon}) +function klu_zl_tsolve( + symbolic::SymbolicPtr, + numeric::NumericPtr, + ldim::Int64, + nrhs::Int64, + b::Ptr{ComplexF64}, + conj_solve::Cint, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_zl_tsolve, libklu), Cint, - (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64}, Cint, - Ptr{KluLCommon}), + ( + SymbolicPtr, + NumericPtr, + Int64, + Int64, + Ptr{ComplexF64}, + Cint, + Ptr{KluLCommon}, + ), symbolic, numeric, ldim, nrhs, b, conj_solve, common, ) end -function klu_zl_free_numeric!(numeric_ref::Ref{NumericPtr}, - common::Ref{KluLCommon}) +function klu_zl_free_numeric!( + numeric_ref::Ref{NumericPtr}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_zl_free_numeric, libklu), Cint, @@ -176,14 +321,305 @@ function klu_zl_free_numeric!(numeric_ref::Ref{NumericPtr}, ) end -# Status codes from klu.h. -const KLU_OK = 0 -const KLU_SINGULAR = 1 -const KLU_OUT_OF_MEMORY = -2 -const KLU_INVALID = -3 -const KLU_TOO_LARGE = -4 +# --------------------------------------------------------------------------- +# Int32 (int) family — mirror of the Int64 set above +# --------------------------------------------------------------------------- + +# Layout matches `klu_common` (Int32 path) in upstream `klu.h`. Differs from +# `KluLCommon` only in the four rank/diag fields, which are `int` instead of +# `SuiteSparse_long`. +mutable struct KluCommon + tol::Cdouble + memgrow::Cdouble + initmem_amd::Cdouble + initmem::Cdouble + maxwork::Cdouble + btf::Cint + ordering::Cint + scale::Cint + user_order::Ptr{Cvoid} + user_data::Ptr{Cvoid} + halt_if_singular::Cint + status::Cint + nrealloc::Cint + structural_rank::Cint + numerical_rank::Cint + singular_col::Cint + noffdiag::Cint + flops::Cdouble + rcond::Cdouble + condest::Cdouble + rgrowth::Cdouble + work::Cdouble + memusage::Csize_t + mempeak::Csize_t + KluCommon() = new() +end + +mutable struct KluSymbolic end +mutable struct KluNumeric end + +const SymbolicPtr32 = Ptr{KluSymbolic} +const NumericPtr32 = Ptr{KluNumeric} + +klu_defaults!(common::Ref{KluCommon}) = + @klu_lock ccall((:klu_defaults, libklu), Cint, (Ptr{KluCommon},), common) + +function klu_analyze( + n::Cint, + ap::Ptr{Cint}, + ai::Ptr{Cint}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_analyze, libklu), + SymbolicPtr32, + (Cint, Ptr{Cint}, Ptr{Cint}, Ptr{KluCommon}), + n, ap, ai, common, + ) +end + +function klu_free_symbolic!( + symbolic_ref::Ref{SymbolicPtr32}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_free_symbolic, libklu), + Cint, + (Ptr{SymbolicPtr32}, Ptr{KluCommon}), + symbolic_ref, common, + ) +end + +function klu_factor( + ap::Ptr{Cint}, + ai::Ptr{Cint}, + ax::Ptr{Cdouble}, + symbolic::SymbolicPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_factor, libklu), + NumericPtr32, + (Ptr{Cint}, Ptr{Cint}, Ptr{Cdouble}, SymbolicPtr32, Ptr{KluCommon}), + ap, ai, ax, symbolic, common, + ) +end + +function klu_refactor( + ap::Ptr{Cint}, + ai::Ptr{Cint}, + ax::Ptr{Cdouble}, + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_refactor, libklu), + Cint, + ( + Ptr{Cint}, + Ptr{Cint}, + Ptr{Cdouble}, + SymbolicPtr32, + NumericPtr32, + Ptr{KluCommon}, + ), + ap, ai, ax, symbolic, numeric, common, + ) +end + +function klu_solve( + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + ldim::Cint, + nrhs::Cint, + b::Ptr{Cdouble}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_solve, libklu), + Cint, + (SymbolicPtr32, NumericPtr32, Cint, Cint, Ptr{Cdouble}, Ptr{KluCommon}), + symbolic, numeric, ldim, nrhs, b, common, + ) +end + +function klu_tsolve( + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + ldim::Cint, + nrhs::Cint, + b::Ptr{Cdouble}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_tsolve, libklu), + Cint, + (SymbolicPtr32, NumericPtr32, Cint, Cint, Ptr{Cdouble}, Ptr{KluCommon}), + symbolic, numeric, ldim, nrhs, b, common, + ) +end + +function klu_free_numeric!( + numeric_ref::Ref{NumericPtr32}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_free_numeric, libklu), + Cint, + (Ptr{NumericPtr32}, Ptr{KluCommon}), + numeric_ref, common, + ) +end + +# Performance knobs / diagnostics — Int32 family. + +function klu_sort( + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_sort, libklu), + Cint, + (SymbolicPtr32, NumericPtr32, Ptr{KluCommon}), + symbolic, numeric, common, + ) +end + +function klu_condest( + ap::Ptr{Cint}, + ax::Ptr{Cdouble}, + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_condest, libklu), + Cint, + (Ptr{Cint}, Ptr{Cdouble}, SymbolicPtr32, NumericPtr32, Ptr{KluCommon}), + ap, ax, symbolic, numeric, common, + ) +end + +function klu_rcond( + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_rcond, libklu), + Cint, + (SymbolicPtr32, NumericPtr32, Ptr{KluCommon}), + symbolic, numeric, common, + ) +end + +# Complex / Int32 +function klu_z_factor( + ap::Ptr{Cint}, + ai::Ptr{Cint}, + ax::Ptr{ComplexF64}, + symbolic::SymbolicPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_z_factor, libklu), + NumericPtr32, + (Ptr{Cint}, Ptr{Cint}, Ptr{ComplexF64}, SymbolicPtr32, Ptr{KluCommon}), + ap, ai, ax, symbolic, common, + ) +end + +function klu_z_refactor( + ap::Ptr{Cint}, + ai::Ptr{Cint}, + ax::Ptr{ComplexF64}, + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_z_refactor, libklu), + Cint, + ( + Ptr{Cint}, + Ptr{Cint}, + Ptr{ComplexF64}, + SymbolicPtr32, + NumericPtr32, + Ptr{KluCommon}, + ), + ap, ai, ax, symbolic, numeric, common, + ) +end + +function klu_z_solve( + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + ldim::Cint, + nrhs::Cint, + b::Ptr{ComplexF64}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_z_solve, libklu), + Cint, + ( + SymbolicPtr32, + NumericPtr32, + Cint, + Cint, + Ptr{ComplexF64}, + Ptr{KluCommon}, + ), + symbolic, numeric, ldim, nrhs, b, common, + ) +end + +function klu_z_tsolve( + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + ldim::Cint, + nrhs::Cint, + b::Ptr{ComplexF64}, + conj_solve::Cint, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_z_tsolve, libklu), + Cint, + ( + SymbolicPtr32, + NumericPtr32, + Cint, + Cint, + Ptr{ComplexF64}, + Cint, + Ptr{KluCommon}, + ), + symbolic, numeric, ldim, nrhs, b, conj_solve, common, + ) +end + +function klu_z_free_numeric!( + numeric_ref::Ref{NumericPtr32}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_z_free_numeric, libklu), + Cint, + (Ptr{NumericPtr32}, Ptr{KluCommon}), + numeric_ref, common, + ) +end + +# --------------------------------------------------------------------------- +# Shared error throw — dispatches on the common-struct width +# --------------------------------------------------------------------------- -function klu_throw(common::KluLCommon, op::AbstractString) +function klu_throw(common::Union{KluLCommon, KluCommon}, op::AbstractString) s = common.status s == KLU_SINGULAR && throw(LinearAlgebra.SingularException(Int(common.singular_col + 1))) diff --git a/src/KLUWrapper/solve_dense.jl b/src/KLUWrapper/solve_dense.jl index 7ce29b78c..85d758170 100644 --- a/src/KLUWrapper/solve_dense.jl +++ b/src/KLUWrapper/solve_dense.jl @@ -5,17 +5,19 @@ Solve `A · X = B` in place. `B::StridedVecOrMat{Tv}` must have first-dimension size equal to `cache.n` and unit stride in the first dimension. Multiple columns of `B` are handled in a single libklu call. """ -function solve!(cache::KLULinSolveCache{Tv}, - B::StridedVecOrMat{Tv}) where {Tv <: Union{Float64, ComplexF64}} +function solve!( + cache::KLULinSolveCache{Tv, Ti}, + B::StridedVecOrMat{Tv}, +) where {Tv, Ti} is_factored(cache) || error("KLULinSolveCache: not factored yet.") n = _dim(cache) - size(B, 1) == n || throw(DimensionMismatch( - "size(B, 1) = $(size(B, 1)), cache n = $(n)", + size(B, 1) == Int(n) || throw(DimensionMismatch( + "size(B, 1) = $(size(B, 1)), cache n = $(Int(n))", )) stride(B, 1) == 1 || throw(ArgumentError( "B must have unit stride in the first dimension.", )) - nrhs = Int64(size(B, 2)) + nrhs = size(B, 2) nrhs == 0 && return B # Snapshot KLU's preconditions plus identity info — gated by # `KLU_POOL_DEBUG`. `klu_l_solve` returns FALSE with `KLU_INVALID` when @@ -31,7 +33,7 @@ function solve!(cache::KLULinSolveCache{Tv}, pre_b_ptr = pointer(B) end ok = _solve_call( - Tv, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common, + Tv, Ti, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common, ) if ok == 0 @static if KLU_POOL_DEBUG @@ -57,21 +59,23 @@ In-place solve `Aᵀ · X = B` (or `Aᴴ · X = B` when `conjugate=true` on the complex path). Same shape requirements as `solve!`. The `conjugate` keyword is ignored on the real path. """ -function tsolve!(cache::KLULinSolveCache{Tv}, - B::StridedVecOrMat{Tv}; conjugate::Bool = false, -) where {Tv <: Union{Float64, ComplexF64}} +function tsolve!( + cache::KLULinSolveCache{Tv, Ti}, + B::StridedVecOrMat{Tv}; + conjugate::Bool = false, +) where {Tv, Ti} is_factored(cache) || error("KLULinSolveCache: not factored yet.") n = _dim(cache) - size(B, 1) == n || throw(DimensionMismatch( - "size(B, 1) = $(size(B, 1)), cache n = $(n)", + size(B, 1) == Int(n) || throw(DimensionMismatch( + "size(B, 1) = $(size(B, 1)), cache n = $(Int(n))", )) stride(B, 1) == 1 || throw(ArgumentError( "B must have unit stride in the first dimension.", )) - nrhs = Int64(size(B, 2)) + nrhs = size(B, 2) nrhs == 0 && return B ok = _tsolve_call( - Tv, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common; + Tv, Ti, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common; conjugate = conjugate, ) ok == 0 && klu_throw(cache.common[], "klu_tsolve") @@ -83,7 +87,9 @@ end Allocating solve, mirroring `LinearAlgebra.Factorization`'s API. """ -function Base.:\(cache::KLULinSolveCache{Tv}, - B::StridedVecOrMat{Tv}) where {Tv <: Union{Float64, ComplexF64}} +function Base.:\( + cache::KLULinSolveCache{Tv, Ti}, + B::StridedVecOrMat{Tv}, +) where {Tv, Ti} return solve!(cache, copy(B)) end diff --git a/src/KLUWrapper/solve_sparse_rhs.jl b/src/KLUWrapper/solve_sparse_rhs.jl index a20679f24..089fa06a0 100644 --- a/src/KLUWrapper/solve_sparse_rhs.jl +++ b/src/KLUWrapper/solve_sparse_rhs.jl @@ -20,20 +20,23 @@ Not thread-safe (mutates per-cache scratch). Callers should serialize access through a per-cache lock if invoked from multiple threads. """ function solve_sparse!( - cache::KLULinSolveCache{Tv}, - B::SparseMatrixCSC{Tb, Int}, + cache::KLULinSolveCache{Tv, Ti}, + B::SparseMatrixCSC{Tb, <:Integer}, out::AbstractMatrix{Tv}; block::Int = SPARSE_RHS_DEFAULT_BLOCK, -) where {Tv <: Union{Float64, ComplexF64}, Tb <: Number} +) where {Tv, Ti, Tb <: Number} is_factored(cache) || error("KLULinSolveCache: not factored yet.") block >= 1 || throw(ArgumentError("block must be >= 1; got $(block)")) n = _dim(cache) - size(B, 1) == n || throw(DimensionMismatch( - "size(B, 1) = $(size(B, 1)), cache n = $(n)", - )) - size(out, 1) == n && size(out, 2) == size(B, 2) || throw(DimensionMismatch( - "out has size $(size(out)); expected $((n, size(B, 2))).", + size(B, 1) == Int(n) || throw(DimensionMismatch( + "size(B, 1) = $(size(B, 1)), cache n = $(Int(n))", )) + size(out, 1) == Int(n) && size(out, 2) == size(B, 2) || + throw( + DimensionMismatch( + "out has size $(size(out)); expected $((Int(n), size(B, 2))).", + ), + ) nb = size(B, 2) nb == 0 && return out @@ -71,7 +74,7 @@ function solve_sparse!( # helper) to keep the inner-loop closure from capturing the per-chunk # `npack`, which would force Julia to box it. ok = _solve_call( - Tv, cache.symbolic, cache.numeric, n, Int64(npack), + Tv, Ti, cache.symbolic, cache.numeric, n, npack, pointer(scratch), cache.common, ) if ok == 0 && cache.common[].status == KLU_INVALID @@ -79,7 +82,7 @@ function solve_sparse!( objectid(cache) _recover_factorization!(cache) ok = _solve_call( - Tv, cache.symbolic, cache.numeric, n, Int64(npack), + Tv, Ti, cache.symbolic, cache.numeric, n, npack, pointer(scratch), cache.common, ) end @@ -96,12 +99,13 @@ function solve_sparse!( end """Allocating wrapper around `solve_sparse!`.""" -function solve_sparse(cache::KLULinSolveCache{Tv}, - B::SparseMatrixCSC{<:Number, Int}; +function solve_sparse( + cache::KLULinSolveCache{Tv, Ti}, + B::SparseMatrixCSC{<:Number, <:Integer}; block::Int = SPARSE_RHS_DEFAULT_BLOCK, -) where {Tv <: Union{Float64, ComplexF64}} +) where {Tv, Ti} return solve_sparse!( - cache, B, Matrix{Tv}(undef, _dim(cache), size(B, 2)); + cache, B, Matrix{Tv}(undef, Int(_dim(cache)), size(B, 2)); block = block, ) end diff --git a/src/PowerNetworkMatrices.jl b/src/PowerNetworkMatrices.jl index 5239f5859..23ce6e4c5 100644 --- a/src/PowerNetworkMatrices.jl +++ b/src/PowerNetworkMatrices.jl @@ -89,8 +89,12 @@ import .KLUWrapper: n_valid, is_factored +include("AccelerateWrapper/AccelerateWrapper.jl") +import .AccelerateWrapper: AAFactorCache, aa_factorize, aa_spmm!, aa_spmv! + include("linalg_settings.jl") include("solver_dispatch.jl") +include("iterative_refinement.jl") function __init__() something(get_linalg_backend_check(), false) && check_linalg_backend() @@ -137,14 +141,12 @@ include("virtual_modf_calculations.jl") include("system_utils.jl") include("serialization.jl") -# Declare functions that will be defined by extensions -# These need to be declared so extensions can extend them +# Forward declarations for symbols still defined inside package extensions. +# AppleAccelerate-related functions live in `src/` now (no extension needed) +# and are not redeclared here. function _calculate_PTDF_matrix_MKLPardiso end -function _calculate_PTDF_matrix_AppleAccelerate end function _calculate_LODF_matrix_MKLPardiso end -function _calculate_LODF_matrix_AppleAccelerate end function _pardiso_sequential_LODF! end function _pardiso_single_LODF! end -function _create_apple_accelerate_factorization end end diff --git a/src/PowerflowMatrixTypes.jl b/src/PowerflowMatrixTypes.jl index 98d103289..cd1aa2ab9 100644 --- a/src/PowerflowMatrixTypes.jl +++ b/src/PowerflowMatrixTypes.jl @@ -1,7 +1,7 @@ const DC_ABA_Matrix_Factorized = ABA_Matrix{ Tuple{Vector{Int64}, Vector{Int64}}, Tuple{Dict{Int64, Int64}, Dict{Int64, Int64}}, - KLULinSolveCache{Float64}, + KLULinSolveCache{Float64, Int64}, } const DC_ABA_Matrix_Unfactorized = ABA_Matrix{ Tuple{Vector{Int64}, Vector{Int64}}, diff --git a/src/common.jl b/src/common.jl index 70a1b50ff..c506170ad 100644 --- a/src/common.jl +++ b/src/common.jl @@ -6,6 +6,23 @@ function _add_to_collection!( return end +""" + _build_bus_to_valid_idx(n_buses, valid_ix) -> Vector{Int} + +Build the inverse of `valid_ix`: a length-`n_buses` vector where entry `b` +is the position of bus `b` inside `valid_ix`, or 0 if `b` is a reference +bus. Used by the Virtual\\* row-computation hot path so it can iterate the +nonzeros of a sparse `BA` column directly (O(nnz_col)) instead of scanning +the full bus axis (O(n_buses)) and bisecting the CSC for each entry. +""" +function _build_bus_to_valid_idx(n_buses::Int, valid_ix::Vector{Int}) + bus_to_valid_idx = zeros(Int, n_buses) + @inbounds for (i, b) in enumerate(valid_ix) + bus_to_valid_idx[b] = i + end + return bus_to_valid_idx +end + function _add_to_collection!( collection_tr3w::Vector{PSY.ThreeWindingTransformer}, transformer_tr3w::PSY.ThreeWindingTransformer, diff --git a/src/definitions.jl b/src/definitions.jl index 7b1bc991a..95e7d4b64 100644 --- a/src/definitions.jl +++ b/src/definitions.jl @@ -22,14 +22,16 @@ abstract type LinearSolverType end struct KLUSolver <: LinearSolverType end struct DenseSolver <: LinearSolverType end struct MKLPardisoSolver <: LinearSolverType end -struct AppleAccelerateSolver <: LinearSolverType end +struct AppleAccelerateLUSolver <: LinearSolverType end -const SUPPORTED_LINEAR_SOLVERS = ("KLU", "MKLPardiso", "AppleAccelerate", "Dense") +const SUPPORTED_LINEAR_SOLVERS = + ("KLU", "MKLPardiso", "AppleAccelerateLU", "Dense") @inline function resolve_linear_solver(s::String) s == "KLU" && return KLUSolver() s == "Dense" && return DenseSolver() s == "MKLPardiso" && return MKLPardisoSolver() - s == "AppleAccelerate" && return AppleAccelerateSolver() + s == "AppleAccelerateLU" && return AppleAccelerateLUSolver() + s == "AppleAccelerate" && return AppleAccelerateLUSolver() error("Unsupported linear solver: $s. Supported: $SUPPORTED_LINEAR_SOLVERS") end diff --git a/src/iterative_refinement.jl b/src/iterative_refinement.jl new file mode 100644 index 000000000..50e01f86e --- /dev/null +++ b/src/iterative_refinement.jl @@ -0,0 +1,153 @@ +import LinearAlgebra: norm, mul! + +""" + DEFAULT_REFINEMENT_MAX_ITER :: Int + +Default iteration cap for `solve_w_refinement!`. +""" +const DEFAULT_REFINEMENT_MAX_ITER = 25 + +# --- backend dispatch bridge --------------------------------------------------- +# +# The refinement body uses three primitives — in-place `solve!`, `is_factored`, +# and `_dim` — each of which is defined inside `KLUWrapper` and +# `AccelerateWrapper` as a *separate* function. The bridge below lets the +# shared body reach the right backend by multiple dispatch instead of an `isa` +# branch, and keeps the body itself backend-agnostic. + +_refine_solve!(K::KLULinSolveCache, r::StridedVecOrMat) = solve!(K, r) +_refine_solve!(K::AAFactorCache, r::StridedVecOrMat) = AccelerateWrapper.solve!(K, r) + +_refine_is_factored(K::KLULinSolveCache) = is_factored(K) +_refine_is_factored(K::AAFactorCache) = AccelerateWrapper.is_factored(K) + +_refine_dim(K::KLULinSolveCache) = Int(KLUWrapper._dim(K)) +_refine_dim(K::AAFactorCache) = AccelerateWrapper._dim(K) + +""" + solve_w_refinement!(cache, A, X, B; tol=…, max_iters=…) -> X + +Solve `A · X = B` in place using `cache` and apply iterative refinement +until `norm(B − A·X, 1) < norm(B, 1) * tol`, or until refinement stops +improving. `X` must be pre-allocated by the caller with the same shape as +`B`; it is overwritten with the refined solution. + +This is the non-allocating variant. For a one-shot allocating variant that +returns a fresh `X`, see `solve_w_refinement`. + +Supports `cache::KLULinSolveCache` (KLU backend, any `{Tv, Ti}`) and +`cache::AAFactorCache` (Apple Accelerate backend, `Cdouble` only). The +cache must already be factored (`is_factored(cache) == true`). The function +does not mutate `cache`'s factor; it only triggers in-place `solve!` calls +on a residual buffer. + +`max_iters` caps the refinement loop (default +`DEFAULT_REFINEMENT_MAX_ITER`). The default `tol` is `sqrt(eps(real(Tv)))`, +which is conservative for power-flow Newton-Raphson Jacobians. + +Useful when the cached factor is of an ill-conditioned matrix — e.g., +a Newton-Raphson Jacobian near a saddle point or a network reduction +interface — where a single back-solve carries enough error to slow NR +convergence. Cost per refinement iteration: one sparse matrix-vector +product plus one `solve!` against the cached factor. +""" +function solve_w_refinement!( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseArrays.SparseMatrixCSC{Tv, Ti}, + X::StridedVecOrMat{Tv}, + B::StridedVecOrMat{Tv}; + tol::Real = sqrt(eps(real(Tv))), + max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER, +) where {Tv, Ti} + return _solve_w_refinement_body!(cache, A, X, B, tol, max_iters) +end + +function solve_w_refinement!( + cache::AAFactorCache, + A::SparseArrays.SparseMatrixCSC{Cdouble, <:Integer}, + X::StridedVecOrMat{Cdouble}, + B::StridedVecOrMat{Cdouble}; + tol::Real = sqrt(eps(Cdouble)), + max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER, +) + return _solve_w_refinement_body!(cache, A, X, B, tol, max_iters) +end + +function _solve_w_refinement_body!( + cache, + A::SparseArrays.SparseMatrixCSC, + X::StridedVecOrMat, + B::StridedVecOrMat, + tol::Real, + max_iters::Int, +) + _refine_is_factored(cache) || error( + "solve_w_refinement!: cache must be factored. " * + "Call `full_factor!(cache, A)` first.", + ) + size(X) == size(B) || + throw(DimensionMismatch("X is $(size(X)); B is $(size(B)).")) + size(B, 1) == _refine_dim(cache) || throw( + DimensionMismatch( + "size(B, 1) = $(size(B, 1)); cache n = $(_refine_dim(cache)).", + ), + ) + + Tv = eltype(X) + fill!(X, zero(Tv)) + # `X = 0` ⇒ initial residual `r = B - A·0 = B`. Copy once; subsequent + # iterations reuse `r` in place via `mul!(r, A, X); @. r = B - r`, + # which avoids the temporary `A * X` allocation `B - A * X` would create + # each refinement step. + r = copy(B) + bNorm = norm(B, 1) + iters = 0 + while iters < max_iters && norm(r, 1) >= bNorm * tol + prev_err = norm(r, 1) + _refine_solve!(cache, r) + X .+= r + mul!(r, A, X) + @. r = B - r + iters += 1 + if norm(r, 1) > prev_err + @debug "Iterative refinement diverging; returning best-so-far." iters + return X + end + end + @debug "Iterative refinement converged." iters + return X +end + +""" + solve_w_refinement(cache, A, B; tol=…, max_iters=…) -> X + +Allocating wrapper around `solve_w_refinement!`. Allocates `X` matching +`B`'s shape, then refines. +""" +function solve_w_refinement( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseArrays.SparseMatrixCSC{Tv, Ti}, + B::StridedVecOrMat{Tv}; + tol::Real = sqrt(eps(real(Tv))), + max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER, +) where {Tv, Ti} + X = similar(B) + return solve_w_refinement!( + cache, A, X, B; + tol = tol, max_iters = max_iters, + ) +end + +function solve_w_refinement( + cache::AAFactorCache, + A::SparseArrays.SparseMatrixCSC{Cdouble, <:Integer}, + B::StridedVecOrMat{Cdouble}; + tol::Real = sqrt(eps(Cdouble)), + max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER, +) + X = similar(B) + return solve_w_refinement!( + cache, A, X, B; + tol = tol, max_iters = max_iters, + ) +end diff --git a/src/linalg_settings.jl b/src/linalg_settings.jl index c79962189..4d2efb265 100644 --- a/src/linalg_settings.jl +++ b/src/linalg_settings.jl @@ -1,7 +1,9 @@ -# Extensions are loaded when trigger packages (Pardiso, AppleAccelerate) are loaded +# The MKL/Pardiso path still uses the package-extension mechanism (Pardiso.jl +# is the only consumer-facing way to access the MKL Pardiso solver). The +# Apple Accelerate path no longer does — `AccelerateWrapper` is built in via +# a `@static if Sys.isapple()` gate. -# Check if MKL/Pardiso extension is available at runtime function _has_mkl_pardiso_ext() ext = Base.get_extension(@__MODULE__, :MKLPardisoExt) return !isnothing(ext) @@ -12,24 +14,66 @@ _mkl_pardiso_install_error() = Install the Pardiso package: julia> using Pkg; Pkg.add(\"Pardiso\")""" -# Check if AppleAccelerate extension is available at runtime -function _has_apple_accelerate_ext() - ext = Base.get_extension(@__MODULE__, :AppleAccelerateExt) - return !isnothing(ext) +# Minimum macOS for the AppleAccelerate (libSparse LU) backend. The +# `SparseFactorizationLU` code is API_AVAILABLE(macos(15.5)); older +# libSparse rejects factorization type 80. +const _AA_MIN_MACOS = v"15.5" + +# Query the running macOS product version via the `kern.osproductversion` +# sysctl (libc, no subprocess). Returns a VersionNumber, or v"0" if the +# sysctl is unavailable (treated as "too old"). +function _macos_product_version() + Sys.isapple() || return v"0" + buf = Vector{UInt8}(undef, 64) + len = Ref{Csize_t}(length(buf)) + rc = ccall( + :sysctlbyname, Cint, + (Cstring, Ptr{UInt8}, Ptr{Csize_t}, Ptr{Cvoid}, Csize_t), + "kern.osproductversion", buf, len, C_NULL, 0, + ) + rc == 0 || return v"0" + s = String(buf[1:(len[] - 1)]) # NUL-terminated; drop the NUL + try + return VersionNumber(s) + catch + return v"0" + end end -_apple_accelerate_install_error() = - """The AppleAccelerate extension is not available. - This solver is only available on macOS. - Install AppleAccelerate: - julia> using Pkg; Pkg.add(\"AppleAccelerate\")""" +_macos_at_least(v::VersionNumber) = _macos_product_version() >= v -# _create_apple_accelerate_factorization is defined in ext/AppleAccelerateExt.jl -# when AppleAccelerate package is loaded +_has_apple_accelerate_backend() = Sys.isapple() && _macos_at_least(_AA_MIN_MACOS) + +function _apple_accelerate_unavailable_error() + if Sys.isapple() + return """The Apple Accelerate sparse backend requires macOS $(_AA_MIN_MACOS.major).$(_AA_MIN_MACOS.minor) or newer \ + (libSparse LU / SparseFactorizationLU is API_AVAILABLE(macos(15.5))); \ + detected macOS $(_macos_product_version()). Use the KLU solver (the default fallback).""" + end + return """The Apple Accelerate sparse backend is macOS-only (Sys.isapple() returned false). + Use the KLU solver (the default) on non-Apple platforms.""" +end + +""" + _default_linear_solver() -> String + +Default sparse linear solver name. Returns "AppleAccelerateLU" on macOS +$(_AA_MIN_MACOS.major).$(_AA_MIN_MACOS.minor)+ (Apple's built-in libSparse LU via `AccelerateWrapper`) +and "KLU" elsewhere (non-Apple, or macOS older than $(_AA_MIN_MACOS.major).$(_AA_MIN_MACOS.minor)). +Used as the default for the `linear_solver` keyword on PTDF / LODF / +VirtualPTDF / VirtualLODF / VirtualMODF constructors. +""" +function _default_linear_solver() + if Sys.isapple() && _macos_at_least(_AA_MIN_MACOS) + return "AppleAccelerateLU" + end + return "KLU" +end "Set a preference of the backend library for sparse linear algebra operations." function set_linalg_backend_preference(linalglib::Union{String, Nothing}) - if !isnothing(linalglib) && !(linalglib in ["MKLPardiso", "AppleAccelerate"]) + if !isnothing(linalglib) && + !(linalglib in ["MKLPardiso", "AppleAccelerateLU", "AppleAccelerate"]) throw( ArgumentError( "Unsupported sparse linear algebra backend requested: $(linalglib)", @@ -79,7 +123,7 @@ function check_linalg_backend() @info """The sparse linear algebra solver preference has been set to $(user_linalg_backend). To change this for your active project, call the function PowerNetworkMatrices.set_linalg_backend_preference() - with one of "MKLPardiso" or "AppleAccelerate", or `nothing` to turn off. + with one of "MKLPardiso", "AppleAccelerateLU", or `nothing` to turn off. """ end @@ -101,7 +145,10 @@ function check_linalg_backend() @info no_msg("Pardiso") @info "See https://github.com/JuliaSparse/Pardiso.jl for more details." end - if user_linalg_backend == "AppleAccelerate" + if ( + user_linalg_backend !== nothing && + startswith(user_linalg_backend, "AppleAccelerate") + ) @warn "AppleAccelerate is not supported on non-Apple systems." end end @@ -125,14 +172,17 @@ function check_linalg_backend() end end - if _has_apple_accelerate_ext() + if _has_apple_accelerate_backend() @info go_msg("AppleAccelerate") else - if user_linalg_backend == "AppleAccelerate" - @warn yo_msg("AppleAccelerate") + if ( + user_linalg_backend !== nothing && + startswith(user_linalg_backend, "AppleAccelerate") + ) + @warn """AppleAccelerate was requested but is unavailable: it requires \ + macOS $(_AA_MIN_MACOS.major).$(_AA_MIN_MACOS.minor)+ (detected macOS $(_macos_product_version())). \ + Falling back to KLU.""" end - @info no_msg("AppleAccelerate") - @info "See https://github.com/JuliaLinearAlgebra/AppleAccelerate.jl" end end end diff --git a/src/lodf_calculations.jl b/src/lodf_calculations.jl index 78dae430c..282938676 100644 --- a/src/lodf_calculations.jl +++ b/src/lodf_calculations.jl @@ -64,6 +64,40 @@ get_network_reduction_data(M::LODF) = M.network_reduction_data get_arc_lookup(M::LODF) = M.lookup[1] stores_transpose(::LODF) = true +# --- Demand-matrix short-circuit --------------------------------------------- +# +# The LODF computation builds a *diagonal* "demand" matrix `D = diag(m_V)` +# where `m_V[i] = 1 - PTDF·A[i, i]` (clamped to 1.0 at `LODF_ENTRY_TOLERANCE` +# to avoid divide-by-zero when an outage islands the line). The original +# code factored `D` and ran a triangular solve `D · X = ptdf_denominator`; +# that's a `factor + back-solve` over a diagonal, which collapses to +# element-wise row scaling. KLU's BTF short-circuits this internally so the +# overhead was modest; AA's libSparse and LAPACK's `getrf!`/`getrs!` do +# not, so the previous code was 3–5× slower on AA and order-of-magnitude +# slower on DENSE than necessary. Replace both with a direct row scaling. + +function _build_lodf_demand(ptdf_denominator::AbstractMatrix{Float64}, linecount::Int) + m_V = Vector{Float64}(undef, linecount) + @inbounds for i in 1:linecount + d = 1.0 - ptdf_denominator[i, i] + m_V[i] = d < LODF_ENTRY_TOLERANCE ? 1.0 : d + end + return m_V +end + +function _apply_lodf_demand!(M::AbstractMatrix{Float64}, m_V::Vector{Float64}) + IS.@assert_op size(M, 1) == length(m_V) + IS.@assert_op size(M, 1) == size(M, 2) + # `inv_dem .* M` mirrors what the triangular solve did internally — + # one reciprocal per row, then a row-wise multiply. The broadcast + # `M .*= inv_dem` scales each row `i` by `inv_dem[i]` because the + # length-n vector broadcasts down the first dimension. + inv_dem = 1.0 ./ m_V + M .*= inv_dem + M[SparseArrays.diagind(M)] .= -1.0 + return M +end + function _buildlodf( a::SparseArrays.SparseMatrixCSC{Int8, Int}, ptdf::Matrix{Float64}, @@ -92,9 +126,9 @@ end function _buildlodf( a::SparseArrays.SparseMatrixCSC{Int8, Int}, ptdf::Matrix{Float64}, - ::AppleAccelerateSolver, + ::AppleAccelerateLUSolver, ) - _has_apple_accelerate_ext() || error(_apple_accelerate_install_error()) + _has_apple_accelerate_backend() || error(_apple_accelerate_unavailable_error()) return _calculate_LODF_matrix_AppleAccelerate(a, ptdf) end @@ -131,20 +165,8 @@ function _calculate_LODF_matrix_KLU( solve_sparse!(k, a_t_valid, view(first_, valid_ix, :)) ptdf_denominator = first_' * ba - m_I = Int[] - m_V = Float64[] - for iline in 1:linecount - if (1.0 - ptdf_denominator[iline, iline]) < LODF_ENTRY_TOLERANCE - push!(m_I, iline) - push!(m_V, 1.0) - else - push!(m_I, iline) - push!(m_V, 1 - ptdf_denominator[iline, iline]) - end - end - Dem_cache = klu_factorize(SparseArrays.sparse(m_I, m_I, m_V)) - solve!(Dem_cache, ptdf_denominator) - ptdf_denominator[SparseArrays.diagind(ptdf_denominator)] .= -1.0 + m_V = _build_lodf_demand(ptdf_denominator, linecount) + _apply_lodf_demand!(ptdf_denominator, m_V) return ptdf_denominator end @@ -154,22 +176,9 @@ function _calculate_LODF_matrix_KLU( ) linecount = size(ptdf, 2) ptdf_denominator_t = a * ptdf - m_I = Int[] - m_V = Float64[] - for iline in 1:linecount - if (1.0 - ptdf_denominator_t[iline, iline]) < LODF_ENTRY_TOLERANCE - push!(m_I, iline) - push!(m_V, 1.0) - else - push!(m_I, iline) - push!(m_V, 1 - ptdf_denominator_t[iline, iline]) - end - end - Dem_cache = klu_factorize(SparseArrays.sparse(m_I, m_I, m_V)) + m_V = _build_lodf_demand(ptdf_denominator_t, linecount) lodf_t = copy(ptdf_denominator_t) - solve!(Dem_cache, lodf_t) - lodf_t[SparseArrays.diagind(lodf_t)] .= -1.0 - + _apply_lodf_demand!(lodf_t, m_V) return lodf_t end @@ -179,29 +188,41 @@ function _calculate_LODF_matrix_DENSE( ) linecount = size(ptdf, 2) ptdf_denominator_t = a * ptdf - m_V = Float64[] - for iline in 1:linecount - if (1.0 - ptdf_denominator_t[iline, iline]) < LODF_ENTRY_TOLERANCE - push!(m_V, 1.0) - else - push!(m_V, 1.0 - ptdf_denominator_t[iline, iline]) - end - end - (mV, bipiv, binfo) = getrf!(Matrix(LinearAlgebra.diagm(m_V))) - _binfo_check(binfo) - getrs!('N', mV, bipiv, ptdf_denominator_t) - ptdf_denominator_t[LinearAlgebra.diagind(ptdf_denominator_t)] .= -1.0 + m_V = _build_lodf_demand(ptdf_denominator_t, linecount) + _apply_lodf_demand!(ptdf_denominator_t, m_V) return ptdf_denominator_t end # _pardiso_sequential_LODF!, _pardiso_single_LODF!, _calculate_LODF_matrix_MKLPardiso # are defined in ext/MKLPardisoExt.jl when the Pardiso package is loaded -# _calculate_LODF_matrix_AppleAccelerate is defined in ext/AppleAccelerateExt.jl -# when the AppleAccelerate package is loaded +@static if Sys.isapple() + """ + Function for internal use only. + + Computes the LODF matrix using the internal Apple Accelerate backend + (`AccelerateWrapper`). Available only on macOS. Shape mirrors + `_calculate_LODF_matrix_KLU(a, ptdf)` exactly: factor the diagonal "demand" + matrix `diag(1 - PTDF·A)` and solve in place against `a · ptdf`. + + # Arguments + - `a::SparseArrays.SparseMatrixCSC{Int8, Int}`: Incidence Matrix + - `ptdf::Matrix{Float64}`: PTDF matrix + """ + function _calculate_LODF_matrix_AppleAccelerate( + a::SparseArrays.SparseMatrixCSC{Int8, Int}, + ptdf::Matrix{Float64}, + ) + linecount = size(ptdf, 2) + ptdf_denominator_t = a * ptdf + m_V = _build_lodf_demand(ptdf_denominator_t, linecount) + _apply_lodf_demand!(ptdf_denominator_t, m_V) + return ptdf_denominator_t + end +end """ - LODF(sys::PSY.System; linear_solver::String = "KLU", tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...) + LODF(sys::PSY.System; linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...) Construct a Line Outage Distribution Factor (LODF) matrix from a PowerSystems.System by computing the sensitivity of line flows to single line outages. This is the primary constructor for LODF @@ -211,7 +232,7 @@ analysis starting from system data. - `sys::PSY.System`: The power system from which to construct the LODF matrix # Keyword Arguments -- `linear_solver::String = "KLU"`: +- `linear_solver::String = _default_linear_solver()`: Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso" - `tol::Float64 = eps()`: Sparsification tolerance for dropping small matrix elements to reduce memory usage @@ -258,16 +279,12 @@ where A is the incidence matrix and PTDF is the power transfer distribution fact """ function LODF( sys::PSY.System; - linear_solver::String = "KLU", + linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs..., ) - Ymatrix = Ybus( - sys; - network_reductions = network_reductions, - kwargs..., - ) + Ymatrix = Ybus(sys; network_reductions = network_reductions, kwargs...) A = IncidenceMatrix(Ymatrix) BA = BA_Matrix(Ymatrix) ptdf = PTDF(A, BA) @@ -275,7 +292,7 @@ function LODF( end """ - LODF(A::IncidenceMatrix, PTDFm::PTDF; linear_solver::String = "KLU", tol::Float64 = eps()) + LODF(A::IncidenceMatrix, PTDFm::PTDF; linear_solver::String = _default_linear_solver(), tol::Float64 = eps()) Construct a Line Outage Distribution Factor (LODF) matrix from existing incidence and PTDF matrices. This constructor is more efficient when the prerequisite matrices are already available. @@ -285,7 +302,7 @@ This constructor is more efficient when the prerequisite matrices are already av - `PTDFm::PTDF`: The power transfer distribution factor matrix (should be non-sparsified for accuracy) # Keyword Arguments -- `linear_solver::String = "KLU"`: +- `linear_solver::String = _default_linear_solver()`: Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso" - `tol::Float64 = eps()`: Sparsification tolerance for the LODF matrix (not applied to input PTDF) @@ -327,7 +344,7 @@ where: function LODF( A::IncidenceMatrix, PTDFm::PTDF; - linear_solver::String = "KLU", + linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), ) solver = resolve_linear_solver(linear_solver) @@ -384,7 +401,9 @@ efficient when the prerequisite matrices with factorization are already availabl # Keyword Arguments - `linear_solver::String = "KLU"`: - Linear solver algorithm for matrix computations. Currently only "KLU" is supported + This constructor is intentionally KLU-only because `ABA.K` is always a + KLU factorization. The keyword is kept for API consistency; passing any + other value will error. - `tol::Float64 = eps()`: Sparsification tolerance for dropping small matrix elements @@ -431,6 +450,10 @@ function LODF( linear_solver::String = "KLU", tol::Float64 = eps(), ) + # NOTE: ABA.K is always a KLU factorization, so this constructor is + # KLU-only regardless of the `linear_solver` argument. The kwarg is kept + # for API consistency; passing anything other than "KLU" will error in + # `_buildlodf`. if !( isequal(A.network_reduction_data, BA.network_reduction_data) && isequal(BA.network_reduction_data, ABA.network_reduction_data) @@ -443,8 +466,7 @@ function LODF( subnetwork_axes = make_arc_arc_subnetwork_axes(A) ax_ref = make_ax_ref(get_arc_axis(A)) if tol > eps() - lodf_t = - _buildlodf(A.data, ABA.K, BA.data, Set(get_ref_bus_position(A)), solver) + lodf_t = _buildlodf(A.data, ABA.K, BA.data, Set(get_ref_bus_position(A)), solver) return LODF( sparsify(lodf_t, tol), (get_arc_axis(A), get_arc_axis(A)), diff --git a/src/ptdf_calculations.jl b/src/ptdf_calculations.jl index 132fe16b4..979a4fc67 100644 --- a/src/ptdf_calculations.jl +++ b/src/ptdf_calculations.jl @@ -110,8 +110,8 @@ function _buildptdf_from_matrices( BA::SparseArrays.SparseMatrixCSC{T, Int} where {T <: Union{Float32, Float64}}, ref_bus_positions::Set{Int}, dist_slack::Vector{Float64}, - ::AppleAccelerateSolver) - _has_apple_accelerate_ext() || error(_apple_accelerate_install_error()) + ::AppleAccelerateLUSolver) + _has_apple_accelerate_backend() || error(_apple_accelerate_unavailable_error()) return _calculate_PTDF_matrix_AppleAccelerate(A, BA, ref_bus_positions, dist_slack) end @@ -230,11 +230,62 @@ end # _calculate_PTDF_matrix_MKLPardiso is defined in ext/MKLPardisoExt.jl # when Pardiso package is loaded -# _calculate_PTDF_matrix_AppleAccelerate is defined in ext/AppleAccelerateExt.jl -# when AppleAccelerate package is loaded +@static if Sys.isapple() + """ + Function for internal use only. + + Computes the PTDF matrix using the internal Apple Accelerate backend + (`AccelerateWrapper`). Available only on macOS — non-Apple callers are + rejected by `_buildptdf_from_matrices` before reaching this entry. Shape + mirrors `_calculate_PTDF_matrix_KLU`: factor ABA via LU, then solve + `ABA · X = BA[valid_ix, :]` via the block-packed `solve_sparse!`. + + # Arguments + - `A::SparseArrays.SparseMatrixCSC{Int8, Int}`: Incidence Matrix + - `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`: BA matrix + - `ref_bus_positions::Set{Int}`: indexes of reference slack buses + - `dist_slack::Vector{Float64}`: distributed-slack weights + """ + function _calculate_PTDF_matrix_AppleAccelerate( + A::SparseArrays.SparseMatrixCSC{Int8, Int}, + BA::SparseArrays.SparseMatrixCSC{Float64, Int}, + ref_bus_positions::Set{Int}, + dist_slack::Vector{Float64}, + ) + linecount = size(BA, 2) + buscount = size(BA, 1) + if !isempty(dist_slack) && length(ref_bus_positions) != 1 + error( + "Distributed slack is not supported for systems with multiple reference buses.", + ) + end + if !isempty(dist_slack) && length(dist_slack) != buscount + error("Distributed bus specification doesn't match the number of buses.") + end + length(ref_bus_positions) < buscount || error( + "All buses are reference buses; PTDF is not defined.", + ) + + ABA = calculate_ABA_matrix(A, BA, ref_bus_positions) + cache = AccelerateWrapper.aa_factorize(ABA) + valid_ix = setdiff(1:buscount, ref_bus_positions) + PTDFm_t = zeros(buscount, linecount) + AccelerateWrapper.solve_sparse!( + cache, + BA[valid_ix, :], + view(PTDFm_t, valid_ix, :), + ) + + isempty(dist_slack) && return PTDFm_t + + @info "Distributed bus" + slack_array = reshape(dist_slack ./ sum(dist_slack), 1, buscount) + return PTDFm_t .- (slack_array * PTDFm_t) + end +end """ - PTDF(sys::PSY.System; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = "KLU", tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...) + PTDF(sys::PSY.System; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...) Construct a Power Transfer Distribution Factor (PTDF) matrix from a PowerSystems.System by computing the sensitivity of transmission line flows to bus power injections. This is the primary constructor @@ -247,7 +298,7 @@ for PTDF analysis starting from system data. - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`: Dictionary mapping bus numbers to distributed slack weights for realistic slack modeling. Empty dictionary uses single slack bus (default behavior) -- `linear_solver::String = "KLU"`: +- `linear_solver::String = _default_linear_solver()`: Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso" - `tol::Float64 = eps()`: Sparsification tolerance for dropping small matrix elements to reduce memory usage @@ -300,7 +351,7 @@ where A is the incidence matrix and B is the susceptance matrix. """ function PTDF(sys::PSY.System; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), - linear_solver = "KLU", + linear_solver = _default_linear_solver(), tol::Float64 = eps(), kwargs..., ) @@ -312,7 +363,7 @@ function PTDF(sys::PSY.System; end """ - PTDF(ybus::Ybus; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = "KLU", tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...) + PTDF(ybus::Ybus; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...) Construct a Power Transfer Distribution Factor (PTDF) matrix from existing Ybus matrix. This constructor is more efficient when the prerequisite matrices are already available and provides @@ -325,7 +376,7 @@ direct control over the underlying matrix computations. - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`: Dictionary mapping bus numbers to distributed slack weights for realistic slack modeling. Empty dictionary uses single slack bus (default behavior) -- `linear_solver::String = "KLU"`: +- `linear_solver::String = _default_linear_solver()`: Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso" - `tol::Float64 = eps()`: Sparsification tolerance for dropping small matrix elements to reduce memory usage @@ -370,7 +421,7 @@ where A is the incidence matrix and B is the susceptance matrix. """ function PTDF(ybus::Ybus; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), - linear_solver = "KLU", + linear_solver = _default_linear_solver(), tol::Float64 = eps(), ) A = IncidenceMatrix(ybus) @@ -385,7 +436,7 @@ function PTDF(ybus::Ybus; end """ - PTDF(A::IncidenceMatrix, BA::BA_Matrix; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = "KLU", tol::Float64 = eps()) + PTDF(A::IncidenceMatrix, BA::BA_Matrix; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = _default_linear_solver(), tol::Float64 = eps()) Construct a Power Transfer Distribution Factor (PTDF) matrix from existing incidence and BA matrices. This constructor is more efficient when the prerequisite matrices are already available and provides @@ -399,7 +450,7 @@ direct control over the underlying matrix computations. - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`: Dictionary mapping bus numbers to distributed slack participation factors. Empty dictionary uses single slack bus (reference bus from matrices) -- `linear_solver::String = "KLU"`: +- `linear_solver::String = _default_linear_solver()`: Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso" - `tol::Float64 = eps()`: Sparsification tolerance for dropping small matrix elements to reduce memory usage @@ -451,7 +502,7 @@ function PTDF( A::IncidenceMatrix, BA::BA_Matrix; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), - linear_solver = "KLU", + linear_solver = _default_linear_solver(), tol::Float64 = eps(), ) dist_slack_vector = if !(isempty(dist_slack)) diff --git a/src/solver_dispatch.jl b/src/solver_dispatch.jl index 7a31ba86c..4998d6c97 100644 --- a/src/solver_dispatch.jl +++ b/src/solver_dispatch.jl @@ -10,17 +10,20 @@ # the callback. One factor + one cache per Virtual matrix is what # remains — same throughput as the pool variant once the global lock # is in place. +# +# Apple's libSparse has no documented cross-handle corruption issue +# analogous to `_LIBKLU_LOCK`; the per-cache `solver_lock` we acquire +# here is sufficient for the `AAFactorCache` backend. """ with_solver(f, K, work_ba_col, temp_data, solver_lock) -> result Acquire `solver_lock`, then invoke `f(K, work_ba_col[1], temp_data[1])`. -Two overloads: one specialized on `KLULinSolveCache{Float64}` (the KLU -backend), one generic for any other factorization (`AppleAccelerate.AAFactorization`, -etc.). Both serialize through `solver_lock`; the per-cache scratch -slot at index 1 is the only slot — `work_ba_col` and `temp_data` are -single-element vectors, kept as `Vector{Vector{Float64}}` because -`_solve_factorization` and the AppleAccelerate extension are typed on +Three overloads: `KLULinSolveCache{Float64}` (KLU backend), `AAFactorCache` +(Apple Accelerate backend), and a generic fallback. All serialize through +`solver_lock`; the per-cache scratch slot at index 1 is the only slot — +`work_ba_col` and `temp_data` are single-element vectors, kept as +`Vector{Vector{Float64}}` because `_solve_factorization` is typed on `Vector{Float64}` and the two buffers have different lengths (`n_buses` vs. `n_buses - n_ref_buses`). """ @@ -34,6 +37,16 @@ function with_solver( return @lock solver_lock f(K, work_ba_col[1], temp_data[1]) end +function with_solver( + f::F, + K::AAFactorCache, + work_ba_col::Vector{Vector{Float64}}, + temp_data::Vector{Vector{Float64}}, + solver_lock::ReentrantLock, +) where {F} + return @lock solver_lock f(K, work_ba_col[1], temp_data[1]) +end + function with_solver( f::F, K::KT, diff --git a/src/virtual_lodf_calculations.jl b/src/virtual_lodf_calculations.jl index 322c146e2..3417393a4 100644 --- a/src/virtual_lodf_calculations.jl +++ b/src/virtual_lodf_calculations.jl @@ -53,6 +53,10 @@ callers can issue requests concurrently; the libklu work runs one at a time. - `valid_ix::Vector{Int}`: Vector containing the row/columns indices of matrices related the buses which are not slack ones. +- `bus_to_valid_idx::Vector{Int}`: + Inverse of `valid_ix`: `bus_to_valid_idx[b]` is the position of bus + `b` inside `valid_ix`, or 0 if `b` is a reference bus. Lets the hot + path iterate the nonzeros of a `BA` column directly. - `temp_data::Vector{Vector{Float64}}`: Single-element scratch vector kept as a `Vector{Vector{Float64}}` for uniform `with_solver` callback signatures. @@ -82,6 +86,7 @@ struct VirtualLODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}, K} <: axes::Ax lookup::L valid_ix::Vector{Int} + bus_to_valid_idx::Vector{Int} temp_data::Vector{Vector{Float64}} cache::RowCache cache_lock::ReentrantLock @@ -110,15 +115,6 @@ function Base.show(io::IO, ::MIME{Symbol("text/plain")}, array::VirtualLODF) return end -# Map bus index (1:n_buses) to its position in `valid_ix`; ref buses map to 0. -function _build_bus_to_valid_idx(n_buses::Int, valid_ix::Vector{Int})::Vector{Int} - bus_to_valid = zeros(Int, n_buses) - for (vi, bus_ix) in enumerate(valid_ix) - bus_to_valid[bus_ix] = vi - end - return bus_to_valid -end - """ _get_PTDF_A_diag(K, BA, A, ref_bus_positions) -> Vector{Float64} @@ -175,18 +171,21 @@ function _get_PTDF_A_diag( ba_col[valid_i] = ba_nz[k] end - _solve_factorization(K, ba_col) + # Read PTDF row from the returned buffer — backend-agnostic + # (KLU mutates `ba_col` and returns it; other backends may + # return a fresh vector, so capture the return value). + lin_solve = _solve_factorization(K, ba_col) - # ba_col is now PTDF row i in valid-index space; H[e,e] = ptdf[from] - ptdf[to]. + # H[e,e] = ptdf[from] - ptdf[to]; ref-bus entries are 0. f = arc_from_valid[i] t = arc_to_valid[i] v_f = if f > 0 - ba_col[f] + lin_solve[f] else 0.0 end v_t = if t > 0 - ba_col[t] + lin_solve[t] else 0.0 end @@ -265,6 +264,9 @@ struct with an empty cache. PSY system for which the matrix is constructed # Keyword Arguments +- `linear_solver::String = _default_linear_solver()`: Linear solver for the + ABA factorization. Options: "KLU", "AppleAccelerate". Defaults to + "AppleAccelerate" on macOS and "KLU" elsewhere. - `network_reduction::NetworkReduction`: Structure containing the details of the network reduction applied when computing the matrix - `kwargs...`: @@ -273,6 +275,7 @@ struct with an empty cache. function VirtualLODF( sys::PSY.System; dist_slack::Vector{Float64} = Float64[], + linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), max_cache_size::Int = MAX_CACHE_SIZE_MiB, persistent_arcs::Vector{Tuple{Int, Int}} = Vector{Tuple{Int, Int}}(), @@ -282,6 +285,7 @@ function VirtualLODF( if length(dist_slack) != 0 @info "Distributed bus" end + solver = resolve_linear_solver(linear_solver) Ymatrix = Ybus( sys; network_reductions = network_reductions, @@ -296,7 +300,7 @@ function VirtualLODF( subnetwork_axes = make_arc_arc_subnetwork_axes(A) BA = BA_Matrix(Ymatrix) ABA = calculate_ABA_matrix(A.data, BA.data, Set(ref_bus_positions)) - K = klu_factorize(ABA) + K = _create_factorization(solver, ABA) bus_ax = get_bus_axis(A) valid_ix = setdiff(1:length(bus_ax), ref_bus_positions) @@ -324,6 +328,7 @@ function VirtualLODF( # Single scratch slot — solves serialize via `solver_lock` + `_LIBKLU_LOCK`. temp_data = [zeros(length(bus_ax))] work_ba_col = [zeros(length(valid_ix))] + bus_to_valid_idx = _build_bus_to_valid_idx(length(bus_ax), valid_ix) return VirtualLODF( K, @@ -337,6 +342,7 @@ function VirtualLODF( axes, look_up, valid_ix, + bus_to_valid_idx, temp_data, empty_cache, ReentrantLock(), @@ -384,8 +390,17 @@ function _compute_lodf_row(vlodf::VirtualLODF, row::Int)::Vector{Float64} return with_solver( vlodf.K, vlodf.work_ba_col, vlodf.temp_data, vlodf.solver_lock, ) do K_solver, work_ba_col, temp_data - @inbounds for i in eachindex(vlodf.valid_ix) - work_ba_col[i] = vlodf.BA[vlodf.valid_ix[i], row] + # Sparse-only extraction: iterate BA[:, row] non-zeros (typically + # 2 per arc) instead of scanning the full bus axis. + fill!(work_ba_col, 0.0) + BA = vlodf.BA + bus_to_valid_idx = vlodf.bus_to_valid_idx + ba_rv = SparseArrays.rowvals(BA) + ba_nz = SparseArrays.nonzeros(BA) + @inbounds for k in SparseArrays.nzrange(BA, row) + valid_i = bus_to_valid_idx[ba_rv[k]] + valid_i > 0 || continue + work_ba_col[valid_i] = ba_nz[k] end lin_solve = _solve_factorization(K_solver, work_ba_col) @@ -515,9 +530,17 @@ function _getindex_partial( return with_solver( vlodf.K, vlodf.work_ba_col, vlodf.temp_data, vlodf.solver_lock, ) do K_solver, work_ba_col, temp_data - # Steps 1-2: Compute B⁻¹(b_e · ν_e) via KLU solve. - @inbounds for i in eachindex(vlodf.valid_ix) - work_ba_col[i] = vlodf.BA[vlodf.valid_ix[i], arc_idx] + # Steps 1-2: Compute B⁻¹(b_e · ν_e) via sparse-only BA-column + # extraction + solve. + fill!(work_ba_col, 0.0) + BA = vlodf.BA + bus_to_valid_idx = vlodf.bus_to_valid_idx + ba_rv = SparseArrays.rowvals(BA) + ba_nz = SparseArrays.nonzeros(BA) + @inbounds for k in SparseArrays.nzrange(BA, arc_idx) + valid_i = bus_to_valid_idx[ba_rv[k]] + valid_i > 0 || continue + work_ba_col[valid_i] = ba_nz[k] end lin_solve = _solve_factorization(K_solver, work_ba_col) diff --git a/src/virtual_modf_calculations.jl b/src/virtual_modf_calculations.jl index 29c2ba7d5..f11879d11 100644 --- a/src/virtual_modf_calculations.jl +++ b/src/virtual_modf_calculations.jl @@ -46,6 +46,10 @@ cache and skips the recomputation. Tuple of lookup dictionaries for indexing. - `valid_ix::Vector{Int}`: Indices of non-reference buses. +- `bus_to_valid_idx::Vector{Int}`: + Inverse of `valid_ix`: `bus_to_valid_idx[b]` is the position of bus + `b` inside `valid_ix`, or 0 if `b` is a reference bus. Lets the + Woodbury kernel iterate the nonzeros of a `BA` column directly. - `contingency_cache::Dict{Base.UUID, ContingencySpec}`: Resolved contingencies keyed by outage UUID. - `woodbury_cache::Dict{NetworkModification, WoodburyFactors}`: @@ -86,6 +90,7 @@ struct VirtualMODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}, K} <: axes::Ax lookup::L valid_ix::Vector{Int} + bus_to_valid_idx::Vector{Int} contingency_cache::Dict{Base.UUID, ContingencySpec} woodbury_cache::Dict{NetworkModification, WoodburyFactors} row_caches::Dict{NetworkModification, RowCache} @@ -178,6 +183,7 @@ function _compute_woodbury_factors( mat.BA, mat.arc_susceptances, mat.valid_ix, + mat.bus_to_valid_idx, modifications, ) end @@ -201,6 +207,7 @@ function _apply_woodbury_correction( mat.BA, mat.arc_susceptances, mat.valid_ix, + mat.bus_to_valid_idx, monitored_idx, wf, ) @@ -250,6 +257,9 @@ Outage supplemental attributes found in the system. # Keyword Arguments - `dist_slack::Vector{Float64}`: Distributed slack weights (default: empty) +- `linear_solver::String = _default_linear_solver()`: Linear solver for the + ABA factorization. Options: "KLU", "AppleAccelerate". Defaults to + "AppleAccelerate" on macOS and "KLU" elsewhere. - `tol::Float64`: Tolerance for row sparsification (default: eps()) - `max_cache_size::Int`: Max cache size in MiB per contingency (default: MAX_CACHE_SIZE_MiB) - `network_reductions::Vector{NetworkReduction}`: Network reductions to apply @@ -257,6 +267,7 @@ Outage supplemental attributes found in the system. function VirtualMODF( sys::PSY.System; dist_slack::Vector{Float64} = Float64[], + linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), max_cache_size::Int = MAX_CACHE_SIZE_MiB, network_reductions::Vector{NetworkReduction} = NetworkReduction[], @@ -266,6 +277,7 @@ function VirtualMODF( if length(dist_slack) != 0 @info "Distributed bus" end + solver = resolve_linear_solver(linear_solver) # Build network matrices (same path as VirtualLODF) Ymatrix = Ybus(sys; network_reductions = network_reductions, kwargs...) @@ -282,9 +294,10 @@ function VirtualMODF( BA = BA_Matrix(Ymatrix) ABA = calculate_ABA_matrix(A.data, BA.data, Set(ref_bus_positions)) - K = klu_factorize(ABA) + K = _create_factorization(solver, ABA) valid_ix = setdiff(1:length(bus_ax), ref_bus_positions) + bus_to_valid_idx = _build_bus_to_valid_idx(length(bus_ax), valid_ix) # Empty: populated lazily on first read of `vmodf.PTDF_A_diag`. PTDF_A_diag = Float64[] @@ -308,6 +321,7 @@ function VirtualMODF( axes, look_up, valid_ix, + bus_to_valid_idx, Dict{Base.UUID, ContingencySpec}(), Dict{NetworkModification, WoodburyFactors}(), Dict{NetworkModification, RowCache}(), diff --git a/src/virtual_ptdf_calculations.jl b/src/virtual_ptdf_calculations.jl index 89b15df62..d8b3f09d4 100644 --- a/src/virtual_ptdf_calculations.jl +++ b/src/virtual_ptdf_calculations.jl @@ -21,8 +21,8 @@ JuMP-side work (in callers) parallelizes freely. # Arguments - `K`: LU factorization of the ABA matrix. A `KLULinSolveCache{Float64}` for - the default KLU solver, or an `AppleAccelerate.AAFactorization{Float64}` - when the AppleAccelerate extension is loaded. + the default KLU solver, or an `AccelerateWrapper.AAFactorCache` when the + AppleAccelerate backend is selected on macOS. - `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`: BA matrix - `ref_bus_positions::Set{Int}`: @@ -48,6 +48,11 @@ JuMP-side work (in callers) parallelizes freely. - `valid_ix::Vector{Int}`: Vector containing the row/columns indices of matrices related the buses which are not slack ones. +- `bus_to_valid_idx::Vector{Int}`: + Inverse of `valid_ix`: `bus_to_valid_idx[b]` is the position of bus + `b` inside `valid_ix`, or 0 if `b` is a reference bus. Lets the hot + path iterate the nonzeros of a `BA` column instead of scanning the + full bus axis. - `cache::RowCache`: Cache where PTDF rows are stored. - `cache_lock::ReentrantLock`: @@ -80,6 +85,7 @@ struct VirtualPTDF{Ax, L <: NTuple{2, Dict}, K} <: lookup::L temp_data::Vector{Vector{Float64}} valid_ix::Vector{Int} + bus_to_valid_idx::Vector{Int} cache::RowCache cache_lock::ReentrantLock subnetwork_axes::Dict{Int, Ax} @@ -120,8 +126,9 @@ struct with an empty cache. - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`: Dictionary of weights to be used as distributed slack bus. The distributed slack dictionary must have the same number of entries as the number of buses. -- `linear_solver::String = "KLU"`: - Linear solver to use for factorization. Options: "KLU", "AppleAccelerate" +- `linear_solver::String = _default_linear_solver()`: + Linear solver to use for factorization. Options: "KLU", "AppleAccelerateLU". + Defaults to "AppleAccelerateLU" on macOS 15.5+ and "KLU" elsewhere. - `tol::Float64 = eps()`: Tolerance related to sparsification and values to drop. - `max_cache_size::Int`: @@ -136,7 +143,7 @@ struct with an empty cache. function VirtualPTDF( sys::PSY.System; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), - linear_solver::String = "KLU", + linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), max_cache_size::Int = MAX_CACHE_SIZE_MiB, persistent_arcs::Vector{Tuple{Int, Int}} = Vector{Tuple{Int, Int}}(), @@ -169,11 +176,11 @@ function _create_factorization( end function _create_factorization( - ::AppleAccelerateSolver, + ::AppleAccelerateLUSolver, ABA::SparseArrays.SparseMatrixCSC{Float64, Int}, ) - _has_apple_accelerate_ext() || error(_apple_accelerate_install_error()) - return _create_apple_accelerate_factorization(ABA) + _has_apple_accelerate_backend() || error(_apple_accelerate_unavailable_error()) + return AccelerateWrapper.aa_factorize(ABA) end function _create_factorization( @@ -181,7 +188,7 @@ function _create_factorization( ::SparseArrays.SparseMatrixCSC{Float64, Int}, ) return error( - "Only KLU and AppleAccelerate solvers are supported for VirtualPTDF factorization.", + "Only KLU and AppleAccelerateLU solvers are supported for VirtualPTDF factorization.", ) end @@ -197,8 +204,9 @@ The return is a VirtualPTDF struct with an empty cache. - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`: Dictionary of weights to be used as distributed slack bus. The distributed slack dictionary must have the same number of entries as the number of buses. -- `linear_solver::String = "KLU"`: - Linear solver to use for factorization. Options: "KLU", "AppleAccelerate" +- `linear_solver::String = _default_linear_solver()`: + Linear solver to use for factorization. Options: "KLU", "AppleAccelerateLU". + Defaults to "AppleAccelerateLU" on macOS 15.5+ and "KLU" elsewhere. - `tol::Float64 = eps()`: Tolerance related to sparsification and values to drop. - `max_cache_size::Int`: @@ -209,7 +217,7 @@ The return is a VirtualPTDF struct with an empty cache. function VirtualPTDF( ybus::Ybus; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), - linear_solver::String = "KLU", + linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), max_cache_size::Int = MAX_CACHE_SIZE_MiB, persistent_arcs::Vector{Tuple{Int, Int}} = Vector{Tuple{Int, Int}}(), @@ -261,6 +269,7 @@ function VirtualPTDF( # `Vector{Vector{Float64}}` so `with_solver`'s callback signature # stays uniform across solver backends. valid_ix = setdiff(1:length(bus_ax), ref_bus_positions) + bus_to_valid_idx = _build_bus_to_valid_idx(length(bus_ax), valid_ix) temp_data = [zeros(length(bus_ax))] work_ba_col = [zeros(length(valid_ix))] @@ -277,6 +286,7 @@ function VirtualPTDF( look_up, temp_data, valid_ix, + bus_to_valid_idx, empty_cache, ReentrantLock(), subnetwork_axes, @@ -321,17 +331,18 @@ if isdefined(Base, :print_array) # 0.7 and later Base.print_array(io::IO, X::VirtualPTDF) = "VirtualPTDF" end -# Helper function to solve with different factorization types. The -# `KLULinSolveCache` overload solves in place (zero-allocation hot path); -# the generic fallback delegates to `\` and is extended by the -# AppleAccelerate extension for `AAFactorization`. +# Helper function to solve with different factorization types. Both +# overloads solve in place (zero-allocation hot path). The KLU and Apple +# Accelerate backends are the only solvers supported here; adding a new +# backend requires extending this method. function _solve_factorization(K::KLULinSolveCache{Float64}, b::Vector{Float64}) solve!(K, b) return b end -function _solve_factorization(K, b::Vector{Float64}) - return K \ b +function _solve_factorization(K::AAFactorCache, b::Vector{Float64}) + AccelerateWrapper.solve!(K, b) + return b end function _compute_ptdf_row(vptdf::VirtualPTDF, row::Int)::Vector{Float64} @@ -350,12 +361,23 @@ function _compute_ptdf_row(vptdf::VirtualPTDF, row::Int)::Vector{Float64} return with_solver( vptdf.K, vptdf.work_ba_col, vptdf.temp_data, vptdf.solver_lock, ) do K_solver, work_ba_col, temp_data - valid_ix = vptdf.valid_ix - @inbounds for i in eachindex(valid_ix) - work_ba_col[i] = vptdf.BA[valid_ix[i], row] + # Extract BA[:, row] non-zeros into work_ba_col at non-ref-bus + # positions. Iterates only the nonzeros of the BA column (typically + # 2 per arc) instead of scanning the full bus axis and bisecting + # the CSC for each entry. + fill!(work_ba_col, 0.0) + BA = vptdf.BA + bus_to_valid_idx = vptdf.bus_to_valid_idx + ba_rv = SparseArrays.rowvals(BA) + ba_nz = SparseArrays.nonzeros(BA) + @inbounds for k in SparseArrays.nzrange(BA, row) + valid_i = bus_to_valid_idx[ba_rv[k]] + valid_i > 0 || continue + work_ba_col[valid_i] = ba_nz[k] end lin_solve = _solve_factorization(K_solver, work_ba_col) fill!(temp_data, 0.0) + valid_ix = vptdf.valid_ix @inbounds for i in eachindex(valid_ix) temp_data[valid_ix[i]] = lin_solve[i] end diff --git a/src/woodbury_kernel.jl b/src/woodbury_kernel.jl index ad29ba3b4..7f183e125 100644 --- a/src/woodbury_kernel.jl +++ b/src/woodbury_kernel.jl @@ -83,6 +83,7 @@ function _compute_woodbury_factors_impl( BA::SparseArrays.SparseMatrixCSC{Float64, Int}, arc_sus::Vector{Float64}, valid_ix::Vector{Int}, + bus_to_valid_idx::Vector{Int}, modifications::Tuple{Vararg{ArcModification}}, )::WoodburyFactors M = length(modifications) @@ -97,13 +98,19 @@ function _compute_woodbury_factors_impl( # Compute Z[:,j] = B⁻¹ν_j for each modified arc Z = Matrix{Float64}(undef, n_bus, M) + ba_rv_outer = SparseArrays.rowvals(BA) + ba_nz_outer = SparseArrays.nonzeros(BA) for (j, mod) in enumerate(modifications) e = mod.arc_index b_e = arc_sus[e] - @inbounds for i in eachindex(valid_ix) - work_ba_col[i] = BA[valid_ix[i], e] + # Sparse-only extraction of BA[:, e] into work_ba_col. + fill!(work_ba_col, 0.0) + @inbounds for k in SparseArrays.nzrange(BA, e) + valid_i = bus_to_valid_idx[ba_rv_outer[k]] + valid_i > 0 || continue + work_ba_col[valid_i] = ba_nz_outer[k] end lin_solve = _solve_factorization(K, work_ba_col) @@ -159,6 +166,7 @@ function _apply_woodbury_correction_impl( BA::SparseArrays.SparseMatrixCSC{Float64, Int}, arc_sus::Vector{Float64}, valid_ix::Vector{Int}, + bus_to_valid_idx::Vector{Int}, monitored_idx::Int, wf::WoodburyFactors, )::Vector{Float64} @@ -177,10 +185,15 @@ function _apply_woodbury_correction_impl( return zeros(n_bus) end - # z_m = B⁻¹ν_m / b_mon_pre via KLU solve on BA column + # z_m = B⁻¹ν_m / b_mon_pre via sparse-only BA-column extraction + solve. b_mon_pre = arc_sus[monitored_idx] - @inbounds for i in eachindex(valid_ix) - work_ba_col[i] = BA[valid_ix[i], monitored_idx] + fill!(work_ba_col, 0.0) + ba_rv_mon = SparseArrays.rowvals(BA) + ba_nz_mon = SparseArrays.nonzeros(BA) + @inbounds for k in SparseArrays.nzrange(BA, monitored_idx) + valid_i = bus_to_valid_idx[ba_rv_mon[k]] + valid_i > 0 || continue + work_ba_col[valid_i] = ba_nz_mon[k] end lin_solve = _solve_factorization(K, work_ba_col) @@ -226,7 +239,8 @@ function _compute_woodbury_factors( ) do K_solver, work_ba_col, temp_data _compute_woodbury_factors_impl( K_solver, work_ba_col, temp_data, - mat.BA, mat.arc_susceptances, mat.valid_ix, modifications, + mat.BA, mat.arc_susceptances, mat.valid_ix, mat.bus_to_valid_idx, + modifications, ) end end @@ -241,7 +255,8 @@ function _apply_woodbury_correction( ) do K_solver, work_ba_col, temp_data _apply_woodbury_correction_impl( K_solver, work_ba_col, temp_data, - mat.BA, mat.arc_susceptances, mat.valid_ix, monitored_idx, wf, + mat.BA, mat.arc_susceptances, mat.valid_ix, mat.bus_to_valid_idx, + monitored_idx, wf, ) end end diff --git a/test/PowerNetworkMatricesTests.jl b/test/PowerNetworkMatricesTests.jl index b3857fe30..d3036822c 100644 --- a/test/PowerNetworkMatricesTests.jl +++ b/test/PowerNetworkMatricesTests.jl @@ -21,7 +21,7 @@ import Aqua Aqua.test_unbound_args(PowerNetworkMatrices) Aqua.test_undefined_exports(PowerNetworkMatrices) Aqua.test_ambiguities(PowerNetworkMatrices) -Aqua.test_stale_deps(PowerNetworkMatrices; ignore = [:AppleAccelerate, :Pardiso]) +Aqua.test_stale_deps(PowerNetworkMatrices; ignore = [:Pardiso]) Aqua.test_deps_compat(PowerNetworkMatrices) Aqua.find_persistent_tasks_deps(PowerNetworkMatrices) Aqua.test_persistent_tasks(PowerNetworkMatrices) diff --git a/test/runtests.jl b/test/runtests.jl index 1afa42b7c..ae21ff04f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,12 +1,11 @@ -# load system-specific packages before PNM: ensures relevant PNM extensions will be loaded. +# Load system-specific packages before PNM so relevant PNM extensions +# precompile. The Apple Accelerate backend is now built in to PNM and does +# not need a trigger package; only the MKL/Pardiso extension is gated on a +# `using` call. @static if (Sys.ARCH === :x86_64 || Sys.ARCH === :i686) && !Sys.isapple() import Pkg Pkg.add("Pardiso") using Pardiso -else - import Pkg - Pkg.add("AppleAccelerate") - using AppleAccelerate end using PowerNetworkMatrices diff --git a/test/test_accelerate_wrapper.jl b/test/test_accelerate_wrapper.jl new file mode 100644 index 000000000..d4376eb6a --- /dev/null +++ b/test/test_accelerate_wrapper.jl @@ -0,0 +1,245 @@ +import LinearAlgebra +import SparseArrays +import Random + +const _AA_TEST_SEED = 0xABCDEF + +function _random_spd(n::Int; density::Float64 = 0.05, scale::Real = 4) + M = SparseArrays.sprandn(n, n, density) + return SparseArrays.sparse(M + M' + scale * n * LinearAlgebra.I) +end + +# A general (non-symmetric) real sparse system: diagonally dominant so it +# is nonsingular but structurally and numerically unsymmetric. +function _random_unsymmetric(n::Int; density::Float64 = 0.04) + M = SparseArrays.sprandn(n, n, density) + return SparseArrays.sparse(M + (2 * n) * LinearAlgebra.I) +end + +@testset "AccelerateWrapper: smoke" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 200 + ABA = _random_spd(n) + + cache = PNM.AccelerateWrapper.aa_factorize(ABA) + @test PNM.AccelerateWrapper.is_factored(cache) + + b = randn(n) + x = copy(b) + PNM.AccelerateWrapper.solve!(cache, x) + @test isapprox(ABA * x, b; atol = 1e-9) + + B = SparseArrays.sprandn(n, 120, 0.03) + for j in 1:5:120 + B[:, j] .= 0 + end + SparseArrays.dropzeros!(B) + out = zeros(n, 120) + PNM.AccelerateWrapper.solve_sparse!(cache, B, out) + @test isapprox(out, ABA \ Matrix(B); atol = 1e-9) +end + +@testset "AccelerateWrapper: _create_factorization dispatch" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 200 + ABA = _random_spd(n) + + K_klu = PNM._create_factorization(PNM.KLUSolver(), ABA) + K_aa = PNM._create_factorization(PNM.AppleAccelerateLUSolver(), ABA) + @test K_klu isa PNM.KLULinSolveCache{Float64} + @test K_aa isa PNM.AAFactorCache + + b_klu = randn(n) + b_aa = copy(b_klu) + PNM._solve_factorization(K_klu, b_klu) + PNM._solve_factorization(K_aa, b_aa) + @test isapprox(b_klu, b_aa; atol = 1e-9) +end + +@testset "AccelerateWrapper: with_solver resolves to concrete method" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 200 + ABA = _random_spd(n) + K_aa = PNM._create_factorization(PNM.AppleAccelerateLUSolver(), ABA) + + work = [zeros(n)] + tmp = [zeros(n)] + lk = ReentrantLock() + b = randn(n) + copyto!(work[1], b) + + result = PNM.with_solver(K_aa, work, tmp, lk) do K, wba, _td + PNM._solve_factorization(K, wba) + return copy(wba) + end + @test isapprox(ABA * result, b; atol = 1e-9) + + m = which( + PNM.with_solver, + ( + typeof(identity), + PNM.AAFactorCache, + Vector{Vector{Float64}}, + Vector{Vector{Float64}}, + ReentrantLock, + ), + ) + @test Base.unwrap_unionall(m.sig).parameters[3] === PNM.AAFactorCache +end + +@testset "AccelerateWrapper: KLU vs AA per-column solve parity" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 200 + nbranches = 40 + ABA = _random_spd(n) + BA = SparseArrays.sprandn(n, nbranches, 0.03) + + K_klu = PNM._create_factorization(PNM.KLUSolver(), ABA) + K_aa = PNM._create_factorization(PNM.AppleAccelerateLUSolver(), ABA) + + work_klu = zeros(n) + work_aa = zeros(n) + for col in 1:nbranches + work_klu .= Vector(BA[:, col]) + copyto!(work_aa, work_klu) + PNM._solve_factorization(K_klu, work_klu) + PNM._solve_factorization(K_aa, work_aa) + @test isapprox(work_klu, work_aa; atol = 1e-9) + end +end + +@testset "AccelerateWrapper: solve_w_refinement matches direct solve on AA cache" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 60 + ABA = _random_spd(n) + cache = PNM.AccelerateWrapper.aa_factorize(ABA) + + x_true = randn(n) + b = ABA * x_true + X = PNM.solve_w_refinement(cache, ABA, b) + @test isapprox(X, x_true; atol = 1e-10) +end + +@testset "AccelerateWrapper: solve_w_refinement! recovers ill-conditioned AA solve" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED + 1) + n = 80 + # Symmetric ill-conditioned tridiagonal — back-solve leaves residual that + # refinement closes. Less pathological than the KLU equivalent because + # AA's LU (threshold partial pivoting) keeps a higher effective pivot + # floor than KLU's BTF + LU on a nearly-defective tridiagonal. + A = SparseArrays.spdiagm( + 0 => fill(1.0, n), + 1 => fill(1.0 - 1e-6, n - 1), + -1 => fill(1.0 - 1e-6, n - 1), + ) + cache = PNM.AccelerateWrapper.aa_factorize(A) + + x_true = randn(n) + b = A * x_true + X = zeros(n) + PNM.solve_w_refinement!(cache, A, X, b; tol = 1e-12) + @test isapprox(X, x_true; atol = 1e-8) +end + +@testset "AccelerateWrapper: solve_w_refinement requires a factored AA cache" begin + PNM._has_apple_accelerate_backend() || return + n = 12 + A = SparseArrays.sparse(Float64.(LinearAlgebra.I(n))) + cache = PNM.AAFactorCache(A) # not factored + b = randn(n) + @test_throws ErrorException PNM.solve_w_refinement(cache, A, b) +end + +@testset "AccelerateWrapper: solve_w_refinement KLU vs AA parity" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED + 2) + n = 50 + A = _random_spd(n) + x_true = randn(n) + b = A * x_true + + K_klu = PNM.klu_factorize(A) + K_aa = PNM.AccelerateWrapper.aa_factorize(A) + + X_klu = PNM.solve_w_refinement(K_klu, A, b) + X_aa = PNM.solve_w_refinement(K_aa, A, b) + @test isapprox(X_klu, X_aa; atol = 1e-10) +end + +@testset "AccelerateWrapper: LU non-symmetric vs KLU parity" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 220 + A = _random_unsymmetric(n) + @test A != permutedims(A) # genuinely unsymmetric + + K_klu = PNM._create_factorization(PNM.KLUSolver(), A) + K_aa = PNM.AccelerateWrapper.aa_factorize(A) + @test PNM.AccelerateWrapper.is_factored(K_aa) + + b = randn(n) + x_klu = copy(b) + x_aa = copy(b) + PNM._solve_factorization(K_klu, x_klu) + PNM.AccelerateWrapper.solve!(K_aa, x_aa) + @test isapprox(x_aa, x_klu; rtol = 1e-9) + @test isapprox(A * x_aa, b; rtol = 1e-9) +end + +@testset "AccelerateWrapper: LU symmetric-ABA regression (solve_sparse!)" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 260 + ABA = _random_spd(n) # still works; LU has no symmetry requirement + + K_aa = PNM.AccelerateWrapper.aa_factorize(ABA) + + B = SparseArrays.sprandn(n, 64, 0.03) + out_aa = zeros(n, 64) + PNM.AccelerateWrapper.solve_sparse!(K_aa, B, out_aa) + @test isapprox(out_aa, ABA \ Matrix(B); rtol = 1e-8) +end + +@testset "AccelerateWrapper: LU numeric_refactor! reuse parity" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 180 + A1 = _random_unsymmetric(n) + cache = PNM.AccelerateWrapper.aa_factorize(A1) + + # Same pattern, different values. + A2 = SparseArrays.SparseMatrixCSC( + n, n, copy(SparseArrays.getcolptr(A1)), copy(SparseArrays.rowvals(A1)), + SparseArrays.nonzeros(A1) .+ 0.5, + ) + PNM.AccelerateWrapper.full_refactor!(cache, A2) + b = randn(n) + x = copy(b) + PNM.AccelerateWrapper.solve!(cache, x) + @test isapprox(A2 * x, b; rtol = 1e-8) +end + +@testset "AccelerateWrapper: LU singular throws on factor (KLU parity)" begin + PNM._has_apple_accelerate_backend() || return + n = 50 + S = SparseArrays.sparse(1.0 * LinearAlgebra.I, n, n) + S[n, n] = 0.0 # exactly singular + SparseArrays.dropzeros!(S) + # Both backends detect singularity at factor time (not solve time), which is + # the key parity guarantee. The exception TYPES differ by design (accepted + # divergence): KLU follows the LAPACK convention and throws + # `SingularException`, while libSparse fires its `reportError` callback with + # "Matrix is structurally singular." which our binding surfaces as a plain + # `ErrorException`. Callers needing uniform handling must catch both. + @test_throws LinearAlgebra.SingularException PNM._create_factorization( + PNM.KLUSolver(), + S, + ) + @test_throws ErrorException PNM.AccelerateWrapper.aa_factorize(S) +end diff --git a/test/test_klu_wrapper.jl b/test/test_klu_wrapper.jl index a4be6d48c..902c4a088 100644 --- a/test/test_klu_wrapper.jl +++ b/test/test_klu_wrapper.jl @@ -1,4 +1,6 @@ import SparseArrays +import LinearAlgebra +import Random @testset "KLU wrapper: real round-trip and refactor" begin n = 50 @@ -232,3 +234,168 @@ end alloc_warm = @allocated PNM.solve_sparse!(cache, B, out; block = 32) @test alloc_warm < n * size(B, 2) * sizeof(Float64) ÷ 4 end + +# --------------------------------------------------------------------------- +# Int32 index-type path +# --------------------------------------------------------------------------- + +@testset "KLU wrapper: Int32 real round-trip and refactor" begin + n = 50 + rng_vals = collect(1.0:n) + A64 = SparseArrays.spdiagm(0 => rng_vals .+ 1.0, + 1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1)) + A = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64) + x = collect(1.0:n) + b = A64 * x + + cache = PNM.klu_factorize(A) + @test PNM.is_factored(cache) + @test cache isa PNM.KLUWrapper.KLULinSolveCache{Float64, Int32} + @test size(cache) == (n, n) + + y = copy(b) + PNM.solve!(cache, y) + @test isapprox(y, x, atol = 1e-10) + + # Refactor with new values, same pattern. + A2_64 = SparseArrays.spdiagm(0 => rng_vals .+ 2.0, + 1 => fill(0.2, n - 1), -1 => fill(0.2, n - 1)) + A2 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A2_64) + x2 = randn(n) + b2 = A2_64 * x2 + PNM.numeric_refactor!(cache, A2) + y2 = copy(b2) + PNM.solve!(cache, y2) + @test isapprox(y2, x2, atol = 1e-9) +end + +@testset "KLU wrapper: Int32 vs Int64 solves agree bit-for-bit" begin + # KLU is deterministic; identical entries and identical permutation give + # identical solves regardless of the C entry point's index width. + Random.seed!(0xC0FFEE) + n = 200 + A64 = SparseArrays.spdiagm(0 => 4.0 .+ rand(n), 1 => 0.1 .* rand(n - 1), + -1 => 0.1 .* rand(n - 1)) + A32 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64) + + c64 = PNM.klu_factorize(A64) + c32 = PNM.klu_factorize(A32) + @test c64 isa PNM.KLUWrapper.KLULinSolveCache{Float64, Int64} + @test c32 isa PNM.KLUWrapper.KLULinSolveCache{Float64, Int32} + + b = randn(n) + y64 = copy(b) + y32 = copy(b) + PNM.solve!(c64, y64) + PNM.solve!(c32, y32) + @test y64 == y32 # bit-equal +end + +@testset "KLU wrapper: Int32 solve_sparse! agrees with dense reference" begin + Random.seed!(0xBEEF) + n = 80 + A_dense = Matrix(SparseArrays.sprandn(n, n, 0.1)) + 4 * LinearAlgebra.I + A = SparseArrays.SparseMatrixCSC{Float64, Int32}(SparseArrays.sparse(A_dense)) + cache = PNM.klu_factorize(A) + + B_dense = SparseArrays.sprandn(n, 20, 0.1) + B = SparseArrays.SparseMatrixCSC{Float64, Int32}(B_dense) + + out = PNM.solve_sparse(cache, B) + @test isapprox(out, A_dense \ Matrix(B_dense); atol = 1e-9) +end + +@testset "KLU wrapper: Int32 numeric_refactor! is allocation-free" begin + n = 100 + A64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0, + 1 => fill(0.05, n - 1), -1 => fill(0.05, n - 1)) + A = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64) + cache = PNM.klu_factorize(A) + + A2_64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.5, + 1 => fill(0.07, n - 1), -1 => fill(0.07, n - 1)) + A2 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A2_64) + + # Warm up specialization on the Int32 path. + PNM.numeric_refactor!(cache, A2) + alloc = @allocated PNM.numeric_refactor!(cache, A2) + @test alloc == 0 +end + +# --------------------------------------------------------------------------- +# Performance-knob surface +# --------------------------------------------------------------------------- + +@testset "KLU wrapper: sort_factors!/condest!/rcond! work on both index types" begin + n = 60 + A64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0, + 1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1)) + A32 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64) + + for cache in (PNM.klu_factorize(A64), PNM.klu_factorize(A32)) + PNM.KLUWrapper.sort_factors!(cache) + # Subsequent solve still returns the right answer. + b = randn(n) + y = copy(b) + PNM.solve!(cache, y) + @test isapprox(y, A64 \ b, atol = 1e-10) + + c = PNM.KLUWrapper.condest!(cache) + r = PNM.KLUWrapper.rcond!(cache) + @test c > 0 + @test 0 < r <= 1.0 + end +end + +# --------------------------------------------------------------------------- +# Iterative refinement +# --------------------------------------------------------------------------- + +@testset "KLU wrapper: solve_w_refinement matches direct solve on well-conditioned A" begin + Random.seed!(2) + n = 40 + A = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0, + 1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1)) + cache = PNM.klu_factorize(A) + + x_true = randn(n) + b = A * x_true + X = PNM.solve_w_refinement(cache, A, b) + @test isapprox(X, x_true, atol = 1e-12) +end + +@testset "KLU wrapper: solve_w_refinement! recovers ill-conditioned solve" begin + # Build a noticeably ill-conditioned tridiagonal so a single back-solve + # leaves residual headroom that refinement actually closes. + Random.seed!(3) + n = 50 + A = SparseArrays.spdiagm(0 => fill(1.0, n), 1 => fill(1.0 - 1e-10, n - 1)) + cache = PNM.klu_factorize(A) + + x_true = randn(n) + b = A * x_true + X = zeros(n) + PNM.solve_w_refinement!(cache, A, X, b; tol = 1e-12) + @test isapprox(X, x_true, atol = 1e-8) +end + +@testset "KLU wrapper: solve_w_refinement requires a factored cache" begin + n = 10 + A = SparseArrays.spdiagm(0 => 1.0:n) + cache = PNM.KLUWrapper.KLULinSolveCache(A) # not factored + b = randn(n) + @test_throws ErrorException PNM.solve_w_refinement(cache, A, b) +end + +@testset "KLU wrapper: solve_w_refinement works on Int32 cache" begin + Random.seed!(4) + n = 30 + A64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0, + 1 => fill(0.05, n - 1), -1 => fill(0.05, n - 1)) + A = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64) + cache = PNM.klu_factorize(A) + x_true = randn(n) + b = A64 * x_true + X = PNM.solve_w_refinement(cache, A, b) + @test isapprox(X, x_true, atol = 1e-10) +end diff --git a/test/test_lodf.jl b/test/test_lodf.jl index a5a52d614..bc7d4a6a0 100644 --- a/test/test_lodf.jl +++ b/test/test_lodf.jl @@ -56,7 +56,7 @@ @test isapprox(sum(total_error3), 0.0, atol = 1e-3) end - if PowerNetworkMatrices._has_apple_accelerate_ext() + if PowerNetworkMatrices._has_apple_accelerate_backend() L5NS_from_ptdf4 = LODF(A, P5; linear_solver = "AppleAccelerate") @test getindex(L5NS_from_ptdf4, "5", "6") - -0.3071 <= 1e-4 total_error4 = 0.0 diff --git a/test/test_modf_lodf_reductions.jl b/test/test_modf_lodf_reductions.jl index 68d577df0..c3ce31f1b 100644 --- a/test/test_modf_lodf_reductions.jl +++ b/test/test_modf_lodf_reductions.jl @@ -1,3 +1,68 @@ +""" + _clone_transformer_with_name(xfmr, new_name) -> TapTransformer + +Clone a TapTransformer with a new name, sharing the same buses and identical +electrical parameters (R, X, tap, primary_shunt, rating, base_power, +base_voltage_primary, base_voltage_secondary). The returned object has no +supplemental attributes and is not attached to any system. + +Used to create a second circuit on a bus pair so that `BA_Matrix` treats that +pair as a 2-circuit parallel group, enabling N-2 parallel-circuit tests. +""" +function _clone_transformer_with_name(xfmr::PSY.TapTransformer, new_name::String) + old_arc = PSY.get_arc(xfmr) + new_arc = PSY.Arc(; from = old_arc.from, to = old_arc.to) + return PSY.TapTransformer(; + name = new_name, + available = PSY.get_available(xfmr), + active_power_flow = PSY.get_active_power_flow(xfmr), + reactive_power_flow = PSY.get_reactive_power_flow(xfmr), + arc = new_arc, + r = PSY.get_r(xfmr), + x = PSY.get_x(xfmr), + primary_shunt = PSY.get_primary_shunt(xfmr), + tap = PSY.get_tap(xfmr), + rating = PSY.get_rating(xfmr), + base_power = PSY.get_base_power(xfmr), + base_voltage_primary = PSY.get_base_voltage_primary(xfmr), + base_voltage_secondary = PSY.get_base_voltage_secondary(xfmr), + ) +end + +""" + _canonical_arc_key(d, k) -> Tuple{Int,Int} + +Return `k` if it is a key of `d`, otherwise return the reversed tuple `(k[2], k[1])`. +Used when an arc may be stored under either orientation in a lookup dictionary. +""" +function _canonical_arc_key(d::AbstractDict, k::Tuple{Int, Int}) + if haskey(d, k) + return k + else + return (k[2], k[1]) + end +end + +""" + _ptdf_post_row_builder(ptdf_post, pre_bus_axis) -> Function + +Return a closure that accepts an `arc_key::Tuple{Int,Int}` and returns the +corresponding row of `ptdf_post` permuted to match `pre_bus_axis`. The arc key +is canonicalised via `_canonical_arc_key` so either orientation is accepted. +""" +function _ptdf_post_row_builder(ptdf_post, pre_bus_axis) + post_bus_axis = PNM.get_bus_axis(ptdf_post) + post_bus_lookup = Dict(bn => i for (i, bn) in enumerate(post_bus_axis)) + bus_perm = [post_bus_lookup[bn] for bn in pre_bus_axis] + post_arc_lookup = Dict{Tuple{Int, Int}, Int}( + a => i for (i, a) in enumerate(PNM.get_arc_axis(ptdf_post)) + ) + return function (arc_key::Tuple{Int, Int}) + k = _canonical_arc_key(post_arc_lookup, arc_key) + return ptdf_post[post_arc_lookup[k], :][bus_perm] + end +end + """ Helper: verify the N-1 identity for a single contingency. @@ -282,3 +347,224 @@ end _find_two_non_islanding_arcs(vmodf, vlodf, nrd.direct_branch_map) verify_modf_n2_lodf_identity(vmodf, vlodf, ptdf, arc_idx1, arc_idx2) end + +@testset "VirtualMODF parallel-group N-k tests on RTS-GMLC" begin + # Shared fixture: RTS-GMLC system augmented with a second circuit on the A14 + # transformer (bus 109 → 111) to create a 2-circuit parallel group. + sys = PSB.build_system(PSB.PSITestSystems, "test_RTS_GMLC_sys") + xfmr_a14 = PSY.get_component(PSY.TapTransformer, sys, "A14") + PSY.add_component!(sys, _clone_transformer_with_name(xfmr_a14, "A14_b")) + + # Resolve the canonical arc key for the parallel group once. A lightweight + # VirtualMODF is constructed here only to query the reduction maps; each + # child testset builds its own vmodf so contingency caches stay isolated. + nrd_shared = get_network_reduction_data(VirtualMODF(sys)) + par_key = if haskey(nrd_shared.parallel_branch_map, (109, 111)) + (109, 111) + else + (111, 109) + end + + @testset "N-2 trip of one parallel transformer + a line" begin + vmodf = VirtualMODF(sys) + nrd = get_network_reduction_data(vmodf) + + @test haskey(nrd.parallel_branch_map, par_key) + @test length(nrd.parallel_branch_map[par_key].branches) == 2 + + # N-2 partner line: A18 (bus 111 → 113), selected as the line incident on + # bus 111 with the largest |LODF| coupling with the parallel-group arc + # (|LODF| ≈ 0.428 vs A19's 0.231). Confirmed non-islanding by building + # PTDF(sys_post) without error. + # + # Selection probe (run once, result baked in): + # vlodf = VirtualLODF(sys) + # |vlodf[(111,113), par_arc_idx]| ≈ 0.4285 <- largest + # |vlodf[(111,114), par_arc_idx]| ≈ 0.2310 + sel_line_name = "A18" + + A14_b_branch = PSY.get_component(PSY.TapTransformer, sys, "A14_b") + line_branch = PSY.get_component(PSY.Line, sys, sel_line_name) + + mod_a14b = NetworkModification(vmodf, A14_b_branch) + mod_a18 = NetworkModification(vmodf, line_branch) + + n2_mods = vcat( + collect(mod_a14b.arc_modifications), + collect(mod_a18.arc_modifications), + ) + n2_mod = NetworkModification("n2_parallel_xfmr_plus_line", n2_mods) + + ctg_uuid = Base.UUID(UInt128(515151)) + ctg = ContingencySpec(ctg_uuid, n2_mod) + vmodf.contingency_cache[ctg_uuid] = ctg + + sys_post = deepcopy(sys) + PSY.remove_component!( + sys_post, + PSY.get_component(PSY.TapTransformer, sys_post, "A14_b"), + ) + PSY.remove_component!( + sys_post, + PSY.get_component(PSY.Line, sys_post, sel_line_name), + ) + ptdf_post = PTDF(sys_post) + + pre_bus_axis = collect(PNM.get_bus_axis(vmodf)) + ptdf_post_row = _ptdf_post_row_builder(ptdf_post, pre_bus_axis) + post_arc_lookup = Dict{Tuple{Int, Int}, Int}( + a => i for (i, a) in enumerate(PNM.get_arc_axis(ptdf_post)) + ) + + atol = 1e-6 + for (label, arc_key) in [ + ("parallel pair (109,111)", par_key), + ("A19 other 230kV off bus 111", (111, 114)), + ("A12-1 138kV off bus 109", (108, 109)), + ("B8 area-200", (204, 209)), + ("C5 area-300", (302, 306)), + ] + mon_idx = PNM.get_arc_lookup(vmodf)[arc_key] + modf_row = vmodf[mon_idx, ctg] + @test isapprox(modf_row, ptdf_post_row(arc_key); atol = atol) + end + end + + @testset "N-3 trip of one parallel transformer + two lines" begin + # N-3 outage: {A14_b, A18, A19} + # A18 (bus 111 -> 113): |LODF| ≈ 0.428 with the parallel-group arc. + # A19 (bus 111 -> 114): |LODF| ≈ 0.231 with the parallel-group arc. + # Both lines are off bus 111 (area-100 hub). PTDF(sys_post) confirmed + # non-islanding (size (73, 106) after removing three branches). + vmodf = VirtualMODF(sys) + nrd = get_network_reduction_data(vmodf) + + @test haskey(nrd.parallel_branch_map, par_key) + @test length(nrd.parallel_branch_map[par_key].branches) == 2 + + A14_b_branch = PSY.get_component(PSY.TapTransformer, sys, "A14_b") + a18_branch = PSY.get_component(PSY.Line, sys, "A18") + a19_branch = PSY.get_component(PSY.Line, sys, "A19") + + mod_a14b = NetworkModification(vmodf, A14_b_branch) + mod_a18 = NetworkModification(vmodf, a18_branch) + mod_a19 = NetworkModification(vmodf, a19_branch) + + n3_mods = vcat( + collect(mod_a14b.arc_modifications), + collect(mod_a18.arc_modifications), + collect(mod_a19.arc_modifications), + ) + n3_mod = NetworkModification("n3_parallel_xfmr_plus_two_lines", n3_mods) + + ctg_uuid = Base.UUID(UInt128(515251)) + ctg = ContingencySpec(ctg_uuid, n3_mod) + vmodf.contingency_cache[ctg_uuid] = ctg + + sys_post = deepcopy(sys) + PSY.remove_component!( + sys_post, + PSY.get_component(PSY.TapTransformer, sys_post, "A14_b"), + ) + PSY.remove_component!(sys_post, PSY.get_component(PSY.Line, sys_post, "A18")) + PSY.remove_component!(sys_post, PSY.get_component(PSY.Line, sys_post, "A19")) + ptdf_post = PTDF(sys_post) + + pre_bus_axis = collect(PNM.get_bus_axis(vmodf)) + ptdf_post_row = _ptdf_post_row_builder(ptdf_post, pre_bus_axis) + post_arc_lookup = Dict{Tuple{Int, Int}, Int}( + a => i for (i, a) in enumerate(PNM.get_arc_axis(ptdf_post)) + ) + + # (111, 114) is A19 which is outaged and absent from sys_post, so it is + # not included here. + atol = 1e-6 + for (label, arc_key) in [ + ("parallel pair (109,111)", par_key), + ("A20 230kV hub (112,113)", (112, 113)), + ("A12-1 138kV off bus 109 (108,109)", (108, 109)), + ("B8 area-200 (204,209)", (204, 209)), + ("C5 area-300 (302,306)", (302, 306)), + ] + if !haskey(post_arc_lookup, arc_key) && + !haskey(post_arc_lookup, (arc_key[2], arc_key[1])) + @info "Skipping $label: arc $arc_key not present in sys_post" + continue + end + mon_idx = PNM.get_arc_lookup(vmodf)[arc_key] + modf_row = vmodf[mon_idx, ctg] + @test isapprox(modf_row, ptdf_post_row(arc_key); atol = atol) + end + end + + @testset "N-3 trip of three regular lines (Woodbury M=3)" begin + # N-3 selection: {A23, B20, C20} — three PSY.Line instances from three + # different areas (area 1, 2, 3) so the Woodbury W matrix is genuinely + # M=3 with full rank. None is in a parallel group or incident on a bus + # whose removal would island the network. PTDF(sys_post) confirmed + # non-islanding (size (73, 105) after removing three branches). + # A23: area 1, bus 114 -> 116, 230 kV + # B20: area 2, bus 212 -> 213, 230 kV + # C20: area 3, bus 312 -> 313, 230 kV + vmodf = VirtualMODF(sys) + nrd = get_network_reduction_data(vmodf) + + a23_branch = PSY.get_component(PSY.Line, sys, "A23") + b20_branch = PSY.get_component(PSY.Line, sys, "B20") + c20_branch = PSY.get_component(PSY.Line, sys, "C20") + + mod_a23 = NetworkModification(vmodf, a23_branch) + mod_b20 = NetworkModification(vmodf, b20_branch) + mod_c20 = NetworkModification(vmodf, c20_branch) + + n3_mods = vcat( + collect(mod_a23.arc_modifications), + collect(mod_b20.arc_modifications), + collect(mod_c20.arc_modifications), + ) + n3_mod = NetworkModification("n3_three_regular_lines", n3_mods) + + ctg_uuid = Base.UUID(UInt128(515252)) + ctg = ContingencySpec(ctg_uuid, n3_mod) + vmodf.contingency_cache[ctg_uuid] = ctg + + sys_post = deepcopy(sys) + PSY.remove_component!(sys_post, PSY.get_component(PSY.Line, sys_post, "A23")) + PSY.remove_component!(sys_post, PSY.get_component(PSY.Line, sys_post, "B20")) + PSY.remove_component!(sys_post, PSY.get_component(PSY.Line, sys_post, "C20")) + ptdf_post = PTDF(sys_post) + + pre_bus_axis = collect(PNM.get_bus_axis(vmodf)) + ptdf_post_row = _ptdf_post_row_builder(ptdf_post, pre_bus_axis) + post_arc_lookup = Dict{Tuple{Int, Int}, Int}( + a => i for (i, a) in enumerate(PNM.get_arc_axis(ptdf_post)) + ) + + # Monitored arcs: + # 1. (116, 119) A28: adjacent to A23's to-bus (116), area 1. + # 2. (212, 223) B21: adjacent to B20's from-bus (212), area 2. + # 3. (312, 323) C21: adjacent to C20's from-bus (312), area 3. + # 4. (109, 111) parallel pair: sanity check that the parallel-augmented + # system still behaves correctly when A23/B20/C20 are not part of the N-3. + # 5. (204, 209) B8: unrelated area-200 line. + # 6. (302, 306) C5: unrelated area-300 line. + atol = 1e-6 + for (label, arc_key) in [ + ("A28 adj A23 to-bus 116 (116,119)", (116, 119)), + ("B21 adj B20 from-bus 212 (212,223)", (212, 223)), + ("C21 adj C20 from-bus 312 (312,323)", (312, 323)), + ("parallel pair sanity (109,111)", par_key), + ("B8 area-200 (204,209)", (204, 209)), + ("C5 area-300 (302,306)", (302, 306)), + ] + if !haskey(post_arc_lookup, arc_key) && + !haskey(post_arc_lookup, (arc_key[2], arc_key[1])) + @info "Skipping $label: arc $arc_key not present in sys_post" + continue + end + mon_idx = PNM.get_arc_lookup(vmodf)[arc_key] + modf_row = vmodf[mon_idx, ctg] + @test isapprox(modf_row, ptdf_post_row(arc_key); atol = atol) + end + end +end diff --git a/test/test_network_modification.jl b/test/test_network_modification.jl index 7bdb278ad..2b158fd4a 100644 --- a/test/test_network_modification.jl +++ b/test/test_network_modification.jl @@ -258,8 +258,8 @@ end # serial baseline. The KLU path is exercised by the threaded testsets in # `test/test_virtual_modf.jl`; this complements that coverage on the # AppleAccelerate path. - if !PowerNetworkMatrices._has_apple_accelerate_ext() - @info "Skipping: AppleAccelerate extension not loaded." + if !PowerNetworkMatrices._has_apple_accelerate_backend() + @info "Skipping: AppleAccelerate backend not available on this platform." return end if Threads.nthreads() < 2 diff --git a/test/test_partial_lodf.jl b/test/test_partial_lodf.jl index 53363593f..e34eb8d0a 100644 --- a/test/test_partial_lodf.jl +++ b/test/test_partial_lodf.jl @@ -89,7 +89,9 @@ end @testset "Partial LODF: half-susceptance matches rebuilt ground truth" begin sys5 = PSB.build_system(PSB.PSITestSystems, "c_sys5") - vlodf = VirtualLODF(sys5) + # Pin KLU here: the ground-truth path below calls `PNM.solve!` directly on + # `vlodf.K`, which is dispatched on `KLULinSolveCache` only. + vlodf = VirtualLODF(sys5; linear_solver = "KLU") e = 1 # test arc index b_e = vlodf.arc_susceptances[e] diff --git a/test/test_powerflow_matrix_types.jl b/test/test_powerflow_matrix_types.jl index 870f650c5..33e634208 100644 --- a/test/test_powerflow_matrix_types.jl +++ b/test/test_powerflow_matrix_types.jl @@ -2,7 +2,7 @@ sys = PSB.build_system(PSB.PSITestSystems, "c_sys5") @testset "DC_ABA_Matrix_Factorized" begin - if PNM._has_apple_accelerate_ext() + if PNM._has_apple_accelerate_backend() M = ABA_Matrix(sys; factorize = true) @test M isa PNM.DC_ABA_Matrix_Factorized end @@ -14,7 +14,7 @@ end @testset "DC_PTDF_Matrix" begin - if PNM._has_apple_accelerate_ext() + if PNM._has_apple_accelerate_backend() @test PTDF(sys; linear_solver = "AppleAccelerate") isa PNM.DC_PTDF_Matrix end @test PTDF(sys; linear_solver = "KLU") isa PNM.DC_PTDF_Matrix @@ -22,7 +22,7 @@ end @testset "DC_vPTDF_Matrix" begin - if PNM._has_apple_accelerate_ext() + if PNM._has_apple_accelerate_backend() @test VirtualPTDF(sys; linear_solver = "AppleAccelerate") isa PNM.DC_vPTDF_Matrix end diff --git a/test/test_ptdf.jl b/test/test_ptdf.jl index 749cbf7a0..fb427491f 100644 --- a/test/test_ptdf.jl +++ b/test/test_ptdf.jl @@ -8,8 +8,8 @@ @info "Skipped MKLPardiso tests (extension not loaded)" continue end - if !PowerNetworkMatrices._has_apple_accelerate_ext() && solver == "AppleAccelerate" - @info "Skipped AppleAccelerate tests (extension not loaded)" + if !PowerNetworkMatrices._has_apple_accelerate_backend() && solver == "AppleAccelerate" + @info "Skipped AppleAccelerate tests (backend unavailable on this platform)" continue end sys5 = PSB.build_system(PSB.PSITestSystems, "c_sys5") diff --git a/test/test_virtual_lodf.jl b/test/test_virtual_lodf.jl index 7301f3b70..7c2a9bd90 100644 --- a/test/test_virtual_lodf.jl +++ b/test/test_virtual_lodf.jl @@ -148,8 +148,36 @@ end @test test_value end +@testset "Test Virtual LODF with Apple Accelerate" begin + if !PNM._has_apple_accelerate_backend() + @info "Skipped AppleAccelerate VirtualLODF tests (backend unavailable on this platform)" + else + sys = PSB.build_system(PSB.PSITestSystems, "c_sys14") + + vlodf_aa = VirtualLODF(sys; linear_solver = "AppleAccelerate") + vlodf_klu = VirtualLODF(sys; linear_solver = "KLU") + + @test contains(string(typeof(vlodf_aa.K)), "AAFactorCache") + @test vlodf_klu.K isa PNM.KLULinSolveCache{Float64} + + arc_axis = PNM.get_arc_axis(vlodf_aa) + @test arc_axis == PNM.get_arc_axis(vlodf_klu) + for arc in arc_axis + row_aa = vlodf_aa[arc, :] + row_klu = vlodf_klu[arc, :] + @test isapprox(row_aa, row_klu, atol = 1e-9) + end + + # macOS default should resolve to AppleAccelerate. + vlodf_default = VirtualLODF(sys) + @test contains(string(typeof(vlodf_default.K)), "AAFactorCache") + end +end + @testset "_get_PTDF_A_diag: matches reference implementation" begin # Reference: full-bus dot product. Slow, unambiguously correct. + # Captures the return value of `_solve_factorization` to stay + # backend-agnostic (mirrors the fast path). function _reference_ptdf_a_diag(K, BA, A, ref_bus_positions::Set{Int}) n_branches = size(BA, 2) n_buses = size(BA, 1) @@ -164,10 +192,10 @@ end bus_idx = valid_ix[idx] ba_col[idx] = BA[bus_idx, i] end - PNM._solve_factorization(K, ba_col) + lin_solve = PNM._solve_factorization(K, ba_col) fill!(ptdf_row, 0.0) for idx in 1:n_valid - ptdf_row[valid_ix[idx]] = ba_col[idx] + ptdf_row[valid_ix[idx]] = lin_solve[idx] end for j in 1:n_buses diag_[i] += ptdf_row[j] * A[i, j] @@ -176,16 +204,24 @@ end return diag_ end + backends = String["KLU"] + if PNM._has_apple_accelerate_backend() + push!(backends, "AppleAccelerate") + end + for case in ("c_sys5", "c_sys14") sys = PSB.build_system(PSB.PSITestSystems, case) A = PNM.IncidenceMatrix(sys) BA = PNM.BA_Matrix(sys) ref_pos = Set(PNM.get_ref_bus_position(A)) ABA = PNM.calculate_ABA_matrix(A.data, BA.data, ref_pos) - K = PNM.klu_factorize(ABA) - fast = PNM._get_PTDF_A_diag(K, BA.data, A.data, ref_pos) - ref = _reference_ptdf_a_diag(K, BA.data, A.data, ref_pos) - @test fast ≈ ref atol = 1e-12 rtol = 0 + for solver_name in backends + solver = PNM.resolve_linear_solver(solver_name) + K = PNM._create_factorization(solver, ABA) + fast = PNM._get_PTDF_A_diag(K, BA.data, A.data, ref_pos) + ref = _reference_ptdf_a_diag(K, BA.data, A.data, ref_pos) + @test fast ≈ ref atol = 1e-12 rtol = 0 + end end end diff --git a/test/test_virtual_modf.jl b/test/test_virtual_modf.jl index 40f4dbbfd..3c0dfc254 100644 --- a/test/test_virtual_modf.jl +++ b/test/test_virtual_modf.jl @@ -436,6 +436,39 @@ end end end +@testset "VirtualMODF with Apple Accelerate backend matches KLU" begin + if !PNM._has_apple_accelerate_backend() + @info "Skipped AppleAccelerate VirtualMODF tests (backend unavailable on this platform)" + else + sys, _ = _build_c_sys14_with_outages() + + vmodf_aa = VirtualMODF(sys; linear_solver = "AppleAccelerate") + vmodf_klu = VirtualMODF(sys; linear_solver = "KLU") + + # Factorization should be the AA cache type. + @test contains(string(typeof(vmodf_aa.K)), "AAFactorCache") + @test vmodf_klu.K isa PNM.KLULinSolveCache{Float64} + + registered_aa = get_registered_contingencies(vmodf_aa) + registered_klu = get_registered_contingencies(vmodf_klu) + @test !isempty(registered_aa) + @test keys(registered_aa) == keys(registered_klu) + + # Compare post-contingency rows for every registered contingency + # against the KLU build, sweeping all monitored arcs. + arc_axis = PNM.get_arc_axis(vmodf_aa) + @test arc_axis == PNM.get_arc_axis(vmodf_klu) + for (uuid, ctg_aa) in registered_aa + ctg_klu = registered_klu[uuid] + for arc in arc_axis + row_aa = vmodf_aa[arc, ctg_aa] + row_klu = vmodf_klu[arc, ctg_klu] + @test isapprox(row_aa, row_klu, atol = 1e-9) + end + end + end +end + @testset "VirtualMODF concurrent getindex on the SAME (arc, ctg) is consistent" begin # Complements the previous testset: there, each (arc, ctg) pair appears # once in the work list, so only the `woodbury_cache` first-call race is @@ -646,3 +679,37 @@ end PNM.clear_caches!(vmodf) end + +@testset "VirtualMODF: KLU and AppleAccelerate backend parity" begin + if !PNM._has_apple_accelerate_backend() + @info "Skipped VirtualMODF AA/KLU parity (backend unavailable on this platform)" + return + end + + sys = PSB.build_system(PSB.PSITestSystems, "c_sys5") + + vmodf_klu = VirtualMODF(sys; linear_solver = "KLU") + vmodf_aa = VirtualMODF(sys; linear_solver = "AppleAccelerate") + + @test vmodf_klu.K isa PNM.KLULinSolveCache{Float64} + @test contains(string(typeof(vmodf_aa.K)), "AAFactorCache") + + # Trigger the lazy PTDF_A_diag on both backends. + diag_klu = vmodf_klu.PTDF_A_diag + diag_aa = vmodf_aa.PTDF_A_diag + @test length(diag_klu) == length(diag_aa) + @test isapprox(diag_aa, diag_klu, atol = 1e-9) + + # Register the same N-1 contingency on both and compare one MODF row. + e = 1 + b_e = vmodf_klu.arc_susceptances[e] + ctg_uuid = Base.UUID(UInt128(424242)) + mod = NetworkModification("aa_parity_outage", [ArcModification(e, -b_e)]) + ctg = ContingencySpec(ctg_uuid, mod) + vmodf_klu.contingency_cache[ctg_uuid] = ctg + vmodf_aa.contingency_cache[ctg_uuid] = ctg + + row_klu = vmodf_klu[2, ctg] + row_aa = vmodf_aa[2, ctg] + @test isapprox(row_aa, row_klu, atol = 1e-9) +end diff --git a/test/test_virtual_ptdf.jl b/test/test_virtual_ptdf.jl index 5993d4d03..83d7ebb9e 100644 --- a/test/test_virtual_ptdf.jl +++ b/test/test_virtual_ptdf.jl @@ -1,8 +1,8 @@ # if it fails, we don't want the terminal to be flooded with errors, therefore failfast=true @testset "Test Virtual PTDF matrices" for solver in ("KLU", "AppleAccelerate") - if !PowerNetworkMatrices._has_apple_accelerate_ext() && solver == "AppleAccelerate" - @info "Skipped AppleAccelerate tests (extension not loaded)" + if !PowerNetworkMatrices._has_apple_accelerate_backend() && solver == "AppleAccelerate" + @info "Skipped AppleAccelerate tests (backend unavailable on this platform)" continue end sys = PSB.build_system( @@ -185,8 +185,8 @@ end end @testset "Test Virtual PTDF with Apple Accelerate" begin - if !PowerNetworkMatrices._has_apple_accelerate_ext() - @info "Skipped AppleAccelerate tests (extension not loaded)" + if !PowerNetworkMatrices._has_apple_accelerate_backend() + @info "Skipped AppleAccelerate tests (backend unavailable on this platform)" else # Test VirtualPTDF with Apple Accelerate solver sys = PSB.build_system(PSB.PSITestSystems, "c_sys14") @@ -194,8 +194,8 @@ end # Create VirtualPTDF with AppleAccelerate vptdf_aa = VirtualPTDF(sys; linear_solver = "AppleAccelerate") - # Verify the factorization type is AAFactorization - @test contains(string(typeof(vptdf_aa.K)), "AAFactorization") + # Verify the factorization type is AAFactorCache + @test contains(string(typeof(vptdf_aa.K)), "AAFactorCache") # Create reference VirtualPTDF with KLU vptdf_klu = VirtualPTDF(sys; linear_solver = "KLU") @@ -209,7 +209,7 @@ end # Test with tolerance vptdf_aa_tol = VirtualPTDF(sys; linear_solver = "AppleAccelerate", tol = 1e-2) - @test contains(string(typeof(vptdf_aa_tol.K)), "AAFactorization") + @test contains(string(typeof(vptdf_aa_tol.K)), "AAFactorCache") # Test with distributed slack buscount = length(PSY.get_available_components(PSY.ACBus, sys)) @@ -217,7 +217,7 @@ end dist_slack = Dict(i => dist_slack_factor for i in 1:buscount) vptdf_aa_slack = VirtualPTDF(sys; linear_solver = "AppleAccelerate", dist_slack = dist_slack) - @test contains(string(typeof(vptdf_aa_slack.K)), "AAFactorization") + @test contains(string(typeof(vptdf_aa_slack.K)), "AAFactorCache") # Compare with KLU for distributed slack case vptdf_klu_slack = VirtualPTDF(sys; linear_solver = "KLU", dist_slack = dist_slack)