Commit acb0528

Merge pull request #572 from SciML/fastlapackinterface
Make FastLapackInterface.jl an extension as well
2 parents 3bff586 + ae0b5e0 commit acb0528

9 files changed, +165 -146 lines

Project.toml

Lines changed: 4 additions & 2 deletions
@@ -9,7 +9,6 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ConcreteStructs = "2569d6c7-a4a2-43d3-a901-331e8e4be471"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56"
-FastLapackInterface = "29a986be-02c6-4525-aec4-84b980013641"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
@@ -35,6 +34,7 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
 FastAlmostBandedMatrices = "9d29842c-ecb8-4973-b1e9-a27b1157504e"
+FastLapackInterface = "29a986be-02c6-4525-aec4-84b980013641"
 HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
 IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
@@ -52,6 +52,7 @@ LinearSolveCUDAExt = "CUDA"
 LinearSolveCUDSSExt = "CUDSS"
 LinearSolveEnzymeExt = "EnzymeCore"
 LinearSolveFastAlmostBandedMatricesExt = "FastAlmostBandedMatrices"
+LinearSolveFastLapackInterfaceExt = "FastLapackInterface"
 LinearSolveHYPREExt = "HYPRE"
 LinearSolveIterativeSolversExt = "IterativeSolvers"
 LinearSolveKernelAbstractionsExt = "KernelAbstractions"
@@ -126,6 +127,7 @@ BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
 BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 FastAlmostBandedMatrices = "9d29842c-ecb8-4973-b1e9-a27b1157504e"
+FastLapackInterface = "29a986be-02c6-4525-aec4-84b980013641"
 FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
@@ -150,4 +152,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

 [targets]
-test = ["Aqua", "Test", "IterativeSolvers", "InteractiveUtils", "JET", "KrylovKit", "KrylovPreconditioners", "Pkg", "Random", "SafeTestsets", "MultiFloats", "ForwardDiff", "HYPRE", "MPI", "BlockDiagonals", "Enzyme", "FiniteDiff", "BandedMatrices", "FastAlmostBandedMatrices", "StaticArrays", "AllocCheck", "StableRNGs", "Zygote", "RecursiveFactorization", "Sparspak"]
+test = ["Aqua", "Test", "IterativeSolvers", "InteractiveUtils", "JET", "KrylovKit", "KrylovPreconditioners", "Pkg", "Random", "SafeTestsets", "MultiFloats", "ForwardDiff", "HYPRE", "MPI", "BlockDiagonals", "Enzyme", "FiniteDiff", "BandedMatrices", "FastAlmostBandedMatrices", "StaticArrays", "AllocCheck", "StableRNGs", "Zygote", "RecursiveFactorization", "Sparspak", "FastLapackInterface"]

docs/pages.jl

Lines changed: 2 additions & 2 deletions
@@ -2,8 +2,8 @@

 pages = ["index.md",
 "Tutorials" => Any["tutorials/linear.md",
-"tutorials/caching_interface.md",
-"tutorials/accelerating_choices.md"],
+"tutorials/caching_interface.md",
+"tutorials/accelerating_choices.md"],
 "Basics" => Any["basics/LinearProblem.md",
 "basics/common_solver_opts.md",
 "basics/OperatorAssumptions.md",

docs/src/solvers/solvers.md

Lines changed: 11 additions & 5 deletions
@@ -84,6 +84,10 @@ use `Krylov_GMRES()`.

 ### RecursiveFactorization.jl

+!!! note
+
+    Using this solver requires adding the package RecursiveFactorization.jl, i.e. `using RecursiveFactorization`
+
 ```@docs
 RFLUFactorization
 ```
@@ -123,7 +127,13 @@ FastLapackInterface.jl is a package that allows for a lower-level interface to t
 calls to allow for preallocating workspaces to decrease the overhead of the wrappers.
 LinearSolve.jl provides a wrapper to these routines in a way where an initialized solver
 has a non-allocating LU factorization. In theory, this post-initialized solve should always
-be faster than the Base.LinearAlgebra version.
+be faster than the Base.LinearAlgebra version. In practice, with the way we wrap the solvers,
+we do not see a performance benefit and in fact benchmarks tend to show this inhibits
+performance.
+
+!!! note
+
+    Using this solver requires adding the package FastLapackInterface.jl, i.e. `using FastLapackInterface`

 ```@docs
 FastLUFactorization
@@ -157,10 +167,6 @@ KrylovJL

 ### MKL.jl

-!!! note
-
-    Using this solver requires adding the package MKL_jll.jl, i.e. `using MKL_jll`
-
 ```@docs
 MKLLUFactorization
 ```
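
A hedged sketch of what the new note asks of users, with an illustrative 100×100 problem (not taken from the docs):

```julia
using LinearSolve
using FastLapackInterface  # required for FastLUFactorization, per the note above

A = rand(100, 100); b = rand(100)
prob = LinearProblem(A, b)

sol_fast = solve(prob, FastLUFactorization())  # preallocated-workspace LAPACK path
sol_base = solve(prob, LUFactorization())      # Base LinearAlgebra LU, for comparison
```

As the amended docs caution, the FastLapackInterface path is not currently expected to beat `LUFactorization` in benchmarks.
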
Lines changed: 36 additions & 33 deletions
@@ -1,6 +1,7 @@
 # Accelerating your Linear Solves

 !!! note
+
     This section is essential if you wish to achieve maximum performance with
     LinearSolve.jl, especially on v7 and above. Please ensure the tips of this
     section are adhered to when optimizing code and benchmarking.
@@ -16,7 +17,7 @@ scenarios, so let's dive in.
 ## Understanding Performance of Dense Linear Solves

 The performance of dense linear solvers is highly dependent on the size of the matrix
-and the chosen architecture to run on, i.e. the CPU.
+and the chosen architecture to run on, i.e. the CPU.
 [This issue](https://github.com/SciML/LinearSolve.jl/issues/357) gathered benchmark data
 from many different users and is summarized in the following graphs:

@@ -25,33 +26,34 @@ from many different users and is summarized in the following graphs:
 Now one thing that is immediate is for example that AppleAccelerate generally does well
 on Apple M-series chips, MKL generally does well on Intel, etc. And we know this in
 LinearSolve.jl, in fact we automatically default to different BLASes based on the CPU
-architecture already as part of the design! So that covers most of the variation, but
+architecture already as part of the design! So that covers most of the variation, but
 there are a few major tips to note when fine tuning the results to your system:

-1. One of the best methods for size 150x150 matrices and below is RecursiveFactorization.jl.
-This is a pure Julia BLAS system, but it has a high load time overhead, and thus as of
-v7 it's no longer loaded by default! Thus if your matrices are in this range and you would
-value better run times at the cost of compile and load times, it is recommended you add
-`using RecursiveFactorization`. The defaulting algorithm will then consider it in its list
-and will automatically (in an architecture-specific way) insert it as it feels necesssary.
-2. One of the major factors that can inhibit BLAS performance on LU factorization is multithreading.
-In many of these plots you can see a giant dip in GFLOPs (higher is better) when a certain size
-threshold is hit. This is because, for the number of chosen threads, there was not enough work
-and thus when the threading threshold is hit you get a hit to the performance due to the added
-overhead. The threading performance can be a per-system thing, and it can be greatly influenced
-by the number of cores on your system and the number of threads you allow. Thus for example,
-OpenBLAS' LU factorization seems to generally be really bad at guessing the thread switch point
-for CPUs with really high core/thread counts. If this is the case, you may want to investigate
-decreasing your number of BLAS threads, i.e. via `BLAS.set_num_threads(i)`. Note that
-RecursiveFactorization.jl uses your Julia thread pool instead of the BLAS threads.
-3. The switch points between algorithms can be fairly inexact. LinearSolve.jl tried to keep a tab
-on where they are per platform and keep updated, but it can be a moving battle. You may be
-able to eek out some performance by testing between the various options on your platform, i.e.
-RFLUFactorization vs LUFactorization vs AppleAccelerateLUFactorization (M-series) vs
-MKLFactorization (X86) and hardcoding the choice for your problem if the default did not make
-the right guess.
+1. One of the best methods for size 150x150 matrices and below is RecursiveFactorization.jl.
+This is a pure Julia BLAS system, but it has a high load time overhead, and thus as of
+v7 it's no longer loaded by default! Thus if your matrices are in this range and you would
+value better run times at the cost of compile and load times, it is recommended you add
+`using RecursiveFactorization`. The defaulting algorithm will then consider it in its list
+and will automatically (in an architecture-specific way) insert it as it feels necesssary.
+2. One of the major factors that can inhibit BLAS performance on LU factorization is multithreading.
+In many of these plots you can see a giant dip in GFLOPs (higher is better) when a certain size
+threshold is hit. This is because, for the number of chosen threads, there was not enough work
+and thus when the threading threshold is hit you get a hit to the performance due to the added
+overhead. The threading performance can be a per-system thing, and it can be greatly influenced
+by the number of cores on your system and the number of threads you allow. Thus for example,
+OpenBLAS' LU factorization seems to generally be really bad at guessing the thread switch point
+for CPUs with really high core/thread counts. If this is the case, you may want to investigate
+decreasing your number of BLAS threads, i.e. via `BLAS.set_num_threads(i)`. Note that
+RecursiveFactorization.jl uses your Julia thread pool instead of the BLAS threads.
+3. The switch points between algorithms can be fairly inexact. LinearSolve.jl tried to keep a tab
+on where they are per platform and keep updated, but it can be a moving battle. You may be
+able to eek out some performance by testing between the various options on your platform, i.e.
+RFLUFactorization vs LUFactorization vs AppleAccelerateLUFactorization (M-series) vs
+MKLFactorization (X86) and hardcoding the choice for your problem if the default did not make
+the right guess.

 !!! warn
+
     As noted, RecursiveFactorization.jl is one of the fastest linear solvers for smaller dense
     matrices but requires `using RecursiveFactorization` in order to be used in the default
     solver setups! Thus it's recommended that any optimized code or benchmarks sets this up.
@@ -69,17 +71,18 @@ LinearSolve.jl just uses a very simple "if small then use KLU and if large use U
 is validated by this plot, but leaves a lot to be desired. In particular, the following rules
 should be thought about:

-1. Pardiso is a great solver, you should try `using Pardiso` and using `MKLPardiso()` in many
-scenarios.
-2. The more structured a sparsity pattern is, the worse KLU is in comparison to the other
-algorithms.
-3. A Krylov subspace method with proper preconditioning will be better than direct solvers
-when the matrices get large enough. You could always precondition a sparse matrix with
-iLU as an easy choice, though the tolerance would need to be tuned in a problem-specific
-way.
+1. Pardiso is a great solver, you should try `using Pardiso` and using `MKLPardiso()` in many
+scenarios.
+2. The more structured a sparsity pattern is, the worse KLU is in comparison to the other
+algorithms.
+3. A Krylov subspace method with proper preconditioning will be better than direct solvers
+when the matrices get large enough. You could always precondition a sparse matrix with
+iLU as an easy choice, though the tolerance would need to be tuned in a problem-specific
+way.

 !!! note
+
     UMFPACK does better when the BLAS is not OpenBLAS. Try `using MKL` on Intel and AMD Ryzen
     platforms and UMPACK will be faster! LinearSolve.jl cannot default to this as this changes
     global settings and thus only defaults to MKL locally, and thus cannot change the setting
-    within UMFPACK.
+    within UMFPACK.
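
The tuning advice in this tutorial reduces to a few lines of user code. A hedged sketch; the matrix size, thread count, and hardcoded solver below are placeholders rather than recommendations:

```julia
using LinearSolve
using RecursiveFactorization      # opt in so the default algorithm can pick RFLUFactorization
using LinearAlgebra: BLAS

A = rand(200, 200); b = rand(200)
prob = LinearProblem(A, b)

# If OpenBLAS mis-guesses its threading switch point on your machine, cap the BLAS threads.
BLAS.set_num_threads(4)

sol_default = solve(prob)                       # architecture-aware default choice
sol_pinned  = solve(prob, RFLUFactorization())  # hardcode a choice after benchmarking your system
```
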
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+module LinearSolveFastLapackInterfaceExt
+
+using LinearSolve, LinearAlgebra
+using FastLapackInterface
+
+struct WorkspaceAndFactors{W, F}
+    workspace::W
+    factors::F
+end
+
+function LinearSolve.init_cacheval(::FastLUFactorization, A, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    ws = LUWs(A)
+    return WorkspaceAndFactors(
+        ws, LinearSolve.ArrayInterface.lu_instance(convert(AbstractMatrix, A)))
+end
+
+function SciMLBase.solve!(
+        cache::LinearSolve.LinearCache, alg::FastLUFactorization; kwargs...)
+    A = cache.A
+    A = convert(AbstractMatrix, A)
+    ws_and_fact = LinearSolve.@get_cacheval(cache, :FastLUFactorization)
+    if cache.isfresh
+        # we will fail here if A is a different *size* than in a previous version of the same cache.
+        # it may instead be desirable to resize the workspace.
+        LinearSolve.@set! ws_and_fact.factors = LinearAlgebra.LU(LAPACK.getrf!(
+            ws_and_fact.workspace,
+            A)...)
+        cache.cacheval = ws_and_fact
+        cache.isfresh = false
+    end
+    y = ldiv!(cache.u, cache.cacheval.factors, cache.b)
+    SciMLBase.build_linear_solution(alg, y, nothing, cache)
+end
+
+function LinearSolve.init_cacheval(
+        alg::FastQRFactorization{NoPivot}, A::AbstractMatrix, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    ws = QRWYWs(A; blocksize = alg.blocksize)
+    return WorkspaceAndFactors(ws,
+        LinearSolve.ArrayInterface.qr_instance(convert(AbstractMatrix, A)))
+end
+function LinearSolve.init_cacheval(
+        ::FastQRFactorization{ColumnNorm}, A::AbstractMatrix, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    ws = QRpWs(A)
+    return WorkspaceAndFactors(ws,
+        LinearSolve.ArrayInterface.qr_instance(convert(AbstractMatrix, A)))
+end
+
+function LinearSolve.init_cacheval(alg::FastQRFactorization, A, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    return init_cacheval(alg, convert(AbstractMatrix, A), b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+end
+
+function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::FastQRFactorization{P};
+        kwargs...) where {P}
+    A = cache.A
+    A = convert(AbstractMatrix, A)
+    ws_and_fact = LinearSolve.@get_cacheval(cache, :FastQRFactorization)
+    if cache.isfresh
+        # we will fail here if A is a different *size* than in a previous version of the same cache.
+        # it may instead be desirable to resize the workspace.
+        if P === NoPivot
+            LinearSolve.@set! ws_and_fact.factors = LinearAlgebra.QRCompactWY(LAPACK.geqrt!(
+                ws_and_fact.workspace,
+                A)...)
+        else
+            LinearSolve.@set! ws_and_fact.factors = LinearAlgebra.QRPivoted(LAPACK.geqp3!(
+                ws_and_fact.workspace,
+                A)...)
+        end
+        cache.cacheval = ws_and_fact
+        cache.isfresh = false
+    end
+    y = ldiv!(cache.u, cache.cacheval.factors, cache.b)
+    SciMLBase.build_linear_solution(alg, y, nothing, cache)
+end
+
+end
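
The `cache.isfresh` branch in both `solve!` methods is what makes repeated solves cheap: the LAPACK workspace is allocated once in `init_cacheval` and the factorization is redone only when `A` changes. A rough usage sketch of that path (sizes illustrative; as the inline comments warn, swapping in a differently sized `A` is not supported without resizing the workspace):

```julia
using LinearSolve, FastLapackInterface

n = 50
cache = init(LinearProblem(rand(n, n), rand(n)), FastLUFactorization())
solve!(cache)            # factorizes via LAPACK.getrf! into the preallocated LUWs workspace

cache.A = rand(n, n)     # same size; marks the cache as fresh so the next solve! refactorizes
cache.b = rand(n)
solve!(cache)            # reuses the existing workspace; no new LU workspace is allocated
```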

src/LinearSolve.jl

Lines changed: 0 additions & 1 deletion
@@ -14,7 +14,6 @@ using SciMLOperators
 using SciMLOperators: AbstractSciMLOperator, IdentityOperator
 using Setfield
 using UnPack
-using FastLapackInterface
 using DocStringExtensions
 using EnumX
 using Markdown

src/extension_algs.jl

Lines changed: 25 additions & 0 deletions
@@ -107,6 +107,31 @@ function RFLUFactorization(; pivot = Val(true), thread = Val(true), throwerror =
     RFLUFactorization(pivot, thread; throwerror)
 end

+# There's no options like pivot here.
+# But I'm not sure it makes sense as a GenericFactorization
+# since it just uses `LAPACK.getrf!`.
+"""
+`FastLUFactorization()`
+
+The FastLapackInterface.jl version of the LU factorization. Notably,
+this version does not allow for choice of pivoting method.
+"""
+struct FastLUFactorization <: AbstractDenseFactorization end
+
+"""
+`FastQRFactorization()`
+
+The FastLapackInterface.jl version of the QR factorization.
+"""
+struct FastQRFactorization{P} <: AbstractDenseFactorization
+    pivot::P
+    blocksize::Int
+end
+
+# is 36 or 16 better here? LinearAlgebra and FastLapackInterface use 36,
+# but QRFactorization uses 16.
+FastQRFactorization() = FastQRFactorization(NoPivot(), 36)
+
 """
 ```julia
 MKLPardisoFactorize(; nprocs::Union{Int, Nothing} = nothing,
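
For completeness, a sketch of selecting the two new algorithm types once FastLapackInterface is loaded; the `ColumnNorm()` pivot and the explicit blocksize are illustrative choices, while `NoPivot()` with blocksize 36 is the default the commit defines:

```julia
using LinearSolve, FastLapackInterface
using LinearAlgebra: ColumnNorm

A = rand(30, 30); b = rand(30)
prob = LinearProblem(A, b)

solve(prob, FastLUFactorization())                  # workspace-backed LU (no pivot choice)
solve(prob, FastQRFactorization())                  # defaults to NoPivot() with blocksize 36
solve(prob, FastQRFactorization(ColumnNorm(), 36))  # pivoted QR via LAPACK.geqp3!
```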
