From c6247c7a99126215aa19b840d1c3710f690b426e Mon Sep 17 00:00:00 2001 From: "Luiz M. Faria" Date: Mon, 7 Sep 2020 14:29:11 +0200 Subject: [PATCH 1/4] add vectorized+threaded version of laplace --- lap3dkernel/julia/lap3dkernel.jl | 51 ++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/lap3dkernel/julia/lap3dkernel.jl b/lap3dkernel/julia/lap3dkernel.jl index b8f859c..674ec13 100644 --- a/lap3dkernel/julia/lap3dkernel.jl +++ b/lap3dkernel/julia/lap3dkernel.jl @@ -14,6 +14,7 @@ using LinearAlgebra using Printf using BenchmarkTools using Base.Threads +using SIMD # this is useless, apparently: #using Devectorize @@ -62,6 +63,41 @@ function lap3dcharge_devec_par(y,q,x) # multi-threaded version of above end return pot end + +function lap3dcharge_devec_par_new(y,q,x,V=Vec{4,Float64}) # multi-threaded version of above + xt,yt,zt = x[1,:],x[2,:],x[3,:] + xs,ys,zs = y[1,:],y[2,:],y[3,:] + lap3dcharge_devec_par_new(xs,ys,zs,xt,yt,zt,q,V) +end + +function lap3dcharge_devec_par_new(xs,ys,zs,xt,yt,zt,q,V) + T = eltype(xs) + @assert length(xs) == length(ys) == length(zs) + @assert length(xt) == length(xt) == length(zt) + ns,nt = length(xs), length(xt) + pot = zeros(T,nt) # note zeros(nt), col vec, fails to add r later + @threads for t in 1:length(V):nt + Xt_vec = vload(V,xt,t) + Yt_vec = vload(V,yt,t) + Zt_vec = vload(V,zt,t) + pot_vec = _inner_loop(Xt_vec,Yt_vec,Zt_vec,t,xs,ys,zs,q) + vstore(pot_vec,pot,t) + end + return pot +end + +function _inner_loop(Xt_vec::T,Yt_vec::T,Zt_vec::T,t,xs,ys,zs,q) where {T} + prefac = 1/(4*pi) + pot_vec = zero(T) + for s in eachindex(xs) + dx = xs[s] - Xt_vec + dy = ys[s] - Yt_vec + dz = zs[s] - Zt_vec + r2 = dx*dx + dy*dy + dz*dz + pot_vec += prefac * q[s] / sqrt(r2) #rij + end + return pot_vec +end ns = 10000 nt = 10000 @@ -71,20 +107,29 @@ q = randn(ns) # charges t = @elapsed lap3dcharge(y,q,x) # discards return value t = @elapsed lap3dcharge(y,q,x) # discards return value - is already compiled #pot, t = @timed lap3dcharge(y,q,x) # gets the time in secs -@printf("targ-vec: %d src-targ pairs in %.3g s: %.3g Gpair/s\n",ns*nt,t,ns*nt/t/1e9) +check = lap3dcharge(y,q,x) |> sum +@printf("targ-vec: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) # 0.06 Gpair/s, ie 5x slower than devec lap3dcharge_devec(y,q,x) # compile it? t = @elapsed lap3dcharge_devec(y,q,x) # discards return value t = @elapsed lap3dcharge_devec(y,q,x) # discards return value -@printf("devec: %d src-targ pairs in %.3g s: %.3g Gpair/s\n",ns*nt,t,ns*nt/t/1e9) +check = lap3dcharge_devec(y,q,x) |> sum +@printf("devec: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) # best, single-threaded, 0.33 Gpair/s (py+numba was 1.3 since multithreaded) t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value -@printf("devec par: %d src-targ pairs in %.3g s: %.3g Gpair/s\n",ns*nt,t,ns*nt/t/1e9) +check = lap3dcharge_devec_par(y,q,x) |> sum +@printf("devec par: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) # 1.26 Gpair/s, matches py+numba +t = @elapsed lap3dcharge_devec_par_new(y,q,x) # discards return value +t = @elapsed lap3dcharge_devec_par_new(y,q,x) # discards return value +check = lap3dcharge_devec_par_new(y,q,x) |> sum +@printf("devec par new: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) + + # S. Johnson chat: use benchmark tools. # @btime lap3dcharge_devec_par($y,$q,$x); # the $ is a btime thing: means figure out the type of args before benchmarking From 28a9e10a278937fbbe265b61e87c1644a860fb7f Mon Sep 17 00:00:00 2001 From: "Luiz M. Faria" Date: Mon, 7 Sep 2020 14:52:09 +0200 Subject: [PATCH 2/4] make functions work with Float32 --- lap3dkernel/julia/lap3dkernel.jl | 82 +++++++++++++++++--------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/lap3dkernel/julia/lap3dkernel.jl b/lap3dkernel/julia/lap3dkernel.jl index 674ec13..cfa9e24 100644 --- a/lap3dkernel/julia/lap3dkernel.jl +++ b/lap3dkernel/julia/lap3dkernel.jl @@ -15,17 +15,15 @@ using Printf using BenchmarkTools using Base.Threads using SIMD +using VectorizationBase: pick_vector_width # this is useless, apparently: #using Devectorize -function eye(n) - return Matrix(1.0I,n,n) -end - function lap3dcharge(y,q,x) # vec over targs + T = eltype(y) nt = size(x,2) - pot = zeros(1,nt) # note zeros(nt), col vec, fails to add r later - prefac = 1/(4*pi) + pot = zeros(T,1,nt) # note zeros(nt), col vec, fails to add r later + prefac = T(1/(4*pi)) for j in eachindex(q) # loop over srcs r2 = sum((x .- y[:,j]).^2,dims=1) # sq dist r = sqrt.(r2) @@ -35,11 +33,12 @@ function lap3dcharge(y,q,x) # vec over targs end function lap3dcharge_devec(y,q,x) # unwrap i,j. C-style coding, SIMD + T = eltype(y) nt = size(x,2) ns = size(y,2) - pot = zeros(1,nt) # note zeros(nt), col vec, fails to add r later + pot = zeros(T,1,nt) # note zeros(nt), col vec, fails to add r later prefac = 1/(4*pi) -@inbounds for i = 1:nt # targs + @inbounds for i = 1:nt # targs @simd for j = 1:ns # srcs #r2ij = sum((x[:,i] - y[:,j]).^2) # sq dist - terrible! r2ij = (x[1,i]-y[1,j])^2+ (x[2,i]-y[2,j])^2+ (x[3,i]-y[3,j])^2 @@ -51,8 +50,9 @@ function lap3dcharge_devec(y,q,x) # unwrap i,j. C-style coding, SIMD end function lap3dcharge_devec_par(y,q,x) # multi-threaded version of above + T = eltype(y) nt = size(x,2) - pot = zeros(nt) # note zeros(nt), col vec, fails to add r later + pot = zeros(T,nt) # note zeros(nt), col vec, fails to add r later prefac = 1/(4*pi) @threads for i in eachindex(pot) # targs. @simd for j in eachindex(q) # srcs @@ -99,36 +99,40 @@ function _inner_loop(Xt_vec::T,Yt_vec::T,Zt_vec::T,t,xs,ys,zs,q) where {T} return pot_vec end -ns = 10000 -nt = 10000 -x = rand(3,nt) -y = rand(3,ns) -q = randn(ns) # charges -t = @elapsed lap3dcharge(y,q,x) # discards return value -t = @elapsed lap3dcharge(y,q,x) # discards return value - is already compiled -#pot, t = @timed lap3dcharge(y,q,x) # gets the time in secs -check = lap3dcharge(y,q,x) |> sum -@printf("targ-vec: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) -# 0.06 Gpair/s, ie 5x slower than devec - -lap3dcharge_devec(y,q,x) # compile it? -t = @elapsed lap3dcharge_devec(y,q,x) # discards return value -t = @elapsed lap3dcharge_devec(y,q,x) # discards return value -check = lap3dcharge_devec(y,q,x) |> sum -@printf("devec: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) -# best, single-threaded, 0.33 Gpair/s (py+numba was 1.3 since multithreaded) - -t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value -t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value -check = lap3dcharge_devec_par(y,q,x) |> sum -@printf("devec par: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) -# 1.26 Gpair/s, matches py+numba - -t = @elapsed lap3dcharge_devec_par_new(y,q,x) # discards return value -t = @elapsed lap3dcharge_devec_par_new(y,q,x) # discards return value -check = lap3dcharge_devec_par_new(y,q,x) |> sum -@printf("devec par new: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) - +for T = (Float32,Float64) # element type (e.g. Float64 or Float32) + N = pick_vector_width(T) + V = Vec{N,T} + ns = 10000 + nt = 10000 + x = rand(T,3,nt) + y = rand(T,3,ns) + q = randn(T,ns) # charges + t = @elapsed lap3dcharge(y,q,x) # discards return value + t = @elapsed lap3dcharge(y,q,x) # discards return value - is already compiled + #pot, t = @timed lap3dcharge(y,q,x) # gets the time in secs + check = lap3dcharge(y,q,x) |> sum + @printf("Result with type %s: \n",T) + @printf("targ-vec: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) + # 0.06 Gpair/s, ie 5x slower than devec + + lap3dcharge_devec(y,q,x) # compile it? + t = @elapsed lap3dcharge_devec(y,q,x) # discards return value + t = @elapsed lap3dcharge_devec(y,q,x) # discards return value + check = lap3dcharge_devec(y,q,x) |> sum + @printf("devec: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) + # best, single-threaded, 0.33 Gpair/s (py+numba was 1.3 since multithreaded) + + t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value + t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value + check = lap3dcharge_devec_par(y,q,x) |> sum + @printf("devec par: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) + # 1.26 Gpair/s, matches py+numba + + t = @elapsed lap3dcharge_devec_par_new(y,q,x,V) # discards return value + t = @elapsed lap3dcharge_devec_par_new(y,q,x,V) # discards return value + check = lap3dcharge_devec_par_new(y,q,x,V) |> sum + @printf("devec par new: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) +end # S. Johnson chat: use benchmark tools. # @btime lap3dcharge_devec_par($y,$q,$x); From 56c0698700e68ca2582316227e44e22db81d3c4e Mon Sep 17 00:00:00 2001 From: "Luiz M. Faria" Date: Mon, 7 Sep 2020 14:55:45 +0200 Subject: [PATCH 3/4] modify command line options to julia program --- lap3dkernel/run_all_tests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lap3dkernel/run_all_tests.sh b/lap3dkernel/run_all_tests.sh index 1b371b5..ee8b70e 100755 --- a/lap3dkernel/run_all_tests.sh +++ b/lap3dkernel/run_all_tests.sh @@ -18,9 +18,9 @@ echo FORTRAN: #(cd fortran; ifort lap3dpottest.f90 -qopenmp -fPIC -O3 -march=native -funroll-loops -mkl; ./a.out) -echo JULIA: +echo JULIA with $(nproc) processors: # note you'll need to import some Pkgs into your julia distro -JULIA_NUM_THREADS=$(nproc) julia julia/lap3dkernel.jl +JULIA_NUM_THREADS=$(nproc) julia --check-bounds=no -O3 julia/lap3dkernel.jl echo PYTHON: # assumes python is your v3 version w/ numba, eg via: source activate idp From 7d7f4c507d4e447051102b3b0c4b65d96076bcfb Mon Sep 17 00:00:00 2001 From: "Luiz M. Faria" Date: Mon, 7 Sep 2020 15:23:58 +0200 Subject: [PATCH 4/4] cleanup + some comments --- lap3dkernel/julia/lap3dkernel.jl | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/lap3dkernel/julia/lap3dkernel.jl b/lap3dkernel/julia/lap3dkernel.jl index cfa9e24..d6b1bff 100644 --- a/lap3dkernel/julia/lap3dkernel.jl +++ b/lap3dkernel/julia/lap3dkernel.jl @@ -14,10 +14,8 @@ using LinearAlgebra using Printf using BenchmarkTools using Base.Threads -using SIMD -using VectorizationBase: pick_vector_width -# this is useless, apparently: -#using Devectorize +using SIMD # helps when "manual" vectorization is needed +using VectorizationBase: pick_vector_width # use to to determine appropriate size of register vector function lap3dcharge(y,q,x) # vec over targs T = eltype(y) @@ -64,7 +62,7 @@ function lap3dcharge_devec_par(y,q,x) # multi-threaded version of above return pot end -function lap3dcharge_devec_par_new(y,q,x,V=Vec{4,Float64}) # multi-threaded version of above +function lap3dcharge_devec_par_new(y,q,x,V=Vec{4,Float64}) xt,yt,zt = x[1,:],x[2,:],x[3,:] xs,ys,zs = y[1,:],y[2,:],y[3,:] lap3dcharge_devec_par_new(xs,ys,zs,xt,yt,zt,q,V) @@ -99,9 +97,9 @@ function _inner_loop(Xt_vec::T,Yt_vec::T,Zt_vec::T,t,xs,ys,zs,q) where {T} return pot_vec end -for T = (Float32,Float64) # element type (e.g. Float64 or Float32) - N = pick_vector_width(T) - V = Vec{N,T} +for T in (Float32,Float64) # element type (e.g. Float64 or Float32) + N = pick_vector_width(T) # number of elements of type T that can be stored on the vector register + V = Vec{N,T} # SIMD.jl type ns = 10000 nt = 10000 x = rand(T,3,nt) @@ -109,7 +107,6 @@ for T = (Float32,Float64) # element type (e.g. Float64 or Float32) q = randn(T,ns) # charges t = @elapsed lap3dcharge(y,q,x) # discards return value t = @elapsed lap3dcharge(y,q,x) # discards return value - is already compiled - #pot, t = @timed lap3dcharge(y,q,x) # gets the time in secs check = lap3dcharge(y,q,x) |> sum @printf("Result with type %s: \n",T) @printf("targ-vec: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9)