diff --git a/lap3dkernel/julia/lap3dkernel.jl b/lap3dkernel/julia/lap3dkernel.jl index b8f859c..d6b1bff 100644 --- a/lap3dkernel/julia/lap3dkernel.jl +++ b/lap3dkernel/julia/lap3dkernel.jl @@ -14,17 +14,14 @@ using LinearAlgebra using Printf using BenchmarkTools using Base.Threads -# this is useless, apparently: -#using Devectorize - -function eye(n) - return Matrix(1.0I,n,n) -end +using SIMD # helps when "manual" vectorization is needed +using VectorizationBase: pick_vector_width # use to to determine appropriate size of register vector function lap3dcharge(y,q,x) # vec over targs + T = eltype(y) nt = size(x,2) - pot = zeros(1,nt) # note zeros(nt), col vec, fails to add r later - prefac = 1/(4*pi) + pot = zeros(T,1,nt) # note zeros(nt), col vec, fails to add r later + prefac = T(1/(4*pi)) for j in eachindex(q) # loop over srcs r2 = sum((x .- y[:,j]).^2,dims=1) # sq dist r = sqrt.(r2) @@ -34,11 +31,12 @@ function lap3dcharge(y,q,x) # vec over targs end function lap3dcharge_devec(y,q,x) # unwrap i,j. C-style coding, SIMD + T = eltype(y) nt = size(x,2) ns = size(y,2) - pot = zeros(1,nt) # note zeros(nt), col vec, fails to add r later + pot = zeros(T,1,nt) # note zeros(nt), col vec, fails to add r later prefac = 1/(4*pi) -@inbounds for i = 1:nt # targs + @inbounds for i = 1:nt # targs @simd for j = 1:ns # srcs #r2ij = sum((x[:,i] - y[:,j]).^2) # sq dist - terrible! r2ij = (x[1,i]-y[1,j])^2+ (x[2,i]-y[2,j])^2+ (x[3,i]-y[3,j])^2 @@ -50,8 +48,9 @@ function lap3dcharge_devec(y,q,x) # unwrap i,j. C-style coding, SIMD end function lap3dcharge_devec_par(y,q,x) # multi-threaded version of above + T = eltype(y) nt = size(x,2) - pot = zeros(nt) # note zeros(nt), col vec, fails to add r later + pot = zeros(T,nt) # note zeros(nt), col vec, fails to add r later prefac = 1/(4*pi) @threads for i in eachindex(pot) # targs. @simd for j in eachindex(q) # srcs @@ -62,28 +61,75 @@ function lap3dcharge_devec_par(y,q,x) # multi-threaded version of above end return pot end + +function lap3dcharge_devec_par_new(y,q,x,V=Vec{4,Float64}) + xt,yt,zt = x[1,:],x[2,:],x[3,:] + xs,ys,zs = y[1,:],y[2,:],y[3,:] + lap3dcharge_devec_par_new(xs,ys,zs,xt,yt,zt,q,V) +end + +function lap3dcharge_devec_par_new(xs,ys,zs,xt,yt,zt,q,V) + T = eltype(xs) + @assert length(xs) == length(ys) == length(zs) + @assert length(xt) == length(xt) == length(zt) + ns,nt = length(xs), length(xt) + pot = zeros(T,nt) # note zeros(nt), col vec, fails to add r later + @threads for t in 1:length(V):nt + Xt_vec = vload(V,xt,t) + Yt_vec = vload(V,yt,t) + Zt_vec = vload(V,zt,t) + pot_vec = _inner_loop(Xt_vec,Yt_vec,Zt_vec,t,xs,ys,zs,q) + vstore(pot_vec,pot,t) + end + return pot +end + +function _inner_loop(Xt_vec::T,Yt_vec::T,Zt_vec::T,t,xs,ys,zs,q) where {T} + prefac = 1/(4*pi) + pot_vec = zero(T) + for s in eachindex(xs) + dx = xs[s] - Xt_vec + dy = ys[s] - Yt_vec + dz = zs[s] - Zt_vec + r2 = dx*dx + dy*dy + dz*dz + pot_vec += prefac * q[s] / sqrt(r2) #rij + end + return pot_vec +end -ns = 10000 -nt = 10000 -x = rand(3,nt) -y = rand(3,ns) -q = randn(ns) # charges -t = @elapsed lap3dcharge(y,q,x) # discards return value -t = @elapsed lap3dcharge(y,q,x) # discards return value - is already compiled -#pot, t = @timed lap3dcharge(y,q,x) # gets the time in secs -@printf("targ-vec: %d src-targ pairs in %.3g s: %.3g Gpair/s\n",ns*nt,t,ns*nt/t/1e9) -# 0.06 Gpair/s, ie 5x slower than devec +for T in (Float32,Float64) # element type (e.g. Float64 or Float32) + N = pick_vector_width(T) # number of elements of type T that can be stored on the vector register + V = Vec{N,T} # SIMD.jl type + ns = 10000 + nt = 10000 + x = rand(T,3,nt) + y = rand(T,3,ns) + q = randn(T,ns) # charges + t = @elapsed lap3dcharge(y,q,x) # discards return value + t = @elapsed lap3dcharge(y,q,x) # discards return value - is already compiled + check = lap3dcharge(y,q,x) |> sum + @printf("Result with type %s: \n",T) + @printf("targ-vec: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) + # 0.06 Gpair/s, ie 5x slower than devec + + lap3dcharge_devec(y,q,x) # compile it? + t = @elapsed lap3dcharge_devec(y,q,x) # discards return value + t = @elapsed lap3dcharge_devec(y,q,x) # discards return value + check = lap3dcharge_devec(y,q,x) |> sum + @printf("devec: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) + # best, single-threaded, 0.33 Gpair/s (py+numba was 1.3 since multithreaded) -lap3dcharge_devec(y,q,x) # compile it? -t = @elapsed lap3dcharge_devec(y,q,x) # discards return value -t = @elapsed lap3dcharge_devec(y,q,x) # discards return value -@printf("devec: %d src-targ pairs in %.3g s: %.3g Gpair/s\n",ns*nt,t,ns*nt/t/1e9) -# best, single-threaded, 0.33 Gpair/s (py+numba was 1.3 since multithreaded) + t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value + t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value + check = lap3dcharge_devec_par(y,q,x) |> sum + @printf("devec par: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) + # 1.26 Gpair/s, matches py+numba -t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value -t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value -@printf("devec par: %d src-targ pairs in %.3g s: %.3g Gpair/s\n",ns*nt,t,ns*nt/t/1e9) -# 1.26 Gpair/s, matches py+numba + t = @elapsed lap3dcharge_devec_par_new(y,q,x,V) # discards return value + t = @elapsed lap3dcharge_devec_par_new(y,q,x,V) # discards return value + check = lap3dcharge_devec_par_new(y,q,x,V) |> sum + @printf("devec par new: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9) +end # S. Johnson chat: use benchmark tools. # @btime lap3dcharge_devec_par($y,$q,$x); diff --git a/lap3dkernel/run_all_tests.sh b/lap3dkernel/run_all_tests.sh index 1b371b5..ee8b70e 100755 --- a/lap3dkernel/run_all_tests.sh +++ b/lap3dkernel/run_all_tests.sh @@ -18,9 +18,9 @@ echo FORTRAN: #(cd fortran; ifort lap3dpottest.f90 -qopenmp -fPIC -O3 -march=native -funroll-loops -mkl; ./a.out) -echo JULIA: +echo JULIA with $(nproc) processors: # note you'll need to import some Pkgs into your julia distro -JULIA_NUM_THREADS=$(nproc) julia julia/lap3dkernel.jl +JULIA_NUM_THREADS=$(nproc) julia --check-bounds=no -O3 julia/lap3dkernel.jl echo PYTHON: # assumes python is your v3 version w/ numba, eg via: source activate idp