Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements to Julia version of lap3dkernel #7

Merged
merged 4 commits into from
Sep 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 76 additions & 30 deletions lap3dkernel/julia/lap3dkernel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,14 @@ using LinearAlgebra
using Printf
using BenchmarkTools
using Base.Threads
# this is useless, apparently:
#using Devectorize

function eye(n)
return Matrix(1.0I,n,n)
end
using SIMD # helps when "manual" vectorization is needed
using VectorizationBase: pick_vector_width # use to to determine appropriate size of register vector

function lap3dcharge(y,q,x) # vec over targs
T = eltype(y)
nt = size(x,2)
pot = zeros(1,nt) # note zeros(nt), col vec, fails to add r later
prefac = 1/(4*pi)
pot = zeros(T,1,nt) # note zeros(nt), col vec, fails to add r later
prefac = T(1/(4*pi))
for j in eachindex(q) # loop over srcs
r2 = sum((x .- y[:,j]).^2,dims=1) # sq dist
r = sqrt.(r2)
Expand All @@ -34,11 +31,12 @@ function lap3dcharge(y,q,x) # vec over targs
end

function lap3dcharge_devec(y,q,x) # unwrap i,j. C-style coding, SIMD
T = eltype(y)
nt = size(x,2)
ns = size(y,2)
pot = zeros(1,nt) # note zeros(nt), col vec, fails to add r later
pot = zeros(T,1,nt) # note zeros(nt), col vec, fails to add r later
prefac = 1/(4*pi)
@inbounds for i = 1:nt # targs
@inbounds for i = 1:nt # targs
@simd for j = 1:ns # srcs
#r2ij = sum((x[:,i] - y[:,j]).^2) # sq dist - terrible!
r2ij = (x[1,i]-y[1,j])^2+ (x[2,i]-y[2,j])^2+ (x[3,i]-y[3,j])^2
Expand All @@ -50,8 +48,9 @@ function lap3dcharge_devec(y,q,x) # unwrap i,j. C-style coding, SIMD
end

function lap3dcharge_devec_par(y,q,x) # multi-threaded version of above
T = eltype(y)
nt = size(x,2)
pot = zeros(nt) # note zeros(nt), col vec, fails to add r later
pot = zeros(T,nt) # note zeros(nt), col vec, fails to add r later
prefac = 1/(4*pi)
@threads for i in eachindex(pot) # targs.
@simd for j in eachindex(q) # srcs
Expand All @@ -62,28 +61,75 @@ function lap3dcharge_devec_par(y,q,x) # multi-threaded version of above
end
return pot
end

function lap3dcharge_devec_par_new(y,q,x,V=Vec{4,Float64})
xt,yt,zt = x[1,:],x[2,:],x[3,:]
xs,ys,zs = y[1,:],y[2,:],y[3,:]
lap3dcharge_devec_par_new(xs,ys,zs,xt,yt,zt,q,V)
end

function lap3dcharge_devec_par_new(xs,ys,zs,xt,yt,zt,q,V)
T = eltype(xs)
@assert length(xs) == length(ys) == length(zs)
@assert length(xt) == length(xt) == length(zt)
ns,nt = length(xs), length(xt)
pot = zeros(T,nt) # note zeros(nt), col vec, fails to add r later
@threads for t in 1:length(V):nt
Xt_vec = vload(V,xt,t)
Yt_vec = vload(V,yt,t)
Zt_vec = vload(V,zt,t)
pot_vec = _inner_loop(Xt_vec,Yt_vec,Zt_vec,t,xs,ys,zs,q)
vstore(pot_vec,pot,t)
end
return pot
end

function _inner_loop(Xt_vec::T,Yt_vec::T,Zt_vec::T,t,xs,ys,zs,q) where {T}
prefac = 1/(4*pi)
pot_vec = zero(T)
for s in eachindex(xs)
dx = xs[s] - Xt_vec
dy = ys[s] - Yt_vec
dz = zs[s] - Zt_vec
r2 = dx*dx + dy*dy + dz*dz
pot_vec += prefac * q[s] / sqrt(r2) #rij
end
return pot_vec
end

ns = 10000
nt = 10000
x = rand(3,nt)
y = rand(3,ns)
q = randn(ns) # charges
t = @elapsed lap3dcharge(y,q,x) # discards return value
t = @elapsed lap3dcharge(y,q,x) # discards return value - is already compiled
#pot, t = @timed lap3dcharge(y,q,x) # gets the time in secs
@printf("targ-vec: %d src-targ pairs in %.3g s: %.3g Gpair/s\n",ns*nt,t,ns*nt/t/1e9)
# 0.06 Gpair/s, ie 5x slower than devec
for T in (Float32,Float64) # element type (e.g. Float64 or Float32)
N = pick_vector_width(T) # number of elements of type T that can be stored on the vector register
V = Vec{N,T} # SIMD.jl type
ns = 10000
nt = 10000
x = rand(T,3,nt)
y = rand(T,3,ns)
q = randn(T,ns) # charges
t = @elapsed lap3dcharge(y,q,x) # discards return value
t = @elapsed lap3dcharge(y,q,x) # discards return value - is already compiled
check = lap3dcharge(y,q,x) |> sum
@printf("Result with type %s: \n",T)
@printf("targ-vec: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9)
# 0.06 Gpair/s, ie 5x slower than devec

lap3dcharge_devec(y,q,x) # compile it?
t = @elapsed lap3dcharge_devec(y,q,x) # discards return value
t = @elapsed lap3dcharge_devec(y,q,x) # discards return value
check = lap3dcharge_devec(y,q,x) |> sum
@printf("devec: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9)
# best, single-threaded, 0.33 Gpair/s (py+numba was 1.3 since multithreaded)

lap3dcharge_devec(y,q,x) # compile it?
t = @elapsed lap3dcharge_devec(y,q,x) # discards return value
t = @elapsed lap3dcharge_devec(y,q,x) # discards return value
@printf("devec: %d src-targ pairs in %.3g s: %.3g Gpair/s\n",ns*nt,t,ns*nt/t/1e9)
# best, single-threaded, 0.33 Gpair/s (py+numba was 1.3 since multithreaded)
t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value
t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value
check = lap3dcharge_devec_par(y,q,x) |> sum
@printf("devec par: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9)
# 1.26 Gpair/s, matches py+numba

t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value
t = @elapsed lap3dcharge_devec_par(y,q,x) # discards return value
@printf("devec par: %d src-targ pairs in %.3g s: %.3g Gpair/s\n",ns*nt,t,ns*nt/t/1e9)
# 1.26 Gpair/s, matches py+numba
t = @elapsed lap3dcharge_devec_par_new(y,q,x,V) # discards return value
t = @elapsed lap3dcharge_devec_par_new(y,q,x,V) # discards return value
check = lap3dcharge_devec_par_new(y,q,x,V) |> sum
@printf("devec par new: %d src-targ pairs, ans: %f \n \t time %.3g s %.3g Gpair/s\n",ns*nt,check,t,ns*nt/t/1e9)
end

# S. Johnson chat: use benchmark tools.
# @btime lap3dcharge_devec_par($y,$q,$x);
Expand Down
4 changes: 2 additions & 2 deletions lap3dkernel/run_all_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ echo FORTRAN:
#(cd fortran; ifort lap3dpottest.f90 -qopenmp -fPIC -O3 -march=native -funroll-loops -mkl; ./a.out)


echo JULIA:
echo JULIA with $(nproc) processors:
# note you'll need to import some Pkgs into your julia distro
JULIA_NUM_THREADS=$(nproc) julia julia/lap3dkernel.jl
JULIA_NUM_THREADS=$(nproc) julia --check-bounds=no -O3 julia/lap3dkernel.jl

echo PYTHON:
# assumes python is your v3 version w/ numba, eg via: source activate idp
Expand Down