diff --git a/README.md b/README.md index e472c2c..66a3b3b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # MHIT36 -Code for direct numerical simulation of Navier-Stokes equation coupled with a phase-field method (ACDI) for interface description. +Code for direct numerical simulation of Navier-Stokes equations coupled with a phase-field method (ACDI) for interface description. ~~~text ███ ███ ██ ██ ██ ████████ ██████ ██████ @@ -17,7 +17,7 @@ If you use this code, please cite the following work: @article{roccon2025, title = {MHIT36: A Phase-Field Code for Gpu Simulations of Multiphase Homogeneous Isotropic Turbulence}, author = {Roccon, A. and Enzenberger, L. and Zaza, D. and Soldati, A.}, - journal = {SSRN}, + journal = {Computer Physics Communications (in press)}, year = {2025}, doi = {http://dx.doi.org/10.2139/ssrn.5264052} } @@ -58,15 +58,36 @@ If you use this code, please cite the following work: ## Reference performance -Performance (NS only) - from last push -* 512 x 512 x 512 | 4 x A100@Leonardo | 19 ms/timestep -* 1024 x 1024 x 1024 | 4 x GH200@Nvidia | 161.9 ms/timestep (out of date) -* 1024 x 1024 x 1024 | 8 x A100@Leonardo | 150.1 ms/timestep -* 512 x 512 x 512 | 4 x H100@MN5-ACC | 18 ms/timestep (out of date) +Performance (NS only) +* 256 x 256 x 256 | 4 x A100@Leonardo | 16 ms/timestep +* 512 x 512 x 512 | 4 x A100@Leonardo | 270 ms/timestep +* 1024 x 1024 x 1024 | 32 x A100@Leonardo | 357 ms/timestep +* 2048 x 2048 x 2048 | 128 x A100@Leonardo | 820 ms/timestep +* 256 x 256 x 256 | 4 x H100@MN5-ACC | 13 ms/timestep +* 512 x 512 x 512 | 4 x H100@MN5-ACC | 230 ms/timestep +* 1024 x 1024 x 1024 | 32 x H100@MN5-ACC | 320 ms/timestep +* 2048 x 2048 x 2048 | 512 x H100@MN5-ACC | 259 ms/timestep -Performance (NS + ACDI) - from last push -* 512 x 512 x 512 | 4 x A100@Leonardo | 28 ms/timestep -* 1024 x 1024 x 1024 | 8 x A100@Leonardo | 288 ms/timestep +Phase-field introduces about 15% of overhead compared to NS only. + +## Scaling + +Strong scaling results obtained on Leonardo (4 x A100 64 GB x node) and MN5 (4 x H100 40 GB x node) +* Tested from 1 node up to 128 nodes (Leonardo) +* Tested from 1 node up to 256 nodes (MN5-ACC) +* Grid from 64 x 64 x 64 up to 4096 x 4096 x 4096 +* Very similar scaling for both NS and NS+ACDI + +![Scal](val/scaling.png) + + +## Validation + +Benchamrk present in "W.M.VanRees, A.Leonard, D.Pullin, P.Koumoutsakos, A comparison of vortex and pseudo-spectral methods for the simulation of periodic vortical flows at high Reynolds numbers,J. Comput. Phys.2 30(8)(2011)2794–2805" and also Used in CaNS. + +Time evolution of the viscous dissipation: + +![Test](val/val.png) ## Contributing diff --git a/make_lib_leo.sh b/make_lib_leo.sh index c942f50..530055f 100755 --- a/make_lib_leo.sh +++ b/make_lib_leo.sh @@ -6,4 +6,6 @@ module load profile/candidate module load nvhpc/25.3 module load hpcx-mpi/2.19 cmake .. +#enable nvshmem +#cmake -DCUDECOMP_BUILD_EXTRAS=1 -DCUDECOMP_ENABLE_NVSHMEM=1 .. make -j diff --git a/multi/Makefile_local b/multi/Makefile_local index d928029..fc154b8 100644 --- a/multi/Makefile_local +++ b/multi/Makefile_local @@ -4,7 +4,7 @@ LD = $(FC) RM = /bin/rm -f # Paths (Modify if necessary) -CUDECOMP_DIR = /home/milton/MHIT36_cuDecomp/cuDecomp/build/ +CUDECOMP_DIR = /home/milton/MHIT36/cuDecomp/build/ # Compiler and Linker Flags FFLAGS = -fast -acc -gpu=managed -Minfo=accel -Mfree -Mpreprocess -cpp -Minfo=accel -cuda -I$(CUDECOMP_DIR)/include diff --git a/multi/PoissonExampleD2Z.f90 b/multi/PoissonExampleD2Z.f90 deleted file mode 100755 index a8b4820..0000000 --- a/multi/PoissonExampleD2Z.f90 +++ /dev/null @@ -1,339 +0,0 @@ -!Optimized verison compared to the stanard one as it use D2Z in the first step -! This avoid transpostions with nx x ny x nz complex and instead use nx/2+1 x ny x nz - - -#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1) -!#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) exit(1) - -! Solves poisson equation -! -! u_xx + u_yy + u_zz= phi(x,y,z) -! -! where -! -! phi(x,y,z) = sin(Mx*x)*sin(My*y)*sin(Mz*z) -! -! on domain 0 <= x <= 2*pi, 0 <= y <= 2*pi, 0 <= z <= 2*pi - -program main - use cudafor - use cudecomp - use cufft - use mpi - - implicit none - - ! Command line arguments - ! grid dimensions - integer :: nx, ny, nz - integer :: comm_backend - integer :: pr, pc - - ! grid size - real(8), parameter :: Lx = 1.0, Ly = 1.0, Lz = 1.0 - ! modes for phi - integer, parameter :: Mx = 1, My = 2, Mz = 1 - real(8), parameter :: twopi = 8.0_8*atan(1.0_8) - real(8) :: err, maxErr = -1.0, times, timef - character(len=40) :: namefile - - - ! grid - real(8), allocatable :: x(:) - real(8) :: hx, hy, hz - ! wavenumbers - real(8), allocatable :: kx(:) - real(8), device, allocatable :: kx_d(:) - - real(8), allocatable :: phi(:), ua(:,:,:), rhsp(:,:,:), p(:,:,:) - !complex(8), allocatable :: rhspc(:,:,:) - complex(8), device, allocatable :: phi_d(:) - complex(8), pointer, device, contiguous :: work_d(:) - - ! MPI - integer :: rank, ranks, ierr - integer :: localRank, localComm - - ! cudecomp - type(cudecompHandle) :: handle - type(cudecompGridDesc) :: grid_desc - type(cudecompGridDescConfig) :: config - type(cudecompGridDescAutotuneOptions) :: options - - integer :: pdims(2) - integer :: gdims(3) - integer :: npx, npy, npz - type(cudecompPencilInfo) :: piX, piY, piZ - integer(8) :: nElemX, nElemY, nElemZ, nElemWork - - ! CUFFT - integer :: planX, planY, planZ, planXF, planXB - integer :: batchsize - integer :: status - - logical :: skip_next - character(len=16) :: arg - integer :: i, j ,k , jl, kl, jg, kg - - - ! MPI initialization - - call mpi_init(ierr) - if (ierr /= MPI_SUCCESS) write(*,*) 'mpi_init failed: ', ierr - call mpi_comm_rank(MPI_COMM_WORLD, rank, ierr) - if (ierr /= MPI_SUCCESS) write(*,*) 'mpi_comm_rank failed: ', ierr - call mpi_comm_size(MPI_COMM_WORLD, ranks, ierr) - if (ierr /= MPI_SUCCESS) write(*,*) 'mpi_comm_size failed: ', ierr - - call mpi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, localComm, ierr) - if (ierr /= MPI_SUCCESS) write(*,*) 'mpi_comm_split_type failed: ', ierr - call mpi_comm_rank(localComm, localRank, ierr) - if (ierr /= MPI_SUCCESS) write(*,*) 'mpi_comm_rank on local rank failed: ', ierr - ierr = cudaSetDevice(localRank) - - ! Parse command-line arguments - nx = 64 - ny = nx - nz = nx - pr = 2 - pc = 1 - comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P - - - ! cudecomp initialization - - CHECK_CUDECOMP_EXIT(cudecompInit(handle, MPI_COMM_WORLD)) - - CHECK_CUDECOMP_EXIT(cudecompGridDescConfigSetDefaults(config)) - pdims = [pr, pc] - config%pdims = pdims - gdims = [ nx/2+1, ny, nz] - config%gdims = gdims - config%transpose_comm_backend = comm_backend - config%transpose_axis_contiguous = .true. - - CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options)) - options%dtype = CUDECOMP_DOUBLE_COMPLEX - if (comm_backend == 0) then - options%autotune_transpose_backend = .true. - endif - - CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_desc, config, options)) - - if (rank == 0) then - write(*,"('Running on ', i0, ' x ', i0, ' process grid ...')") config%pdims(1), config%pdims(2) - write(*,"('Using ', a, ' backend ...')") cudecompTransposeCommBackendToString(config%transpose_comm_backend) - end if - - ! get pencil info - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piX, 1)) - nElemX = piX%size - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piY, 2)) - nElemY = piY%size - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piZ, 3)) - nElemZ = piZ%size - - ! get workspace size - CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_desc, nElemWork)) - - ! CUFFT initialization - - batchSize = piX%shape(2)*piX%shape(3) - status = cufftPlan1D(planX, nx, CUFFT_Z2Z, batchSize) - if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan' - - batchSize = piX%shape(2)*piX%shape(3) - status = cufftPlan1D(planXF, nx, CUFFT_D2Z, batchSize) - if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X-forward plan' - - batchSize = piX%shape(2)*piX%shape(3) - status = cufftPlan1D(planXB, nx, CUFFT_Z2D, batchSize) - if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X-backward plan' - - batchSize = piY%shape(2)*piY%shape(3) - status = cufftPlan1D(planY, ny, CUFFT_Z2Z, batchSize) - if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Y plan' - - batchSize = piZ%shape(2)*piZ%shape(3) - status = cufftPlan1D(planZ, nz, CUFFT_Z2Z, batchSize) - if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Z plan' - - - ! Physical grid - - allocate(x(nx)) - - hx = twopi/nx - do i = 1, nx - x(i) = hx*i - enddo - ! Wavenumbers - - allocate(kx(nx)) - - do i = 1, nx/2 - kx(i) = (i-1) - enddo - do i = nx/2+1, nx - kx(i) = (i-1-nx) - enddo - allocate(kx_d, source=kx) - - ! allocate arrays - - allocate(phi(max(nElemX, nElemY, nElemZ))) - allocate(phi_d, mold=phi) - allocate(ua(nx, piX%shape(2), piX%shape(3))) - allocate(rhsp(nx, piX%shape(2), piX%shape(3))) - !allocate(rhspc(nx, piX%shape(2), piX%shape(3))) - allocate(p(nx, piX%shape(2), piX%shape(3))) - - - CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_d, nElemWork)) - - !initialize rigth hand side + analytic solution - do kl = 1, piX%shape(3) - kg = piX%lo(3) + kl - 1 - do jl = 1, piX%shape(2) - jg = piX%lo(2) + jl - 1 - do i = 1, nx - rhsp(i,jl,kl) = sin(Mx*x(i))*sin(My*x(jg))*sin(Mz*x(kg)) - !rhspc(i,jl,kl) = cmplx(rhsp(i,jl,kl),0.d0) - ua(i,jl,kl) = -rhsp(i,jl,kl)/(Mx**2 + My**2 + Mz**2) - enddo - enddo - enddo - - write(namefile,'(a,i3.3,a)') 'rhsp_',rank,'.dat' - open(unit=55,file=namefile,form='unformatted',position='append',access='stream',status='new') - write(55) rhsp - close(55) - - call cpu_time(times) - - ! phi(x,y,z) -> phi(kx,y,z) - !$acc host_data use_device(rhsp) - status = cufftExecD2Z(planXF, rhsp, phi_d) - if (status /= CUFFT_SUCCESS) write(*,*) 'X forward error: ', status - !$acc end host_data - ! phi(kx,y,z) -> phi(y,z,kx) - CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_desc, phi_d, phi_d, work_d, CUDECOMP_DOUBLE_COMPLEX)) - ! phi(y,z,kx) -> phi(ky,z,kx) - status = cufftExecZ2Z(planY, phi_d, phi_d, CUFFT_FORWARD) - if (status /= CUFFT_SUCCESS) write(*,*) 'Y forward error: ', status - ! phi(ky,z,kx) -> phi(z,kx,ky) - CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_desc, phi_d, phi_d, work_d, CUDECOMP_DOUBLE_COMPLEX)) - ! phi(z,kx,ky) -> phi(kz,kx,ky) - status = cufftExecZ2Z(planZ, phi_d, phi_d, CUFFT_FORWARD) - if (status /= CUFFT_SUCCESS) write(*,*) 'Z forward error: ', status - - - block - complex(8), device, pointer :: phi3d(:,:,:) - real(8) :: k2 - integer :: il, jl, ig, jg - integer :: offsets(3), xoff, yoff - integer :: np(3) - np(piZ%order(1)) = piZ%shape(1) - np(piZ%order(2)) = piZ%shape(2) - np(piZ%order(3)) = piZ%shape(3) - call c_f_pointer(c_devloc(phi_d), phi3d, piZ%shape) - - - ! divide by -K**2, and normalize - - offsets(piZ%order(1)) = piZ%lo(1) - 1 - offsets(piZ%order(2)) = piZ%lo(2) - 1 - offsets(piZ%order(3)) = piZ%lo(3) - 1 - - xoff = offsets(1) - yoff = offsets(2) - npx = np(1) - npy = np(2) - !$cuf kernel do (2) - do jl = 1, npy - do il = 1, npx - jg = yoff + jl - ig = xoff + il - do k = 1, nz - k2 = kx_d(ig)**2 + kx_d(jg)**2 + kx_d(k)**2 - !phi3d(k,il,jl) = phi3d(k,il,jl)/(int(nx,8)*int(ny,8)*int(nz,8)) - phi3d(k,il,jl) = -phi3d(k,il,jl)/k2/(int(nx,8)*int(ny,8)*int(nz,8)) - enddo - enddo - enddo - - ! specify mean (corrects division by zero wavenumber above) - if (xoff == 0 .and. yoff == 0) phi3d(1,1,1) = 0.0 - - end block - - - ! phi(kz,kx,ky) -> phi(z,kx,ky) - status = cufftExecZ2Z(planZ, phi_d, phi_d, CUFFT_INVERSE) - if (status /= CUFFT_SUCCESS) write(*,*) 'Z inverse error: ', status - ! phi(z,kx,ky) -> phi(ky,z,kx) - CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_desc, phi_d, phi_d, work_d, CUDECOMP_DOUBLE_COMPLEX)) - ! phi(ky,z,kx) -> phi(y,z,kx) - status = cufftExecZ2Z(planY, phi_d, phi_d, CUFFT_INVERSE) - if (status /= CUFFT_SUCCESS) write(*,*) 'Y inverse error: ', status - ! phi(y,z,kx) -> phi(kx,y,z) - CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc, phi_d, phi_d, work_d, CUDECOMP_DOUBLE_COMPLEX)) - ! phi(kx,y,z) -> phi(x,y,z) - !$acc host_data use_device(p) - status = cufftExecZ2D(planXB, phi_d, p) - if (status /= CUFFT_SUCCESS) write(*,*) 'X inverse error: ', status - !$acc end host_data - - err=0.d0 - maxErr=-1.d0 - !$acc kernels - do kl = 1, piX%shape(3) - do jl = 1, piX%shape(2) - do i = 1, nx - err = abs(ua(i,jl,kl)-p(i,jl,kl)) - !rhsp(i,jl,kl)=real(rhspc(i,jl,kl)) - if (err > maxErr) maxErr = err - enddo - enddo - enddo - !$acc end kernels - - write(namefile,'(a,i3.3,a)') 'rhspo_',rank,'.dat' - open(unit=55,file=namefile,form='unformatted',position='append',access='stream',status='new') - write(55) p - close(55) - - write(namefile,'(a,i3.3,a)') 'ua_',rank,'.dat' - open(unit=55,file=namefile,form='unformatted',position='append',access='stream',status='new') - write(55) ua - close(55) - - - write(*,"('[', i0, '] Max Error: ', e12.6)") rank, maxErr - - ! cleanup - - !status = cufftDestroy(planX) - !if (status /= CUFFT_SUCCESS) write(*,*) 'X plan destroy: ', status - status = cufftDestroy(planXF) - if (status /= CUFFT_SUCCESS) write(*,*) 'XF plan destroy: ', status - status = cufftDestroy(planXB) - if (status /= CUFFT_SUCCESS) write(*,*) 'XB plan destroy: ', status - status = cufftDestroy(planY) - if (status /= CUFFT_SUCCESS) write(*,*) 'Y plan destroy: ', status - status = cufftDestroy(planZ) - if (status /= CUFFT_SUCCESS) write(*,*) 'Z plan destroy: ', status - - deallocate(x) - deallocate(kx) - deallocate(kx_d) - deallocate(phi, phi_d, ua) - - CHECK_CUDECOMP_EXIT(cudecompFree(handle, grid_desc, work_d)) - CHECK_CUDECOMP_EXIT(cudecompGridDescDestroy(handle, grid_desc)) - CHECK_CUDECOMP_EXIT(cudecompFinalize(handle)) - - call mpi_finalize(ierr) - -end program main \ No newline at end of file diff --git a/multi/go_leo.sh b/multi/go_leo.sh index 2be0e05..3bb0e99 100644 --- a/multi/go_leo.sh +++ b/multi/go_leo.sh @@ -25,7 +25,4 @@ echo "Using directory: $ROOT_DIR" export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH chmod 777 binder.sh -mpirun -np 4 --map-by node:PE=8 --rank-by core ./binder.sh ./mhit36 - -# mpirun -n 4 nsys profile --trace=cuda,nvtx,mpi -o profile_output_%q{SLURM_PROCID} --stats=true ./mhit36 - +mpirun -np 4 --map-by node:PE=8 --rank-by core ./binder.sh ./mhit36 \ No newline at end of file diff --git a/multi/go_leo_prof.sh b/multi/go_leo_prof.sh index 6368296..ebcb52e 100755 --- a/multi/go_leo_prof.sh +++ b/multi/go_leo_prof.sh @@ -49,5 +49,3 @@ mpirun -np 4 --map-by node:PE=8 --rank-by core nsys profile -t cuda,nvtx,mpi,op # for nsight compute report - all kernels + mapping + nic # mpirun -np 4 --map-by node:PE=8 --rank-by core ncu --kernel-name regex:main_ --set=full --import-source=yes --launch-skip 70 --launch-count 18 -o reportall.%p ./binder.sh ./mhit36 - - diff --git a/multi/local.sh b/multi/local.sh index 6b4e9dc..04116e8 100644 --- a/multi/local.sh +++ b/multi/local.sh @@ -4,7 +4,7 @@ MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/24.3/compilers/man; export MANPATH PATH=$NVCOMPILERS/$NVARCH/24.3/compilers/bin:$PATH; export PATH export PATH=$NVCOMPILERS/$NVARCH/24.3/comm_libs/mpi/bin:$PATH export MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/24.3/comm_libs/mpi/man -LD_LIBRARY_PATH=/home/milton/MHIT36_cuDecomp/cuDecomp/build/lib +LD_LIBRARY_PATH=/home/milton/MHIT36/cuDecomp/build/lib #clean folder output rm -rf output mkdir output diff --git a/multi/main.f90 b/multi/main.f90 index 04a2cdf..7b8b1ad 100755 --- a/multi/main.f90 +++ b/multi/main.f90 @@ -10,7 +10,6 @@ program main use param use mpivar use cudecompvar -use nvtx implicit none @@ -56,7 +55,7 @@ program main integer :: np(3) ! Enable or disable phase field (acceleration eneabled by default) -#define phiflag 1 +#define phiflag 0 !######################################################################################################################################## ! 1. INITIALIZATION OF MPI AND cuDECOMP AUTOTUNING : START @@ -137,15 +136,14 @@ program main options%halo_axis = 1 CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_desc, config, options)) -! Print information on configuration -!if (rank == 0) then -! write(*,"(' Running on ', i0, ' x ', i0, ' process grid ...')") config%pdims(1), config%pdims(2) -! write(*,"(' Using ', a, ' transpose backend ...')") & -! cudecompTransposeCommBackendToString(config%transpose_comm_backend) -! write(*,"(' Using ', a, ' halo backend ...')") & -! cudecompHaloCommBackendToString(config%halo_comm_backend) -!endif - +Print information on configuration +if (rank == 0) then + write(*,"(' Running on ', i0, ' x ', i0, ' process grid ...')") config%pdims(1), config%pdims(2) + write(*,"(' Using ', a, ' transpose backend ...')") & + cudecompTransposeCommBackendToString(config%transpose_comm_backend) + write(*,"(' Using ', a, ' halo backend ...')") & + cudecompHaloCommBackendToString(config%halo_comm_backend) +endif ! Get pencil info for the grid descriptor in the physical space ! This function returns a pencil struct (piX, piY or piZ) that contains the shape, global lower and upper index bounds (lo and hi), @@ -167,16 +165,6 @@ program main CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_desc, nElemWork)) CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_desc, 1, halo, nElemWork_halo)) - - - - - -! Get pencil info for the grid descriptor in the complex space -!gdims = [nx/2+1, ny, nz] -!config%gdims = gdims -!CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_descD2Z, config, options)) - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piX_d2z, 1,halo)) nElemX_d2z = piX_d2z%size !<- number of total elments in x-configuratiion (include halo) ! Pencil info in Y-configuration present in PiY @@ -191,9 +179,6 @@ program main - - - ! CUFFT initialization -- Create Plans ! Forward 1D FFT in X: D2Z batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFT (from x-pencil dimension) @@ -239,18 +224,17 @@ program main ! compute here the cos to avoid multiple computations of cos mycos(i)=cos(k0*x(i)+dx/2) enddo + !######################################################################################################################################## ! 1. INITIALIZATION AND cuDECOMP AUTOTUNING : END !######################################################################################################################################## - - - !######################################################################################################################################## ! START STEP 2: ALLOCATE ARRAYS !######################################################################################################################################## + ! allocate arrays allocate(psi(max(nElemX, nElemY, nElemZ))) !largest among the pencil allocate(psi_real(max(nElemX, nElemY, nElemZ))) !largest among the pencil @@ -270,11 +254,11 @@ program main allocate(div(piX%shape(1),piX%shape(2),piX%shape(3))) !PFM variables #if phiflag == 1 -allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3))) -allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3))) -allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3))) -allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3))) -allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces + allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi_o(piX%shape(1),piX%shape(2),piX%shape(3))) + allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3))) + allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3))) + allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3))) + allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces #endif ! allocate arrays for transpositions and halo exchanges @@ -289,18 +273,13 @@ program main - - - - - !######################################################################################################################################## ! START STEP 3: FLOW AND PHASE FIELD INIT !######################################################################################################################################## ! 3.1 Read/initialize from data without halo grid points (avoid out-of-bound if reading usin MPI I/O) ! 3.2 Call halo exchnages along Y and Z for u,v,w and phi if (restart .eq. 0) then !fresh start Taylor Green or read from file in init folder -if (rank.eq.0) write(*,*) "Initialize velocity field (fresh start)" + if (rank.eq.0) write(*,*) "Initialize velocity field (fresh start)" if (inflow .eq. 0) then if (rank.eq.0) write(*,*) "Initialize Taylor-green" do k = 1+halo_ext, piX%shape(3)-halo_ext @@ -316,12 +295,12 @@ program main enddo endif if (inflow .eq. 1) then - if (rank.eq.0) write(*,*) "Initialize from data" - call readfield(1) - call readfield(2) - call readfield(3) - endif + if (rank.eq.0) write(*,*) "Initialize from data" + call readfield(1) + call readfield(2) + call readfield(3) endif +endif if (restart .eq. 1) then !restart, ignore inflow and read the tstart field if (rank.eq.0) write(*,*) "Initialize velocity field (from output folder), iteration:", tstart call readfield_restart(tstart,1) @@ -345,35 +324,35 @@ program main ! initialize phase-field #if phiflag == 1 -if (restart .eq. 0) then -if (rank.eq.0) write(*,*) 'Initialize phase field (fresh start)' - if (inphi .eq. 0) then - if (rank.eq.0) write(*,*) 'Spherical drop' - do k = 1+halo_ext, piX%shape(3)-halo_ext - kg = piX%lo(3) + k - 1 - do j = 1+halo_ext, piX%shape(2)-halo_ext - jg = piX%lo(2) + j - 1 - do i = 1, piX%shape(1) - pos=(x(i)-lx/2)**2d0 + (x(jg)-lx/2)**2d0 + (x(kg)-lx/2)**2d0 - phi(i,j,k) = 0.5d0*(1.d0-tanh((sqrt(pos)-radius)/2/eps)) + if (restart .eq. 0) then + if (rank.eq.0) write(*,*) 'Initialize phase field (fresh start)' + if (inphi .eq. 0) then + if (rank.eq.0) write(*,*) 'Spherical drop' + do k = 1+halo_ext, piX%shape(3)-halo_ext + kg = piX%lo(3) + k - 1 + do j = 1+halo_ext, piX%shape(2)-halo_ext + jg = piX%lo(2) + j - 1 + do i = 1, piX%shape(1) + pos=(x(i)-lx/2)**2d0 + (x(jg)-lx/2)**2d0 + (x(kg)-lx/2)**2d0 + phi(i,j,k) = 0.5d0*(1.d0-tanh((sqrt(pos)-radius)/2/eps)) + enddo enddo - enddo - enddo + enddo + endif + if (inphi .eq. 1) then + if (rank.eq.0) write(*,*) "Initialize phase-field from data" + call readfield(5) + endif endif - if (inphi .eq. 1) then - if (rank.eq.0) write(*,*) "Initialize phase-field from data" - call readfield(5) + if (restart .eq. 1) then + write(*,*) "Initialize phase-field (restart, from output folder), iteration:", tstart + call readfield_restart(tstart,5) endif -endif -if (restart .eq. 1) then - write(*,*) "Initialize phase-field (restart, from output folder), iteration:", tstart - call readfield_restart(tstart,5) -endif -! update halo -!$acc host_data use_device(phi) -CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) -CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) -!$acc end host_data + ! update halo + !$acc host_data use_device(phi) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data #endif !Save initial fields (only if a fresh start) @@ -384,22 +363,20 @@ program main call writefield(tstart,3) call writefield(tstart,4) #if phiflag == 1 - call writefield(tstart,5) + call writefield(tstart,5) #endif endif + !######################################################################################################################################## ! END STEP 3: FLOW AND PHASE FIELD INIT !######################################################################################################################################## - - - - -! ######################################################################################################################################## +!######################################################################################################################################## ! START TEMPORAL LOOP: STEP 4 to 9 REPEATED AT EVERY TIME STEP -! ######################################################################################################################################## +!######################################################################################################################################## + ! First step use Euler alpha=1.0d0 beta=0.0d0 @@ -410,99 +387,103 @@ program main !$acc data create(rhsu_o, rhsv_o, rhsw_o) !$acc data copyin(mysin, mycos) call cpu_time(t_start) + ! Start temporal loop do t=tstart,tfin - ! Create custom label for each marker + ! Create custom label for each marker write(itcount,'(i4)') t - ! Range with custom color - call nvtxStartRange("Iteration "//itcount,t) + ! Range with custom color (uncomment for profiling) + ! call nvtxStartRange("Iteration "//itcount,t) if (rank.eq.0) write(*,*) "Time step",t,"of",tfin call cpu_time(times) - call nvtxStartRange("Phase-field") + ! (uncomment for profiling) + ! call nvtxStartRange("Phase-field") + !######################################################################################################################################## ! START STEP 4: PHASE-FIELD SOLVER (EXPLICIT) !######################################################################################################################################## + #if phiflag == 1 - !$acc kernels - do k=1, piX%shape(3) - do j=1, piX%shape(2) - do i=1,nx - ! compute distance function psi (used to compute normals) - val = min(phi(i,j,k),1.0d0) ! avoid machine precision overshoots in phi that leads to problem with log - psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum)) - ! compute here the tanh of distance function psi (used in the sharpening term) to avoid multiple computations of tanh - tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi) + !$acc kernels + do k=1, piX%shape(3) + do j=1, piX%shape(2) + do i=1,nx + ! compute distance function psi (used to compute normals) + val = min(phi(i,j,k),1.0d0) ! avoid machine precision overshoots in phi that leads to problem with log + psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum)) + ! compute here the tanh of distance function psi (used in the sharpening term) to avoid multiple computations of tanh + tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi) + enddo enddo enddo - enddo - !$acc end kernels + !$acc end kernels - gamma=1.d0*gumax - !$acc parallel loop tile(16,4,2) - do k=1+halo_ext, piX%shape(3)-halo_ext - do j=1+halo_ext, piX%shape(2)-halo_ext - do i=1,nx - ! 4.1 RHS computation - ip=i+1 - jp=j+1 - kp=k+1 - im=i-1 - jm=j-1 - km=k-1 - if (ip .gt. nx) ip=1 - if (im .lt. 1) im=nx - ! convective (first three lines) and diffusive (last three lines) - rhsphi(i,j,k) = & - - (u(ip,j,k)*0.5d0*(phi(ip,j,k)+phi(i,j,k)) - u(i,j,k)*0.5d0*(phi(i,j,k)+phi(im,j,k)))*dxi & - - (v(i,jp,k)*0.5d0*(phi(i,jp,k)+phi(i,j,k)) - v(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,jm,k)))*dxi & - - (w(i,j,kp)*0.5d0*(phi(i,j,kp)+phi(i,j,k)) - w(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,j,km)))*dxi & - + gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + & - eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddxi + & - eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi) - ! 4.1.3. Compute normals for sharpening term (gradient) - normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k)) - normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k)) - normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km)) + gamma=1.d0*gumax + !$acc parallel loop tile(16,4,2) + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + ! 4.1 RHS computation + ip=i+1 + jp=j+1 + kp=k+1 + im=i-1 + jm=j-1 + km=k-1 + if (ip .gt. nx) ip=1 + if (im .lt. 1) im=nx + ! convective (first three lines) and diffusive (last three lines) + rhsphi(i,j,k) = & + - (u(ip,j,k)*0.5d0*(phi(ip,j,k)+phi(i,j,k)) - u(i,j,k)*0.5d0*(phi(i,j,k)+phi(im,j,k)))*dxi & + - (v(i,jp,k)*0.5d0*(phi(i,jp,k)+phi(i,j,k)) - v(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,jm,k)))*dxi & + - (w(i,j,kp)*0.5d0*(phi(i,j,kp)+phi(i,j,k)) - w(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,j,km)))*dxi & + + gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + & + eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddxi + & + eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi) + ! 4.1.3. Compute normals for sharpening term (gradient) + normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k)) + normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k)) + normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km)) + enddo enddo enddo - enddo - - ! Update normx,normy and normz halos, required to then compute normal derivative - !$acc host_data use_device(normx) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) - !$acc end host_data - !$acc host_data use_device(normy) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) - !$acc end host_data - !$acc host_data use_device(normz) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) - !$acc end host_data - ! 4.1.3. Compute Sharpening term (gradient) - ! Substep 2: Compute normals (1.e-16 is a numerical tollerance to avodi 0/0) - !$acc kernels - do k=1, piX%shape(3) - do j=1, piX%shape(2) - do i=1,nx - normod = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + 1.0E-16) - ! normod = 1.d0/(sqrt(normx(i,j,k)**2d0 + normy(i,j,k)**2d0 + normz(i,j,k)**2d0) + 1.0E-16) - normx(i,j,k) = normx(i,j,k)*normod - normy(i,j,k) = normy(i,j,k)*normod - normz(i,j,k) = normz(i,j,k)*normod + ! Update normx,normy and normz halos, required to then compute normal derivative + !$acc host_data use_device(normx) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !$acc host_data use_device(normy) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !$acc host_data use_device(normz) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + + ! 4.1.3. Compute Sharpening term (gradient) + ! Substep 2: Compute normals (1.e-16 is a numerical tollerance to avodi 0/0) + !$acc kernels + do k=1, piX%shape(3) + do j=1, piX%shape(2) + do i=1,nx + normod = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + 1.0E-16) + ! normod = 1.d0/(sqrt(normx(i,j,k)**2d0 + normy(i,j,k)**2d0 + normz(i,j,k)**2d0) + 1.0E-16) + normx(i,j,k) = normx(i,j,k)*normod + normy(i,j,k) = normy(i,j,k)*normod + normz(i,j,k) = normz(i,j,k)*normod + enddo enddo enddo - enddo - !$acc end kernels + !$acc end kernels - ! Compute sharpening term - !$acc kernels - do k=1+halo_ext, piX%shape(3)-halo_ext - do j=1+halo_ext, piX%shape(2)-halo_ext + ! Compute sharpening term + !$acc kernels + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext do i=1,nx ip=i+1 jp=j+1 @@ -512,61 +493,46 @@ program main km=k-1 if (ip .gt. nx) ip=1 if (im .lt. 1) im=nx - ! OLD CDI - !rhsphi(i,j,k)=rhsphi(i,j,k)+gamma*(((phi(ip,j,k)**2d0-phi(ip,j,k))*normx(ip,j,k)-(phi(im,j,k)**2d0-phi(im,j,k))*normx(im,j,k))*0.5d0*dxi + & - ! ((phi(i,jp,k)**2d0-phi(i,jp,k))*normy(i,jp,k)-(phi(i,jm,k)**2d0-phi(i,jm,k))*normy(i,jm,k))*0.5d0*dxi + & - ! ((phi(i,j,kp)**2d0-phi(i,j,kp))*normz(i,j,kp)-(phi(i,j,km)**2d0-phi(i,j,km))*normz(i,j,km))*0.5d0*dxi) - ! NEW ACDI - ! rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-(tanh(0.5d0*psidi(ip,j,k)*epsi))**2)*normx(ip,j,k)- 0.25d0*(1.d0-(tanh(0.5d0*psidi(im,j,k)*epsi))**2)*normx(im,j,k))*0.5*dxi +& - ! (0.25d0*(1.d0-(tanh(0.5d0*psidi(i,jp,k)*epsi))**2)*normy(i,jp,k)- 0.25d0*(1.d0-(tanh(0.5d0*psidi(i,jm,k)*epsi))**2)*normy(i,jm,k))*0.5*dxi +& - ! - ! ACDI improved version with pre-computed tanh - rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-tanh_psi(ip,j,k)*tanh_psi(ip,j,k))*normx(ip,j,k) - & - 0.25d0*(1.d0-tanh_psi(im,j,k)*tanh_psi(im,j,k))*normx(im,j,k))*0.5*dxi + & - (0.25d0*(1.d0-tanh_psi(i,jp,k)*tanh_psi(i,jp,k))*normy(i,jp,k) - & - 0.25d0*(1.d0-tanh_psi(i,jm,k)*tanh_psi(i,jm,k))*normy(i,jm,k))*0.5*dxi + & - (0.25d0*(1.d0-tanh_psi(i,j,kp)*tanh_psi(i,j,kp))*normz(i,j,kp) - & - 0.25d0*(1.d0-tanh_psi(i,j,km)*tanh_psi(i,j,km))*normz(i,j,km))*0.5*dxi) - - ! rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-(tanh(0.5d0*psidi(ip,j,k)*epsi))**2)*normx(ip,j,k)- & - ! 0.25d0*(1.d0-(tanh(0.5d0*psidi(im,j,k)*epsi))**2)*normx(im,j,k))*0.5*dxi +& - ! (0.25d0*(1.d0-(tanh(0.5d0*psidi(i,jp,k)*epsi))**2)*normy(i,jp,k) - & - ! 0.25d0*(1.d0-(tanh(0.5d0*psidi(i,jm,k)*epsi))**2)*normy(i,jm,k))*0.5*dxi +& - ! + ! ACDI with pre-computed tanh + rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-tanh_psi(ip,j,k)*tanh_psi(ip,j,k))*normx(ip,j,k) - & + 0.25d0*(1.d0-tanh_psi(im,j,k)*tanh_psi(im,j,k))*normx(im,j,k))*0.5*dxi + & + (0.25d0*(1.d0-tanh_psi(i,jp,k)*tanh_psi(i,jp,k))*normy(i,jp,k) - & + 0.25d0*(1.d0-tanh_psi(i,jm,k)*tanh_psi(i,jm,k))*normy(i,jm,k))*0.5*dxi + & + (0.25d0*(1.d0-tanh_psi(i,j,kp)*tanh_psi(i,j,kp))*normz(i,j,kp) - & + 0.25d0*(1.d0-tanh_psi(i,j,km)*tanh_psi(i,j,km))*normz(i,j,km))*0.5*dxi) enddo - enddo - enddo - !$acc end kernels + enddo + enddo + !$acc end kernels - ! 4.2 Get phi at n+1 - !$acc kernels - do k=1+halo_ext, piX%shape(3)-halo_ext - do j=1+halo_ext, piX%shape(2)-halo_ext + ! 4.2 Get phi at n+1 using AB2 + !$acc kernels + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext do i=1,nx - phi(i,j,k) = phi(i,j,k) + dt*rhsphi(i,j,k) + phi(i,j,k) = phi(i,j,k) + dt*(alpha*rhsphi(i,j,k)-beta*rhsphi_o(i,j,k)) + rhsphi_o(i,j,k)=rhsphi(i,j,k) enddo - enddo - enddo - !$acc end kernels + enddo + enddo + !$acc end kernels - ! 4.3 Call halo exchnages along Y and Z for phi - !$acc host_data use_device(phi) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) - !$acc end host_data + ! 4.3 Call halo exchnages along Y and Z for phi + !$acc host_data use_device(phi) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data #endif - !######################################################################################################################################## - ! END STEP 4: PHASE-FIELD SOLVER (EXPLICIT) - !######################################################################################################################################## - call nvtxEndRange - - + ! (uncomment for profiling) + ! call nvtxEndRange + !######################################################################################################################################## + ! END STEP 4: PHASE-FIELD SOLVER + !######################################################################################################################################## - call nvtxStartRange("Projection") !######################################################################################################################################## ! START STEP 5: USTAR COMPUTATION (PROJECTION STEP) !######################################################################################################################################## @@ -574,6 +540,9 @@ program main ! 5.2 obtain ustar and store old rhs in rhs_o ! 5.3 Call halo exchnages along Y and Z for u,v,w + ! (uncomment for profiling) + ! call nvtxStartRange("Projection") + ! Projection step, convective terms ! 5.1a Convective terms NS ! Loop on inner nodes @@ -627,102 +596,103 @@ program main rhsu(i,j,k)=rhsu(i,j,k)+(h11+h12+h13)*rhoi rhsv(i,j,k)=rhsv(i,j,k)+(h21+h22+h23)*rhoi rhsw(i,j,k)=rhsw(i,j,k)+(h31+h32+h33)*rhoi + ! NS forcing kg = piX%lo(3) + k - 1 - jg = piX%lo(2) + j - 1 + jg = piX%lo(2) + j - 1 + ! ABC forcing - ! rhsu(i,j,k)= rhsu(i,j,k) + f3*sin(k0*x(kg))+f2*cos(k0*x(jg)) rhsu(i,j,k)= rhsu(i,j,k) + f3*mysin(kg)+f2*mycos(jg) - ! rhsv(i,j,k)= rhsv(i,j,k) + f1*sin(k0*x(i))+f3*cos(k0*x(kg)) rhsv(i,j,k)= rhsv(i,j,k) + f1*mysin(i)+f3*mycos(kg) - ! rhsw(i,j,k)= rhsw(i,j,k) + f2*sin(k0*x(jg))+f1*cos(k0*x(i)) rhsw(i,j,k)= rhsw(i,j,k) + f2*mysin(jg)+f1*mycos(i) + ! TG Forcing !rhsu(i,j,k)= rhsu(i,j,k) + f1*sin(k0*x(i))*cos(k0*x(j))*cos(k0*x(k)) !rhsv(i,j,k)= rhsv(i,j,k) - f1*cos(k0*x(i))*sin(k0*x(j))*sin(k0*x(k)) + enddo enddo enddo ! Surface tension forces #if phiflag == 1 - !$acc kernels - !Obtain surface tension forces evaluated at the center of the cell (same as where phi is located) - do k=1+halo_ext, piX%shape(3)-halo_ext - do j=1+halo_ext, piX%shape(2)-halo_ext - do i=1,nx - ip=i+1 - jp=j+1 - kp=k+1 - im=i-1 - jm=j-1 - km=k-1 - if (ip .gt. nx) ip=1 - if (im .lt. 1) im=nx - chempot=phi(i,j,k)*(1.d0-phi(i,j,k))*(1.d0-2.d0*phi(i,j,k))*epsi-eps*(phi(ip,j,k)+phi(im,j,k)+phi(i,jp,k)+phi(i,jm,k)+phi(i,j,kp)+phi(i,j,km)- 6.d0*phi(i,j,k))*ddxi - ! chempot*gradphi - fxst(i,j,k)=6.d0*sigma*chempot*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi - fyst(i,j,k)=6.d0*sigma*chempot*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dxi - fzst(i,j,k)=6.d0*sigma*chempot*0.5d0*(phi(i,j,kp)-phi(i,j,km))*dxi + !$acc kernels + !Obtain surface tension forces evaluated at the center of the cell (same as where phi is located) + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + ip=i+1 + jp=j+1 + kp=k+1 + im=i-1 + jm=j-1 + km=k-1 + if (ip .gt. nx) ip=1 + if (im .lt. 1) im=nx + chempot=phi(i,j,k)*(1.d0-phi(i,j,k))*(1.d0-2.d0*phi(i,j,k))*epsi-eps*(phi(ip,j,k)+phi(im,j,k)+phi(i,jp,k)+phi(i,jm,k)+phi(i,j,kp)+phi(i,j,km)- 6.d0*phi(i,j,k))*ddxi + ! chempot*gradphi + fxst(i,j,k)=6.d0*sigma*chempot*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi + fyst(i,j,k)=6.d0*sigma*chempot*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dxi + fzst(i,j,k)=6.d0*sigma*chempot*0.5d0*(phi(i,j,kp)-phi(i,j,km))*dxi + enddo enddo enddo - enddo - !$acc end kernels + !$acc end kernels - ! Update halo of fxst, fyst and fzst (required then to interpolate at velocity points) - !$acc host_data use_device(fxst) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) - !$acc end host_data - !$acc host_data use_device(fyst) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) - !$acc end host_data - !$acc host_data use_device(fzst) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) - !$acc end host_data - - ! Interpolate force at velocity points - !$acc kernels - do k=1+halo_ext, piX%shape(3)-halo_ext - do j=1+halo_ext, piX%shape(2)-halo_ext - do i=1,nx - im=i-1 - jm=j-1 - km=k-1 - if (im .lt. 1) im=nx - rhsu(i,j,k)=rhsu(i,j,k) + 0.5d0*(fxst(im,j,k)+fxst(i,j,k))*rhoi - rhsv(i,j,k)=rhsv(i,j,k) + 0.5d0*(fyst(i,jm,k)+fyst(i,j,k))*rhoi - rhsw(i,j,k)=rhsw(i,j,k) + 0.5d0*(fzst(i,j,km)+fzst(i,j,k))*rhoi - u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k)) - v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k)) - w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k)) - rhsu_o(i,j,k)=rhsu(i,j,k) - rhsv_o(i,j,k)=rhsv(i,j,k) - rhsw_o(i,j,k)=rhsw(i,j,k) - enddo + ! Update halo of fxst, fyst and fzst (required then to interpolate at velocity points) + !$acc host_data use_device(fxst) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !$acc host_data use_device(fyst) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !$acc host_data use_device(fzst) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + + ! Interpolate force at velocity points + !$acc kernels + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + im=i-1 + jm=j-1 + km=k-1 + if (im .lt. 1) im=nx + rhsu(i,j,k)=rhsu(i,j,k) + 0.5d0*(fxst(im,j,k)+fxst(i,j,k))*rhoi + rhsv(i,j,k)=rhsv(i,j,k) + 0.5d0*(fyst(i,jm,k)+fyst(i,j,k))*rhoi + rhsw(i,j,k)=rhsw(i,j,k) + 0.5d0*(fzst(i,j,km)+fzst(i,j,k))*rhoi + u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k)) + v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k)) + w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k)) + rhsu_o(i,j,k)=rhsu(i,j,k) + rhsv_o(i,j,k)=rhsv(i,j,k) + rhsw_o(i,j,k)=rhsw(i,j,k) + enddo + enddo enddo - enddo - !$acc end kernels + !$acc end kernels #else - ! 5.2 find u, v and w star (explicit Eulero), only in the inner nodes - !$acc kernels - do k=1+halo_ext, piX%shape(3)-halo_ext - do j=1+halo_ext, piX%shape(2)-halo_ext - do i=1,nx - u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k)) - v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k)) - w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k)) - rhsu_o(i,j,k)=rhsu(i,j,k) - rhsv_o(i,j,k)=rhsv(i,j,k) - rhsw_o(i,j,k)=rhsw(i,j,k) - enddo + ! 5.2 find u, v and w star (explicit Eulero), only in the inner nodes + !$acc kernels + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k)) + v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k)) + w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k)) + rhsu_o(i,j,k)=rhsu(i,j,k) + rhsv_o(i,j,k)=rhsv(i,j,k) + rhsw_o(i,j,k)=rhsw(i,j,k) + enddo + enddo enddo - enddo - !$acc end kernels + !$acc end kernels #endif @@ -731,11 +701,6 @@ program main ! After first step move to AB2 alpha=1.5d0 beta= 0.5d0 - ! !$acc kernels - ! rhsu_o=rhsu - ! rhsv_o=rhsv - ! rhsw_o=rhsw - ! !$acc end kernels ! 5.3 update halos (y and z directions), required to then compute the RHS of Poisson equation because of staggered grid !$acc host_data use_device(u) @@ -750,43 +715,47 @@ program main CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) !$acc end host_data + + ! (uncomment for profiling) + ! call nvtxEndRange + !######################################################################################################################################## ! END STEP 5: USTAR COMPUTATION !######################################################################################################################################## - call nvtxEndRange - - - - - - call nvtxStartRange("Poisson") + !######################################################################################################################################## ! START STEP 6: POISSON SOLVER FOR PRESSURE !######################################################################################################################################## ! initialize rhs and analytical solution ! 6.1 Compute rhs of Poisson equation div*ustar: divergence at the cell center ! I've done the halo updates so to compute the divergence at the pencil border i have the *star from the halo - call nvtxStartRange("compute RHS") - !$acc kernels - do k=1+halo_ext, piX%shape(3)-halo_ext - do j=1+halo_ext, piX%shape(2)-halo_ext - do i=1,nx - ip=i+1 - jp=j+1 - kp=k+1 - if (ip > nx) ip=1 - rhsp(i,j,k) = (rho*dxi/dt)*(u(ip,j,k)-u(i,j,k)) - rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(v(i,jp,k)-v(i,j,k)) - rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(w(i,j,kp)-w(i,j,k)) - enddo - enddo - enddo - !$acc end kernels + ! (uncomment for profiling) + ! call nvtxStartRange("Poisson") - call nvtxEndRange + ! (uncomment for profiling) + ! call nvtxStartRange("compute RHS") + + !$acc kernels + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + ip=i+1 + jp=j+1 + kp=k+1 + if (ip > nx) ip=1 + rhsp(i,j,k) = (rho*dxi/dt)*(u(ip,j,k)-u(i,j,k)) + rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(v(i,jp,k)-v(i,j,k)) + rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(w(i,j,kp)-w(i,j,k)) + enddo + enddo + enddo + !$acc end kernels + + ! (uncomment for profiling) + ! call nvtxEndRange ! ! !beginDEBUG ! !$acc kernels @@ -807,127 +776,106 @@ program main ! enddo ! !$acc end kernels - ! !endDEBUG + ! (uncomment for profiling) + ! call nvtxStartRange("FFT forward w/ transpositions") - call nvtxStartRange("FFT forward w/ transpositions") - - !$acc host_data use_device(rhsp) - status = cufftExecD2Z(planXf, rhsp, psi_d) - if (status /= CUFFT_SUCCESS) write(*,*) 'X forward error: ', status - !$acc end host_data - - !!Nyquist to be set to zero? - ! block - ! complex(8), device, pointer :: psi_dev(:,:,:) - ! call c_f_pointer( c_devloc(psi_d), psi_dev, & - ! [piX_d2z%shape(1), piX_d2z%shape(2), piX_d2z%shape(3)] ) - ! !$acc kernels - ! do k = 1, piX_d2z%shape(3) - ! do j = 1, piX_d2z%shape(2) - ! psi_dev(nx/2+1, j, k) = (0.0_8, 0.0_8) - ! end do - ! end do - ! !$acc end kernels - ! end block - - - ! psi(kx,y,z) -> psi(y,z,kx) - CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,piX_d2z%halo_extents, [0,0,0])) - ! psi(y,z,kx) -> psi(ky,z,kx) - status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_FORWARD) - if (status /= CUFFT_SUCCESS) write(*,*) 'Y forward error: ', status - ! psi(ky,z,kx) -> psi(z,kx,ky) - CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX)) - ! psi(z,kx,ky) -> psi(kz,kx,ky) - status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_FORWARD) - if (status /= CUFFT_SUCCESS) write(*,*) 'Z forward error: ', status - ! END of FFT3D forward - - call nvtxEndRange - - !block - np(piZ_d2z%order(1)) = piZ_d2z%shape(1) - np(piZ_d2z%order(2)) = piZ_d2z%shape(2) - np(piZ_d2z%order(3)) = piZ_d2z%shape(3) - call c_f_pointer(c_devloc(psi_d), phi3d, piZ_d2z%shape) - - - ! divide by -K**2, and normalize - - offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1 - offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1 - offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1 - - xoff = offsets(1) - yoff = offsets(2) - npx = np(1) - npy = np(2) - - call nvtxStartRange("Solution") + !$acc host_data use_device(rhsp) + status = cufftExecD2Z(planXf, rhsp, psi_d) + if (status /= CUFFT_SUCCESS) write(*,*) 'X forward error: ', status + !$acc end host_data - !$acc kernels - do jl = 1, npy - jg = yoff + jl - do il = 1, npx - ig = xoff + il - do k = 1, nz - k2 = kx_d(ig)**2 + kx_d(jg)**2 + kx_d(k)**2 - phi3d(k,il,jl) = -phi3d(k,il,jl)/k2/(int(nx,8)*int(ny,8)*int(nz,8)) - enddo + ! psi(kx,y,z) -> psi(y,z,kx) + CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,piX_d2z%halo_extents, [0,0,0])) + ! psi(y,z,kx) -> psi(ky,z,kx) + status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_FORWARD) + if (status /= CUFFT_SUCCESS) write(*,*) 'Y forward error: ', status + ! psi(ky,z,kx) -> psi(z,kx,ky) + CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX)) + ! psi(z,kx,ky) -> psi(kz,kx,ky) + status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_FORWARD) + if (status /= CUFFT_SUCCESS) write(*,*) 'Z forward error: ', status + ! END of FFT3D forward + + ! (uncomment for profiling) + ! call nvtxEndRange + + np(piZ_d2z%order(1)) = piZ_d2z%shape(1) + np(piZ_d2z%order(2)) = piZ_d2z%shape(2) + np(piZ_d2z%order(3)) = piZ_d2z%shape(3) + call c_f_pointer(c_devloc(psi_d), phi3d, piZ_d2z%shape) + + ! divide by -K**2, and normalize + + offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1 + offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1 + offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1 + + xoff = offsets(1) + yoff = offsets(2) + npx = np(1) + npy = np(2) + + ! (uncomment for profiling) + ! call nvtxStartRange("Solution") + + !$acc kernels + do jl = 1, npy + jg = yoff + jl + do il = 1, npx + ig = xoff + il + do k = 1, nz + k2 = kx_d(ig)**2 + kx_d(jg)**2 + kx_d(k)**2 + phi3d(k,il,jl) = -phi3d(k,il,jl)/k2/(int(nx,8)*int(ny,8)*int(nz,8)) enddo enddo - ! specify mean (corrects division by zero wavenumber above) - if (xoff == 0 .and. yoff == 0) phi3d(1,1,1) = 0.0 - !$acc end kernels + enddo + ! specify mean (corrects division by zero wavenumber above) + if (xoff == 0 .and. yoff == 0) phi3d(1,1,1) = 0.0 + !$acc end kernels - call nvtxEndRange + ! (uncomment for profiling) + ! call nvtxEndRange - !end block - - call nvtxStartRange("FFT backwards w/ transpositions") - - ! psi(kz,kx,ky) -> psi(z,kx,ky) - status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_INVERSE) - if (status /= CUFFT_SUCCESS) write(*,*) 'Z inverse error: ', status - ! psi(z,kx,ky) -> psi(ky,z,kx) - CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX)) - ! psi(ky,z,kx) -> psi(y,z,kx) - status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_INVERSE) - if (status /= CUFFT_SUCCESS) write(*,*) 'Y inverse error: ', status - ! psi(y,z,kx) -> psi(kx,y,z) - CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,[0,0,0], piX_d2z%halo_extents)) - !$acc host_data use_device(p) - ! psi(kx,y,z) -> psi(x,y,z) - status = cufftExecZ2D(planXb, psi_d, p) - if (status /= CUFFT_SUCCESS) write(*,*) 'X inverse error: ', status - !$acc end host_data + ! (uncomment for profiling) + ! call nvtxStartRange("FFT backwards w/ transpositions") + + ! psi(kz,kx,ky) -> psi(z,kx,ky) + status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_INVERSE) + if (status /= CUFFT_SUCCESS) write(*,*) 'Z inverse error: ', status + ! psi(z,kx,ky) -> psi(ky,z,kx) + CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX)) + ! psi(ky,z,kx) -> psi(y,z,kx) + status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_INVERSE) + if (status /= CUFFT_SUCCESS) write(*,*) 'Y inverse error: ', status + ! psi(y,z,kx) -> psi(kx,y,z) + CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,[0,0,0], piX_d2z%halo_extents)) + !$acc host_data use_device(p) + ! psi(kx,y,z) -> psi(x,y,z) + status = cufftExecZ2D(planXb, psi_d, p) + if (status /= CUFFT_SUCCESS) write(*,*) 'X inverse error: ', status + !$acc end host_data - call nvtxEndRange + ! (uncomment for profiling) + ! call nvtxEndRange !$acc host_data use_device(p) - ! update halo nodes with pressure (needed for the pressure correction step), using device variable no need to use host-data - ! Update X-pencil halos in Y and Z direction - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) - CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + ! update halo nodes with pressure (needed for the pressure correction step), using device variable no need to use host-data + ! Update X-pencil halos in Y and Z direction + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) !$acc end host_data - - + ! (uncomment for profiling) + ! call nvtxEndRange !######################################################################################################################################## ! END STEP 7: POISSON SOLVER FOR PRESSURE !######################################################################################################################################## - call nvtxEndRange - - - - - call nvtxStartRange("Correction") !######################################################################################################################################## ! START STEP 8: VELOCITY CORRECTION ! ######################################################################################################################################## @@ -936,6 +884,10 @@ program main ! 8.3 Call halo exchnages along Y and Z for u,v,w ! Correct velocity, pressure has also the halo ! Correction + removal of mean velocity altogether for performance + + ! (uncomment for profiling) + ! call nvtxStartRange("Correction") + !$acc kernels umean=0.d0 vmean=0.d0 @@ -943,16 +895,16 @@ program main do k=1+halo_ext, piX%shape(3)-halo_ext do j=1+halo_ext, piX%shape(2)-halo_ext do i = 1, piX%shape(1) ! equal to nx (no halo on x) - im=i-1 - jm=j-1 - km=k-1 - if (im < 1) im=nx - u(i,j,k)=u(i,j,k) - dt/rho*(p(i,j,k)-p(im,j,k))*dxi - v(i,j,k)=v(i,j,k) - dt/rho*(p(i,j,k)-p(i,jm,k))*dxi - w(i,j,k)=w(i,j,k) - dt/rho*(p(i,j,k)-p(i,j,km))*dxi - umean=umean + u(i,j,k) - vmean=vmean + v(i,j,k) - wmean=wmean + w(i,j,k) + im=i-1 + jm=j-1 + km=k-1 + if (im < 1) im=nx + u(i,j,k)=u(i,j,k) - dt/rho*(p(i,j,k)-p(im,j,k))*dxi + v(i,j,k)=v(i,j,k) - dt/rho*(p(i,j,k)-p(i,jm,k))*dxi + w(i,j,k)=w(i,j,k) - dt/rho*(p(i,j,k)-p(i,j,km))*dxi + umean=umean + u(i,j,k) + vmean=vmean + v(i,j,k) + wmean=wmean + w(i,j,k) enddo enddo enddo @@ -960,7 +912,6 @@ program main ! Remove mean velocity (get local mean of the rank) - ! Divide by total number of points in the pencil umean=umean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext) vmean=vmean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext) @@ -1004,18 +955,15 @@ program main if (cou .gt. 7) stop endif - ! beginDZ - ! CHECK - To be removed in actual computation - - ! endDZ + call cpu_time(timef) + if (rank.eq.0) print '(" Time elapsed = ",f8.1," ms")',1000*(timef-times) + ! (uncomment for profiling) + ! call nvtxEndRange - call cpu_time(timef) - if (rank.eq.0) print '(" Time elapsed = ",f6.1," ms")',1000*(timef-times) !######################################################################################################################################## ! END STEP 8: VELOCITY CORRECTION !######################################################################################################################################## - call nvtxEndRange !######################################################################################################################################## @@ -1023,22 +971,23 @@ program main ! ######################################################################################################################################## if (mod(t,dump) .eq. 0) then if (rank .eq. 0) write(*,*) "Saving output files" - ! write velocity and pressure fiels (1-4) - call writefield(t,1) - call writefield(t,2) - call writefield(t,3) - call writefield(t,4) - #if phiflag == 1 + ! write velocity and pressure fiels (1-4) + call writefield(t,1) + call writefield(t,2) + call writefield(t,3) + call writefield(t,4) + #if phiflag == 1 ! write phase-field (5) call writefield(t,5) - #endif + #endif endif !######################################################################################################################################## ! END STEP 9: OUTPUT FIELDS N !######################################################################################################################################## -call nvtxEndRange -!call nvtxEndRange +! (uncomment for profiling) +! call nvtxEndRange + enddo call cpu_time(t_end) elapsed = t_end-t_start @@ -1052,7 +1001,7 @@ program main deallocate(tanh_psi, mysin, mycos) deallocate(rhsu,rhsv,rhsw) deallocate(rhsu_o,rhsv_o,rhsw_o) -deallocate(phi,rhsphi,normx,normy,normz) +deallocate(phi,rhsphi,rhsphi_o,normx,normy,normz) call mpi_finalize(ierr) diff --git a/multi/module.f90 b/multi/module.f90 index 7222f29..32d83c3 100755 --- a/multi/module.f90 +++ b/multi/module.f90 @@ -62,75 +62,5 @@ module phase end module phase -! added NVTX for profiing from maxcuda/NVTX_example -module nvtx -use iso_c_binding -implicit none -integer,private :: col(7) = [ int(Z'0000ff00'), int(Z'000000ff'), int(Z'00ffff00'), int(Z'00ff00ff'), int(Z'0000ffff'), int(Z'00ff0000'), int(Z'00ffffff')] -character,private,target :: tempName(256) -type, bind(C):: nvtxEventAttributes - integer(C_INT16_T):: version=1 - integer(C_INT16_T):: size=48 ! - integer(C_INT):: category=0 - integer(C_INT):: colorType=1 ! NVTX_COLOR_ARGB = 1 - integer(C_INT):: color - integer(C_INT):: payloadType=0 ! NVTX_PAYLOAD_UNKNOWN = 0 - integer(C_INT):: reserved0 - integer(C_INT64_T):: payload ! union uint,int,double - integer(C_INT):: messageType=1 ! NVTX_MESSAGE_TYPE_ASCII = 1 - type(C_PTR):: message ! ascii char -end type - -interface nvtxRangePush - ! push range with custom label and standard color - subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA') - use iso_c_binding - character(kind=C_CHAR) :: name(256) - end subroutine - - ! push range with custom label and custom color - subroutine nvtxRangePushEx(event) bind(C, name='nvtxRangePushEx') - use iso_c_binding - import:: nvtxEventAttributes - type(nvtxEventAttributes):: event - end subroutine -end interface - -interface nvtxRangePop - subroutine nvtxRangePop() bind(C, name='nvtxRangePop') - end subroutine -end interface - -contains - -subroutine nvtxStartRange(name,id) - character(kind=c_char,len=*) :: name - integer, optional:: id - type(nvtxEventAttributes):: event - character(kind=c_char,len=256) :: trimmed_name - integer:: i - - trimmed_name=trim(name)//c_null_char - - ! move scalar trimmed_name into character array tempName - do i=1,LEN(trim(name)) + 1 - tempName(i) = trimmed_name(i:i) - enddo - - - if ( .not. present(id)) then - call nvtxRangePush(tempName) - else - event%color=col(mod(id,7)+1) - event%message=c_loc(tempName) - call nvtxRangePushEx(event) - end if -end subroutine - -subroutine nvtxEndRange - call nvtxRangePop -end subroutine - -end module nvtx diff --git a/multi/readinput.f90 b/multi/readinput.f90 index b7262e9..0a25a33 100644 --- a/multi/readinput.f90 +++ b/multi/readinput.f90 @@ -53,6 +53,7 @@ subroutine readinput write(*,*) "██ ██ ██ ██ ██ ██ ██ ██ ██ ██" write(*,*) "██ ██ ██ ██ ██ ██ ██████ ██████" write(*,*) "----------------------------------------------" + write(*,*) 'Grid:', nx, 'x', nx, 'x', nx write(*,*) "Restart ", restart write(*,*) "Tstart ", tstart write(*,*) "Tfin ", tfin diff --git a/multi/testpush.sh b/multi/testpush.sh deleted file mode 100644 index 2e0e21a..0000000 --- a/multi/testpush.sh +++ /dev/null @@ -1,2 +0,0 @@ -make clean -make diff --git a/val/.DS_Store b/val/.DS_Store new file mode 100644 index 0000000..fb0b244 Binary files /dev/null and b/val/.DS_Store differ diff --git a/vtkwrite.m b/vtkwrite.m deleted file mode 100644 index 91be51a..0000000 --- a/vtkwrite.m +++ /dev/null @@ -1,279 +0,0 @@ -function vtkwrite( filename,dataType,varargin ) -% VTKWRITE Writes 3D Matlab array into VTK file format. -% vtkwrite(filename,'structured_grid',x,y,z,'vectors',title,u,v,w) writes -% a structured 3D vector data into VTK file, with name specified by the string -% filename. (u,v,w) are the vector components at the points (x,y,z). x,y,z -% should be 3-D matrices like those generated by meshgrid, where -% point(ijk) is specified by x(i,j,k), y(i,j,k) and z(i,j,k). -% The matrices x,y,z,u,v,w must all be the same size and contain -% corrresponding position and vector component. The string title specifies -% the name of the vector field to be saved. -% -% vtkwrite(filename,'structured_grid',x,y,z,'scalars',title,r) writes a 3D -% scalar data into VTK file whose name is specified by the string -% filename. r is the scalar value at the points (x,y,z). The matrices -% x,y,z,r must all be the same size and contain the corresponding position -% and scalar values. -% -% vtkwrite(filename,'structured_grid',x,y,z,'vectors',title,u,v,w,'scalars', -% title2,r) writes a 3D structured grid that contains both vector and scalar values. -% x,y,z,u,v,w,r must all be the same size and contain the corresponding -% positon, vector and scalar values. -% -% vtkwrite(filename, 'structured_points', title, m) saves matrix m (could -% be 1D, 2D or 3D array) into vtk as structured points. -% -% vtkwrite(filename, 'structured_points', title, m, 'spacing', sx, sy, sz) -% allows user to specify spacing. (default: 1, 1, 1). This is the aspect -% ratio of a single voxel. -% -% vtkwrite(filename, 'structured_points', title, m, 'origin', ox, oy, oz) -% allows user to speicify origin of dataset. (default: 0, 0, 0). -% -% vtkwrite(filename,'unstructured_grid',x,y,z,'vectors',title,u,v,w,'scalars', -% title2,r) writes a 3D unstructured grid that contains both vector and scalar values. -% x,y,z,u,v,w,r must all be the same size and contain the corresponding -% positon, vector and scalar values. -% -% vtkwrite(filename, 'polydata', 'lines', x, y, z) exports a 3D line where -% x,y,z are coordinates of the points that make the line. x, y, z are -% vectors containing the coordinates of points of the line, where point(n) -% is specified by x(n), y(n) and z(n). -% -% vtkwrite(filename,'polydata','lines',x,y,z,'Precision',n) allows you to -% specify precision of the exported number up to n digits after decimal -% point. Default precision is 3 digits. -% -% vtkwrite(filename,'polydata','triangle',x,y,z,tri) exports a list of -% triangles where x,y,z are the coordinates of the points and tri is an -% m*3 matrix whose rows denote the points of the individual triangles. -% -% vtkwrite(filename,'polydata','tetrahedron',x,y,z,tetra) exports a list -% of tetrahedrons where x,y,z are the coordinates of the points -% and tetra is an m*4 matrix whose rows denote the points of individual -% tetrahedrons. -% -% vtkwrite('execute','polydata','lines',x,y,z) will save data with default -% filename ''matlab_export.vtk' and automatically loads data into -% ParaView. -% -% Version 2.3 -% Copyright, Chaoyuan Yeh, 2016 -% Codes are modified from William Thielicke and David Gingras's submission. - -if strcmpi(filename,'execute'), filename = 'matlab_export.vtk'; end -fid = fopen(filename, 'w'); -% VTK files contain five major parts -% 1. VTK DataFile Version -fprintf(fid, '# vtk DataFile Version 2.0\n'); -% 2. Title -fprintf(fid, 'VTK from Matlab\n'); - - -binaryflag = any(strcmpi(varargin, 'BINARY')); -if any(strcmpi(varargin, 'PRECISION')) - precision = num2str(varargin{find(strcmpi(vin, 'PRECISION'))+1}); -else - precision = '2'; -end - -switch upper(dataType) - case 'STRUCTURED_POINTS' - title = varargin{1}; - m = varargin{2}; - if any(strcmpi(varargin, 'spacing')) - sx = varargin{find(strcmpi(varargin, 'spacing'))+1}; - sy = varargin{find(strcmpi(varargin, 'spacing'))+2}; - sz = varargin{find(strcmpi(varargin, 'spacing'))+3}; - else - sx = 1; - sy = 1; - sz = 1; - end - if any(strcmpi(varargin, 'origin')) - ox = varargin{find(strcmpi(varargin, 'origin'))+1}; - oy = varargin{find(strcmpi(varargin, 'origin'))+2}; - oz = varargin{find(strcmpi(varargin, 'origin'))+3}; - else - ox = 0; - oy = 0; - oz = 0; - end - [nx, ny, nz] = size(m); - setdataformat(fid, binaryflag); - - fprintf(fid, 'DATASET STRUCTURED_POINTS\n'); - fprintf(fid, 'DIMENSIONS %d %d %d\n', nx, ny, nz); - fprintf(fid, ['SPACING ', num2str(sx), ' ', num2str(sy), ' ',... - num2str(sz), '\n']); - fprintf(fid, ['ORIGIN ', num2str(ox), ' ', num2str(oy), ' ',... - num2str(oz), '\n']); - fprintf(fid, 'POINT_DATA %d\n', nx*ny*nz); - fprintf(fid, ['SCALARS ', title, ' float 1\n']); - fprintf(fid,'LOOKUP_TABLE default\n'); - if ~binaryflag - spec = ['%0.', precision, 'f ']; - fprintf(fid, spec, m(:)'); - else - fwrite(fid, m(:)', 'float', 'b'); - end - - case {'STRUCTURED_GRID','UNSTRUCTURED_GRID'} - % 3. The format data proper is saved in (ASCII or Binary). Use - % fprintf to write data in the case of ASCII and fwrite for binary. - if numel(varargin)<6, error('Not enough input arguments'); end - setdataformat(fid, binaryflag); -% fprintf(fid, 'BINARY\n'); - x = varargin{1}; - y = varargin{2}; - z = varargin{3}; - if sum(size(x)==size(y) & size(y)==size(z))~=length(size(x)) - error('Input dimesions do not match') - end - n_elements = numel(x); - % 4. Type of Dataset ( can be STRUCTURED_POINTS, STRUCTURED_GRID, - % UNSTRUCTURED_GRID, POLYDATA, RECTILINEAR_GRID or FIELD ) - % This part, dataset structure, begins with a line containing the - % keyword 'DATASET' followed by a keyword describing the type of dataset. - % Then the geomettry part describes geometry and topology of the dataset. - if strcmpi(dataType,'STRUCTURED_GRID') - fprintf(fid, 'DATASET STRUCTURED_GRID\n'); - fprintf(fid, 'DIMENSIONS %d %d %d\n', size(x,1), size(x,2), size(x,3)); - else - fprintf(fid, 'DATASET UNSTRUCTURED_GRID\n'); - end - fprintf(fid, ['POINTS ' num2str(n_elements) ' float\n']); - output = [x(:)'; y(:)'; z(:)']; - - if ~binaryflag - spec = ['%0.', precision, 'f ']; - fprintf(fid, spec, output); - else - fwrite(fid, output, 'float', 'b'); - end - % 5.This final part describe the dataset attributes and begins with the - % keywords 'POINT_DATA' or 'CELL_DATA', followed by an integer number - % specifying the number of points of cells. Other keyword/data combination - % then define the actual dataset attribute values. - fprintf(fid, ['\nPOINT_DATA ' num2str(n_elements)]); - % Parse remaining argument. - vidx = find(strcmpi(varargin,'VECTORS')); - sidx = find(strcmpi(varargin,'SCALARS')); - if vidx~=0 - for ii = 1:length(vidx) - title = varargin{vidx(ii)+1}; - % Data enteries begin with a keyword specifying data type - % and numeric format. - fprintf(fid, ['\nVECTORS ', title,' float\n']); - output = [varargin{ vidx(ii) + 2 }(:)';... - varargin{ vidx(ii) + 3 }(:)';... - varargin{ vidx(ii) + 4 }(:)']; - - if ~binaryflag - spec = ['%0.', precision, 'f ']; - fprintf(fid, spec, output); - else - fwrite(fid, output, 'float', 'b'); - end -% fwrite(fid, [reshape(varargin{vidx(ii)+2},1,n_elements);... -% reshape(varargin{vidx(ii)+3},1,n_elements);... -% reshape(varargin{vidx(ii)+4},1,n_elements)],'float','b'); - end - end - if sidx~=0 - for ii = 1:length(sidx) - title = varargin{sidx(ii)+1}; - fprintf(fid, ['\nSCALARS ', title,' float\n']); - fprintf(fid, 'LOOKUP_TABLE default\n'); - if ~binaryflag - spec = ['%0.', precision, 'f ']; - fprintf(fid, spec, varargin{ sidx(ii) + 2}); - else - fwrite(fid, varargin{ sidx(ii) + 2}, 'float', 'b'); - end -% fwrite(fid, reshape(varargin{sidx(ii)+2},1,n_elements),'float','b'); - end - end - - case 'POLYDATA' - - fprintf(fid, 'ASCII\n'); - if numel(varargin)<4, error('Not enough input arguments'); end - x = varargin{2}(:); - y = varargin{3}(:); - z = varargin{4}(:); - if numel(varargin)<4, error('Not enough input arguments'); end - if sum(size(x)==size(y) & size(y)==size(z))~= length(size(x)) - error('Input dimesions do not match') - end - n_elements = numel(x); - fprintf(fid, 'DATASET POLYDATA\n'); - if mod(n_elements,3)==1 - x(n_elements+1:n_elements+2,1)=[0;0]; - y(n_elements+1:n_elements+2,1)=[0;0]; - z(n_elements+1:n_elements+2,1)=[0;0]; - elseif mod(n_elements,3)==2 - x(n_elements+1,1)=0; - y(n_elements+1,1)=0; - z(n_elements+1,1)=0; - end - nbpoint = numel(x); - fprintf(fid, ['POINTS ' num2str(nbpoint) ' float\n']); - - spec = [repmat(['%0.', precision, 'f '], 1, 9), '\n']; - - output = [x(1:3:end-2), y(1:3:end-2), z(1:3:end-2),... - x(2:3:end-1), y(2:3:end-1), z(2:3:end-1),... - x(3:3:end), y(3:3:end), z(3:3:end)]'; - - fprintf(fid, spec, output); - - switch upper(varargin{1}) - case 'LINES' - if mod(n_elements,2)==0 - nbLine = 2*n_elements-2; - else - nbLine = 2*(n_elements-1); - end - conn1 = zeros(nbLine,1); - conn2 = zeros(nbLine,1); - conn2(1:nbLine/2) = 1:nbLine/2; - conn1(1:nbLine/2) = conn2(1:nbLine/2)-1; - conn1(nbLine/2+1:end) = 1:nbLine/2; - conn2(nbLine/2+1:end) = conn1(nbLine/2+1:end)-1; - fprintf(fid,'\nLINES %d %d\n',nbLine,3*nbLine); - fprintf(fid,'2 %d %d\n',[conn1';conn2']); - case 'TRIANGLE' - ntri = length(varargin{5}); - fprintf(fid,'\nPOLYGONS %d %d\n',ntri,4*ntri); - fprintf(fid,'3 %d %d %d\n',(varargin{5}-1)'); - case 'TETRAHEDRON' - ntetra = length(varargin{5}); - fprintf(fid,'\nPOLYGONS %d %d\n',ntetra,5*ntetra); - fprintf(fid,'4 %d %d %d %d\n',(varargin{5}-1)'); - end -end -fclose(fid); -if strcmpi(filename,'matlab_export.vtk') - switch computer - case {'PCWIN','PCWIN64'} - !paraview.exe --data='matlab_export.vtk' & - % Exclamation point character is a shell escape, the rest of the - % input line will be sent to operating system. It can not take - % variables, though. The & at the end of line will return control to - % Matlab even when the outside process is still running. - case {'GLNXA64','MACI64'} - !paraview --data='matlab_export.vtk' & - end -end -end - -function setdataformat(fid, binaryflag) - -if ~binaryflag - fprintf(fid, 'ASCII\n'); -else - fprintf(fid, 'BINARY\n'); -end -end - \ No newline at end of file