diff --git a/channel/Makefile_leonardo b/channel/Makefile_leonardo
new file mode 100644
index 0000000..ff923ef
--- /dev/null
+++ b/channel/Makefile_leonardo
@@ -0,0 +1,33 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+
+# Paths (modify if necessary)
+ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))
+$(info $$ROOT_DIR is [${ROOT_DIR}])
+PARENT_DIR := $(abspath $(ROOT_DIR)/..)
+$(info $$PARENT_DIR is [${PARENT_DIR}])
+CUDECOMP_DIR = $(PARENT_DIR)/cuDecomp/build
+CUDA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none
+EXTRA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none/compat
+
+export LD_LIBRARY_PATH := $(CUDECOMP_DIR)/lib:$(LD_LIBRARY_PATH)
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=mem:managed -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include/
+LIBS = -L$(CUDECOMP_DIR)/lib/ -L$(CUDA_DIR)/lib64/ -lcudecomp_fort -lcudecomp -cudalib=cufft -lnvToolsExt
+
+# Source and Object Files
+OBJS = module.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+	$(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+	$(RM) $(EXEC)
+
+.SUFFIXES: .f90 .o
+
+.f90.o:
+	$(FC) -c $(FFLAGS) $<
diff --git a/channel/Makefile_local b/channel/Makefile_local
new file mode 100644
index 0000000..6cb5517
--- /dev/null
+++ b/channel/Makefile_local
@@ -0,0 +1,28 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+RM = /bin/rm -f
+
+# Paths (modify if necessary)
+CUDECOMP_DIR = /home/milton/TCF36/cuDecomp/build/
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=mem:managed -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include
+LIBS = -L$(CUDECOMP_DIR)/lib -L/usr/local/cuda/lib64 -lcudecomp_fort -lcudecomp -cudalib=cufft
+
+# Source and Object Files
+OBJS = module.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+	$(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+	$(RM) $(EXEC)
+
+.SUFFIXES: .f90 .o
+
+.f90.o:
+	$(FC) -c $(FFLAGS) $<
+
+
diff --git a/channel/README.md b/channel/README.md
new file mode 100644
index 0000000..e73f5c9
--- /dev/null
+++ b/channel/README.md
@@ -0,0 +1,31 @@
+# Channel configuration
+
+Code for turbulent channel flow simulations (closed channel).
+Solution of the Navier-Stokes equations + phase-field method (ACDI) + energy equation (passive scalar).
+For time integration, a fully explicit scheme is used: explicit Euler for ACDI + RK3 for Navier-Stokes and temperature.
+
+## Main features at a glance:
+- No-slip boundary conditions at the two walls ($z=\pm h$)
+- Poisson solver based on 2D FFTs along $x$ and $y$ + TDMA along $z$
+- Non-uniform grid along $z$; any stretching function can be used (defined in readinput.f90, see the sketch below)
+- Domain of arbitrary dimensions along all three directions
+
+**Autotuning of decomposition**
+- Default: `pr=0` and `pc=0` → enables **automatic process decomposition**.
+- Only input needed: **total number of MPI tasks**.
+- No recompilation required when changing the number of MPI processes.
+
+**Optional features (conditional compilation)**
+- Phase-field module: can be enabled or disabled. Default is single-phase.
+- Passive scalar: can be enabled or disabled. Default is disabled.
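+A typical stretching choice is a tanh clustering law controlled by the stretching factor of `input.inp` (`csi` in module `param`). The snippet below is only an illustrative sketch (the local names `zf` and `zn` are hypothetical), not necessarily the exact function implemented in readinput.f90:
+
+```fortran
+! illustrative tanh clustering of the nz+1 cell faces toward both walls;
+! csi -> 0 tends to a uniform grid (csi = 0 itself divides by tanh(0))
+do k = 1, nz+1
+   zn = 2.0d0*dble(k-1)/dble(nz) - 1.0d0               ! normalized coordinate in [-1,1]
+   zf(k) = 0.5d0*lz*(1.0d0 + tanh(csi*zn)/tanh(csi))   ! face position in [0,lz]
+enddo
+```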
+
+## Turbulent channel flow
+- Turbulent channel flow at $Re_\tau=590$, Grid: $1536 \times 768 \times 576$
+
+![Test](../val/tcf2.png)
+
+## Node numbering and staggered grid
+
+X-pencil configuration:
+
+![Test](../val/grid_channel.png)
diff --git a/channel/binder.sh b/channel/binder.sh
new file mode 100755
index 0000000..b449dda
--- /dev/null
+++ b/channel/binder.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+case $(( ${OMPI_COMM_WORLD_LOCAL_RANK} )) in
+0) export UCX_NET_DEVICES=mlx5_0:1 ;;
+1) export UCX_NET_DEVICES=mlx5_1:1 ;;
+2) export UCX_NET_DEVICES=mlx5_2:1 ;;
+3) export UCX_NET_DEVICES=mlx5_3:1 ;;
+esac
+
+echo "Launching on $UCX_NET_DEVICES"
+
+"$@"
+
diff --git a/channel/go_leo.sh b/channel/go_leo.sh
new file mode 100644
index 0000000..2424115
--- /dev/null
+++ b/channel/go_leo.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+#SBATCH --account="IscrB_EXCEED"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:05:00
+#SBATCH --nodes=1 ## adjust as needed
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=8
+#SBATCH --output=test.out
+#SBATCH -p boost_usr_prod
+#SBATCH --error=test.err
+#SBATCH --qos=boost_qos_dbg
+
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+chmod +x binder.sh
+mpirun -np 4 --map-by node:PE=8 --rank-by core ./binder.sh ./mhit36
\ No newline at end of file
diff --git a/channel/input.inp b/channel/input.inp
new file mode 100644
index 0000000..7e814cb
--- /dev/null
+++ b/channel/input.inp
@@ -0,0 +1,21 @@
+0 ! fresh start=0, restart=1
+0 ! iteration to start from (0 for a fresh start, otherwise different)
+200000 ! Final time step
+1000 ! Saving frequency
+4.0 ! Streamwise domain length Lx
+4.0 ! Spanwise domain width Ly
+2.0 ! Wall-normal domain height Lz=2h
+0.01 ! Stretching factor: ~0=quasi-uniform, 1.5=mid-stretching, 3.0=extreme (do not use exactly zero)
+1 ! inflow condition (0=zero, 1=perturbed 3D field, 2=read from fields)
+0 ! phase-field initial condition (0=circular drop, 1=read from fields)
+0 ! theta initial condition (0=uniform, 1=read from fields)
+0.0001 ! dt for the simulation
+0.0066666 ! mu (viscosity)
+1.000 ! density
+0.0 ! Pressure gradient along x (negative if flow along x)
+0.0 ! Pressure gradient along y (negative if flow along y)
+0.5 ! Initial drop radius for phase-field
+0.1d0 ! Surface tension
+1.0 ! Ratio eps/dx
+0.00666666 ! kappa (thermal diffusivity)
+50.0 ! alphag for buoyancy
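For reference, a minimal sketch of how an input file in this format can be parsed (the actual implementation lives in readinput.f90, which is not part of this diff; the unit number and the strictly ordered reads are assumptions):

    ! values are consumed in file order, one per line; once the leading value
    ! is read, the rest of the record (the '! ...' annotation) is skipped
    open(unit=55, file='input.inp', status='old')
    read(55,*) restart      ! fresh start=0, restart=1
    read(55,*) tstart       ! iteration to start from
    read(55,*) tfin         ! final time step
    read(55,*) dump         ! saving frequency
    read(55,*) lx           ! streamwise domain length
    ! ... remaining entries (ly, lz, csi, inflow, inphi, intheta, dt, mu, ...)
    close(55)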
diff --git a/channel/leo.sh b/channel/leo.sh
new file mode 100644
index 0000000..f3d5f46
--- /dev/null
+++ b/channel/leo.sh
@@ -0,0 +1,8 @@
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+cp Makefile_leonardo Makefile
+make clean
+make
+mkdir -p output
+
diff --git a/channel/local.sh b/channel/local.sh
new file mode 100644
index 0000000..4f27984
--- /dev/null
+++ b/channel/local.sh
@@ -0,0 +1,15 @@
+NVARCH=Linux_x86_64; export NVARCH
+NVCOMPILERS=/opt/nvidia/hpc_sdk; export NVCOMPILERS
+MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/compilers/man; export MANPATH
+PATH=$NVCOMPILERS/$NVARCH/25.7/compilers/bin:$PATH; export PATH
+export PATH=$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/bin:$PATH
+export MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/man
+LD_LIBRARY_PATH=/home/milton/MHIT36/cuDecomp/build/lib; export LD_LIBRARY_PATH
+# clean the output folder
+rm -rf output
+mkdir output
+cp Makefile_local Makefile
+#rm *.dat
+make clean
+make
+mpirun -np 2 ./mhit36
diff --git a/channel/main.f90 b/channel/main.f90
new file mode 100644
index 0000000..9c4f06f
--- /dev/null
+++ b/channel/main.f90
@@ -0,0 +1,1028 @@
+#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1)
+
+program main
+use cudafor
+use cudecomp
+use cufft
+use mpi
+use velocity
+use phase
+use temperature
+use param
+use mpivar
+use cudecompvar
+use nvtx
+
+
+implicit none
+! timer for scaling tests
+real :: t_start, t_end, elapsed
+! grid dimensions and decomposition
+integer :: comm_backend
+integer :: pr, pc
+! cudecomp
+! cuFFT
+integer :: planXf, planXb, planY
+integer :: batchsize
+integer :: status
+integer :: i,j,k,il,jl,kl,ig,jg,kg,t,stage
+integer :: im,ip,jm,jp,km,kp,last,idx
+! TDMA variables
+double precision, allocatable :: a(:), b(:), c(:)
+double complex, allocatable :: d(:), sol(:)
+! working arrays
+double complex, allocatable :: psi(:)
+double precision, allocatable :: ua(:,:,:)
+double precision, allocatable :: uaa(:,:,:)
+double complex, device, allocatable :: psi_d(:)
+double precision, device, allocatable :: vel_d(:) ! only used for implicit diffusion in z
+double complex, pointer, device, contiguous :: work_d(:), work_halo_d(:), work_d_d2z(:), work_halo_d_d2z(:)
+character(len=40) :: namefile
+character(len=4) :: itcount
+! Code variables
+double precision ::err, maxErr, meanp, gmeanp
+double complex, device, pointer :: psi3d(:,:,:)
+double precision :: k2
+!integer :: il, jl, ig, jg
+integer :: offsets(3), xoff, yoff
+integer :: np(3)
+! Williamson classic low-storage RK3 coefficients
+double precision, parameter :: alpha(3) = (/ 8.d0/15.d0, 5.d0/12.d0, 3.d0/4.d0 /)
+double precision, parameter :: beta(3) = (/ 0.d0, -17.d0/60.d0, -5.d0/12.d0 /)
+! Stage coefficients for diffusion-optimized SSP RK3
+!real(kind=8), parameter :: alpha(3) = (/ 0.444370493651235d0, 0.555629506348765d0, 1.0d0 /)
+!real(kind=8), parameter :: beta(3) = (/ 0.0d0, -0.122243120495896d0, -0.377756879504104d0 /)
+
+! Enable or disable the phase field
+#define phiflag 0
+! Enable or disable the temperature field
+#define thetaflag 1
+
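+! Time integration note: each RK3 stage advances a field f as
+!    f(s) = f(s-1) + dt*( alpha(s)*RHS(s) + beta(s)*RHS(s-1) ),   s = 1,2,3
+! with beta(1) = 0, so the first stage uses no previous RHS (plain Euler).
+! The Williamson coefficients satisfy sum(alpha+beta) = 8/15 + 2/15 + 1/3 = 1,
+! i.e. exactly one dt is covered per full time step.
+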
+!########################################################################################################################################
+! 1. INITIALIZATION OF MPI AND cuDECOMP AUTOTUNING : START
+!########################################################################################################################################
+! MPI initialization: rank holds the local MPI rank number, ranks the total number of tasks
+! Same procedure as described in the cuDecomp documentation
+call mpi_init(ierr)
+call mpi_comm_rank(MPI_COMM_WORLD, rank, ierr)
+call mpi_comm_size(MPI_COMM_WORLD, ranks, ierr)
+call mpi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, localComm, ierr)
+call mpi_comm_rank(localComm, localRank, ierr)
+ierr = cudaSetDevice(localRank) ! assign a GPU to each MPI rank
+
+! Define grid and decomposition
+call readinput
+
+! domain decomposition (pencils in y and z)
+pr = 0
+pc = 0
+halo_ext=1
+
+! cuDecomp initialization and settings
+comm_backend = 0 ! enable full autotuning
+CHECK_CUDECOMP_EXIT(cudecompInit(handle, MPI_COMM_WORLD))
+! config is a struct; pr and pc are the numbers of pencils along the two decomposed directions
+! create an uninitialized configuration struct and initialize it to defaults using cudecompGridDescConfigSetDefaults
+CHECK_CUDECOMP_EXIT(cudecompGridDescConfigSetDefaults(config)) ! initializing to defaults is required so that no entries are left uninitialized
+pdims = [pr, pc]
+config%pdims = pdims
+halo = [0, halo_ext, halo_ext] ! no halo needed along x: it is periodic and x-pencils are used in physical space
+! for transpositions
+config%transpose_comm_backend = comm_backend
+config%transpose_axis_contiguous = .true.
+! for halo exchanges
+config%halo_comm_backend = CUDECOMP_HALO_COMM_MPI
+! settings for periodic halos: x and y periodic, z bounded by walls (not required to be in config)
+halo_periods = [.true., .true., .false.]
+! create the spectral grid descriptor first, to select pdims for optimal transposes
+gdims = [nx/2+1, ny, nz]
+config%gdims = gdims
+! Set up autotuning options for the spectral grid (transpose-related settings)
+CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
+options%dtype = CUDECOMP_DOUBLE_COMPLEX
+if (comm_backend == 0) then
+  options%autotune_transpose_backend = .true.
+  options%autotune_halo_backend = .false.
+endif
+options%transpose_use_inplace_buffers = .true.
+options%transpose_input_halo_extents(:, 1) = halo
+options%transpose_output_halo_extents(:, 4) = halo
+CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_descD2Z, config, options))
+! create the physical grid descriptor:
+! take the previous config and modify the global grid (nx instead of nx/2+1);
+! reset transpose_comm_backend to its default value to avoid picking up a possible nvshmem
+! transpose backend selection (this impacts how workspaces are allocated)
+gdims = [nx, ny, nz]
+config%gdims = gdims
+config%transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P
+! Set up autotuning options for the physical grid (halo-related settings)
+CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
+options%dtype = CUDECOMP_DOUBLE_COMPLEX
+if (comm_backend == 0) then
+  options%autotune_halo_backend = .true.
+endif
+options%halo_extents(:) = halo
+options%halo_periods(:) = halo_periods
+options%halo_axis = 1
+CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_desc, config, options))
+! Get pencil info for the physical-space grid descriptor (pencil structs piX, piY and piZ)
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piX, 1, halo))
+nElemX = piX%size !<- total number of elements in the x-configuration (including halos)
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piY, 2))
+nElemY = piY%size
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piZ, 3))
+nElemZ = piZ%size
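+! Index bookkeeping used throughout this file: local pencil indices map to
+! global grid indices as
+!    kg = piX%lo(3) + k - 1 - halo_ext
+! so interior points span k = 1+halo_ext .. piX%shape(3)-halo_ext locally,
+! and the lowest rank (piX%lo(3) = 1) maps its first interior point to kg = 1.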
+! Get workspace sizes for transposes (not used with this descriptor) and halos (used)
+CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_desc, nElemWork))
+CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_desc, 1, halo, nElemWork_halo))
+! Get pencil info for the spectral-space grid descriptor
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piX_d2z, 1,halo))
+nElemX_d2z = piX_d2z%size !<- total number of elements in the x-configuration (including halos)
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piY_d2z, 2))
+nElemY_d2z = piY_d2z%size
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piZ_d2z, 3))
+nElemZ_d2z = piZ_d2z%size
+! Get workspace sizes for transposes (used) and halos (not used with this descriptor)
+CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_descD2Z, nElemWork_d2z))
+CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_descD2Z, 1, halo, nElemWork_halo_d2z))
+! End of cuDecomp initialization
+
+
+! cuFFT initialization -- create plans (along x and y only; no FFT along z, which uses TDMA)
+! Forward 1D FFT in X: D2Z
+batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFTs (from the x-pencil dimensions)
+status = cufftPlan1D(planXf, nx, CUFFT_D2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan Forward'
+! Backward 1D FFT in X: Z2D
+batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFTs (from the x-pencil dimensions)
+status = cufftPlan1D(planXb, nx, CUFFT_Z2D, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan Backward'
+! the indices are always 2 and 3 because y-pencils are ordered (y,z,x)
+batchSize = piY_d2z%shape(2)*piY_d2z%shape(3)
+status = cufftPlan1D(planY, ny, CUFFT_Z2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Y plan Forward & Backward'
+!########################################################################################################################################
+! 1. INITIALIZATION AND cuDECOMP AUTOTUNING : END
+!########################################################################################################################################
+
+
+
+
+
+!########################################################################################################################################
+! START STEP 2: ALLOCATE ARRAYS
+!########################################################################################################################################
+! Pressure variables
+allocate(rhsp(piX%shape(1), piX%shape(2), piX%shape(3)))
+allocate(p(piX%shape(1), piX%shape(2), piX%shape(3)))
+allocate(psi_d(max(nElemX_d2z, nElemY_d2z, nElemZ_d2z)))
+!NS variables
+allocate(u(piX%shape(1),piX%shape(2),piX%shape(3)),v(piX%shape(1),piX%shape(2),piX%shape(3)),w(piX%shape(1),piX%shape(2),piX%shape(3))) ! velocity vector
+! allocate(ustar(piX%shape(1),piX%shape(2),piX%shape(3)),vstar(piX%shape(1),piX%shape(2),piX%shape(3)),wstar(piX%shape(1),piX%shape(2),piX%shape(3))) ! provisional velocity field
+allocate(rhsu(piX%shape(1),piX%shape(2),piX%shape(3)),rhsv(piX%shape(1),piX%shape(2),piX%shape(3)),rhsw(piX%shape(1),piX%shape(2),piX%shape(3))) ! right-hand sides for u,v,w
+allocate(rhsu_o(piX%shape(1),piX%shape(2),piX%shape(3)),rhsv_o(piX%shape(1),piX%shape(2),piX%shape(3)),rhsw_o(piX%shape(1),piX%shape(2),piX%shape(3))) ! previous-stage right-hand sides for u,v,w (RK3)
+!allocate(div(piX%shape(1),piX%shape(2),piX%shape(3))) ! (debug only)
+!TDMA solver
+allocate(a(0:nz+1),b(0:nz+1),c(0:nz+1),d(0:nz+1),sol(0:nz+1))
+!PFM variables
+#if phiflag == 1
+allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces
+#endif
+!Temperature variables
+#if thetaflag == 1
+allocate(theta(piX%shape(1),piX%shape(2),piX%shape(3)),rhstheta(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(rhstheta_o(piX%shape(1),piX%shape(2),piX%shape(3)))
+#endif
+! allocate arrays for transpositions and halo exchanges
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_d, nElemWork))
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_halo_d, nElemWork_halo))
+! allocate arrays for transpositions
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_descD2Z, work_d_d2z, nElemWork_d2z))
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_descD2Z, work_halo_d_d2z, nElemWork_halo_d2z)) ! not required
+!########################################################################################################################################
+! END STEP 2: ALLOCATE ARRAYS
+!########################################################################################################################################
+
+
+
+
+
+
+
+
+!########################################################################################################################################
+! START STEP 3: FLOW FIELD, PHASE-FIELD AND TEMPERATURE INIT
+!########################################################################################################################################
+! 3.1 Read/initialize the fields without halo grid points (avoids out-of-bounds accesses when reading using MPI I/O)
+! 3.2 Call halo exchanges along Y and Z for u, v, w, phi and theta
+if (restart .eq. 0) then ! fresh start: analytical initialization or read from files in the init folder
+if (rank.eq.0) write(*,*) "Initialize velocity field (fresh start)"
+  if (inflow .eq. 0) then
+    if (rank.eq.0) write(*,*) "Initialize zero velocity field"
+    do k = 1+halo_ext, piX%shape(3)-halo_ext
+      do j = 1+halo_ext, piX%shape(2)-halo_ext
+        do i = 1, piX%shape(1)
+          u(i,j,k) = 0.0d0
+          v(i,j,k) = 0.0d0
+          w(i,j,k) = 0.0d0
+        enddo
+      enddo
+    enddo
+  endif
+  if (inflow .eq. 1) then
+    if (rank.eq.0) write(*,*) "Initialize laminar flow (x) + 3D perturbation"
+    do k = 1+halo_ext, piX%shape(3)-halo_ext
+      kg = piX%lo(3) + k - 1 - halo_ext
+      do j = 1+halo_ext, piX%shape(2)-halo_ext
+        jg = piX%lo(2) + j - 1 - halo_ext
+        do i = 1, piX%shape(1)
+          amp=3.d0
+          mx=3.03d0
+          my=2.02d0
+          mz=4.d0
+          ! 3D divergence-free flow with fluctuations that satisfy the boundary conditions
+          u(i,j,k) = 1.d0*(1.d0 - ((2*z(kg) - lz)/lz)**2) ! laminar base profile
+ u(i,j,k) = u(i,j,k) - amp*cos(twopi*mx*x(i)/lx)*sin(twopi*my*y(jg)/ly)*2.d0*twopi/lz*sin(twopi*z(kg)/lz)*cos(twopi*z(kg)/lz) + u(i,j,k) = u(i,j,k) + amp*sin(twopi*mx*x(i)/lx)*(-twopi*my/ly)*sin(2.d0*twopi*my*y(jg)/ly)*sin(twopi*z(kg)/lz)*sin(twopi*z(kg)/lz) + v(i,j,k) = -amp*cos(twopi*my*y(jg)/ly)*(twopi*mx/lx)*cos(twopi*mx*x(i)/lx)*sin(twopi*z(kg)/lz)*sin(twopi*z(kg)/lz) + w(i,j,k) = amp*cos(twopi*mx*x(i)/lx)*(twopi*mx/lx)*sin(twopi*my*y(jg)/ly)*sin(twopi*z(kg)/lz)*sin(twopi*z(kg)/lz) + enddo + enddo + enddo + endif + if (inflow .eq. 2) then + if (rank.eq.0) write(*,*) "Initialize from data" + call readfield(1) + call readfield(2) + call readfield(3) + endif +endif +if (restart .eq. 1) then !restart, ignore inflow and read the tstart field + if (rank.eq.0) write(*,*) "Initialize velocity field (from output folder), iteration:", tstart + call readfield_restart(tstart,1) + call readfield_restart(tstart,2) + call readfield_restart(tstart,3) +endif + +! update halo cells along y and z directions (enough only if pr and pc are non-unitary) +!$acc host_data use_device(u) +CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) +CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) +!$acc end host_data +!$acc host_data use_device(v) +CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) +CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) +!$acc end host_data +!$acc host_data use_device(w) +CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) +CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) +!$acc end host_data + +! initialize phase-field +#if phiflag == 1 +if (restart .eq. 0) then +if (rank.eq.0) write(*,*) 'Initialize phase field (fresh start)' + if (inphi .eq. 0) then + if (rank.eq.0) write(*,*) 'Spherical drop' + do k = 1+halo_ext, piX%shape(3)-halo_ext + kg = piX%lo(3) + k - 1 - halo_ext + do j = 1+halo_ext, piX%shape(2)-halo_ext + jg = piX%lo(2) + j - 1 - halo_ext + do i = 1, piX%shape(1) + pos=(x(i)-lx/2)**2d0 + (y(jg)-ly/2)**2d0 + (z(kg)-lz/2)**2d0 + phi(i,j,k) = 0.5d0*(1.d0-tanh((sqrt(pos)-radius)/2/eps)) + enddo + enddo + enddo + endif + if (inphi .eq. 1) then + if (rank.eq.0) write(*,*) "Initialize phase-field from data" + call readfield(5) + endif +endif +if (restart .eq. 1) then + write(*,*) "Initialize phase-field (restart, from output folder), iteration:", tstart + call readfield_restart(tstart,5) +endif +! update halo +!$acc host_data use_device(phi) +CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) +CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) +!$acc end host_data +#endif + +! initialize temperature field +#if thetaflag == 1 +if (restart .eq. 0) then +if (rank.eq.0) write(*,*) 'Initialize temperature field (fresh start)' + if (intheta .eq. 
0) then
+    if (rank.eq.0) write(*,*) 'Uniform temperature field'
+    do k = 1+halo_ext, piX%shape(3)-halo_ext
+      kg = piX%lo(3) + k - 1 - halo_ext
+      do j = 1+halo_ext, piX%shape(2)-halo_ext
+        do i = 1, piX%shape(1)
+          theta(i,j,k) = 0.d0 !1.d0 - z(kg) ! uniform temperature (linear profile commented out)
+        enddo
+      enddo
+    enddo
+  endif
+  if (intheta .eq. 1) then
+    if (rank.eq.0) write(*,*) "Initialize temperature from data"
+    call readfield(6)
+  endif
+endif
+if (restart .eq. 1) then
+  write(*,*) "Initialize temperature (restart, from output folder), iteration:", tstart
+  call readfield_restart(tstart,6)
+endif
+! update halos
+!$acc host_data use_device(theta)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, theta, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, theta, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+#endif
+
+! Save initial fields (only for a fresh start)
+if (restart .eq. 0) then
+  if (rank.eq.0) write(*,*) "Save initial fields"
+  call writefield(tstart,1)
+  call writefield(tstart,2)
+  call writefield(tstart,3)
+  call writefield(tstart,4)
+  #if phiflag == 1
+  call writefield(tstart,5)
+  #endif
+  #if thetaflag == 1
+  call writefield(tstart,6) ! temperature
+  #endif
+endif
+!########################################################################################################################################
+! END STEP 3: FLOW FIELD, PHASE-FIELD AND TEMPERATURE FIELD INIT
+!########################################################################################################################################
+
+
+
+
+
+
+! ########################################################################################################################################
+! START TEMPORAL LOOP: STEPS 4 TO 9 REPEATED AT EVERY TIME STEP
+! ########################################################################################################################################
+! The first step uses explicit Euler (beta(1)=0); gumax is bootstrapped to 1
+gumax=1.d0
+tstart=tstart+1
+gamma=1.d0*gumax
+!$acc data copyin(piX)
+#if thetaflag == 1
+!$acc data create(rhsu_o, rhsv_o, rhsw_o, rhstheta_o)
+#else
+!$acc data create(rhsu_o, rhsv_o, rhsw_o)
+#endif
+!$acc data copyin(mysin, mycos)
+call cpu_time(t_start)
+! Start temporal loop
+do t=tstart,tfin
+  ! Create a custom label for each NVTX marker
+  write(itcount,'(i4)') t
+  ! Range with custom color
+  call nvtxStartRange("Iteration "//itcount,t)
+
+  if (rank.eq.0) write(*,*) "Time step",t,"of",tfin
+  call cpu_time(times)
+
+  call nvtxStartRange("Phase-field")
+  !########################################################################################################################################
+  ! START STEP 4: PHASE-FIELD SOLVER (EXPLICIT)
+  !########################################################################################################################################
+  #if phiflag == 1
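+  ! The explicit ACDI step below advances
+  !    d(phi)/dt + div(u*phi) = gamma * div( eps*grad(phi) - 0.25*(1 - tanh^2(psi/(2*eps))) * nhat )
+  ! where psi = eps*log((phi+enum)/(1-phi+enum)) is the reconstructed signed-distance
+  ! function, nhat = grad(psi)/|grad(psi)| the interface normal, and gamma scales
+  ! with the maximum velocity magnitude (gamma = gumax).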
+  !$acc kernels
+  do k=1, piX%shape(3)
+    do j=1, piX%shape(2)
+      do i=1,nx
+        ! compute the distance function psi (used to compute normals)
+        val = min(phi(i,j,k),1.0d0) ! clip machine-precision overshoots in phi that would break the log
+        psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+        ! precompute the tanh of the distance function psi (used in the sharpening term) to avoid repeated tanh evaluations
+        tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+
+  gamma=1.d0*gumax
+  !$acc parallel loop tile(16,4,2)
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ! 4.1 RHS computation
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        kg=piX%lo(3) + k - 1 - halo_ext
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        ! convective (first three lines) and diffusive (last three lines) terms
+        rhsphi(i,j,k) = &
+        - (u(ip,j,k)*0.5d0*(phi(ip,j,k)+phi(i,j,k)) - u(i,j,k)*0.5d0*(phi(i,j,k)+phi(im,j,k)))*dxi &
+        - (v(i,jp,k)*0.5d0*(phi(i,jp,k)+phi(i,j,k)) - v(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,jm,k)))*dyi &
+        - (w(i,j,kp)*0.5d0*(phi(i,j,kp)+phi(i,j,k)) - w(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,j,km)))*dzci(kg) &
+        + gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + &
+                 eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddyi + &
+                 eps*((phi(i,j,kp)-phi(i,j,k))*dzi(kg+1) - (phi(i,j,k) -phi(i,j,km))*dzi(kg))*dzci(kg)) ! first differences between centers, then between faces
+        ! Compute normals for the sharpening term (gradient of psi)
+        normx(i,j,k) = 0.5d0*(psidi(ip,j,k) - psidi(im,j,k))*dxi
+        normy(i,j,k) = 0.5d0*(psidi(i,jp,k) - psidi(i,jm,k))*dyi
+        normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))/(z(kg+1)-z(kg-1))
+      enddo
+    enddo
+  enddo
+
+  ! Update normx, normy and normz halos, required to compute their derivatives in the sharpening term
+  !$acc host_data use_device(normx,normy,normz)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+
+  ! 4.1.3. Normalize the psi gradient to obtain unit interface normals
+  !$acc kernels
+  do k=1, piX%shape(3)
+    do j=1, piX%shape(2)
+      do i=1,nx
+        normod = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + 1.0E-16)
+        ! normod = 1.d0/(sqrt(normx(i,j,k)**2d0 + normy(i,j,k)**2d0 + normz(i,j,k)**2d0) + 1.0E-16)
+        normx(i,j,k) = normx(i,j,k)*normod
+        normy(i,j,k) = normy(i,j,k)*normod
+        normz(i,j,k) = normz(i,j,k)*normod
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+
+  ! Compute the sharpening term
+  !$acc kernels
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        kg=piX%lo(3) + k - 1 - halo_ext
+        if (ip .gt. nx) ip=1
+        if (im .lt. 
1) im=nx + rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-tanh_psi(ip,j,k)*tanh_psi(ip,j,k))*normx(ip,j,k) - & + 0.25d0*(1.d0-tanh_psi(im,j,k)*tanh_psi(im,j,k))*normx(im,j,k))*0.5*dxi + & + (0.25d0*(1.d0-tanh_psi(i,jp,k)*tanh_psi(i,jp,k))*normy(i,jp,k) - & + 0.25d0*(1.d0-tanh_psi(i,jm,k)*tanh_psi(i,jm,k))*normy(i,jm,k))*0.5*dyi + & + (0.25d0*(1.d0-tanh_psi(i,j,kp)*tanh_psi(i,j,kp))*normz(i,j,kp) - & + 0.25d0*(1.d0-tanh_psi(i,j,km)*tanh_psi(i,j,km))*normz(i,j,km))/(z(kg+1)-z(kg-1))) + enddo + enddo + enddo + !$acc end kernels + + ! Get phi at n+1 + !$acc kernels + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + phi(i,j,k) = phi(i,j,k) + dt*rhsphi(i,j,k) + enddo + enddo + enddo + !$acc end kernels + + !$acc host_data use_device(phi) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + #endif + !######################################################################################################################################## + ! END STEP 4: PHASE-FIELD SOLVER (EXPLICIT) + !######################################################################################################################################## + call nvtxEndRange + + + !######################################################################################################################################## + ! START STEP 5: ENERGY EQUATION SOLVER (RK3 EXPLICIT) + !######################################################################################################################################## + #if thetaflag == 1 + ! Temperature solver (RK3 explicit) + do stage = 1,3 + !$acc parallel loop tile(16,4,2) + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + ip=i+1 + jp=j+1 + kp=k+1 + im=i-1 + jm=j-1 + km=k-1 + kg=piX%lo(3) + k - 1 - halo_ext + if (ip .gt. nx) ip=1 + if (im .lt. 1) im=nx + ! convective terms + rhstheta(i,j,k) = & + - (u(ip,j,k)*0.5d0*(theta(ip,j,k)+theta(i,j,k)) - u(i,j,k)*0.5d0*(theta(i,j,k)+theta(im,j,k)))*dxi & + - (v(i,jp,k)*0.5d0*(theta(i,jp,k)+theta(i,j,k)) - v(i,j,k)*0.5d0*(theta(i,j,k)+theta(i,jm,k)))*dyi & + - (w(i,j,kp)*0.5d0*(theta(i,j,kp)+theta(i,j,k)) - w(i,j,k)*0.5d0*(theta(i,j,k)+theta(i,j,km)))*dzci(kg) + ! diffusive terms + rhstheta(i,j,k) = rhstheta(i,j,k) + kappa*((theta(ip,j,k)-2.d0*theta(i,j,k)+theta(im,j,k))*ddxi + & + (theta(i,jp,k)-2.d0*theta(i,j,k)+theta(i,jm,k))*ddyi + & + ((theta(i,j,kp)-theta(i,j,k))*dzi(kg+1) - (theta(i,j,k) -theta(i,j,km))*dzi(kg))*dzci(kg)) + + enddo + enddo + enddo + !$acc parallel loop collapse(3) + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + theta(i,j,k) = theta(i,j,k) + dt*alpha(stage)*rhstheta(i,j,k) + dt*beta(stage)*rhstheta_o(i,j,k) + rhstheta_o(i,j,k)=rhstheta(i,j,k) + enddo + enddo + enddo + ! update halos + !$acc host_data use_device(theta) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, theta, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, theta, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + ! 
impose temperature boundary conditions, interpolated as done for the velocity (see node sketch) + !$acc parallel loop collapse(3) + do k=1, piX%shape(3) + do j=1, piX%shape(2) + do i=1,nx + kg = piX%lo(3) + k - 1 - halo_ext + if (kg .eq. 1) theta(i,j,k-1) = 2.d0*( 1.d0) - theta(i,j,k) ! mean value between kg and kg-1 (top wall) equal to 1 + if (kg .eq. nz) theta(i,j,k+1) = 2.d0*(-1.d0) - theta(i,j,k) ! mean value between kg and kg+1 (bottom wall) equal to -1 + enddo + enddo + enddo + enddo + #endif + !######################################################################################################################################## + ! END STEP 5: ENERGY EQUATION SOLVER (RK3 EXPLICIT) + !######################################################################################################################################## + + + + + call nvtxStartRange("Projection") + !######################################################################################################################################## + ! START STEP 6: USTAR COMPUTATION (PROJECTION STEP) + !######################################################################################################################################## + ! 6.1 compute rhs + ! 6.2 obtain ustar and store old rhs in rhs_o + ! 6.3 Call halo exchnages along Y and Z for u,v,w + + ! Projection step + do stage = 1,3 + !$acc parallel loop tile(16,4,2) + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + ip=i+1 + jp=j+1 + kp=k+1 + im=i-1 + jm=j-1 + km=k-1 + kg=piX%lo(3) + k - 1 - halo_ext + if (ip .gt. nx) ip=1 + if (im .lt. 1) im=nx + ! compute the products (conservative form) + h11 = 0.25d0*((u(ip,j,k)+u(i,j,k))*(u(ip,j,k)+u(i,j,k)) - (u(i,j,k)+u(im,j,k))*(u(i,j,k)+u(im,j,k)))*dxi + h12 = 0.25d0*((u(i,jp,k)+u(i,j,k))*(v(i,jp,k)+v(im,jp,k)) - (u(i,j,k)+u(i,jm,k))*(v(i,j,k)+v(im,j,k)))*dyi + h13 = 0.25d0*((u(i,j,kp)+u(i,j,k))*(w(i,j,kp)+w(im,j,kp)) - (u(i,j,k)+u(i,j,km))*(w(i,j,k)+w(im,j,k)))*dzci(kg) ! divide by cell height + h21 = 0.25d0*((u(ip,j,k)+u(ip,jm,k))*(v(ip,j,k)+v(i,j,k)) - (u(i,j,k)+u(i,jm,k))*(v(i,j,k)+v(im,j,k)))*dxi + h22 = 0.25d0*((v(i,jp,k)+v(i,j,k))*(v(i,jp,k)+v(i,j,k)) - (v(i,j,k)+v(i,jm,k))*(v(i,j,k)+v(i,jm,k)))*dyi + h23 = 0.25d0*((w(i,j,kp)+w(i,jm,kp))*(v(i,j,kp)+v(i,j,k)) - (w(i,j,k)+w(i,jm,k))*(v(i,j,k)+v(i,j,km)))*dzci(kg) ! divide by cell height + h31 = 0.25d0*((w(ip,j,k)+w(i,j,k))*(u(ip,j,k)+u(ip,j,km)) - (w(i,j,k)+w(im,j,k))*(u(i,j,k)+u(i,j,km)))*dxi + h32 = 0.25d0*((v(i,jp,k)+v(i,jp,km))*(w(i,jp,k)+w(i,j,k)) - (v(i,j,k)+v(i,j,km))*(w(i,j,k)+w(i,jm,k)))*dyi + h33 = 0.25d0*((w(i,j,kp)+w(i,j,k))*(w(i,j,kp)+w(i,j,k)) - (w(i,j,k)+w(i,j,km))*(w(i,j,k)+w(i,j,km)))*dzi(kg) ! divie by distance between centers + rhsu(i,j,k)=-(h11+h12+h13) + rhsv(i,j,k)=-(h21+h22+h23) + rhsw(i,j,k)=-(h31+h32+h33) + ! viscous/diffusive terms + h11 = mu*(u(ip,j,k)-2.d0*u(i,j,k)+u(im,j,k))*ddxi + h12 = mu*(u(i,jp,k)-2.d0*u(i,j,k)+u(i,jm,k))*ddyi + h13 = mu*((u(i,j,kp)-u(i,j,k))*dzi(kg+1)-(u(i,j,k)-u(i,j,km))*dzi(kg))*dzci(kg) + h21 = mu*(v(ip,j,k)-2.d0*v(i,j,k)+v(im,j,k))*ddxi + h22 = mu*(v(i,jp,k)-2.d0*v(i,j,k)+v(i,jm,k))*ddyi + h23 = mu*((v(i,j,kp)-v(i,j,k))*dzi(kg+1)-(v(i,j,k)-v(i,j,km))*dzi(kg))*dzci(kg) + h31 = mu*(w(ip,j,k)-2.d0*w(i,j,k)+w(im,j,k))*ddxi + h32 = mu*(w(i,jp,k)-2.d0*w(i,j,k)+w(i,jm,k))*ddyi + h33 = mu*((w(i,j,kp)-w(i,j,k))*dzci(kg+1)-(w(i,j,k)-w(i,j,km))*dzci(kg))*dzi(kg) ! 
face to face and then center to center + rhsu(i,j,k)=rhsu(i,j,k)+(h11+h12+h13)*rhoi + rhsv(i,j,k)=rhsv(i,j,k)+(h21+h22+h23)*rhoi + rhsw(i,j,k)=rhsw(i,j,k)+(h31+h32+h33)*rhoi + ! Pressure driven + rhsu(i,j,k)=rhsu(i,j,k) - gradpx + rhsv(i,j,k)=rhsv(i,j,k) - gradpy + enddo + enddo + enddo + + ! Surface tension forces (Obtain surface tension forces evaluated at the center of the cell (same as where phi is located) + #if phiflag == 1 + !$acc kernels + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + ip=i+1 + jp=j+1 + kp=k+1 + im=i-1 + jm=j-1 + km=k-1 + kg=piX%lo(3) + k - 1 - halo_ext + if (ip .gt. nx) ip=1 + if (im .lt. 1) im=nx + curv=0.5d0*(normx(ip,j,k)-normx(im,j,k))*dxi + 0.5d0*(normy(i,jp,k)-normy(i,jm,k))*dyi + (normz(i,j,kp)-normz(i,j,km))/(z(kg+1)-z(kg-1)) + fxst(i,j,k)= -sigma*curv*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi + fyst(i,j,k)= -sigma*curv*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dyi + fzst(i,j,k)= -sigma*curv*(phi(i,j,kp)-phi(i,j,km))/(z(kg+1)-z(kg-1)) + enddo + enddo + enddo + !$acc end kernels + + ! Update halo of fxst, fyst and fzst (required then to interpolate at velocity points) + !$acc host_data use_device(fxst,fyst,fzst) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + + ! Interpolate force at velocity points + !$acc parallel loop collapse(3) + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + im=i-1 + jm=j-1 + km=k-1 + if (im .lt. 1) im=nx + #if thetaflag == 0 + ! Add surface tension force only (ACDI only) + rhsu(i,j,k)=rhsu(i,j,k) + 0.5d0*(fxst(im,j,k)+fxst(i,j,k))*rhoi + rhsv(i,j,k)=rhsv(i,j,k) + 0.5d0*(fyst(i,jm,k)+fyst(i,j,k))*rhoi + rhsw(i,j,k)=rhsw(i,j,k) + 0.5d0*(fzst(i,j,km)+fzst(i,j,k))*rhoi + #elif thetaflag == 1 + ! Add here also buoyancy force if temperature is active (case ACDI + temperature) + rhsw(i,j,k)=rhsw(i,j,k) + alphag*0.5d0*(theta(i,j,km)+theta(i,j,k)) + #endif + u(i,j,k) = u(i,j,k) + dt*alpha(stage)*rhsu(i,j,k) + dt*beta(stage)*rhsu_o(i,j,k)! -dt*(alpha(stage)+beta(stage))*rho*(p(i,j,k)-p(im,j,k))*dxi + v(i,j,k) = v(i,j,k) + dt*alpha(stage)*rhsv(i,j,k) + dt*beta(stage)*rhsv_o(i,j,k)! -dt*(alpha(stage)+beta(stage))*rho*(p(i,j,k)-p(i,jm,k))*dyi + w(i,j,k) = w(i,j,k) + dt*alpha(stage)*rhsw(i,j,k) + dt*beta(stage)*rhsw_o(i,j,k)! -dt*(alpha(stage)+beta(stage))*rho*(p(i,j,k)-p(i,j,km))*dzi + rhsu_o(i,j,k)=rhsu(i,j,k) + rhsv_o(i,j,k)=rhsv(i,j,k) + rhsw_o(i,j,k)=rhsw(i,j,k) + enddo + enddo + enddo + #else + ! 5.2 find u, v and w star (RK3), single-phase case + !$acc parallel loop collapse(3) + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + ! 
Add the buoyancy force here as well if temperature is active (NS + temperature case)
+          #if thetaflag == 1
+          km=k-1
+          rhsw(i,j,k)=rhsw(i,j,k) + alphag*0.5d0*(theta(i,j,km)+theta(i,j,k))
+          #endif
+          u(i,j,k) = u(i,j,k) + dt*alpha(stage)*rhsu(i,j,k) + dt*beta(stage)*rhsu_o(i,j,k)
+          v(i,j,k) = v(i,j,k) + dt*alpha(stage)*rhsv(i,j,k) + dt*beta(stage)*rhsv_o(i,j,k)
+          w(i,j,k) = w(i,j,k) + dt*alpha(stage)*rhsw(i,j,k) + dt*beta(stage)*rhsw_o(i,j,k)
+          rhsu_o(i,j,k)=rhsu(i,j,k)
+          rhsv_o(i,j,k)=rhsv(i,j,k)
+          rhsw_o(i,j,k)=rhsw(i,j,k)
+        enddo
+      enddo
+    enddo
+    #endif
+
+    ! 5.3 update halos (y and z directions), required to compute the RHS of the Poisson equation on the staggered grid
+    !$acc host_data use_device(u,v,w)
+    CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+    CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+    CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+    CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+    CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+    CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+    !$acc end host_data
+
+    ! impose velocity boundary conditions: w lies on the wall, while u and v are mirrored so that their interpolated wall value is zero;
+    ! no-slip assumed (u=0), but this can be extended to any wall velocity
+    ! even when the grid is stretched, each cell face is centered between the two neighboring cell centers
+    !$acc parallel loop collapse(3)
+    do k=1, piX%shape(3)
+      do j=1, piX%shape(2)
+        do i=1,nx
+          kg = piX%lo(3) + k - 1 -halo_ext
+          ! bottom wall
+          if (kg .eq. 1) u(i,j,k-1)= -u(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
+          if (kg .eq. 1) v(i,j,k-1)= -v(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
+          if (kg .eq. 1) w(i,j,k)=0.d0 ! w point lies on the wall
+          ! top wall
+          if (kg .eq. nz) u(i,j,k+1)= -u(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
+          if (kg .eq. nz) v(i,j,k+1)= -v(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
+          if (kg .eq. nz+1) w(i,j,k)=0.d0 ! w point (nz+1) lies on the wall
+        enddo
+      enddo
+    enddo
+  enddo
+  !########################################################################################################################################
+  ! END STEP 6: USTAR COMPUTATION
+  !########################################################################################################################################
+  call nvtxEndRange
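+  ! The correction step needs a pressure field satisfying the Poisson equation
+  !    lap(p) = (rho/dt) * div(u*)
+  ! solved below with 2D FFTs along the periodic x and y directions and, for each
+  ! wavenumber pair (kx,ky), a tridiagonal (TDMA) solve along z:
+  !    d2p/dz2 - (kx^2 + ky^2)*p = rhs(kx,ky,z)
+  ! with homogeneous Neumann conditions (dp/dz = 0) at both walls.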
+  call nvtxStartRange("Poisson")
+  !########################################################################################################################################
+  ! START STEP 7: POISSON SOLVER FOR PRESSURE
+  !########################################################################################################################################
+  ! 7.1 Compute the RHS of the Poisson equation, div(u*): divergence at the cell center
+  ! the halo updates have already been performed, so the divergence at the pencil borders can use the starred velocities from the halos
+  call nvtxStartRange("compute RHS")
+  !$acc kernels
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        kg = piX%lo(3) + k - 1 - halo_ext
+        if (ip > nx) ip=1
+        rhsp(i,j,k) = (rho*dxi/dt)*(u(ip,j,k)-u(i,j,k))
+        rhsp(i,j,k) = rhsp(i,j,k) + (rho*dyi/dt)*(v(i,jp,k)-v(i,j,k))
+        rhsp(i,j,k) = rhsp(i,j,k) + (rho*dzci(kg)/dt)*(w(i,j,kp)-w(i,j,k))
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+  call nvtxEndRange
+
+  call nvtxStartRange("FFT forward w/ transpositions")
+  !$acc host_data use_device(rhsp)
+  status = cufftExecD2Z(planXf, rhsp, psi_d)
+  if (status /= CUFFT_SUCCESS) write(*,*) 'X forward error: ', status
+  !$acc end host_data
+  ! psi(kx,y,z) -> psi(y,z,kx)
+  CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,piX_d2z%halo_extents, [0,0,0]))
+  ! psi(y,z,kx) -> psi(ky,z,kx)
+  status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_FORWARD)
+  if (status /= CUFFT_SUCCESS) write(*,*) 'Y forward error: ', status
+  ! psi(ky,z,kx) -> psi(z,kx,ky)
+  CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX))
+  call nvtxEndRange
+
+  np(piZ_d2z%order(1)) = piZ_d2z%shape(1)
+  np(piZ_d2z%order(2)) = piZ_d2z%shape(2)
+  np(piZ_d2z%order(3)) = piZ_d2z%shape(3)
+  call c_f_pointer(c_devloc(psi_d), psi3d, piZ_d2z%shape)
+  offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1
+  offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1
+  offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1
+
+  xoff = offsets(1)
+  yoff = offsets(2)
+  npx = np(1)
+  npy = np(2)
+  call nvtxStartRange("Solution")
+  !$acc parallel loop collapse(2) gang private(a,b,c,d,factor)
+  do jl = 1, npy
+    do il = 1, npx
+      ! compute the global wavenumber indices ig and jg
+      jg = yoff + jl
+      ig = xoff + il
+      ! Set up the tridiagonal system for each (i,j):
+      ! fill the diagonals and the rhs; 0 and nz+1 are the ghost nodes
+      do k = 1, nz
+        a(k) = 2.0d0*(dzi(k)**2*dzi(k+1))/(dzi(k)+dzi(k+1))
+        c(k) = 2.0d0*(dzi(k)*dzi(k+1)**2)/(dzi(k)+dzi(k+1))
+        b(k) = -a(k) - c(k) - (kx_d(ig)**2 + ky_d(jg)**2)
+        d(k) = psi3d(k,il,jl)
+      enddo
+      ! Neumann BC at the bottom
+      a(0) = 0.0d0
+      b(0) = -1.d0*dzi(1)*dzi(1) - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg)
+      c(0) = 1.d0*dzi(1)*dzi(1)
+      d(0) = 0.0d0
+      ! Neumann BC at the top
+      a(nz+1) = 1.0d0*dzi(nz+1)*dzi(nz+1)
+      b(nz+1) = -1.0d0*dzi(nz+1)*dzi(nz+1) - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg)
+      c(nz+1) = 0.0d0
+      d(nz+1) = 0.0d0
+      ! Pin the pressure at one interior point for the zero-wavenumber mode (ig=jg=1),
+      ! which would otherwise be singular; an interior point avoids interfering with the BCs
+      if (ig == 1 .and. jg == 1) then
+        a(1) = 0.d0
+        b(1) = 1.d0
+        c(1) = 0.d0
+        d(1) = 0.d0
+      end if
+      ! Forward elimination (Thomas algorithm)
+      !$acc loop seq
+      do k = 1, nz+1
+        factor = a(k)/b(k-1)
+        b(k) = b(k) - factor*c(k-1)
+        d(k) = d(k) - factor*d(k-1)
+      end do
+      ! Back substitution
+      psi3d(nz+1,il,jl) = d(nz+1)/b(nz+1)
+      ! (a pivot check, as done in FluTAS, could be added here)
+      !$acc loop seq
+      do k = nz, 1, -1
+        psi3d(k,il,jl) = (d(k) - c(k)*psi3d(k+1,il,jl))/b(k)
+      end do
+    end do
+  end do
+  call nvtxEndRange
+
+  call nvtxStartRange("FFT backwards along x and y w/ transpositions")
+  ! psi(z,kx,ky) -> psi(ky,z,kx)
+  CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX))
+  ! psi(ky,z,kx) -> psi(y,z,kx)
+  status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_INVERSE)
+  if (status /= CUFFT_SUCCESS) write(*,*) 'Y inverse error: ', status
+  ! 
psi(y,z,kx) -> psi(kx,y,z) + CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,[0,0,0], piX_d2z%halo_extents)) + !$acc host_data use_device(p) + ! psi(kx,y,z) -> p(x,y,z) + status = cufftExecZ2D(planXb, psi_d, p) + if (status /= CUFFT_SUCCESS) write(*,*) 'X inverse error: ', status + !$acc end host_data + + ! normalize pressure (must be done here, not in the TDMA) + !$acc parallel loop collapse(3) + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + p(i,j,k) = p(i,j,k)/nx/ny + end do + end do + end do + + call nvtxEndRange + ! update halo nodes with pressure + ! Update X-pencil halos + !$acc host_data use_device(p) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !######################################################################################################################################## + ! END STEP 7: POISSON SOLVER FOR PRESSURE + !######################################################################################################################################## + call nvtxEndRange + + + call nvtxStartRange("Correction") + !######################################################################################################################################## + ! START STEP 8: VELOCITY CORRECTION + ! ######################################################################################################################################## + ! 8.1 Correct velocity + ! 8.2 Call halo update + ! Correct velocity, pressure has also the halo + !$acc kernels + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i = 1, piX%shape(1) ! equal to nx (no halo on x) + im=i-1 + jm=j-1 + km=k-1 + kg=piX%lo(3) + k - 1 - halo_ext + if (im < 1) im=nx + u(i,j,k)=u(i,j,k) - dt/rho*(p(i,j,k)-p(im,j,k))*dxi + v(i,j,k)=v(i,j,k) - dt/rho*(p(i,j,k)-p(i,jm,k))*dyi + w(i,j,k)=w(i,j,k) - dt/rho*(p(i,j,k)-p(i,j,km))*dzi(kg) + enddo + enddo + enddo + !$acc end kernels + + ! 8.3 update halos (y direction), required to then compute the RHS of Poisson equation because of staggered grid + !$acc host_data use_device(u,v,w) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + + ! impose velocity boundary conditions, can be optimized, no real gain + ! w is at the wall, u and v interpolate so that the mean value is zero + ! no-slip assumted, i.e. 
u=0, but this can be extended to any value
+    umax=0.d0
+    vmax=0.d0
+    wmax=0.d0
+    !$acc parallel loop collapse(3) reduction(max:umax,vmax,wmax)
+    do k=1, piX%shape(3)
+      do j=1, piX%shape(2)
+        do i=1,nx
+          kg = piX%lo(3) + k - 1 - halo_ext
+          ! bottom wall
+          if (kg .eq. 1) u(i,j,k-1) = -u(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
+          if (kg .eq. 1) v(i,j,k-1) = -v(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
+          if (kg .eq. 1) w(i,j,k) = 0.d0 ! w point lies on the wall
+          ! top wall
+          if (kg .eq. nz) u(i,j,k+1)= -u(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
+          if (kg .eq. nz) v(i,j,k+1)= -v(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
+          if (kg .eq. nz+1) w(i,j,k)=0.d0 ! w point (nz+1) lies on the wall
+          umax=max(umax,u(i,j,k))
+          vmax=max(vmax,v(i,j,k))
+          wmax=max(wmax,w(i,j,k))
+        enddo
+      enddo
+    enddo
+
+    call MPI_Allreduce(umax,gumax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
+    call MPI_Allreduce(vmax,gvmax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
+    call MPI_Allreduce(wmax,gwmax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
+
+    cflx=gumax*dt*dxi
+    cfly=gvmax*dt*dyi
+    cflz=gwmax*dt*nz/lz ! estimate based on the mean spacing lz/nz
+    cou=max(cflx,cfly)
+    cou=max(cou,cflz)
+    if (rank.eq.0) then
+      write(*,*) "CFL (max among tasks)", cou
+      if (cou .gt. 7) stop
+    endif
+
+    call cpu_time(timef)
+    if (rank.eq.0) print '(" Time elapsed = ",f6.1," ms")',1000*(timef-times)
+  !########################################################################################################################################
+  ! END STEP 8: VELOCITY CORRECTION
+  !########################################################################################################################################
+  call nvtxEndRange
+
+
+  !########################################################################################################################################
+  ! START STEP 9: OUTPUT FIELDS
+  ! ########################################################################################################################################
+  if (mod(t,dump) .eq. 0) then
+    if (rank .eq. 0) write(*,*) "Saving output files"
+    ! write velocity and pressure fields (1-4)
+    call writefield(t,1)
+    call writefield(t,2)
+    call writefield(t,3)
+    call writefield(t,4)
+    #if phiflag == 1
+    ! write phase field (5)
+    call writefield(t,5)
+    #endif
+    #if thetaflag == 1
+    ! write temperature field (6)
+    call writefield(t,6)
+    #endif
+  endif
+  !########################################################################################################################################
+  ! END STEP 9: OUTPUT FIELDS
+  !########################################################################################################################################
+
+call nvtxEndRange
+enddo
+call cpu_time(t_end)
+elapsed = t_end-t_start
+if (rank .eq. 0) write(*,*) 'Elapsed time (seconds):', elapsed
+!$acc end data
+!$acc end data
+!$acc end data
+
+! 
Remove allocated variables (add new) +deallocate(u,v,w) +deallocate(tanh_psi, mysin, mycos) +deallocate(rhsu,rhsv,rhsw) +deallocate(rhsu_o,rhsv_o,rhsw_o) +#if phiflag == 1 +deallocate(phi,rhsphi,normx,normy,normz) +#endif +#if thetaflag == 1 +deallocate(theta,rhstheta,rhstheta_o) +#endif + +call mpi_finalize(ierr) + +end program main \ No newline at end of file diff --git a/channel/module.f90 b/channel/module.f90 new file mode 100644 index 0000000..6ea93b7 --- /dev/null +++ b/channel/module.f90 @@ -0,0 +1,149 @@ +module param + integer, parameter :: nx=200 + integer, parameter :: ny=200 + integer, parameter :: nz=100 + double precision :: pi, rhoi, twopi + double precision :: lx, dx, dxi, ddxi + double precision :: ly, dy, dyi, ddyi + double precision :: lz + double precision, allocatable :: x(:), y(:), z(:), dzi(:), dzci(:), kx(:), ky(:) + double precision, device, allocatable :: kx_d(:), ky_d(:) + integer :: restart,tstart,tfin,dump + double precision :: gamma, normod, factor, csi + double precision :: dt,mu,rho !flow parameters + integer :: inflow, inphi, intheta + double precision :: radius, sigma, epsr, eps, pos, val, epsi, enum ! phase-field parameters + double precision :: times, timef, alphag + double precision :: gradpx, gradpy, noise, lflow, gflow, ubulk, cflx, cfly, cflz, gcflz + double precision :: amp, mx, my, mz ! for perturbed flow + double precision :: kappa ! temperature parameters: thermal diffusivity, Prandtl number +end module param + + +module mpivar + ! MPI variables + integer :: rank, ranks, ierr + integer :: localRank, localComm +end module mpivar + + +module cudecompvar + use cudecomp + integer :: npx, npy, npz + type(cudecompHandle) :: handle + type(cudecompGridDesc) :: grid_desc,grid_descD2Z + type(cudecompGridDescConfig) :: config + type(cudecompGridDescAutotuneOptions) :: options + integer :: pdims(2) ! pr x pc pencils + integer :: gdims(3) ! global grid dimensions + integer :: halo(3) ! halo extensions + integer :: halo_ext ! 0 no halo, 1 means 1 halo + type(cudecompPencilInfo) :: piX, piY, piZ ! size of the pencils in x- y- and z-configuration + type(cudecompPencilInfo) :: piX_d2z, piY_d2z, piZ_d2z ! 
size of the pencils in x- y- and z-configuration for D2Z + type(cudecompPencilInfo) :: piX_Poiss + integer(8) :: nElemX, nElemY, nElemZ, nElemWork, nElemWork_halo,nElemWork_halo_d2z + integer(8) :: nElemX_d2z, nElemY_d2z, nElemZ_d2z, nElemWork_d2z + logical :: halo_periods(3) +end module cudecompvar + + +module velocity + double precision, allocatable :: u(:,:,:), v(:,:,:), w(:,:,:) + double precision, allocatable :: rhsu(:,:,:), rhsv(:,:,:), rhsw(:,:,:) + double precision, allocatable :: rhsu_o(:,:,:), rhsv_o(:,:,:), rhsw_o(:,:,:) + complex(8), allocatable :: rhsp_complex(:,:,:) + double precision, allocatable :: rhsp(:,:,:), p(:,:,:) + double precision, allocatable :: rhspp(:,:,:), pp(:,:,:) + double precision, allocatable :: div(:,:,:) + double precision :: uc, vc, wc, umax, vmax, wmax, gumax, gvmax, gwmax, cou + double precision :: h11, h12, h13, h21, h22, h23, h31, h32, h33 + double precision, allocatable :: mysin(:), mycos(:) +end module velocity + + +module phase + double precision, allocatable :: phi(:,:,:), rhsphi(:,:,:), psidi(:,:,:), tanh_psi(:,:,:) + double precision, allocatable :: normx(:,:,:), normy(:,:,:), normz(:,:,:) + double precision :: chempot, curv + double precision, allocatable :: fxst(:,:,:), fyst(:,:,:), fzst(:,:,:) +end module phase + + +module temperature + double precision, allocatable :: theta(:,:,:), rhstheta(:,:,:) + double precision, allocatable :: rhstheta_o(:,:,:) +end module temperature + + +! added NVTX for profiing from maxcuda/NVTX_example +module nvtx +use iso_c_binding +implicit none +integer,private :: col(7) = [ int(Z'0000ff00'), int(Z'000000ff'), int(Z'00ffff00'), int(Z'00ff00ff'), int(Z'0000ffff'), int(Z'00ff0000'), int(Z'00ffffff')] +character,private,target :: tempName(256) + +type, bind(C):: nvtxEventAttributes + integer(C_INT16_T):: version=1 + integer(C_INT16_T):: size=48 ! + integer(C_INT):: category=0 + integer(C_INT):: colorType=1 ! NVTX_COLOR_ARGB = 1 + integer(C_INT):: color + integer(C_INT):: payloadType=0 ! NVTX_PAYLOAD_UNKNOWN = 0 + integer(C_INT):: reserved0 + integer(C_INT64_T):: payload ! union uint,int,double + integer(C_INT):: messageType=1 ! NVTX_MESSAGE_TYPE_ASCII = 1 + type(C_PTR):: message ! ascii char +end type + +interface nvtxRangePush + ! push range with custom label and standard color + subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA') + use iso_c_binding + character(kind=C_CHAR) :: name(256) + end subroutine + + ! push range with custom label and custom color + subroutine nvtxRangePushEx(event) bind(C, name='nvtxRangePushEx') + use iso_c_binding + import:: nvtxEventAttributes + type(nvtxEventAttributes):: event + end subroutine +end interface + +interface nvtxRangePop + subroutine nvtxRangePop() bind(C, name='nvtxRangePop') + end subroutine +end interface + +contains + +subroutine nvtxStartRange(name,id) + character(kind=c_char,len=*) :: name + integer, optional:: id + type(nvtxEventAttributes):: event + character(kind=c_char,len=256) :: trimmed_name + integer:: i + + trimmed_name=trim(name)//c_null_char + + ! move scalar trimmed_name into character array tempName + do i=1,LEN(trim(name)) + 1 + tempName(i) = trimmed_name(i:i) + enddo + + + if ( .not. 
present(id)) then + call nvtxRangePush(tempName) + else + event%color=col(mod(id,7)+1) + event%message=c_loc(tempName) + call nvtxRangePushEx(event) + end if +end subroutine + +subroutine nvtxEndRange + call nvtxRangePop +end subroutine + +end module nvtx + diff --git a/channel/plot_fields.py b/channel/plot_fields.py new file mode 100644 index 0000000..5847922 --- /dev/null +++ b/channel/plot_fields.py @@ -0,0 +1,569 @@ +# # %% +# %%html +# + +# %% +# %matplotlib widget +# %matplotlib inline + +import ipympl +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.style as style +# from ing_theme_matplotlib import mpl_style +# from qbstyles import mpl_style + +import os +import glob + +from scipy.interpolate import griddata + +from mpl_toolkits.axes_grid1 import make_axes_locatable # needed for the RMS colorbar axes below + +from matplotlib.widgets import Cursor + +import h5py + +plt.style.use("dark_background") +# mpl_style(dark=True) + + +################################################################# +foldername = './output/' + +# PARAMETERS: + +# fields to plot +fields = ['theta'] + +# number of points in each direction +# Grid parameters (user-defined) +nx = 256 # number of points in x +ny = 128 # number of points in y +nz = 200 # number of points in z + +Lx = 6.0 # length of domain in x +Ly = 3.0 # length of domain in y +Lz = 2.0 # length of domain in z + +# compute the derivative of the fields (show them instead of the normal fields) +# 0: no derivative +# 1: x derivative +# 2: y derivative +# 3: z derivative +# list more flags to compute consecutive derivatives (first-order FD) +derivative_vec = [0] + +# normal direction of the 2D slice: +# 1: x-direction +# 2: y-direction +# 3: z-direction +slice_dir = 2 + +# index to take the slice (from 1 to nx_i, choose -1 for computing the average) +slice_idx = 0 + +# slice_idx = 222 + +# time_steps to plot +# ts_vec = [0] +# ts_vec = range(800000,840500,10000) +# ts_vec = range(0,900001,100000) +ts_vec = [60000] # Test with just one timestep + +# ts_vec = [10000] + +# set 1 to compute time averaged quantities, 0 otherwise +timeaverage = 0 + +# set 1 to compute fluctuating components, 0 otherwise (expensive) +fluct = 0 + +# value for the fontsize: +fontsize_val = 10 + +# show heatmaps +showmaps_flag = 1 + +# slice of slice (leave -1 to compute the mean) +meanprof_slice = 0 + +# value of the figure size +figsize_val = 8 + +# save figure in png format +savefig_flag = 0 + +# save data in h5 format +savedata_flag = 0 + +# Aspect ratio of the heatmaps (-1 for auto, i.e. 
proportional, WARNING: can't handle huge AR) +AR_plot = -1 + +# Set your desired color limits here +vmin = None # e.g., vmin = 0.0 +vmax = None # e.g., vmax = 1.0 + +# vmin = -1 # e.g., vmin = 0.0 +# vmax = 1 # e.g., vmax = 1.0 + +################################################################# +# AUXILIARY FUNCTIONS +def get_spectrum(x, signal, title='Spectrum Analysis'): + + # Compute the Fourier Transform + spectrum = np.fft.fft(signal) + + # Calculate the amplitude and phase + amplitude = np.abs(spectrum) + phase = np.angle(spectrum) + + # Frequency values corresponding to the FFT components + frequencies = np.fft.fftfreq(len(x), d=(x[1] - x[0])) + + # cut negative freq + positive_freq_indices = frequencies >= 0 + frequencies = frequencies[positive_freq_indices] + amplitude = amplitude[positive_freq_indices] + phase = phase[positive_freq_indices] + + # Sort frequencies and corresponding arrays + sorted_indices = np.argsort(frequencies) + frequencies = frequencies[sorted_indices] + amplitude = amplitude[sorted_indices] + phase = phase[sorted_indices] + + # keep the lower 2/3 of the frequencies and drop the mean (k=0) mode + amplitude = amplitude[1:int(np.floor(len(frequencies)*2/3))] + phase = phase[1:int(np.floor(len(frequencies)*2/3))] + frequencies = frequencies[1:int(np.floor(len(frequencies)*2/3))] + + wavenum = frequencies*2*np.pi + return [wavenum,amplitude,phase] + +################################################################# + +# font = {'family' : 'normal', +# 'weight' : 'normal', +# 'size' : 6} + +plt.rcParams.update({'font.size': 5}) + +# params_txt = {'axes.labelsize': 5,'axes.titlesize':5, 'text.fontsize': 5, 'legend.fontsize': 5, 'xtick.labelsize': 5, 'ytick.labelsize': 5} +# plt.rcParams.update(params_txt) + +# solve conflicts: if fluctuations are computed, the mean is needed too +if fluct == 1: + timeaverage = 1 + +if timeaverage == 1: + meansuff = '_MEAN' +else: + meansuff = '' + +done = 0 +# GENERATED ON: +from datetime import datetime + +# Print current date and time briefly +print("Current date and time:", datetime.now()) + +glob_profiles = [] + +FLUCARRAYS_time = [] + +x = np.linspace(0, Lx, nx) +y = np.linspace(0, Ly, ny) +z = np.linspace(0, Lz, nz) + +# Dimensions of the reshaped arrays (nvec): the raw files are written with x +# fastest (Fortran order), so a C-ordered read sees them as (nz, ny, nx) + +nvec = (nx, ny, nz) + +# nvec = (512, 513, 512) # Update with the actual dimensions +# nvec = (256, 257, 256) +# # nvec = (128, 129, 128) +# nvec = (0, 0, 0) + +nx = nvec[0] +ny = nvec[1] +nz = nvec[2] + +for n_step in ts_vec: + file_names = [] + + for fld in fields: + file_names.append(fld + '_{:08d}.dat') + + # Initialize an empty list to store the data arrays + data_arrays = [] + + # initialize arrays with sums if want to calculate time averages + if timeaverage == 1: + flucarray = [] + if n_step == ts_vec[0]: + sumarray = [] + + # Read the data from each file and reshape + id_fnames = -1 + for fld in fields: + file_name = f"{fld}_{n_step:08d}.dat" + file_path = foldername + file_name + id_fnames = id_fnames+1 + + # Check if file exists + if not os.path.exists(file_path): + print(f"Warning: File {file_path} not found, skipping...") + continue + + with open(file_path, 'rb') as file: + total_elements = np.prod(nvec) + data = np.memmap(file, dtype=np.float64, mode='r', shape=(total_elements,)) + data = data.reshape(np.flip(nvec))*1.0 + + data = data.transpose((2, 1, 0)) # Swap first and third axes to get the [x, y, z] convention + + # Validate data + print(f"Loaded {file_name}: shape={data.shape}, min={data.min():.6f}, max={data.max():.6f}, mean={data.mean():.6f}") +
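+ # The solver writes these raw fields with MPI-IO in Fortran order (x + # fastest, see readwrite.f90), so the flip/transpose above is equivalent + # to a direct Fortran-order read. Minimal sketch of the alternative + # (data_f is an illustrative name; assumes the nx, ny, nz set above): + # data_f = np.fromfile(file_path, dtype=np.float64).reshape((nx, ny, nz), order='F') + # assert np.allclose(data_f, data)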
+ dersuff = '' + if derivative_vec[0] != 0: + dersuff = '_' + for ider in derivative_vec: + derivative_x, derivative_z, derivative_y = np.gradient(data, x, z, y) + if ider == 1: + data = derivative_x + dersuff = dersuff+'x' + elif ider == 2: + data = derivative_y + dersuff = dersuff+'y' + elif ider == 3: + data = derivative_z + dersuff = dersuff+'z' + + # define axes: + if slice_dir == 1: + hor_name = 'y' + ver_name = 'z' + hor = y + nhor = ny + ver = z + nver = nz + elif slice_dir == 2: + hor_name = 'x' + ver_name = 'z' + hor = x + nhor = nx + ver = z + nver = nz + elif slice_dir == 3: + hor_name = 'x' + ver_name = 'y' + hor = x + nhor = nx + ver = y + nver = ny + + if slice_idx == -1: + type_name = 'average' + if slice_dir == 1: + mean_array = np.mean(data, axis=0) + elif slice_dir == 2: + mean_array = np.mean(data, axis=1) + elif slice_dir == 3: + mean_array = np.mean(data, axis=2) + + else: + type_name = 'slice' + if slice_dir == 1: + mean_array = data[slice_idx-1,:,:] + elif slice_dir == 2: + mean_array = data[:,slice_idx-1,:] + elif slice_dir == 3: + mean_array = data[:,:,slice_idx-1] + + data_arrays.append(mean_array) + + if timeaverage == 1: + if fluct == 1: + flucarray.append(mean_array) + if n_step == ts_vec[0]: + sumarray.append(mean_array) + else: + sumarray[id_fnames] = sumarray[id_fnames]+mean_array + + if n_step == ts_vec[-1]: + data_arrays[id_fnames] = sumarray[id_fnames]/len(ts_vec) + + FLUCARRAYS_time.append(flucarray) + + # Plot each array as a heat map with coordinates + + if timeaverage == 0 or n_step == ts_vec[-1]: + if showmaps_flag == 1: + for i, array in enumerate(data_arrays): + if array.ndim != 2: + print(f"Invalid shape for {file_name}") + continue + + N = 500j + extent = (hor.min(),hor.max(),ver.min(),ver.max()) + + HOR,VER = np.meshgrid(hor,ver) + hors,vers = np.mgrid[extent[0]:extent[1]:N, extent[2]:extent[3]:N] + + plt.figure(figsize=(figsize_val, figsize_val*0.12+figsize_val)) + + ax = plt.gca() + # im = ax.imshow(np.flip(resampled.T, axis = 0), cmap='jet', origin='lower', extent=extent, aspect='auto')#extent=[y.min(), y.max(), z.min(), z.max()]) + # im = ax.imshow(resampled.T, cmap='jet', origin='lower', extent=extent, aspect='auto')#extent=[y.min(), y.max(), z.min(), z.max()]) + # Use pcolormesh for non-uniform grids + X, Y = np.meshgrid(hor,ver) + + im = ax.pcolormesh(X, Y, array.T, cmap='jet', shading='gouraud', vmin=vmin, vmax=vmax) # smoother shading + + ax.set_aspect('equal') # <-- Add this line to set axis equal + + plt.xlabel(hor_name, fontsize=fontsize_val) + plt.ylabel(ver_name, fontsize=fontsize_val) + plt.xticks(fontsize=fontsize_val, rotation=0) + plt.yticks(fontsize=fontsize_val, rotation=0) + + # create an axes on the right side of ax. The width of cax will be 5% + # of ax and the padding between cax and ax will be fixed at 0.05 inch. 
+ cax = plt.axes([0.15, 0.95, 0.7, 0.03]) + plt.colorbar(im, cax=cax,orientation = "horizontal") + cax.xaxis.set_ticks_position('top') + plt.xticks(fontsize=fontsize_val, rotation=0) + plt.title(file_name+dersuff,fontsize=fontsize_val) + + plt.show() + + # save arrays and figure in the case of temporal average + if timeaverage == 1: + if savefig_flag == 1: + plt.savefig('./'+fields[i]+dersuff+'_'+str(ts_vec[0])+'_'+str(ts_vec[-1])+'.png', dpi=800) + + # Save the arrays to an HDF5 file using h5py + if savedata_flag == 1: + with h5py.File('./'+fields[i]+dersuff+'_'+str(ts_vec[0])+'_'+str(ts_vec[-1])+'timeav.h5', 'w') as hf: + hf.create_dataset('data', data=array) + hf.create_dataset('x', data=hor) + hf.create_dataset('y', data=ver) + + + # # # # # PLOT ALL LINES IN THE SAME FIGURE + # # Plot the vertical profiles in the opposite direction with different colors + # plt.figure(figsize=(8, 6)) + + # for i, array in enumerate(data_arrays): + # if array.ndim != 2: + # print(f"Invalid shape for {file_name}") + # continue + + # # Compute the vertical profile in the opposite direction + # vertical_profile = np.mean(array, axis=0)[::-1] + + # # Plot the vertical profile with a different color for each array + # plt.plot(vertical_profile, ver, label=file_name, alpha=0.7) + + # plt.title(fields[i]+'Profiles',fontsize=fontsize_val) + # plt.xlabel(fields[i] +'Mean',fontsize = fontsize_val) + # plt.ylabel(ver_name,fontsize = fontsize_val) + # plt.legend(loc ="best",fontsize = fontsize_val) + # plt.xticks(fontsize=fontsize_val, rotation=0) + # plt.yticks(fontsize=fontsize_val, rotation=0) + # plt.show() + + # Plot the vertical profiles in the opposite direction with different colors + + vertical_profile = [] + + for i, array in enumerate(data_arrays): + if array.ndim != 2: + print(f"Invalid shape for {file_name}") + continue + + # Compute the vertical profile in the opposite direction + if meanprof_slice == -1: + vertical_profile.append(np.mean(array, axis=0)) + horproflabelaux = ' Mean' + else: + vertical_profile.append(np.transpose(array[meanprof_slice-1,:])) + horproflabelaux = ' ' + + + # plt.figure(figsize=(8, 6)) + + # # Plot the vertical profile with a different color for each array + # plt.plot(vertical_profile[i], ver, label=file_name, alpha=0.7) + + # plt.title(fields[i]+' Profiles',fontsize=fontsize_val) + # plt.xlabel(fields[i] +' Mean',fontsize = fontsize_val) + # plt.ylabel(ver_name,fontsize = fontsize_val) + # plt.legend(loc ="best",fontsize = fontsize_val) + # plt.xticks(fontsize=fontsize_val, rotation=0) + # plt.yticks(fontsize=fontsize_val, rotation=0) + # plt.show() + + glob_profiles.append(vertical_profile) + +prf_plt = [] + +if fluct == 1: + for i, array in enumerate(data_arrays): + mean = data_arrays[id_fnames] + rmsfluc = np.zeros_like(mean) + for j in range(len(ts_vec)): + flucs = FLUCARRAYS_time[j] + singfield = flucs[i] + rmsfluc = rmsfluc+np.square(singfield-mean) + rmsfluc = np.sqrt(rmsfluc/len(ts_vec)) + + # PLOT RMS FIELD + N = 500j + extent = (hor.min(),hor.max(),ver.min(),ver.max()) + + HOR,VER = np.meshgrid(hor, ver) + hors,vers = np.mgrid[extent[0]:extent[1]:N, extent[2]:extent[3]:N] + aaa = np.ones((len(HOR.flatten()),2)) + aaa[:,0] = HOR.flatten() + aaa[:,1] = VER.flatten() + + resampled = griddata( aaa, rmsfluc.T.flatten(), (hors, vers), method='linear') + + if done == 0: + done = 1 + if AR_plot == -1: + AR_plot = 1/(hor.max()-hor.min())*(ver.max()-ver.min()) + AR_plt_flag = 0 + else: + AR_plt_flag = 1 + + plt.figure(figsize=(figsize_val, 
figsize_val*0.12+figsize_val*AR_plot)) + + ax = plt.gca() + if AR_plt_flag == 1: + # im = ax.imshow(np.flip(resampled.T, axis = 0), cmap='jet', origin='lower', extent=extent, aspect='auto')#extent=[y.min(), y.max(), z.min(), z.max()]) + im = ax.imshow(resampled.T, cmap='jet', origin='lower', extent=extent, aspect='auto')#extent=[y.min(), y.max(), z.min(), z.max()]) + else: + # im = ax.imshow(np.flip(resampled.T, axis = 0), cmap='jet', origin='lower', extent=extent, aspect=1)#extent=[y.min(), y.max(), z.min(), z.max()]) + im = ax.imshow(resampled.T, cmap='jet', origin='lower', extent=extent, aspect=1)#extent=[y.min(), y.max(), z.min(), z.max()]) + + # plt.imshow(np.flip(resampled.T, axis = 0), cmap='jet', origin='lower', extent=extent)#extent=[y.min(), y.max(), z.min(), z.max()]) + # plt.colorbar(orientation = "horizontal") + + # plt.title(file_name,fontsize=fontsize_val) + plt.xlabel(hor_name,fontsize=fontsize_val) + plt.ylabel(ver_name,fontsize=fontsize_val) + plt.xticks(fontsize=fontsize_val, rotation=0) + plt.yticks(fontsize=fontsize_val, rotation=0) + + # create an axes on the right side of ax. The width of cax will be 5% + # of ax and the padding between cax and ax will be fixed at 0.05 inch. + divider = make_axes_locatable(ax) + cax = divider.append_axes("top", size="5%", pad=0.05) + plt.colorbar(im, cax=cax,orientation = "horizontal") + cax.xaxis.set_ticks_position('top') + plt.xticks(fontsize=fontsize_val, rotation=0) + plt.title(file_name+dersuff+'_RMS',fontsize=fontsize_val) + + plt.show() + + # SAVE FLUCTUATION FIELDS + if savedata_flag == 1: + with h5py.File('./'+fields[i]+dersuff+'_'+str(ts_vec[0])+'_'+str(ts_vec[-1])+'time_rms.h5', 'w') as hf: + hf.create_dataset('data', data=rmsfluc) + hf.create_dataset('x', data=hor) + hf.create_dataset('y', data=ver) + +numfields = len(glob_profiles[0]) + +for i in range(numfields): + + plt.figure(figsize=(figsize_val, 9)) + + for j in range(len(ts_vec)): + + profile_sets = glob_profiles[j] + single_profile = profile_sets[i] + + # plt.figure(figsize=(8, 6)) + + # Plot the vertical profile with a different color for each array + plt.plot(single_profile, ver, label=fields[i]+'_'+str(ts_vec[j]), alpha=0.7) + + + if fld == 'u' and slice_dir != 3: + zmin = np.min(ver) + zmax = np.max(ver) + zc = 0.5 * (zmin + zmax) + zlen = zmax - zmin + + # Parabolic profile: u(z) = umax * (1 - ((z - zc)/H)^2), with H = zlen/2 + H = zlen / 2.0 + umax = 1.0 / (2.0 * 1.0) # assuming mu = rho = 1 → umax = 1/(2μ) + zvals = np.linspace(zmin, zmax, 256) + u_analytical = umax * (1.0 - ((zvals - zc)/H)**2) + + plt.plot(u_analytical, zvals, 'r--', linewidth=2, label='Poiseuille (analytical)') + + plt.title(fields[i]+dersuff+' Profiles',fontsize=fontsize_val) + plt.xlabel(fields[i]+dersuff + horproflabelaux,fontsize = fontsize_val) + plt.ylabel(ver_name,fontsize = fontsize_val) + # plt.legend(loc ="best",fontsize = fontsize_val) + plt.legend(bbox_to_anchor=(1.00, 1.02), loc="upper left",fontsize = fontsize_val) + plt.subplots_adjust(right=0.70) + plt.xticks(fontsize=fontsize_val, rotation=0) + plt.yticks(fontsize=fontsize_val, rotation=0) + plt.show() + +for i in range(numfields): + + plt.figure(figsize=(figsize_val, 8)) + + for j in range(len(ts_vec)): + + profile_sets = glob_profiles[j] + single_profile = profile_sets[i] + + # plt.figure(figsize=(8, 6)) + + # Plot the vertical profile with a different color for each array + [freq,amp,phase] = get_spectrum(ver,single_profile) + + # Plot amplitude on log scale + plt.subplot(2, 1, 1) + plt.plot(freq, 
amp,label=fields[i]+'_'+str(ts_vec[j]), alpha=0.7) + plt.xscale('log') + if np.any(amp > 0): # Only use log scale if there are positive values + plt.yscale('log') + plt.title('Amplitude') + plt.xlabel('k_'+ver_name) + plt.ylabel('Amplitude') + + # Plot phase (log-scale wavenumber axis) + plt.subplot(2, 1, 2) + plt.plot(freq, phase,label=fields[i]+'_'+str(ts_vec[j]), alpha=0.7) + plt.xscale('log') + plt.title('Phase') + plt.xlabel('k_'+ver_name) + plt.ylabel('Phase (radians)') + + # Adjust layout and show the plot + plt.subplot(2, 1, 1) + plt.legend(bbox_to_anchor=(1.00, 1.02), loc="upper left",fontsize = fontsize_val) + plt.tight_layout() + plt.show() diff --git a/channel/plot_profiles.py b/channel/plot_profiles.py new file mode 100644 index 0000000..086c742 --- /dev/null +++ b/channel/plot_profiles.py @@ -0,0 +1,163 @@ +# # %% +# %%html +# + +# %% +# %matplotlib widget +# %matplotlib inline + +import ipympl +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.style as style +# from ing_theme_matplotlib import mpl_style +# from qbstyles import mpl_style + +import os +import glob + +from scipy.interpolate import griddata + + +from matplotlib.widgets import Cursor + +import h5py + +plt.style.use("dark_background") +# mpl_style(dark=True) + + +################################################################# +foldername = './output/' + +# PARAMETERS: + +# fields to plot +fields = ['theta'] + +# number of points in each direction +# Grid parameters (user-defined) +nx = 256 # number of points in x +ny = 128 # number of points in y +nz = 200 # number of points in z + +# active values (override the defaults above) +nx = 2 # number of points in x +ny = 2 # number of points in y +nz = 1000 # number of points in z + +Lx = 6.0 # length of domain in x +Ly = 3.0 # length of domain in y +Lz = 2.0 # length of domain in z + +# compute the derivative of the fields (show them instead of the normal fields) +# 0: no derivative +# 1: x derivative +# 2: y derivative +# 3: z derivative +# list more flags to compute consecutive derivatives (first-order FD) +derivative_vec = [0] + +# normal direction of the 2D slice: +# 1: x-direction +# 2: y-direction +# 3: z-direction +slice_dir = 2 + +# index to take the slice (from 1 to nx_i, choose -1 for computing the average) +slice_idx = 0 + +# slice_idx = 222 + +# time_steps to plot +ts_vec = range(0,230000,10000) + +# ts_vec = [10000] + +# set 1 to compute time averaged quantities, 0 otherwise +timeaverage = 0 + +# set 1 to compute fluctuating components, 0 otherwise (expensive) +fluct = 0 + +# value for the fontsize: +fontsize_val = 10 + +x = np.linspace(0, Lx, nx) +y = np.linspace(0, Ly, ny) +z = np.linspace(0, Lz, nz) + +# Dimensions of the reshaped arrays (nvec): the raw files are written with x +# fastest (Fortran order), so a C-ordered read sees them as (nz, ny, nx) + +nvec = (nx, ny, nz) + +# nvec = (512, 513, 512) # Update with the actual dimensions +# nvec = (256, 257, 256) +# # nvec = (128, 129, 128) +# nvec = (0, 0, 0) + +nx = nvec[0] +ny = nvec[1] +nz = nvec[2] + +id_fnames = -1 + +for fld in fields: + + id_fnames = id_fnames+1 + + + plt.figure(figsize=(10, 9)) + + for n_step in ts_vec: + file_names = [] + + file_names.append(fld + '_{:08d}.dat') + + + # Read the data from each file and reshape + + file_name = f"{fld}_{n_step:08d}.dat" + file_path = foldername + file_name + + # Check if file exists + if not os.path.exists(file_path): + print(f"Warning: File {file_path} not found, skipping...") + continue + + with open(file_path, 'rb') as file: + total_elements = np.prod(nvec) + data = np.memmap(file, dtype=np.float64, mode='r', shape=(total_elements,)) +
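+ # After the flip-reshape below, data is (nz, ny, nx), so averaging over + # axes (1, 2) gives the wall-normal profile. Equivalent sketch with a + # direct Fortran-order read (data_f/prof_f are illustrative names, same + # layout assumption as in plot_fields.py): + # data_f = np.fromfile(file_path, dtype=np.float64).reshape((nx, ny, nz), order='F') + # prof_f = data_f.mean(axis=(0, 1)) # should match 'prof' computed below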
+ data = data.reshape(np.flip(nvec))*1.0 + + # Validate data + print(f"Loaded {file_name}: shape={data.shape}, min={data.min():.6f}, max={data.max():.6f}, mean={data.mean():.6f}") + + prof = np.mean(data, axis=(1, 2)) # average over y and x: profile along z + + # prof = np.flip(np.mean(data, axis=(0, 2))) + + plt.plot(prof, z, label=f'{fld}_{n_step}', alpha=0.7) + + plt.title(fld+' Profiles',fontsize=fontsize_val) + plt.xlabel(fld,fontsize = fontsize_val) + plt.ylabel("z",fontsize = fontsize_val) + # plt.legend(loc ="best",fontsize = fontsize_val) + plt.legend(bbox_to_anchor=(1.00, 1.02), loc="upper left",fontsize = fontsize_val) + plt.subplots_adjust(right=0.70) + plt.xticks(fontsize=fontsize_val, rotation=0) + plt.yticks(fontsize=fontsize_val, rotation=0) + plt.show() diff --git a/channel/post.py b/channel/post.py new file mode 100644 index 0000000..4bcd91d --- /dev/null +++ b/channel/post.py @@ -0,0 +1,35 @@ +# %% +import numpy as np +import matplotlib.pyplot as plt + +nx, ny, nz = 256, 128, 200 + +data = np.fromfile('output/u_00005000.dat', dtype=np.float64) # float64 matches the solver output (use float32 for single-precision files) +print("Data size:", data.size) +data = data.reshape((nz, ny, nx)) + +slice_index = ny // 2 # middle of y +slice_data = data[ :,slice_index, :] # x-z slice at fixed y + +print("Max:", np.max(data)) +print("Min:", np.min(data)) +plt.figure(figsize=(14,6)) +plt.imshow(slice_data, cmap='jet', origin='lower', aspect=3.14) +plt.colorbar(label='Velocity') +plt.xlabel('x') +plt.ylabel('z') +plt.axis('scaled') +plt.show() +#1D plot +#line_data = data[1, 1, : ] # middle column +#plt.subplot(1, 2, 2) +#plt.plot(line_data, np.arange(ny), color='black') +#plt.xlabel('Variable') +#plt.ylabel('y') +#plt.title('1D Profile along y (x = nx/2)') +#plt.grid(True) + +#plt.tight_layout() +#plt.show() + +# %% diff --git a/channel/readinput.f90 b/channel/readinput.f90 new file mode 100644 index 0000000..b792e99 --- /dev/null +++ b/channel/readinput.f90 @@ -0,0 +1,155 @@ + +!########################################################################## +!########################################################################### +subroutine readinput +use velocity +use phase +use temperature +use param +use mpivar +implicit none +integer :: i,j,k +double precision :: zk + +open(unit=55,file='input.inp',form='formatted',status='old') +!Time step parameters +read(55,*) restart +read(55,*) tstart +read(55,*) tfin +read(55,*) dump +! Domain size +read(55,*) lx +read(55,*) ly +read(55,*) lz +read(55,*) csi +!Flow parameters +read(55,*) inflow +read(55,*) inphi +read(55,*) intheta +read(55,*) dt +read(55,*) mu +read(55,*) rho +! forcing parameters +read(55,*) gradpx +read(55,*) gradpy +! phase-field parameters +read(55,*) radius +read(55,*) sigma +read(55,*) epsr +! temperature parameters +read(55,*) kappa +read(55,*) alphag +close(55) + +! compute pre-defined constants +twopi=8.0_8*atan(1.0_8) +pi=twopi/2.d0 +dx = lx/nx +dy = ly/ny +dxi = 1.d0/dx +dyi = 1.d0/dy +ddxi = 1.d0/dx/dx +ddyi = 1.d0/dy/dy +rhoi=1.d0/rho +eps=epsr*dx +epsi=1.d0/eps +enum=1.e-16 +!write(*,*) "Check on stability", dt*mu*dzi*dzi + +if (rank .eq. 
0) then + ! enable/disable for debugging: check the input parameters + write(*,*) "----------------------------------------------" + write(*,*) "███ ███ ██ ██ ██ ████████ ██████ ██████" + write(*,*) "████ ████ ██ ██ ██ ██ ██ ██" + write(*,*) "██ ████ ██ ███████ ██ ██ █████ ███████" + write(*,*) "██ ██ ██ ██ ██ ██ ██ ██ ██ ██" + write(*,*) "██ ██ ██ ██ ██ ██ ██████ ██████" + write(*,*) "----------------------------------------------" + write(*,*) "-------------Channel flow setup---------------" + write(*,*) 'Grid:', nx, 'x', ny, 'x', nz + write(*,*) "Restart ", restart + write(*,*) "Tstart ", tstart + write(*,*) "Tfin ", tfin + write(*,*) "Dump ", dump + write(*,*) "Inflow ", inflow + write(*,*) "Deltat ", dt + write(*,*) "Mu ", mu + write(*,*) "Rho ", rho + write(*,*) "Gradpx ", gradpx + write(*,*) "Gradpy ", gradpy + write(*,*) "Radius ", radius + write(*,*) "Sigma ", sigma + write(*,*) "Eps ", eps + write(*,*) "Kappa ", kappa + write(*,*) "Alphag ", alphag + write(*,*) "Lx ", lx + write(*,*) "Ly ", ly + write(*,*) "Lz ", lz + write(*,*) "Z-stretch ", csi +! write(*,*) "dx", dx +! write(*,*) "dxi", dxi +! write(*,*) "ddxi", ddxi +! write(*,*) "dy", dy +! write(*,*) "dyi", dyi +! write(*,*) "ddyi", ddyi +! write(*,*) "dz", dz +! write(*,*) "dzi", dzi +! write(*,*) "ddzi", ddzi +endif + + + +! define wavenumbers and grid points axis +! define grid (then move in readinput) +allocate(x(nx),y(ny),z(0:nz+1),dzci(nz),dzi(nz+1),kx(nx),ky(ny)) +! location of the pressure nodes (cell centers) +x(1)=dx/2 +do i = 2, nx + x(i) = x(i-1) + dx +enddo +y(1)=dy/2 +do j = 2, ny + y(j) = y(j-1) + dy +enddo +! stretched grid along z; the z axis also includes the two ghost nodes located roughly dz/2 below and above the two walls +do k = 1, nz + zk=(dble(k)-0.5d0)/dble(nz) + z(k) = 0.5d0*dble(lz)*(1.d0+tanh(csi*(zk-0.5d0))/tanh(0.5d0*csi)) +enddo +z(0)=-z(1) +z(nz+1)= lz+(lz-z(nz)) +! compute inverse of dz (between cell faces) +dzci(1) = 2.d0/(z(1)+ z(2)) +dzci(nz) = 1.d0/(lz-0.5d0*(z(nz)+z(nz-1))) +do k = 2, nz-1 + dzci(k) = 2.d0/(z(k+1)-z(k-1)) +enddo +! compute inverse of dz (between nodes) +dzi(1)=0.5d0/z(1) +!dzi(nz+1)=0.5d0/(lz-z(nz)) +do k=1, nz+1 + dzi(k) = 1.d0/(z(k)-z(k-1)) +enddo +!write(*,*) "dzi", dzi +! wavenumber +do i = 1, nx/2 + kx(i) = (i-1)*(twopi/lx) +enddo +do i = nx/2+1, nx + kx(i) = (i-1-nx)*(twopi/lx) +enddo +do j = 1, ny/2 + ky(j) = (j-1)*(twopi/ly) +enddo +do j = ny/2+1, ny + ky(j) = (j-1-ny)*(twopi/ly) +enddo +! allocate kx_d and ky_d on the device +allocate(kx_d, source=kx) +allocate(ky_d, source=ky) + + + +end subroutine + + diff --git a/channel/readwrite.f90 b/channel/readwrite.f90 new file mode 100644 index 0000000..3b54006 --- /dev/null +++ b/channel/readwrite.f90 @@ -0,0 +1,356 @@ +subroutine writefield(t,fieldn) +! Output field, file is written in the /output folder + +use velocity +use phase +use temperature +use mpi +use mpivar +use param +use cudecompvar + +implicit none + +integer :: g_size(3),p_size(3),fstart(3) +integer :: t,fieldn +character(len=40) :: namefile +integer(mpi_offset_kind) :: offset=0 +integer :: f_handle ! file handle +integer :: ftype +double precision, allocatable :: out(:,:,:) + +! fieldn=1 means u +! fieldn=2 means v +! fieldn=3 means w +! fieldn=4 means p +! fieldn=5 means phi +! fieldn=6 means theta + +! define basic quantities to be used later (global and pencil size) +g_size=[nx, ny, nz] ! global size +p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x +fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] +! 
for debug +!write(*,*) "g_size", g_size +!write(*,*) "p_size", p_size +!write(*,*) "fstart", fstart +allocate(out(p_size(1),p_size(2),p_size(3))) !<- halo removed + +!write(*,*) "in readwrite" + +if (fieldn .eq. 1) then + out=u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo) + write(namefile,'(a,i8.8,a)') './output/u_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) +endif + +if (fieldn .eq. 2) then + out=v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo) + write(namefile,'(a,i8.8,a)') './output/v_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) +endif + +if (fieldn .eq. 3) then + out=w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo) + write(namefile,'(a,i8.8,a)') './output/w_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) +endif + +if (fieldn .eq. 4) then + out=p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo) + write(namefile,'(a,i8.8,a)') './output/p_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) +endif + +if (fieldn .eq. 
5) then + out=phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo) + write(namefile,'(a,i8.8,a)') './output/phi_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) +endif + +if (fieldn .eq. 6) then + out=theta(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo) + write(namefile,'(a,i8.8,a)') './output/theta_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) +endif + +deallocate(out) + +end subroutine + + + + + + + + + + +subroutine readfield(fieldn) +! Input field, file is written in the /input folder + +use velocity +use phase +use temperature +use mpi +use mpivar +use param +use cudecompvar + +implicit none + +integer :: g_size(3),p_size(3),fstart(3) +integer :: fieldn +character(len=40) :: namefile +integer(mpi_offset_kind) :: offset=0 +integer :: f_handle ! file handle +integer :: ftype +double precision, allocatable :: in(:,:,:) + +! fieldn=1 means u +! fieldn=2 means v +! fieldn=3 means w +! fieldn=4 means p +! fieldn=5 means phi + +! define basic quantities to be used later (gloabl and pencil size) +g_size=[nx, ny, nz] ! global size +p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x +fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and index start from 0 (not 1) +! for debug +!write(*,*) "g_size", g_size +!write(*,*) "p_size", p_size +!write(*,*) "fstart", fstart +allocate(in(p_size(1),p_size(2),p_size(3))) !<- no halos read + +!write(*,*) "in readwrite" + +if (fieldn .eq. 1) then + namefile='./input/u.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 
2) then + namefile='./input/v.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 3) then + namefile='./input/w.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 4) then + namefile='./input/p.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 5) then + namefile='./input/phi.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 
6) then + namefile='./input/theta.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + theta(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +deallocate(in) + +end subroutine + + + + + + + + + + +subroutine readfield_restart(t,fieldn) +! Used in case of restart, file is read from the multi/output folder (iteration tstart must be present!) + +use velocity +use phase +use temperature +use mpi +use mpivar +use param +use cudecompvar + +implicit none + +integer :: g_size(3),p_size(3),fstart(3) +integer :: t,fieldn +character(len=40) :: namefile +integer(mpi_offset_kind) :: offset=0 +integer :: f_handle ! file handle +integer :: ftype +double precision, allocatable :: in(:,:,:) + +! fieldn=1 means u +! fieldn=2 means v +! fieldn=3 means w +! fieldn=4 means p +! fieldn=5 means phi + +! define basic quantities to be used later (gloabl and pencil size) +g_size=[nx, ny, nz] ! global size +p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x +fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and index start from 0 (not 1) +! for debug +!write(*,*) "g_size", g_size +!write(*,*) "p_size", p_size +!write(*,*) "fstart", fstart +allocate(in(p_size(1),p_size(2),p_size(3))) !<- no halos read + +!write(*,*) "in readwrite" + +if (fieldn .eq. 1) then + write(namefile,'(a,i8.8,a)') './output/u_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 
2) then + write(namefile,'(a,i8.8,a)') './output/v_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 3) then + write(namefile,'(a,i8.8,a)') './output/w_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 4) then + write(namefile,'(a,i8.8,a)') './output/p_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 5) then + write(namefile,'(a,i8.8,a)') './output/phi_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 
6) then + write(namefile,'(a,i8.8,a)') './output/theta_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + theta(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo); theta has halos, in has none +endif + +deallocate(in) + +end subroutine \ No newline at end of file diff --git a/channel/restart_local.sh b/channel/restart_local.sh new file mode 100755 index 0000000..f761baf --- /dev/null +++ b/channel/restart_local.sh @@ -0,0 +1,10 @@ +NVARCH=Linux_x86_64; export NVARCH +NVCOMPILERS=/opt/nvidia/hpc_sdk; export NVCOMPILERS +MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/compilers/man; export MANPATH +PATH=$NVCOMPILERS/$NVARCH/25.7/compilers/bin:$PATH; export PATH +export PATH=$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/bin:$PATH +export MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/man +export LD_LIBRARY_PATH=/home/milton/MHIT36/cuDecomp/build/lib + + +mpirun -np 2 ./mhit36 \ No newline at end of file diff --git a/channel/tdma.f90 b/channel/tdma.f90 new file mode 100644 index 0000000..2cde690 --- /dev/null +++ b/channel/tdma.f90 @@ -0,0 +1,312 @@ +! Subroutines still under development, not included in the makefile at the moment (7/8/25) +subroutine tdmau + use velocity + implicit none + integer :: il, jl + ! TDMA for the u velocity: solve from 1 to nz (u is collocated at the cell centres; rows 1 and nz carry the wall BCs) + ! Load all variables via modules + ! move pencil information in a module + ! Set up tridiagonal system for each i and j + ! The system is: (A_k) * u(k-1,j,i) + (B_k) * u(k,j,i) + (C_k) * u(k+1,j,i) = rhs(k,j,i) + ! Dirichlet BC: u = 0 at the two walls + ! a, b, c, d are complex; they could be declared real as an optimization, but this may introduce additional data movement; better to always use the same a, b, c, d? + + call c_f_pointer(c_devloc(vel_d), vel3d, piZ%shape) + + !$acc parallel loop collapse(2) gang private(a, b, c, d, factor, sol) + do jl = 1, npy + do il = 1, npx + + !inner nodes + do k = 2, nz-1 + a(k) = 1.0d0*dzi*dzi + b(k) = -2.0d0*dzi*dzi + c(k) = 1.0d0*dzi*dzi + d(k) = u(k,il,jl) ! check if correct and consistent with CN + enddo + + ! Dirichlet BC at k=1 + a(1) = 0.0d0 + b(1) = 1.0d0 + c(1) = 0.0d0 + d(1) = 0.0d0 + + ! Dirichlet BC at k=nz + a(nz) = 0.0d0 + b(nz) = 1.0d0 + c(nz) = 0.0d0 + d(nz) = 0.0d0 + + ! Forward elimination (Thomas) + !$acc loop seq + do k = 2, nz + factor = a(k)/b(k-1) + b(k) = b(k) - factor*c(k-1) + d(k) = d(k) - factor*d(k-1) + end do + + ! Back substitution + sol(nz) = d(nz)/b(nz) + ! check on pivot like flutas? + !$acc loop seq + do k = nz-1, 1, -1 + sol(k) = (d(k) - c(k)*sol(k+1))/b(k) + end do + + ! Store solution in the array that does the backward FFT + do k=1,nz + vel3d(k,il,jl) = sol(k) + enddo + end do + end do +end subroutine + + + + + + + + + + + + +subroutine tdmav + use velocity + implicit none + integer :: il, jl + ! TDMA for the v velocity: solve from 1 to nz (v is collocated at the cell centres; rows 1 and nz carry the wall BCs) + ! 
Load all variables via modules + ! move pencil information in a module + ! Set up tridiagonal system for each i and j + ! The system is: (A_k) * v(k-1,j,i) + (B_k) * v(k,j,i) + (C_k) * v(k+1,j,i) = rhs(k,j,i) + ! Dirichlet BC: v = 0 at k=1 and k=nz + ! a, b, c, d are complex; they could be declared real as an optimization, but this may introduce additional data movement; better to always use the same a, b, c, d? + ! Same as u but just the RHS is different + + call c_f_pointer(c_devloc(vel_d), vel3d, piZ%shape) + + !$acc parallel loop collapse(2) gang private(a, b, c, d, factor, sol) + do jl = 1, npy + do il = 1, npx + + !inner nodes + do k = 2, nz-1 + a(k) = 1.0d0*dzi*dzi + b(k) = -2.0d0*dzi*dzi + c(k) = 1.0d0*dzi*dzi + d(k) = v(k,il,jl) ! check if correct and consistent with CN + enddo + + ! Dirichlet BC at k=1 + a(1) = 0.0d0 + b(1) = 1.0d0 + c(1) = 0.0d0 + d(1) = 0.0d0 + + ! Dirichlet BC at k=nz (row nz must be set: the solve runs from 1 to nz) + a(nz) = 0.0d0 + b(nz) = 1.0d0 + c(nz) = 0.0d0 + d(nz) = 0.0d0 + + ! Forward elimination (Thomas) + !$acc loop seq + do k = 2, nz + factor = a(k)/b(k-1) + b(k) = b(k) - factor*c(k-1) + d(k) = d(k) - factor*d(k-1) + end do + + ! Back substitution + sol(nz) = d(nz)/b(nz) + ! check on pivot like flutas? + !$acc loop seq + do k = nz-1, 1, -1 + sol(k) = (d(k) - c(k)*sol(k+1))/b(k) + end do + + ! Store solution in the array that does the backward FFT + do k=1,nz + vel3d(k,il,jl) = sol(k) + enddo + end do + end do +end subroutine + + + + + + + + + + + + + + + + +subroutine tdmaw + use velocity + implicit none + integer :: il, jl + ! TDMA for the w velocity: solve from 1 to nz+1, where 1 is the w collocation point at the bottom wall and nz+1 the one at the top wall + ! Load all variables via modules + ! move pencil information in a module + ! Set up tridiagonal system for each i and j + ! The system is: (A_k) * w(k-1,j,i) + (B_k) * w(k,j,i) + (C_k) * w(k+1,j,i) = rhs(k,j,i) + ! Dirichlet BC: w = 0 at k=1 and k=nz+1 + ! a, b, c, d are complex; they could be declared real as an optimization, but this may introduce additional data movement; better to always use the same a, b, c, d? + + call c_f_pointer(c_devloc(vel_d), vel3d, piZ%shape) + + !$acc parallel loop collapse(2) gang private(a, b, c, d, factor, sol) + do jl = 1, npy + do il = 1, npx + + !inner nodes + do k = 2, nz + a(k) = 1.0d0*dzi*dzi + b(k) = -2.0d0*dzi*dzi + c(k) = 1.0d0*dzi*dzi + d(k) = w(k,il,jl) ! check if correct and consistent with CN + enddo + + ! Dirichlet BC at k=1 + a(1) = 0.0d0 + b(1) = 1.0d0 + c(1) = 0.0d0 + d(1) = 0.0d0 + + ! Dirichlet BC at k=nz+1 + a(nz+1) = 0.0d0 + b(nz+1) = 1.0d0 + c(nz+1) = 0.0d0 + d(nz+1) = 0.0d0 + + ! Forward elimination (Thomas) + !$acc loop seq + do k = 2, nz+1 + factor = a(k)/b(k-1) + b(k) = b(k) - factor*c(k-1) + d(k) = d(k) - factor*d(k-1) + end do + + ! Back substitution + sol(nz+1) = d(nz+1)/b(nz+1) + ! check on pivot like flutas? + !$acc loop seq + do k = nz, 1, -1 + sol(k) = (d(k) - c(k)*sol(k+1))/b(k) + end do + + ! Store solution in the array that does the backward FFT + ! only up to nz, because I am not transposing the halo; I could eventually + do k=1,nz + vel3d(k,il,jl) = sol(k) + enddo + end do + end do +end subroutine + + + + + + + + +subroutine tdmap + ! TDMA for the pressure: solve from 0 to nz+1, where 0 is the lower ghost node and nz+1 the upper ghost node + ! When ready, remove it from the main code and just call this function + ! Load all variables via modules + ! move pencil information in a module + ! Set up tridiagonal system for each i and j + ! 
The system is: (A_k) * pc(k-1,ky,kx) + (B_k) * pc(k,ky,kx) + (C_k) * pc(k+1,ky,kx) = rhs(k,ky,kx) + ! Neumann BC: d/dz pc = 0 at the w collocation points + ! Fill diagonals and rhs for each wavenumber pair + ! 0 and nz+1 are the ghost nodes + ! a, b, c, d are complex (because of the FFT) + + use velocity + + implicit none + + offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1 + offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1 + offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1 + call c_f_pointer(c_devloc(psi_d), psi3d, piZ_d2z%shape) + + xoff = offsets(1) + yoff = offsets(2) + npx = np(1) + npy = np(2) + + !$acc parallel loop collapse(2) gang private(a, b, c, d, factor, sol) + do jl = 1, npy + do il = 1, npx + ! compute the global wavenumber indices ig and jg + jg = yoff + jl + ig = xoff + il + + do k = 1, nz + a(k) = 1.0d0*dzi*dzi + b(k) = -2.0d0*dzi*dzi - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg) + c(k) = 1.0d0*dzi*dzi + d(k) = psi3d(k,il,jl) + enddo + + ! Neumann BC at bottom + a(0) = 0.0d0 + b(0) = -1.0d0*dzi*dzi - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg) + c(0) = 1.0d0*dzi*dzi + d(0) = 0.0d0 + + ! ghost node elimination trick + ! Neumann BC at top + a(nz+1) = 1.0d0*dzi*dzi + b(nz+1) = -1.0d0*dzi*dzi - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg) + c(nz+1) = 0.0d0 + d(nz+1) = 0.0d0 + + ! Pin the pressure at one interior point for the singular (0,0) mode, avoiding messing with the BCs + if (ig == 1 .and. jg == 1) then + a(1) = 0.d0 + b(1) = 1.d0 + c(1) = 0.d0 + d(1) = 0.d0 + end if + + ! Forward elimination (Thomas) + !$acc loop seq + do k = 1, nz+1 + factor = a(k)/b(k-1) + b(k) = b(k) - factor*c(k-1) + d(k) = d(k) - factor*d(k-1) + end do + + ! Back substitution + sol(nz+1) = d(nz+1)/b(nz+1) + ! check on pivot like flutas? + !$acc loop seq + do k = nz, 0, -1 + sol(k) = (d(k) - c(k)*sol(k+1))/b(k) + end do + + ! Store solution in the array that does the backward FFT + do k=1,nz + psi3d(k,il,jl) = sol(k) + enddo + end do + end do + + +end subroutine \ No newline at end of file diff --git a/channel/testpush.sh b/channel/testpush.sh new file mode 100644 index 0000000..2e0e21a --- /dev/null +++ b/channel/testpush.sh @@ -0,0 +1,2 @@ +make clean +make diff --git a/cuDecomp b/cuDecomp index 7968153..907b18e 160000 --- a/cuDecomp +++ b/cuDecomp @@ -1 +1 @@ -Subproject commit 796815382081da97054602aed68baae88b658ee9 +Subproject commit 907b18e901321c1ca6e6142ae4c2a4cc3ff49030 diff --git a/hit/Makefile b/hit/Makefile new file mode 100644 index 0000000..0ee5198 --- /dev/null +++ b/hit/Makefile @@ -0,0 +1,33 @@ +# Compiler +FC = mpif90 +LD = $(FC) + +# Paths (Modify if necessary) +ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))) +$(info $$ROOT_DIR is [${ROOT_DIR}]) +PARENT_DIR := $(abspath $(ROOT_DIR)/..) 
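+# ROOT_DIR/PARENT_DIR are derived from this Makefile's own location, so the +# cuDecomp build below is found relative to the repository root without +# hard-coding absolute paths.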
+$(info $$PARENT_DIR is [${PARENT_DIR}]) +CUDECOMP_DIR = $(PARENT_DIR)/cuDecomp/build +CUDA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none +EXTRA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none/compat + +export $(CUDECOMP_DIR)/lib + +# Compiler and Linker Flags +FFLAGS = -fast -acc -gpu=mem:managed -Minfo=accel -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include/ +LIBS = -L$(CUDECOMP_DIR)/lib/ -L$(CUDA_DIR)/lib64/ -lcudecomp_fort -lcudecomp -cudalib=cufft -lnvToolsExt + +# Source and Object Files +OBJS = module.f90 main.f90 readinput.f90 readwrite.f90 +EXEC = mhit36 + +$(EXEC): $(OBJS) + $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@ + +clean: + $(RM) $(EXEC) + +.SUFFIXES: .o + +.f90.o: + $(FC) -c $(FFLAGS) $< diff --git a/hit/Makefile_leonardo b/hit/Makefile_leonardo new file mode 100755 index 0000000..061dbd9 --- /dev/null +++ b/hit/Makefile_leonardo @@ -0,0 +1,33 @@ +# Compiler +FC = mpif90 +LD = $(FC) + +# Paths (Modify if necessary) +ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))) +$(info $$ROOT_DIR is [${ROOT_DIR}]) +PARENT_DIR := $(abspath $(ROOT_DIR)/..) +$(info $$PARENT_DIR is [${PARENT_DIR}]) +CUDECOMP_DIR = $(PARENT_DIR)/cuDecomp/build +CUDA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none +EXTRA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none/compat + +export $(CUDECOMP_DIR)/lib + +# Compiler and Linker Flags +FFLAGS = -fast -acc -gpu=mem:managed -Minfo=accel -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include/ +LIBS = -L$(CUDECOMP_DIR)/lib/ -L$(CUDA_DIR)/lib64/ -lcudecomp_fort -lcudecomp -cudalib=cufft -lnvToolsExt + +# Source and Object Files +OBJS = module.f90 particles.f90 main.f90 readinput.f90 readwrite.f90 +EXEC = mhit36 + +$(EXEC): $(OBJS) + $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@ + +clean: + $(RM) $(EXEC) + +.SUFFIXES: .o + +.f90.o: + $(FC) -c $(FFLAGS) $< diff --git a/hit/Makefile_local b/hit/Makefile_local new file mode 100644 index 0000000..08156c7 --- /dev/null +++ b/hit/Makefile_local @@ -0,0 +1,28 @@ +# Compiler +FC = mpif90 +LD = $(FC) +RM = /bin/rm -f + +# Paths (Modify if necessary) +CUDECOMP_DIR = /home/milton/MHIT36/cuDecomp/build/ + +# Compiler and Linker Flags +FFLAGS = -fast -acc -gpu=mem:managed -Minfo=accel -Mfree -Mpreprocess -cpp -Minfo=accel -cuda -I$(CUDECOMP_DIR)/include +LIBS = -L$(CUDECOMP_DIR)/lib -L/usr/local/cuda/lib64 -lcudecomp_fort -lcudecomp -cudalib=cufft -lcuda -lnvToolsExt + +# Source and Object Files +OBJS = module.f90 particles.f90 main.f90 readinput.f90 readwrite.f90 +EXEC = mhit36 + +$(EXEC): $(OBJS) + $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@ + +clean: + $(RM) $(EXEC) + +.SUFFIXES: .o + +.f90.o: + $(FC) -c $(FFLAGS) $< + + diff --git a/hit/Makefile_mn5 b/hit/Makefile_mn5 new file mode 100644 index 0000000..fe0febe --- /dev/null +++ b/hit/Makefile_mn5 @@ -0,0 +1,33 @@ +# Compiler +FC = mpif90 +LD = $(FC) + +# Paths (Modify if necessary) +ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))) +$(info $$ROOT_DIR is [${ROOT_DIR}]) +PARENT_DIR := $(abspath $(ROOT_DIR)/..) 
+$(info $$PARENT_DIR is [${PARENT_DIR}]) +CUDECOMP_DIR = $(PARENT_DIR)/cuDecomp/build +CUDA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none +EXTRA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none/compat + +export $(CUDECOMP_DIR)/lib + +# Compiler and Linker Flags +FFLAGS = -fast -acc -gpu=managed -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include/ +LIBS = -L$(CUDECOMP_DIR)/lib/ -L$(CUDA_DIR)/lib64/ -lcudecomp_fort -lcudecomp -cudalib=cufft -lnvToolsExt + +# Source and Object Files +OBJS = module.f90 main.f90 readinput.f90 readwrite.f90 +EXEC = mhit36 + +$(EXEC): $(OBJS) + $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@ + +clean: + $(RM) $(EXEC) + +.SUFFIXES: .o + +.f90.o: + $(FC) -c $(FFLAGS) $< diff --git a/hit/binder.sh b/hit/binder.sh new file mode 100755 index 0000000..b449dda --- /dev/null +++ b/hit/binder.sh @@ -0,0 +1,12 @@ +#!/bin/bash +case $(( ${OMPI_COMM_WORLD_LOCAL_RANK} )) in +0) export UCX_NET_DEVICES=mlx5_0:1 ;; +1) export UCX_NET_DEVICES=mlx5_1:1 ;; +2) export UCX_NET_DEVICES=mlx5_2:1 ;; +3) export UCX_NET_DEVICES=mlx5_3:1 ;; +esac + +echo Launching on $UCX_NET_DEVICES + +$* + diff --git a/hit/go_leo.sh b/hit/go_leo.sh new file mode 100644 index 0000000..7545b12 --- /dev/null +++ b/hit/go_leo.sh @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --account="IscrB_EXCEED" +#SBATCH --job-name="cudec" +#SBATCH --time=00:05:00 +#SBATCH --nodes=1 ##adjust +#SBATCH --ntasks-per-node=4 +#SBATCH --gres=gpu:4 +#SBATCH --cpus-per-task=8 +#SBATCH --output=test.out +#SBATCH -p boost_usr_prod +#SBATCH --error=test.err +#SBATCH --qos=boost_qos_dbg + +#module load nvhpc/25.3 +#module load cuda/12.3 +#module load openmpi/4.1.6--nvhpc--24.3 +module load profile/candidate +module load nvhpc/25.3 +module load hpcx-mpi/2.19 +#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/aroccon0/MHIT36_cuDecomp/cuDecomp/build/lib:$LD_LIBRARY_PATH +#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/lenzenbe/RE95_256_cuDec/cuDecomp/build/lib:$LD_LIBRARY_PATH +CURRENT_DIR="$(pwd)" +ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib" +echo "Using directory: $ROOT_DIR" +export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH + +chmod 777 binder.sh +mpirun -np 4 --map-by node:PE=8 --rank-by core ./binder.sh ./mhit36 \ No newline at end of file diff --git a/hit/go_leo_bprod.sh b/hit/go_leo_bprod.sh new file mode 100644 index 0000000..15c7b1a --- /dev/null +++ b/hit/go_leo_bprod.sh @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --account="IscrB_SONORA" +#SBATCH --job-name="cudec" +#SBATCH --time=00:10:00 +#SBATCH --nodes=128 ##adjust +#SBATCH --ntasks-per-node=4 +#SBATCH --gres=gpu:4 +#SBATCH --cpus-per-task=8 ## override threads limitation +#SBATCH --output=test.out +#SBATCH --partition=boost_usr_prod +#SBATCH --qos=boost_qos_bprod +#SBATCH --error=test.err + +module load nvhpc/24.3 +module load cuda/12.3 +module load openmpi/4.1.6--nvhpc--24.3 +#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/aroccon0/MHIT36_cuDecomp/cuDecomp/build/lib:$LD_LIBRARY_$ +#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/lenzenbe/RE95_256_cuDec/cuDecomp/build/lib:$LD_LIBRARY_P$ +CURRENT_DIR="$(pwd)" +ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib" +echo "Using directory: $ROOT_DIR" +export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH + +#export OMP_NUM_THREADS=16 +export OMP_PROC_BIND=spread +export OMP_PLACES=cores + +mpirun -n 512 ./mhit36 diff --git a/hit/go_leo_prof.sh b/hit/go_leo_prof.sh new file mode 100755 index 0000000..ebcb52e --- /dev/null +++ b/hit/go_leo_prof.sh @@ -0,0 +1,51 @@ +#!/bin/bash 
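+# Profiling job for Leonardo: keep exactly one of the run lines below active +# (plain mpirun, an nsys variant, or an ncu variant); the rest stay commented +# as templates.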
+#SBATCH --account="tra25_openhack"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:10:00
+#SBATCH --nodes=1 ## adjust
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --output=test.out
+#SBATCH -p boost_usr_prod
+#SBATCH --qos=boost_qos_dbg
+#SBATCH --error=test.err
+#### needed if rank/NIC mapping is on
+#SBATCH --cpus-per-task=8
+
+
+#module load nvhpc/24.3
+#module load cuda/12.3
+#module load openmpi/4.1.6--nvhpc--24.3
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/aroccon0/MHIT36_cuDecomp/cuDecomp/build/lib:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/lenzenbe/RE95_256_cuDec/cuDecomp/build/lib:$LD_LIBRARY_PATH
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+# simple run
+#mpirun -n 4 ./mhit36
+
+# profile on a single node (all ranks inside one profile)
+#nsys profile -t cuda,nvtx,mpi,openacc mpirun -np 4 ./mhit36
+
+# profile (one report per rank)
+#mpirun -n 4 nsys profile --trace=cuda,nvtx,mpi -o profile_output_%q{SLURM_PROCID} --stats=true ./mhit36
+#mpirun -n 4 nsys profile -t cuda,nvtx,mpi -o report.$SLURM_LOCALID ./mhit36
+#srun -n 4 nsys profile -t cuda,nvtx,mpi --output=nsys_report_rank%t ./mhit36
+#nsys profile --multiprocess=true -t cuda,nvtx,mpi -o report $
+
+# profile + node mapping + nic
+mpirun -np 4 --map-by node:PE=8 --rank-by core nsys profile -t cuda,nvtx,mpi,openacc --nic-metrics=true ./binder.sh ./mhit36
+
+# for nsight compute report
+#mpirun -n 4 ncu --kernel-name main_659 --set=full --import-source=yes -o profile -f --launch-skip 3 --launch-count 1 "./mhit36"
+
+# for nsight compute report - all kernels
+# mpirun -n 4 ncu --kernel-name regex:main_ --set=full --import-source=yes --launch-skip 70 --launch-count 18 -o reportall.%p ./mhit36
+
+# for nsight compute report - all kernels + mapping + nic
+# mpirun -np 4 --map-by node:PE=8 --rank-by core ncu --kernel-name regex:main_ --set=full --import-source=yes --launch-skip 70 --launch-count 18 -o reportall.%p ./binder.sh ./mhit36
+
diff --git a/hit/go_mn5.sh b/hit/go_mn5.sh
new file mode 100644
index 0000000..7855336
--- /dev/null
+++ b/hit/go_mn5.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#SBATCH --ntasks=16 # --ntasks=8 when using two nodes
+#SBATCH --account="ehpc244"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:05:00
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=20
+#SBATCH --output=test.out
+#SBATCH -p boost_usr_prod
+#SBATCH --qos=acc_debug
+#SBATCH --error=test.err
+
+module load nvidia-hpc-sdk/24.3
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+
+mpirun -n 8 ./mhit36
\ No newline at end of file
diff --git a/hit/input.inp b/hit/input.inp
new file mode 100644
index 0000000..845aab6
--- /dev/null
+++ b/hit/input.inp
@@ -0,0 +1,18 @@
+0 ! fresh start=0, restart=1
+0 ! iteration to start from (fresh start: 0, otherwise different)
+10000 ! Final time step
+100000 ! Saving frequency
+0 ! inflow condition (0=Taylor-Green, 1=read from fields)
+0 ! phase-field initial condition (0=circular drop, 1=read from fields)
+0.0002 ! dt for the simulation
+0.006 ! mu (viscosity)
+1.000 ! density
+1.0 ! A forcing parameter (ABC forcing)
+1.0 ! B forcing parameter (ABC forcing)
+1.0 ! C forcing parameter (ABC forcing)
+2.0 ! Forced wavenumber (ABC forcing)
+0.5 ! Initial drop radius for phase-field
+1.0d0 ! Surface tension
+1.0 ! Ratio eps/dx
+10000 ! Number of particles
+1 ! Particle initial condition (0=from file, 1=random, 2=random in drops)
diff --git a/hit/leo.sh b/hit/leo.sh
new file mode 100755
index 0000000..364635a
--- /dev/null
+++ b/hit/leo.sh
@@ -0,0 +1,12 @@
+# old modules - if you use these, change -gpu=mem:managed to -gpu=managed in the Makefile
+#module load nvhpc/24.3
+#module load cuda/12.3
+#module load openmpi/4.1.6--nvhpc--24.3
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+cp Makefile_leonardo Makefile
+make clean
+make
+mkdir -p output
+# mpirun -np 4 ./mhit36
diff --git a/hit/local.sh b/hit/local.sh
new file mode 100644
index 0000000..4f27984
--- /dev/null
+++ b/hit/local.sh
@@ -0,0 +1,15 @@
+NVARCH=Linux_x86_64; export NVARCH
+NVCOMPILERS=/opt/nvidia/hpc_sdk; export NVCOMPILERS
+MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/compilers/man; export MANPATH
+PATH=$NVCOMPILERS/$NVARCH/25.7/compilers/bin:$PATH; export PATH
+export PATH=$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/bin:$PATH
+export MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/man
+export LD_LIBRARY_PATH=/home/milton/MHIT36/cuDecomp/build/lib
+# clean output folder
+rm -rf output
+mkdir output
+cp Makefile_local Makefile
+#rm *.dat
+make clean
+make
+mpirun -np 2 ./mhit36
diff --git a/hit/main.f90 b/hit/main.f90
new file mode 100644
index 0000000..beb351f
--- /dev/null
+++ b/hit/main.f90
@@ -0,0 +1,1495 @@
+#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1)
+
+program main
+use cudafor
+use cudecomp
+use cufft
+use mpi
+use velocity
+use phase
+use particles
+use param
+use mpivar
+use cudecompvar
+
+use, intrinsic :: ieee_arithmetic
+
+implicit none
+! timer for scaling tests
+real :: t_start, t_end, elapsed
+! grid dimensions
+integer :: comm_backend
+integer :: pr, pc
+! cudecomp
+! cuFFT
+integer :: planXf, planXb
+integer :: planY, planZ
+integer :: batchsize
+integer :: status
+integer :: i,j,k,il,jl,kl,ig,jg,kg,t
+integer :: im,ip,jm,jp,km,kp,last
+integer :: inY,enY,inZ,enZ
+!beginDEBUG
+integer, parameter :: Mx = 1, My = 2, Mz = 1
+!endDEBUG
+real(8), device, allocatable :: kx_d(:)
+! working arrays
+complex(8), allocatable :: psi(:)
+real(8), allocatable :: ua(:,:,:)
+real(8), allocatable :: uaa(:,:,:)
+
+real(8), allocatable :: psi_real(:)
+! real(8), device, allocatable :: psi_real_d(:)
+complex(8), device, allocatable :: psi_d(:)
+complex(8), pointer, device, contiguous :: work_d(:), work_halo_d(:), work_d_d2z(:), work_halo_d_d2z(:)
+character(len=40) :: namefile
+character(len=4) :: itcount
+! Code variables
+
+real(8)::err,maxErr
+
+complex(8), device, pointer :: phi3d(:,:,:)
+real(8) :: k2
+!integer :: il, jl, ig, jg
+integer :: offsets(3), xoff, yoff
+integer :: np(3)
+
+! Enable or disable the phase-field module (enabled by default)
+#define phiflag 1
+! Enable or disable particle Lagrangian tracking (tracers)
+#define partflag 0
+
+!###############################################################################
+! 1. INITIALIZATION OF MPI AND cuDECOMP AUTOTUNING : START
+!###############################################################################
+! MPI initialization: store the local MPI rank in rank and the total number of ranks in ranks
+! Same procedure as described in the cuDecomp documentation
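+! Note on GPU binding (descriptive): the MPI_COMM_TYPE_SHARED split below
+! builds a per-node communicator, so localRank enumerates the ranks on each
+! node and cudaSetDevice(localRank) maps one rank per GPU (this assumes
+! ntasks-per-node equals the number of GPUs per node, as in the job scripts).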
+call mpi_init(ierr)
+call mpi_comm_rank(MPI_COMM_WORLD, rank, ierr)
+call mpi_comm_size(MPI_COMM_WORLD, ranks, ierr)
+
+call mpi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, localComm, ierr)
+call mpi_comm_rank(localComm, localRank, ierr)
+ierr = cudaSetDevice(localRank) ! assign a GPU to each MPI rank
+
+! Define grid and decomposition
+call readinput
+
+! number of processes along each direction (r refers to y, c to z)
+pr = 0
+pc = 0
+halo_ext=1
+! comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P
+comm_backend = 0 ! Enable full autotuning
+
+CHECK_CUDECOMP_EXIT(cudecompInit(handle, MPI_COMM_WORLD))
+
+! config is a struct; pr and pc are the number of pencils along the two decomposed directions
+! gdims is the global grid
+! create an uninitialized configuration struct and initialize it to defaults using cudecompGridDescConfigSetDefaults.
+! Initializing to default values is required to ensure no entries are left uninitialized.
+CHECK_CUDECOMP_EXIT(cudecompGridDescConfigSetDefaults(config))
+pdims = [pr, pc] ! number of pencils along the two decomposed directions
+config%pdims = pdims
+! gdims = [nx, ny, nz]
+! config%gdims = gdims
+halo = [0, halo_ext, halo_ext] ! no halo needed along x: it is periodic and, in physical space, the pencils are x-oriented
+! for transpositions
+config%transpose_comm_backend = comm_backend
+config%transpose_axis_contiguous = .true.
+! for halo exchanges
+config%halo_comm_backend = CUDECOMP_HALO_COMM_MPI
+! Setting for periodic halos in all directions (not required to be in config)
+halo_periods = [.true., .true., .true.]
+
+! create the spectral grid descriptor first, to select pdims for optimal transposes
+gdims = [nx/2+1, ny, nz]
+config%gdims = gdims
+
+! Set up autotuning options for the spectral grid (transpose-related settings)
+CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
+options%dtype = CUDECOMP_DOUBLE_COMPLEX
+if (comm_backend == 0) then
+  options%autotune_transpose_backend = .true.
+  options%autotune_halo_backend = .false.
+endif
+options%transpose_use_inplace_buffers = .true.
+options%transpose_input_halo_extents(:, 1) = halo
+options%transpose_output_halo_extents(:, 4) = halo
+
+CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_descD2Z, config, options))
+
+! create the physical grid descriptor:
+! take the previous config and modify the global grid (nx instead of nx/2+1);
+! reset transpose_comm_backend to its default value to avoid picking up a possible nvshmem
+! transpose backend selection (this impacts how workspaces are allocated)
+gdims = [nx, ny, nz]
+config%gdims = gdims
+config%transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P
+
+! Set up autotuning options for the physical grid (halo-related settings)
+CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
+options%dtype = CUDECOMP_DOUBLE_COMPLEX
+if (comm_backend == 0) then
+  options%autotune_halo_backend = .true.
+endif
+options%halo_extents(:) = halo
+options%halo_periods(:) = halo_periods
+options%halo_axis = 1
+CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_desc, config, options))
+
+! Print information on the configuration
+! issue with NVHPC > 25.X; simply skip this block (not critical)
+!if (rank == 0) then
+!  write(*,"(' Running on ', i0, ' x ', i0, ' process grid ...')") config%pdims(1), config%pdims(2)
+!  write(*,"(' Using ', a, ' transpose backend ...')") &
+!  cudecompTransposeCommBackendToString(config%transpose_comm_backend)
+!  write(*,"(' Using ', a, ' halo backend ...')") &
+!  cudecompHaloCommBackendToString(config%halo_comm_backend)
+!endif
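+! Illustrative alternative (not executed): to pin a transpose backend instead
+! of autotuning it, set e.g. comm_backend = CUDECOMP_TRANSPOSE_COMM_NCCL (or
+! CUDECOMP_TRANSPOSE_COMM_MPI_P2P) above; with comm_backend /= 0 the
+! autotune_transpose_backend option is then left disabled.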
write(*,"(' Using ', a, ' halo backend ...')") & +! cudecompHaloCommBackendToString(config%halo_comm_backend) +!endif + +! Get pencil info for the grid descriptor in the physical space +! This function returns a pencil struct (piX, piY or piZ) that contains the shape, global lower and upper index bounds (lo and hi), +! size of the pencil, and an order array to indicate the memory layout that will be used (to handle permuted, axis-contiguous layouts). +! Additionally, there is a halo_extents data member that indicates the depth of halos for the pencil, by axis. +! If no halo regions are necessary, a NULL pointer can be provided in place of this array (or omitted) +! Pencil info in x-configuration present in PiX (shape,lo,hi,halo_extents,size) +CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piX, 1, halo)) +nElemX = piX%size !<- number of total elments in x-configuratiion (including halo) +! Pencil info in Y-configuration present in PiY +CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piY, 2)) +nElemY = piY%size +! Pencil info in Z-configuration present in PiZ +CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piZ, 3)) +nElemZ = piZ%size + +! Get workspace sizes for transpose (1st row, not used) and halo (2nd row, used) +CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_desc, nElemWork)) +CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_desc, 1, halo, nElemWork_halo)) + +CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piX_d2z, 1,halo)) +nElemX_d2z = piX_d2z%size !<- number of total elments in x-configuratiion (include halo) +! Pencil info in Y-configuration present in PiY +CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piY_d2z, 2)) +nElemY_d2z = piY_d2z%size +! Pencil info in Z-configuration present in PiZ +CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piZ_d2z, 3)) +nElemZ_d2z = piZ_d2z%size +! Get workspace sizes for transpose (1st row,used) and halo (2nd row, not used) +CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_descD2Z, nElemWork_d2z)) +CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_descD2Z, 1, halo, nElemWork_halo_d2z)) + +! Get the global rank of neighboring processes in PiX config (required only for particles) +CHECK_CUDECOMP_EXIT(cudecompGetShiftedRank(handle, grid_desc, 1, 2, 1 , .true. , nidp1y)) +CHECK_CUDECOMP_EXIT(cudecompGetShiftedRank(handle, grid_desc, 1, 2, -1 , .true. , nidm1y)) +CHECK_CUDECOMP_EXIT(cudecompGetShiftedRank(handle, grid_desc, 1, 3, 1 , .true. , nidp1z)) +CHECK_CUDECOMP_EXIT(cudecompGetShiftedRank(handle, grid_desc, 1, 3, -1 , .true. , nidm1z)) + + +! CUFFT initialization -- Create Plans +! Forward 1D FFT in X: D2Z +batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFT (from x-pencil dimension) +status = cufftPlan1D(planXf, nx, CUFFT_D2Z, batchSize) +if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan Forward' + +! Backward 1D FFT in X: Z2D +batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFT (from x-pencil dimension) +status = cufftPlan1D(planXb, nx, CUFFT_Z2D, batchSize) +if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan Backward' + +! it's always 2 and 3 because y-pencil have coordinates y,z,x +batchSize = piY_d2z%shape(2)*piY_d2z%shape(3) +status = cufftPlan1D(planY, ny, CUFFT_Z2Z, batchSize) +if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Y plan Forward & Backward' + +! 
+! always shape(2) and shape(3): z-pencils are stored with the z axis contiguous
+batchSize = piZ_d2z%shape(2)*piZ_d2z%shape(3)
+status = cufftPlan1D(planZ, nz, CUFFT_Z2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Z plan Forward & Backward'
+
+! define grid
+allocate(x(nx),x_ext(nx+1))
+x(1)= 0
+do i = 2, nx
+  x(i) = x(i-1) + dx
+enddo
+
+x_ext(1:nx) = x(1:nx)
+x_ext(nx+1) = lx
+
+! Offsets in the X-pencil decomposition
+pix_yoff = pix%lo(2)-1
+pix_zoff = pix%lo(3)-1
+
+! Boundaries of each piX pencil
+yinf = x_ext( pix%lo(2) )
+ysup = x_ext( pix%hi(2) + 1 )
+zinf = x_ext( pix%lo(3) )
+zsup = x_ext( pix%hi(3) + 1 )
+
+! Physical size of each piX pencil
+lyloc = ysup - yinf
+lzloc = zsup - zinf
+
+! define wavenumbers
+allocate(kx(nx))
+do i = 1, nx/2
+  kx(i) = (i-1)*(twoPi/lx)
+enddo
+do i = nx/2+1, nx
+  kx(i) = (i-1-nx)*(twoPi/lx)
+enddo
+! allocate kx_d on the device (TODO: remove later and rely on OpenACC + managed memory?)
+allocate(kx_d, source=kx)
+
+allocate(mysin(nx), mycos(nx))
+do i=1,nx
+  ! precompute sin to avoid repeated evaluations
+  mysin(i)=sin(k0*x(i)+dx/2)
+  ! precompute cos to avoid repeated evaluations
+  mycos(i)=cos(k0*x(i)+dx/2)
+enddo
+
+! Initial distribution of particles among the processes
+nploc = npart/ranks
+nplocmax = nploc*2
+
+!###############################################################################
+! 1. INITIALIZATION AND cuDECOMP AUTOTUNING : END
+!###############################################################################
+
+
+
+!###############################################################################
+! START STEP 2: ALLOCATE ARRAYS
+!###############################################################################
+
+! allocate arrays
+allocate(psi(max(nElemX, nElemY, nElemZ))) ! largest among the pencils
+allocate(psi_real(max(nElemX, nElemY, nElemZ))) ! largest among the pencils
+allocate(psi_d(max(nElemX_d2z, nElemY_d2z, nElemZ_d2z))) ! psi on the device
+allocate(ua(nx, piX%shape(2), piX%shape(3)))
+
+! Pressure variables
+allocate(rhsp(piX%shape(1), piX%shape(2), piX%shape(3)))
+allocate(p(piX%shape(1), piX%shape(2), piX%shape(3)))
+
+! NS variables
+allocate(u(piX%shape(1),piX%shape(2),piX%shape(3)),v(piX%shape(1),piX%shape(2),piX%shape(3)),w(piX%shape(1),piX%shape(2),piX%shape(3))) ! velocity vector
+! allocate(ustar(piX%shape(1),piX%shape(2),piX%shape(3)),vstar(piX%shape(1),piX%shape(2),piX%shape(3)),wstar(piX%shape(1),piX%shape(2),piX%shape(3))) ! provisional velocity field
+allocate(rhsu(piX%shape(1),piX%shape(2),piX%shape(3)),rhsv(piX%shape(1),piX%shape(2),piX%shape(3)),rhsw(piX%shape(1),piX%shape(2),piX%shape(3))) ! right-hand sides for u,v,w
+allocate(rhsu_o(piX%shape(1),piX%shape(2),piX%shape(3)),rhsv_o(piX%shape(1),piX%shape(2),piX%shape(3)),rhsw_o(piX%shape(1),piX%shape(2),piX%shape(3))) ! previous right-hand sides (for AB2)
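+! (descriptive note) fields are allocated with the local x-pencil shape
+! piX%shape: the full nx along x (no x halo) plus the local y and z extents
+! including the 2*halo_ext ghost layers filled by the halo exchanges.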
+allocate(div(piX%shape(1),piX%shape(2),piX%shape(3)))
+! PFM variables
+#if phiflag == 1
+allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(normx_f(piX%shape(1),piX%shape(2),piX%shape(3)),normy_f(piX%shape(1),piX%shape(2),piX%shape(3)),normz_f(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces
+allocate(phi_tmp(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik2(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik3(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik4(piX%shape(1),piX%shape(2),piX%shape(3)))
+#endif
+
+! allocate arrays for transpositions and halo exchanges
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_d, nElemWork))
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_halo_d, nElemWork_halo))
+! allocate arrays for transpositions
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_descD2Z, work_d_d2z, nElemWork_d2z))
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_descD2Z, work_halo_d_d2z, nElemWork_halo_d2z))
+
+#if partflag == 1
+! Particle variables
+allocate(part(1:nplocmax,1:ninfop))
+allocate(partbuff(1:nplocmax,1:ninfop))
+allocate(vec_p(1:nplocmax))
+allocate(order_p(1:nplocmax))
+allocate(buffvar1(1:ninfop,1:nploc))
+allocate(buffvar2(1:ninfop,1:nploc))
+#endif
+!###############################################################################
+! END STEP 2: ALLOCATE ARRAYS
+!###############################################################################
+
+
+
+!###############################################################################
+! START STEP 3: FLOW, PHASE FIELD AND PARTICLES INIT
+!###############################################################################
+! 3.1 Read/initialize the data without halo grid points (avoids out-of-bounds access when reading using MPI I/O)
+! 3.2 Call halo exchanges along Y and Z for u, v, w and phi
+if (restart .eq. 0) then ! fresh start: Taylor-Green or read from files in the init folder
+  if (rank.eq.0) write(*,*) "Initialize velocity field (fresh start)"
+  if (inflow .eq. 0) then
+    if (rank.eq.0) write(*,*) "Initialize Taylor-Green"
+    do k = 1+halo_ext, piX%shape(3)-halo_ext
+      kg = piX%lo(3) + k - 1
+      do j = 1+halo_ext, piX%shape(2)-halo_ext
+        jg = piX%lo(2) + j - 1
+        do i = 1, piX%shape(1)
+          u(i,j,k) = sin(x(i)-dx/2)*cos(x(jg))*cos(x(kg))
+          v(i,j,k) = -cos(x(i))*sin(x(jg)-dx/2)*cos(x(kg))
+          w(i,j,k) = 0.d0
+        enddo
+      enddo
+    enddo
+  endif
+  if (inflow .eq. 1) then
+    if (rank.eq.0) write(*,*) "Initialize from data"
+    call readfield(1)
+    call readfield(2)
+    call readfield(3)
+  endif
+endif
+if (restart .eq. 1) then ! restart: ignore inflow and read the tstart fields
+  if (rank.eq.0) write(*,*) "Initialize velocity field (from output folder), iteration:", tstart
+  call readfield_restart(tstart,1)
+  call readfield_restart(tstart,2)
+  call readfield_restart(tstart,3)
+endif
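+! (descriptive note) cudecompUpdateHalosX exchanges the ghost layers of an
+! x-pencil along one axis (last argument: 2 = y, 3 = z); no exchange is needed
+! along x since each rank owns complete x lines.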
+! update halo cells along the y and z directions (strictly required only when pr and pc are greater than one)
+!$acc host_data use_device(u)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+!$acc host_data use_device(v)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+!$acc host_data use_device(w)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+
+! initialize phase field
+#if phiflag == 1
+if (restart .eq. 0) then
+  if (rank.eq.0) write(*,*) 'Initialize phase field (fresh start)'
+  if (inphi .eq. 0) then
+    if (rank.eq.0) write(*,*) 'Spherical drop'
+    do k = 1+halo_ext, piX%shape(3)-halo_ext
+      kg = piX%lo(3) + k - 1
+      do j = 1+halo_ext, piX%shape(2)-halo_ext
+        jg = piX%lo(2) + j - 1
+        do i = 1, piX%shape(1)
+          pos=(x(i)-lx/2)**2d0 + (x(jg)-lx/2)**2d0 + (x(kg)-lx/2)**2d0
+          phi(i,j,k) = 0.5d0*(1.d0-tanh((sqrt(pos)-radius)/2/eps))
+        enddo
+      enddo
+    enddo
+  endif
+  if (inphi .eq. 1) then
+    if (rank.eq.0) write(*,*) "Initialize phase field from data"
+    call readfield(5)
+  endif
+endif
+if (restart .eq. 1) then
+  if (rank.eq.0) write(*,*) "Initialize phase field (restart, from output folder), iteration:", tstart
+  call readfield_restart(tstart,5)
+endif
+! update halos
+!$acc host_data use_device(phi)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+#endif
+
+#if partflag == 1
+if (restart .eq. 0) then
+  if (rank.eq.0) write(*,*) 'Initialize particles (fresh start)'
+  if (inpart .eq. 1) then
+    if (rank.eq.0) write(*,*) 'Random position in the whole domain'
+    call particlegenerator(inpart)
+  endif
+  if (inpart .eq. 2) then
+    if (rank.eq.0) write(*,*) 'Random position inside the drops'
+    call particlegenerator(inpart)
+  endif
+endif
+! if (restart .eq. 1) then
+!   write(*,*) "Initialize phase-field (restart, from output folder), iteration:", tstart
+!   call readfield_restart(tstart,5)
+! endif
+#endif
+
+! Save initial fields (only on a fresh start)
+if (restart .eq. 0) then
+  if (rank.eq.0) write(*,*) "Save initial fields"
+  call writefield(tstart,1)
+  call writefield(tstart,2)
+  call writefield(tstart,3)
+  call writefield(tstart,4)
+#if phiflag == 1
+  call writefield(tstart,5)
+#endif
+endif
+
+!###############################################################################
+! END STEP 3: FLOW, PHASE FIELD AND PARTICLES INIT
+!###############################################################################
+
+
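+! (overview) every iteration of the temporal loop below performs, in order:
+!   step 4: advance tracer particles (AB2) and migrate them between ranks
+!   step 5: advance the phase field with RK4 (ACDI)
+!   step 6: predicted velocity u* (AB2, Euler on the first step)
+!   step 7: pressure Poisson equation via 3-D FFT
+!   step 8: projection/correction of the velocity field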
+!###############################################################################
+! START TEMPORAL LOOP: STEPS 4 TO 9 REPEATED AT EVERY TIME STEP
+!###############################################################################
+
+! First step uses explicit Euler (alpha=1, beta=0)
+alpha=1.0d0
+beta=0.0d0
+gumax=8.0d0
+tstart=tstart+1
+gamma=1.d0*gumax ! initial gamma for the phase field
+!$acc data copyin(piX)
+!$acc data create(rhsu_o, rhsv_o, rhsw_o)
+!$acc data copyin(mysin, mycos)
+#if partflag == 1
+!$acc data copy(part,partbuff)
+!$acc data create(vec_p, order_p)
+!$acc data create(buffvar1,buffvar2)
+#endif
+call cpu_time(t_start)
+
+! Start temporal loop
+do t=tstart,tfin
+  ! Create a custom label for each marker (profiling)
+  write(itcount,'(i4)') t
+  ! Range with custom color (uncomment for profiling)
+  ! call nvtxStartRange("Iteration "//itcount,t)
+
+  if (rank.eq.0) write(*,*) "Time step",t,"of",tfin
+  call cpu_time(times)
+
+  !#############################################################################
+  ! START STEP 4: PARTICLES (TRACERS)
+  !#############################################################################
+#if partflag == 1
+  ! Operations:
+  ! 4.1 Perform interpolation (a trilinear scheme would be more accurate, but also more expensive)
+  ! 4.2 Integrate with Adams-Bashforth
+  ! 4.3 Order and transfer in y
+  ! 4.4 Order and transfer in z
+  ! 4.5 Check leakage of particles
+
+  call LinearInterpolation()
+  ! Particle tracker integration
+  ! Two-step Adams-Bashforth (Euler for the first step)
+  !$acc parallel loop collapse(2) default(present)
+  do j = 0, 2
+    do i = 1, nploc
+      part(i,Ixp+j)=part(i,Ixp+j)+&
+        dt*(alpha*part(i,Iup+j)-beta*part(i,Iup1+j))
+    enddo
+  enddo
+
+  ! Transfer in y
+  call SortPartY()
+  call CountPartTransfY()
+  call SendPartUP(2)
+  call SendPartDOWN(2)
+
+  ! Transfer in z
+  call SortPartZ()
+  call CountPartTransfZ()
+  call SendPartUP(3)
+  call SendPartDOWN(3)
+
+  ! Check particles escaping the domain (leakage)
+  call ParticlesLeakage()
+  ! Shift data for the next step
+  !$acc parallel loop collapse(2) default(present)
+  do j = 0, 2
+    do i = 1, nploc
+      part(i,Iup1+j)=part(i,Iup+j)
+    enddo
+  enddo
+  write(*,*) 'rank',rank, 'nploc',nploc
+#endif
+  !#############################################################################
+  ! END STEP 4: PARTICLES
+  !#############################################################################
+
+
+  ! (uncomment for profiling)
+  ! call nvtxStartRange("Phase-field")
+  !#############################################################################
+  ! START STEP 5: PHASE-FIELD SOLVER (EXPLICIT)
+  !#############################################################################
+#if phiflag == 1
+  ! Get phi at n+1 using RK4 (four stages below)
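+  ! RK4 recap (standard scheme, matching the four stages below):
+  !   k1 = RHS(phi^n)              -> rhsphi
+  !   k2 = RHS(phi^n + dt/2*k1)    -> rhsphik2
+  !   k3 = RHS(phi^n + dt/2*k2)    -> rhsphik3
+  !   k4 = RHS(phi^n + dt*k3)      -> rhsphik4
+  !   phi^(n+1) = phi^n + dt/6*(k1 + 2*k2 + 2*k3 + k4)
+  ! Each RHS contains advection, diffusion gamma*eps*lap(phi) and the ACDI
+  ! sharpening flux 0.25*gamma*(1 - tanh^2(psi_f/(2*eps)))*n_f at the cell
+  ! faces (psi_f: face-averaged signed distance, n_f: face normal component);
+  ! every intermediate field needs a halo refresh before its RHS is evaluated.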
+  ! first stage of RK4 - saved in rhsphi
+  !$acc kernels
+  do k=1, piX%shape(3)
+    do j=1, piX%shape(2)
+      do i=1,nx
+        ! compute the signed-distance function psi (used to compute normals)
+        val = min(phi(i,j,k),1.0d0) ! clip machine-precision overshoots in phi that would break the log
+        psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+        ! precompute the tanh of psi (used in the sharpening term) to avoid repeated tanh evaluations
+        tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+
+  gamma=1.d0*gumax ! update gamma every time step from the velocity scale gumax
+  if (rank.eq.0) write(*,*) "gamma:", gamma
+  !$acc parallel loop tile(16,4,2)
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ! RHS computation
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        ! convective (first six lines) and diffusive (last three lines) terms,
+        ! flux-splitting
+        rhsphi(i,j,k) = &
+         - (max(u(ip,j,k),0.0d0)*phi(i,j,k) + min(u(ip,j,k),0.0d0)*phi(ip,j,k) - &
+            min(u(i,j,k),0.0d0)*phi(i,j,k) - max(u(i,j,k),0.0d0)*phi(im,j,k))*dxi &
+         - (max(v(i,jp,k),0.0d0)*phi(i,j,k) + min(v(i,jp,k),0.0d0)*phi(i,jp,k) - &
+            min(v(i,j,k),0.0d0)*phi(i,j,k) - max(v(i,j,k),0.0d0)*phi(i,jm,k))*dxi &
+         - (max(w(i,j,kp),0.0d0)*phi(i,j,k) + min(w(i,j,kp),0.0d0)*phi(i,j,kp) - &
+            min(w(i,j,k),0.0d0)*phi(i,j,k) - max(w(i,j,k),0.0d0)*phi(i,j,km))*dxi &
+         + gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + &
+                  eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddxi + &
+                  eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi)
+        ! compute normals for the sharpening term (gradient of psi)
+        normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+        normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+        normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+      enddo
+    enddo
+  enddo
+
+  ! Update the normx, normy and normz halos, required to compute the normal derivatives next
+  !$acc host_data use_device(normx)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(normy)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(normz)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+
+  ! Normalize the gradients (enum is a small numerical tolerance avoiding 0/0);
+  ! the unit normals are also reused later for the surface tension force
+  !$acc kernels
+  do k=1, piX%shape(3)
+    do j=1, piX%shape(2)
+      do i=1,nx
+        normag = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + enum)
+        normx_f(i,j,k) = normx(i,j,k)*normag
+        normy_f(i,j,k) = normy(i,j,k)*normag
+        normz_f(i,j,k) = normz(i,j,k)*normag
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+
+  ! Compute the sharpening term in flux-split form (as in CaNS-Fizzy)
+  !$acc kernels
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+        normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+        normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+        normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+        normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+        normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+        normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+        normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+        normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+        normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+        normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+        normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+        normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+        normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+        normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+        normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+        normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+        normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+        ! sharpening term
+        rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+        rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+        rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+        rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+        rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+        rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+        sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+        sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+        sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+        sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+        sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+        sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+        rhsphi(i,j,k)=rhsphi(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+
+  ! second stage of RK4 - saved in rhsphik2
+  !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphi)
+  do k=1, piX%shape(3)
+    do j=1, piX%shape(2)
+      do i=1,nx
+        phi_tmp(i,j,k) = phi(i,j,k) + 0.5d0 * dt * rhsphi(i,j,k)
+      enddo
+    enddo
+  enddo
+  !$acc end parallel loop
+  ! halo exchange along Y and Z for phi_tmp
+  !$acc host_data use_device(phi_tmp)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc kernels
+  do k=1, piX%shape(3)
+    do j=1, piX%shape(2)
+      do i=1,nx
+        ! compute the signed-distance function psi (used to compute normals)
+        val = min(phi_tmp(i,j,k),1.0d0) ! clip machine-precision overshoots that would break the log
+        psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+        ! precompute the tanh of psi (used in the sharpening term)
+        tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+  !$acc parallel loop tile(16,4,2)
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ! RHS computation
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        ! convective (first six lines) and diffusive (last three lines) terms,
+        ! flux-splitting
+        rhsphik2(i,j,k) = &
+         - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+            min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+         - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+            min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+         - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+            min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+         + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+                  eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+                  eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+        ! compute normals for the sharpening term (gradient of psi)
+        normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+        normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+        normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+      enddo
+    enddo
+  enddo
+
+  ! Update the normx, normy and normz halos, required to compute the normal derivatives next
+  !$acc host_data use_device(normx)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(normy)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(normz)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+
+  ! Compute the sharpening term
+  !$acc kernels
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+        normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+        normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+        normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+        normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+        normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+        normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+        normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+        normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+        normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+        normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+        normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+        normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+        normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+        normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+        normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+        normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+        normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+        ! sharpening term
+        rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+        rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+        rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+        rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+        rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+        rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+        sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+        sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+        sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+        sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+        sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+        sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+        rhsphik2(i,j,k)=rhsphik2(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+
+  ! third stage of RK4 - saved in rhsphik3
+  !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphik2)
+  do k=1, piX%shape(3)
+    do j=1, piX%shape(2)
+      do i=1,nx
+        phi_tmp(i,j,k) = phi(i,j,k) + 0.5d0 * dt * rhsphik2(i,j,k)
+      enddo
+    enddo
+  enddo
+  !$acc end parallel loop
+  ! halo exchange along Y and Z for phi_tmp
+  !$acc host_data use_device(phi_tmp)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc kernels
+  do k=1, piX%shape(3)
+    do j=1, piX%shape(2)
+      do i=1,nx
+        ! compute the signed-distance function psi (used to compute normals)
+        val = min(phi_tmp(i,j,k),1.0d0) ! clip machine-precision overshoots that would break the log
+        psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+        ! precompute the tanh of psi (used in the sharpening term)
+        tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+  !$acc parallel loop tile(16,4,2)
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ! RHS computation
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        ! convective (first six lines) and diffusive (last three lines) terms,
+        ! flux-splitting
+        rhsphik3(i,j,k) = &
+         - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+            min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+         - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+            min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+         - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+            min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+         + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+                  eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+                  eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+        ! compute normals for the sharpening term (gradient of psi)
+        normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+        normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+        normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+      enddo
+    enddo
+  enddo
+
+  ! Update the normx, normy and normz halos, required to compute the normal derivatives next
+  !$acc host_data use_device(normx)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(normy)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(normz)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+
+  ! Compute the sharpening term
+  !$acc kernels
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+        normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+        normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+        normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+        normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+        normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+        normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+        normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+        normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+        normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+        normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+        normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+        normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+        normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+        normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+        normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+        normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+        normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+        ! sharpening term
+        rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+        rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+        rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+        rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+        rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+        rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+        sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+        sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+        sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+        sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+        sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+        sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+        rhsphik3(i,j,k)=rhsphik3(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+  ! fourth stage of RK4 - saved in rhsphik4
+  !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphik3)
+  do k=1, piX%shape(3)
+    do j=1, piX%shape(2)
+      do i=1,nx
+        phi_tmp(i,j,k) = phi(i,j,k) + dt * rhsphik3(i,j,k)
+      enddo
+    enddo
+  enddo
+  !$acc end parallel loop
+  ! halo exchange along Y and Z for phi_tmp
+  !$acc host_data use_device(phi_tmp)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc kernels
+  do k=1, piX%shape(3)
+    do j=1, piX%shape(2)
+      do i=1,nx
+        ! compute the signed-distance function psi (used to compute normals)
+        val = min(phi_tmp(i,j,k),1.0d0) ! clip machine-precision overshoots that would break the log
+        psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+        ! precompute the tanh of psi (used in the sharpening term)
+        tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+  !$acc parallel loop tile(16,4,2)
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ! RHS computation
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        ! convective (first six lines) and diffusive (last three lines) terms,
+        ! flux-splitting
+        rhsphik4(i,j,k) = &
+         - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+            min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+         - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+            min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+         - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+            min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+         + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+                  eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+                  eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+        ! compute normals for the sharpening term (gradient of psi)
+        normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+        normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+        normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+      enddo
+    enddo
+  enddo
+
+  ! Update the normx, normy and normz halos, required to compute the normal derivatives next
+  !$acc host_data use_device(normx)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(normy)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(normz)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+
+  ! Compute the sharpening term
+  !$acc kernels
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+        normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+        normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+        normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+        normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+        normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+        normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+        normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+        normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+        normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+        normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+        normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+        normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+        normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+        normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+        normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+        normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+        normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+        ! sharpening term
+        rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+        rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+        rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+        rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+        rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+        rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+        sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+        sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+        sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+        sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+        sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+        sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+        rhsphik4(i,j,k)=rhsphik4(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+
+  ! Update phi with RK4
+  !$acc kernels
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        phi(i,j,k) = phi(i,j,k) + dt/6.0d0*(rhsphi(i,j,k) + 2.0d0*rhsphik2(i,j,k) + &
+                     2.0d0*rhsphik3(i,j,k) + rhsphik4(i,j,k))
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+
+  ! halo exchange along Y and Z for the updated phi
+  !$acc host_data use_device(phi)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+#endif
+
+  ! (uncomment for profiling)
+  ! call nvtxEndRange
+
+  !#############################################################################
+  ! END STEP 5: PHASE-FIELD SOLVER
+  !#############################################################################
+
+
+  !#############################################################################
+  ! START STEP 6: USTAR COMPUTATION (PROJECTION STEP)
+  !#############################################################################
+  ! 6.1 compute the RHS
+  ! 6.2 obtain ustar and store the old RHS in rhs*_o
+  ! 6.3 call halo exchanges along Y and Z for u, v, w
+
+  ! (uncomment for profiling)
+  ! call nvtxStartRange("Projection")
+  ! 6.1a Convective and diffusive terms of NS
+  ! Loop over the inner nodes
+  !$acc parallel loop tile(16,4,2)
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        ! Manual periodicity only along x (x-pencil); along the y and z directions halos are used
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        ! compute the products (conservative form)
+        h11 = (u(ip,j,k)+u(i,j,k))*(u(ip,j,k)+u(i,j,k)) - (u(i,j,k)+u(im,j,k))*(u(i,j,k)+u(im,j,k))
+        h12 = (u(i,jp,k)+u(i,j,k))*(v(i,jp,k)+v(im,jp,k)) - (u(i,j,k)+u(i,jm,k))*(v(i,j,k)+v(im,j,k))
+        h13 = (u(i,j,kp)+u(i,j,k))*(w(i,j,kp)+w(im,j,kp)) - (u(i,j,k)+u(i,j,km))*(w(i,j,k)+w(im,j,k))
+        h21 = (u(ip,j,k)+u(ip,jm,k))*(v(ip,j,k)+v(i,j,k)) - (u(i,j,k)+u(i,jm,k))*(v(i,j,k)+v(im,j,k))
+        h22 = (v(i,jp,k)+v(i,j,k))*(v(i,jp,k)+v(i,j,k)) - (v(i,j,k)+v(i,jm,k))*(v(i,j,k)+v(i,jm,k))
+        h23 = (w(i,j,kp)+w(i,jm,kp))*(v(i,j,kp)+v(i,j,k)) - (w(i,j,k)+w(i,jm,k))*(v(i,j,k)+v(i,j,km))
+        h31 = (w(ip,j,k)+w(i,j,k))*(u(ip,j,k)+u(ip,j,km)) - (w(i,j,k)+w(im,j,k))*(u(i,j,k)+u(i,j,km))
+        h32 = (v(i,jp,k)+v(i,jp,km))*(w(i,jp,k)+w(i,j,k)) - (v(i,j,k)+v(i,j,km))*(w(i,j,k)+w(i,jm,k))
+        h33 = (w(i,j,kp)+w(i,j,k))*(w(i,j,kp)+w(i,j,k)) - (w(i,j,k)+w(i,j,km))*(w(i,j,k)+w(i,j,km))
+        ! compute the derivatives
+        h11=h11*0.25d0*dxi
+        h12=h12*0.25d0*dxi
+        h13=h13*0.25d0*dxi
+        h21=h21*0.25d0*dxi
+        h22=h22*0.25d0*dxi
+        h23=h23*0.25d0*dxi
+        h31=h31*0.25d0*dxi
+        h32=h32*0.25d0*dxi
+        h33=h33*0.25d0*dxi
+        ! add to the rhs
+        rhsu(i,j,k)=-(h11+h12+h13)
+        rhsv(i,j,k)=-(h21+h22+h23)
+        rhsw(i,j,k)=-(h31+h32+h33)
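+        ! (descriptive note) the advective terms above use the divergence form
+        ! d(u_i u_j)/dx_j: each product pairs two face-averaged velocities,
+        ! hence the overall 0.25 factor, and dxi applies the face difference.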
+        ! viscous terms
+        h11 = mu*(u(ip,j,k)-2.d0*u(i,j,k)+u(im,j,k))*ddxi
+        h12 = mu*(u(i,jp,k)-2.d0*u(i,j,k)+u(i,jm,k))*ddxi
+        h13 = mu*(u(i,j,kp)-2.d0*u(i,j,k)+u(i,j,km))*ddxi
+        h21 = mu*(v(ip,j,k)-2.d0*v(i,j,k)+v(im,j,k))*ddxi
+        h22 = mu*(v(i,jp,k)-2.d0*v(i,j,k)+v(i,jm,k))*ddxi
+        h23 = mu*(v(i,j,kp)-2.d0*v(i,j,k)+v(i,j,km))*ddxi
+        h31 = mu*(w(ip,j,k)-2.d0*w(i,j,k)+w(im,j,k))*ddxi
+        h32 = mu*(w(i,jp,k)-2.d0*w(i,j,k)+w(i,jm,k))*ddxi
+        h33 = mu*(w(i,j,kp)-2.d0*w(i,j,k)+w(i,j,km))*ddxi
+        rhsu(i,j,k)=rhsu(i,j,k)+(h11+h12+h13)*rhoi
+        rhsv(i,j,k)=rhsv(i,j,k)+(h21+h22+h23)*rhoi
+        rhsw(i,j,k)=rhsw(i,j,k)+(h31+h32+h33)*rhoi
+        ! NS forcing
+        kg = piX%lo(3) + k - 1
+        jg = piX%lo(2) + j - 1
+        ! ABC forcing
+        rhsu(i,j,k)= rhsu(i,j,k) + f3*mysin(kg)+f2*mycos(jg)
+        rhsv(i,j,k)= rhsv(i,j,k) + f1*mysin(i)+f3*mycos(kg)
+        rhsw(i,j,k)= rhsw(i,j,k) + f2*mysin(jg)+f1*mycos(i)
+      enddo
+    enddo
+  enddo
+
+  ! Surface tension forces
+#if phiflag == 1
+  !$acc kernels
+  ! Surface tension forces evaluated at the cell center (where phi is located)
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        ! continuum-surface-force (CSF) implementation
+        curv=0.5d0*(normx_f(ip,j,k)-normx_f(im,j,k))*dxi + 0.5d0*(normy_f(i,jp,k)-normy_f(i,jm,k))*dxi + 0.5d0*(normz_f(i,j,kp)-normz_f(i,j,km))*dxi
+        ! capillary force: sigma * curvature * grad(phi)
+        fxst(i,j,k)= -sigma*curv*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi
+        fyst(i,j,k)= -sigma*curv*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dxi
+        fzst(i,j,k)= -sigma*curv*0.5d0*(phi(i,j,kp)-phi(i,j,km))*dxi
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+
+  ! Update the halos of fxst, fyst and fzst (required to interpolate them at the velocity points)
+  !$acc host_data use_device(fxst)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(fyst)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(fzst)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+
+  ! Interpolate the force at the velocity points
+  !$acc kernels
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        im=i-1
+        jm=j-1
+        km=k-1
+        if (im .lt. 1) im=nx
+        rhsu(i,j,k)=rhsu(i,j,k) + 0.5d0*(fxst(im,j,k)+fxst(i,j,k))*rhoi
+        rhsv(i,j,k)=rhsv(i,j,k) + 0.5d0*(fyst(i,jm,k)+fyst(i,j,k))*rhoi
+        rhsw(i,j,k)=rhsw(i,j,k) + 0.5d0*(fzst(i,j,km)+fzst(i,j,k))*rhoi
+        u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k))
+        v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k))
+        w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k))
+        rhsu_o(i,j,k)=rhsu(i,j,k)
+        rhsv_o(i,j,k)=rhsv(i,j,k)
+        rhsw_o(i,j,k)=rhsw(i,j,k)
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+#else
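+  ! Adams-Bashforth 2 prediction (the first step reduces to explicit Euler,
+  ! since alpha=1 and beta=0):
+  !   u* = u^n + dt*(alpha*RHS^n - beta*RHS^(n-1)),
+  ! with alpha=1.5 and beta=0.5 from the second step onwards.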
! 6.2 find u, v and w star only in the inner nodes + !$acc kernels + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k)) + v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k)) + w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k)) + rhsu_o(i,j,k)=rhsu(i,j,k) + rhsv_o(i,j,k)=rhsv(i,j,k) + rhsw_o(i,j,k)=rhsw(i,j,k) + enddo + enddo + enddo + !$acc end kernels + #endif + + ! rhs* already stored in rhs*_o above; set the AB2 coefficients for the next steps + ! (the first step is done with Euler explicit, alpha=1 and beta=0, then AB2) + alpha=1.5d0 + beta= 0.5d0 + + ! 6.3 update halos (y and z directions), required to then compute the RHS of Poisson equation because of staggered grid + !$acc host_data use_device(u) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !$acc host_data use_device(v) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !$acc host_data use_device(w) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + ! (uncomment for profiling) + ! call nvtxEndRange + !######################################################################################################################################## + ! END STEP 6: USTAR COMPUTATION + !######################################################################################################################################## + + + + !######################################################################################################################################## + ! START STEP 7: POISSON SOLVER FOR PRESSURE + !######################################################################################################################################## + ! 7.1 Compute rhs of Poisson equation div(u*): divergence at the cell center + ! (uncomment for profiling) + ! call nvtxStartRange("Poisson") + ! call nvtxStartRange("compute RHS") + !$acc kernels + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + ip=i+1 + jp=j+1 + kp=k+1 + if (ip > nx) ip=1 + rhsp(i,j,k) = (rho*dxi/dt)*(u(ip,j,k)-u(i,j,k)) + rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(v(i,jp,k)-v(i,j,k)) + rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(w(i,j,kp)-w(i,j,k)) + enddo + enddo + enddo + !$acc end kernels + ! (uncomment for profiling) + ! call nvtxEndRange + ! call nvtxStartRange("FFT forward w/ transpositions") + !$acc host_data use_device(rhsp) + status = cufftExecD2Z(planXf, rhsp, psi_d) + if (status /= CUFFT_SUCCESS) write(*,*) 'X forward error: ', status + !$acc end host_data + + ! psi(kx,y,z) -> psi(y,z,kx) + CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,piX_d2z%halo_extents, [0,0,0]))
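! NOTE (illustrative): the 3D FFT is assembled from 1D batched transforms plus + ! cuDecomp transpositions: D2Z along x on the x-pencil, transpose X->Y, Z2Z along y, + ! transpose Y->Z, Z2Z along z; the backward path mirrors these steps in reverse order. + !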
psi(y,z,kx) -> psi(ky,z,kx) + status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_FORWARD) + if (status /= CUFFT_SUCCESS) write(*,*) 'Y forward error: ', status + ! psi(ky,z,kx) -> psi(z,kx,ky) + CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX)) + ! psi(z,kx,ky) -> psi(kz,kx,ky) + status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_FORWARD) + if (status /= CUFFT_SUCCESS) write(*,*) 'Z forward error: ', status + ! END of FFT3D forward + + ! call nvtxEndRange + np(piZ_d2z%order(1)) = piZ_d2z%shape(1) + np(piZ_d2z%order(2)) = piZ_d2z%shape(2) + np(piZ_d2z%order(3)) = piZ_d2z%shape(3) + call c_f_pointer(c_devloc(psi_d), phi3d, piZ_d2z%shape) + ! divide by -K**2, and normalize + offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1 + offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1 + offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1 + xoff = offsets(1) + yoff = offsets(2) + npx = np(1) + npy = np(2) + ! call nvtxStartRange("Solution") + !$acc kernels + do jl = 1, npy + jg = yoff + jl + do il = 1, npx + ig = xoff + il + do k = 1, nz + k2 = kx_d(ig)**2 + kx_d(jg)**2 + kx_d(k)**2 + phi3d(k,il,jl) = -phi3d(k,il,jl)/k2/(int(nx,8)*int(ny,8)*int(nz,8)) + enddo + enddo + enddo + ! specify mean (corrects division by zero wavenumber above) + if (xoff == 0 .and. yoff == 0) phi3d(1,1,1) = 0.0 + !$acc end kernels + ! call nvtxEndRange + ! call nvtxStartRange("FFT backwards w/ transpositions") + + ! psi(kz,kx,ky) -> psi(z,kx,ky) + status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_INVERSE) + if (status /= CUFFT_SUCCESS) write(*,*) 'Z inverse error: ', status + ! psi(z,kx,ky) -> psi(ky,z,kx) + CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX)) + ! psi(ky,z,kx) -> psi(y,z,kx) + status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_INVERSE) + if (status /= CUFFT_SUCCESS) write(*,*) 'Y inverse error: ', status + ! psi(y,z,kx) -> psi(kx,y,z) + CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,[0,0,0], piX_d2z%halo_extents)) + !$acc host_data use_device(p) + ! psi(kx,y,z) -> psi(x,y,z) + status = cufftExecZ2D(planXb, psi_d, p) + if (status /= CUFFT_SUCCESS) write(*,*) 'X inverse error: ', status + !$acc end host_data + ! call nvtxEndRange + + !$acc host_data use_device(p) + ! update halo nodes with pressure (needed for the pressure correction step), using device variable no need to use host-data + ! Update X-pencil halos in Y and Z direction + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + ! call nvtxEndRange + !######################################################################################################################################## + ! END STEP 7: POISSON SOLVER FOR PRESSURE + !######################################################################################################################################## + + + + !######################################################################################################################################## + ! START STEP 8: VELOCITY CORRECTION + ! ######################################################################################################################################## + ! 8.1 Correct velocity + ! 
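! NOTE (illustrative): in spectral space the pressure is obtained algebraically as + ! p_hat = -rhsp_hat/(kx^2+ky^2+kz^2), plus the 1/(nx*ny*nz) FFT normalization and + ! with the k=0 mode pinned to zero; the correction below, u = u* - dt/rho*dp/dx etc., + ! projects the velocity onto the divergence-free space, after which the CFL number + ! cou = gumax*dt*dxi is monitored and the run aborts if it exceeds 7 or is NaN/inf.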
! 8.2 Remove mean velocity if using ABC forcing + ! 8.3 Call halo exchanges along Y and Z for u,v,w + ! Correct velocity; pressure also carries halo values + ! call nvtxStartRange("Correction") + + !$acc kernels + umean=0.d0 + vmean=0.d0 + wmean=0.d0 + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i = 1, piX%shape(1) ! equal to nx (no halo on x) + im=i-1 + jm=j-1 + km=k-1 + if (im < 1) im=nx + u(i,j,k)=u(i,j,k) - dt/rho*(p(i,j,k)-p(im,j,k))*dxi + v(i,j,k)=v(i,j,k) - dt/rho*(p(i,j,k)-p(i,jm,k))*dxi + w(i,j,k)=w(i,j,k) - dt/rho*(p(i,j,k)-p(i,j,km))*dxi + umean=umean + u(i,j,k) + vmean=vmean + v(i,j,k) + wmean=wmean + w(i,j,k) + enddo + enddo + enddo + !$acc end kernels + + ! Remove mean velocity (get local mean of the rank) + + ! Divide by total number of points in the pencil + umean=umean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext) + vmean=vmean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext) + wmean=wmean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext) + + ! Find global mean (MPI_SUM and then divide by number of ranks) + call MPI_Allreduce(umean,gumean,1,MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD, ierr) + call MPI_Allreduce(vmean,gvmean,1,MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD, ierr) + call MPI_Allreduce(wmean,gwmean,1,MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD, ierr) + + ! remove mean value + !$acc kernels + u=u-(gumean/ranks) + v=v-(gvmean/ranks) + w=w-(gwmean/ranks) + !$acc end kernels + + ! 8.3 update halos (y and z directions), required to then compute the RHS of Poisson equation because of staggered grid + !$acc host_data use_device(u) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !$acc host_data use_device(v) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !$acc host_data use_device(w) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + + ! find local maximum velocity + uc=maxval(u) + vc=maxval(v) + wc=maxval(w) + umax=max(wc,max(uc,vc)) + call MPI_Allreduce(umax,gumax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr) + cou=gumax*dt*dxi + if (rank.eq.0) then + write(*,*) "CFL (max among tasks)", cou + write(*,*) "umax", gumax + if (cou .gt. 7) stop + if (.not. ieee_is_finite(cou)) then + print *, "Error: cou is infinite or NaN!" + stop + end if + endif + + call cpu_time(timef) + if (rank.eq.0) print '(" Time elapsed = ",f8.1," ms")',1000*(timef-times) + + ! (uncomment for profiling) + ! call nvtxEndRange + + !########################################################################################################################################
! END STEP 8: VELOCITY CORRECTION + !######################################################################################################################################## + + + !######################################################################################################################################## + ! START STEP 9: OUTPUT FIELDS + ! ######################################################################################################################################## + if (mod(t,dump) .eq. 0) then + if (rank .eq. 0) write(*,*) "Saving output files" + ! write velocity and pressure fields (1-4) + call writefield(t,1) + call writefield(t,2) + call writefield(t,3) + call writefield(t,4) + #if phiflag == 1 + ! write phase-field (5) + call writefield(t,5) + #endif + endif + !######################################################################################################################################## + ! END STEP 9: OUTPUT FIELDS + !######################################################################################################################################## + +! (uncomment for profiling) +! call nvtxEndRange + +enddo +call cpu_time(t_end) +elapsed = t_end-t_start +if (rank .eq. 0) write(*,*) 'Elapsed time (seconds):', elapsed +#if partflag == 1 +!$acc end data +!$acc end data +!$acc end data +#endif +!$acc end data +!$acc end data +!$acc end data + +#if partflag == 1 +! Particle variables +deallocate(part) +deallocate(partbuff) +deallocate(vec_p) +deallocate(order_p) +deallocate(buffvar1) +deallocate(buffvar2) +#endif + +! Deallocate work arrays (keep this list in sync when adding new variables) +deallocate(x_ext) +deallocate(u,v,w) +deallocate(tanh_psi, mysin, mycos) +deallocate(rhsu,rhsv,rhsw) +deallocate(rhsu_o,rhsv_o,rhsw_o) +deallocate(phi,rhsphi,normx,normy,normz) +deallocate(phi_tmp, rhsphik2, rhsphik3, rhsphik4) + +call mpi_finalize(ierr) + +end program main \ No newline at end of file diff --git a/hit/mn5.sh b/hit/mn5.sh new file mode 100644 index 0000000..12a62b4 --- /dev/null +++ b/hit/mn5.sh @@ -0,0 +1,5 @@ +module purge +module load nvidia-hpc-sdk/24.3 +cp Makefile_mn5 Makefile +make clean +make \ No newline at end of file diff --git a/hit/module.f90 b/hit/module.f90 new file mode 100755 index 0000000..1a25def --- /dev/null +++ b/hit/module.f90 @@ -0,0 +1,82 @@ +module param + integer, parameter :: nx=512 + integer :: ny=nx,nz=nx + double precision :: pi,lx,dx,dxi,ddxi,rhoi,twopi + integer :: restart,tstart,tfin,dump + double precision :: gamma, normag + double precision :: dt,mu,rho ! flow parameters + integer :: inflow, inphi + double precision :: f1,f2,f3,k0 ! forcing parameters + double precision :: radius, sigma, epsr, eps, pos, val, epsi, enum ! phase-field parameters + double precision :: times,timef + ! other variables (wavenumber, grid location) + real(8), allocatable :: x(:), kx(:) + real(8), allocatable :: x_ext(:) + double precision :: yinf, ysup ! Inf and Sup of y in PiX (no halo) + double precision :: zinf, zsup ! Inf and Sup of z in PiX (no halo) + double precision :: lyloc, lzloc ! Local extent along y and z in PiX (no halo) + +end module param + + +module mpivar + ! MPI variables + integer :: rank, ranks, ierr + integer :: localRank, localComm + integer :: nidp1y, nidm1y, nidp1z, nidm1z +end module mpivar + + +module cudecompvar + use cudecomp + integer :: npx, npy, npz + type(cudecompHandle) :: handle + type(cudecompGridDesc) :: grid_desc,grid_descD2Z + type(cudecompGridDescConfig) :: config + type(cudecompGridDescAutotuneOptions) :: options + integer :: pdims(2) ! pr x pc pencils
integer :: gdims(3) ! global grid dimensions + integer :: halo(3) ! halo extensions + integer :: halo_ext ! 0 no halo, 1 means 1 halo + type(cudecompPencilInfo) :: piX, piY, piZ ! size of the pencils in x- y- and z-configuration + type(cudecompPencilInfo) :: piX_d2z, piY_d2z, piZ_d2z ! size of the pencils in x- y- and z-configuration for D2Z + type(cudecompPencilInfo) :: piX_Poiss + integer(8) :: nElemX, nElemY, nElemZ, nElemWork, nElemWork_halo,nElemWork_halo_d2z + integer(8) :: nElemX_d2z, nElemY_d2z, nElemZ_d2z, nElemWork_d2z + logical :: halo_periods(3) + integer :: pix_yoff,pix_zoff +end module cudecompvar + + +module velocity + double precision, allocatable :: u(:,:,:), v(:,:,:), w(:,:,:) + double precision, allocatable :: rhsu(:,:,:), rhsv(:,:,:), rhsw(:,:,:) + double precision, allocatable :: rhsu_o(:,:,:), rhsv_o(:,:,:), rhsw_o(:,:,:) + complex(8), allocatable :: rhsp_complex(:,:,:) + double precision, allocatable :: rhsp(:,:,:), p(:,:,:) + double precision, allocatable :: rhspp(:,:,:), pp(:,:,:) + double precision, allocatable :: div(:,:,:) + double precision :: uc, vc, wc, umax, gumax=1.0d0, cou, alpha, beta + double precision :: h11, h12, h13, h21, h22, h23, h31, h32, h33 + double precision :: umean, vmean, wmean, gumean, gvmean, gwmean + double precision, allocatable :: mysin(:), mycos(:) +end module velocity + + +module phase + double precision, allocatable :: phi(:,:,:), rhsphi(:,:,:), psidi(:,:,:), tanh_psi(:,:,:) + double precision, allocatable :: normx(:,:,:), normy(:,:,:), normz(:,:,:) + double precision, allocatable :: normx_f(:,:,:), normy_f(:,:,:), normz_f(:,:,:) + double precision :: curv + double precision, allocatable :: fxst(:,:,:), fyst(:,:,:), fzst(:,:,:) + double precision, allocatable :: rhsphik1(:,:,:),rhsphik2(:,:,:), rhsphik3(:,:,:), rhsphik4(:,:,:), phi_tmp(:,:,:) + double precision :: normx_xm,normx_xp,normx_ym,normx_yp,normx_zm,normx_zp + double precision :: normy_xm,normy_xp,normy_ym,normy_yp,normy_zm,normy_zp + double precision :: normz_xm,normz_xp,normz_ym,normz_yp,normz_zm,normz_zp + double precision :: rn_01,rn_11,rn_02,rn_12,rn_03,rn_13 + double precision :: sharpxm,sharpxp,sharpym,sharpyp,sharpzm,sharpzp +end module phase + + + + + diff --git a/hit/particles.f90 b/hit/particles.f90 new file mode 100644 index 0000000..5e7d0ea --- /dev/null +++ b/hit/particles.f90 @@ -0,0 +1,382 @@ +module particles + use sort + implicit none + ! Total number of particles + integer:: npart + ! Local (per process) number of particles + integer :: nploc + ! Estimated maximum number of particles per process + integer :: nplocmax + ! Array containing the info of the particles (each row is a particle, each column one field) + double precision, allocatable :: part(:,:) + ! Buffer for particle array + double precision, allocatable :: partbuff(:,:) + ! Number of Info per particle + integer, parameter :: ninfop = 12 + !....Indices for particle info + integer, parameter :: Itag=1 ! Col 1: Tag of the particle + integer, parameter :: Ixp=2,Iyp=3,Izp=4 ! Col 2 to 4: Current x,y,z of the particle + integer, parameter :: Iup=5,Ivp=6,Iwp=7 ! Col 5 to 7: Current u,v,w of the particle + integer, parameter :: Iphip=8 ! Col 8: phi at the current location of the particle + integer, parameter :: Iup1=9,Ivp1=10,Iwp1=11 ! Col 9 to 11: Previous (step) u,v,w of the particle + integer, parameter :: Iphip1=12 ! Col 12: Previous (step) phi of the particle +
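! NOTE (illustrative): part is addressed as part(particle, field), e.g. part(i,Ixp) + ! is the x-position of local particle i and part(i,Iup1) its velocity at the previous + ! step; Itag stores a globally unique identifier assigned in particlegenerator. + !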
Array for particle ordering + double precision, allocatable :: vec_p(:) + integer, allocatable :: order_p(:) + integer :: inpart + + ! Number of Particles to be Transferred + integer::Nsendp1,Nsendm1,Nrecvp1,Nrecvm1 + + ! Buffer for particle transfer + double precision, allocatable,dimension(:,:)::buffvar1,buffvar2 + + contains +!*********************************************************************** +subroutine particlegenerator(inflag) + use param + use mpivar + use cudecompvar + implicit none + integer :: inflag + integer :: i + + !....Particle Tag + do i = 1, nploc + part(i,Itag) = dble(i) + part(i,Itag) = dble(i) + rank * nploc + enddo + + + if(inflag.eq.1)then + !....Generate xp position + call RANDOM_SEED() + call RANDOM_NUMBER(vec_p) + do i = 1, nploc + part(i,Ixp) = vec_p(i) * lx + enddo + + !....Generate yp position + call RANDOM_SEED() + call RANDOM_NUMBER(vec_p) + do i = 1, nploc + part(i,Iyp) = vec_p(i) * lyloc + yinf + end do + + !....Generate zp position + call RANDOM_SEED() + call RANDOM_NUMBER(vec_p) + do i = 1, nploc + part(i,Izp) = vec_p(i) * lzloc + zinf + end do + endif + + return +end subroutine +!*********************************************************************** +subroutine linearinterpolation + use velocity + use param + use cudecompvar + implicit none + integer :: i + double precision :: xpt, ypt, zpt + integer :: ix, iy, iz, ixplus1 + double precision :: tx, ty, tz + + !$acc parallel loop present(part,u,v,w) private(xpt,ypt,zpt,ix,iy,iz,tx,ty,tz) + do i = 1, nploc + ! Particle position + xpt = part(i,Ixp) + ypt = part(i,Iyp) + zpt = part(i,Izp) + + ! Global left-cell indices (uniform, fully periodic) + ix = floor( xpt / dx ) + iy = floor( ypt / dx ) + iz = floor( zpt / dx ) + + ! Local fractions in [0,1) + tx = xpt/dx - dble(ix) + ty = ypt/dx - dble(iy) + tz = zpt/dx - dble(iz) + + ! Map to 1-based global indices and wrap, then to local by offsets + ix = 1 + ix + ixplus1 = ix + 1 + ! Periodicity + if(ixplus1>nx) ixplus1 = 1 + ! Periodicity is already accounted by the halos + iy = 1 + iy - pix_yoff + iz = 1 + iz - pix_zoff + + ! 1-D linear per component (MAC: faces along native dir) + part(i,Iup) = (1.0d0 - tx)*u(ix , iy , iz ) + tx*u(ixplus1 , iy , iz ) + part(i,Ivp) = (1.0d0 - ty)*v(ix , iy , iz ) + ty*v(ix , iy+1 , iz ) + part(i,Iwp) = (1.0d0 - tz)*w(ix , iy , iz ) + tz*w(ix , iy , iz+1 ) + end do + !$acc end parallel loop + + return +end subroutine +!*********************************************************************** +subroutine SortPartY() + implicit none + integer :: i, j + + !$acc parallel loop collapse(2) + do j = 1, ninfop + do i = 1, nploc + partbuff(i,j) = part(i,j) + vec_p(i) = part(i,Iyp) + order_p(i) = i + enddo + enddo + + !$acc host_data use_device(vec_p,order_p) + call fsort(vec_p,order_p,nploc,.true.) + !$acc end host_data + + !$acc parallel loop collapse(2) + do j = 1, ninfop + do i = 1, nploc + part(i,j) = partbuff(order_p(i),j) + end do + end do + + return +end subroutine +!*********************************************************************** +subroutine SortPartZ() + implicit none + integer :: i, j + + !$acc parallel loop collapse(2) + do j = 1, ninfop + do i = 1, nploc + partbuff(i,j) = part(i,j) + vec_p(i) = part(i,Izp) + order_p(i) = i + enddo + enddo + + !$acc host_data use_device(vec_p,order_p) + call fsort(vec_p,order_p,nploc,.true.) 
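! (assumed behaviour of the sort module: fsort sorts vec_p in ascending order on the + ! device and returns in order_p the permutation of the original indices, which the + ! gather loop below uses to reorder all particle fields at once)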
+ !$acc end host_data + + !$acc parallel loop collapse(2) + do j = 1, ninfop + do i = 1, nploc + part(i,j) = partbuff(order_p(i),j) + end do + end do + + return +end subroutine +!*********************************************************************** +subroutine CountPartTransfY() + use param + implicit none + integer :: flag,scanner + integer :: i + integer :: addp,addm + + Nsendp1=0 !Number of particles to send to nid+1 (in Y) + Nrecvm1=0 !Number of particles to receive from nid-1 (in Y) + Nsendm1=0 !Number of particles to send to nid-1 (in Y) + Nrecvp1=0 !Number of particles to receive from nid+1 (in Y) + + + !$acc parallel loop default(present) private(addp,addm) reduction(+:Nsendp1,Nsendm1) + do i = 1, nploc + addp=0 + addm=0 + if(part(i,Iyp).GE.ysup) addp=1 + if(part(i,Iyp).LT.yinf) addm=1 + Nsendp1=Nsendp1+addp + Nsendm1=Nsendm1+addm + end do + + return +end subroutine +!*********************************************************************** +subroutine CountPartTransfZ() + use param + implicit none + integer :: flag,scanner + integer :: i + integer :: addp,addm + + Nsendp1=0 !Number of particles to send to nid+1 (in Z) + Nrecvm1=0 !Number of particles to receive from nid-1 (in Z) + Nsendm1=0 !Number of particles to send to nid-1 (in Z) + Nrecvp1=0 !Number of particles to receive from nid+1 (in Z) + + + !$acc parallel loop default(present) private(addp,addm) reduction(+:Nsendp1,Nsendm1) + do i = 1, nploc + addp=0 + addm=0 + if(part(i,Izp).GE.zsup) addp=1 + if(part(i,Izp).LT.zinf) addm=1 + Nsendp1=Nsendp1+addp + Nsendm1=Nsendm1+addm + end do + + return +end subroutine +!*********************************************************************** +subroutine SendPartUP(direction) + use mpi + use mpivar + implicit none + integer :: direction + integer::NsendMAX + integer,dimension(MPI_STATUS_SIZE):: status + integer :: ii, jj + integer :: nidp1, nidm1 + + if(direction.eq.2)then + nidp1 = nidp1y + nidm1 = nidm1y + elseif(direction.eq.3)then + nidp1 = nidp1z + nidm1 = nidm1z + endif + + !...Proc nid sends to proc nid+1 the number of particles + !...that nid+1 has to receive from nid + call MPI_Sendrecv(Nsendp1,1,MPI_INTEGER, nidp1, nidp1,& + Nrecvm1,1,MPI_INTEGER,nidm1,rank,MPI_COMM_WORLD,status,ierr) + + call MPI_Allreduce(Nsendp1,NsendMAX,1,MPI_INTEGER,MPI_MAX,MPI_COMM_WORLD,ierr) + + if(NsendMAX.GT.0)then + if(Nsendp1.GT.0)then + !$acc parallel loop collapse(2) + do jj=1,Nsendp1 + do ii=1,ninfop + buffvar1(ii,jj)=part(nploc-Nsendp1+jj,ii) + end do + end do + end if + + !$acc host_data use_device(buffvar1,buffvar2) + !....Proc nid sends to nid+1 and receives from nid-1 + call MPI_Sendrecv(buffvar1(1,1),int(ninfop*Nsendp1),MPI_DOUBLE_PRECISION,& + nidp1, nidp1,buffvar2(1,1),int(ninfop*Nrecvm1),MPI_DOUBLE_PRECISION,& + nidm1,rank,MPI_COMM_WORLD,STATUS,ierr) + !$acc end host_data + + if(Nrecvm1.GT.0)then + !....Each proc places received data in the first block + !$acc parallel loop collapse(2) + do jj=1,ninfop + do ii=1,Nrecvm1 + partbuff(ii,jj)=buffvar2(jj,ii) + end do + end do + end if + end if + + + !....Adjust central block (not transferred data) + !$acc parallel loop collapse(2) + do jj=1,ninfop + do ii=1,(nploc-Nsendm1-Nsendp1) + partbuff(Nrecvm1+ii,jj)=part(Nsendm1+ii,jj) + end do + end do + + return +end subroutine +!*********************************************************************** +subroutine SendPartDOWN(direction) + use mpi + use mpivar + implicit none + integer :: direction + integer::NsendMAX + integer,dimension(MPI_STATUS_SIZE):: status + integer :: ii, jj + integer :: nidp1, nidm1 +
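! NOTE (illustrative): mirror of SendPartUP for the opposite direction: (1) exchange + ! particle counts with MPI_Sendrecv, (2) pack the Nsendm1 leading rows of part into + ! buffvar1 and swap buffers with the neighbour ranks, (3) append the received rows to + ! partbuff and update nploc; particles were pre-sorted, so outgoing ones are contiguous.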
+ if(direction.eq.2)then + nidp1 = nidp1y + nidm1 = nidm1y + elseif(direction.eq.3)then + nidp1 = nidp1z + nidm1 = nidm1z + endif + + !...Proc nid send to proc nid-1 the number of particles + !...that nid-1 has to receive from nid + call MPI_Sendrecv(Nsendm1,1,MPI_INTEGER, nidm1, nidm1,& + Nrecvp1,1,MPI_INTEGER,nidp1,rank,MPI_COMM_WORLD,STATUS,ierr) + + call MPI_Allreduce(Nsendm1,NsendMAX,1,MPI_INTEGER,MPI_MAX,MPI_Comm_World,ierr) + + + if(NsendMAX.GT.0)then + if(Nsendm1.GT.0)then + !$acc parallel loop collapse(2) + do jj=1,Nsendm1 + do ii=1,ninfop + buffvar1(ii,jj)=part(jj,ii) + end do + end do + end if + + !$acc host_data use_device(buffvar1,buffvar2) + !....Proc nid send to nid-1 and receive from nid+1 + call MPI_Sendrecv(buffvar1(1,1),int(ninfop*Nsendm1),MPI_DOUBLE_PRECISION,& + nidm1, nidm1,buffvar2(1,1),int(ninfop*Nrecvp1),MPI_DOUBLE_PRECISION,& + nidp1, rank ,MPI_COMM_WORLD,STATUS,ierr) + !$acc end host_data + + if(Nrecvp1.GT.0)then + !....Each proc append received data + !$acc parallel loop collapse(2) + do jj=1,ninfop + do ii=1,Nrecvp1 + partbuff(ii+Nrecvm1+nploc-Nsendm1-Nsendp1,jj)=buffvar2(jj,ii) + end do + end do + end if + + end if + + !...Update Local Number of Paricles + nploc=nploc+Nrecvm1+Nrecvp1-Nsendp1-Nsendm1 + !$acc parallel loop collapse(2) + do jj=1,ninfop + do ii=1,nploc + part(ii,jj)=partbuff(ii,jj) + end do + end do + + + return +end subroutine +!*********************************************************************** +subroutine ParticlesLeakage() + use param + !....Particles which escape from the domain along x, y and z must + !....be reintroduced + implicit none + + !$acc kernels + !....Along x + part(1:nploc,Ixp)=part(1:nploc,Ixp)-lx*FLOOR(part(1:nploc,Ixp)/lx) + !....Along y + part(1:nploc,Iyp)=part(1:nploc,Iyp)-lx*FLOOR(part(1:nploc,Iyp)/lx) + !....Along z + part(1:nploc,Izp)=part(1:nploc,Izp)-lx*FLOOR(part(1:nploc,Izp)/lx) + !$acc end kernels + + return +end subroutine +!*********************************************************************** +end module particles \ No newline at end of file diff --git a/hit/readinput.f90 b/hit/readinput.f90 new file mode 100644 index 0000000..2dabaa2 --- /dev/null +++ b/hit/readinput.f90 @@ -0,0 +1,83 @@ + +!########################################################################## +!########################################################################### +subroutine readinput +use velocity +use phase +use param +use mpivar +use particles +implicit none + +open(unit=55,file='input.inp',form='formatted',status='old') +!Time step parameters +read(55,*) restart +read(55,*) tstart +read(55,*) tfin +read(55,*) dump +!Flow parameters +read(55,*) inflow +read(55,*) inphi +read(55,*) dt +read(55,*) mu +read(55,*) rho +! forcing parameters +read(55,*) f1 +read(55,*) f2 +read(55,*) f3 +read(55,*) k0 +! phase-field parameters +read(55,*) radius +read(55,*) sigma +read(55,*) epsr +! particle parameters +read(55,*) npart +read(55,*) inpart + + +! compute pre-defined constant +twopi=8.0_8*atan(1.0_8) +lx=twopi +! dx = lx/(nx-1) +dx = lx/(nx) + +dxi=1.d0/dx +ddxi=1.d0/dx/dx +rhoi=1.d0/rho +eps=epsr*dx +epsi=1.d0/eps +enum=1.e-16 + +if (rank .eq. 
0) then + !enable/disable for debug check parameters + write(*,*) "----------------------------------------------" + write(*,*) "███ ███ ██ ██ ██ ████████ ██████ ██████" + write(*,*) "████ ████ ██ ██ ██ ██ ██ ██" + write(*,*) "██ ████ ██ ███████ ██ ██ █████ ███████" + write(*,*) "██ ██ ██ ██ ██ ██ ██ ██ ██ ██" + write(*,*) "██ ██ ██ ██ ██ ██ ██████ ██████" + write(*,*) "----------------------------------------------" + write(*,*) 'Grid:', nx, 'x', nx, 'x', nx + write(*,*) "Restart ", restart + write(*,*) "Tstart ", tstart + write(*,*) "Tfin ", tfin + write(*,*) "Dump ", dump + write(*,*) "Inflow ", inflow + write(*,*) "Deltat ", dt + write(*,*) "Mu ", mu + write(*,*) "Rho ", rho + write(*,*) "f1,f2,f3,k0", f1,f2,f3,k0 + write(*,*) "Radius ", radius + write(*,*) "Sigma ", sigma + write(*,*) "Eps ", eps + write(*,*) "Epsi ", epsi + write(*,*) "Lx ", lx + write(*,*) "dx", dx + write(*,*) "dxi", dxi + write(*,*) "ddxi", ddxi + write(*,*) "rhoi", rhoi +endif + +end subroutine + + diff --git a/hit/readwrite.f90 b/hit/readwrite.f90 new file mode 100644 index 0000000..44df20f --- /dev/null +++ b/hit/readwrite.f90 @@ -0,0 +1,318 @@ +subroutine writefield(t,fieldn) +! Output field, file is written in the /output folder + +use velocity +use phase +use mpi +use mpivar +use param +use cudecompvar + +implicit none + +integer :: g_size(3),p_size(3),fstart(3) +integer :: t,fieldn +character(len=40) :: namefile +integer(mpi_offset_kind) :: offset=0 +integer :: f_handle ! file handle +integer :: ftype +double precision, allocatable :: out(:,:,:) + +! fieldn=1 means u +! fieldn=2 means v +! fieldn=3 means w +! fieldn=4 means p +! fieldn=5 means phi + +! define basic quantities to be used later (gloabl and pencil size) +g_size=[nx, ny, nz] ! global size +p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x +fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] +! for debug +!write(*,*) "g_size", g_size +!write(*,*) "p_size", p_size +!write(*,*) "fstart", fstart +allocate(out(p_size(1),p_size(2),p_size(3))) !<- halo removed + +!write(*,*) "in readwrite" + +if (fieldn .eq. 1) then + out=u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo) + write(namefile,'(a,i8.8,a)') './output/u_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) +endif + +if (fieldn .eq. 2) then + out=v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo) + write(namefile,'(a,i8.8,a)') './output/v_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) +endif + +if (fieldn .eq. 
3) then + out=w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo) + write(namefile,'(a,i8.8,a)') './output/w_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) +endif + +if (fieldn .eq. 4) then + out=p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo) + write(namefile,'(a,i8.8,a)') './output/p_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) +endif + +if (fieldn .eq. 5) then + out=phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo) + write(namefile,'(a,i8.8,a)') './output/phi_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) +endif + +deallocate(out) + +end subroutine + + + + + + + + + + +subroutine readfield(fieldn) +! Input field, file is written in the /input folder + +use velocity +use phase +use mpi +use mpivar +use param +use cudecompvar + +implicit none + +integer :: g_size(3),p_size(3),fstart(3) +integer :: fieldn +character(len=40) :: namefile +integer(mpi_offset_kind) :: offset=0 +integer :: f_handle ! file handle +integer :: ftype +double precision, allocatable :: in(:,:,:) + +! fieldn=1 means u +! fieldn=2 means v +! fieldn=3 means w +! fieldn=4 means p +! fieldn=5 means phi + +! define basic quantities to be used later (gloabl and pencil size) +g_size=[nx, ny, nz] ! global size +p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x +fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and index start from 0 (not 1) +! for debug +!write(*,*) "g_size", g_size +!write(*,*) "p_size", p_size +!write(*,*) "fstart", fstart +allocate(in(p_size(1),p_size(2),p_size(3))) !<- no halos read + +!write(*,*) "in readwrite" + +if (fieldn .eq. 
1) then + namefile='./input/u.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 2) then + namefile='./input/v.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 3) then + namefile='./input/w.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 4) then + namefile='./input/p.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 
5) then + namefile='./input/phi.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +deallocate(in) + +end subroutine + + + + + + + + + + +subroutine readfield_restart(t,fieldn) +! Used in case of restart, file is read from the multi/output folder (iteration tstart must be present!) + +use velocity +use phase +use mpi +use mpivar +use param +use cudecompvar + +implicit none + +integer :: g_size(3),p_size(3),fstart(3) +integer :: t,fieldn +character(len=40) :: namefile +integer(mpi_offset_kind) :: offset=0 +integer :: f_handle ! file handle +integer :: ftype +double precision, allocatable :: in(:,:,:) + +! fieldn=1 means u +! fieldn=2 means v +! fieldn=3 means w +! fieldn=4 means p +! fieldn=5 means phi + +! define basic quantities to be used later (gloabl and pencil size) +g_size=[nx, ny, nz] ! global size +p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x +fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and index start from 0 (not 1) +! for debug +!write(*,*) "g_size", g_size +!write(*,*) "p_size", p_size +!write(*,*) "fstart", fstart +allocate(in(p_size(1),p_size(2),p_size(3))) !<- no halos read + +!write(*,*) "in readwrite" + +if (fieldn .eq. 1) then + write(namefile,'(a,i8.8,a)') './output/u_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 
2) then + write(namefile,'(a,i8.8,a)') './output/v_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 3) then + write(namefile,'(a,i8.8,a)') './output/w_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 4) then + write(namefile,'(a,i8.8,a)') './output/p_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +if (fieldn .eq. 
5) then + write(namefile,'(a,i8.8,a)') './output/phi_',t,'.dat' + call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr) + call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr) + call mpi_type_commit(ftype,ierr) + call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr) + !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr) + call mpi_file_close(f_handle,ierr) + phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos +endif + +deallocate(in) + +end subroutine \ No newline at end of file diff --git a/hit/testpush.sh b/hit/testpush.sh new file mode 100644 index 0000000..2e0e21a --- /dev/null +++ b/hit/testpush.sh @@ -0,0 +1,2 @@ +make clean +make diff --git a/multi/main.f90 b/multi/main.f90 old mode 100755 new mode 100644 index 67dd398..beb351f --- a/multi/main.f90 +++ b/multi/main.f90 @@ -285,11 +285,13 @@ program main allocate(div(piX%shape(1),piX%shape(2),piX%shape(3))) !PFM variables #if phiflag == 1 - allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi_o(piX%shape(1),piX%shape(2),piX%shape(3))) + allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3))) allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3))) allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3))) allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3))) + allocate(normx_f(piX%shape(1),piX%shape(2),piX%shape(3)),normy_f(piX%shape(1),piX%shape(2),piX%shape(3)),normz_f(piX%shape(1),piX%shape(2),piX%shape(3))) allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces + allocate(phi_tmp(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik2(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik3(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik4(piX%shape(1),piX%shape(2),piX%shape(3))) #endif ! allocate arrays for transpositions and halo exchanges @@ -439,9 +441,9 @@ program main ! First step use Euler alpha=1.0d0 beta=0.0d0 -gumax=1.d0 +gumax=8.0d0 tstart=tstart+1 -gamma=1.d0*gumax +gamma=1.d0*gumax ! initial gamma for phase-field !$acc data copyin(piX) !$acc data create(rhsu_o, rhsv_o, rhsw_o) !$acc data copyin(mysin, mycos) @@ -462,7 +464,6 @@ program main if (rank.eq.0) write(*,*) "Time step",t,"of",tfin call cpu_time(times) - !######################################################################################################################################## ! START STEP 4: PARTICLES (TRACERS) !######################################################################################################################################## @@ -519,6 +520,8 @@ program main ! START STEP 5: PHASE-FIELD SOLVER (EXPLICIT) !######################################################################################################################################## #if phiflag == 1 + ! 4.2 Get phi at n+1 using RK4 + ! 
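! NOTE (illustrative): classical RK4 for dphi/dt = L(phi): k1 = L(phi^n), + ! k2 = L(phi^n + dt/2*k1), k3 = L(phi^n + dt/2*k2), k4 = L(phi^n + dt*k3), and + ! phi^(n+1) = phi^n + dt/6*(k1 + 2*k2 + 2*k3 + k4); every stage below refreshes the + ! halos, the signed-distance function psidi and the normals before evaluating the rhs. + !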
first stage of RK4 - saved in rhsphi !$acc kernels do k=1, piX%shape(3) do j=1, piX%shape(2) @@ -533,7 +536,8 @@ program main enddo !$acc end kernels - gamma=1.d0*gumax + gamma=1.d0*gumax ! update gamma every time step with the current max velocity value + if (rank.eq.0) write(*,*) "gamma:", gamma !$acc parallel loop tile(16,4,2) do k=1+halo_ext, piX%shape(3)-halo_ext do j=1+halo_ext, piX%shape(2)-halo_ext @@ -547,14 +551,160 @@ program main km=k-1 if (ip .gt. nx) ip=1 if (im .lt. 1) im=nx - ! convective (first three lines) and diffusive (last three lines) + ! convective (first six lines) and diffusive (last three lines) + ! flux-splitting rhsphi(i,j,k) = & - - (u(ip,j,k)*0.5d0*(phi(ip,j,k)+phi(i,j,k)) - u(i,j,k)*0.5d0*(phi(i,j,k)+phi(im,j,k)))*dxi & - - (v(i,jp,k)*0.5d0*(phi(i,jp,k)+phi(i,j,k)) - v(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,jm,k)))*dxi & - - (w(i,j,kp)*0.5d0*(phi(i,j,kp)+phi(i,j,k)) - w(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,j,km)))*dxi & + - (max(u(ip,j,k),0.0d0)*phi(i,j,k) + min(u(ip,j,k),0.0d0)*phi(ip,j,k) - & + min(u(i,j,k),0.0d0)*phi(i,j,k) - max(u(i,j,k),0.0d0)*phi(im,j,k))*dxi & + - (max(v(i,jp,k),0.0d0)*phi(i,j,k) + min(v(i,jp,k),0.0d0)*phi(i,jp,k) - & + min(v(i,j,k),0.0d0)*phi(i,j,k) - max(v(i,j,k),0.0d0)*phi(i,jm,k))*dxi & + - (max(w(i,j,kp),0.0d0)*phi(i,j,k) + min(w(i,j,kp),0.0d0)*phi(i,j,kp) - & + min(w(i,j,k),0.0d0)*phi(i,j,k) - max(w(i,j,k),0.0d0)*phi(i,j,km))*dxi & + gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + & eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddxi + & - eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi) + eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi) + ! 4.1.3. Compute normals for sharpening term (gradient) + normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k)) + normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k)) + normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km)) + enddo + enddo + enddo + + ! Update normx,normy and normz halos, required to then compute normal derivative + !$acc host_data use_device(normx) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !$acc host_data use_device(normy) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !$acc host_data use_device(normz) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + + ! Substep 2: Compute normals (1.e-16 is a numerical tollerance to avoid 0/0) for surface tension force computation later + !$acc kernels + do k=1, piX%shape(3) + do j=1, piX%shape(2) + do i=1,nx + normag = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + enum) + normx_f(i,j,k) = normx(i,j,k)*normag + normy_f(i,j,k) = normy(i,j,k)*normag + normz_f(i,j,k) = normz(i,j,k)*normag + enddo + enddo + enddo + !$acc end kernels + + ! 
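! NOTE (illustrative): the ACDI right-hand side evaluated above combines an upwind + ! (flux-split) convective term, a diffusive term gamma*eps*laplacian(phi), and the + ! sharpening flux -gamma*div(0.25*(1-tanh^2(psi/(2*eps)))*n_hat) discretized next with + ! face-centred normals; gamma is rescaled each step with the maximum velocity gumax. + !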
Compute sharpening term flux split like CaNS-Fizzy + !$acc kernels + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + ip=i+1 + jp=j+1 + kp=k+1 + im=i-1 + jm=j-1 + km=k-1 + if (ip .gt. nx) ip=1 + if (im .lt. 1) im=nx + normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k)) + normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k)) + normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k)) + normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k)) + normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k)) + normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k)) + normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k)) + normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k)) + normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k)) + normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k)) + normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k)) + normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k)) + normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k)) + normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k)) + normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k)) + normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k)) + normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k)) + normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k)) + ! sharpening term + ! + rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum) + rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum) + rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum) + rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum) + rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum) + rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum) + ! + sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01) + sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11) + sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02) + sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12) + sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03) + sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13) + ! + rhsphi(i,j,k)=rhsphi(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm)) + enddo + enddo + enddo + !$acc end kernels + + ! second stage of RK4 - saved in rhsphik2 + !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphi) + do k=1, piX%shape(3) + do j=1, piX%shape(2) + do i=1,nx + phi_tmp(i,j,k) = phi(i,j,k) + 0.5d0 * dt * rhsphi(i,j,k) + enddo + enddo + enddo + !$acc end parallel loop + ! 4.3 Call halo exchanges along Y and Z for phi + !$acc host_data use_device(phi_tmp) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !$acc kernels + do k=1, piX%shape(3) + do j=1, piX%shape(2) + do i=1,nx + ! compute distance function psi (used to compute normals) + val = min(phi_tmp(i,j,k),1.0d0) ! avoid machine precision overshoots in phi that leads to problem with log + psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum)) + ! 
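! NOTE (illustrative): psidi inverts the equilibrium profile phi = 0.5*(1+tanh(psi/(2*eps))), + ! i.e. psi = eps*log(phi/(1-phi)); enum = 1e-16 guards the log against phi = 0 or 1, and + ! clipping val at 1.0 removes machine-precision overshoots of phi. + !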
compute here the tanh of distance function psi (used in the sharpening term) to avoid multiple computations of tanh + tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi) + enddo + enddo + enddo + !$acc end kernels + !$acc parallel loop tile(16,4,2) + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + ! 4.1 RHS computation + ip=i+1 + jp=j+1 + kp=k+1 + im=i-1 + jm=j-1 + km=k-1 + if (ip .gt. nx) ip=1 + if (im .lt. 1) im=nx + ! convective (first three lines) and diffusive (last three lines) + ! flux-splitting + rhsphik2(i,j,k) = & + - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - & + min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi & + - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - & + min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi & + - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - & + min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi & + + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + & + eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + & + eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi) ! 4.1.3. Compute normals for sharpening term (gradient) normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k)) normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k)) @@ -577,20 +727,133 @@ program main CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) !$acc end host_data - ! 4.1.3. Compute Sharpening term (gradient) - ! Substep 2: Compute normals (1.e-16 is a numerical tollerance to avoid 0/0) + ! Compute sharpening term !$acc kernels + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + ip=i+1 + jp=j+1 + kp=k+1 + im=i-1 + jm=j-1 + km=k-1 + if (ip .gt. nx) ip=1 + if (im .lt. 1) im=nx + normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k)) + normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k)) + normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k)) + normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k)) + normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k)) + normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k)) + normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k)) + normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k)) + normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k)) + normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k)) + normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k)) + normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k)) + normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k)) + normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k)) + normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k)) + normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k)) + normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k)) + normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k)) + ! sharpening term + ! + rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum) + rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum) + rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum) + rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum) + rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum) + rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum) + ! 
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01) + sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11) + sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02) + sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12) + sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03) + sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13) + ! + rhsphik2(i,j,k)=rhsphik2(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm)) + enddo + enddo + enddo + !$acc end kernels + + ! third stage of RK4 - saved in rhsphik3 + !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphik2) do k=1, piX%shape(3) do j=1, piX%shape(2) do i=1,nx - normag = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + enum) - normx(i,j,k) = normx(i,j,k)*normag - normy(i,j,k) = normy(i,j,k)*normag - normz(i,j,k) = normz(i,j,k)*normag + phi_tmp(i,j,k) = phi(i,j,k) + 0.5d0 * dt * rhsphik2(i,j,k) + enddo + enddo + enddo + !$acc end parallel loop + ! 4.3 Call halo exchanges along Y and Z for phi + !$acc host_data use_device(phi_tmp) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2)) + CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3)) + !$acc end host_data + !$acc kernels + do k=1, piX%shape(3) + do j=1, piX%shape(2) + do i=1,nx + ! compute distance function psi (used to compute normals) + val = min(phi_tmp(i,j,k),1.0d0) ! avoid machine precision overshoots in phi that leads to problem with log + psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum)) + ! compute here the tanh of distance function psi (used in the sharpening term) to avoid multiple computations of tanh + tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi) enddo enddo enddo !$acc end kernels + !$acc parallel loop tile(16,4,2) + do k=1+halo_ext, piX%shape(3)-halo_ext + do j=1+halo_ext, piX%shape(2)-halo_ext + do i=1,nx + ! 4.1 RHS computation + ip=i+1 + jp=j+1 + kp=k+1 + im=i-1 + jm=j-1 + km=k-1 + if (ip .gt. nx) ip=1 + if (im .lt. 1) im=nx + ! convective (first three lines) and diffusive (last three lines) + ! flux-splitting + rhsphik3(i,j,k) = & + - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - & + min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi & + - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - & + min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi & + - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - & + min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi & + + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + & + eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + & + eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi) + ! 4.1.3. Compute normals for sharpening term (gradient) + normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k)) + normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k)) + normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km)) + enddo + enddo + enddo + + ! 
+  ! Update normx, normy and normz halos, required to compute the normal derivatives
+  !$acc host_data use_device(normx)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(normy)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(normz)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
   ! Compute sharpening term
   !$acc kernels
@@ -605,31 +868,186 @@ program main
         km=k-1
         if (ip .gt. nx) ip=1
         if (im .lt. 1) im=nx
-        ! ACDI with pre-computed tanh
-        rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-tanh_psi(ip,j,k)*tanh_psi(ip,j,k))*normx(ip,j,k) - &
-                      0.25d0*(1.d0-tanh_psi(im,j,k)*tanh_psi(im,j,k))*normx(im,j,k))*0.5d0*dxi + &
-                      (0.25d0*(1.d0-tanh_psi(i,jp,k)*tanh_psi(i,jp,k))*normy(i,jp,k) - &
-                      0.25d0*(1.d0-tanh_psi(i,jm,k)*tanh_psi(i,jm,k))*normy(i,jm,k))*0.5d0*dxi + &
-                      (0.25d0*(1.d0-tanh_psi(i,j,kp)*tanh_psi(i,j,kp))*normz(i,j,kp) - &
-                      0.25d0*(1.d0-tanh_psi(i,j,km)*tanh_psi(i,j,km))*normz(i,j,km))*0.5d0*dxi)
+        normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+        normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+        normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+        normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+        normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+        normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+        normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+        normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+        normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+        normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+        normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+        normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+        normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+        normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+        normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+        normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+        normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+        normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+        ! sharpening term
+        !
+        rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+        rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+        rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+        rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+        rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+        rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+        !
+        sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+        sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+        sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+        sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+        sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+        sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+        !
+        rhsphik3(i,j,k)=rhsphik3(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+      enddo
+    enddo
+  enddo
+  !$acc end kernels

+  ! fourth stage of RK4 - saved in rhsphik4
+  !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphik3)
+  do k=1, piX%shape(3)
+    do j=1, piX%shape(2)
+      do i=1,nx
+        phi_tmp(i,j,k) = phi(i,j,k) + dt * rhsphik3(i,j,k)
+      enddo
+    enddo
+  enddo
+  !$acc end parallel loop
+  ! 4.3 Call halo exchanges along Y and Z for phi
+  !$acc host_data use_device(phi_tmp)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc kernels
+  do k=1, piX%shape(3)
+    do j=1, piX%shape(2)
+      do i=1,nx
+        ! compute distance function psi (used to compute normals)
+        val = min(phi_tmp(i,j,k),1.0d0) ! avoid machine precision overshoots in phi that lead to problems with log
+        psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+        ! compute here the tanh of distance function psi (used in the sharpening term) to avoid multiple computations of tanh
+        tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+  !$acc parallel loop tile(16,4,2)
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ! 4.1 RHS computation
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        ! convective (flux-splitting, first six lines) and
+        ! diffusive (last three lines) contributions
+        rhsphik4(i,j,k) = &
+        - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+           min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+        - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+           min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+        - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+           min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+        + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+                 eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+                 eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+        ! 4.1.3. Compute normals for sharpening term (gradient)
+        normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+        normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+        normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+      enddo
+    enddo
+  enddo
-  ! 4.2 Get phi at n+1 using AB2
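+  ! (x is periodic and entirely local to the X-pencil, handled by the ip/im
+  !  index wrap inside the loops; only the decomposed y and z directions need
+  !  the cuDecomp halo exchanges below)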
+  ! Update normx, normy and normz halos, required to compute the normal derivatives
+  !$acc host_data use_device(normx)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(normy)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+  !$acc host_data use_device(normz)
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+  CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+  !$acc end host_data
+
+  ! Compute sharpening term
+  !$acc kernels
+  do k=1+halo_ext, piX%shape(3)-halo_ext
+    do j=1+halo_ext, piX%shape(2)-halo_ext
+      do i=1,nx
+        ip=i+1
+        jp=j+1
+        kp=k+1
+        im=i-1
+        jm=j-1
+        km=k-1
+        if (ip .gt. nx) ip=1
+        if (im .lt. 1) im=nx
+        normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+        normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+        normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+        normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+        normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+        normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+        normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+        normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+        normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+        normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+        normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+        normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+        normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+        normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+        normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+        normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+        normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+        normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+        ! sharpening term
+        !
+        rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+        rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+        rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+        rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+        rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+        rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+        !
+        sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+        sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+        sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+        sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+        sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+        sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+        !
+        rhsphik4(i,j,k)=rhsphik4(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+      enddo
+    enddo
+  enddo
+  !$acc end kernels
+
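+  ! Classical RK4 combination: phi^(n+1) = phi^n + dt/6*(k1 + 2*k2 + 2*k3 + k4),
+  ! with the four stage right-hand sides stored in rhsphi, rhsphik2, rhsphik3
+  ! and rhsphik4.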
+  ! Update phi with RK4
   !$acc kernels
   do k=1+halo_ext, piX%shape(3)-halo_ext
     do j=1+halo_ext, piX%shape(2)-halo_ext
       do i=1,nx
-        phi(i,j,k) = phi(i,j,k) + dt*(alpha*rhsphi(i,j,k)-beta*rhsphi_o(i,j,k))
-        rhsphi_o(i,j,k)=rhsphi(i,j,k)
+        phi(i,j,k) = phi(i,j,k) + dt/6.0d0*(rhsphi(i,j,k) + 2.0d0*rhsphik2(i,j,k) + &
+                     2.0d0*rhsphik3(i,j,k) + rhsphik4(i,j,k))
       enddo
     enddo
   enddo
   !$acc end kernels
-  ! 4.3 Call halo exchnages along Y and Z for phi
+  ! 4.3 Call halo exchanges along Y and Z for phi
   !$acc host_data use_device(phi)
   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
@@ -732,7 +1150,7 @@ program main
         if (ip .gt. nx) ip=1
         if (im .lt. 1) im=nx
         ! continuum-surface force implementation
-        curv=0.5d0*(normx(ip,j,k)-normx(im,j,k))*dxi + 0.5d0*(normy(i,jp,k)-normy(i,jm,k))*dxi + 0.5d0*(normz(i,j,kp)-normz(i,j,km))*dxi
+        curv=0.5d0*(normx_f(ip,j,k)-normx_f(im,j,k))*dxi + 0.5d0*(normy_f(i,jp,k)-normy_f(i,jm,k))*dxi + 0.5d0*(normz_f(i,j,kp)-normz_f(i,j,km))*dxi
         !compute capillary forces: sigma*curvature*gradphi
         fxst(i,j,k)= -sigma*curv*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi
         fyst(i,j,k)= -sigma*curv*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dxi
@@ -999,6 +1417,7 @@ program main
   cou=gumax*dt*dxi
   if (rank.eq.0) then
     write(*,*) "CFL (max among tasks)", cou
+    write(*,*) "umax", gumax
     if (cou .gt. 7) stop
     if (.not. ieee_is_finite(cou)) then
       print *, "Error: cou is infinite or NaN!"
@@ -1068,9 +1487,9 @@ program main
 deallocate(tanh_psi, mysin, mycos)
 deallocate(rhsu,rhsv,rhsw)
 deallocate(rhsu_o,rhsv_o,rhsw_o)
-deallocate(phi,rhsphi,rhsphi_o,normx,normy,normz)
+deallocate(phi,rhsphi,normx,normy,normz)
+deallocate(phi_tmp, rhsphik2, rhsphik3, rhsphik4)
 call mpi_finalize(ierr)
-end program main
-
+end program main
\ No newline at end of file
diff --git a/multi/module.f90 b/multi/module.f90
index 8168c1b..1a25def 100755
--- a/multi/module.f90
+++ b/multi/module.f90
@@ -64,10 +64,17 @@ end module velocity

 module phase
-  double precision, allocatable :: phi(:,:,:), rhsphi(:,:,:), rhsphi_o(:,:,:), psidi(:,:,:), tanh_psi(:,:,:)
+  double precision, allocatable :: phi(:,:,:), rhsphi(:,:,:), psidi(:,:,:), tanh_psi(:,:,:)
   double precision, allocatable :: normx(:,:,:), normy(:,:,:), normz(:,:,:)
+  double precision, allocatable :: normx_f(:,:,:), normy_f(:,:,:), normz_f(:,:,:)
   double precision :: curv
   double precision, allocatable :: fxst(:,:,:), fyst(:,:,:), fzst(:,:,:)
+  double precision, allocatable :: rhsphik1(:,:,:), rhsphik2(:,:,:), rhsphik3(:,:,:), rhsphik4(:,:,:), phi_tmp(:,:,:)
+  double precision :: normx_xm,normx_xp,normx_ym,normx_yp,normx_zm,normx_zp
+  double precision :: normy_xm,normy_xp,normy_ym,normy_yp,normy_zm,normy_zp
+  double precision :: normz_xm,normz_xp,normz_ym,normz_yp,normz_zm,normz_zp
+  double precision :: rn_01,rn_11,rn_02,rn_12,rn_03,rn_13
+  double precision :: sharpxm,sharpxp,sharpym,sharpyp,sharpzm,sharpzp
 end module phase