diff --git a/channel/Makefile_leonardo b/channel/Makefile_leonardo
new file mode 100644
index 0000000..ff923ef
--- /dev/null
+++ b/channel/Makefile_leonardo
@@ -0,0 +1,33 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+
+# Paths (Modify if necessary)
+ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))
+$(info $$ROOT_DIR is [${ROOT_DIR}])
+PARENT_DIR := $(abspath $(ROOT_DIR)/..)
+$(info $$PARENT_DIR is [${PARENT_DIR}])
+CUDECOMP_DIR = $(PARENT_DIR)/cuDecomp/build
+CUDA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none
+EXTRA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none/compat
+
+# make the cuDecomp shared library visible to sub-processes at build/run time
+export LD_LIBRARY_PATH := $(CUDECOMP_DIR)/lib:$(LD_LIBRARY_PATH)
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=mem:managed -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include/
+LIBS = -L$(CUDECOMP_DIR)/lib/ -L$(CUDA_DIR)/lib64/ -lcudecomp_fort -lcudecomp -cudalib=cufft -lnvToolsExt
+
+# Source and Object Files
+OBJS = module.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+ $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+ $(RM) $(EXEC)
+
+.SUFFIXES: .f90 .o
+
+.f90.o:
+ $(FC) -c $(FFLAGS) $<
diff --git a/channel/Makefile_local b/channel/Makefile_local
new file mode 100644
index 0000000..6cb5517
--- /dev/null
+++ b/channel/Makefile_local
@@ -0,0 +1,28 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+RM = /bin/rm -f
+
+# Paths (Modify if necessary)
+CUDECOMP_DIR = /home/milton/TCF36/cuDecomp/build/
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=mem:managed -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include
+LIBS = -L$(CUDECOMP_DIR)/lib -L/usr/local/cuda/lib64 -lcudecomp_fort -lcudecomp -cudalib=cufft
+
+# Source and Object Files
+OBJS = module.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+ $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+ $(RM) $(EXEC)
+
+.SUFFIXES: .f90 .o
+
+.f90.o:
+ $(FC) -c $(FFLAGS) $<
+
+
diff --git a/channel/README.md b/channel/README.md
new file mode 100644
index 0000000..e73f5c9
--- /dev/null
+++ b/channel/README.md
@@ -0,0 +1,31 @@
+# Channel configuration
+
+Code for turbulent channel flow simulations (closed channel).
+Solution of the Navier-Stokes equations + phase-field method (ACDI) + energy equation (passive scalar).
+For time integration, a fully explicit scheme is used: explicit Euler for ACDI and RK3 for Navier-Stokes and temperature.
+
+## Main features at a glance:
+- No-slip boundary conditions at the two walls ($z=\pm h$)
+- Poisson solver based on 2D FFTs along $x$ and $y$ + TDMA along $z$ (see the sketch below)
+- Non-uniform grid along $z$; any stretching function can be used (defined in readinput.f90)
+- Domain of arbitrary dimensions along all three directions
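+
+After the 2D FFT along $x$ and $y$, the Poisson equation decouples into one tridiagonal system per wavenumber pair $(k_x,k_y)$, solved with the Thomas algorithm along $z$ (the notation below is illustrative; the exact coefficients are in the TDMA setup of main.f90):
+
+$$\frac{1}{\Delta z_k}\left(\frac{\hat{p}_{k+1}-\hat{p}_k}{\Delta z_{k+1/2}}-\frac{\hat{p}_k-\hat{p}_{k-1}}{\Delta z_{k-1/2}}\right)-(k_x^2+k_y^2)\,\hat{p}_k=\hat{f}_k$$
+
+with Neumann conditions at both walls; the mean mode $k_x=k_y=0$ is pinned at one interior point to remove the null space.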
+
+**Autotuning of decomposition**
+- Default: `pr=0` and `pc=0` → enables **automatic process decomposition**.
+- Only input needed: **total number of MPI tasks**.
+- No recompilation required when changing the number of MPI processes (see the example below).
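+
+For example (hypothetical task counts), only the launch line changes between runs:
+
+```bash
+mpirun -np 4 ./mhit36   # decomposition autotuned at startup for 4 tasks
+mpirun -np 8 ./mhit36   # same binary, no recompilation
+```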
+
+**Optional features (conditional compilation)**
+- Phase-field module: can be enabled or disabled. Default is single-phase (`phiflag 0` in main.f90).
+- Passive scalar (temperature): can be enabled or disabled. Default is enabled (`thetaflag 1` in main.f90).
+
+## Turbulent channel flow
+- Turbulent channel flow at $Re_\tau=590$, Grid: $1536 \times 768 \times 576$
+
+
+
+## Node numbering and staggered grid
+
+X-pencil configuration:
+
+
diff --git a/channel/binder.sh b/channel/binder.sh
new file mode 100755
index 0000000..b449dda
--- /dev/null
+++ b/channel/binder.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
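+# bind each local MPI rank to a dedicated InfiniBand HCA (assumes a node with four NICs, one per GPU)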
+case $(( ${OMPI_COMM_WORLD_LOCAL_RANK} )) in
+0) export UCX_NET_DEVICES=mlx5_0:1 ;;
+1) export UCX_NET_DEVICES=mlx5_1:1 ;;
+2) export UCX_NET_DEVICES=mlx5_2:1 ;;
+3) export UCX_NET_DEVICES=mlx5_3:1 ;;
+esac
+
+echo "Launching on ${UCX_NET_DEVICES}"
+
+# run the wrapped command, preserving argument quoting
+exec "$@"
+
diff --git a/channel/go_leo.sh b/channel/go_leo.sh
new file mode 100644
index 0000000..2424115
--- /dev/null
+++ b/channel/go_leo.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+#SBATCH --account="IscrB_EXCEED"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:05:00
+#SBATCH --nodes=1 ##adjust
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=8
+#SBATCH --output=test.out
+#SBATCH -p boost_usr_prod
+#SBATCH --error=test.err
+#SBATCH --qos=boost_qos_dbg
+
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+chmod +x binder.sh
+mpirun -np 4 --map-by node:PE=8 --rank-by core ./binder.sh ./mhit36
\ No newline at end of file
diff --git a/channel/input.inp b/channel/input.inp
new file mode 100644
index 0000000..7e814cb
--- /dev/null
+++ b/channel/input.inp
@@ -0,0 +1,21 @@
+0 ! fresh start=0 restart =1
+0 ! iteration to start from (fresh start 0 otherwise different)
+200000 ! Final time step
+1000 ! Saving frequency
+4.0 ! Streamwise domain length Lx
+4.0 ! Spanwise domain width Ly
+2.0 ! Wall-normal domain height Lz=2h
+0.01 ! Stretching factor (near zero = almost uniform, 1.5 = moderate, 3.0 = extreme; do not use exactly zero)
+1 ! inflow condition (0=zero, 1=perturbed 3D field, 2=read from fields)
+0 ! phase-field initial condition (0=circular drop, 1=read from fields)
+0 ! theta initial condition (0=uniform, 1=read from fields)
+0.0001 ! dt for the simulation
+0.0066666 ! mu (viscosity)
+1.000 ! density
+0.0 ! Pressure gradient along x (negative if flow along x)
+0.0 ! Pressure gradient along y (negative if flow along y)
+0.5 ! Initial drop radius for phase-field
+0.1d0 ! Surface tension
+1.0 ! Ratio eps/dx
+0.00666666 ! kappa (thermal diffusivity)
+50.0 ! alphag for buoyancy
diff --git a/channel/leo.sh b/channel/leo.sh
new file mode 100644
index 0000000..f3d5f46
--- /dev/null
+++ b/channel/leo.sh
@@ -0,0 +1,8 @@
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+cp Makefile_leonardo Makefile
+make clean
+make
+mkdir -p output
+
diff --git a/channel/local.sh b/channel/local.sh
new file mode 100644
index 0000000..4f27984
--- /dev/null
+++ b/channel/local.sh
@@ -0,0 +1,15 @@
+NVARCH=Linux_x86_64; export NVARCH
+NVCOMPILERS=/opt/nvidia/hpc_sdk; export NVCOMPILERS
+MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/compilers/man; export MANPATH
+PATH=$NVCOMPILERS/$NVARCH/25.7/compilers/bin:$PATH; export PATH
+export PATH=$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/bin:$PATH
+export MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/man
+export LD_LIBRARY_PATH=/home/milton/MHIT36/cuDecomp/build/lib:$LD_LIBRARY_PATH
+# clean the output folder
+rm -rf output
+mkdir output
+cp Makefile_local Makefile
+#rm *.dat
+make clean
+make
+mpirun -np 2 ./mhit36
diff --git a/channel/main.f90 b/channel/main.f90
new file mode 100644
index 0000000..9c4f06f
--- /dev/null
+++ b/channel/main.f90
@@ -0,0 +1,1028 @@
+#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1)
+
+program main
+use cudafor
+use cudecomp
+use cufft
+use mpi
+use velocity
+use phase
+use temperature
+use param
+use mpivar
+use cudecompvar
+use nvtx
+
+
+implicit none
+! timer for scaling test
+real :: t_start, t_end, elapsed
+! grid dimensions
+integer :: comm_backend
+integer :: pr, pc
+! cudecomp
+! cuFFT
+integer :: planXf, planXb, planY
+integer :: batchsize
+integer :: status
+integer :: i,j,k,il,jl,kl,ig,jg,kg,t,stage
+integer :: im,ip,jm,jp,km,kp,last,idx
+! TDMA variables
+double precision, allocatable :: a(:), b(:), c(:)
+double complex, allocatable :: d(:), sol(:)
+! working arrays
+double complex, allocatable :: psi(:)
+double precision, allocatable :: ua(:,:,:)
+double precision, allocatable :: uaa(:,:,:)
+double complex, device, allocatable :: psi_d(:)
+double precision, device, allocatable :: vel_d(:) ! only used for implicit diffusion in z
+double complex, pointer, device, contiguous :: work_d(:), work_halo_d(:), work_d_d2z(:), work_halo_d_d2z(:)
+character(len=40) :: namefile
+character(len=4) :: itcount
+! Code variables
+double precision ::err, maxErr, meanp, gmeanp
+double complex, device, pointer :: psi3d(:,:,:)
+double precision :: k2
+!integer :: il, jl, ig, jg
+integer :: offsets(3), xoff, yoff
+integer :: np(3)
+! Alan Williamson classic
+double precision, parameter :: alpha(3) = (/ 8.d0/15.d0, 5.d0/12.d0, 3.d0/4.d0 /)
+double precision, parameter :: beta(3) = (/ 0.d0, -17.d0/60.d0, -5.d0/12.d0 /)
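+! Low-storage update per stage: f <- f + dt*(alpha(s)*RHS_s + beta(s)*RHS_{s-1});
+! beta(1)=0, so the first substage is an explicit Euler step, and sum(alpha+beta)=1 over the three stages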
+! Stage coefficients for diffusion-optimized SSP RK3
+!real(kind=8), parameter :: alpha(3) = (/ 0.444370493651235d0, 0.555629506348765d0, 1.0d0 /)
+!real(kind=8), parameter :: beta(3) = (/ 0.0d0, -0.122243120495896d0, -0.377756879504104d0 /)
+
+! Enable or disable phase field
+#define phiflag 0
+! Enable or disable temperature field
+#define thetaflag 1
+
+!########################################################################################################################################
+! 1. INITIALIZATION OF MPI AND cuDECOMP AUTOTUNING : START
+!########################################################################################################################################
+! MPI initialization: store the local MPI rank in rank and the total number of tasks in ranks
+! Same procedure as described in the cuDecomp documentation
+call mpi_init(ierr)
+call mpi_comm_rank(MPI_COMM_WORLD, rank, ierr)
+call mpi_comm_size(MPI_COMM_WORLD, ranks, ierr)
+call mpi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, localComm, ierr)
+call mpi_comm_rank(localComm, localRank, ierr)
+ierr = cudaSetDevice(localRank) !assign GPU to MPI rank
+
+! Define grid and decomposition
+call readinput
+
+! domain decomposition (pencils in y and z)
+pr = 0
+pc = 0
+halo_ext=1
+
+! CuDECOMP initialization and settings
+comm_backend = 0 ! Enable full autotuning
+CHECK_CUDECOMP_EXIT(cudecompInit(handle, MPI_COMM_WORLD))
+! config is a struct and pr and pc are the number of pencils along the two directions
+! create an uninitialized configuration struct and initialize it to defaults using cudecompGridDescConfigSetDefaults.
+CHECK_CUDECOMP_EXIT(cudecompGridDescConfigSetDefaults(config)) ! Initializing to default values is required to ensure no entries are left uninitialized.
+pdims = [pr, pc] ! pr and pc are the numbers of pencils along the two decomposed directions
+config%pdims = pdims
+halo = [0, halo_ext, halo_ext] ! no halo needed along x: each x-pencil holds the full x extent, so periodicity is handled locally
+! for transpositions
+config%transpose_comm_backend = comm_backend
+config%transpose_axis_contiguous = .true.
+! for halo exchanges
+config%halo_comm_backend = CUDECOMP_HALO_COMM_MPI
+! Halo periodicity: periodic along x and y, wall-bounded along z (not required to be in config)
+halo_periods = [.true., .true., .false.]
+! create spectral grid descriptor first to select pdims for optimal transposes
+gdims = [nx/2+1, ny, nz]
+config%gdims = gdims
+! Set up autotuning options for spectral grid (transpose related settings)
+CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
+options%dtype = CUDECOMP_DOUBLE_COMPLEX
+if (comm_backend == 0) then
+ options%autotune_transpose_backend = .true.
+ options%autotune_halo_backend = .false.
+endif
+options%transpose_use_inplace_buffers = .true.
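+! keep halo storage in the x-pencil buffers at both ends of the transpose chain (input of X->Y, output of Y->X)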
+options%transpose_input_halo_extents(:, 1) = halo
+options%transpose_output_halo_extents(:, 4) = halo
+CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_descD2Z, config, options))
+! create physical grid descriptor
+! take previous config and modify the global grid (nx instead of nx/2+1)
+! reset transpose_comm_backend to default value to avoid picking up possible nvshmem
+! transpose backend selection (this impacts how workspaces are allocated)
+gdims = [nx, ny, nz]
+config%gdims = gdims
+config%transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P
+! Set up autotuning options for physical grid (halo related settings)
+CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
+options%dtype = CUDECOMP_DOUBLE_COMPLEX
+if (comm_backend == 0) then
+ options%autotune_halo_backend = .true.
+endif
+options%halo_extents(:) = halo
+options%halo_periods(:) = halo_periods
+options%halo_axis = 1
+CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_desc, config, options))
+! Get pencil info for the grid descriptor in the physical space pencil struct (piX, piY or piZ)
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piX, 1, halo))
+nElemX = piX%size !<- total number of elements in x-configuration (including halos)
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piY, 2))
+nElemY = piY%size
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piZ, 3))
+nElemZ = piZ%size
+! Get workspace sizes: transpose (first call, not used for this descriptor) and halo (second call, used)
+CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_desc, nElemWork))
+CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_desc, 1, halo, nElemWork_halo))
+! Get pencil info for the grid descriptor in the complex space
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piX_d2z, 1,halo))
+nElemX_d2z = piX_d2z%size !<- total number of elements in x-configuration (including halos)
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piY_d2z, 2))
+nElemY_d2z = piY_d2z%size
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piZ_d2z, 3))
+nElemZ_d2z = piZ_d2z%size
+! Get workspace sizes: transpose (first call, used) and halo (second call, not used for this descriptor)
+CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_descD2Z, nElemWork_d2z))
+CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_descD2Z, 1, halo, nElemWork_halo_d2z))
+! End cuDecomp initialization
+
+
+! CUFFT initialization -- create plans (along x and y only; no transform is needed along z)
+! Forward 1D FFT in X: D2Z
+batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFT (from x-pencil dimension)
+status = cufftPlan1D(planXf, nx, CUFFT_D2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan Forward'
+! Backward 1D FFT in X: Z2D
+batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFT (from x-pencil dimension)
+status = cufftPlan1D(planXb, nx, CUFFT_Z2D, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan Backward'
+! shape indices 2 and 3 are used because y-pencils are ordered (y,z,x)
+batchSize = piY_d2z%shape(2)*piY_d2z%shape(3)
+status = cufftPlan1D(planY, ny, CUFFT_Z2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Y plan Forward & Backward'
+!########################################################################################################################################
+! 1. INITIALIZATION AND cuDECOMP AUTOTUNING : END
+!########################################################################################################################################
+
+
+
+
+
+!########################################################################################################################################
+! START STEP 2: ALLOCATE ARRAYS
+!########################################################################################################################################
+! Pressure variable
+allocate(rhsp(piX%shape(1), piX%shape(2), piX%shape(3)))
+allocate(p(piX%shape(1), piX%shape(2), piX%shape(3)))
+allocate(psi_d(max(nElemX_d2z, nElemY_d2z, nElemZ_d2z)))
+!NS variables
+allocate(u(piX%shape(1),piX%shape(2),piX%shape(3)),v(piX%shape(1),piX%shape(2),piX%shape(3)),w(piX%shape(1),piX%shape(2),piX%shape(3))) !velocity vector
+! allocate(ustar(piX%shape(1),piX%shape(2),piX%shape(3)),vstar(piX%shape(1),piX%shape(2),piX%shape(3)),wstar(piX%shape(1),piX%shape(2),piX%shape(3))) ! provisional velocity field
+allocate(rhsu(piX%shape(1),piX%shape(2),piX%shape(3)),rhsv(piX%shape(1),piX%shape(2),piX%shape(3)),rhsw(piX%shape(1),piX%shape(2),piX%shape(3))) ! right hand side u,v,w
+allocate(rhsu_o(piX%shape(1),piX%shape(2),piX%shape(3)),rhsv_o(piX%shape(1),piX%shape(2),piX%shape(3)),rhsw_o(piX%shape(1),piX%shape(2),piX%shape(3))) ! right hand side u,v,w
+!allocate(div(piX%shape(1),piX%shape(2),piX%shape(3))) (debug only)
+!TDMA solver
+allocate(a(0:nz+1),b(0:nz+1),c(0:nz+1),d(0:nz+1),sol(0:nz+1))
+!PFM variables
+#if phiflag == 1
+allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces
+#endif
+!Temperature variables
+#if thetaflag == 1
+allocate(theta(piX%shape(1),piX%shape(2),piX%shape(3)),rhstheta(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(rhstheta_o(piX%shape(1),piX%shape(2),piX%shape(3)))
+#endif
+! allocate arrays for transpositions and halo exchanges
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_d, nElemWork))
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_halo_d, nElemWork_halo))
+! allocate arrays for transpositions
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_descD2Z, work_d_d2z, nElemWork_d2z))
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_descD2Z, work_halo_d_d2z, nElemWork_halo_d2z)) ! not required
+!########################################################################################################################################
+! END STEP 2: ALLOCATE ARRAYS
+!########################################################################################################################################
+
+
+
+
+
+
+
+
+!########################################################################################################################################
+! START STEP 3: FLOW FIELD, PHASE-FIELD AND TEMPERATURE INIT
+!########################################################################################################################################
+! 3.1 Read/initialize from data without halo grid points (avoids out-of-bounds accesses when reading using MPI I/O)
+! 3.2 Call halo exchanges along Y and Z for u, v, w, phi and theta
+if (restart .eq. 0) then !fresh start: zero field, perturbed laminar profile, or read from the init folder
+if (rank.eq.0) write(*,*) "Initialize velocity field (fresh start)"
+ if (inflow .eq. 0) then
+ if (rank.eq.0) write(*,*) "Initialize zero velocity field"
+ do k = 1+halo_ext, piX%shape(3)-halo_ext
+ do j = 1+halo_ext, piX%shape(2)-halo_ext
+ do i = 1, piX%shape(1)
+ u(i,j,k) = 0.0d0
+ v(i,j,k) = 0.0d0
+ w(i,j,k) = 0.0d0
+ enddo
+ enddo
+ enddo
+ endif
+ if (inflow .eq. 1) then
+ if (rank.eq.0) write(*,*) "Initialize laminar flow (x) + 3D perturbation"
+ do k = 1+halo_ext, piX%shape(3)-halo_ext
+ kg = piX%lo(3) + k - 1 - halo_ext
+ do j = 1+halo_ext, piX%shape(2)-halo_ext
+ jg = piX%lo(2) + j - 1 - halo_ext
+ do i = 1, piX%shape(1)
+ amp=3.d0
+ mx=3.03d0
+ my=2.02d0
+ mz=4.d0
+ !3D divergence free flow with fluctuations that satisfies the boundary conditions
+ u(i,j,k) = 1.d0*(1.d0 - ((2*z(kg) - lz)/lz)**2) !
+ u(i,j,k) = u(i,j,k) - amp*cos(twopi*mx*x(i)/lx)*sin(twopi*my*y(jg)/ly)*2.d0*twopi/lz*sin(twopi*z(kg)/lz)*cos(twopi*z(kg)/lz)
+ u(i,j,k) = u(i,j,k) + amp*sin(twopi*mx*x(i)/lx)*(-twopi*my/ly)*sin(2.d0*twopi*my*y(jg)/ly)*sin(twopi*z(kg)/lz)*sin(twopi*z(kg)/lz)
+ v(i,j,k) = -amp*cos(twopi*my*y(jg)/ly)*(twopi*mx/lx)*cos(twopi*mx*x(i)/lx)*sin(twopi*z(kg)/lz)*sin(twopi*z(kg)/lz)
+ w(i,j,k) = amp*cos(twopi*mx*x(i)/lx)*(twopi*mx/lx)*sin(twopi*my*y(jg)/ly)*sin(twopi*z(kg)/lz)*sin(twopi*z(kg)/lz)
+ enddo
+ enddo
+ enddo
+ endif
+ if (inflow .eq. 2) then
+ if (rank.eq.0) write(*,*) "Initialize from data"
+ call readfield(1)
+ call readfield(2)
+ call readfield(3)
+ endif
+endif
+if (restart .eq. 1) then !restart, ignore inflow and read the tstart field
+ if (rank.eq.0) write(*,*) "Initialize velocity field (from output folder), iteration:", tstart
+ call readfield_restart(tstart,1)
+ call readfield_restart(tstart,2)
+ call readfield_restart(tstart,3)
+endif
+
+! update halo cells along the y and z directions (strictly needed only when pr and pc are greater than one)
+!$acc host_data use_device(u)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+!$acc host_data use_device(v)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+!$acc host_data use_device(w)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+
+! initialize phase-field
+#if phiflag == 1
+if (restart .eq. 0) then
+if (rank.eq.0) write(*,*) 'Initialize phase field (fresh start)'
+ if (inphi .eq. 0) then
+ if (rank.eq.0) write(*,*) 'Spherical drop'
+ do k = 1+halo_ext, piX%shape(3)-halo_ext
+ kg = piX%lo(3) + k - 1 - halo_ext
+ do j = 1+halo_ext, piX%shape(2)-halo_ext
+ jg = piX%lo(2) + j - 1 - halo_ext
+ do i = 1, piX%shape(1)
+ pos=(x(i)-lx/2)**2d0 + (y(jg)-ly/2)**2d0 + (z(kg)-lz/2)**2d0
+ phi(i,j,k) = 0.5d0*(1.d0-tanh((sqrt(pos)-radius)/2/eps))
+ enddo
+ enddo
+ enddo
+ endif
+ if (inphi .eq. 1) then
+ if (rank.eq.0) write(*,*) "Initialize phase-field from data"
+ call readfield(5)
+ endif
+endif
+if (restart .eq. 1) then
+ write(*,*) "Initialize phase-field (restart, from output folder), iteration:", tstart
+ call readfield_restart(tstart,5)
+endif
+! update halo
+!$acc host_data use_device(phi)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+#endif
+
+! initialize temperature field
+#if thetaflag == 1
+if (restart .eq. 0) then
+if (rank.eq.0) write(*,*) 'Initialize temperature field (fresh start)'
+ if (intheta .eq. 0) then
+ if (rank.eq.0) write(*,*) 'Uniform temperature field'
+ do k = 1+halo_ext, piX%shape(3)-halo_ext
+ kg = piX%lo(3) + k - 1 - halo_ext
+ do j = 1+halo_ext, piX%shape(2)-halo_ext
+ do i = 1, piX%shape(1)
+ theta(i,j,k) = 0.d0 ! uniform (zero) temperature; alternative: 1.d0 - z(kg) for a linear profile
+ enddo
+ enddo
+ enddo
+ endif
+ if (intheta .eq. 1) then
+ if (rank.eq.0) write(*,*) "Initialize temperature from data"
+ call readfield(6)
+ endif
+endif
+if (restart .eq. 1) then
+ write(*,*) "Initialize temperature (restart, from output folder), iteration:", tstart
+ call readfield_restart(tstart,6)
+endif
+! update halo
+!$acc host_data use_device(theta)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, theta, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, theta, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+#endif
+
+!Save initial fields (only if a fresh start)
+if (restart .eq. 0) then
+ if (rank.eq.0) write(*,*) "Save initial fields"
+ call writefield(tstart,1)
+ call writefield(tstart,2)
+ call writefield(tstart,3)
+ call writefield(tstart,4)
+ #if phiflag == 1
+ call writefield(tstart,5)
+ #endif
+ #if thetaflag == 1
+ call writefield(tstart,6) ! temperature
+ #endif
+endif
+!########################################################################################################################################
+! END STEP 3: FLOW FIELD, PHASE-FIELD AND TEMPERATURE INIT
+!########################################################################################################################################
+
+
+
+
+
+
+! ########################################################################################################################################
+! START TEMPORAL LOOP: STEP 4 to 9 REPEATED AT EVERY TIME STEP
+! ########################################################################################################################################
+! First step uses explicit Euler (beta(1)=0, so the first RK3 substage reduces to an Euler step)
+gumax=1.d0
+tstart=tstart+1
+gamma=1.d0*gumax ! ACDI interface-velocity scale, rescaled at every step with the maximum velocity
+!$acc data copyin(piX)
+#if thetaflag == 1
+!$acc data create(rhsu_o, rhsv_o, rhsw_o, rhstheta_o)
+#else
+!$acc data create(rhsu_o, rhsv_o, rhsw_o)
+#endif
+!$acc data copyin(mysin, mycos)
+call cpu_time(t_start)
+! Start temporal loop
+do t=tstart,tfin
+ ! Create custom label for each marker
+ write(itcount,'(i4)') t
+ ! Range with custom color
+ call nvtxStartRange("Iteration "//itcount,t)
+
+ if (rank.eq.0) write(*,*) "Time step",t,"of",tfin
+ call cpu_time(times)
+
+ call nvtxStartRange("Phase-field")
+ !########################################################################################################################################
+ ! START STEP 4: PHASE-FIELD SOLVER (EXPLICIT)
+ !########################################################################################################################################
+ #if phiflag == 1
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+ val = min(phi(i,j,k),1.0d0) ! avoid machine-precision overshoots in phi that lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+ ! compute here the tanh of distance function psi (used in the sharpening term) to avoid multiple computations of tanh
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ gamma=1.d0*gumax
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ kg=piX%lo(3) + k - 1 - halo_ext
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! convective (first three lines) and diffusive (last three lines)
+ rhsphi(i,j,k) = &
+ - (u(ip,j,k)*0.5d0*(phi(ip,j,k)+phi(i,j,k)) - u(i,j,k)*0.5d0*(phi(i,j,k)+phi(im,j,k)))*dxi &
+ - (v(i,jp,k)*0.5d0*(phi(i,jp,k)+phi(i,j,k)) - v(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,jm,k)))*dyi &
+ - (w(i,j,kp)*0.5d0*(phi(i,j,kp)+phi(i,j,k)) - w(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,j,km)))*dzci(kg) &
+ + gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + &
+ eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddyi + &
+ eps*((phi(i,j,kp)-phi(i,j,k))*dzi(kg+1) - (phi(i,j,k) -phi(i,j,km))*dzi(kg))*dzci(kg)) ! first between centers and then between faces
+ ! Compute normals for sharpening term (gradient)
+ normx(i,j,k) = 0.5d0*(psidi(ip,j,k) - psidi(im,j,k))*dxi
+ normy(i,j,k) = 0.5d0*(psidi(i,jp,k) - psidi(i,jm,k))*dyi
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))/(z(kg+1)-z(kg-1))
+ enddo
+ enddo
+ enddo
+
+ ! Update normx,normy and normz halos, required to then compute normal derivative
+ !$acc host_data use_device(normx,normy,normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! 4.1.3. Compute Sharpening term (gradient)
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ normod = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + 1.0E-16)
+ ! normod = 1.d0/(sqrt(normx(i,j,k)**2d0 + normy(i,j,k)**2d0 + normz(i,j,k)**2d0) + 1.0E-16)
+ normx(i,j,k) = normx(i,j,k)*normod
+ normy(i,j,k) = normy(i,j,k)*normod
+ normz(i,j,k) = normz(i,j,k)*normod
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Compute sharpening term
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ kg=piX%lo(3) + k - 1 - halo_ext
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-tanh_psi(ip,j,k)*tanh_psi(ip,j,k))*normx(ip,j,k) - &
+ 0.25d0*(1.d0-tanh_psi(im,j,k)*tanh_psi(im,j,k))*normx(im,j,k))*0.5*dxi + &
+ (0.25d0*(1.d0-tanh_psi(i,jp,k)*tanh_psi(i,jp,k))*normy(i,jp,k) - &
+ 0.25d0*(1.d0-tanh_psi(i,jm,k)*tanh_psi(i,jm,k))*normy(i,jm,k))*0.5*dyi + &
+ (0.25d0*(1.d0-tanh_psi(i,j,kp)*tanh_psi(i,j,kp))*normz(i,j,kp) - &
+ 0.25d0*(1.d0-tanh_psi(i,j,km)*tanh_psi(i,j,km))*normz(i,j,km))/(z(kg+1)-z(kg-1)))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Get phi at n+1
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ phi(i,j,k) = phi(i,j,k) + dt*rhsphi(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ !$acc host_data use_device(phi)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ #endif
+ !########################################################################################################################################
+ ! END STEP 4: PHASE-FIELD SOLVER (EXPLICIT)
+ !########################################################################################################################################
+ call nvtxEndRange
+
+
+ !########################################################################################################################################
+ ! START STEP 5: ENERGY EQUATION SOLVER (RK3 EXPLICIT)
+ !########################################################################################################################################
+ #if thetaflag == 1
+ ! Temperature solver (RK3 explicit)
+ do stage = 1,3
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ kg=piX%lo(3) + k - 1 - halo_ext
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! convective terms
+ rhstheta(i,j,k) = &
+ - (u(ip,j,k)*0.5d0*(theta(ip,j,k)+theta(i,j,k)) - u(i,j,k)*0.5d0*(theta(i,j,k)+theta(im,j,k)))*dxi &
+ - (v(i,jp,k)*0.5d0*(theta(i,jp,k)+theta(i,j,k)) - v(i,j,k)*0.5d0*(theta(i,j,k)+theta(i,jm,k)))*dyi &
+ - (w(i,j,kp)*0.5d0*(theta(i,j,kp)+theta(i,j,k)) - w(i,j,k)*0.5d0*(theta(i,j,k)+theta(i,j,km)))*dzci(kg)
+ ! diffusive terms
+ rhstheta(i,j,k) = rhstheta(i,j,k) + kappa*((theta(ip,j,k)-2.d0*theta(i,j,k)+theta(im,j,k))*ddxi + &
+ (theta(i,jp,k)-2.d0*theta(i,j,k)+theta(i,jm,k))*ddyi + &
+ ((theta(i,j,kp)-theta(i,j,k))*dzi(kg+1) - (theta(i,j,k) -theta(i,j,km))*dzi(kg))*dzci(kg))
+
+ enddo
+ enddo
+ enddo
+ !$acc parallel loop collapse(3)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ theta(i,j,k) = theta(i,j,k) + dt*alpha(stage)*rhstheta(i,j,k) + dt*beta(stage)*rhstheta_o(i,j,k)
+ rhstheta_o(i,j,k)=rhstheta(i,j,k)
+ enddo
+ enddo
+ enddo
+ ! update halos
+ !$acc host_data use_device(theta)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, theta, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, theta, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ ! impose temperature boundary conditions, interpolated as done for the velocity (see node sketch)
+ !$acc parallel loop collapse(3)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ kg = piX%lo(3) + k - 1 - halo_ext
+ if (kg .eq. 1) theta(i,j,k-1) = 2.d0*( 1.d0) - theta(i,j,k) ! mean value between kg and kg-1 (bottom wall) equal to 1
+ if (kg .eq. nz) theta(i,j,k+1) = 2.d0*(-1.d0) - theta(i,j,k) ! mean value between kg and kg+1 (top wall) equal to -1
+ enddo
+ enddo
+ enddo
+ enddo
+ #endif
+ !########################################################################################################################################
+ ! END STEP 5: ENERGY EQUATION SOLVER (RK3 EXPLICIT)
+ !########################################################################################################################################
+
+
+
+
+ call nvtxStartRange("Projection")
+ !########################################################################################################################################
+ ! START STEP 6: USTAR COMPUTATION (PROJECTION STEP)
+ !########################################################################################################################################
+ ! 6.1 compute rhs
+ ! 6.2 obtain ustar and store old rhs in rhs_o
+ ! 6.3 Call halo exchanges along Y and Z for u,v,w
+
+ ! Projection step
+ do stage = 1,3
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ kg=piX%lo(3) + k - 1 - halo_ext
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! compute the products (conservative form)
+ h11 = 0.25d0*((u(ip,j,k)+u(i,j,k))*(u(ip,j,k)+u(i,j,k)) - (u(i,j,k)+u(im,j,k))*(u(i,j,k)+u(im,j,k)))*dxi
+ h12 = 0.25d0*((u(i,jp,k)+u(i,j,k))*(v(i,jp,k)+v(im,jp,k)) - (u(i,j,k)+u(i,jm,k))*(v(i,j,k)+v(im,j,k)))*dyi
+ h13 = 0.25d0*((u(i,j,kp)+u(i,j,k))*(w(i,j,kp)+w(im,j,kp)) - (u(i,j,k)+u(i,j,km))*(w(i,j,k)+w(im,j,k)))*dzci(kg) ! divide by cell height
+ h21 = 0.25d0*((u(ip,j,k)+u(ip,jm,k))*(v(ip,j,k)+v(i,j,k)) - (u(i,j,k)+u(i,jm,k))*(v(i,j,k)+v(im,j,k)))*dxi
+ h22 = 0.25d0*((v(i,jp,k)+v(i,j,k))*(v(i,jp,k)+v(i,j,k)) - (v(i,j,k)+v(i,jm,k))*(v(i,j,k)+v(i,jm,k)))*dyi
+ h23 = 0.25d0*((w(i,j,kp)+w(i,jm,kp))*(v(i,j,kp)+v(i,j,k)) - (w(i,j,k)+w(i,jm,k))*(v(i,j,k)+v(i,j,km)))*dzci(kg) ! divide by cell height
+ h31 = 0.25d0*((w(ip,j,k)+w(i,j,k))*(u(ip,j,k)+u(ip,j,km)) - (w(i,j,k)+w(im,j,k))*(u(i,j,k)+u(i,j,km)))*dxi
+ h32 = 0.25d0*((v(i,jp,k)+v(i,jp,km))*(w(i,jp,k)+w(i,j,k)) - (v(i,j,k)+v(i,j,km))*(w(i,j,k)+w(i,jm,k)))*dyi
+ h33 = 0.25d0*((w(i,j,kp)+w(i,j,k))*(w(i,j,kp)+w(i,j,k)) - (w(i,j,k)+w(i,j,km))*(w(i,j,k)+w(i,j,km)))*dzi(kg) ! divide by distance between centers
+ rhsu(i,j,k)=-(h11+h12+h13)
+ rhsv(i,j,k)=-(h21+h22+h23)
+ rhsw(i,j,k)=-(h31+h32+h33)
+ ! viscous/diffusive terms
+ h11 = mu*(u(ip,j,k)-2.d0*u(i,j,k)+u(im,j,k))*ddxi
+ h12 = mu*(u(i,jp,k)-2.d0*u(i,j,k)+u(i,jm,k))*ddyi
+ h13 = mu*((u(i,j,kp)-u(i,j,k))*dzi(kg+1)-(u(i,j,k)-u(i,j,km))*dzi(kg))*dzci(kg)
+ h21 = mu*(v(ip,j,k)-2.d0*v(i,j,k)+v(im,j,k))*ddxi
+ h22 = mu*(v(i,jp,k)-2.d0*v(i,j,k)+v(i,jm,k))*ddyi
+ h23 = mu*((v(i,j,kp)-v(i,j,k))*dzi(kg+1)-(v(i,j,k)-v(i,j,km))*dzi(kg))*dzci(kg)
+ h31 = mu*(w(ip,j,k)-2.d0*w(i,j,k)+w(im,j,k))*ddxi
+ h32 = mu*(w(i,jp,k)-2.d0*w(i,j,k)+w(i,jm,k))*ddyi
+ h33 = mu*((w(i,j,kp)-w(i,j,k))*dzci(kg+1)-(w(i,j,k)-w(i,j,km))*dzci(kg))*dzi(kg) ! face to face and then center to center
+ rhsu(i,j,k)=rhsu(i,j,k)+(h11+h12+h13)*rhoi
+ rhsv(i,j,k)=rhsv(i,j,k)+(h21+h22+h23)*rhoi
+ rhsw(i,j,k)=rhsw(i,j,k)+(h31+h32+h33)*rhoi
+ ! Pressure driven
+ rhsu(i,j,k)=rhsu(i,j,k) - gradpx
+ rhsv(i,j,k)=rhsv(i,j,k) - gradpy
+ enddo
+ enddo
+ enddo
+
+ ! Surface tension forces (Obtain surface tension forces evaluated at the center of the cell (same as where phi is located)
+ #if phiflag == 1
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ kg=piX%lo(3) + k - 1 - halo_ext
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ curv=0.5d0*(normx(ip,j,k)-normx(im,j,k))*dxi + 0.5d0*(normy(i,jp,k)-normy(i,jm,k))*dyi + (normz(i,j,kp)-normz(i,j,km))/(z(kg+1)-z(kg-1))
+ fxst(i,j,k)= -sigma*curv*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi
+ fyst(i,j,k)= -sigma*curv*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dyi
+ fzst(i,j,k)= -sigma*curv*(phi(i,j,kp)-phi(i,j,km))/(z(kg+1)-z(kg-1))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Update halo of fxst, fyst and fzst (required then to interpolate at velocity points)
+ !$acc host_data use_device(fxst,fyst,fzst)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Interpolate force at velocity points
+ !$acc parallel loop collapse(3)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (im .lt. 1) im=nx
+ #if thetaflag == 0
+ ! Add surface tension force only (ACDI only)
+ rhsu(i,j,k)=rhsu(i,j,k) + 0.5d0*(fxst(im,j,k)+fxst(i,j,k))*rhoi
+ rhsv(i,j,k)=rhsv(i,j,k) + 0.5d0*(fyst(i,jm,k)+fyst(i,j,k))*rhoi
+ rhsw(i,j,k)=rhsw(i,j,k) + 0.5d0*(fzst(i,j,km)+fzst(i,j,k))*rhoi
+ #elif thetaflag == 1
+ ! Add here also buoyancy force if temperature is active (case ACDI + temperature)
+ rhsw(i,j,k)=rhsw(i,j,k) + alphag*0.5d0*(theta(i,j,km)+theta(i,j,k))
+ #endif
+ u(i,j,k) = u(i,j,k) + dt*alpha(stage)*rhsu(i,j,k) + dt*beta(stage)*rhsu_o(i,j,k)! -dt*(alpha(stage)+beta(stage))*rho*(p(i,j,k)-p(im,j,k))*dxi
+ v(i,j,k) = v(i,j,k) + dt*alpha(stage)*rhsv(i,j,k) + dt*beta(stage)*rhsv_o(i,j,k)! -dt*(alpha(stage)+beta(stage))*rho*(p(i,j,k)-p(i,jm,k))*dyi
+ w(i,j,k) = w(i,j,k) + dt*alpha(stage)*rhsw(i,j,k) + dt*beta(stage)*rhsw_o(i,j,k)! -dt*(alpha(stage)+beta(stage))*rho*(p(i,j,k)-p(i,j,km))*dzi
+ rhsu_o(i,j,k)=rhsu(i,j,k)
+ rhsv_o(i,j,k)=rhsv(i,j,k)
+ rhsw_o(i,j,k)=rhsw(i,j,k)
+ enddo
+ enddo
+ enddo
+ #else
+ ! 6.2 find u, v and w star (RK3), single-phase case
+ !$acc parallel loop collapse(3)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! Add here also buoyancy force if temperature is active (case NS + temperature)
+ #if thetaflag == 1
+ km=k-1
+ rhsw(i,j,k)=rhsw(i,j,k) + alphag*0.5d0*(theta(i,j,km)+theta(i,j,k))
+ #endif
+ u(i,j,k) = u(i,j,k) + dt*alpha(stage)*rhsu(i,j,k) + dt*beta(stage)*rhsu_o(i,j,k)
+ v(i,j,k) = v(i,j,k) + dt*alpha(stage)*rhsv(i,j,k) + dt*beta(stage)*rhsv_o(i,j,k)
+ w(i,j,k) = w(i,j,k) + dt*alpha(stage)*rhsw(i,j,k) + dt*beta(stage)*rhsw_o(i,j,k)
+ rhsu_o(i,j,k)=rhsu(i,j,k)
+ rhsv_o(i,j,k)=rhsv(i,j,k)
+ rhsw_o(i,j,k)=rhsw(i,j,k)
+ enddo
+ enddo
+ enddo
+ #endif
+
+ ! 6.3 update halos (y and z directions), required to then compute the RHS of the Poisson equation on the staggered grid
+ !$acc host_data use_device(u,v,w)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! impose velocity boundary conditions: w lies exactly on the wall, while u and v are set so that their average with the ghost value is zero (no-slip assumed, i.e. u=0; can be extended to any wall velocity)
+ ! even with grid stretching, the wall face sits midway between the two adjacent cell centers, so the linear interpolation remains valid
+ !$acc parallel loop collapse(3)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ kg = piX%lo(3) + k - 1 -halo_ext
+ ! bottom wall
+ if (kg .eq. 1) u(i,j,k-1)= -u(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
+ if (kg .eq. 1) v(i,j,k-1)= -v(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
+ if (kg .eq. 1) w(i,j,k)=0.d0 ! w point is at the wall
+ ! top wall
+ if (kg .eq. nz) u(i,j,k+1)= -u(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
+ if (kg .eq. nz) v(i,j,k+1)= -v(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
+ if (kg .eq. nz+1) w(i,j,k)=0.d0 ! w point (nz+1) is at the wall
+ enddo
+ enddo
+ enddo
+ enddo
+ !########################################################################################################################################
+ ! END STEP 6: USTAR COMPUTATION
+ !########################################################################################################################################
+ call nvtxEndRange
+
+
+ call nvtxStartRange("Poisson")
+ !########################################################################################################################################
+ ! START STEP 7: POISSON SOLVER FOR PRESSURE
+ !########################################################################################################################################
+ ! initialize rhs and analytical solution
+ ! 7.1 Compute the RHS of the Poisson equation, div(ustar), evaluated at the cell center
+ ! halo updates were already performed, so at the pencil borders the starred velocities are available from the halos
+ call nvtxStartRange("compute RHS")
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ kg = piX%lo(3) + k - 1 - halo_ext
+ if (ip > nx) ip=1
+ rhsp(i,j,k) = (rho*dxi/dt)*(u(ip,j,k)-u(i,j,k))
+ rhsp(i,j,k) = rhsp(i,j,k) + (rho*dyi/dt)*(v(i,jp,k)-v(i,j,k))
+ rhsp(i,j,k) = rhsp(i,j,k) + (rho*dzci(kg)/dt)*(w(i,j,kp)-w(i,j,k))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ call nvtxEndRange
+
+ call nvtxStartRange("FFT forward w/ transpositions")
+ !$acc host_data use_device(rhsp)
+ status = cufftExecD2Z(planXf, rhsp, psi_d)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'X forward error: ', status
+ !$acc end host_data
+ ! psi(kx,y,z) -> psi(y,z,kx)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,piX_d2z%halo_extents, [0,0,0]))
+ ! psi(y,z,kx) -> psi(ky,z,kx)
+ status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_FORWARD)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'Y forward error: ', status
+ ! psi(ky,z,kx) -> psi(z,kx,ky)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX))
+
+ call nvtxEndRange
+ np(piZ_d2z%order(1)) = piZ_d2z%shape(1)
+ np(piZ_d2z%order(2)) = piZ_d2z%shape(2)
+ np(piZ_d2z%order(3)) = piZ_d2z%shape(3)
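+ ! view the flat device buffer psi_d as a 3D array with the z-pencil shape (memory order z,x,y)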
+ call c_f_pointer(c_devloc(psi_d), psi3d, piZ_d2z%shape)
+ offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1
+ offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1
+ offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1
+
+ xoff = offsets(1)
+ yoff = offsets(2)
+ npx = np(1)
+ npy = np(2)
+ call nvtxStartRange("Solution")
+ !$acc parallel loop collapse(2) gang private(a,b,c,d,factor)
+ do jl = 1, npy
+ do il = 1, npx
+ ! compute index global wavenumber ig and jg
+ jg = yoff + jl
+ ig = xoff + il
+ ! Set up tridiagonal system for each i and j
+ ! Fill diagonals and rhs for each
+ ! 0 and nz+1 are the ghost nodes
+ do k = 1, nz
+ a(k) = 2.0d0*(dzi(k)**2*dzi(k+1))/(dzi(k)+dzi(k+1))
+ c(k) = 2.0d0*(dzi(k)*dzi(k+1)**2)/(dzi(k)+dzi(k+1))
+ b(k) = -a(k) - c(k) - (kx_d(ig)**2 + ky_d(jg)**2)
+ d(k) = psi3d(k,il,jl)
+ enddo
+ ! Neumann BC at bottom
+ a(0) = 0.0d0
+ b(0) = -1.d0*dzi(1)*dzi(1) - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg)
+ c(0) = 1.d0*dzi(1)*dzi(1)
+ d(0) = 0.0d0
+ ! Neumann BC at top
+ a(nz+1) = 1.0d0*dzi(nz+1)*dzi(nz+1)
+ b(nz+1) = -1.0d0*dzi(nz+1)*dzi(nz+1) - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg)
+ c(nz+1) = 0.0d0
+ d(nz+1) = 0.0d0
+ ! Pin the pressure at one interior point for the mean mode (kx=ky=0): this removes the
+ ! null space of the Neumann problem without interfering with the boundary conditions
+ if (ig == 1 .and. jg == 1) then
+ a(1) = 0.d0
+ b(1) = 1.d0
+ c(1) = 0.d0
+ d(1) = 0.d0
+ end if
+ ! Forward elimination (Thomas)
+ !$acc loop seq
+ do k = 1, nz+1
+ factor = a(k)/b(k-1)
+ b(k) = b(k) - factor*c(k-1)
+ d(k) = d(k) - factor*d(k-1)
+ end do
+ ! Back substitution
+ psi3d(nz+1,il,jl) = d(nz+1)/b(nz+1)
+ ! TODO: add a check on the pivot as done in FluTAS?
+ !$acc loop seq
+ do k = nz, 1, -1
+ psi3d(k,il,jl) = (d(k) - c(k)*psi3d(k+1,il,jl))/b(k)
+ end do
+ end do
+ end do
+ call nvtxEndRange ! close the "Solution" range
+
+ call nvtxStartRange("FFT backwards along x and y w/ transpositions")
+ ! psi(z,kx,ky) -> psi(ky,z,kx)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX))
+ ! psi(ky,z,kx) -> psi(y,z,kx)
+ status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_INVERSE)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'Y inverse error: ', status
+ ! psi(y,z,kx) -> psi(kx,y,z)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,[0,0,0], piX_d2z%halo_extents))
+ !$acc host_data use_device(p)
+ ! psi(kx,y,z) -> p(x,y,z)
+ status = cufftExecZ2D(planXb, psi_d, p)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'X inverse error: ', status
+ !$acc end host_data
+
+ ! normalize pressure (must be done here, not in the TDMA)
+ !$acc parallel loop collapse(3)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ p(i,j,k) = p(i,j,k)/nx/ny
+ end do
+ end do
+ end do
+
+ call nvtxEndRange
+ ! update halo nodes with pressure
+ ! Update X-pencil halos
+ !$acc host_data use_device(p)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !########################################################################################################################################
+ ! END STEP 7: POISSON SOLVER FOR PRESSURE
+ !########################################################################################################################################
+ call nvtxEndRange
+
+
+ call nvtxStartRange("Correction")
+ !########################################################################################################################################
+ ! START STEP 8: VELOCITY CORRECTION
+ ! ########################################################################################################################################
+ ! 8.1 Correct velocity
+ ! 8.2 Call halo update
+ ! Correct velocity, pressure has also the halo
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i = 1, piX%shape(1) ! equal to nx (no halo on x)
+ im=i-1
+ jm=j-1
+ km=k-1
+ kg=piX%lo(3) + k - 1 - halo_ext
+ if (im < 1) im=nx
+ u(i,j,k)=u(i,j,k) - dt/rho*(p(i,j,k)-p(im,j,k))*dxi
+ v(i,j,k)=v(i,j,k) - dt/rho*(p(i,j,k)-p(i,jm,k))*dyi
+ w(i,j,k)=w(i,j,k) - dt/rho*(p(i,j,k)-p(i,j,km))*dzi(kg)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! 8.3 update halos (y and z directions), required by the staggered-grid stencils at the next time step
+ !$acc host_data use_device(u,v,w)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! impose velocity boundary conditions (could be fused with the correction loop, little gain)
+ ! w lies exactly on the wall; u and v are set so that their wall-interpolated value is zero
+ ! no-slip assumed, i.e. u=0; can be extended to any wall velocity
+ umax=0.d0
+ vmax=0.d0
+ wmax=0.d0
+ !$acc parallel loop collapse(3) reduction(max:umax,vmax,wmax)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ kg = piX%lo(3) + k - 1 - halo_ext
+ ! bottom wall
+ if (kg .eq. 1) u(i,j,k-1) = -u(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
+ if (kg .eq. 1) v(i,j,k-1) = -v(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
+ if (kg .eq. 1) w(i,j,k) = 0.d0 ! w point is at the wall
+ ! top wall
+ if (kg .eq. nz) u(i,j,k+1)= -u(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
+ if (kg .eq. nz) v(i,j,k+1)= -v(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
+ if (kg .eq. nz+1) w(i,j,k)=0.d0 ! w point (nz+1) is at the wall
+ umax=max(umax,u(i,j,k))
+ vmax=max(vmax,v(i,j,k))
+ wmax=max(wmax,w(i,j,k))
+ enddo
+ enddo
+ enddo
+
+ call MPI_Allreduce(umax,gumax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
+ call MPI_Allreduce(vmax,gvmax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
+ call MPI_Allreduce(wmax,gwmax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
+
+ cflx=gumax*dt*dxi
+ cfly=gvmax*dt*dyi
+ cflz=gwmax*dt*nz/lz ! based on the mean spacing lz/nz (underestimates the CFL where the grid is clustered)
+ cou=max(cflx,cfly)
+ cou=max(cou,cflz)
+ if (rank.eq.0) then
+ write(*,*) "CFL (max among tasks)", cou
+ endif
+ if (cou .gt. 7) call MPI_Abort(MPI_COMM_WORLD,1,ierr) ! all ranks abort together; a rank-0-only stop would hang the others
+
+ call cpu_time(timef)
+ if (rank.eq.0) print '(" Time elapsed = ",f6.1," ms")',1000*(timef-times)
+ !########################################################################################################################################
+ ! END STEP 8: VELOCITY CORRECTION
+ !########################################################################################################################################
+ call nvtxEndRange
+
+
+ !########################################################################################################################################
+ ! START STEP 9: OUTPUT FIELDS
+ ! ########################################################################################################################################
+ if (mod(t,dump) .eq. 0) then
+ if (rank .eq. 0) write(*,*) "Saving output files"
+ ! write velocity and pressure fields (1-4)
+ call writefield(t,1)
+ call writefield(t,2)
+ call writefield(t,3)
+ call writefield(t,4)
+ #if phiflag == 1
+ ! write phase-field (5)
+ call writefield(t,5)
+ #endif
+ #if thetaflag == 1
+ ! write temperature field (6)
+ call writefield(t,6)
+ #endif
+ endif
+ !########################################################################################################################################
+ ! END STEP 9: OUTPUT FIELDS
+ !########################################################################################################################################
+
+call nvtxEndRange
+!call nvtxEndRange
+enddo
+call cpu_time(t_end)
+elapsed = t_end-t_start
+if (rank .eq. 0) write(*,*) 'Elapsed time (seconds):', elapsed
+!$acc end data
+!$acc end data
+!$acc end data
+
+! Remove allocated variables (add new)
+deallocate(u,v,w)
+deallocate(mysin, mycos)
+deallocate(rhsu,rhsv,rhsw)
+deallocate(rhsu_o,rhsv_o,rhsw_o)
+#if phiflag == 1
+deallocate(phi,rhsphi,normx,normy,normz)
+deallocate(psidi,tanh_psi,fxst,fyst,fzst) ! allocated only when the phase field is active
+#endif
+#if thetaflag == 1
+deallocate(theta,rhstheta,rhstheta_o)
+#endif
+
+call mpi_finalize(ierr)
+
+end program main
\ No newline at end of file
diff --git a/channel/module.f90 b/channel/module.f90
new file mode 100644
index 0000000..6ea93b7
--- /dev/null
+++ b/channel/module.f90
@@ -0,0 +1,149 @@
+module param
+ integer, parameter :: nx=200
+ integer, parameter :: ny=200
+ integer, parameter :: nz=100
+ double precision :: pi, rhoi, twopi
+ double precision :: lx, dx, dxi, ddxi
+ double precision :: ly, dy, dyi, ddyi
+ double precision :: lz
+ double precision, allocatable :: x(:), y(:), z(:), dzi(:), dzci(:), kx(:), ky(:)
+ double precision, device, allocatable :: kx_d(:), ky_d(:)
+ integer :: restart,tstart,tfin,dump
+ double precision :: gamma, normod, factor, csi
+ double precision :: dt,mu,rho !flow parameters
+ integer :: inflow, inphi, intheta
+ double precision :: radius, sigma, epsr, eps, pos, val, epsi, enum ! phase-field parameters
+ double precision :: times, timef, alphag
+ double precision :: gradpx, gradpy, noise, lflow, gflow, ubulk, cflx, cfly, cflz, gcflz
+ double precision :: amp, mx, my, mz ! for perturbed flow
+ double precision :: kappa ! thermal diffusivity (temperature equation)
+end module param
+
+
+module mpivar
+ ! MPI variables
+ integer :: rank, ranks, ierr
+ integer :: localRank, localComm
+end module mpivar
+
+
+module cudecompvar
+ use cudecomp
+ integer :: npx, npy, npz
+ type(cudecompHandle) :: handle
+ type(cudecompGridDesc) :: grid_desc,grid_descD2Z
+ type(cudecompGridDescConfig) :: config
+ type(cudecompGridDescAutotuneOptions) :: options
+ integer :: pdims(2) ! pr x pc pencils
+ integer :: gdims(3) ! global grid dimensions
+ integer :: halo(3) ! halo extensions
+ integer :: halo_ext ! 0 no halo, 1 means 1 halo
+ type(cudecompPencilInfo) :: piX, piY, piZ ! size of the pencils in x- y- and z-configuration
+ type(cudecompPencilInfo) :: piX_d2z, piY_d2z, piZ_d2z ! size of the pencils in x- y- and z-configuration for D2Z
+ type(cudecompPencilInfo) :: piX_Poiss
+ integer(8) :: nElemX, nElemY, nElemZ, nElemWork, nElemWork_halo,nElemWork_halo_d2z
+ integer(8) :: nElemX_d2z, nElemY_d2z, nElemZ_d2z, nElemWork_d2z
+ logical :: halo_periods(3)
+end module cudecompvar
+
+
+module velocity
+ double precision, allocatable :: u(:,:,:), v(:,:,:), w(:,:,:)
+ double precision, allocatable :: rhsu(:,:,:), rhsv(:,:,:), rhsw(:,:,:)
+ double precision, allocatable :: rhsu_o(:,:,:), rhsv_o(:,:,:), rhsw_o(:,:,:)
+ complex(8), allocatable :: rhsp_complex(:,:,:)
+ double precision, allocatable :: rhsp(:,:,:), p(:,:,:)
+ double precision, allocatable :: rhspp(:,:,:), pp(:,:,:)
+ double precision, allocatable :: div(:,:,:)
+ double precision :: uc, vc, wc, umax, vmax, wmax, gumax, gvmax, gwmax, cou
+ double precision :: h11, h12, h13, h21, h22, h23, h31, h32, h33
+ double precision, allocatable :: mysin(:), mycos(:)
+end module velocity
+
+
+module phase
+ double precision, allocatable :: phi(:,:,:), rhsphi(:,:,:), psidi(:,:,:), tanh_psi(:,:,:)
+ double precision, allocatable :: normx(:,:,:), normy(:,:,:), normz(:,:,:)
+ double precision :: chempot, curv
+ double precision, allocatable :: fxst(:,:,:), fyst(:,:,:), fzst(:,:,:)
+end module phase
+
+
+module temperature
+ double precision, allocatable :: theta(:,:,:), rhstheta(:,:,:)
+ double precision, allocatable :: rhstheta_o(:,:,:)
+end module temperature
+
+
+! added NVTX for profiling, from maxcuda/NVTX_example
+module nvtx
+use iso_c_binding
+implicit none
+integer,private :: col(7) = [ int(Z'0000ff00'), int(Z'000000ff'), int(Z'00ffff00'), int(Z'00ff00ff'), int(Z'0000ffff'), int(Z'00ff0000'), int(Z'00ffffff')]
+character,private,target :: tempName(256)
+
+type, bind(C):: nvtxEventAttributes
+ integer(C_INT16_T):: version=1
+ integer(C_INT16_T):: size=48 !
+ integer(C_INT):: category=0
+ integer(C_INT):: colorType=1 ! NVTX_COLOR_ARGB = 1
+ integer(C_INT):: color
+ integer(C_INT):: payloadType=0 ! NVTX_PAYLOAD_UNKNOWN = 0
+ integer(C_INT):: reserved0
+ integer(C_INT64_T):: payload ! union uint,int,double
+ integer(C_INT):: messageType=1 ! NVTX_MESSAGE_TYPE_ASCII = 1
+ type(C_PTR):: message ! ascii char
+end type
+
+interface nvtxRangePush
+ ! push range with custom label and standard color
+ subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA')
+ use iso_c_binding
+ character(kind=C_CHAR) :: name(256)
+ end subroutine
+
+ ! push range with custom label and custom color
+ subroutine nvtxRangePushEx(event) bind(C, name='nvtxRangePushEx')
+ use iso_c_binding
+ import:: nvtxEventAttributes
+ type(nvtxEventAttributes):: event
+ end subroutine
+end interface
+
+interface nvtxRangePop
+ subroutine nvtxRangePop() bind(C, name='nvtxRangePop')
+ end subroutine
+end interface
+
+contains
+
+subroutine nvtxStartRange(name,id)
+ character(kind=c_char,len=*) :: name
+ integer, optional:: id
+ type(nvtxEventAttributes):: event
+ character(kind=c_char,len=256) :: trimmed_name
+ integer:: i
+
+ trimmed_name=trim(name)//c_null_char
+
+ ! move scalar trimmed_name into character array tempName
+ do i=1,LEN(trim(name)) + 1
+ tempName(i) = trimmed_name(i:i)
+ enddo
+
+
+ if ( .not. present(id)) then
+ call nvtxRangePush(tempName)
+ else
+ event%color=col(mod(id,7)+1)
+ event%message=c_loc(tempName)
+ call nvtxRangePushEx(event)
+ end if
+end subroutine
+
+subroutine nvtxEndRange
+ call nvtxRangePop
+end subroutine
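+
+! typical usage of this module (link with -lnvToolsExt, as done in the Makefiles):
+! call nvtxStartRange("poisson solver", 1) ! optional id selects the range color
+! ... code to profile ...
+! call nvtxEndRange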
+
+end module nvtx
+
diff --git a/channel/plot_fields.py b/channel/plot_fields.py
new file mode 100644
index 0000000..5847922
--- /dev/null
+++ b/channel/plot_fields.py
@@ -0,0 +1,569 @@
+# # %%
+# %%html
+#
+
+# %%
+# %matplotlib widget
+# %matplotlib inline
+
+import ipympl
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.style as style
+# from ing_theme_matplotlib import mpl_style
+# from qbstyles import mpl_style
+
+import os
+import glob
+
+from scipy.interpolate import griddata
+
+
+from matplotlib.widgets import Cursor
+
+import h5py
+from mpl_toolkits.axes_grid1 import make_axes_locatable # used below for the RMS-field colorbar axes
+
+plt.style.use("dark_background")
+# mpl_style(dark=True)
+
+
+#################################################################
+foldername = './output/'
+
+# PARAMETERS:
+
+# fields to plot
+fields = ['theta']
+
+# number of points in each direction
+# Grid parameters (user-defined)
+nx = 256 # number of points in x
+ny = 128 # number of points in y
+nz = 200 # number of points in z
+
+Lx = 6.0 # length of domain in x
+Ly = 3.0 # length of domain in y
+Lz = 2.0 # length of domain in z
+
+# compute the derivative of the fields (show these instead of the normal fields)
+# 0: no derivative
+# 1: x derivative
+# 2: y derivative
+# 3: z derivative
+# list more flags to compute consecutive derivatives (first-order FD)
+derivative_vec = [0]
+
+# # normal direction of the 2D slice:
+# 1: x-direction
+# 2: y-direction
+# 3: z-direction
+slice_dir = 2
+
+# index to take the slice (from 1 to nx_i, choose -1 for computing the average)
+slice_idx = 0
+
+# slice_idx = 222
+
+# time_steps to plot
+# ts_vec = [0]
+# ts_vec = range(800000,840500,10000)
+# ts_vec = range(0,900001,100000)
+ts_vec = [60000] # Test with just one timestep
+
+# ts_vec = [10000]
+
+# set 1 to compute time averaged quantities, 0 otherwise
+timeaverage = 0
+
+# set 1 to compute fluctuating components, 0 otherwise (expensive)
+fluct = 0
+
+# value for the fontsize:
+fontsize_val = 10
+
+# show heatmaps
+showmaps_flag = 1
+
+# slice of slice (leave -1 to compute the mean)
+meanprof_slice = 0
+
+# value of the figure size
+figsize_val = 8
+
+# save figure in png format
+savefig_flag = 0
+
+# save data in h5 format
+savedata_flag = 0
+
+# Aspect ratio of the heatmaps (-1 for auto, i.e. proportional, WARNING: can't handle huge AR)
+AR_plot = -1
+
+# Set your desired color limits here
+vmin = None # e.g., vmin = 0.0
+vmax = None # e.g., vmax = 1.0
+
+# vmin = -1 # e.g., vmin = 0.0
+# vmax = 1 # e.g., vmax = 1.0
+
+#################################################################
+# AUXILIARY FUNCTIONS
+def get_spectrum(x, signal, title='Spectrum Analysis'):
+
+ # Compute the Fourier Transform
+ spectrum = np.fft.fft(signal)
+
+ # Calculate the amplitude and phase
+ amplitude = np.abs(spectrum)
+ phase = np.angle(spectrum)
+
+ # Frequency values corresponding to the FFT components
+ frequencies = np.fft.fftfreq(len(x), d=(x[1] - x[0]))
+
+ # cut negative freq
+ positive_freq_indices = frequencies >= 0
+ frequencies = frequencies[positive_freq_indices]
+ amplitude = amplitude[positive_freq_indices]
+ phase = phase[positive_freq_indices]
+
+ # Sort frequencies and corresponding arrays
+ sorted_indices = np.argsort(frequencies)
+ frequencies = frequencies[sorted_indices]
+ amplitude = amplitude[sorted_indices]
+ phase = phase[sorted_indices]
+
+ # crop 2/3 of freq
+ amplitude = amplitude[1:int(np.floor(len(frequencies)*2/3))]
+ phase = phase[1:int(np.floor(len(frequencies)*2/3))]
+ frequencies = frequencies[1:int(np.floor(len(frequencies)*2/3))]
+
+ wavenum = frequencies*2*np.pi
+ return [wavenum,amplitude,phase]
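+
+# example usage (hypothetical signal): spectrum of a single sine mode along z
+# k, A, ph = get_spectrum(z, np.sin(2.0*np.pi*z/Lz))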
+
+#################################################################
+
+# font = {'family' : 'normal',
+# 'weight' : 'normal',
+# 'size' : 6}
+
+plt.rcParams.update({'font.size': 5})
+
+# params_txt = {'axes.labelsize': 5,'axes.titlesize':5, 'text.fontsize': 5, 'legend.fontsize': 5, 'xtick.labelsize': 5, 'ytick.labelsize': 5}
+# plt.rcParams.update(params_txt)
+
+# resolve conflicts: if fluctuations are computed, the time average is needed too
+if fluct == 1:
+ timeaverage = 1
+
+if timeaverage == 1:
+ meansuff = '_MEAN'
+else:
+ meansuff = ''
+
+done = 0
+# GENERATED ON:
+from datetime import datetime
+
+# Print current date and time briefly
+print("Current date and time:", datetime.now())
+
+glob_profiles = []
+
+FLUCARRAYS_time = []
+
+x = np.linspace(0, Lx, nx)
+y = np.linspace(0, Ly, ny)
+z = np.linspace(0, Lz, nz)
+
+# Define the dimensions of the reshaped arrays (nvec)
+
+nvec = (nx, ny, nz) # grid sizes; the raw file is read as (nz, ny, nx) and transposed to [x, y, z]
+
+# nvec = (512, 513, 512) # Update with the actual dimensions
+# nvec = (256, 257, 256)
+# # nvec = (128, 129, 128)
+# nvec = (0, 0, 0)
+
+nx = nvec[0]
+ny = nvec[1]
+nz = nvec[2]
+
+for n_step in ts_vec:
+ file_names = []
+
+ for fld in fields:
+ file_names.append(fld + '_{:08d}.dat')
+
+ # Initialize an empty list to store the data arrays
+ data_arrays = []
+
+ # initialize arrays with sums if want to calculate time averages
+ if timeaverage == 1:
+ flucarray = []
+ if n_step == ts_vec[0]:
+ sumarray = []
+
+ # Read the data from each file and reshape
+ id_fnames = -1
+ for fld in fields:
+ file_name = f"{fld}_{n_step:08d}.dat"
+ file_path = foldername + file_name
+ id_fnames = id_fnames+1
+
+ # Check if file exists
+ if not os.path.exists(file_path):
+ print(f"Warning: File {file_path} not found, skipping...")
+ continue
+
+ with open(file_path, 'rb') as file:
+ total_elements = np.prod(nvec)
+ data = np.memmap(file, dtype=np.float64, mode='r', shape=(total_elements,))
+ data = data.reshape(np.flip(nvec))*1.0
+
+ data = data.transpose((2, 1, 0)) # Permute first and third axes to match the convention [x, y, z]
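+ # the Fortran writer stores the global array in column-major order (x fastest),
+ # so the flat file reads back as (nz, ny, nx) in C order before the transpose above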
+
+ # Validate data
+ print(f"Loaded {file_name}: shape={data.shape}, min={data.min():.6f}, max={data.max():.6f}, mean={data.mean():.6f}")
+
+ dersuff = ''
+ if derivative_vec[0] != 0:
+ dersuff = '_'
+ for ider in derivative_vec:
+ derivative_x, derivative_y, derivative_z = np.gradient(data, x, y, z)
+ if ider == 1:
+ data = derivative_x
+ dersuff = dersuff+'x'
+ elif ider == 2:
+ data = derivative_y
+ dersuff = dersuff+'y'
+ elif ider == 3:
+ data = derivative_z
+ dersuff = dersuff+'z'
+
+ # define axes:
+ if slice_dir == 1:
+ hor_name = 'y'
+ ver_name = 'z'
+ hor = y
+ nhor = ny
+ ver = z
+ nver = nz
+ elif slice_dir == 2:
+ hor_name = 'x'
+ ver_name = 'z'
+ hor = x
+ nhor = nx
+ ver = z
+ nver = nz
+ elif slice_dir == 3:
+ hor_name = 'x'
+ ver_name = 'y'
+ hor = x
+ nhor = nx
+ ver = y
+ nver = ny
+
+ if slice_idx == -1:
+ type_name = 'average'
+ if slice_dir == 1:
+ mean_array = np.mean(data, axis=0)
+ elif slice_dir == 2:
+ mean_array = np.mean(data, axis=1)
+ elif slice_dir == 3:
+ mean_array = np.mean(data, axis=2)
+
+ else:
+ type_name = 'slice'
+ if slice_dir == 1:
+ mean_array = data[slice_idx-1,:,:]
+ elif slice_dir == 2:
+ mean_array = data[:,slice_idx-1,:]
+ elif slice_dir == 3:
+ mean_array = data[:,:,slice_idx-1]
+
+ data_arrays.append(mean_array)
+
+ if timeaverage == 1:
+ if fluct == 1:
+ flucarray.append(mean_array)
+ if n_step == ts_vec[0]:
+ sumarray.append(mean_array)
+ else:
+ sumarray[id_fnames] = sumarray[id_fnames]+mean_array
+
+ if n_step == ts_vec[-1]:
+ data_arrays[id_fnames] = sumarray[id_fnames]/len(ts_vec)
+
+ FLUCARRAYS_time.append(flucarray)
+
+ # Plot each array as a heat map with coordinates
+
+ if timeaverage == 0 or n_step == ts_vec[-1]:
+ if showmaps_flag == 1:
+ for i, array in enumerate(data_arrays):
+ if array.ndim != 2:
+ print(f"Invalid shape for {file_name}")
+ continue
+
+ N = 500j
+ extent = (hor.min(),hor.max(),ver.min(),ver.max())
+
+ HOR,VER = np.meshgrid(hor,ver)
+ hors,vers = np.mgrid[extent[0]:extent[1]:N, extent[2]:extent[3]:N]
+
+ plt.figure(figsize=(figsize_val, figsize_val*0.12+figsize_val))
+
+ ax = plt.gca()
+ # im = ax.imshow(np.flip(resampled.T, axis = 0), cmap='jet', origin='lower', extent=extent, aspect='auto')#extent=[y.min(), y.max(), z.min(), z.max()])
+ # im = ax.imshow(resampled.T, cmap='jet', origin='lower', extent=extent, aspect='auto')#extent=[y.min(), y.max(), z.min(), z.max()])
+ # Use pcolormesh for non-uniform grids
+ X, Y = np.meshgrid(hor,ver)
+
+ im = ax.pcolormesh(X, Y, array.T, cmap='jet', shading='gouraud', vmin=vmin, vmax=vmax) # smoother shading
+
+ ax.set_aspect('equal') # <-- Add this line to set axis equal
+
+ plt.xlabel(hor_name, fontsize=fontsize_val)
+ plt.ylabel(ver_name, fontsize=fontsize_val)
+ plt.xticks(fontsize=fontsize_val, rotation=0)
+ plt.yticks(fontsize=fontsize_val, rotation=0)
+
+ # create an axes above ax for a horizontal colorbar
+ # (fixed figure coordinates; width 70% of the figure)
+ cax = plt.axes([0.15, 0.95, 0.7, 0.03])
+ plt.colorbar(im, cax=cax,orientation = "horizontal")
+ cax.xaxis.set_ticks_position('top')
+ plt.xticks(fontsize=fontsize_val, rotation=0)
+ plt.title(file_name+dersuff,fontsize=fontsize_val)
+
+ plt.show()
+
+ # save arrays and figure in the case of temporal average
+ if timeaverage == 1:
+ if savefig_flag == 1:
+ plt.savefig('./'+fields[i]+dersuff+'_'+str(ts_vec[0])+'_'+str(ts_vec[-1])+'.png', dpi=800)
+
+ # Save the arrays to an HDF5 file using h5py
+ if savedata_flag == 1:
+ with h5py.File('./'+fields[i]+dersuff+'_'+str(ts_vec[0])+'_'+str(ts_vec[-1])+'timeav.h5', 'w') as hf:
+ hf.create_dataset('data', data=array)
+ hf.create_dataset('x', data=hor)
+ hf.create_dataset('y', data=ver)
+
+
+ # # # # # PLOT ALL LINES IN THE SAME FIGURE
+ # # Plot the vertical profiles in the opposite direction with different colors
+ # plt.figure(figsize=(8, 6))
+
+ # for i, array in enumerate(data_arrays):
+ # if array.ndim != 2:
+ # print(f"Invalid shape for {file_name}")
+ # continue
+
+ # # Compute the vertical profile in the opposite direction
+ # vertical_profile = np.mean(array, axis=0)[::-1]
+
+ # # Plot the vertical profile with a different color for each array
+ # plt.plot(vertical_profile, ver, label=file_name, alpha=0.7)
+
+ # plt.title(fields[i]+'Profiles',fontsize=fontsize_val)
+ # plt.xlabel(fields[i] +'Mean',fontsize = fontsize_val)
+ # plt.ylabel(ver_name,fontsize = fontsize_val)
+ # plt.legend(loc ="best",fontsize = fontsize_val)
+ # plt.xticks(fontsize=fontsize_val, rotation=0)
+ # plt.yticks(fontsize=fontsize_val, rotation=0)
+ # plt.show()
+
+ # Plot the vertical profiles in the opposite direction with different colors
+
+ vertical_profile = []
+
+ for i, array in enumerate(data_arrays):
+ if array.ndim != 2:
+ print(f"Invalid shape for {file_name}")
+ continue
+
+ # Compute the vertical profile in the opposite direction
+ if meanprof_slice == -1:
+ vertical_profile.append(np.mean(array, axis=0))
+ horproflabelaux = ' Mean'
+ else:
+ vertical_profile.append(np.transpose(array[meanprof_slice-1,:]))
+ horproflabelaux = ' '
+
+
+ # plt.figure(figsize=(8, 6))
+
+ # # Plot the vertical profile with a different color for each array
+ # plt.plot(vertical_profile[i], ver, label=file_name, alpha=0.7)
+
+ # plt.title(fields[i]+' Profiles',fontsize=fontsize_val)
+ # plt.xlabel(fields[i] +' Mean',fontsize = fontsize_val)
+ # plt.ylabel(ver_name,fontsize = fontsize_val)
+ # plt.legend(loc ="best",fontsize = fontsize_val)
+ # plt.xticks(fontsize=fontsize_val, rotation=0)
+ # plt.yticks(fontsize=fontsize_val, rotation=0)
+ # plt.show()
+
+ glob_profiles.append(vertical_profile)
+
+prf_plt = []
+
+if fluct == 1:
+ for i, array in enumerate(data_arrays):
+ mean = data_arrays[id_fnames]
+ rmsfluc = np.zeros_like(mean)
+ for j in range(len(ts_vec)):
+ flucs = FLUCARRAYS_time[j]
+ singfield = flucs[i]
+ rmsfluc = rmsfluc+np.square(singfield-mean)
+ rmsfluc = np.sqrt(rmsfluc/len(ts_vec))
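+ # rmsfluc now holds sqrt(<(f - <f>_t)^2>_t), the temporal RMS of the fluctuation field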
+
+ # PLOT RMS FIELD
+ N = 500j
+ extent = (hor.min(),hor.max(),ver.min(),ver.max())
+
+ HOR,VER = np.meshgrid(hor, ver)
+ hors,vers = np.mgrid[extent[0]:extent[1]:N, extent[2]:extent[3]:N]
+ aaa = np.ones((len(HOR.flatten()),2))
+ aaa[:,0] = HOR.flatten()
+ aaa[:,1] = VER.flatten()
+
+ resampled = griddata( aaa, rmsfluc.T.flatten(), (hors, vers), method='linear')
+
+ if done == 0:
+ done = 1
+ if AR_plot == -1:
+ AR_plot = 1/(hor.max()-hor.min())*(ver.max()-ver.min())
+ AR_plt_flag = 0
+ else:
+ AR_plt_flag = 1
+
+ plt.figure(figsize=(figsize_val, figsize_val*0.12+figsize_val*AR_plot))
+
+ ax = plt.gca()
+ if AR_plt_flag == 1:
+ # im = ax.imshow(np.flip(resampled.T, axis = 0), cmap='jet', origin='lower', extent=extent, aspect='auto')#extent=[y.min(), y.max(), z.min(), z.max()])
+ im = ax.imshow(resampled.T, cmap='jet', origin='lower', extent=extent, aspect='auto')#extent=[y.min(), y.max(), z.min(), z.max()])
+ else:
+ # im = ax.imshow(np.flip(resampled.T, axis = 0), cmap='jet', origin='lower', extent=extent, aspect=1)#extent=[y.min(), y.max(), z.min(), z.max()])
+ im = ax.imshow(resampled.T, cmap='jet', origin='lower', extent=extent, aspect=1)#extent=[y.min(), y.max(), z.min(), z.max()])
+
+ # plt.imshow(np.flip(resampled.T, axis = 0), cmap='jet', origin='lower', extent=extent)#extent=[y.min(), y.max(), z.min(), z.max()])
+ # plt.colorbar(orientation = "horizontal")
+
+ # plt.title(file_name,fontsize=fontsize_val)
+ plt.xlabel(hor_name,fontsize=fontsize_val)
+ plt.ylabel(ver_name,fontsize=fontsize_val)
+ plt.xticks(fontsize=fontsize_val, rotation=0)
+ plt.yticks(fontsize=fontsize_val, rotation=0)
+
+ # create an axes above ax for the horizontal colorbar; the height of cax is 5%
+ # of ax and the padding between cax and ax is fixed at 0.05 inch.
+ divider = make_axes_locatable(ax)
+ cax = divider.append_axes("top", size="5%", pad=0.05)
+ plt.colorbar(im, cax=cax,orientation = "horizontal")
+ cax.xaxis.set_ticks_position('top')
+ plt.xticks(fontsize=fontsize_val, rotation=0)
+ plt.title(file_name+dersuff+'_RMS',fontsize=fontsize_val)
+
+ plt.show()
+
+ # SAVE FLUCTUATION FIELDS
+ if savedata_flag == 1:
+ with h5py.File('./'+fields[i]+dersuff+'_'+str(ts_vec[0])+'_'+str(ts_vec[-1])+'time_rms.h5', 'w') as hf:
+ hf.create_dataset('data', data=rmsfluc)
+ hf.create_dataset('x', data=hor)
+ hf.create_dataset('y', data=ver)
+
+numfields = len(glob_profiles[0])
+
+for i in range(numfields):
+
+ plt.figure(figsize=(figsize_val, 9))
+
+ for j in range(len(ts_vec)):
+
+ profile_sets = glob_profiles[j]
+ single_profile = profile_sets[i]
+
+ # plt.figure(figsize=(8, 6))
+
+ # Plot the vertical profile with a different color for each array
+ plt.plot(single_profile, ver, label=fields[i]+'_'+str(ts_vec[j]), alpha=0.7)
+
+
+ if fld == 'u' and slice_dir != 3:
+ zmin = np.min(ver)
+ zmax = np.max(ver)
+ zc = 0.5 * (zmin + zmax)
+ zlen = zmax - zmin
+
+ # Parabolic profile: u(z) = umax * (1 - ((z - zc)/H)^2), with H = zlen/2
+ H = zlen / 2.0
+ umax = 1.0 / (2.0 * 1.0) # assuming mu = rho = 1 → umax = 1/(2μ)
+ zvals = np.linspace(zmin, zmax, 256)
+ u_analytical = umax * (1.0 - ((zvals - zc)/H)**2)
+
+ plt.plot(u_analytical, zvals, 'r--', linewidth=2, label='Poiseuille (analytical)')
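+
+ # note: for plane Poiseuille flow u(z) = G/(2*mu)*(H^2 - (z - zc)^2), i.e. umax = G*H^2/(2*mu);
+ # the value above additionally assumes a unit pressure gradient, G = 1, and H = 1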
+
+ plt.title(fields[i]+dersuff+' Profiles',fontsize=fontsize_val)
+ plt.xlabel(fields[i]+dersuff + horproflabelaux,fontsize = fontsize_val)
+ plt.ylabel(ver_name,fontsize = fontsize_val)
+ # plt.legend(loc ="best",fontsize = fontsize_val)
+ plt.legend(bbox_to_anchor=(1.00, 1.02), loc="upper left",fontsize = fontsize_val)
+ plt.subplots_adjust(right=0.70)
+ plt.xticks(fontsize=fontsize_val, rotation=0)
+ plt.yticks(fontsize=fontsize_val, rotation=0)
+ plt.show()
+
+for i in range(numfields):
+
+ plt.figure(figsize=(figsize_val, 8))
+
+ for j in range(len(ts_vec)):
+
+ profile_sets = glob_profiles[j]
+ single_profile = profile_sets[i]
+
+ # plt.figure(figsize=(8, 6))
+
+ # Plot the vertical profile with a different color for each array
+ [freq,amp,phase] = get_spectrum(ver,single_profile)
+
+ # Plot amplitude on log scale
+ plt.subplot(2, 1, 1)
+ plt.plot(freq, amp,label=fields[i]+'_'+str(ts_vec[j]), alpha=0.7)
+ plt.xscale('log')
+ if np.any(amp > 0): # Only use log scale if there are positive values
+ plt.yscale('log')
+ plt.title('Amplitude')
+ plt.xlabel('k_'+ver_name)
+ plt.ylabel('Amplitude')
+
+ # Plot phase on log scale
+ plt.subplot(2, 1, 2)
+ plt.plot(freq, phase,label=fields[i]+'_'+str(ts_vec[j]), alpha=0.7)
+ plt.xscale('log')
+ plt.title('Phase')
+ plt.xlabel('k_'+ver_name)
+ plt.ylabel('Phase (radians)')
+
+ # Adjust layout and show the plot
+ plt.subplot(2, 1, 1)
+ plt.legend(bbox_to_anchor=(1.00, 1.02), loc="upper left",fontsize = fontsize_val)
+ plt.tight_layout()
+ plt.show()
diff --git a/channel/plot_profiles.py b/channel/plot_profiles.py
new file mode 100644
index 0000000..086c742
--- /dev/null
+++ b/channel/plot_profiles.py
@@ -0,0 +1,163 @@
+# # %%
+# %%html
+#
+
+# %%
+# %matplotlib widget
+# %matplotlib inline
+
+import ipympl
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.style as style
+# from ing_theme_matplotlib import mpl_style
+# from qbstyles import mpl_style
+
+import os
+import glob
+
+from scipy.interpolate import griddata
+
+
+from matplotlib.widgets import Cursor
+
+import h5py
+
+plt.style.use("dark_background")
+# mpl_style(dark=True)
+
+
+#################################################################
+foldername = './output/'
+
+# PARAMETERS:
+
+# fields to plot
+fields = ['theta']
+
+# number of points in each direction
+# Grid parameters (user-defined)
+nx = 256 # number of points in x
+ny = 128 # number of points in y
+nz = 200 # number of points in z
+
+# override: values actually used below
+nx = 2 # number of points in x
+ny = 2 # number of points in y
+nz = 1000 # number of points in z
+
+Lx = 6.0 # length of domain in x
+Ly = 3.0 # length of domain in y
+Lz = 2.0 # length of domain in z
+
+# compute the derivative of the fields (show these instead of the normal fields)
+# 0: no derivative
+# 1: x derivative
+# 2: y derivative
+# 3: z derivative
+# list more flags to compute consecutive derivatives (first-order FD)
+derivative_vec = [0]
+
+# # normal direction of the 2D slice:
+# 1: x-direction
+# 2: y-direction
+# 3: z-direction
+slice_dir = 2
+
+# index to take the slice (from 1 to nx_i, choose -1 for computing the average)
+slice_idx = 0
+
+# slice_idx = 222
+
+# time_steps to plot
+ts_vec = range(0,230000,10000)
+
+# ts_vec = [10000]
+
+# set 1 to compute time averaged quantities, 0 otherwise
+timeaverage = 0
+
+# set 1 to compute fluctuating components, 0 otherwise (expensive)
+fluct = 0
+
+# value for the fontsize:
+fontsize_val = 10
+
+x = np.linspace(0, Lx, nx)
+y = np.linspace(0, Ly, ny)
+z = np.linspace(0, Lz, nz)
+
+# Define the dimensions of the reshaped arrays (nvec)
+
+nvec = (nx, ny, nz) # grid sizes; the raw file is read as (nz, ny, nx)
+
+# nvec = (512, 513, 512) # Update with the actual dimensions
+# nvec = (256, 257, 256)
+# # nvec = (128, 129, 128)
+# nvec = (0, 0, 0)
+
+nx = nvec[0]
+ny = nvec[1]
+nz = nvec[2]
+
+id_fnames = -1
+
+for fld in fields:
+
+ id_fnames = id_fnames+1
+
+
+ plt.figure(figsize=(10, 9))
+
+ for n_step in ts_vec:
+ file_names = []
+
+ file_names.append(fld + '_{:08d}.dat')
+
+
+ # Read the data from each file and reshape
+
+ file_name = f"{fld}_{n_step:08d}.dat"
+ file_path = foldername + file_name
+
+ # Check if file exists
+ if not os.path.exists(file_path):
+ print(f"Warning: File {file_path} not found, skipping...")
+ continue
+
+ with open(file_path, 'rb') as file:
+ total_elements = np.prod(nvec)
+ data = np.memmap(file, dtype=np.float64, mode='r', shape=(total_elements,))
+
+ data = data.reshape(np.flip(nvec))*1.0
+
+ # Validate data
+ print(f"Loaded {file_name}: shape={data.shape}, min={data.min():.6f}, max={data.max():.6f}, mean={data.mean():.6f}")
+
+ prof = np.mean(data, axis=(1, 2)) # average over y and x -> profile along z
+
+ # prof = np.flip(np.mean(data, axis=(0, 2)))
+
+ plt.plot(prof, z, label=f'{fld}_{n_step}', alpha=0.7)
+
+ plt.title(fld+' Profiles',fontsize=fontsize_val)
+ plt.xlabel(fld,fontsize = fontsize_val)
+ plt.ylabel("z",fontsize = fontsize_val)
+ # plt.legend(loc ="best",fontsize = fontsize_val)
+ plt.legend(bbox_to_anchor=(1.00, 1.02), loc="upper left",fontsize = fontsize_val)
+ plt.subplots_adjust(right=0.70)
+ plt.xticks(fontsize=fontsize_val, rotation=0)
+ plt.yticks(fontsize=fontsize_val, rotation=0)
+ plt.show()
diff --git a/channel/post.py b/channel/post.py
new file mode 100644
index 0000000..4bcd91d
--- /dev/null
+++ b/channel/post.py
@@ -0,0 +1,35 @@
+# %%
+import numpy as np
+import matplotlib.pyplot as plt
+
+nx, ny, nz = 256, 128, 200
+
+data = np.fromfile('output/u_00005000.dat', dtype=np.float64) # double-precision output
+print("Data size:", data.size)
+data = data.reshape((nz, ny, nx))
+
+slice_index = ny // 2 # middle of y
+slice_data = data[ :,slice_index, :] # xz slice at fixed y
+
+print("Max:", np.max(data))
+print("Min:", np.min(data))
+plt.figure(figsize=(14,6))
+plt.imshow(slice_data, cmap='jet', origin='lower', aspect=3.14)
+plt.colorbar(label='Velocity')
+plt.xlabel('x')
+plt.ylabel('z')
+plt.axis('scaled')
+plt.show()
+#1D plot
+#line_data = data[1, 1, : ] # middle column
+#plt.subplot(1, 2, 2)
+#plt.plot(line_data, np.arange(ny), color='black')
+#plt.xlabel('Variable')
+#plt.ylabel('y')
+#plt.title('1D Profile along y (x = nx/2)')
+#plt.grid(True)
+
+#plt.tight_layout()
+#plt.show()
+
+# %%
diff --git a/channel/readinput.f90 b/channel/readinput.f90
new file mode 100644
index 0000000..b792e99
--- /dev/null
+++ b/channel/readinput.f90
@@ -0,0 +1,155 @@
+
+!##########################################################################
+!###########################################################################
+subroutine readinput
+use velocity
+use phase
+use temperature
+use param
+use mpivar
+implicit none
+integer :: i,j,k
+double precision :: zk
+
+open(unit=55,file='input.inp',form='formatted',status='old')
+!Time step parameters
+read(55,*) restart
+read(55,*) tstart
+read(55,*) tfin
+read(55,*) dump
+! Domain size
+read(55,*) lx
+read(55,*) ly
+read(55,*) lz
+read(55,*) csi
+!Flow parameters
+read(55,*) inflow
+read(55,*) inphi
+read(55,*) intheta
+read(55,*) dt
+read(55,*) mu
+read(55,*) rho
+! forcing parameters
+read(55,*) gradpx
+read(55,*) gradpy
+! phase-field parameters
+read(55,*) radius
+read(55,*) sigma
+read(55,*) epsr
+! temperature parameters
+read(55,*) kappa
+read(55,*) alphag
+close(55)
+
+! compute pre-defined constants
+twopi=8.0_8*atan(1.0_8)
+pi=twopi/2.d0
+dx = lx/nx
+dy = ly/ny
+dxi = 1.d0/dx
+dyi = 1.d0/dy
+ddxi = 1.d0/dx/dx
+ddyi = 1.d0/dy/dy
+rhoi=1.d0/rho
+eps=epsr*dx
+epsi=1.d0/eps
+enum=1.e-16
+!write(*,*) "Check on stability", dt*mu*dzi*dzi
+
+if (rank .eq. 0) then
+ !enable/disable for debug check parameters
+ write(*,*) "----------------------------------------------"
+ write(*,*) "███ ███ ██ ██ ██ ████████ ██████ ██████"
+ write(*,*) "████ ████ ██ ██ ██ ██ ██ ██"
+ write(*,*) "██ ████ ██ ███████ ██ ██ █████ ███████"
+ write(*,*) "██ ██ ██ ██ ██ ██ ██ ██ ██ ██"
+ write(*,*) "██ ██ ██ ██ ██ ██ ██████ ██████"
+ write(*,*) "----------------------------------------------"
+ write(*,*) "-------------Channel flow setup---------------"
+ write(*,*) 'Grid:', nx, 'x', ny, 'x', nz
+ write(*,*) "Restart ", restart
+ write(*,*) "Tstart ", tstart
+ write(*,*) "Tfin ", tfin
+ write(*,*) "Dump ", dump
+ write(*,*) "Inflow ", inflow
+ write(*,*) "Deltat ", dt
+ write(*,*) "Mu ", mu
+ write(*,*) "Rho ", rho
+ write(*,*) "Gradpx ", gradpx
+ write(*,*) "Gradpy ", gradpy
+ write(*,*) "Radius ", radius
+ write(*,*) "Sigma ", sigma
+ write(*,*) "Eps ", eps
+ write(*,*) "Kappa ", kappa
+ write(*,*) "Alphag ", alphag
+ write(*,*) "Lx ", lx
+ write(*,*) "Ly ", ly
+ write(*,*) "Lz ", lz
+ write(*,*) "Z-stretch ", csi
+! write(*,*) "dx", dx
+! write(*,*) "dxi", dxi
+! write(*,*) "ddxi", ddxi
+! write(*,*) "dy", dx
+! write(*,*) "dyi", dyi
+! write(*,*) "ddyi", ddyi
+! write(*,*) "dz", dz
+! write(*,*) "dzi", dzi
+! write(*,*) "ddzi", ddzi
+endif
+
+
+
+! define wavenumbers and grid points axis
+! define grid (then move in readinput)
+allocate(x(nx),y(ny),z(0:nz+1),dzci(nz),dzi(nz+1),kx(nx),ky(ny))
+! location of the pressure nodes (cell centers)
+x(1)=dx/2
+do i = 2, nx
+ x(i) = x(i-1) + dx
+enddo
+y(1)=dy/2
+do j = 2, ny
+ y(j) = y(j-1) + dy
+enddo
+! stretched grid along z; the z axis also includes the two ghost nodes located half a cell below the bottom wall and above the top wall
+do k = 1, nz
+ zk=(dble(k)-0.5d0)/dble(nz)
+ z(k) = 0.5d0*dble(lz)*(1.d0+tanh(csi*(zk-0.5d0))/tanh(0.5d0*csi))
+enddo
+z(0)=-z(1)
+z(nz+1)= lz+(lz-z(nz))
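+! note: as csi -> 0 the tanh mapping tends to a uniform grid (z ~ lz*zk), while larger csi clusters points near the two walls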
+! compute inverse of dz (between cell faces)
+dzci(1) = 2.d0/(z(1)+ z(2))
+dzci(nz) = 1.d0/(lz-0.5d0*(z(nz)+z(nz-1)))
+do k = 2, nz-1
+ dzci(k) = 2.d0/(z(k+1)-z(k-1))
+enddo
+! compute inverse of dz (between nodes)
+dzi(1)=0.5d0/z(1)
+!dzi(nz+1)=0.5d0/(lz-z(nz))
+do k=1, nz+1
+ dzi(k) = 1.d0/(z(k)-z(k-1))
+enddo
+!write(*,*) "dzi", dzi
+! wavenumber
+do i = 1, nx/2
+ kx(i) = (i-1)*(twopi/lx)
+enddo
+do i = nx/2+1, nx
+ kx(i) = (i-1-nx)*(twopi/lx)
+enddo
+do j = 1, ny/2
+ ky(j) = (j-1)*(twopi/ly)
+enddo
+do j = ny/2+1, ny
+ ky(j) = (j-1-ny)*(twopi/ly)
+enddo
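+! kx and ky follow the standard FFT ordering: non-negative modes 0..n/2-1 first, then the negative modes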
+! allocate kx_d and ky_d on the device
+allocate(kx_d, source=kx)
+allocate(ky_d, source=ky)
+
+
+
+end subroutine
+
+
diff --git a/channel/readwrite.f90 b/channel/readwrite.f90
new file mode 100644
index 0000000..3b54006
--- /dev/null
+++ b/channel/readwrite.f90
@@ -0,0 +1,356 @@
+subroutine writefield(t,fieldn)
+! Output field, file is written in the ./output folder
+
+use velocity
+use phase
+use temperature
+use mpi
+use mpivar
+use param
+use cudecompvar
+
+implicit none
+
+integer :: g_size(3),p_size(3),fstart(3)
+integer :: t,fieldn
+character(len=40) :: namefile
+integer(mpi_offset_kind) :: offset=0
+integer :: f_handle ! file handle
+integer :: ftype
+double precision, allocatable :: out(:,:,:)
+
+! fieldn=1 means u
+! fieldn=2 means v
+! fieldn=3 means w
+! fieldn=4 means p
+! fieldn=5 means phi
+! fieldn=6 means theta
+
+! define basic quantities to be used later (global and pencil size)
+g_size=[nx, ny, nz] ! global size
+p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x
+fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and indices start from 0 (not 1)
+! for debug
+!write(*,*) "g_size", g_size
+!write(*,*) "p_size", p_size
+!write(*,*) "fstart", fstart
+allocate(out(p_size(1),p_size(2),p_size(3))) !<- halo removed
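+! each rank writes its interior pencil into one shared file through an MPI subarray view,
+! so the resulting .dat file is a single Fortran-ordered nx*ny*nz array of float64
+! (the layout assumed by the Python post-processing scripts)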
+
+!write(*,*) "in readwrite"
+
+if (fieldn .eq. 1) then
+ out=u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/u_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 2) then
+ out=v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/v_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 3) then
+ out=w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/w_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 4) then
+ out=p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/p_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 5) then
+ out=phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/phi_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 6) then
+ out=theta(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/theta_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+deallocate(out)
+
+end subroutine
+
+
+
+
+
+
+
+
+
+
+subroutine readfield(fieldn)
+! Input field, file is read from the ./input folder
+
+use velocity
+use phase
+use temperature
+use mpi
+use mpivar
+use param
+use cudecompvar
+
+implicit none
+
+integer :: g_size(3),p_size(3),fstart(3)
+integer :: fieldn
+character(len=40) :: namefile
+integer(mpi_offset_kind) :: offset=0
+integer :: f_handle ! file handle
+integer :: ftype
+double precision, allocatable :: in(:,:,:)
+
+! fieldn=1 means u
+! fieldn=2 means v
+! fieldn=3 means w
+! fieldn=4 means p
+! fieldn=5 means phi
+! fieldn=6 means theta
+
+! define basic quantities to be used later (global and pencil size)
+g_size=[nx, ny, nz] ! global size
+p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x
+fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and indices start from 0 (not 1)
+! for debug
+!write(*,*) "g_size", g_size
+!write(*,*) "p_size", p_size
+!write(*,*) "fstart", fstart
+allocate(in(p_size(1),p_size(2),p_size(3))) !<- no halos read
+
+!write(*,*) "in readwrite"
+
+if (fieldn .eq. 1) then
+ namefile='./input/u.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 2) then
+ namefile='./input/v.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 3) then
+ namefile='./input/w.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 4) then
+ namefile='./input/p.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 5) then
+ namefile='./input/phi.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 6) then
+ namefile='./input/theta.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ theta(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+deallocate(in)
+
+end subroutine
+
+
+
+
+
+
+
+
+
+
+subroutine readfield_restart(t,fieldn)
+! Used in case of restart, file is read from the ./output folder (iteration tstart must be present!)
+
+use velocity
+use phase
+use temperature
+use mpi
+use mpivar
+use param
+use cudecompvar
+
+implicit none
+
+integer :: g_size(3),p_size(3),fstart(3)
+integer :: t,fieldn
+character(len=40) :: namefile
+integer(mpi_offset_kind) :: offset=0
+integer :: f_handle ! file handle
+integer :: ftype
+double precision, allocatable :: in(:,:,:)
+
+! fieldn=1 means u
+! fieldn=2 means v
+! fieldn=3 means w
+! fieldn=4 means p
+! fieldn=5 means phi
+! fieldn=6 means theta
+
+! define basic quantities to be used later (global and pencil size)
+g_size=[nx, ny, nz] ! global size
+p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x
+fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and indices start from 0 (not 1)
+! for debug
+!write(*,*) "g_size", g_size
+!write(*,*) "p_size", p_size
+!write(*,*) "fstart", fstart
+allocate(in(p_size(1),p_size(2),p_size(3))) !<- no halos read
+
+!write(*,*) "in readwrite"
+
+if (fieldn .eq. 1) then
+ write(namefile,'(a,i8.8,a)') './output/u_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 2) then
+ write(namefile,'(a,i8.8,a)') './output/v_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 3) then
+ write(namefile,'(a,i8.8,a)') './output/w_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 4) then
+ write(namefile,'(a,i8.8,a)') './output/p_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 5) then
+ write(namefile,'(a,i8.8,a)') './output/phi_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 6) then
+ write(namefile,'(a,i8.8,a)') './output/theta_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ theta(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+deallocate(in)
+
+end subroutine
\ No newline at end of file
diff --git a/channel/restart_local.sh b/channel/restart_local.sh
new file mode 100755
index 0000000..f761baf
--- /dev/null
+++ b/channel/restart_local.sh
@@ -0,0 +1,10 @@
+NVARCH=Linux_x86_64; export NVARCH
+NVCOMPILERS=/opt/nvidia/hpc_sdk; export NVCOMPILERS
+MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/compilers/man; export MANPATH
+PATH=$NVCOMPILERS/$NVARCH/25.7/compilers/bin:$PATH; export PATH
+export PATH=$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/bin:$PATH
+export MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/man
+export LD_LIBRARY_PATH=/home/milton/MHIT36/cuDecomp/build/lib
+
+
+mpirun -np 2 ./tcf36
\ No newline at end of file
diff --git a/channel/tdma.f90 b/channel/tdma.f90
new file mode 100644
index 0000000..2cde690
--- /dev/null
+++ b/channel/tdma.f90
@@ -0,0 +1,312 @@
+! Subroutines still under development, not included in the makefile at the moment (7/8/25)
+subroutine tdmau
+ ! TDMA for u velocity, solve from 1 to nz where 1 is the u collocation point closest to the bottom wall and nz the one closest to the top wall
+ ! Load all variables via modules
+ ! move pencil information in a module
+ ! Set up tridiagonal system for each i and j
+ ! The system is: a(k)*u(k-1,il,jl) + b(k)*u(k,il,jl) + c(k)*u(k+1,il,jl) = d(k)
+ ! Dirichlet BC: u = 0 at k=1 and k=nz
+ ! a, b, c, d are complex; they could be declared real for optimization, but this may introduce additional data movement; better to always use the same a, b, c, d?
+
+ use velocity
+ implicit none
+ integer :: il, jl, k
+ call c_f_pointer(c_devloc(vel_d), vel3d, piZ%shape)
+
+ !$acc parallel loop collapse(2) gang private(a, b, c, d, factor, sol)
+ do jl = 1, npy
+ do il = 1, npx
+
+ !inner nodes
+ do k = 2, nz-1
+ a(k) = 1.0d0*dzi*dzi
+ b(k) = -2.0d0*dzi*dzi
+ c(k) = 1.0d0*dzi*dzi
+ d(k) = u(k,il,jl) ! check if correct and consistent with CN
+ enddo
+
+ ! Dirilecht BC at k=1
+ a(1) = 0.0d0
+ b(1) = 1.0d0
+ c(1) = 0.0d0
+ d(1) = 0.0d0
+
+ ! Dirichlet BC at k=nz
+ a(nz) = 0.0d0
+ b(nz) = 1.0d0
+ c(nz) = 0.0d0
+ d(nz) = 0.0d0
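+ ! Thomas algorithm below: O(nz) forward sweep plus back substitution for each (il,jl) column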
+
+ ! Forward elimination (Thomas)
+ !$acc loop seq
+ do k = 2, nz
+ factor = a(k)/b(k-1)
+ b(k) = b(k) - factor*c(k-1)
+ d(k) = d(k) - factor*d(k-1)
+ end do
+
+ ! Back substitution
+ sol(nz) = d(nz)/b(nz)
+ ! check on pivot like flutas?
+ !$acc loop seq
+ do k = nz-1, 1, -1
+ sol(k) = (d(k) - c(k)*sol(k+1))/b(k)
+ end do
+
+ ! Store solution in the array used for the back transform
+ do k=1,nz
+ vel3d(k,il,jl) = sol(k)
+ enddo
+ end do
+ end do
+end subroutine
+
+
+
+
+
+
+
+
+
+
+
+
+subroutine tdmav
+ ! TDMA for v velocity, solve from 1 to nz where 1 is the v collocation point on the bottom and nz the upper one
+ ! Load all variables via modules
+ ! move pencil information in a module
+ ! Set up tridiagonal system for each i and j
+ ! The system is: a(k)*v(k-1,il,jl) + b(k)*v(k,il,jl) + c(k)*v(k+1,il,jl) = d(k)
+ ! Dirichlet BC: v = 0 at k=1 and k=nz
+ ! a, b, c, d are complex; they could be declared real for optimization, but this may introduce additional data movement; better to always use the same a, b, c, d?
+ ! Same as u but just the RHS is different
+
+ use velocity
+ implicit none
+ integer :: il, jl, k
+ call c_f_pointer(c_devloc(vel_d), vel3d, piZ%shape)
+
+ !$acc parallel loop collapse(2) gang private(a, b, c, d, factor, sol)
+ do jl = 1, npy
+ do il = 1, npx
+
+ !inner nodes
+ do k = 2, nz-1
+ a(k) = 1.0d0*dzi*dzi
+ b(k) = -2.0d0*dzi*dzi
+ c(k) = 1.0d0*dzi*dzi
+ d(k) = v(k,il,jl) ! check if correct and consistent with CN
+ enddo
+
+ ! Dirichlet BC at k=1
+ a(1) = 0.0d0
+ b(1) = 1.0d0
+ c(1) = 0.0d0
+ d(1) = 0.0d0
+
+ ! Dirichlet BC at k=nz
+ a(nz) = 0.0d0
+ b(nz) = 1.0d0
+ c(nz) = 0.0d0
+ d(nz) = 0.0d0
+
+ ! Forward elimination (Thomas)
+ !$acc loop seq
+ do k = 2, nz
+ factor = a(k)/b(k-1)
+ b(k) = b(k) - factor*c(k-1)
+ d(k) = d(k) - factor*d(k-1)
+ end do
+
+ ! Back substitution
+ sol(nz) = d(nz)/b(nz)
+ ! check on pivot like flutas?
+ !$acc loop seq
+ do k = nz-1, 1, -1
+ sol(k) = (d(k) - c(k)*sol(k+1))/b(k)
+ end do
+
+ ! Store solution in the array used for the back transform
+ do k=1,nz
+ vel3d(k,il,jl) = sol(k)
+ enddo
+ end do
+ end do
+end subroutine
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+subroutine tdmaw
+ ! TDMA for w velocity, solve from 1 to nz+1 where 1 is the w collocation point on the bottom wall and nz+1 the one on the top wall
+ ! Load all variables via modules
+ ! move pencil information in a module
+ ! Set up tridiagonal system for each i and j
+ ! The system is: a(k)*w(k-1,il,jl) + b(k)*w(k,il,jl) + c(k)*w(k+1,il,jl) = d(k)
+ ! Dirichlet BC: w = 0 at k=1 and k=nz+1
+ ! a, b, c, d are complex; they could be declared real for optimization, but this may introduce additional data movement; better to always use the same a, b, c, d?
+
+ use velocity
+ implicit none
+ integer :: il, jl, k
+ call c_f_pointer(c_devloc(vel_d), vel3d, piZ%shape)
+
+ !$acc parallel loop collapse(2) gang private(a, b, c, d, factor, sol)
+ do jl = 1, npy
+ do il = 1, npx
+
+ !inner nodes
+ do k = 2, nz
+ a(k) = 1.0d0*dzi*dzi
+ b(k) = -2.0d0*dzi*dzi
+ c(k) = 1.0d0*dzi*dzi
+ d(k) = w(k,il,jl) ! check if correct and consistent with CN
+ enddo
+
+ ! Dirichlet BC at k=1
+ a(1) = 0.0d0
+ b(1) = 1.0d0
+ c(1) = 0.0d0
+ d(1) = 0.0d0
+
+ ! Dirichlet BC at k=nz+1
+ a(nz+1) = 0.0d0
+ b(nz+1) = 1.0d0
+ c(nz+1) = 0.0d0
+ d(nz+1) = 0.0d0
+
+ ! Forward elimination (Thomas)
+ !$acc loop seq
+ do k = 2, nz+1
+ factor = a(k)/b(k-1)
+ b(k) = b(k) - factor*c(k-1)
+ d(k) = d(k) - factor*d(k-1)
+ end do
+
+ ! Back substitution
+ sol(nz+1) = d(nz+1)/b(nz+1)
+ ! check on pivot like flutas?
+ !$acc loop seq
+ do k = nz, 1, -1
+ sol(k) = (d(k) - c(k)*sol(k+1))/b(k)
+ end do
+
+ ! Store solution in the array used for the back transform
+ ! only up to nz, because the halo is not transposed; it could be eventually
+ do k=1,nz
+ vel3d(k,il,jl) = sol(k)
+ enddo
+ end do
+ end do
+end subroutine
+
+
+
+
+
+
+
+
+subroutine tdmap
+ ! TDMA for pressure, solve from 0 to nz+1 where 0 is the first ghost node and nz+1 the upper ghost node
+ ! When ready, remove it from the main code and just call this function
+ ! Load all variables via modules
+ ! move pencil information in a module
+ ! Set up tridiagonal system for each i and j
+ ! The system is: a(k)*pc(k-1,ky,kx) + b(k)*pc(k,ky,kx) + c(k)*pc(k+1,ky,kx) = rhs(k,ky,kx)
+ ! Neumann BC: d/dz pc = 0 at w collocation points
+ ! Fill diagonals and rhs for each
+ ! 0 and ny+1 are the ghost nodes
+ ! a, b, c and are complex (because of FFT)
+
+ use velocity
+
+ implicit none
+
+ offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1
+ offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1
+ offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1
+ call c_f_pointer(c_devloc(psi_d), psi3d, piZ_d2z%shape)
+
+ xoff = offsets(1)
+ yoff = offsets(2)
+ npx = np(1)
+ npy = np(2)
+
+ !$acc parallel loop collapse(2) gang private(a, b, c, d, factor, sol)
+ do jl = 1, npy
+ do il = 1, npx
+ ! compute index global wavenumber ig and jg
+ jg = yoff + jl
+ ig = xoff + il
+
+ do k = 1, nz
+ a(k) = 1.0d0*dzi*dzi
+ b(k) = -2.0d0*dzi*dzi - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg)
+ c(k) = 1.0d0*dzi*dzi
+ d(k) = psi3d(k,il,jl)
+ enddo
+
+ ! Neumann BC at bottom
+ a(0) = 0.0d0
+ b(0) = -1.0d0*dzi*dzi - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg)
+ c(0) = 1.0d0*dzi*dzi
+ d(0) = 0.0d0
+
+ ! ghost node elimination trick
+ ! Neumann BC at top
+ a(nz+1) = 1.0d0*dzi*dzi
+ b(nz+1) = -1.0d0*dzi*dzi - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg)
+ c(nz+1) = 0.0d0
+ d(nz+1) = 0.0d0
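+ ! (ghost rows: the missing-neighbour coefficient is absorbed into the diagonal,
+ ! which enforces a zero normal derivative of the pressure across each wall)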
+
+ ! Enforce pressure at one point? one interior point, avoiding messing up the BC
+ ! need brackets?
+ if (ig == 1 .and. jg == 1) then
+ a(1) = 0.d0
+ b(1) = 1.d0
+ c(1) = 0.d0
+ d(1) = 0.d0
+ end if
+
+ ! Forward elimination (Thomas)
+ !$acc loop seq
+ do k = 1, nz+1
+ factor = a(k)/b(k-1)
+ b(k) = b(k) - factor*c(k-1)
+ d(k) = d(k) - factor*d(k-1)
+ end do
+
+ ! Back substitution
+ sol(nz+1) = d(nz+1)/b(nz+1)
+ ! check on pivot like flutas?
+ !$acc loop seq
+ do k = nz, 0, -1
+ sol(k) = (d(k) - c(k)*sol(k+1))/b(k)
+ end do
+
+ ! Store solution in the array that does the backward FFT
+ do k=1,nz
+ psi3d(k,il,jl) = sol(k)
+ enddo
+ end do
+ end do
+
+
+end subroutine
\ No newline at end of file
diff --git a/channel/testpush.sh b/channel/testpush.sh
new file mode 100644
index 0000000..2e0e21a
--- /dev/null
+++ b/channel/testpush.sh
@@ -0,0 +1,2 @@
+make clean
+make
diff --git a/cuDecomp b/cuDecomp
index 7968153..907b18e 160000
--- a/cuDecomp
+++ b/cuDecomp
@@ -1 +1 @@
-Subproject commit 796815382081da97054602aed68baae88b658ee9
+Subproject commit 907b18e901321c1ca6e6142ae4c2a4cc3ff49030
diff --git a/hit/Makefile b/hit/Makefile
new file mode 100644
index 0000000..0ee5198
--- /dev/null
+++ b/hit/Makefile
@@ -0,0 +1,33 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+
+# Paths (Modify if necessary)
+ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))
+$(info $$ROOT_DIR is [${ROOT_DIR}])
+PARENT_DIR := $(abspath $(ROOT_DIR)/..)
+$(info $$PARENT_DIR is [${PARENT_DIR}])
+CUDECOMP_DIR = $(PARENT_DIR)/cuDecomp/build
+CUDA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none
+EXTRA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none/compat
+
+export $(CUDECOMP_DIR)/lib
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=mem:managed -Minfo=accel -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include/
+LIBS = -L$(CUDECOMP_DIR)/lib/ -L$(CUDA_DIR)/lib64/ -lcudecomp_fort -lcudecomp -cudalib=cufft -lnvToolsExt
+
+# Source and Object Files
+OBJS = module.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+ $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+ $(RM) $(EXEC)
+
+.SUFFIXES: .o
+
+.f90.o:
+ $(FC) -c $(FFLAGS) $<
diff --git a/hit/Makefile_leonardo b/hit/Makefile_leonardo
new file mode 100755
index 0000000..061dbd9
--- /dev/null
+++ b/hit/Makefile_leonardo
@@ -0,0 +1,33 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+
+# Paths (Modify if necessary)
+ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))
+$(info $$ROOT_DIR is [${ROOT_DIR}])
+PARENT_DIR := $(abspath $(ROOT_DIR)/..)
+$(info $$PARENT_DIR is [${PARENT_DIR}])
+CUDECOMP_DIR = $(PARENT_DIR)/cuDecomp/build
+CUDA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none
+EXTRA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none/compat
+
+export $(CUDECOMP_DIR)/lib
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=mem:managed -Minfo=accel -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include/
+LIBS = -L$(CUDECOMP_DIR)/lib/ -L$(CUDA_DIR)/lib64/ -lcudecomp_fort -lcudecomp -cudalib=cufft -lnvToolsExt
+
+# Source and Object Files
+OBJS = module.f90 particles.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+ $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+ $(RM) $(EXEC)
+
+.SUFFIXES: .o
+
+.f90.o:
+ $(FC) -c $(FFLAGS) $<
diff --git a/hit/Makefile_local b/hit/Makefile_local
new file mode 100644
index 0000000..08156c7
--- /dev/null
+++ b/hit/Makefile_local
@@ -0,0 +1,28 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+RM = /bin/rm -f
+
+# Paths (Modify if necessary)
+CUDECOMP_DIR = /home/milton/MHIT36/cuDecomp/build/
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=mem:managed -Minfo=accel -Mfree -Mpreprocess -cpp -Minfo=accel -cuda -I$(CUDECOMP_DIR)/include
+LIBS = -L$(CUDECOMP_DIR)/lib -L/usr/local/cuda/lib64 -lcudecomp_fort -lcudecomp -cudalib=cufft -lcuda -lnvToolsExt
+
+# Source and Object Files
+OBJS = module.f90 particles.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+ $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+ $(RM) $(EXEC)
+
+.SUFFIXES: .o
+
+.f90.o:
+ $(FC) -c $(FFLAGS) $<
+
+
diff --git a/hit/Makefile_mn5 b/hit/Makefile_mn5
new file mode 100644
index 0000000..fe0febe
--- /dev/null
+++ b/hit/Makefile_mn5
@@ -0,0 +1,33 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+
+# Paths (Modify if necessary)
+ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))
+$(info $$ROOT_DIR is [${ROOT_DIR}])
+PARENT_DIR := $(abspath $(ROOT_DIR)/..)
+$(info $$PARENT_DIR is [${PARENT_DIR}])
+CUDECOMP_DIR = $(PARENT_DIR)/cuDecomp/build
+CUDA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none
+EXTRA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none/compat
+
+export $(CUDECOMP_DIR)/lib
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=managed -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include/
+LIBS = -L$(CUDECOMP_DIR)/lib/ -L$(CUDA_DIR)/lib64/ -lcudecomp_fort -lcudecomp -cudalib=cufft -lnvToolsExt
+
+# Source and Object Files
+OBJS = module.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+ $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+ $(RM) $(EXEC)
+
+.SUFFIXES: .o
+
+.f90.o:
+ $(FC) -c $(FFLAGS) $<
diff --git a/hit/binder.sh b/hit/binder.sh
new file mode 100755
index 0000000..b449dda
--- /dev/null
+++ b/hit/binder.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+case $(( ${OMPI_COMM_WORLD_LOCAL_RANK} )) in
+0) export UCX_NET_DEVICES=mlx5_0:1 ;;
+1) export UCX_NET_DEVICES=mlx5_1:1 ;;
+2) export UCX_NET_DEVICES=mlx5_2:1 ;;
+3) export UCX_NET_DEVICES=mlx5_3:1 ;;
+esac
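+
+# each local rank is pinned to a dedicated IB adapter (GPU-NIC affinity);
+# the actual command, passed as arguments, is then executed below via $*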
+
+echo Launching on $UCX_NET_DEVICES
+
+$*
+
diff --git a/hit/go_leo.sh b/hit/go_leo.sh
new file mode 100644
index 0000000..7545b12
--- /dev/null
+++ b/hit/go_leo.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#SBATCH --account="IscrB_EXCEED"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:05:00
+#SBATCH --nodes=1 ##adjust
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=8
+#SBATCH --output=test.out
+#SBATCH -p boost_usr_prod
+#SBATCH --error=test.err
+#SBATCH --qos=boost_qos_dbg
+
+#module load nvhpc/25.3
+#module load cuda/12.3
+#module load openmpi/4.1.6--nvhpc--24.3
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/aroccon0/MHIT36_cuDecomp/cuDecomp/build/lib:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/lenzenbe/RE95_256_cuDec/cuDecomp/build/lib:$LD_LIBRARY_PATH
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+chmod +x binder.sh
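+# --map-by node:PE=8 reserves 8 cores per rank; binder.sh then pins one NIC per local rank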
+mpirun -np 4 --map-by node:PE=8 --rank-by core ./binder.sh ./mhit36
\ No newline at end of file
diff --git a/hit/go_leo_bprod.sh b/hit/go_leo_bprod.sh
new file mode 100644
index 0000000..15c7b1a
--- /dev/null
+++ b/hit/go_leo_bprod.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#SBATCH --account="IscrB_SONORA"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:10:00
+#SBATCH --nodes=128 ##adjust
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=8 ## overrides the default thread limit
+#SBATCH --output=test.out
+#SBATCH --partition=boost_usr_prod
+#SBATCH --qos=boost_qos_bprod
+#SBATCH --error=test.err
+
+module load nvhpc/24.3
+module load cuda/12.3
+module load openmpi/4.1.6--nvhpc--24.3
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/aroccon0/MHIT36_cuDecomp/cuDecomp/build/lib:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/lenzenbe/RE95_256_cuDec/cuDecomp/build/lib:$LD_LIBRARY_PATH
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+#export OMP_NUM_THREADS=16
+export OMP_PROC_BIND=spread
+export OMP_PLACES=cores
+
+mpirun -n 512 ./mhit36
diff --git a/hit/go_leo_prof.sh b/hit/go_leo_prof.sh
new file mode 100755
index 0000000..ebcb52e
--- /dev/null
+++ b/hit/go_leo_prof.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#SBATCH --account="tra25_openhack"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:10:00
+#SBATCH --nodes=1 ##adjust
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --output=test.out
+#SBATCH -p boost_usr_prod
+#SBATCH --qos=boost_qos_dbg
+#SBATCH --error=test.err
+#### if mapping is on
+#SBATCH --cpus-per-task=8
+
+
+#module load nvhpc/24.3
+#module load cuda/12.3
+#module load openmpi/4.1.6--nvhpc--24.3
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/aroccon0/MHIT36_cuDecomp/cuDecomp/build/lib:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/lenzenbe/RE95_256_cuDec/cuDecomp/build/lib:$LD_LIBRARY_PATH
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+# simple run
+#mpirun -n 4 ./mhit36
+
+# profile on single node (all ranks inside one profile)
+#nsys profile -t cuda,nvtx,mpi,openacc mpirun -np 4 ./mhit36
+
+# profile (one report per rank)
+#mpirun -n 4 nsys profile --trace=cuda,nvtx,mpi -o profile_output_%q{SLURM_PROCID} --stats=true ./mhit36
+#mpirun -n 4 nsys profile -t cuda,nvtx,mpi -o report.$SLURM_LOCALID ./mhit36
+#srun -n 4 nsys profile -t cuda,nvtx,mpi --output=nsys_report_rank%t ./mhit36
+#nsys profile --multiprocess=true -t cuda,nvtx,mpi -o report ./mhit36
+
+# profile + node mapping + nic
+mpirun -np 4 --map-by node:PE=8 --rank-by core nsys profile -t cuda,nvtx,mpi,openacc --nic-metrics=true ./binder.sh ./mhit36
+
+# for nsight compute report
+#mpirun -n 4 ncu --kernel-name main_659 --set=full --import-source=yes -o profile -f --launch-skip 3 --launch-count 1 "./mhit36"
+
+# for nsight compute report - all kernels
+# mpirun -n 4 ncu --kernel-name regex:main_ --set=full --import-source=yes --launch-skip 70 --launch-count 18 -o reportall.%p ./mhit36
+
+# for nsight compute report - all kernels + mapping + nic
+# mpirun -np 4 --map-by node:PE=8 --rank-by core ncu --kernel-name regex:main_ --set=full --import-source=yes --launch-skip 70 --launch-count 18 -o reportall.%p ./binder.sh ./mhit36
+
diff --git a/hit/go_mn5.sh b/hit/go_mn5.sh
new file mode 100644
index 0000000..7855336
--- /dev/null
+++ b/hit/go_mn5.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#SBATCH --ntasks=16 # use --ntasks=8 when running on two nodes
+#SBATCH --account="ehpc244"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:05:00
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=20
+#SBATCH --output=test.out
+#SBATCH -p boost_usr_prod
+#SBATCH --qos=acc_debug
+#SBATCH --error=test.err
+
+module load nvidia-hpc-sdk/24.3
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+
+mpirun -n 8 ./mhit36
\ No newline at end of file
diff --git a/hit/input.inp b/hit/input.inp
new file mode 100644
index 0000000..845aab6
--- /dev/null
+++ b/hit/input.inp
@@ -0,0 +1,18 @@
+0 ! fresh start=0, restart=1
+0 ! iteration to start from (0 for a fresh start, otherwise the restart iteration)
+10000 ! Final time step
+100000 ! Saving frequency
+0 ! inflow condition (0=Taylor-Green, 1=read from fields)
+0 ! phase-field initial condition (0=spherical drop, 1=read from fields)
+0.0002 ! dt for the simulation
+0.006 ! mu (viscosity)
+1.000 ! density
+1.0 ! A forcing parameter (ABC forcing)
+1.0 ! B forcing parameter (ABC forcing)
+1.0 ! C forcing parameter (ABC forcing)
+2.0 ! Forced wavenumber (ABC forcing)
+0.5 ! Initial drop radius for phase-field
+1.0d0 ! Surface tension
+1.0 ! Ratio eps/dx
+10000 ! Number of particles
+1 ! Particle initial condition (0 from file, 1 random, 2 random in drops)
diff --git a/hit/leo.sh b/hit/leo.sh
new file mode 100755
index 0000000..364635a
--- /dev/null
+++ b/hit/leo.sh
@@ -0,0 +1,12 @@
+# old modules - if you use these, change -gpu=mem:managed to -gpu=managed in the Makefile
+#module load nvhpc/24.3
+#module load cuda/12.3
+#module load openmpi/4.1.6--nvhpc--24.3
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+cp Makefile_leonardo Makefile
+make clean
+make
+mkdir -p output
+# mpirun -np 4 ./mhit36
diff --git a/hit/local.sh b/hit/local.sh
new file mode 100644
index 0000000..4f27984
--- /dev/null
+++ b/hit/local.sh
@@ -0,0 +1,15 @@
+NVARCH=Linux_x86_64; export NVARCH
+NVCOMPILERS=/opt/nvidia/hpc_sdk; export NVCOMPILERS
+MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/compilers/man; export MANPATH
+PATH=$NVCOMPILERS/$NVARCH/25.7/compilers/bin:$PATH; export PATH
+export PATH=$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/bin:$PATH
+export MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/man
+export LD_LIBRARY_PATH=/home/milton/MHIT36/cuDecomp/build/lib
+# clean the output folder
+rm -rf output
+mkdir output
+cp Makefile_local Makefile
+#rm *.dat
+make clean
+make
+mpirun -np 2 ./mhit36
diff --git a/hit/main.f90 b/hit/main.f90
new file mode 100644
index 0000000..beb351f
--- /dev/null
+++ b/hit/main.f90
@@ -0,0 +1,1495 @@
+#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1)
+
+program main
+use cudafor
+use cudecomp
+use cufft
+use mpi
+use velocity
+use phase
+use particles
+use param
+use mpivar
+use cudecompvar
+
+use, intrinsic :: ieee_arithmetic
+
+implicit none
+! timer for scaling test
+real :: t_start, t_end, elapsed
+! grid dimensions
+integer :: comm_backend
+integer :: pr, pc
+! cudecomp
+! cuFFT
+integer :: planXf, planXb
+integer :: planY, planZ
+integer :: batchsize
+integer :: status
+integer :: i,j,k,il,jl,kl,ig,jg,kg,t
+integer :: im,ip,jm,jp,km,kp,last
+integer :: inY,enY,inZ,enZ
+!beginDEBUG
+integer, parameter :: Mx = 1, My = 2, Mz = 1
+!endDEBUG
+real(8), device, allocatable :: kx_d(:)
+! working arrays
+complex(8), allocatable :: psi(:)
+real(8), allocatable :: ua(:,:,:)
+real(8), allocatable :: uaa(:,:,:)
+
+real(8), allocatable :: psi_real(:)
+! real(8), device, allocatable :: psi_real_d(:)
+complex(8), device, allocatable :: psi_d(:)
+complex(8), pointer, device, contiguous :: work_d(:), work_halo_d(:), work_d_d2z(:), work_halo_d_d2z(:)
+character(len=40) :: namefile
+character(len=4) :: itcount
+! Code variables
+
+real(8)::err,maxErr
+
+complex(8), device, pointer :: phi3d(:,:,:)
+real(8) :: k2
+!integer :: il, jl, ig, jg
+integer :: offsets(3), xoff, yoff
+integer :: np(3)
+
+! Enable or disable the phase field (enabled by default)
+#define phiflag 1
+! Enable or disable particle Lagrangian tracking (tracers)
+#define partflag 0
+
+!########################################################################################################################################
+! 1. INITIALIZATION OF MPI AND cuDECOMP AUTOTUNING : START
+!########################################################################################################################################
+! MPI initialization: rank holds the local MPI rank, ranks the total number of processes
+! Same procedure as described in the cuDecomp documentation
+call mpi_init(ierr)
+call mpi_comm_rank(MPI_COMM_WORLD, rank, ierr)
+call mpi_comm_size(MPI_COMM_WORLD, ranks, ierr)
+
+call mpi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, localComm, ierr)
+call mpi_comm_rank(localComm, localRank, ierr)
+ierr = cudaSetDevice(localRank) !assign GPU to MPI rank
+
+! Define grid and decomposition
+call readinput
+
+! number of processes along each direction (r means y and c means z)
+pr = 0
+pc = 0
+halo_ext=1
+! comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P
+comm_backend = 0 ! Enable full autotuning
+
+CHECK_CUDECOMP_EXIT(cudecompInit(handle, MPI_COMM_WORLD))
+
+! config is a struct and pr and pc are the number of pencils along the two directions
+! gdims is the global grid
+! create an uninitialized configuration struct and initialize it to defaults using cudecompGridDescConfigSetDefaults.
+! Initializing to default values is required to ensure no entries are left uninitialized.
+CHECK_CUDECOMP_EXIT(cudecompGridDescConfigSetDefaults(config))
+pdims = [pr, pc] ! pr and pc are the number of pencils along the two decomposed directions
+config%pdims = pdims
+! gdims = [nx, ny, nz]
+! config%gdims = gdims
+halo = [0, halo_ext, halo_ext] ! no halo needed along x: it is periodic and x-pencils are used in physical space
+! for transpositions
+config%transpose_comm_backend = comm_backend
+config%transpose_axis_contiguous = .true.
+! for halo exchanges
+config%halo_comm_backend = CUDECOMP_HALO_COMM_MPI
+! Setting for periodic halos in all directions (not required to be in config)
+halo_periods = [.true., .true., .true.]
+
+! create spectral grid descriptor first to select pdims for optimal transposes
+gdims = [nx/2+1, ny, nz]
+config%gdims = gdims
+
+! Set up autotuning options for spectral grid (transpose related settings)
+CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
+options%dtype = CUDECOMP_DOUBLE_COMPLEX
+if (comm_backend == 0) then
+ options%autotune_transpose_backend = .true.
+ options%autotune_halo_backend = .false.
+endif
+options%transpose_use_inplace_buffers = .true.
+options%transpose_input_halo_extents(:, 1) = halo
+options%transpose_output_halo_extents(:, 4) = halo
+
+CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_descD2Z, config, options))
+
+! create physical grid descriptor
+! take previous config and modify the global grid (nx instead of nx/2+1)
+! reset transpose_comm_backend to default value to avoid picking up possible nvshmem
+! transpose backend selection (this impacts how workspaces are allocated)
+gdims = [nx, ny, nz]
+config%gdims = gdims
+config%transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P
+
+! Set up autotuning options for physical grid (halo related settings)
+CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
+options%dtype = CUDECOMP_DOUBLE_COMPLEX
+if (comm_backend == 0) then
+ options%autotune_halo_backend = .true.
+endif
+options%halo_extents(:) = halo
+options%halo_periods(:) = halo_periods
+options%halo_axis = 1
+CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_desc, config, options))
+
+!Print information on configuration
+! issue with NVHPC > 25.X, simply avoid this call (not critical)
+!if (rank == 0) then
+! write(*,"(' Running on ', i0, ' x ', i0, ' process grid ...')") config%pdims(1), config%pdims(2)
+! write(*,"(' Using ', a, ' transpose backend ...')") &
+! cudecompTransposeCommBackendToString(config%transpose_comm_backend)
+! write(*,"(' Using ', a, ' halo backend ...')") &
+! cudecompHaloCommBackendToString(config%halo_comm_backend)
+!endif
+
+! Get pencil info for the grid descriptor in the physical space
+! This function returns a pencil struct (piX, piY or piZ) that contains the shape, global lower and upper index bounds (lo and hi),
+! size of the pencil, and an order array to indicate the memory layout that will be used (to handle permuted, axis-contiguous layouts).
+! Additionally, there is a halo_extents data member that indicates the depth of halos for the pencil, by axis.
+! If no halo regions are necessary, a NULL pointer can be provided in place of this array (or omitted)
+! Pencil info in x-configuration present in PiX (shape,lo,hi,halo_extents,size)
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piX, 1, halo))
+nElemX = piX%size !<- total number of elements in the x-configuration (including halos)
+! Pencil info in Y-configuration present in PiY
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piY, 2))
+nElemY = piY%size
+! Pencil info in Z-configuration present in PiZ
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piZ, 3))
+nElemZ = piZ%size
+
+! Get workspace sizes for transpose (1st row, not used) and halo (2nd row, used)
+CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_desc, nElemWork))
+CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_desc, 1, halo, nElemWork_halo))
+
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piX_d2z, 1,halo))
+nElemX_d2z = piX_d2z%size !<- total number of elements in the x-configuration (including halos)
+! Pencil info in Y-configuration present in PiY
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piY_d2z, 2))
+nElemY_d2z = piY_d2z%size
+! Pencil info in Z-configuration present in PiZ
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piZ_d2z, 3))
+nElemZ_d2z = piZ_d2z%size
+! Get workspace sizes for transpose (1st row, used) and halo (2nd row, not used)
+CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_descD2Z, nElemWork_d2z))
+CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_descD2Z, 1, halo, nElemWork_halo_d2z))
+
+! Get the global rank of neighboring processes in PiX config (required only for particles)
+CHECK_CUDECOMP_EXIT(cudecompGetShiftedRank(handle, grid_desc, 1, 2, 1 , .true. , nidp1y))
+CHECK_CUDECOMP_EXIT(cudecompGetShiftedRank(handle, grid_desc, 1, 2, -1 , .true. , nidm1y))
+CHECK_CUDECOMP_EXIT(cudecompGetShiftedRank(handle, grid_desc, 1, 3, 1 , .true. , nidp1z))
+CHECK_CUDECOMP_EXIT(cudecompGetShiftedRank(handle, grid_desc, 1, 3, -1 , .true. , nidm1z))
+
+
+! CUFFT initialization -- Create Plans
+! Forward 1D FFT in X: D2Z
+batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFTs (from the x-pencil dimensions)
+status = cufftPlan1D(planXf, nx, CUFFT_D2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan Forward'
+
+! Backward 1D FFT in X: Z2D
+batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFTs (from the x-pencil dimensions)
+status = cufftPlan1D(planXb, nx, CUFFT_Z2D, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan Backward'
+
+! always shape(2) and shape(3) because y-pencils are stored with coordinates (y,z,x)
+batchSize = piY_d2z%shape(2)*piY_d2z%shape(3)
+status = cufftPlan1D(planY, ny, CUFFT_Z2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Y plan Forward & Backward'
+
+! always shape(2) and shape(3) because z-pencils are stored with coordinates (z,y,x)
+batchSize = piZ_d2z%shape(2)*piZ_d2z%shape(3)
+status = cufftPlan1D(planZ, nz, CUFFT_Z2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Z plan Forward & Backward'
+
+
+
+! define grid
+allocate(x(nx),x_ext(nx+1))
+x(1)= 0
+do i = 2, nx
+ x(i) = x(i-1) + dx
+enddo
+
+x_ext(1:nx) = x(1:nx)
+x_ext(nx+1) = lx
+
+! Offsets in the X-pencil decomposition
+pix_yoff = pix%lo(2)-1
+pix_zoff = pix%lo(3)-1
+
+! Boundaries of each PiX pencil
+yinf = x_ext( pix%lo(2) )
+ysup = x_ext( pix%hi(2) + 1 )
+zinf = x_ext( pix%lo(3) )
+zsup = x_ext( pix%hi(3) + 1 )
+
+! Physical size of each PiX pencil
+lyloc = ysup - yinf
+lzloc = zsup - zinf
+
+! define wavenumbers
+allocate(kx(nx))
+do i = 1, nx/2
+ kx(i) = (i-1)*(twoPi/lx)
+enddo
+do i = nx/2+1, nx
+ kx(i) = (i-1-nx)*(twoPi/LX)
+enddo
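+! kx follows the standard FFT ordering: 0, 1, ..., nx/2-1, -nx/2, ..., -1 (in units of 2*pi/lx)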
+! allocate kx_d on the device (later on: remove and use OpenACC + managed memory?)
+allocate(kx_d, source=kx)
+
+allocate(mysin(nx), mycos(nx))
+do i=1,nx
+  ! precompute sin and cos (used by the ABC forcing) to avoid repeated evaluations
+  mysin(i)=sin(k0*x(i)+dx/2)
+  mycos(i)=cos(k0*x(i)+dx/2)
+enddo
+
+! Initial distribution of particles among the processes
+nploc = npart/ranks
+nplocmax = nploc*2
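+! nplocmax gives 2x headroom so that particles migrating in from neighbouring ranks fit without reallocation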
+
+!########################################################################################################################################
+! 1. INITIALIZATION AND cuDECOMP AUTOTUNING : END
+!########################################################################################################################################
+
+
+
+!########################################################################################################################################
+! START STEP 2: ALLOCATE ARRAYS
+!########################################################################################################################################
+
+! allocate arrays
+allocate(psi(max(nElemX, nElemY, nElemZ))) ! largest among the pencils
+allocate(psi_real(max(nElemX, nElemY, nElemZ))) ! largest among the pencils
+allocate(psi_d(max(nElemX_d2z, nElemY_d2z, nElemZ_d2z))) ! psi on the device
+allocate(ua(nx, piX%shape(2), piX%shape(3)))
+
+! Pressure variable
+allocate(rhsp(piX%shape(1), piX%shape(2), piX%shape(3)))
+allocate(p(piX%shape(1), piX%shape(2), piX%shape(3)))
+
+!allocate variables
+!NS variables
+allocate(u(piX%shape(1),piX%shape(2),piX%shape(3)),v(piX%shape(1),piX%shape(2),piX%shape(3)),w(piX%shape(1),piX%shape(2),piX%shape(3))) !velocity vector
+! allocate(ustar(piX%shape(1),piX%shape(2),piX%shape(3)),vstar(piX%shape(1),piX%shape(2),piX%shape(3)),wstar(piX%shape(1),piX%shape(2),piX%shape(3))) ! provisional velocity field
+allocate(rhsu(piX%shape(1),piX%shape(2),piX%shape(3)),rhsv(piX%shape(1),piX%shape(2),piX%shape(3)),rhsw(piX%shape(1),piX%shape(2),piX%shape(3))) ! right hand side u,v,w
+allocate(rhsu_o(piX%shape(1),piX%shape(2),piX%shape(3)),rhsv_o(piX%shape(1),piX%shape(2),piX%shape(3)),rhsw_o(piX%shape(1),piX%shape(2),piX%shape(3))) ! right hand side u,v,w
+allocate(div(piX%shape(1),piX%shape(2),piX%shape(3)))
+!PFM variables
+#if phiflag == 1
+ allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(normx_f(piX%shape(1),piX%shape(2),piX%shape(3)),normy_f(piX%shape(1),piX%shape(2),piX%shape(3)),normz_f(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces
+ allocate(phi_tmp(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik2(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik3(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik4(piX%shape(1),piX%shape(2),piX%shape(3)))
+#endif
+
+! allocate arrays for transpositions and halo exchanges
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_d, nElemWork))
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_halo_d, nElemWork_halo))
+! allocate arrays for transpositions
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_descD2Z, work_d_d2z, nElemWork_d2z))
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_descD2Z, work_halo_d_d2z, nElemWork_halo_d2z))
+
+#if partflag == 1
+! Particle variables
+allocate(part(1:nplocmax,1:ninfop))
+allocate(partbuff(1:nplocmax,1:ninfop))
+allocate(vec_p(1:nplocmax))
+allocate(order_p(1:nplocmax))
+allocate(buffvar1(1:ninfop,1:nploc))
+allocate(buffvar2(1:ninfop,1:nploc))
+#endif
+!########################################################################################################################################
+! END STEP2: ALLOCATE ARRAYS
+!########################################################################################################################################
+
+
+
+!########################################################################################################################################
+! START STEP 3: FLOW, PHASE FIELD AND PARTICLES INIT
+!########################################################################################################################################
+! 3.1 Read/initialize the data without halo grid points (avoids out-of-bounds accesses when reading using MPI I/O)
+! 3.2 Call halo exchanges along Y and Z for u,v,w and phi
+if (restart .eq. 0) then !fresh start Taylor Green or read from file in init folder
+ if (rank.eq.0) write(*,*) "Initialize velocity field (fresh start)"
+ if (inflow .eq. 0) then
+ if (rank.eq.0) write(*,*) "Initialize Taylor-green"
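+      ! Taylor-Green vortex: u = sin(x)cos(y)cos(z), v = -cos(x)sin(y)cos(z), w = 0 (the dx/2 shifts place u and v on the staggered faces)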
+ do k = 1+halo_ext, piX%shape(3)-halo_ext
+ kg = piX%lo(3) + k - 1
+ do j = 1+halo_ext, piX%shape(2)-halo_ext
+ jg = piX%lo(2) + j - 1
+ do i = 1, piX%shape(1)
+ u(i,j,k) = sin(x(i)-dx/2)*cos(x(jg))*cos(x(kg))
+ v(i,j,k) = -cos(x(i))*sin(x(jg)-dx/2)*cos(x(kg))
+ w(i,j,k) = 0.d0
+ enddo
+ enddo
+ enddo
+ endif
+ if (inflow .eq. 1) then
+ if (rank.eq.0) write(*,*) "Initialize from data"
+ call readfield(1)
+ call readfield(2)
+ call readfield(3)
+ endif
+endif
+if (restart .eq. 1) then !restart, ignore inflow and read the tstart field
+ if (rank.eq.0) write(*,*) "Initialize velocity field (from output folder), iteration:", tstart
+ call readfield_restart(tstart,1)
+ call readfield_restart(tstart,2)
+ call readfield_restart(tstart,3)
+endif
+
+! update halo cells along y and z directions (strictly needed only when pr and pc are greater than one)
+!$acc host_data use_device(u)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+!$acc host_data use_device(v)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+!$acc host_data use_device(w)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+
+! initialize phase-field
+#if phiflag == 1
+ if (restart .eq. 0) then
+ if (rank.eq.0) write(*,*) 'Initialize phase field (fresh start)'
+ if (inphi .eq. 0) then
+ if (rank.eq.0) write(*,*) 'Spherical drop'
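+        ! initial profile: phi = 0.5*(1 - tanh((r - radius)/(2*eps))) -> phi=1 inside the drop, phi=0 outside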
+ do k = 1+halo_ext, piX%shape(3)-halo_ext
+ kg = piX%lo(3) + k - 1
+ do j = 1+halo_ext, piX%shape(2)-halo_ext
+ jg = piX%lo(2) + j - 1
+ do i = 1, piX%shape(1)
+ pos=(x(i)-lx/2)**2d0 + (x(jg)-lx/2)**2d0 + (x(kg)-lx/2)**2d0
+ phi(i,j,k) = 0.5d0*(1.d0-tanh((sqrt(pos)-radius)/2/eps))
+ enddo
+ enddo
+ enddo
+ endif
+ if (inphi .eq. 1) then
+ if (rank.eq.0) write(*,*) "Initialize phase-field from data"
+ call readfield(5)
+ endif
+ endif
+ if (restart .eq. 1) then
+    if (rank.eq.0) write(*,*) "Initialize phase-field (restart, from output folder), iteration:", tstart
+ call readfield_restart(tstart,5)
+ endif
+ ! update halo
+ !$acc host_data use_device(phi)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+#endif
+
+#if partflag == 1
+ if (restart .eq. 0) then
+ if (rank.eq.0) write(*,*) 'Initialize particles (fresh start)'
+ if (inpart .eq. 1) then
+ if (rank.eq.0) write(*,*) 'Random Position whole Domain'
+ call particlegenerator(inpart)
+ endif
+ if (inpart .eq. 2) then
+ if (rank.eq.0) write(*,*) 'Random Position in Drops'
+ call particlegenerator(inpart)
+ endif
+ endif
+ ! if (restart .eq. 1) then
+ ! write(*,*) "Initialize phase-field (restart, from output folder), iteration:", tstart
+ ! call readfield_restart(tstart,5)
+ ! endif
+#endif
+
+!Save initial fields (only for a fresh start)
+if (restart .eq. 0) then
+ if (rank.eq.0) write(*,*) "Save initial fields"
+ call writefield(tstart,1)
+ call writefield(tstart,2)
+ call writefield(tstart,3)
+ call writefield(tstart,4)
+ #if phiflag == 1
+ call writefield(tstart,5)
+ #endif
+endif
+
+!########################################################################################################################################
+! END STEP 3: FLOW, PHASE FIELD AND PARTICLES INIT
+!########################################################################################################################################
+
+
+
+!########################################################################################################################################
+! START TEMPORAL LOOP: STEP 4 to 9 REPEATED AT EVERY TIME STEP
+!########################################################################################################################################
+
+! The first step uses explicit Euler (alpha=1, beta=0)
+alpha=1.0d0
+beta=0.0d0
+gumax=8.0d0
+tstart=tstart+1
+gamma=1.d0*gumax ! initial gamma for phase-field
+!$acc data copyin(piX)
+!$acc data create(rhsu_o, rhsv_o, rhsw_o)
+!$acc data copyin(mysin, mycos)
+#if partflag == 1
+!$acc data copy(part,partbuff)
+!$acc data create(vec_p, order_p)
+!$acc data create(buffvar1,buffvar2)
+#endif
+call cpu_time(t_start)
+
+! Start temporal loop
+do t=tstart,tfin
+ ! Create custom label for each marker (profiling)
+ write(itcount,'(i4)') t
+ ! Range with custom color (uncomment for profiling)
+ ! call nvtxStartRange("Iteration "//itcount,t)
+
+ if (rank.eq.0) write(*,*) "Time step",t,"of",tfin
+ call cpu_time(times)
+
+ !########################################################################################################################################
+ ! START STEP 4: PARTICLES (TRACERS)
+ !########################################################################################################################################
+ #if partflag == 1
+ ! Operations:
+    ! 4.1 Perform interpolation (consider switching to trilinear: more accurate, but considerably more expensive)
+ ! 4.2 Integrate with Adams-Bashforth
+ ! 4.3 Order and transfer in y
+ ! 4.4 Order and transfer in z
+ ! 4.5 Check Leakage of Particles
+
+ call LinearInterpolation()
+ ! Particle Tracker Integration
+    ! Two-step Adams-Bashforth (Euler for the first step)
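+    ! AB2 update: x^{n+1} = x^n + dt*(alpha*u^n - beta*u^{n-1}) with alpha=3/2, beta=1/2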
+ !$acc parallel loop collapse(2) default(present)
+ do j = 0, 2
+ do i = 1, nploc
+ part(i,Ixp+j)=part(i,Ixp+j)+&
+ dt*(alpha*part(i,Iup+j)-beta*part(i,Iup1+j))
+ enddo
+ enddo
+
+ ! Transfer in y
+ call SortPartY()
+ call CountPartTransfY()
+ call SendPartUP(2)
+ call SendPartDOWN(2)
+
+ ! Transfer in z
+ call SortPartZ()
+ call CountPartTransfZ()
+ call SendPartUP(3)
+ call SendPartDOWN(3)
+
+    ! Check for particles escaping the domain (leakage)
+ call ParticlesLeakage()
+ ! Shift data for next step
+ !$acc parallel loop collapse(2) default(present)
+ do j = 0, 2
+ do i = 1, nploc
+ part(i,Iup1+j)=part(i,Iup+j)
+ enddo
+ enddo
+ write(*,*) 'rank',rank, 'nploc',nploc
+ #endif
+ !########################################################################################################################################
+ ! END STEP 4: PARTICLES
+ !########################################################################################################################################
+
+
+ ! (uncomment for profiling)
+ ! call nvtxStartRange("Phase-field")
+ !########################################################################################################################################
+ ! START STEP 5: PHASE-FIELD SOLVER (EXPLICIT)
+ !########################################################################################################################################
+ #if phiflag == 1
+ ! 4.2 Get phi at n+1 using RK4
+ ! first stage of RK4 - saved in rhsphi
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+        val = min(phi(i,j,k),1.0d0) ! clip phi at 1: machine-precision overshoots lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
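+        ! psidi inverts the equilibrium profile phi = 0.5*(1 + tanh(psi/(2*eps))); enum regularizes the log at phi = 0 and 1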
+        ! precompute tanh of the distance function (used in the sharpening term) to avoid repeated tanh evaluations
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+    gamma=1.d0*gumax ! set gamma from the velocity scale gumax (nominally the maximum velocity magnitude)
+ if (rank.eq.0) write(*,*) "gamma:", gamma
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! convective (first six lines) and diffusive (last three lines)
+ ! flux-splitting
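+        ! first-order upwind flux at each face, e.g. along x: F_{i+1/2} = max(u,0)*phi_i + min(u,0)*phi_{i+1}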
+ rhsphi(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi(i,j,k) + min(u(ip,j,k),0.0d0)*phi(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi(i,j,k) - max(u(i,j,k),0.0d0)*phi(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi(i,j,k) + min(v(i,jp,k),0.0d0)*phi(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi(i,j,k) - max(v(i,j,k),0.0d0)*phi(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi(i,j,k) + min(w(i,j,kp),0.0d0)*phi(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi(i,j,k) - max(w(i,j,k),0.0d0)*phi(i,j,km))*dxi &
+ + gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + &
+ eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddxi + &
+ eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
+
+    ! Update normx, normy and normz halos, required before computing the normal derivatives
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+    ! Substep 2: Compute unit normals (enum = 1.e-16 is a numerical tolerance to avoid 0/0) used later for the surface tension force
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ normag = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + enum)
+ normx_f(i,j,k) = normx(i,j,k)*normag
+ normy_f(i,j,k) = normy(i,j,k)*normag
+ normz_f(i,j,k) = normz(i,j,k)*normag
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+    ! Compute the sharpening term with flux splitting, as in CaNS-Fizzy
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
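+        ! per-face sharpening flux: (gamma/4)*(1 - tanh^2(psi_face/(2*eps)))*n_face, with n_face the renormalized face normal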
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphi(i,j,k)=rhsphi(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! second stage of RK4 - saved in rhsphik2
+ !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphi)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ phi_tmp(i,j,k) = phi(i,j,k) + 0.5d0 * dt * rhsphi(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end parallel loop
+ ! 4.3 Call halo exchanges along Y and Z for phi
+ !$acc host_data use_device(phi_tmp)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+        val = min(phi_tmp(i,j,k),1.0d0) ! clip phi at 1: machine-precision overshoots lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+        ! precompute tanh of the distance function (used in the sharpening term) to avoid repeated tanh evaluations
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+        ! convective (first six lines) and diffusive (last three lines)
+ ! flux-splitting
+ rhsphik2(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+ + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+ eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+ eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
+
+    ! Update normx, normy and normz halos, required before computing the normal derivatives
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Compute sharpening term
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphik2(i,j,k)=rhsphik2(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! third stage of RK4 - saved in rhsphik3
+ !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphik2)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ phi_tmp(i,j,k) = phi(i,j,k) + 0.5d0 * dt * rhsphik2(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end parallel loop
+ ! 4.3 Call halo exchanges along Y and Z for phi
+ !$acc host_data use_device(phi_tmp)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+        val = min(phi_tmp(i,j,k),1.0d0) ! clip phi at 1: machine-precision overshoots lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+        ! precompute tanh of the distance function (used in the sharpening term) to avoid repeated tanh evaluations
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+        ! convective (first six lines) and diffusive (last three lines)
+ ! flux-splitting
+ rhsphik3(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+ + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+ eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+ eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
+
+    ! Update normx, normy and normz halos, required before computing the normal derivatives
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Compute sharpening term
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphik3(i,j,k)=rhsphik3(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+    ! fourth stage of RK4 - saved in rhsphik4
+ !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphik3)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ phi_tmp(i,j,k) = phi(i,j,k) + dt * rhsphik3(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end parallel loop
+ ! 4.3 Call halo exchanges along Y and Z for phi
+ !$acc host_data use_device(phi_tmp)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+        val = min(phi_tmp(i,j,k),1.0d0) ! clip phi at 1: machine-precision overshoots lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+        ! precompute tanh of the distance function (used in the sharpening term) to avoid repeated tanh evaluations
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+        ! convective (first six lines) and diffusive (last three lines)
+ ! flux-splitting
+ rhsphik4(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+ + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+ eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+ eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
+
+    ! Update normx, normy and normz halos, required before computing the normal derivatives
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Compute sharpening term
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphik4(i,j,k)=rhsphik4(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Update phi with RK4
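+    ! classical RK4 combination: phi^{n+1} = phi^n + (dt/6)*(k1 + 2*k2 + 2*k3 + k4)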
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ phi(i,j,k) = phi(i,j,k) + dt/6.0d0*(rhsphi(i,j,k) + 2.0d0*rhsphik2(i,j,k) + &
+ 2.0d0*rhsphik3(i,j,k) + rhsphik4(i,j,k))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! 4.3 Call halo exchanges along Y and Z for phi
+ !$acc host_data use_device(phi)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ #endif
+
+ ! (uncomment for profiling)
+ ! call nvtxEndRange
+
+ !########################################################################################################################################
+ ! END STEP 5: PHASE-FIELD SOLVER
+ !########################################################################################################################################
+
+
+ !########################################################################################################################################
+ ! START STEP 6: USTAR COMPUTATION (PROJECTION STEP)
+ !########################################################################################################################################
+ ! 6.1 compute rhs
+ ! 6.2 obtain ustar and store old rhs in rhs_o
+    ! 6.3 Call halo exchanges along Y and Z for u,v,w
+
+ ! (uncomment for profiling)
+ ! call nvtxStartRange("Projection")
+    ! 6.1a Convective and diffusive terms of NS
+ ! Loop on inner nodes
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+        ! Manual periodicity only along x (x-pencil); along y and z the halos are used
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! compute the products (conservative form)
+ h11 = (u(ip,j,k)+u(i,j,k))*(u(ip,j,k)+u(i,j,k)) - (u(i,j,k)+u(im,j,k))*(u(i,j,k)+u(im,j,k))
+ h12 = (u(i,jp,k)+u(i,j,k))*(v(i,jp,k)+v(im,jp,k)) - (u(i,j,k)+u(i,jm,k))*(v(i,j,k)+v(im,j,k))
+ h13 = (u(i,j,kp)+u(i,j,k))*(w(i,j,kp)+w(im,j,kp)) - (u(i,j,k)+u(i,j,km))*(w(i,j,k)+w(im,j,k))
+ h21 = (u(ip,j,k)+u(ip,jm,k))*(v(ip,j,k)+v(i,j,k)) - (u(i,j,k)+u(i,jm,k))*(v(i,j,k)+v(im,j,k))
+ h22 = (v(i,jp,k)+v(i,j,k))*(v(i,jp,k)+v(i,j,k)) - (v(i,j,k)+v(i,jm,k))*(v(i,j,k)+v(i,jm,k))
+ h23 = (w(i,j,kp)+w(i,jm,kp))*(v(i,j,kp)+v(i,j,k)) - (w(i,j,k)+w(i,jm,k))*(v(i,j,k)+v(i,j,km))
+ h31 = (w(ip,j,k)+w(i,j,k))*(u(ip,j,k)+u(ip,j,km)) - (w(i,j,k)+w(im,j,k))*(u(i,j,k)+u(i,j,km))
+ h32 = (v(i,jp,k)+v(i,jp,km))*(w(i,jp,k)+w(i,j,k)) - (v(i,j,k)+v(i,j,km))*(w(i,j,k)+w(i,jm,k))
+ h33 = (w(i,j,kp)+w(i,j,k))*(w(i,j,kp)+w(i,j,k)) - (w(i,j,k)+w(i,j,km))*(w(i,j,k)+w(i,j,km))
+ ! compute the derivative
+ h11=h11*0.25d0*dxi
+ h12=h12*0.25d0*dxi
+ h13=h13*0.25d0*dxi
+ h21=h21*0.25d0*dxi
+ h22=h22*0.25d0*dxi
+ h23=h23*0.25d0*dxi
+ h31=h31*0.25d0*dxi
+ h32=h32*0.25d0*dxi
+ h33=h33*0.25d0*dxi
+ ! add to the rhs
+ rhsu(i,j,k)=-(h11+h12+h13)
+ rhsv(i,j,k)=-(h21+h22+h23)
+ rhsw(i,j,k)=-(h31+h32+h33)
+        ! viscous term
+ h11 = mu*(u(ip,j,k)-2.d0*u(i,j,k)+u(im,j,k))*ddxi
+ h12 = mu*(u(i,jp,k)-2.d0*u(i,j,k)+u(i,jm,k))*ddxi
+ h13 = mu*(u(i,j,kp)-2.d0*u(i,j,k)+u(i,j,km))*ddxi
+ h21 = mu*(v(ip,j,k)-2.d0*v(i,j,k)+v(im,j,k))*ddxi
+ h22 = mu*(v(i,jp,k)-2.d0*v(i,j,k)+v(i,jm,k))*ddxi
+ h23 = mu*(v(i,j,kp)-2.d0*v(i,j,k)+v(i,j,km))*ddxi
+ h31 = mu*(w(ip,j,k)-2.d0*w(i,j,k)+w(im,j,k))*ddxi
+ h32 = mu*(w(i,jp,k)-2.d0*w(i,j,k)+w(i,jm,k))*ddxi
+ h33 = mu*(w(i,j,kp)-2.d0*w(i,j,k)+w(i,j,km))*ddxi
+ rhsu(i,j,k)=rhsu(i,j,k)+(h11+h12+h13)*rhoi
+ rhsv(i,j,k)=rhsv(i,j,k)+(h21+h22+h23)*rhoi
+ rhsw(i,j,k)=rhsw(i,j,k)+(h31+h32+h33)*rhoi
+ ! NS forcing
+ kg = piX%lo(3) + k - 1
+ jg = piX%lo(2) + j - 1
+ ! ABC forcing
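+        ! ABC body force: fx = C*sin(k0*z) + B*cos(k0*y), fy = A*sin(k0*x) + C*cos(k0*z), fz = B*sin(k0*y) + A*cos(k0*x), with (f1,f2,f3) = (A,B,C) from the input file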
+ rhsu(i,j,k)= rhsu(i,j,k) + f3*mysin(kg)+f2*mycos(jg)
+ rhsv(i,j,k)= rhsv(i,j,k) + f1*mysin(i)+f3*mycos(kg)
+ rhsw(i,j,k)= rhsw(i,j,k) + f2*mysin(jg)+f1*mycos(i)
+ enddo
+ enddo
+ enddo
+
+ ! Surface tension forces
+ #if phiflag == 1
+ !$acc kernels
+ !Obtain surface tension forces evaluated at the center of the cell (where phi is located)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! continuum-surface force implementation
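+        ! CSF model: F = -sigma*kappa*grad(phi), with curvature kappa = div(n) computed from the filtered unit normals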
+ curv=0.5d0*(normx_f(ip,j,k)-normx_f(im,j,k))*dxi + 0.5d0*(normy_f(i,jp,k)-normy_f(i,jm,k))*dxi + 0.5d0*(normz_f(i,j,kp)-normz_f(i,j,km))*dxi
+ !compute capillary forces: sigma*curvature*gradphi
+ fxst(i,j,k)= -sigma*curv*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi
+ fyst(i,j,k)= -sigma*curv*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dxi
+ fzst(i,j,k)= -sigma*curv*0.5d0*(phi(i,j,kp)-phi(i,j,km))*dxi
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Update halo of fxst, fyst and fzst (required then to interpolate at velocity points)
+ !$acc host_data use_device(fxst)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(fyst)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(fzst)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Interpolate force at velocity points
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (im .lt. 1) im=nx
+ rhsu(i,j,k)=rhsu(i,j,k) + 0.5d0*(fxst(im,j,k)+fxst(i,j,k))*rhoi
+ rhsv(i,j,k)=rhsv(i,j,k) + 0.5d0*(fyst(i,jm,k)+fyst(i,j,k))*rhoi
+ rhsw(i,j,k)=rhsw(i,j,k) + 0.5d0*(fzst(i,j,km)+fzst(i,j,k))*rhoi
+ u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k))
+ v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k))
+ w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k))
+ rhsu_o(i,j,k)=rhsu(i,j,k)
+ rhsv_o(i,j,k)=rhsv(i,j,k)
+ rhsw_o(i,j,k)=rhsw(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ #else
+ ! 6.2 find u, v and w star only in the inner nodes
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k))
+ v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k))
+ w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k))
+ rhsu_o(i,j,k)=rhsu(i,j,k)
+ rhsv_o(i,j,k)=rhsv(i,j,k)
+ rhsw_o(i,j,k)=rhsw(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ #endif
+
+ ! rhs* has already been stored in rhs*_o above; set the AB2 coefficients
+ ! (the first step uses explicit Euler, all subsequent steps use AB2)
+ alpha=1.5d0
+ beta= 0.5d0
+
+ ! 6.3 update halos (y and z directions), required to then compute the RHS of the Poisson equation because of the staggered grid
+ !$acc host_data use_device(u)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(v)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(w)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ ! (uncomment for profiling)
+ ! call nvtxEndRange
+ !########################################################################################################################################
+ ! END STEP 6: USTAR COMPUTATION
+ !########################################################################################################################################
+
+
+
+ !########################################################################################################################################
+ ! START STEP 7: POISSON SOLVER FOR PRESSURE
+ !########################################################################################################################################
+ ! initialize rhs and analytical solution
+ ! 7.1 Compute rhs of the Poisson equation, div(u*), evaluated as the divergence at the cell center
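+ ! i.e. solve laplacian(p) = (rho/dt)*div(u*) so that the corrected velocity field is divergence-free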
+ ! (uncomment for profiling)
+ ! call nvtxStartRange("Poisson")
+ ! call nvtxStartRange("compute RHS")
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ if (ip > nx) ip=1
+ rhsp(i,j,k) = (rho*dxi/dt)*(u(ip,j,k)-u(i,j,k))
+ rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(v(i,jp,k)-v(i,j,k))
+ rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(w(i,j,kp)-w(i,j,k))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ ! (uncomment for profiling)
+ ! call nvtxEndRange
+ ! call nvtxStartRange("FFT forward w/ transpositions")
+ !$acc host_data use_device(rhsp)
+ status = cufftExecD2Z(planXf, rhsp, psi_d)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'X forward error: ', status
+ !$acc end host_data
+
+ ! psi(kx,y,z) -> psi(y,z,kx)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,piX_d2z%halo_extents, [0,0,0]))
+ ! psi(y,z,kx) -> psi(ky,z,kx)
+ status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_FORWARD)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'Y forward error: ', status
+ ! psi(ky,z,kx) -> psi(z,kx,ky)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX))
+ ! psi(z,kx,ky) -> psi(kz,kx,ky)
+ status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_FORWARD)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'Z forward error: ', status
+ ! END of FFT3D forward
+
+ ! call nvtxEndRange
+ np(piZ_d2z%order(1)) = piZ_d2z%shape(1)
+ np(piZ_d2z%order(2)) = piZ_d2z%shape(2)
+ np(piZ_d2z%order(3)) = piZ_d2z%shape(3)
+ call c_f_pointer(c_devloc(psi_d), phi3d, piZ_d2z%shape)
+ ! divide by -K**2, and normalize
+ offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1
+ offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1
+ offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1
+ xoff = offsets(1)
+ yoff = offsets(2)
+ npx = np(1)
+ npy = np(2)
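+ ! note: cuFFT transforms are unnormalized, hence the division by nx*ny*nz in the loop below after the forward/inverse pair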
+ ! call nvtxStartRange("Solution")
+ !$acc kernels
+ do jl = 1, npy
+ jg = yoff + jl
+ do il = 1, npx
+ ig = xoff + il
+ do k = 1, nz
+ k2 = kx_d(ig)**2 + kx_d(jg)**2 + kx_d(k)**2
+ phi3d(k,il,jl) = -phi3d(k,il,jl)/k2/(int(nx,8)*int(ny,8)*int(nz,8))
+ enddo
+ enddo
+ enddo
+ ! specify mean (corrects division by zero wavenumber above)
+ if (xoff == 0 .and. yoff == 0) phi3d(1,1,1) = 0.d0
+ !$acc end kernels
+ ! call nvtxEndRange
+ ! call nvtxStartRange("FFT backwards w/ transpositions")
+
+ ! psi(kz,kx,ky) -> psi(z,kx,ky)
+ status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_INVERSE)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'Z inverse error: ', status
+ ! psi(z,kx,ky) -> psi(ky,z,kx)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX))
+ ! psi(ky,z,kx) -> psi(y,z,kx)
+ status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_INVERSE)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'Y inverse error: ', status
+ ! psi(y,z,kx) -> psi(kx,y,z)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,[0,0,0], piX_d2z%halo_extents))
+ !$acc host_data use_device(p)
+ ! psi(kx,y,z) -> psi(x,y,z)
+ status = cufftExecZ2D(planXb, psi_d, p)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'X inverse error: ', status
+ !$acc end host_data
+ ! call nvtxEndRange
+
+ !$acc host_data use_device(p)
+ ! update halo nodes with pressure (needed for the pressure correction step)
+ ! Update X-pencil halos in Y and Z direction
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ ! call nvtxEndRange
+ !########################################################################################################################################
+ ! END STEP 7: POISSON SOLVER FOR PRESSURE
+ !########################################################################################################################################
+
+
+
+ !########################################################################################################################################
+ ! START STEP 8: VELOCITY CORRECTION
+ ! ########################################################################################################################################
+ ! 8.1 Correct velocity
+ ! 8.2 Remove mean velocity if using ABC forcing
+ ! 8.3 Call halo exchanges along Y and Z for u,v,w
+ ! Correct velocity, pressure has also the halo
+ ! call nvtxStartRange("Correction")
+
+ !$acc kernels
+ umean=0.d0
+ vmean=0.d0
+ wmean=0.d0
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i = 1, piX%shape(1) ! equal to nx (no halo on x)
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (im < 1) im=nx
+ u(i,j,k)=u(i,j,k) - dt/rho*(p(i,j,k)-p(im,j,k))*dxi
+ v(i,j,k)=v(i,j,k) - dt/rho*(p(i,j,k)-p(i,jm,k))*dxi
+ w(i,j,k)=w(i,j,k) - dt/rho*(p(i,j,k)-p(i,j,km))*dxi
+ umean=umean + u(i,j,k)
+ vmean=vmean + v(i,j,k)
+ wmean=wmean + w(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Remove mean velocity (get local mean of the rank)
+
+ ! Divide by total number of points in the pencil
+ umean=umean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext)
+ vmean=vmean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext)
+ wmean=wmean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext)
+
+ ! Find global mean (MPI_SUM and then divide by number of ranks)
+ call MPI_Allreduce(umean,gumean,1,MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD, ierr)
+ call MPI_Allreduce(vmean,gvmean,1,MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD, ierr)
+ call MPI_Allreduce(wmean,gwmean,1,MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD, ierr)
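+ ! note: averaging the per-rank means (division by ranks below) is exact only if every pencil holds the same number of inner points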
+
+ ! remove mean value
+ !$acc kernels
+ u=u-(gumean/ranks)
+ v=v-(gvmean/ranks)
+ w=w-(gwmean/ranks)
+ !$acc end kernels
+
+ ! 8.3 update halos (y and z directions), needed by the staggered-grid stencils of the next time step
+ !$acc host_data use_device(u)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(v)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(w)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! find local maximum velocity magnitude (abs: negative velocities limit the CFL just as much)
+ uc=maxval(abs(u))
+ vc=maxval(abs(v))
+ wc=maxval(abs(w))
+ umax=max(wc,max(uc,vc))
+ call MPI_Allreduce(umax,gumax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
+ cou=gumax*dt*dxi
+ if (rank.eq.0) then
+ write(*,*) "CFL (max among tasks)", cou
+ write(*,*) "umax", gumax
+ endif
+ ! cou is identical on every rank (gumax comes from an Allreduce), so all ranks stop together;
+ ! stopping inside the rank-0 block would leave the other ranks hanging at the next collective
+ if (cou .gt. 7) stop
+ if (.not. ieee_is_finite(cou)) then
+ if (rank.eq.0) print *, "Error: cou is infinite or NaN!"
+ stop
+ end if
+
+ call cpu_time(timef)
+ if (rank.eq.0) print '(" Time elapsed = ",f8.1," ms")',1000*(timef-times)
+
+ ! (uncomment for profiling)
+ ! call nvtxEndRange
+
+ !########################################################################################################################################
+ ! END STEP 8: VELOCITY CORRECTION
+ !########################################################################################################################################
+
+
+ !########################################################################################################################################
+ ! START STEP 9: OUTPUT FIELDS
+ ! ########################################################################################################################################
+ if (mod(t,dump) .eq. 0) then
+ if (rank .eq. 0) write(*,*) "Saving output files"
+ ! write velocity and pressure fields (1-4)
+ call writefield(t,1)
+ call writefield(t,2)
+ call writefield(t,3)
+ call writefield(t,4)
+ #if phiflag == 1
+ ! write phase-field (5)
+ call writefield(t,5)
+ #endif
+ endif
+ !########################################################################################################################################
+ ! END STEP 9: OUTPUT FIELDS
+ !########################################################################################################################################
+
+! (uncomment for profiling)
+! call nvtxEndRange
+
+enddo
+call cpu_time(t_end)
+elapsed = t_end-t_start
+if (rank .eq. 0) write(*,*) 'Elapsed time (seconds):', elapsed
+#if partflag == 1
+!$acc end data
+!$acc end data
+!$acc end data
+#endif
+!$acc end data
+!$acc end data
+!$acc end data
+
+#if partflag == 1
+! Particle variables
+deallocate(part)
+deallocate(partbuff)
+deallocate(vec_p)
+deallocate(order_p)
+deallocate(buffvar1)
+deallocate(buffvar2)
+#endif
+
+! Deallocate arrays (keep this list in sync with newly added allocations)
+deallocate(x_ext)
+deallocate(u,v,w)
+deallocate(tanh_psi, mysin, mycos)
+deallocate(rhsu,rhsv,rhsw)
+deallocate(rhsu_o,rhsv_o,rhsw_o)
+deallocate(phi,rhsphi,normx,normy,normz)
+deallocate(phi_tmp, rhsphik2, rhsphik3, rhsphik4)
+
+call mpi_finalize(ierr)
+
+end program main
\ No newline at end of file
diff --git a/hit/mn5.sh b/hit/mn5.sh
new file mode 100644
index 0000000..12a62b4
--- /dev/null
+++ b/hit/mn5.sh
@@ -0,0 +1,5 @@
+module purge
+module load nvidia-hpc-sdk/24.3
+cp Makefile_mn5 Makefile
+make clean
+make
\ No newline at end of file
diff --git a/hit/module.f90 b/hit/module.f90
new file mode 100755
index 0000000..1a25def
--- /dev/null
+++ b/hit/module.f90
@@ -0,0 +1,82 @@
+module param
+ integer, parameter :: nx=512
+ integer :: ny=nx,nz=nx
+ double precision :: pi,lx,dx,dxi,ddxi,rhoi,twopi
+ integer :: restart,tstart,tfin,dump
+ double precision :: gamma, normag
+ double precision :: dt,mu,rho !flow parameters
+ integer :: inflow, inphi
+ double precision :: f1,f2,f3,k0 ! forcing parameters
+ double precision :: radius, sigma, epsr, eps, pos, val, epsi, enum ! phase-field parameters
+ double precision :: times,timef
+ ! other variables (wavenumber, grid location)
+ real(8), allocatable :: x(:), kx(:)
+ real(8), allocatable :: x_ext(:)
+ double precision :: yinf, ysup ! Inf and Sup of y in PiX (no halo)
+ double precision :: zinf, zsup ! Inf and Sup of z in PiX (no halo)
+ double precision :: lyloc, lzloc ! Inf and Sup of y in PiX
+
+end module param
+
+
+module mpivar
+ ! MPI variables
+ integer :: rank, ranks, ierr
+ integer :: localRank, localComm
+ integer :: nidp1y, nidm1y, nidp1z, nidm1z
+end module mpivar
+
+
+module cudecompvar
+ use cudecomp
+ integer :: npx, npy, npz
+ type(cudecompHandle) :: handle
+ type(cudecompGridDesc) :: grid_desc,grid_descD2Z
+ type(cudecompGridDescConfig) :: config
+ type(cudecompGridDescAutotuneOptions) :: options
+ integer :: pdims(2) ! pr x pc pencils
+ integer :: gdims(3) ! global grid dimensions
+ integer :: halo(3) ! halo extensions
+ integer :: halo_ext ! 0 no halo, 1 means 1 halo
+ type(cudecompPencilInfo) :: piX, piY, piZ ! size of the pencils in x- y- and z-configuration
+ type(cudecompPencilInfo) :: piX_d2z, piY_d2z, piZ_d2z ! size of the pencils in x- y- and z-configuration for D2Z
+ type(cudecompPencilInfo) :: piX_Poiss
+ integer(8) :: nElemX, nElemY, nElemZ, nElemWork, nElemWork_halo,nElemWork_halo_d2z
+ integer(8) :: nElemX_d2z, nElemY_d2z, nElemZ_d2z, nElemWork_d2z
+ logical :: halo_periods(3)
+ integer :: pix_yoff,pix_zoff
+end module cudecompvar
+
+
+module velocity
+ double precision, allocatable :: u(:,:,:), v(:,:,:), w(:,:,:)
+ double precision, allocatable :: rhsu(:,:,:), rhsv(:,:,:), rhsw(:,:,:)
+ double precision, allocatable :: rhsu_o(:,:,:), rhsv_o(:,:,:), rhsw_o(:,:,:)
+ complex(8), allocatable :: rhsp_complex(:,:,:)
+ double precision, allocatable :: rhsp(:,:,:), p(:,:,:)
+ double precision, allocatable :: rhspp(:,:,:), pp(:,:,:)
+ double precision, allocatable :: div(:,:,:)
+ double precision :: uc, vc, wc, umax, gumax=1.0d0, cou, alpha, beta
+ double precision :: h11, h12, h13, h21, h22, h23, h31, h32, h33
+ double precision :: umean, vmean, wmean, gumean, gvmean, gwmean
+ double precision, allocatable :: mysin(:), mycos(:)
+end module velocity
+
+
+module phase
+ double precision, allocatable :: phi(:,:,:), rhsphi(:,:,:), psidi(:,:,:), tanh_psi(:,:,:)
+ double precision, allocatable :: normx(:,:,:), normy(:,:,:), normz(:,:,:)
+ double precision, allocatable :: normx_f(:,:,:), normy_f(:,:,:), normz_f(:,:,:)
+ double precision :: curv
+ double precision, allocatable :: fxst(:,:,:), fyst(:,:,:), fzst(:,:,:)
+ double precision, allocatable :: rhsphik1(:,:,:),rhsphik2(:,:,:), rhsphik3(:,:,:), rhsphik4(:,:,:), phi_tmp(:,:,:)
+ double precision :: normx_xm,normx_xp,normx_ym,normx_yp,normx_zm,normx_zp
+ double precision :: normy_xm,normy_xp,normy_ym,normy_yp,normy_zm,normy_zp
+ double precision :: normz_xm,normz_xp,normz_ym,normz_yp,normz_zm,normz_zp
+ double precision :: rn_01,rn_11,rn_02,rn_12,rn_03,rn_13
+ double precision :: sharpxm,sharpxp,sharpym,sharpyp,sharpzm,sharpzp
+end module phase
+
+
+
+
diff --git a/hit/particles.f90 b/hit/particles.f90
new file mode 100644
index 0000000..5e7d0ea
--- /dev/null
+++ b/hit/particles.f90
@@ -0,0 +1,382 @@
+module particles
+ use sort
+ implicit none
+ ! Total number of particles
+ integer:: npart
+ ! Local (per process) number of particles
+ integer :: nploc
+ ! Estimated maximum number of particles per process
+ integer :: nplocmax
+ ! Array containing the info of the particles (every column is a different particle)
+ double precision, allocatable :: part(:,:)
+ ! Buffer for particle array
+ double precision, allocatable :: partbuff(:,:)
+ ! Number of Info per particle
+ integer, parameter :: ninfop = 12
+ !....Indices for particle info
+ integer, parameter :: Itag=1 ! Row 1: Tag of the particle
+ integer, parameter :: Ixp=2,Iyp=3,Izp=4 ! Row 2 to 4: Current x,y,z of the particle
+ integer, parameter :: Iup=5,Ivp=6,Iwp=7 ! Row 5 to 7: Current u,v,w of the particle
+ integer, parameter :: Iphip=8 ! Row 8: phi at the current location of the particle
+ integer, parameter :: Iup1=9,Ivp1=10,Iwp1=11 ! Row 9 to 11: Previous (step) u,v,w of the particle
+ integer, parameter :: Iphip1=12 ! Row 12: Previous (step) phi of the particle
+
+ ! Array for particle ordering
+ double precision, allocatable :: vec_p(:)
+ integer, allocatable :: order_p(:)
+ integer :: inpart
+
+ ! Number of Particles to be Transferred
+ integer::Nsendp1,Nsendm1,Nrecvp1,Nrecvm1
+
+ ! Buffer for particle transfer
+ double precision, allocatable,dimension(:,:)::buffvar1,buffvar2
+
+ contains
+!***********************************************************************
+subroutine particlegenerator(inflag)
+ use param
+ use mpivar
+ use cudecompvar
+ implicit none
+ integer :: inflag
+ integer :: i
+
+ !....Particle Tag (globally unique: local index offset by rank*nploc)
+ do i = 1, nploc
+ part(i,Itag) = dble(i) + rank * nploc
+ enddo
+
+
+ if(inflag.eq.1)then
+ !....Generate xp position
+ call RANDOM_SEED()
+ call RANDOM_NUMBER(vec_p)
+ do i = 1, nploc
+ part(i,Ixp) = vec_p(i) * lx
+ enddo
+
+ !....Generate yp position
+ call RANDOM_SEED()
+ call RANDOM_NUMBER(vec_p)
+ do i = 1, nploc
+ part(i,Iyp) = vec_p(i) * lyloc + yinf
+ end do
+
+ !....Generate zp position
+ call RANDOM_SEED()
+ call RANDOM_NUMBER(vec_p)
+ do i = 1, nploc
+ part(i,Izp) = vec_p(i) * lzloc + zinf
+ end do
+ endif
+
+ return
+end subroutine
+!***********************************************************************
+subroutine linearinterpolation
+ use velocity
+ use param
+ use cudecompvar
+ implicit none
+ integer :: i
+ double precision :: xpt, ypt, zpt
+ integer :: ix, iy, iz, ixplus1
+ double precision :: tx, ty, tz
+
+ !$acc parallel loop present(part,u,v,w) private(xpt,ypt,zpt,ix,iy,iz,tx,ty,tz)
+ do i = 1, nploc
+ ! Particle position
+ xpt = part(i,Ixp)
+ ypt = part(i,Iyp)
+ zpt = part(i,Izp)
+
+ ! Global left-cell indices (uniform, fully periodic)
+ ix = floor( xpt / dx )
+ iy = floor( ypt / dx )
+ iz = floor( zpt / dx )
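+ ! (positions are wrapped to [0,lx) in ParticlesLeakage, so these floor() indices are non-negative)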
+
+ ! Local fractions in [0,1)
+ tx = xpt/dx - dble(ix)
+ ty = ypt/dx - dble(iy)
+ tz = zpt/dx - dble(iz)
+
+ ! Map to 1-based global indices and wrap, then to local by offsets
+ ix = 1 + ix
+ ixplus1 = ix + 1
+ ! Periodicity
+ if(ixplus1>nx) ixplus1 = 1
+ ! Periodicity is already accounted by the halos
+ iy = 1 + iy - pix_yoff
+ iz = 1 + iz - pix_zoff
+
+ ! 1-D linear per component (MAC: faces along native dir)
+ part(i,Iup) = (1.0d0 - tx)*u(ix , iy , iz ) + tx*u(ixplus1 , iy , iz )
+ part(i,Ivp) = (1.0d0 - ty)*v(ix , iy , iz ) + ty*v(ix , iy+1 , iz )
+ part(i,Iwp) = (1.0d0 - tz)*w(ix , iy , iz ) + tz*w(ix , iy , iz+1 )
+ end do
+ !$acc end parallel loop
+
+ return
+end subroutine
+!***********************************************************************
+subroutine SortPartY()
+ implicit none
+ integer :: i, j
+
+ !$acc parallel loop collapse(2)
+ do j = 1, ninfop
+ do i = 1, nploc
+ partbuff(i,j) = part(i,j)
+ vec_p(i) = part(i,Iyp)
+ order_p(i) = i
+ enddo
+ enddo
+
+ !$acc host_data use_device(vec_p,order_p)
+ call fsort(vec_p,order_p,nploc,.true.)
+ !$acc end host_data
+
+ !$acc parallel loop collapse(2)
+ do j = 1, ninfop
+ do i = 1, nploc
+ part(i,j) = partbuff(order_p(i),j)
+ end do
+ end do
+
+ return
+end subroutine
+!***********************************************************************
+subroutine SortPartZ()
+ implicit none
+ integer :: i, j
+
+ !$acc parallel loop collapse(2)
+ do j = 1, ninfop
+ do i = 1, nploc
+ partbuff(i,j) = part(i,j)
+ vec_p(i) = part(i,Izp)
+ order_p(i) = i
+ enddo
+ enddo
+
+ !$acc host_data use_device(vec_p,order_p)
+ call fsort(vec_p,order_p,nploc,.true.)
+ !$acc end host_data
+
+ !$acc parallel loop collapse(2)
+ do j = 1, ninfop
+ do i = 1, nploc
+ part(i,j) = partbuff(order_p(i),j)
+ end do
+ end do
+
+ return
+end subroutine
+!***********************************************************************
+subroutine CountPartTransfY()
+ use param
+ implicit none
+ integer :: flag,scanner
+ integer :: i
+ integer :: addp,addm
+
+ Nsendp1=0 !Number of particles to send to nid+1 (in Y)
+ Nrecvm1=0 !Number of particles to receive from nid-1 (in Y)
+ Nsendm1=0 !Number of particles to send to nid-1 (in Y)
+ Nrecvp1=0 !Number of particles to receive from nid+1 (in Y)
+
+
+ !$acc parallel loop default(present) private(addp,addm) reduction(+:Nsendp1,Nsendm1)
+ do i = 1, nploc
+ addp=0
+ addm=0
+ if(part(i,Iyp).GE.ysup) addp=1
+ if(part(i,Iyp).LT.yinf) addm=1
+ Nsendp1=Nsendp1+addp
+ Nsendm1=Nsendm1+addm
+ end do
+
+ return
+end subroutine
+!***********************************************************************
+subroutine CountPartTransfZ()
+ use param
+ implicit none
+ integer :: flag,scanner
+ integer :: i
+ integer :: addp,addm
+
+ Nsendp1=0 !Number of particles to send to nid+1 (in Z)
+ Nrecvm1=0 !Number of particles to receive from nid-1 (in Z)
+ Nsendm1=0 !Number of particles to send to nid-1 (in Z)
+ Nrecvp1=0 !Number of particles to receive from nid+1 (in Z)
+
+
+ !$acc parallel loop default(present) private(addp,addm) reduction(+:Nsendp1,Nsendm1)
+ do i = 1, nploc
+ addp=0
+ addm=0
+ if(part(i,Izp).GE.zsup) addp=1
+ if(part(i,Izp).LT.zinf) addm=1
+ Nsendp1=Nsendp1+addp
+ Nsendm1=Nsendm1+addm
+ end do
+
+ return
+end subroutine
+!***********************************************************************
+subroutine SendPartUP(direction)
+ use mpi
+ use mpivar
+ implicit none
+ integer :: direction
+ integer::NsendMAX
+ integer,dimension(MPI_STATUS_SIZE):: status
+ integer :: ii, jj
+ integer :: nidp1, nidm1
+
+ if(direction.eq.2)then
+ nidp1 = nidp1y
+ nidm1 = nidm1y
+ elseif(direction.eq.3)then
+ nidp1 = nidp1z
+ nidm1 = nidm1z
+ endif
+
+ !...Proc nid sends to proc nid+1 the number of particles
+ !...that nid+1 has to receive from nid
+ call MPI_Sendrecv(Nsendp1,1,MPI_INTEGER, nidp1, nidp1,&
+ Nrecvm1,1,MPI_INTEGER,nidm1,rank,MPI_COMM_WORLD,status,ierr)
+
+ call MPI_Allreduce(Nsendp1,NsendMAX,1,MPI_INTEGER,MPI_MAX,MPI_COMM_WORLD,ierr)
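+ ! the global max lets every rank know whether any transfer happens this step, so the exchange block below is entered (or skipped) uniformly and the Sendrecv pairs always match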
+
+ if(NsendMAX.GT.0)then
+ if(Nsendp1.GT.0)then
+ !$acc parallel loop collapse(2)
+ do jj=1,Nsendp1
+ do ii=1,ninfop
+ buffvar1(ii,jj)=part(nploc-Nsendp1+jj,ii)
+ end do
+ end do
+ end if
+
+ !$acc host_data use_device(buffvar1,buffvar2)
+ !....Proc nid sends to nid+1 and receives from nid-1
+ call MPI_Sendrecv(buffvar1(1,1),int(ninfop*Nsendp1),MPI_DOUBLE_PRECISION,&
+ nidp1, nidp1,buffvar2(1,1),int(ninfop*Nrecvm1),MPI_DOUBLE_PRECISION,&
+ nidm1,rank,MPI_COMM_WORLD,STATUS,ierr)
+ !$acc end host_data
+
+ if(Nrecvm1.GT.0)then
+ !....Each proc place received data in the first block
+ !$acc parallel loop collapse(2)
+ do jj=1,ninfop
+ do ii=1,Nrecvm1
+ partbuff(ii,jj)=buffvar2(jj,ii)
+ end do
+ end do
+ end if
+ end if
+
+
+ !....Adjust central block (not transferred data)
+ !$acc parallel loop collapse(2)
+ do jj=1,ninfop
+ do ii=1,(nploc-Nsendm1-Nsendp1)
+ partbuff(Nrecvm1+ii,jj)=part(Nsendm1+ii,jj)
+ end do
+ end do
+
+ return
+end subroutine
+!***********************************************************************
+subroutine SendPartDOWN(direction)
+ use mpi
+ use mpivar
+ implicit none
+ integer :: direction
+ integer::NsendMAX
+ integer,dimension(MPI_STATUS_SIZE):: status
+ integer :: ii, jj
+ integer :: nidp1, nidm1
+
+
+ if(direction.eq.2)then
+ nidp1 = nidp1y
+ nidm1 = nidm1y
+ elseif(direction.eq.3)then
+ nidp1 = nidp1z
+ nidm1 = nidm1z
+ endif
+
+ !...Proc nid sends to proc nid-1 the number of particles
+ !...that nid-1 has to receive from nid
+ call MPI_Sendrecv(Nsendm1,1,MPI_INTEGER, nidm1, nidm1,&
+ Nrecvp1,1,MPI_INTEGER,nidp1,rank,MPI_COMM_WORLD,STATUS,ierr)
+
+ call MPI_Allreduce(Nsendm1,NsendMAX,1,MPI_INTEGER,MPI_MAX,MPI_Comm_World,ierr)
+
+
+ if(NsendMAX.GT.0)then
+ if(Nsendm1.GT.0)then
+ !$acc parallel loop collapse(2)
+ do jj=1,Nsendm1
+ do ii=1,ninfop
+ buffvar1(ii,jj)=part(jj,ii)
+ end do
+ end do
+ end if
+
+ !$acc host_data use_device(buffvar1,buffvar2)
+ !....Proc nid sends to nid-1 and receives from nid+1
+ call MPI_Sendrecv(buffvar1(1,1),int(ninfop*Nsendm1),MPI_DOUBLE_PRECISION,&
+ nidm1, nidm1,buffvar2(1,1),int(ninfop*Nrecvp1),MPI_DOUBLE_PRECISION,&
+ nidp1, rank ,MPI_COMM_WORLD,STATUS,ierr)
+ !$acc end host_data
+
+ if(Nrecvp1.GT.0)then
+ !....Each proc append received data
+ !$acc parallel loop collapse(2)
+ do jj=1,ninfop
+ do ii=1,Nrecvp1
+ partbuff(ii+Nrecvm1+nploc-Nsendm1-Nsendp1,jj)=buffvar2(jj,ii)
+ end do
+ end do
+ end if
+
+ end if
+
+ !...Update Local Number of Particles
+ nploc=nploc+Nrecvm1+Nrecvp1-Nsendp1-Nsendm1
+ !$acc parallel loop collapse(2)
+ do jj=1,ninfop
+ do ii=1,nploc
+ part(ii,jj)=partbuff(ii,jj)
+ end do
+ end do
+
+
+ return
+end subroutine
+!***********************************************************************
+subroutine ParticlesLeakage()
+ use param
+ !....Particles which escape from the domain along x, y and z must
+ !....be reintroduced
+ implicit none
+
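+ !....The box is cubic (ly = lz = lx), so lx is used to wrap all three directions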
+ !$acc kernels
+ !....Along x
+ part(1:nploc,Ixp)=part(1:nploc,Ixp)-lx*FLOOR(part(1:nploc,Ixp)/lx)
+ !....Along y
+ part(1:nploc,Iyp)=part(1:nploc,Iyp)-lx*FLOOR(part(1:nploc,Iyp)/lx)
+ !....Along z
+ part(1:nploc,Izp)=part(1:nploc,Izp)-lx*FLOOR(part(1:nploc,Izp)/lx)
+ !$acc end kernels
+
+ return
+end subroutine
+!***********************************************************************
+end module particles
\ No newline at end of file
diff --git a/hit/readinput.f90 b/hit/readinput.f90
new file mode 100644
index 0000000..2dabaa2
--- /dev/null
+++ b/hit/readinput.f90
@@ -0,0 +1,83 @@
+
+!##########################################################################
+!###########################################################################
+subroutine readinput
+use velocity
+use phase
+use param
+use mpivar
+use particles
+implicit none
+
+open(unit=55,file='input.inp',form='formatted',status='old')
+!Time step parameters
+read(55,*) restart
+read(55,*) tstart
+read(55,*) tfin
+read(55,*) dump
+!Flow parameters
+read(55,*) inflow
+read(55,*) inphi
+read(55,*) dt
+read(55,*) mu
+read(55,*) rho
+! forcing parameters
+read(55,*) f1
+read(55,*) f2
+read(55,*) f3
+read(55,*) k0
+! phase-field parameters
+read(55,*) radius
+read(55,*) sigma
+read(55,*) epsr
+! particle parameters
+read(55,*) npart
+read(55,*) inpart
+
+
+! compute pre-defined constants
+twopi=8.0_8*atan(1.0_8)
+lx=twopi
+! dx = lx/(nx-1)
+dx = lx/(nx)
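+! periodic domain: nx cells span lx exactly, so dx = lx/nx (the commented lx/(nx-1) would apply to a node-based, non-periodic grid)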
+
+dxi=1.d0/dx
+ddxi=1.d0/dx/dx
+rhoi=1.d0/rho
+eps=epsr*dx
+epsi=1.d0/eps
+enum=1.e-16
+
+if (rank .eq. 0) then
+ ! print run parameters as a debug check (comment out to disable)
+ write(*,*) "----------------------------------------------"
+ write(*,*) "███ ███ ██ ██ ██ ████████ ██████ ██████"
+ write(*,*) "████ ████ ██ ██ ██ ██ ██ ██"
+ write(*,*) "██ ████ ██ ███████ ██ ██ █████ ███████"
+ write(*,*) "██ ██ ██ ██ ██ ██ ██ ██ ██ ██"
+ write(*,*) "██ ██ ██ ██ ██ ██ ██████ ██████"
+ write(*,*) "----------------------------------------------"
+ write(*,*) 'Grid:', nx, 'x', nx, 'x', nx
+ write(*,*) "Restart ", restart
+ write(*,*) "Tstart ", tstart
+ write(*,*) "Tfin ", tfin
+ write(*,*) "Dump ", dump
+ write(*,*) "Inflow ", inflow
+ write(*,*) "Deltat ", dt
+ write(*,*) "Mu ", mu
+ write(*,*) "Rho ", rho
+ write(*,*) "f1,f2,f3,k0", f1,f2,f3,k0
+ write(*,*) "Radius ", radius
+ write(*,*) "Sigma ", sigma
+ write(*,*) "Eps ", eps
+ write(*,*) "Epsi ", epsi
+ write(*,*) "Lx ", lx
+ write(*,*) "dx", dx
+ write(*,*) "dxi", dxi
+ write(*,*) "ddxi", ddxi
+ write(*,*) "rhoi", rhoi
+endif
+
+end subroutine
+
+
diff --git a/hit/readwrite.f90 b/hit/readwrite.f90
new file mode 100644
index 0000000..44df20f
--- /dev/null
+++ b/hit/readwrite.f90
@@ -0,0 +1,318 @@
+subroutine writefield(t,fieldn)
+! Output field, file is written in the /output folder
+
+use velocity
+use phase
+use mpi
+use mpivar
+use param
+use cudecompvar
+
+implicit none
+
+integer :: g_size(3),p_size(3),fstart(3)
+integer :: t,fieldn
+character(len=40) :: namefile
+integer(mpi_offset_kind) :: offset=0
+integer :: f_handle ! file handle
+integer :: ftype
+double precision, allocatable :: out(:,:,:)
+
+! fieldn=1 means u
+! fieldn=2 means v
+! fieldn=3 means w
+! fieldn=4 means p
+! fieldn=5 means phi
+
+! define basic quantities to be used later (global and pencil sizes)
+g_size=[nx, ny, nz] ! global size
+p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x
+fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1]
+! for debug
+!write(*,*) "g_size", g_size
+!write(*,*) "p_size", p_size
+!write(*,*) "fstart", fstart
+allocate(out(p_size(1),p_size(2),p_size(3))) !<- halo removed
+
+!write(*,*) "in readwrite"
+
+if (fieldn .eq. 1) then
+ out=u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/u_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 2) then
+ out=v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/v_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 3) then
+ out=w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/w_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 4) then
+ out=p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/p_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 5) then
+ out=phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/phi_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+deallocate(out)
+
+end subroutine
+
+
+
+
+
+
+
+
+
+
+subroutine readfield(fieldn)
+! Input field, file is read from the /input folder
+
+use velocity
+use phase
+use mpi
+use mpivar
+use param
+use cudecompvar
+
+implicit none
+
+integer :: g_size(3),p_size(3),fstart(3)
+integer :: fieldn
+character(len=40) :: namefile
+integer(mpi_offset_kind) :: offset=0
+integer :: f_handle ! file handle
+integer :: ftype
+double precision, allocatable :: in(:,:,:)
+
+! fieldn=1 means u
+! fieldn=2 means v
+! fieldn=3 means w
+! fieldn=4 means p
+! fieldn=5 means phi
+
+! define basic quantities to be used later (global and pencil sizes)
+g_size=[nx, ny, nz] ! global size
+p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x
+fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and index start from 0 (not 1)
+! for debug
+!write(*,*) "g_size", g_size
+!write(*,*) "p_size", p_size
+!write(*,*) "fstart", fstart
+allocate(in(p_size(1),p_size(2),p_size(3))) !<- no halos read
+
+!write(*,*) "in readwrite"
+
+if (fieldn .eq. 1) then
+ namefile='./input/u.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 2) then
+ namefile='./input/v.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 3) then
+ namefile='./input/w.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 4) then
+ namefile='./input/p.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 5) then
+ namefile='./input/phi.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+deallocate(in)
+
+end subroutine
+
+
+
+
+
+
+
+
+
+
+subroutine readfield_restart(t,fieldn)
+! Used in case of restart: the file is read from the ./output folder (iteration tstart must be present!)
+
+use velocity
+use phase
+use mpi
+use mpivar
+use param
+use cudecompvar
+
+implicit none
+
+integer :: g_size(3),p_size(3),fstart(3)
+integer :: t,fieldn
+character(len=40) :: namefile
+integer(mpi_offset_kind) :: offset=0
+integer :: f_handle ! file handle
+integer :: ftype
+double precision, allocatable :: in(:,:,:)
+
+! fieldn=1 means u
+! fieldn=2 means v
+! fieldn=3 means w
+! fieldn=4 means p
+! fieldn=5 means phi
+
+! define basic quantities to be used later (global and pencil sizes)
+g_size=[nx, ny, nz] ! global size
+p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x
+fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and index start from 0 (not 1)
+! for debug
+!write(*,*) "g_size", g_size
+!write(*,*) "p_size", p_size
+!write(*,*) "fstart", fstart
+allocate(in(p_size(1),p_size(2),p_size(3))) !<- no halos read
+
+!write(*,*) "in readwrite"
+
+if (fieldn .eq. 1) then
+ write(namefile,'(a,i8.8,a)') './output/u_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 2) then
+ write(namefile,'(a,i8.8,a)') './output/v_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 3) then
+ write(namefile,'(a,i8.8,a)') './output/w_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 4) then
+ write(namefile,'(a,i8.8,a)') './output/p_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 5) then
+ write(namefile,'(a,i8.8,a)') './output/phi_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+deallocate(in)
+
+end subroutine
\ No newline at end of file
diff --git a/hit/testpush.sh b/hit/testpush.sh
new file mode 100644
index 0000000..2e0e21a
--- /dev/null
+++ b/hit/testpush.sh
@@ -0,0 +1,2 @@
+make clean
+make
diff --git a/multi/main.f90 b/multi/main.f90
old mode 100755
new mode 100644
index 67dd398..beb351f
--- a/multi/main.f90
+++ b/multi/main.f90
@@ -285,11 +285,13 @@ program main
allocate(div(piX%shape(1),piX%shape(2),piX%shape(3)))
!PFM variables
#if phiflag == 1
- allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi_o(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)))
allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3)))
allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3)))
allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(normx_f(piX%shape(1),piX%shape(2),piX%shape(3)),normy_f(piX%shape(1),piX%shape(2),piX%shape(3)),normz_f(piX%shape(1),piX%shape(2),piX%shape(3)))
allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces
+ allocate(phi_tmp(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik2(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik3(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik4(piX%shape(1),piX%shape(2),piX%shape(3)))
#endif
! allocate arrays for transpositions and halo exchanges
@@ -439,9 +441,9 @@ program main
! First step use Euler
alpha=1.0d0
beta=0.0d0
-gumax=1.d0
+gumax=8.0d0
tstart=tstart+1
-gamma=1.d0*gumax
+gamma=1.d0*gumax ! initial gamma for phase-field
!$acc data copyin(piX)
!$acc data create(rhsu_o, rhsv_o, rhsw_o)
!$acc data copyin(mysin, mycos)
@@ -462,7 +464,6 @@ program main
if (rank.eq.0) write(*,*) "Time step",t,"of",tfin
call cpu_time(times)
-
!########################################################################################################################################
! START STEP 4: PARTICLES (TRACERS)
!########################################################################################################################################
@@ -519,6 +520,8 @@ program main
! START STEP 5: PHASE-FIELD SOLVER (EXPLICIT)
!########################################################################################################################################
#if phiflag == 1
+ ! 4.2 Get phi at n+1 using RK4
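+ ! classical RK4: four stages k1..k4 are evaluated below from the intermediate states
+ ! phi + dt/2*k1, phi + dt/2*k2, phi + dt*k3, then (standard combination assumed)
+ ! phi^(n+1) = phi^n + dt/6*(k1 + 2*k2 + 2*k3 + k4)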
+ ! first stage of RK4 - saved in rhsphi
!$acc kernels
do k=1, piX%shape(3)
do j=1, piX%shape(2)
@@ -533,7 +536,8 @@ program main
enddo
!$acc end kernels
- gamma=1.d0*gumax
+ gamma=1.d0*gumax ! update gamma every time step with the current max velocity value
+ if (rank.eq.0) write(*,*) "gamma:", gamma
!$acc parallel loop tile(16,4,2)
do k=1+halo_ext, piX%shape(3)-halo_ext
do j=1+halo_ext, piX%shape(2)-halo_ext
@@ -547,14 +551,160 @@ program main
km=k-1
if (ip .gt. nx) ip=1
if (im .lt. 1) im=nx
- ! convective (first three lines) and diffusive (last three lines)
+ ! convective (first six lines) and diffusive (last three lines)
+ ! flux-splitting
rhsphi(i,j,k) = &
- - (u(ip,j,k)*0.5d0*(phi(ip,j,k)+phi(i,j,k)) - u(i,j,k)*0.5d0*(phi(i,j,k)+phi(im,j,k)))*dxi &
- - (v(i,jp,k)*0.5d0*(phi(i,jp,k)+phi(i,j,k)) - v(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,jm,k)))*dxi &
- - (w(i,j,kp)*0.5d0*(phi(i,j,kp)+phi(i,j,k)) - w(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,j,km)))*dxi &
+ - (max(u(ip,j,k),0.0d0)*phi(i,j,k) + min(u(ip,j,k),0.0d0)*phi(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi(i,j,k) - max(u(i,j,k),0.0d0)*phi(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi(i,j,k) + min(v(i,jp,k),0.0d0)*phi(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi(i,j,k) - max(v(i,j,k),0.0d0)*phi(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi(i,j,k) + min(w(i,j,kp),0.0d0)*phi(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi(i,j,k) - max(w(i,j,k),0.0d0)*phi(i,j,km))*dxi &
+ gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + &
eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddxi + &
- eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi)
+ eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
+
+ ! Update normx,normy and normz halos, required to then compute normal derivative
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Substep 2: Compute normals (1.e-16 is a numerical tolerance to avoid 0/0) for the surface tension force computation later
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ normag = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + enum)
+ normx_f(i,j,k) = normx(i,j,k)*normag
+ normy_f(i,j,k) = normy(i,j,k)*normag
+ normz_f(i,j,k) = normz(i,j,k)*normag
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Compute sharpening term flux split like CaNS-Fizzy
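+ ! ACDI sharpening flux on each face: 0.25*gamma*(1 - tanh^2(psi_face/(2*eps)))*n_face, with psi_face the two-point average; differenced with dxi below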
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphi(i,j,k)=rhsphi(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! second stage of RK4 - saved in rhsphik2
+ !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphi)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ phi_tmp(i,j,k) = phi(i,j,k) + 0.5d0 * dt * rhsphi(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end parallel loop
+ ! 4.3 Call halo exchanges along Y and Z for phi
+ !$acc host_data use_device(phi_tmp)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+ val = min(phi_tmp(i,j,k),1.0d0) ! avoid machine-precision overshoots in phi that lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+ ! compute here the tanh of distance function psi (used in the sharpening term) to avoid multiple computations of tanh
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! convective (first six lines) and diffusive (last three lines)
+ ! flux-splitting
+ rhsphik2(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+ + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+ eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+ eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
! 4.1.3. Compute normals for sharpening term (gradient)
normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
@@ -577,20 +727,133 @@ program main
CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
!$acc end host_data
- ! 4.1.3. Compute Sharpening term (gradient)
- ! Substep 2: Compute normals (1.e-16 is a numerical tollerance to avoid 0/0)
+ ! Compute sharpening term
!$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
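+ ! ACDI sharpening flux through each face: 0.25*gamma*(1 - tanh(psi_f/(2*eps))**2)*n_f, where
+ ! psi_f is the face-averaged psidi (assuming epsi = 1/eps) and n_f = rn_* is the normalized
+ ! face-averaged normal; its finite-difference divergence below balances the diffusive term
+ ! gamma*eps*lap(phi) in the RHS above, keeping the interface at its equilibrium thickness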
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphik2(i,j,k)=rhsphik2(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! third stage of RK4 - saved in rhsphik3
+ !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphik2)
do k=1, piX%shape(3)
do j=1, piX%shape(2)
do i=1,nx
- normag = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + enum)
- normx(i,j,k) = normx(i,j,k)*normag
- normy(i,j,k) = normy(i,j,k)*normag
- normz(i,j,k) = normz(i,j,k)*normag
+ phi_tmp(i,j,k) = phi(i,j,k) + 0.5d0 * dt * rhsphik2(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end parallel loop
+ ! Halo exchanges along Y and Z for phi_tmp
+ !$acc host_data use_device(phi_tmp)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+ val = min(phi_tmp(i,j,k),1.0d0) ! clip to avoid machine-precision overshoots in phi that lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+ ! precompute the tanh of the distance function psi here (used in the sharpening term) to avoid repeated tanh evaluations
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
enddo
enddo
enddo
!$acc end kernels
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! convective (first three lines) and diffusive (last three lines)
+ ! flux-splitting
+ rhsphik3(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+ + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+ eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+ eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
+
+ ! Update normx, normy and normz halos: neighbor values are needed below for the face-averaged normals in the sharpening term
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
! Compute sharpening term
!$acc kernels
@@ -605,31 +868,186 @@ program main
km=k-1
if (ip .gt. nx) ip=1
if (im .lt. 1) im=nx
- ! ACDI with pre-computed tanh
- rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-tanh_psi(ip,j,k)*tanh_psi(ip,j,k))*normx(ip,j,k) - &
- 0.25d0*(1.d0-tanh_psi(im,j,k)*tanh_psi(im,j,k))*normx(im,j,k))*0.5d0*dxi + &
- (0.25d0*(1.d0-tanh_psi(i,jp,k)*tanh_psi(i,jp,k))*normy(i,jp,k) - &
- 0.25d0*(1.d0-tanh_psi(i,jm,k)*tanh_psi(i,jm,k))*normy(i,jm,k))*0.5d0*dxi + &
- (0.25d0*(1.d0-tanh_psi(i,j,kp)*tanh_psi(i,j,kp))*normz(i,j,kp) - &
- 0.25d0*(1.d0-tanh_psi(i,j,km)*tanh_psi(i,j,km))*normz(i,j,km))*0.5d0*dxi)
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphik3(i,j,k)=rhsphik3(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ ! fourth stage of RK4 - saved in rhsphik4
+ !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphik3)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ phi_tmp(i,j,k) = phi(i,j,k) + dt * rhsphik3(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end parallel loop
+ ! Halo exchanges along Y and Z for phi_tmp
+ !$acc host_data use_device(phi_tmp)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+ val = min(phi_tmp(i,j,k),1.0d0) ! clip to avoid machine-precision overshoots in phi that lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+ ! precompute the tanh of the distance function psi here (used in the sharpening term) to avoid repeated tanh evaluations
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
enddo
enddo
enddo
!$acc end kernels
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! convective (first three lines) and diffusive (last three lines)
+ ! flux-splitting
+ rhsphik4(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+ + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+ eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+ eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
- ! 4.2 Get phi at n+1 using AB2
+ ! Update normx, normy and normz halos: neighbor values are needed below for the face-averaged normals in the sharpening term
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Compute sharpening term
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphik4(i,j,k)=rhsphik4(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Update phi with RK4
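+ ! classical RK4 weights: k1 = rhsphi, k2 = rhsphik2, k3 = rhsphik3, k4 = rhsphik4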
!$acc kernels
do k=1+halo_ext, piX%shape(3)-halo_ext
do j=1+halo_ext, piX%shape(2)-halo_ext
do i=1,nx
- phi(i,j,k) = phi(i,j,k) + dt*(alpha*rhsphi(i,j,k)-beta*rhsphi_o(i,j,k))
- rhsphi_o(i,j,k)=rhsphi(i,j,k)
+ phi(i,j,k) = phi(i,j,k) + dt/6.0d0*(rhsphi(i,j,k) + 2.0d0*rhsphik2(i,j,k) + &
+ 2.0d0*rhsphik3(i,j,k) + rhsphik4(i,j,k))
enddo
enddo
enddo
!$acc end kernels
- ! 4.3 Call halo exchnages along Y and Z for phi
+ ! 4.3 Call halo exchanges along Y and Z for phi
!$acc host_data use_device(phi)
CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
@@ -732,7 +1150,7 @@ program main
if (ip .gt. nx) ip=1
if (im .lt. 1) im=nx
! continuum-surface force implementation
- curv=0.5d0*(normx(ip,j,k)-normx(im,j,k))*dxi + 0.5d0*(normy(i,jp,k)-normy(i,jm,k))*dxi + 0.5d0*(normz(i,j,kp)-normz(i,j,km))*dxi
+ curv=0.5d0*(normx_f(ip,j,k)-normx_f(im,j,k))*dxi + 0.5d0*(normy_f(i,jp,k)-normy_f(i,jm,k))*dxi + 0.5d0*(normz_f(i,j,kp)-normz_f(i,j,km))*dxi
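+ ! CSF: curv is div(n_f) by central differences of the (presumably filtered) unit normals
+ ! normx_f/normy_f/normz_f; with kappa = -div(n), the minus sign is carried in the forces below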
!compute capillary forces: sigma*curvature*gradphi
fxst(i,j,k)= -sigma*curv*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi
fyst(i,j,k)= -sigma*curv*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dxi
@@ -999,6 +1417,7 @@ program main
cou=gumax*dt*dxi
if (rank.eq.0) then
write(*,*) "CFL (max among tasks)", cou
+ write(*,*) "umax", gumax
if (cou .gt. 7) stop
if (.not. ieee_is_finite(cou)) then
print *, "Error: cou is infinite or NaN!"
@@ -1068,9 +1487,9 @@ program main
deallocate(tanh_psi, mysin, mycos)
deallocate(rhsu,rhsv,rhsw)
deallocate(rhsu_o,rhsv_o,rhsw_o)
-deallocate(phi,rhsphi,rhsphi_o,normx,normy,normz)
+deallocate(phi,rhsphi,normx,normy,normz)
+deallocate(phi_tmp, rhsphik2, rhsphik3, rhsphik4)
call mpi_finalize(ierr)
-end program main
-
+end program main
\ No newline at end of file
diff --git a/multi/module.f90 b/multi/module.f90
index 8168c1b..1a25def 100755
--- a/multi/module.f90
+++ b/multi/module.f90
@@ -64,10 +64,17 @@ end module velocity
module phase
- double precision, allocatable :: phi(:,:,:), rhsphi(:,:,:), rhsphi_o(:,:,:), psidi(:,:,:), tanh_psi(:,:,:)
+ double precision, allocatable :: phi(:,:,:), rhsphi(:,:,:), psidi(:,:,:), tanh_psi(:,:,:)
double precision, allocatable :: normx(:,:,:), normy(:,:,:), normz(:,:,:)
+ double precision, allocatable :: normx_f(:,:,:), normy_f(:,:,:), normz_f(:,:,:)
double precision :: curv
double precision, allocatable :: fxst(:,:,:), fyst(:,:,:), fzst(:,:,:)
+ double precision, allocatable :: rhsphik1(:,:,:),rhsphik2(:,:,:), rhsphik3(:,:,:), rhsphik4(:,:,:), phi_tmp(:,:,:)
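+ ! RK4 stage storage: phi_tmp holds the provisional field and the rhsphik* arrays the stage
+ ! RHS (main.f90 keeps k1 in rhsphi and uses rhsphik2..rhsphik4)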
+ double precision :: normx_xm,normx_xp,normx_ym,normx_yp,normx_zm,normx_zp
+ double precision :: normy_xm,normy_xp,normy_ym,normy_yp,normy_zm,normy_zp
+ double precision :: normz_xm,normz_xp,normz_ym,normz_yp,normz_zm,normz_zp
+ double precision :: rn_01,rn_11,rn_02,rn_12,rn_03,rn_13
+ double precision :: sharpxm,sharpxp,sharpym,sharpyp,sharpzm,sharpzp
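+ ! per-point scalars for the sharpening kernels: face-averaged normal components (norm*_*),
+ ! normalized face normals (rn_*) and the six face sharpening fluxes (sharp*)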
end module phase