diff --git a/channel/Makefile_leonardo b/channel/Makefile_leonardo
new file mode 100644
index 0000000..ff923ef
--- /dev/null
+++ b/channel/Makefile_leonardo
@@ -0,0 +1,33 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+
+# Paths (Modify if necessary)
+ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))
+$(info $$ROOT_DIR is [${ROOT_DIR}])
+PARENT_DIR := $(abspath $(ROOT_DIR)/..)
+$(info $$PARENT_DIR is [${PARENT_DIR}])
+CUDECOMP_DIR = $(PARENT_DIR)/cuDecomp/build
+CUDA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none
+EXTRA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none/compat
+
+# make the cuDecomp shared library visible to sub-processes at build/run time
+export LD_LIBRARY_PATH := $(CUDECOMP_DIR)/lib:$(LD_LIBRARY_PATH)
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=mem:managed -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include/
+LIBS = -L$(CUDECOMP_DIR)/lib/ -L$(CUDA_DIR)/lib64/ -lcudecomp_fort -lcudecomp -cudalib=cufft -lnvToolsExt
+
+# Source and Object Files
+OBJS = module.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+ $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+ $(RM) $(EXEC)
+
+.SUFFIXES: .f90 .o
+
+.f90.o:
+ $(FC) -c $(FFLAGS) $<
diff --git a/channel/Makefile_local b/channel/Makefile_local
new file mode 100644
index 0000000..6cb5517
--- /dev/null
+++ b/channel/Makefile_local
@@ -0,0 +1,28 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+RM = /bin/rm -f
+
+# Paths (Modify if necessary)
+CUDECOMP_DIR = /home/milton/TCF36/cuDecomp/build/
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=mem:managed -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include
+LIBS = -L$(CUDECOMP_DIR)/lib -L/usr/local/cuda/lib64 -lcudecomp_fort -lcudecomp -cudalib=cufft
+
+# Source and Object Files
+OBJS = module.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+ $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+ $(RM) $(EXEC)
+
+.SUFFIXES: .f90 .o
+
+.f90.o:
+ $(FC) -c $(FFLAGS) $<
+
+
diff --git a/channel/README.md b/channel/README.md
new file mode 100644
index 0000000..e73f5c9
--- /dev/null
+++ b/channel/README.md
@@ -0,0 +1,31 @@
+# Channel configuration
+
+Code for turbulent channel flow simulations (closed channel).
+Solution of the Navier-Stokes equations + phase-field method (ACDI) + energy equation (passive scalar).
+For time integration, a fully explicit scheme is used: explicit Euler for ACDI and RK3 for Navier-Stokes and temperature.
+
+## Main features at a glance:
+- No-slip boundary conditions at the two walls ($z=\pm h$)
+- Poisson solver based on 2D FFTs along $x$ and $y$ + TDMA along $z$ (see the sketch below)
+- Non-uniform grid along $z$; any stretching function can be used (defined in readinput.f90)
+- Domain of arbitrary dimensions along all three directions
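+
+After the 2D FFT along $x$ and $y$, the Poisson equation decouples into one tridiagonal system per wavenumber pair $(k_x,k_y)$, solved with the Thomas algorithm along $z$ (the notation below is illustrative; the exact coefficients are in the TDMA setup of main.f90):
+
+$$\frac{1}{\Delta z_k}\left(\frac{\hat{p}_{k+1}-\hat{p}_k}{\Delta z_{k+1/2}}-\frac{\hat{p}_k-\hat{p}_{k-1}}{\Delta z_{k-1/2}}\right)-(k_x^2+k_y^2)\,\hat{p}_k=\hat{f}_k$$
+
+with Neumann conditions at both walls; the mean mode $k_x=k_y=0$ is pinned at one interior point to remove the null space.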
+
+**Autotuning of decomposition**
+- Default: `pr=0` and `pc=0` → enables **automatic process decomposition**.
+- Only input needed: **total number of MPI tasks**.
+- No recompilation required when changing the number of MPI processes (see the example below).
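+
+For example (hypothetical task counts), only the launch line changes between runs:
+
+```bash
+mpirun -np 4 ./mhit36   # decomposition autotuned at startup for 4 tasks
+mpirun -np 8 ./mhit36   # same binary, no recompilation
+```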
+
+**Optional features (conditional compilation)**
+- Phase-field module: can be enabled or disabled. Default is single-phase (`phiflag 0` in main.f90).
+- Passive scalar (temperature): can be enabled or disabled. Default is enabled (`thetaflag 1` in main.f90).
+
+## Turbulent channel flow
+- Turbulent channel flow at $Re_\tau=590$, Grid: $1536 \times 768 \times 576$
+
+
+
+## Node numbering and staggered grid
+
+X-pencil configuration:
+
+
diff --git a/channel/binder.sh b/channel/binder.sh
new file mode 100755
index 0000000..b449dda
--- /dev/null
+++ b/channel/binder.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
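+# bind each local MPI rank to a dedicated InfiniBand HCA (assumes a node with four NICs, one per GPU)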
+case $(( ${OMPI_COMM_WORLD_LOCAL_RANK} )) in
+0) export UCX_NET_DEVICES=mlx5_0:1 ;;
+1) export UCX_NET_DEVICES=mlx5_1:1 ;;
+2) export UCX_NET_DEVICES=mlx5_2:1 ;;
+3) export UCX_NET_DEVICES=mlx5_3:1 ;;
+esac
+
+echo "Launching on ${UCX_NET_DEVICES}"
+
+# run the wrapped command, preserving argument quoting
+exec "$@"
+
diff --git a/channel/go_leo.sh b/channel/go_leo.sh
new file mode 100644
index 0000000..2424115
--- /dev/null
+++ b/channel/go_leo.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+#SBATCH --account="IscrB_EXCEED"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:05:00
+#SBATCH --nodes=1 ##adjust
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=8
+#SBATCH --output=test.out
+#SBATCH -p boost_usr_prod
+#SBATCH --error=test.err
+#SBATCH --qos=boost_qos_dbg
+
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+chmod +x binder.sh
+mpirun -np 4 --map-by node:PE=8 --rank-by core ./binder.sh ./mhit36
\ No newline at end of file
diff --git a/channel/input.inp b/channel/input.inp
new file mode 100644
index 0000000..7e814cb
--- /dev/null
+++ b/channel/input.inp
@@ -0,0 +1,21 @@
+0 ! fresh start=0 restart =1
+0 ! iteration to start from (fresh start 0 otherwise different)
+200000 ! Final time step
+1000 ! Saving frequency
+4.0 ! Streamwise domain length Lx
+4.0 ! Spanwise domain width Ly
+2.0 ! Wall-normal domain height Lz=2h
+0.01 ! Stretching factor (near zero = almost uniform, 1.5 = moderate, 3.0 = extreme; do not use exactly zero)
+1 ! inflow condition (0=zero, 1=perturbed 3D field, 2=read from fields)
+0 ! phase-field initial condition (0=circular drop, 1=read from fields)
+0 ! theta initial condition (0=uniform, 1=read from fields)
+0.0001 ! dt for the simulation
+0.0066666 ! mu (viscosity)
+1.000 ! density
+0.0 ! Pressure gradient along x (negative if flow along x)
+0.0 ! Pressure gradient along y (negative if flow along y)
+0.5 ! Initial drop radius for phase-field
+0.1d0 ! Surface tension
+1.0 ! Ratio eps/dx
+0.00666666 ! kappa (thermal diffusivity)
+50.0 ! alphag for buoyancy
diff --git a/channel/leo.sh b/channel/leo.sh
new file mode 100644
index 0000000..f3d5f46
--- /dev/null
+++ b/channel/leo.sh
@@ -0,0 +1,8 @@
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+cp Makefile_leonardo Makefile
+make clean
+make
+mkdir -p output
+
diff --git a/channel/local.sh b/channel/local.sh
new file mode 100644
index 0000000..4f27984
--- /dev/null
+++ b/channel/local.sh
@@ -0,0 +1,15 @@
+NVARCH=Linux_x86_64; export NVARCH
+NVCOMPILERS=/opt/nvidia/hpc_sdk; export NVCOMPILERS
+MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/compilers/man; export MANPATH
+PATH=$NVCOMPILERS/$NVARCH/25.7/compilers/bin:$PATH; export PATH
+export PATH=$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/bin:$PATH
+export MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/man
+export LD_LIBRARY_PATH=/home/milton/MHIT36/cuDecomp/build/lib:$LD_LIBRARY_PATH
+# clean the output folder
+rm -rf output
+mkdir output
+cp Makefile_local Makefile
+#rm *.dat
+make clean
+make
+mpirun -np 2 ./mhit36
diff --git a/channel/main.f90 b/channel/main.f90
new file mode 100644
index 0000000..9c4f06f
--- /dev/null
+++ b/channel/main.f90
@@ -0,0 +1,1028 @@
+#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1)
+
+program main
+use cudafor
+use cudecomp
+use cufft
+use mpi
+use velocity
+use phase
+use temperature
+use param
+use mpivar
+use cudecompvar
+use nvtx
+
+
+implicit none
+! timer for scaling test
+real :: t_start, t_end, elapsed
+! grid dimensions
+integer :: comm_backend
+integer :: pr, pc
+! cudecomp
+! cuFFT
+integer :: planXf, planXb, planY
+integer :: batchsize
+integer :: status
+integer :: i,j,k,il,jl,kl,ig,jg,kg,t,stage
+integer :: im,ip,jm,jp,km,kp,last,idx
+! TDMA variables
+double precision, allocatable :: a(:), b(:), c(:)
+double complex, allocatable :: d(:), sol(:)
+! working arrays
+double complex, allocatable :: psi(:)
+double precision, allocatable :: ua(:,:,:)
+double precision, allocatable :: uaa(:,:,:)
+double complex, device, allocatable :: psi_d(:)
+double precision, device, allocatable :: vel_d(:) ! only used for implicit diffusion in z
+double complex, pointer, device, contiguous :: work_d(:), work_halo_d(:), work_d_d2z(:), work_halo_d_d2z(:)
+character(len=40) :: namefile
+character(len=4) :: itcount
+! Code variables
+double precision ::err, maxErr, meanp, gmeanp
+double complex, device, pointer :: psi3d(:,:,:)
+double precision :: k2
+!integer :: il, jl, ig, jg
+integer :: offsets(3), xoff, yoff
+integer :: np(3)
+! Alan Williamson classic
+double precision, parameter :: alpha(3) = (/ 8.d0/15.d0, 5.d0/12.d0, 3.d0/4.d0 /)
+double precision, parameter :: beta(3) = (/ 0.d0, -17.d0/60.d0, -5.d0/12.d0 /)
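+! Low-storage update per stage: f <- f + dt*(alpha(s)*RHS_s + beta(s)*RHS_{s-1});
+! beta(1)=0, so the first substage is an explicit Euler step, and sum(alpha+beta)=1 over the three stages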
+! Stage coefficients for diffusion-optimized SSP RK3
+!real(kind=8), parameter :: alpha(3) = (/ 0.444370493651235d0, 0.555629506348765d0, 1.0d0 /)
+!real(kind=8), parameter :: beta(3) = (/ 0.0d0, -0.122243120495896d0, -0.377756879504104d0 /)
+
+! Enable or disable phase field
+#define phiflag 0
+! Enable or disable temperature field
+#define thetaflag 1
+
+!########################################################################################################################################
+! 1. INITIALIZATION OF MPI AND cuDECOMP AUTOTUNING : START
+!########################################################################################################################################
+! MPI initialization: store the local MPI rank in rank and the total number of tasks in ranks
+! Same procedure as described in the cuDecomp documentation
+call mpi_init(ierr)
+call mpi_comm_rank(MPI_COMM_WORLD, rank, ierr)
+call mpi_comm_size(MPI_COMM_WORLD, ranks, ierr)
+call mpi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, localComm, ierr)
+call mpi_comm_rank(localComm, localRank, ierr)
+ierr = cudaSetDevice(localRank) !assign GPU to MPI rank
+
+! Define grid and decomposition
+call readinput
+
+! domain decomposition (pencils in y and z)
+pr = 0
+pc = 0
+halo_ext=1
+
+! CuDECOMP initialization and settings
+comm_backend = 0 ! Enable full autotuning
+CHECK_CUDECOMP_EXIT(cudecompInit(handle, MPI_COMM_WORLD))
+! config is a struct and pr and pc are the number of pencils along the two directions
+! create an uninitialized configuration struct and initialize it to defaults using cudecompGridDescConfigSetDefaults.
+CHECK_CUDECOMP_EXIT(cudecompGridDescConfigSetDefaults(config)) ! Initializing to default values is required to ensure no entries are left uninitialized.
+pdims = [pr, pc] ! pr and pc are the numbers of pencils along the two decomposed directions
+config%pdims = pdims
+halo = [0, halo_ext, halo_ext] ! no halo needed along x: each x-pencil holds the full x extent, so periodicity is handled locally
+! for transpositions
+config%transpose_comm_backend = comm_backend
+config%transpose_axis_contiguous = .true.
+! for halo exchanges
+config%halo_comm_backend = CUDECOMP_HALO_COMM_MPI
+! Halo periodicity: periodic along x and y, wall-bounded along z (not required to be in config)
+halo_periods = [.true., .true., .false.]
+! create spectral grid descriptor first to select pdims for optimal transposes
+gdims = [nx/2+1, ny, nz]
+config%gdims = gdims
+! Set up autotuning options for spectral grid (transpose related settings)
+CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
+options%dtype = CUDECOMP_DOUBLE_COMPLEX
+if (comm_backend == 0) then
+ options%autotune_transpose_backend = .true.
+ options%autotune_halo_backend = .false.
+endif
+options%transpose_use_inplace_buffers = .true.
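+! keep halo storage in the x-pencil buffers at both ends of the transpose chain (input of X->Y, output of Y->X)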
+options%transpose_input_halo_extents(:, 1) = halo
+options%transpose_output_halo_extents(:, 4) = halo
+CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_descD2Z, config, options))
+! create physical grid descriptor
+! take previous config and modify the global grid (nx instead of nx/2+1)
+! reset transpose_comm_backend to default value to avoid picking up possible nvshmem
+! transpose backend selection (this impacts how workspaces are allocated)
+gdims = [nx, ny, nz]
+config%gdims = gdims
+config%transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P
+! Set up autotuning options for physical grid (halo related settings)
+CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
+options%dtype = CUDECOMP_DOUBLE_COMPLEX
+if (comm_backend == 0) then
+ options%autotune_halo_backend = .true.
+endif
+options%halo_extents(:) = halo
+options%halo_periods(:) = halo_periods
+options%halo_axis = 1
+CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_desc, config, options))
+! Get pencil info for the grid descriptor in the physical space pencil struct (piX, piY or piZ)
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piX, 1, halo))
+nElemX = piX%size !<- total number of elements in x-configuration (including halos)
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piY, 2))
+nElemY = piY%size
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piZ, 3))
+nElemZ = piZ%size
+! Get workspace sizes: transpose (first call, not used for this descriptor) and halo (second call, used)
+CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_desc, nElemWork))
+CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_desc, 1, halo, nElemWork_halo))
+! Get pencil info for the grid descriptor in the complex space
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piX_d2z, 1,halo))
+nElemX_d2z = piX_d2z%size !<- total number of elements in x-configuration (including halos)
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piY_d2z, 2))
+nElemY_d2z = piY_d2z%size
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piZ_d2z, 3))
+nElemZ_d2z = piZ_d2z%size
+! Get workspace sizes: transpose (first call, used) and halo (second call, not used for this descriptor)
+CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_descD2Z, nElemWork_d2z))
+CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_descD2Z, 1, halo, nElemWork_halo_d2z))
+! End cuDecomp initialization
+
+
+! CUFFT initialization -- create plans (along x and y only; no transform is needed along z)
+! Forward 1D FFT in X: D2Z
+batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFT (from x-pencil dimension)
+status = cufftPlan1D(planXf, nx, CUFFT_D2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan Forward'
+! Backward 1D FFT in X: Z2D
+batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFT (from x-pencil dimension)
+status = cufftPlan1D(planXb, nx, CUFFT_Z2D, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan Backward'
+! shape indices 2 and 3 are used because y-pencils are ordered (y,z,x)
+batchSize = piY_d2z%shape(2)*piY_d2z%shape(3)
+status = cufftPlan1D(planY, ny, CUFFT_Z2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Y plan Forward & Backward'
+!########################################################################################################################################
+! 1. INITIALIZATION AND cuDECOMP AUTOTUNING : END
+!########################################################################################################################################
+
+
+
+
+
+!########################################################################################################################################
+! START STEP 2: ALLOCATE ARRAYS
+!########################################################################################################################################
+! Pressure variable
+allocate(rhsp(piX%shape(1), piX%shape(2), piX%shape(3)))
+allocate(p(piX%shape(1), piX%shape(2), piX%shape(3)))
+allocate(psi_d(max(nElemX_d2z, nElemY_d2z, nElemZ_d2z)))
+!NS variables
+allocate(u(piX%shape(1),piX%shape(2),piX%shape(3)),v(piX%shape(1),piX%shape(2),piX%shape(3)),w(piX%shape(1),piX%shape(2),piX%shape(3))) !velocity vector
+! allocate(ustar(piX%shape(1),piX%shape(2),piX%shape(3)),vstar(piX%shape(1),piX%shape(2),piX%shape(3)),wstar(piX%shape(1),piX%shape(2),piX%shape(3))) ! provisional velocity field
+allocate(rhsu(piX%shape(1),piX%shape(2),piX%shape(3)),rhsv(piX%shape(1),piX%shape(2),piX%shape(3)),rhsw(piX%shape(1),piX%shape(2),piX%shape(3))) ! right hand side u,v,w
+allocate(rhsu_o(piX%shape(1),piX%shape(2),piX%shape(3)),rhsv_o(piX%shape(1),piX%shape(2),piX%shape(3)),rhsw_o(piX%shape(1),piX%shape(2),piX%shape(3))) ! right hand side u,v,w
+!allocate(div(piX%shape(1),piX%shape(2),piX%shape(3))) (debug only)
+!TDMA solver
+allocate(a(0:nz+1),b(0:nz+1),c(0:nz+1),d(0:nz+1),sol(0:nz+1))
+!PFM variables
+#if phiflag == 1
+allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces
+#endif
+!Temperature variables
+#if thetaflag == 1
+allocate(theta(piX%shape(1),piX%shape(2),piX%shape(3)),rhstheta(piX%shape(1),piX%shape(2),piX%shape(3)))
+allocate(rhstheta_o(piX%shape(1),piX%shape(2),piX%shape(3)))
+#endif
+! allocate arrays for transpositions and halo exchanges
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_d, nElemWork))
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_halo_d, nElemWork_halo))
+! allocate arrays for transpositions
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_descD2Z, work_d_d2z, nElemWork_d2z))
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_descD2Z, work_halo_d_d2z, nElemWork_halo_d2z)) ! not required
+!########################################################################################################################################
+! END STEP 2: ALLOCATE ARRAYS
+!########################################################################################################################################
+
+
+
+
+
+
+
+
+!########################################################################################################################################
+! START STEP 3: FLOW FIELD, PHASE-FIELD AND TEMPERATURE INIT
+!########################################################################################################################################
+! 3.1 Read/initialize from data without halo grid points (avoids out-of-bounds accesses when reading using MPI I/O)
+! 3.2 Call halo exchanges along Y and Z for u, v, w, phi and theta
+if (restart .eq. 0) then !fresh start: zero field, perturbed laminar profile, or read from the init folder
+if (rank.eq.0) write(*,*) "Initialize velocity field (fresh start)"
+ if (inflow .eq. 0) then
+ if (rank.eq.0) write(*,*) "Initialize zero velocity field"
+ do k = 1+halo_ext, piX%shape(3)-halo_ext
+ do j = 1+halo_ext, piX%shape(2)-halo_ext
+ do i = 1, piX%shape(1)
+ u(i,j,k) = 0.0d0
+ v(i,j,k) = 0.0d0
+ w(i,j,k) = 0.0d0
+ enddo
+ enddo
+ enddo
+ endif
+ if (inflow .eq. 1) then
+ if (rank.eq.0) write(*,*) "Initialize laminar flow (x) + 3D perturbation"
+ do k = 1+halo_ext, piX%shape(3)-halo_ext
+ kg = piX%lo(3) + k - 1 - halo_ext
+ do j = 1+halo_ext, piX%shape(2)-halo_ext
+ jg = piX%lo(2) + j - 1 - halo_ext
+ do i = 1, piX%shape(1)
+ amp=3.d0
+ mx=3.03d0
+ my=2.02d0
+ mz=4.d0
+ !3D divergence free flow with fluctuations that satisfies the boundary conditions
+ u(i,j,k) = 1.d0*(1.d0 - ((2*z(kg) - lz)/lz)**2) !
+ u(i,j,k) = u(i,j,k) - amp*cos(twopi*mx*x(i)/lx)*sin(twopi*my*y(jg)/ly)*2.d0*twopi/lz*sin(twopi*z(kg)/lz)*cos(twopi*z(kg)/lz)
+ u(i,j,k) = u(i,j,k) + amp*sin(twopi*mx*x(i)/lx)*(-twopi*my/ly)*sin(2.d0*twopi*my*y(jg)/ly)*sin(twopi*z(kg)/lz)*sin(twopi*z(kg)/lz)
+ v(i,j,k) = -amp*cos(twopi*my*y(jg)/ly)*(twopi*mx/lx)*cos(twopi*mx*x(i)/lx)*sin(twopi*z(kg)/lz)*sin(twopi*z(kg)/lz)
+ w(i,j,k) = amp*cos(twopi*mx*x(i)/lx)*(twopi*mx/lx)*sin(twopi*my*y(jg)/ly)*sin(twopi*z(kg)/lz)*sin(twopi*z(kg)/lz)
+ enddo
+ enddo
+ enddo
+ endif
+ if (inflow .eq. 2) then
+ if (rank.eq.0) write(*,*) "Initialize from data"
+ call readfield(1)
+ call readfield(2)
+ call readfield(3)
+ endif
+endif
+if (restart .eq. 1) then !restart, ignore inflow and read the tstart field
+ if (rank.eq.0) write(*,*) "Initialize velocity field (from output folder), iteration:", tstart
+ call readfield_restart(tstart,1)
+ call readfield_restart(tstart,2)
+ call readfield_restart(tstart,3)
+endif
+
+! update halo cells along the y and z directions (strictly needed only when pr and pc are greater than one)
+!$acc host_data use_device(u)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+!$acc host_data use_device(v)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+!$acc host_data use_device(w)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+
+! initialize phase-field
+#if phiflag == 1
+if (restart .eq. 0) then
+if (rank.eq.0) write(*,*) 'Initialize phase field (fresh start)'
+ if (inphi .eq. 0) then
+ if (rank.eq.0) write(*,*) 'Spherical drop'
+ do k = 1+halo_ext, piX%shape(3)-halo_ext
+ kg = piX%lo(3) + k - 1 - halo_ext
+ do j = 1+halo_ext, piX%shape(2)-halo_ext
+ jg = piX%lo(2) + j - 1 - halo_ext
+ do i = 1, piX%shape(1)
+ pos=(x(i)-lx/2)**2d0 + (y(jg)-ly/2)**2d0 + (z(kg)-lz/2)**2d0
+ phi(i,j,k) = 0.5d0*(1.d0-tanh((sqrt(pos)-radius)/2/eps))
+ enddo
+ enddo
+ enddo
+ endif
+ if (inphi .eq. 1) then
+ if (rank.eq.0) write(*,*) "Initialize phase-field from data"
+ call readfield(5)
+ endif
+endif
+if (restart .eq. 1) then
+ write(*,*) "Initialize phase-field (restart, from output folder), iteration:", tstart
+ call readfield_restart(tstart,5)
+endif
+! update halo
+!$acc host_data use_device(phi)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+#endif
+
+! initialize temperature field
+#if thetaflag == 1
+if (restart .eq. 0) then
+if (rank.eq.0) write(*,*) 'Initialize temperature field (fresh start)'
+ if (intheta .eq. 0) then
+ if (rank.eq.0) write(*,*) 'Uniform temperature field'
+ do k = 1+halo_ext, piX%shape(3)-halo_ext
+ kg = piX%lo(3) + k - 1 - halo_ext
+ do j = 1+halo_ext, piX%shape(2)-halo_ext
+ do i = 1, piX%shape(1)
+ theta(i,j,k) = 0.d0 ! uniform (zero) temperature; alternative: 1.d0 - z(kg) for a linear profile
+ enddo
+ enddo
+ enddo
+ endif
+ if (intheta .eq. 1) then
+ if (rank.eq.0) write(*,*) "Initialize temperature from data"
+ call readfield(6)
+ endif
+endif
+if (restart .eq. 1) then
+ write(*,*) "Initialize temperature (restart, from output folder), iteration:", tstart
+ call readfield_restart(tstart,6)
+endif
+! update halo
+!$acc host_data use_device(theta)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, theta, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, theta, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+#endif
+
+!Save initial fields (only if a fresh start)
+if (restart .eq. 0) then
+ if (rank.eq.0) write(*,*) "Save initial fields"
+ call writefield(tstart,1)
+ call writefield(tstart,2)
+ call writefield(tstart,3)
+ call writefield(tstart,4)
+ #if phiflag == 1
+ call writefield(tstart,5)
+ #endif
+ #if thetaflag == 1
+ call writefield(tstart,6) ! temperature
+ #endif
+endif
+!########################################################################################################################################
+! END STEP 3: FLOW FIELD, PHASE-FIELD AND TEMPERATURE INIT
+!########################################################################################################################################
+
+
+
+
+
+
+! ########################################################################################################################################
+! START TEMPORAL LOOP: STEP 4 to 9 REPEATED AT EVERY TIME STEP
+! ########################################################################################################################################
+! First step uses explicit Euler (beta(1)=0, so the first RK3 substage reduces to an Euler step)
+gumax=1.d0
+tstart=tstart+1
+gamma=1.d0*gumax ! ACDI interface-velocity scale, rescaled at every step with the maximum velocity
+!$acc data copyin(piX)
+#if thetaflag == 1
+!$acc data create(rhsu_o, rhsv_o, rhsw_o, rhstheta_o)
+#else
+!$acc data create(rhsu_o, rhsv_o, rhsw_o)
+#endif
+!$acc data copyin(mysin, mycos)
+call cpu_time(t_start)
+! Start temporal loop
+do t=tstart,tfin
+ ! Create custom label for each marker
+ write(itcount,'(i4)') t
+ ! Range with custom color
+ call nvtxStartRange("Iteration "//itcount,t)
+
+ if (rank.eq.0) write(*,*) "Time step",t,"of",tfin
+ call cpu_time(times)
+
+ call nvtxStartRange("Phase-field")
+ !########################################################################################################################################
+ ! START STEP 4: PHASE-FIELD SOLVER (EXPLICIT)
+ !########################################################################################################################################
+ #if phiflag == 1
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+ val = min(phi(i,j,k),1.0d0) ! avoid machine-precision overshoots in phi that lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+ ! compute here the tanh of distance function psi (used in the sharpening term) to avoid multiple computations of tanh
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ gamma=1.d0*gumax
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ kg=piX%lo(3) + k - 1 - halo_ext
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! convective (first three lines) and diffusive (last three lines)
+ rhsphi(i,j,k) = &
+ - (u(ip,j,k)*0.5d0*(phi(ip,j,k)+phi(i,j,k)) - u(i,j,k)*0.5d0*(phi(i,j,k)+phi(im,j,k)))*dxi &
+ - (v(i,jp,k)*0.5d0*(phi(i,jp,k)+phi(i,j,k)) - v(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,jm,k)))*dyi &
+ - (w(i,j,kp)*0.5d0*(phi(i,j,kp)+phi(i,j,k)) - w(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,j,km)))*dzci(kg) &
+ + gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + &
+ eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddyi + &
+ eps*((phi(i,j,kp)-phi(i,j,k))*dzi(kg+1) - (phi(i,j,k) -phi(i,j,km))*dzi(kg))*dzci(kg)) ! first between centers and then between faces
+ ! Compute normals for sharpening term (gradient)
+ normx(i,j,k) = 0.5d0*(psidi(ip,j,k) - psidi(im,j,k))*dxi
+ normy(i,j,k) = 0.5d0*(psidi(i,jp,k) - psidi(i,jm,k))*dyi
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))/(z(kg+1)-z(kg-1))
+ enddo
+ enddo
+ enddo
+
+ ! Update normx,normy and normz halos, required to then compute normal derivative
+ !$acc host_data use_device(normx,normy,normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! 4.1.3. Compute Sharpening term (gradient)
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ normod = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + 1.0E-16)
+ ! normod = 1.d0/(sqrt(normx(i,j,k)**2d0 + normy(i,j,k)**2d0 + normz(i,j,k)**2d0) + 1.0E-16)
+ normx(i,j,k) = normx(i,j,k)*normod
+ normy(i,j,k) = normy(i,j,k)*normod
+ normz(i,j,k) = normz(i,j,k)*normod
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Compute sharpening term
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ kg=piX%lo(3) + k - 1 - halo_ext
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-tanh_psi(ip,j,k)*tanh_psi(ip,j,k))*normx(ip,j,k) - &
+ 0.25d0*(1.d0-tanh_psi(im,j,k)*tanh_psi(im,j,k))*normx(im,j,k))*0.5*dxi + &
+ (0.25d0*(1.d0-tanh_psi(i,jp,k)*tanh_psi(i,jp,k))*normy(i,jp,k) - &
+ 0.25d0*(1.d0-tanh_psi(i,jm,k)*tanh_psi(i,jm,k))*normy(i,jm,k))*0.5*dyi + &
+ (0.25d0*(1.d0-tanh_psi(i,j,kp)*tanh_psi(i,j,kp))*normz(i,j,kp) - &
+ 0.25d0*(1.d0-tanh_psi(i,j,km)*tanh_psi(i,j,km))*normz(i,j,km))/(z(kg+1)-z(kg-1)))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Get phi at n+1
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ phi(i,j,k) = phi(i,j,k) + dt*rhsphi(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ !$acc host_data use_device(phi)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ #endif
+ !########################################################################################################################################
+ ! END STEP 4: PHASE-FIELD SOLVER (EXPLICIT)
+ !########################################################################################################################################
+ call nvtxEndRange
+
+
+ !########################################################################################################################################
+ ! START STEP 5: ENERGY EQUATION SOLVER (RK3 EXPLICIT)
+ !########################################################################################################################################
+ #if thetaflag == 1
+ ! Temperature solver (RK3 explicit)
+ do stage = 1,3
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ kg=piX%lo(3) + k - 1 - halo_ext
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! convective terms
+ rhstheta(i,j,k) = &
+ - (u(ip,j,k)*0.5d0*(theta(ip,j,k)+theta(i,j,k)) - u(i,j,k)*0.5d0*(theta(i,j,k)+theta(im,j,k)))*dxi &
+ - (v(i,jp,k)*0.5d0*(theta(i,jp,k)+theta(i,j,k)) - v(i,j,k)*0.5d0*(theta(i,j,k)+theta(i,jm,k)))*dyi &
+ - (w(i,j,kp)*0.5d0*(theta(i,j,kp)+theta(i,j,k)) - w(i,j,k)*0.5d0*(theta(i,j,k)+theta(i,j,km)))*dzci(kg)
+ ! diffusive terms
+ rhstheta(i,j,k) = rhstheta(i,j,k) + kappa*((theta(ip,j,k)-2.d0*theta(i,j,k)+theta(im,j,k))*ddxi + &
+ (theta(i,jp,k)-2.d0*theta(i,j,k)+theta(i,jm,k))*ddyi + &
+ ((theta(i,j,kp)-theta(i,j,k))*dzi(kg+1) - (theta(i,j,k) -theta(i,j,km))*dzi(kg))*dzci(kg))
+
+ enddo
+ enddo
+ enddo
+ !$acc parallel loop collapse(3)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ theta(i,j,k) = theta(i,j,k) + dt*alpha(stage)*rhstheta(i,j,k) + dt*beta(stage)*rhstheta_o(i,j,k)
+ rhstheta_o(i,j,k)=rhstheta(i,j,k)
+ enddo
+ enddo
+ enddo
+ ! update halos
+ !$acc host_data use_device(theta)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, theta, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, theta, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ ! impose temperature boundary conditions, interpolated as done for the velocity (see node sketch)
+ !$acc parallel loop collapse(3)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ kg = piX%lo(3) + k - 1 - halo_ext
+ if (kg .eq. 1) theta(i,j,k-1) = 2.d0*( 1.d0) - theta(i,j,k) ! mean value between kg and kg-1 (bottom wall) equal to 1
+ if (kg .eq. nz) theta(i,j,k+1) = 2.d0*(-1.d0) - theta(i,j,k) ! mean value between kg and kg+1 (top wall) equal to -1
+ enddo
+ enddo
+ enddo
+ enddo
+ #endif
+ !########################################################################################################################################
+ ! END STEP 5: ENERGY EQUATION SOLVER (RK3 EXPLICIT)
+ !########################################################################################################################################
+
+
+
+
+ call nvtxStartRange("Projection")
+ !########################################################################################################################################
+ ! START STEP 6: USTAR COMPUTATION (PROJECTION STEP)
+ !########################################################################################################################################
+ ! 6.1 compute rhs
+ ! 6.2 obtain ustar and store old rhs in rhs_o
+ ! 6.3 Call halo exchanges along Y and Z for u,v,w
+
+ ! Projection step
+ do stage = 1,3
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ kg=piX%lo(3) + k - 1 - halo_ext
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! compute the products (conservative form)
+ h11 = 0.25d0*((u(ip,j,k)+u(i,j,k))*(u(ip,j,k)+u(i,j,k)) - (u(i,j,k)+u(im,j,k))*(u(i,j,k)+u(im,j,k)))*dxi
+ h12 = 0.25d0*((u(i,jp,k)+u(i,j,k))*(v(i,jp,k)+v(im,jp,k)) - (u(i,j,k)+u(i,jm,k))*(v(i,j,k)+v(im,j,k)))*dyi
+ h13 = 0.25d0*((u(i,j,kp)+u(i,j,k))*(w(i,j,kp)+w(im,j,kp)) - (u(i,j,k)+u(i,j,km))*(w(i,j,k)+w(im,j,k)))*dzci(kg) ! divide by cell height
+ h21 = 0.25d0*((u(ip,j,k)+u(ip,jm,k))*(v(ip,j,k)+v(i,j,k)) - (u(i,j,k)+u(i,jm,k))*(v(i,j,k)+v(im,j,k)))*dxi
+ h22 = 0.25d0*((v(i,jp,k)+v(i,j,k))*(v(i,jp,k)+v(i,j,k)) - (v(i,j,k)+v(i,jm,k))*(v(i,j,k)+v(i,jm,k)))*dyi
+ h23 = 0.25d0*((w(i,j,kp)+w(i,jm,kp))*(v(i,j,kp)+v(i,j,k)) - (w(i,j,k)+w(i,jm,k))*(v(i,j,k)+v(i,j,km)))*dzci(kg) ! divide by cell height
+ h31 = 0.25d0*((w(ip,j,k)+w(i,j,k))*(u(ip,j,k)+u(ip,j,km)) - (w(i,j,k)+w(im,j,k))*(u(i,j,k)+u(i,j,km)))*dxi
+ h32 = 0.25d0*((v(i,jp,k)+v(i,jp,km))*(w(i,jp,k)+w(i,j,k)) - (v(i,j,k)+v(i,j,km))*(w(i,j,k)+w(i,jm,k)))*dyi
+ h33 = 0.25d0*((w(i,j,kp)+w(i,j,k))*(w(i,j,kp)+w(i,j,k)) - (w(i,j,k)+w(i,j,km))*(w(i,j,k)+w(i,j,km)))*dzi(kg) ! divide by distance between centers
+ rhsu(i,j,k)=-(h11+h12+h13)
+ rhsv(i,j,k)=-(h21+h22+h23)
+ rhsw(i,j,k)=-(h31+h32+h33)
+ ! viscous/diffusive terms
+ h11 = mu*(u(ip,j,k)-2.d0*u(i,j,k)+u(im,j,k))*ddxi
+ h12 = mu*(u(i,jp,k)-2.d0*u(i,j,k)+u(i,jm,k))*ddyi
+ h13 = mu*((u(i,j,kp)-u(i,j,k))*dzi(kg+1)-(u(i,j,k)-u(i,j,km))*dzi(kg))*dzci(kg)
+ h21 = mu*(v(ip,j,k)-2.d0*v(i,j,k)+v(im,j,k))*ddxi
+ h22 = mu*(v(i,jp,k)-2.d0*v(i,j,k)+v(i,jm,k))*ddyi
+ h23 = mu*((v(i,j,kp)-v(i,j,k))*dzi(kg+1)-(v(i,j,k)-v(i,j,km))*dzi(kg))*dzci(kg)
+ h31 = mu*(w(ip,j,k)-2.d0*w(i,j,k)+w(im,j,k))*ddxi
+ h32 = mu*(w(i,jp,k)-2.d0*w(i,j,k)+w(i,jm,k))*ddyi
+ h33 = mu*((w(i,j,kp)-w(i,j,k))*dzci(kg+1)-(w(i,j,k)-w(i,j,km))*dzci(kg))*dzi(kg) ! face to face and then center to center
+ rhsu(i,j,k)=rhsu(i,j,k)+(h11+h12+h13)*rhoi
+ rhsv(i,j,k)=rhsv(i,j,k)+(h21+h22+h23)*rhoi
+ rhsw(i,j,k)=rhsw(i,j,k)+(h31+h32+h33)*rhoi
+ ! Pressure driven
+ rhsu(i,j,k)=rhsu(i,j,k) - gradpx
+ rhsv(i,j,k)=rhsv(i,j,k) - gradpy
+ enddo
+ enddo
+ enddo
+
+ ! Surface tension forces (Obtain surface tension forces evaluated at the center of the cell (same as where phi is located)
+ #if phiflag == 1
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ kg=piX%lo(3) + k - 1 - halo_ext
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ curv=0.5d0*(normx(ip,j,k)-normx(im,j,k))*dxi + 0.5d0*(normy(i,jp,k)-normy(i,jm,k))*dyi + (normz(i,j,kp)-normz(i,j,km))/(z(kg+1)-z(kg-1))
+ fxst(i,j,k)= -sigma*curv*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi
+ fyst(i,j,k)= -sigma*curv*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dyi
+ fzst(i,j,k)= -sigma*curv*(phi(i,j,kp)-phi(i,j,km))/(z(kg+1)-z(kg-1))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Update halo of fxst, fyst and fzst (required then to interpolate at velocity points)
+ !$acc host_data use_device(fxst,fyst,fzst)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Interpolate force at velocity points
+ !$acc parallel loop collapse(3)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (im .lt. 1) im=nx
+ #if thetaflag == 0
+ ! Add surface tension force only (ACDI only)
+ rhsu(i,j,k)=rhsu(i,j,k) + 0.5d0*(fxst(im,j,k)+fxst(i,j,k))*rhoi
+ rhsv(i,j,k)=rhsv(i,j,k) + 0.5d0*(fyst(i,jm,k)+fyst(i,j,k))*rhoi
+ rhsw(i,j,k)=rhsw(i,j,k) + 0.5d0*(fzst(i,j,km)+fzst(i,j,k))*rhoi
+ #elif thetaflag == 1
+ ! Add here also buoyancy force if temperature is active (case ACDI + temperature)
+ rhsw(i,j,k)=rhsw(i,j,k) + alphag*0.5d0*(theta(i,j,km)+theta(i,j,k))
+ #endif
+ u(i,j,k) = u(i,j,k) + dt*alpha(stage)*rhsu(i,j,k) + dt*beta(stage)*rhsu_o(i,j,k)! -dt*(alpha(stage)+beta(stage))*rho*(p(i,j,k)-p(im,j,k))*dxi
+ v(i,j,k) = v(i,j,k) + dt*alpha(stage)*rhsv(i,j,k) + dt*beta(stage)*rhsv_o(i,j,k)! -dt*(alpha(stage)+beta(stage))*rho*(p(i,j,k)-p(i,jm,k))*dyi
+ w(i,j,k) = w(i,j,k) + dt*alpha(stage)*rhsw(i,j,k) + dt*beta(stage)*rhsw_o(i,j,k)! -dt*(alpha(stage)+beta(stage))*rho*(p(i,j,k)-p(i,j,km))*dzi
+ rhsu_o(i,j,k)=rhsu(i,j,k)
+ rhsv_o(i,j,k)=rhsv(i,j,k)
+ rhsw_o(i,j,k)=rhsw(i,j,k)
+ enddo
+ enddo
+ enddo
+ #else
+ ! 6.2 find u, v and w star (RK3), single-phase case
+ !$acc parallel loop collapse(3)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! Add here also buoyancy force if temperature is active (case NS + temperature)
+ #if thetaflag == 1
+ km=k-1
+ rhsw(i,j,k)=rhsw(i,j,k) + alphag*0.5d0*(theta(i,j,km)+theta(i,j,k))
+ #endif
+ u(i,j,k) = u(i,j,k) + dt*alpha(stage)*rhsu(i,j,k) + dt*beta(stage)*rhsu_o(i,j,k)
+ v(i,j,k) = v(i,j,k) + dt*alpha(stage)*rhsv(i,j,k) + dt*beta(stage)*rhsv_o(i,j,k)
+ w(i,j,k) = w(i,j,k) + dt*alpha(stage)*rhsw(i,j,k) + dt*beta(stage)*rhsw_o(i,j,k)
+ rhsu_o(i,j,k)=rhsu(i,j,k)
+ rhsv_o(i,j,k)=rhsv(i,j,k)
+ rhsw_o(i,j,k)=rhsw(i,j,k)
+ enddo
+ enddo
+ enddo
+ #endif
+
+ ! 6.3 update halos (y and z directions), required to then compute the RHS of the Poisson equation on the staggered grid
+ !$acc host_data use_device(u,v,w)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! impose velocity boundary conditions: w lies exactly on the wall, while u and v are set so that their average with the ghost value is zero (no-slip assumed, i.e. u=0; can be extended to any wall velocity)
+ ! even with grid stretching, the wall face sits midway between the two adjacent cell centers, so the linear interpolation remains valid
+ !$acc parallel loop collapse(3)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ kg = piX%lo(3) + k - 1 -halo_ext
+ ! bottom wall
+ if (kg .eq. 1) u(i,j,k-1)= -u(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
+ if (kg .eq. 1) v(i,j,k-1)= -v(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
+ if (kg .eq. 1) w(i,j,k)=0.d0 ! w point is at the wall
+ ! top wall
+ if (kg .eq. nz) u(i,j,k+1)= -u(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
+ if (kg .eq. nz) v(i,j,k+1)= -v(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
+ if (kg .eq. nz+1) w(i,j,k)=0.d0 ! w point (nz+1) is at the wall
+ enddo
+ enddo
+ enddo
+ enddo
+ !########################################################################################################################################
+ ! END STEP 6: USTAR COMPUTATION
+ !########################################################################################################################################
+ call nvtxEndRange
+
+
+ call nvtxStartRange("Poisson")
+ !########################################################################################################################################
+ ! START STEP 7: POISSON SOLVER FOR PRESSURE
+ !########################################################################################################################################
+ ! initialize rhs and analytical solution
+ ! 7.1 Compute the RHS of the Poisson equation, div(ustar), evaluated at the cell center
+ ! halo updates were already performed, so at the pencil borders the starred velocities are available from the halos
+ call nvtxStartRange("compute RHS")
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ kg = piX%lo(3) + k - 1 - halo_ext
+ if (ip > nx) ip=1
+ rhsp(i,j,k) = (rho*dxi/dt)*(u(ip,j,k)-u(i,j,k))
+ rhsp(i,j,k) = rhsp(i,j,k) + (rho*dyi/dt)*(v(i,jp,k)-v(i,j,k))
+ rhsp(i,j,k) = rhsp(i,j,k) + (rho*dzci(kg)/dt)*(w(i,j,kp)-w(i,j,k))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ call nvtxEndRange
+
+ call nvtxStartRange("FFT forward w/ transpositions")
+ !$acc host_data use_device(rhsp)
+ status = cufftExecD2Z(planXf, rhsp, psi_d)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'X forward error: ', status
+ !$acc end host_data
+ ! psi(kx,y,z) -> psi(y,z,kx)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,piX_d2z%halo_extents, [0,0,0]))
+ ! psi(y,z,kx) -> psi(ky,z,kx)
+ status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_FORWARD)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'Y forward error: ', status
+ ! psi(ky,z,kx) -> psi(z,kx,ky)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX))
+
+ call nvtxEndRange
+ np(piZ_d2z%order(1)) = piZ_d2z%shape(1)
+ np(piZ_d2z%order(2)) = piZ_d2z%shape(2)
+ np(piZ_d2z%order(3)) = piZ_d2z%shape(3)
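+ ! view the flat device buffer psi_d as a 3D array with the z-pencil shape (memory order z,x,y)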
+ call c_f_pointer(c_devloc(psi_d), psi3d, piZ_d2z%shape)
+ offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1
+ offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1
+ offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1
+
+ xoff = offsets(1)
+ yoff = offsets(2)
+ npx = np(1)
+ npy = np(2)
+ call nvtxStartRange("Solution")
+ !$acc parallel loop collapse(2) gang private(a,b,c,d,factor)
+ do jl = 1, npy
+ do il = 1, npx
+ ! compute index global wavenumber ig and jg
+ jg = yoff + jl
+ ig = xoff + il
+ ! Set up tridiagonal system for each i and j
+ ! Fill diagonals and rhs for each
+ ! 0 and nz+1 are the ghost nodes
+ do k = 1, nz
+ a(k) = 2.0d0*(dzi(k)**2*dzi(k+1))/(dzi(k)+dzi(k+1))
+ c(k) = 2.0d0*(dzi(k)*dzi(k+1)**2)/(dzi(k)+dzi(k+1))
+ b(k) = -a(k) - c(k) - (kx_d(ig)**2 + ky_d(jg)**2)
+ d(k) = psi3d(k,il,jl)
+ enddo
+ ! Neumann BC at bottom
+ a(0) = 0.0d0
+ b(0) = -1.d0*dzi(1)*dzi(1) - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg)
+ c(0) = 1.d0*dzi(1)*dzi(1)
+ d(0) = 0.0d0
+ ! Neumann BC at top
+ a(nz+1) = 1.0d0*dzi(nz+1)*dzi(nz+1)
+ b(nz+1) = -1.0d0*dzi(nz+1)*dzi(nz+1) - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg)
+ c(nz+1) = 0.0d0
+ d(nz+1) = 0.0d0
+ ! Pin the pressure at one interior point for the mean mode (kx=ky=0): this removes the
+ ! null space of the Neumann problem without interfering with the boundary conditions
+ if (ig == 1 .and. jg == 1) then
+ a(1) = 0.d0
+ b(1) = 1.d0
+ c(1) = 0.d0
+ d(1) = 0.d0
+ end if
+ ! Forward elimination (Thomas)
+ !$acc loop seq
+ do k = 1, nz+1
+ factor = a(k)/b(k-1)
+ b(k) = b(k) - factor*c(k-1)
+ d(k) = d(k) - factor*d(k-1)
+ end do
+ ! Back substitution
+ psi3d(nz+1,il,jl) = d(nz+1)/b(nz+1)
+ ! TODO: add a check on the pivot as done in FluTAS?
+ !$acc loop seq
+ do k = nz, 1, -1
+ psi3d(k,il,jl) = (d(k) - c(k)*psi3d(k+1,il,jl))/b(k)
+ end do
+ end do
+ end do
+ call nvtxEndRange ! close the "Solution" range
+
+ call nvtxStartRange("FFT backwards along x and y w/ transpositions")
+ ! psi(z,kx,ky) -> psi(ky,z,kx)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX))
+ ! psi(ky,z,kx) -> psi(y,z,kx)
+ status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_INVERSE)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'Y inverse error: ', status
+ ! psi(y,z,kx) -> psi(kx,y,z)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,[0,0,0], piX_d2z%halo_extents))
+ !$acc host_data use_device(p)
+ ! psi(kx,y,z) -> p(x,y,z)
+ status = cufftExecZ2D(planXb, psi_d, p)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'X inverse error: ', status
+ !$acc end host_data
+
+ ! normalize pressure (must be done here, not in the TDMA)
+ !$acc parallel loop collapse(3)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ p(i,j,k) = p(i,j,k)/nx/ny
+ end do
+ end do
+ end do
+
+ call nvtxEndRange
+ ! update halo nodes with pressure
+ ! Update X-pencil halos
+ !$acc host_data use_device(p)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !########################################################################################################################################
+ ! END STEP 7: POISSON SOLVER FOR PRESSURE
+ !########################################################################################################################################
+ call nvtxEndRange
+
+
+ call nvtxStartRange("Correction")
+ !########################################################################################################################################
+ ! START STEP 8: VELOCITY CORRECTION
+ ! ########################################################################################################################################
+ ! 8.1 Correct velocity
+ ! 8.2 Call halo update
+ ! Correct velocity, pressure has also the halo
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i = 1, piX%shape(1) ! equal to nx (no halo on x)
+ im=i-1
+ jm=j-1
+ km=k-1
+ kg=piX%lo(3) + k - 1 - halo_ext
+ if (im < 1) im=nx
+ u(i,j,k)=u(i,j,k) - dt/rho*(p(i,j,k)-p(im,j,k))*dxi
+ v(i,j,k)=v(i,j,k) - dt/rho*(p(i,j,k)-p(i,jm,k))*dyi
+ w(i,j,k)=w(i,j,k) - dt/rho*(p(i,j,k)-p(i,j,km))*dzi(kg)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! 8.3 update halos (y and z directions), required by the staggered-grid stencils at the next time step
+ !$acc host_data use_device(u,v,w)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! impose velocity boundary conditions (could be fused with the correction loop, little gain)
+ ! w lies exactly on the wall; u and v are set so that their wall-interpolated value is zero
+ ! no-slip assumed, i.e. u=0; can be extended to any wall velocity
+ umax=0.d0
+ vmax=0.d0
+ wmax=0.d0
+ !$acc parallel loop collapse(3) reduction(max:umax,vmax,wmax)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ kg = piX%lo(3) + k - 1 - halo_ext
+ ! bottom wall
+ if (kg .eq. 1) u(i,j,k-1) = -u(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
+ if (kg .eq. 1) v(i,j,k-1) = -v(i,j,k) ! mean value between kg and kg-1 (wall) equal to zero
+ if (kg .eq. 1) w(i,j,k) = 0.d0 ! w point is at the wall
+ ! top wall
+ if (kg .eq. nz) u(i,j,k+1)= -u(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
+ if (kg .eq. nz) v(i,j,k+1)= -v(i,j,k) ! mean value between kg and kg+1 (wall) equal to zero
+ if (kg .eq. nz+1) w(i,j,k)=0.d0 ! w point (nz+1) is at the wall
+ umax=max(umax,u(i,j,k))
+ vmax=max(vmax,v(i,j,k))
+ wmax=max(wmax,w(i,j,k))
+ enddo
+ enddo
+ enddo
+
+ call MPI_Allreduce(umax,gumax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
+ call MPI_Allreduce(vmax,gvmax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
+ call MPI_Allreduce(wmax,gwmax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
+
+ cflx=gumax*dt*dxi
+ cfly=gvmax*dt*dyi
+ cflz=gwmax*dt*nz/lz ! based on the mean spacing lz/nz (underestimates the CFL where the grid is clustered)
+ cou=max(cflx,cfly)
+ cou=max(cou,cflz)
+ if (rank.eq.0) then
+ write(*,*) "CFL (max among tasks)", cou
+ endif
+ if (cou .gt. 7) call MPI_Abort(MPI_COMM_WORLD,1,ierr) ! all ranks abort together; a rank-0-only stop would hang the others
+
+ call cpu_time(timef)
+ if (rank.eq.0) print '(" Time elapsed = ",f6.1," ms")',1000*(timef-times)
+ !########################################################################################################################################
+ ! END STEP 8: VELOCITY CORRECTION
+ !########################################################################################################################################
+ call nvtxEndRange
+
+
+ !########################################################################################################################################
+ ! START STEP 9: OUTPUT FIELDS
+ ! ########################################################################################################################################
+ if (mod(t,dump) .eq. 0) then
+ if (rank .eq. 0) write(*,*) "Saving output files"
+ ! write velocity and pressure fields (1-4)
+ call writefield(t,1)
+ call writefield(t,2)
+ call writefield(t,3)
+ call writefield(t,4)
+ #if phiflag == 1
+ ! write phase-field (5)
+ call writefield(t,5)
+ #endif
+ #if thetaflag == 1
+ ! write temperature field (6)
+ call writefield(t,6)
+ #endif
+ endif
+ !########################################################################################################################################
+ ! END STEP 9: OUTPUT FIELDS
+ !########################################################################################################################################
+
+call nvtxEndRange
+!call nvtxEndRange
+enddo
+call cpu_time(t_end)
+elapsed = t_end-t_start
+if (rank .eq. 0) write(*,*) 'Elapsed time (seconds):', elapsed
+!$acc end data
+!$acc end data
+!$acc end data
+
+! Remove allocated variables (add new)
+deallocate(u,v,w)
+deallocate(mysin, mycos)
+deallocate(rhsu,rhsv,rhsw)
+deallocate(rhsu_o,rhsv_o,rhsw_o)
+#if phiflag == 1
+deallocate(phi,rhsphi,normx,normy,normz)
+deallocate(psidi,tanh_psi,fxst,fyst,fzst) ! allocated only when the phase field is active
+#endif
+#if thetaflag == 1
+deallocate(theta,rhstheta,rhstheta_o)
+#endif
+
+call mpi_finalize(ierr)
+
+end program main
\ No newline at end of file
diff --git a/channel/module.f90 b/channel/module.f90
new file mode 100644
index 0000000..6ea93b7
--- /dev/null
+++ b/channel/module.f90
@@ -0,0 +1,149 @@
+module param
+ integer, parameter :: nx=200
+ integer, parameter :: ny=200
+ integer, parameter :: nz=100
+ double precision :: pi, rhoi, twopi
+ double precision :: lx, dx, dxi, ddxi
+ double precision :: ly, dy, dyi, ddyi
+ double precision :: lz
+ double precision, allocatable :: x(:), y(:), z(:), dzi(:), dzci(:), kx(:), ky(:)
+ double precision, device, allocatable :: kx_d(:), ky_d(:)
+ integer :: restart,tstart,tfin,dump
+ double precision :: gamma, normod, factor, csi
+ double precision :: dt,mu,rho !flow parameters
+ integer :: inflow, inphi, intheta
+ double precision :: radius, sigma, epsr, eps, pos, val, epsi, enum ! phase-field parameters
+ double precision :: times, timef, alphag
+ double precision :: gradpx, gradpy, noise, lflow, gflow, ubulk, cflx, cfly, cflz, gcflz
+ double precision :: amp, mx, my, mz ! for perturbed flow
+ double precision :: kappa ! thermal diffusivity (temperature equation)
+end module param
+
+
+module mpivar
+ ! MPI variables
+ integer :: rank, ranks, ierr
+ integer :: localRank, localComm
+end module mpivar
+
+
+module cudecompvar
+ use cudecomp
+ integer :: npx, npy, npz
+ type(cudecompHandle) :: handle
+ type(cudecompGridDesc) :: grid_desc,grid_descD2Z
+ type(cudecompGridDescConfig) :: config
+ type(cudecompGridDescAutotuneOptions) :: options
+ integer :: pdims(2) ! pr x pc pencils
+ integer :: gdims(3) ! global grid dimensions
+ integer :: halo(3) ! halo extensions
+ integer :: halo_ext ! 0 no halo, 1 means 1 halo
+ type(cudecompPencilInfo) :: piX, piY, piZ ! size of the pencils in x- y- and z-configuration
+ type(cudecompPencilInfo) :: piX_d2z, piY_d2z, piZ_d2z ! size of the pencils in x- y- and z-configuration for D2Z
+ type(cudecompPencilInfo) :: piX_Poiss
+ integer(8) :: nElemX, nElemY, nElemZ, nElemWork, nElemWork_halo,nElemWork_halo_d2z
+ integer(8) :: nElemX_d2z, nElemY_d2z, nElemZ_d2z, nElemWork_d2z
+ logical :: halo_periods(3)
+end module cudecompvar
+
+
+module velocity
+ double precision, allocatable :: u(:,:,:), v(:,:,:), w(:,:,:)
+ double precision, allocatable :: rhsu(:,:,:), rhsv(:,:,:), rhsw(:,:,:)
+ double precision, allocatable :: rhsu_o(:,:,:), rhsv_o(:,:,:), rhsw_o(:,:,:)
+ complex(8), allocatable :: rhsp_complex(:,:,:)
+ double precision, allocatable :: rhsp(:,:,:), p(:,:,:)
+ double precision, allocatable :: rhspp(:,:,:), pp(:,:,:)
+ double precision, allocatable :: div(:,:,:)
+ double precision :: uc, vc, wc, umax, vmax, wmax, gumax, gvmax, gwmax, cou
+ double precision :: h11, h12, h13, h21, h22, h23, h31, h32, h33
+ double precision, allocatable :: mysin(:), mycos(:)
+end module velocity
+
+
+module phase
+ double precision, allocatable :: phi(:,:,:), rhsphi(:,:,:), psidi(:,:,:), tanh_psi(:,:,:)
+ double precision, allocatable :: normx(:,:,:), normy(:,:,:), normz(:,:,:)
+ double precision :: chempot, curv
+ double precision, allocatable :: fxst(:,:,:), fyst(:,:,:), fzst(:,:,:)
+end module phase
+
+
+module temperature
+ double precision, allocatable :: theta(:,:,:), rhstheta(:,:,:)
+ double precision, allocatable :: rhstheta_o(:,:,:)
+end module temperature
+
+
+! added NVTX for profiling, from maxcuda/NVTX_example
+module nvtx
+use iso_c_binding
+implicit none
+integer,private :: col(7) = [ int(Z'0000ff00'), int(Z'000000ff'), int(Z'00ffff00'), int(Z'00ff00ff'), int(Z'0000ffff'), int(Z'00ff0000'), int(Z'00ffffff')]
+character,private,target :: tempName(256)
+
+type, bind(C):: nvtxEventAttributes
+ integer(C_INT16_T):: version=1
+ integer(C_INT16_T):: size=48 !
+ integer(C_INT):: category=0
+ integer(C_INT):: colorType=1 ! NVTX_COLOR_ARGB = 1
+ integer(C_INT):: color
+ integer(C_INT):: payloadType=0 ! NVTX_PAYLOAD_UNKNOWN = 0
+ integer(C_INT):: reserved0
+ integer(C_INT64_T):: payload ! union uint,int,double
+ integer(C_INT):: messageType=1 ! NVTX_MESSAGE_TYPE_ASCII = 1
+ type(C_PTR):: message ! ascii char
+end type
+
+interface nvtxRangePush
+ ! push range with custom label and standard color
+ subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA')
+ use iso_c_binding
+ character(kind=C_CHAR) :: name(256)
+ end subroutine
+
+ ! push range with custom label and custom color
+ subroutine nvtxRangePushEx(event) bind(C, name='nvtxRangePushEx')
+ use iso_c_binding
+ import:: nvtxEventAttributes
+ type(nvtxEventAttributes):: event
+ end subroutine
+end interface
+
+interface nvtxRangePop
+ subroutine nvtxRangePop() bind(C, name='nvtxRangePop')
+ end subroutine
+end interface
+
+contains
+
+subroutine nvtxStartRange(name,id)
+ character(kind=c_char,len=*) :: name
+ integer, optional:: id
+ type(nvtxEventAttributes):: event
+ character(kind=c_char,len=256) :: trimmed_name
+ integer:: i
+
+ trimmed_name=trim(name)//c_null_char
+
+ ! move scalar trimmed_name into character array tempName
+ do i=1,LEN(trim(name)) + 1
+ tempName(i) = trimmed_name(i:i)
+ enddo
+
+
+ if ( .not. present(id)) then
+ call nvtxRangePush(tempName)
+ else
+ event%color=col(mod(id,7)+1)
+ event%message=c_loc(tempName)
+ call nvtxRangePushEx(event)
+ end if
+end subroutine
+
+subroutine nvtxEndRange
+ call nvtxRangePop
+end subroutine
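+
+! typical usage of this module (link with -lnvToolsExt, as done in the Makefiles):
+! call nvtxStartRange("poisson solver", 1) ! optional id selects the range color
+! ... code to profile ...
+! call nvtxEndRange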
+
+end module nvtx
+
diff --git a/channel/plot_fields.py b/channel/plot_fields.py
new file mode 100644
index 0000000..5847922
--- /dev/null
+++ b/channel/plot_fields.py
@@ -0,0 +1,569 @@
+# # %%
+# %%html
+#
+
+# %%
+# %matplotlib widget
+# %matplotlib inline
+
+import ipympl
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.style as style
+# from ing_theme_matplotlib import mpl_style
+# from qbstyles import mpl_style
+
+import os
+import glob
+
+from scipy.interpolate import griddata
+
+
+from matplotlib.widgets import Cursor
+
+import h5py
+from mpl_toolkits.axes_grid1 import make_axes_locatable # used below for the RMS-field colorbar axes
+
+plt.style.use("dark_background")
+# mpl_style(dark=True)
+
+
+#################################################################
+foldername = './output/'
+
+# PARAMETERS:
+
+# fields to plot
+fields = ['theta']
+
+# number of points in each direction
+# Grid parameters (user-defined)
+nx = 256 # number of points in x
+ny = 128 # number of points in y
+nz = 200 # number of points in z
+
+Lx = 6.0 # length of domain in x
+Ly = 3.0 # length of domain in y
+Lz = 2.0 # length of domain in z
+
+# compute the derivative of the fields (show these instead of the normal fields)
+# 0: no derivative
+# 1: x derivative
+# 2: y derivative
+# 3: z derivative
+# list more flags to compute consecutive derivatives (first-order FD)
+derivative_vec = [0]
+
+# # normal direction of the 2D slice:
+# 1: x-direction
+# 2: y-direction
+# 3: z-direction
+slice_dir = 2
+
+# index to take the slice (from 1 to nx_i, choose -1 for computing the average)
+slice_idx = 0
+
+# slice_idx = 222
+
+# time_steps to plot
+# ts_vec = [0]
+# ts_vec = range(800000,840500,10000)
+# ts_vec = range(0,900001,100000)
+ts_vec = [60000] # Test with just one timestep
+
+# ts_vec = [10000]
+
+# set 1 to compute time averaged quantities, 0 otherwise
+timeaverage = 0
+
+# set 1 to compute fluctuating components, 0 otherwise (expensive)
+fluct = 0
+
+# value for the fontsize:
+fontsize_val = 10
+
+# show heatmaps
+showmaps_flag = 1
+
+# slice of slice (leave -1 to compute the mean)
+meanprof_slice = 0
+
+# value of the figure size
+figsize_val = 8
+
+# save figure in png format
+savefig_flag = 0
+
+# save data in h5 format
+savedata_flag = 0
+
+# Aspect ratio of the heatmaps (-1 for auto, i.e. proportional, WARNING: can't handle huge AR)
+AR_plot = -1
+
+# Set your desired color limits here
+vmin = None # e.g., vmin = 0.0
+vmax = None # e.g., vmax = 1.0
+
+# vmin = -1 # e.g., vmin = 0.0
+# vmax = 1 # e.g., vmax = 1.0
+
+#################################################################
+# AUXILIARY FUNCTIONS
+def get_spectrum(x, signal, title='Spectrum Analysis'):
+
+ # Compute the Fourier Transform
+ spectrum = np.fft.fft(signal)
+
+ # Calculate the amplitude and phase
+ amplitude = np.abs(spectrum)
+ phase = np.angle(spectrum)
+
+ # Frequency values corresponding to the FFT components
+ frequencies = np.fft.fftfreq(len(x), d=(x[1] - x[0]))
+
+ # cut negative freq
+ positive_freq_indices = frequencies >= 0
+ frequencies = frequencies[positive_freq_indices]
+ amplitude = amplitude[positive_freq_indices]
+ phase = phase[positive_freq_indices]
+
+ # Sort frequencies and corresponding arrays
+ sorted_indices = np.argsort(frequencies)
+ frequencies = frequencies[sorted_indices]
+ amplitude = amplitude[sorted_indices]
+ phase = phase[sorted_indices]
+
+ # crop 2/3 of freq
+ amplitude = amplitude[1:int(np.floor(len(frequencies)*2/3))]
+ phase = phase[1:int(np.floor(len(frequencies)*2/3))]
+ frequencies = frequencies[1:int(np.floor(len(frequencies)*2/3))]
+
+ wavenum = frequencies*2*np.pi
+ return [wavenum,amplitude,phase]
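+
+# example usage (hypothetical signal): spectrum of a single sine mode along z
+# k, A, ph = get_spectrum(z, np.sin(2.0*np.pi*z/Lz))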
+
+#################################################################
+
+# font = {'family' : 'normal',
+# 'weight' : 'normal',
+# 'size' : 6}
+
+plt.rcParams.update({'font.size': 5})
+
+# params_txt = {'axes.labelsize': 5,'axes.titlesize':5, 'text.fontsize': 5, 'legend.fontsize': 5, 'xtick.labelsize': 5, 'ytick.labelsize': 5}
+# plt.rcParams.update(params_txt)
+
+# resolve conflicts: if fluctuations are computed, the time average is needed too
+if fluct == 1:
+ timeaverage = 1
+
+if timeaverage == 1:
+ meansuff = '_MEAN'
+else:
+ meansuff = ''
+
+done = 0
+# GENERATED ON:
+from datetime import datetime
+
+# Print current date and time briefly
+print("Current date and time:", datetime.now())
+
+glob_profiles = []
+
+FLUCARRAYS_time = []
+
+x = np.linspace(0, Lx, nx)
+y = np.linspace(0, Ly, ny)
+z = np.linspace(0, Lz, nz)
+
+# Define the dimensions of the reshaped arrays (nvec)
+
+nvec = (nx, ny, nz) # grid sizes; the raw file is read as (nz, ny, nx) and transposed to [x, y, z]
+
+# nvec = (512, 513, 512) # Update with the actual dimensions
+# nvec = (256, 257, 256)
+# # nvec = (128, 129, 128)
+# nvec = (0, 0, 0)
+
+nx = nvec[0]
+ny = nvec[1]
+nz = nvec[2]
+
+for n_step in ts_vec:
+ file_names = []
+
+ for fld in fields:
+ file_names.append(fld + '_{:08d}.dat')
+
+ # Initialize an empty list to store the data arrays
+ data_arrays = []
+
+ # initialize arrays with sums if want to calculate time averages
+ if timeaverage == 1:
+ flucarray = []
+ if n_step == ts_vec[0]:
+ sumarray = []
+
+ # Read the data from each file and reshape
+ id_fnames = -1
+ for fld in fields:
+ file_name = f"{fld}_{n_step:08d}.dat"
+ file_path = foldername + file_name
+ id_fnames = id_fnames+1
+
+ # Check if file exists
+ if not os.path.exists(file_path):
+ print(f"Warning: File {file_path} not found, skipping...")
+ continue
+
+ with open(file_path, 'rb') as file:
+ total_elements = np.prod(nvec)
+ data = np.memmap(file, dtype=np.float64, mode='r', shape=(total_elements,))
+ data = data.reshape(np.flip(nvec))*1.0
+
+ data = data.transpose((2, 1, 0)) # Permute first and third axes to match the convention [x, y, z]
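+ # the Fortran writer stores the global array in column-major order (x fastest),
+ # so the flat file reads back as (nz, ny, nx) in C order before the transpose above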
+
+ # Validate data
+ print(f"Loaded {file_name}: shape={data.shape}, min={data.min():.6f}, max={data.max():.6f}, mean={data.mean():.6f}")
+
+ dersuff = ''
+ if derivative_vec[0] != 0:
+ dersuff = '_'
+ for ider in derivative_vec:
+ derivative_x, derivative_y, derivative_z = np.gradient(data, x, y, z)
+ if ider == 1:
+ data = derivative_x
+ dersuff = dersuff+'x'
+ elif ider == 2:
+ data = derivative_y
+ dersuff = dersuff+'y'
+ elif ider == 3:
+ data = derivative_z
+ dersuff = dersuff+'z'
+
+ # define axes:
+ if slice_dir == 1:
+ hor_name = 'y'
+ ver_name = 'z'
+ hor = y
+ nhor = ny
+ ver = z
+ nver = nz
+ elif slice_dir == 2:
+ hor_name = 'x'
+ ver_name = 'z'
+ hor = x
+ nhor = nx
+ ver = z
+ nver = nz
+ elif slice_dir == 3:
+ hor_name = 'x'
+ ver_name = 'y'
+ hor = x
+ nhor = nx
+ ver = y
+ nver = ny
+
+ if slice_idx == -1:
+ type_name = 'average'
+ if slice_dir == 1:
+ mean_array = np.mean(data, axis=0)
+ elif slice_dir == 2:
+ mean_array = np.mean(data, axis=1)
+ elif slice_dir == 3:
+ mean_array = np.mean(data, axis=2)
+
+ else:
+ type_name = 'slice'
+ if slice_dir == 1:
+ mean_array = data[slice_idx-1,:,:]
+ elif slice_dir == 2:
+ mean_array = data[:,slice_idx-1,:]
+ elif slice_dir == 3:
+ mean_array = data[:,:,slice_idx-1]
+
+ data_arrays.append(mean_array)
+
+ if timeaverage == 1:
+ if fluct == 1:
+ flucarray.append(mean_array)
+ if n_step == ts_vec[0]:
+ sumarray.append(mean_array)
+ else:
+ sumarray[id_fnames] = sumarray[id_fnames]+mean_array
+
+ if n_step == ts_vec[-1]:
+ data_arrays[id_fnames] = sumarray[id_fnames]/len(ts_vec)
+
+ FLUCARRAYS_time.append(flucarray)
+
+ # Plot each array as a heat map with coordinates
+
+ if timeaverage == 0 or n_step == ts_vec[-1]:
+ if showmaps_flag == 1:
+ for i, array in enumerate(data_arrays):
+ if array.ndim != 2:
+ print(f"Invalid shape for {file_name}")
+ continue
+
+ N = 500j
+ extent = (hor.min(),hor.max(),ver.min(),ver.max())
+
+ HOR,VER = np.meshgrid(hor,ver)
+ hors,vers = np.mgrid[extent[0]:extent[1]:N, extent[2]:extent[3]:N]
+
+ plt.figure(figsize=(figsize_val, figsize_val*0.12+figsize_val))
+
+ ax = plt.gca()
+ # im = ax.imshow(np.flip(resampled.T, axis = 0), cmap='jet', origin='lower', extent=extent, aspect='auto')#extent=[y.min(), y.max(), z.min(), z.max()])
+ # im = ax.imshow(resampled.T, cmap='jet', origin='lower', extent=extent, aspect='auto')#extent=[y.min(), y.max(), z.min(), z.max()])
+ # Use pcolormesh for non-uniform grids
+ X, Y = np.meshgrid(hor,ver)
+
+ im = ax.pcolormesh(X, Y, array.T, cmap='jet', shading='gouraud', vmin=vmin, vmax=vmax) # smoother shading
+
+ ax.set_aspect('equal') # <-- Add this line to set axis equal
+
+ plt.xlabel(hor_name, fontsize=fontsize_val)
+ plt.ylabel(ver_name, fontsize=fontsize_val)
+ plt.xticks(fontsize=fontsize_val, rotation=0)
+ plt.yticks(fontsize=fontsize_val, rotation=0)
+
+ # create an axes above ax for a horizontal colorbar
+ # (fixed figure coordinates; width 70% of the figure)
+ cax = plt.axes([0.15, 0.95, 0.7, 0.03])
+ plt.colorbar(im, cax=cax,orientation = "horizontal")
+ cax.xaxis.set_ticks_position('top')
+ plt.xticks(fontsize=fontsize_val, rotation=0)
+ plt.title(file_name+dersuff,fontsize=fontsize_val)
+
+ plt.show()
+
+ # save arrays and figure in the case of temporal average
+ if timeaverage == 1:
+ if savefig_flag == 1:
+ plt.savefig('./'+fields[i]+dersuff+'_'+str(ts_vec[0])+'_'+str(ts_vec[-1])+'.png', dpi=800)
+
+ # Save the arrays to an HDF5 file using h5py
+ if savedata_flag == 1:
+ with h5py.File('./'+fields[i]+dersuff+'_'+str(ts_vec[0])+'_'+str(ts_vec[-1])+'timeav.h5', 'w') as hf:
+ hf.create_dataset('data', data=array)
+ hf.create_dataset('x', data=hor)
+ hf.create_dataset('y', data=ver)
+
+
+ # # # # # PLOT ALL LINES IN THE SAME FIGURE
+ # # Plot the vertical profiles in the opposite direction with different colors
+ # plt.figure(figsize=(8, 6))
+
+ # for i, array in enumerate(data_arrays):
+ # if array.ndim != 2:
+ # print(f"Invalid shape for {file_name}")
+ # continue
+
+ # # Compute the vertical profile in the opposite direction
+ # vertical_profile = np.mean(array, axis=0)[::-1]
+
+ # # Plot the vertical profile with a different color for each array
+ # plt.plot(vertical_profile, ver, label=file_name, alpha=0.7)
+
+ # plt.title(fields[i]+'Profiles',fontsize=fontsize_val)
+ # plt.xlabel(fields[i] +'Mean',fontsize = fontsize_val)
+ # plt.ylabel(ver_name,fontsize = fontsize_val)
+ # plt.legend(loc ="best",fontsize = fontsize_val)
+ # plt.xticks(fontsize=fontsize_val, rotation=0)
+ # plt.yticks(fontsize=fontsize_val, rotation=0)
+ # plt.show()
+
+ # Plot the vertical profiles in the opposite direction with different colors
+
+ vertical_profile = []
+
+ for i, array in enumerate(data_arrays):
+ if array.ndim != 2:
+ print(f"Invalid shape for {file_name}")
+ continue
+
+ # Compute the vertical profile in the opposite direction
+ if meanprof_slice == -1:
+ vertical_profile.append(np.mean(array, axis=0))
+ horproflabelaux = ' Mean'
+ else:
+ vertical_profile.append(np.transpose(array[meanprof_slice-1,:]))
+ horproflabelaux = ' '
+
+
+ # plt.figure(figsize=(8, 6))
+
+ # # Plot the vertical profile with a different color for each array
+ # plt.plot(vertical_profile[i], ver, label=file_name, alpha=0.7)
+
+ # plt.title(fields[i]+' Profiles',fontsize=fontsize_val)
+ # plt.xlabel(fields[i] +' Mean',fontsize = fontsize_val)
+ # plt.ylabel(ver_name,fontsize = fontsize_val)
+ # plt.legend(loc ="best",fontsize = fontsize_val)
+ # plt.xticks(fontsize=fontsize_val, rotation=0)
+ # plt.yticks(fontsize=fontsize_val, rotation=0)
+ # plt.show()
+
+ glob_profiles.append(vertical_profile)
+
+prf_plt = []
+
+if fluct == 1:
+ for i, array in enumerate(data_arrays):
+ mean = data_arrays[id_fnames]
+ rmsfluc = np.zeros_like(mean)
+ for j in range(len(ts_vec)):
+ flucs = FLUCARRAYS_time[j]
+ singfield = flucs[i]
+ rmsfluc = rmsfluc+np.square(singfield-mean)
+ rmsfluc = np.sqrt(rmsfluc/len(ts_vec))
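+ # rmsfluc now holds sqrt(<(f - <f>_t)^2>_t), the temporal RMS of the fluctuation field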
+
+ # PLOT RMS FIELD
+ N = 500j
+ extent = (hor.min(),hor.max(),ver.min(),ver.max())
+
+ HOR,VER = np.meshgrid(hor, ver)
+ hors,vers = np.mgrid[extent[0]:extent[1]:N, extent[2]:extent[3]:N]
+ aaa = np.ones((len(HOR.flatten()),2))
+ aaa[:,0] = HOR.flatten()
+ aaa[:,1] = VER.flatten()
+
+ resampled = griddata( aaa, rmsfluc.T.flatten(), (hors, vers), method='linear')
+
+ if done == 0:
+ done = 1
+ if AR_plot == -1:
+ AR_plot = 1/(hor.max()-hor.min())*(ver.max()-ver.min())
+ AR_plt_flag = 0
+ else:
+ AR_plt_flag = 1
+
+ plt.figure(figsize=(figsize_val, figsize_val*0.12+figsize_val*AR_plot))
+
+ ax = plt.gca()
+ if AR_plt_flag == 1:
+ # im = ax.imshow(np.flip(resampled.T, axis = 0), cmap='jet', origin='lower', extent=extent, aspect='auto')#extent=[y.min(), y.max(), z.min(), z.max()])
+ im = ax.imshow(resampled.T, cmap='jet', origin='lower', extent=extent, aspect='auto')#extent=[y.min(), y.max(), z.min(), z.max()])
+ else:
+ # im = ax.imshow(np.flip(resampled.T, axis = 0), cmap='jet', origin='lower', extent=extent, aspect=1)#extent=[y.min(), y.max(), z.min(), z.max()])
+ im = ax.imshow(resampled.T, cmap='jet', origin='lower', extent=extent, aspect=1)#extent=[y.min(), y.max(), z.min(), z.max()])
+
+ # plt.imshow(np.flip(resampled.T, axis = 0), cmap='jet', origin='lower', extent=extent)#extent=[y.min(), y.max(), z.min(), z.max()])
+ # plt.colorbar(orientation = "horizontal")
+
+ # plt.title(file_name,fontsize=fontsize_val)
+ plt.xlabel(hor_name,fontsize=fontsize_val)
+ plt.ylabel(ver_name,fontsize=fontsize_val)
+ plt.xticks(fontsize=fontsize_val, rotation=0)
+ plt.yticks(fontsize=fontsize_val, rotation=0)
+
+ # create an axes above ax for the horizontal colorbar; the height of cax is 5%
+ # of ax and the padding between cax and ax is fixed at 0.05 inch.
+ divider = make_axes_locatable(ax)
+ cax = divider.append_axes("top", size="5%", pad=0.05)
+ plt.colorbar(im, cax=cax,orientation = "horizontal")
+ cax.xaxis.set_ticks_position('top')
+ plt.xticks(fontsize=fontsize_val, rotation=0)
+ plt.title(file_name+dersuff+'_RMS',fontsize=fontsize_val)
+
+ plt.show()
+
+ # SAVE FLUCTUATION FIELDS
+ if savedata_flag == 1:
+ with h5py.File('./'+fields[i]+dersuff+'_'+str(ts_vec[0])+'_'+str(ts_vec[-1])+'time_rms.h5', 'w') as hf:
+ hf.create_dataset('data', data=rmsfluc)
+ hf.create_dataset('x', data=hor)
+ hf.create_dataset('y', data=ver)
+
+numfields = len(glob_profiles[0])
+
+for i in range(numfields):
+
+ plt.figure(figsize=(figsize_val, 9))
+
+ for j in range(len(ts_vec)):
+
+ profile_sets = glob_profiles[j]
+ single_profile = profile_sets[i]
+
+ # plt.figure(figsize=(8, 6))
+
+ # Plot the vertical profile with a different color for each array
+ plt.plot(single_profile, ver, label=fields[i]+'_'+str(ts_vec[j]), alpha=0.7)
+
+
+ if fld == 'u' and slice_dir != 3:
+ zmin = np.min(ver)
+ zmax = np.max(ver)
+ zc = 0.5 * (zmin + zmax)
+ zlen = zmax - zmin
+
+ # Parabolic profile: u(z) = umax * (1 - ((z - zc)/H)^2), with H = zlen/2
+ H = zlen / 2.0
+ umax = 1.0 / (2.0 * 1.0) # assuming mu = rho = 1 → umax = 1/(2μ)
+ zvals = np.linspace(zmin, zmax, 256)
+ u_analytical = umax * (1.0 - ((zvals - zc)/H)**2)
+
+ plt.plot(u_analytical, zvals, 'r--', linewidth=2, label='Poiseuille (analytical)')
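+
+ # note: for plane Poiseuille flow u(z) = G/(2*mu)*(H^2 - (z - zc)^2), i.e. umax = G*H^2/(2*mu);
+ # the value above additionally assumes a unit pressure gradient, G = 1, and H = 1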
+
+ plt.title(fields[i]+dersuff+' Profiles',fontsize=fontsize_val)
+ plt.xlabel(fields[i]+dersuff + horproflabelaux,fontsize = fontsize_val)
+ plt.ylabel(ver_name,fontsize = fontsize_val)
+ # plt.legend(loc ="best",fontsize = fontsize_val)
+ plt.legend(bbox_to_anchor=(1.00, 1.02), loc="upper left",fontsize = fontsize_val)
+ plt.subplots_adjust(right=0.70)
+ plt.xticks(fontsize=fontsize_val, rotation=0)
+ plt.yticks(fontsize=fontsize_val, rotation=0)
+ plt.show()
+
+for i in range(numfields):
+
+ plt.figure(figsize=(figsize_val, 8))
+
+ for j in range(len(ts_vec)):
+
+ profile_sets = glob_profiles[j]
+ single_profile = profile_sets[i]
+
+ # plt.figure(figsize=(8, 6))
+
+ # Plot the vertical profile with a different color for each array
+ [freq,amp,phase] = get_spectrum(ver,single_profile)
+
+ # Plot amplitude on log scale
+ plt.subplot(2, 1, 1)
+ plt.plot(freq, amp,label=fields[i]+'_'+str(ts_vec[j]), alpha=0.7)
+ plt.xscale('log')
+ if np.any(amp > 0): # Only use log scale if there are positive values
+ plt.yscale('log')
+ plt.title('Amplitude')
+ plt.xlabel('k_'+ver_name)
+ plt.ylabel('Amplitude')
+
+ # Plot phase on log scale
+ plt.subplot(2, 1, 2)
+ plt.plot(freq, phase,label=fields[i]+'_'+str(ts_vec[j]), alpha=0.7)
+ plt.xscale('log')
+ plt.title('Phase')
+ plt.xlabel('k_'+ver_name)
+ plt.ylabel('Phase (radians)')
+
+ # Adjust layout and show the plot
+ plt.subplot(2, 1, 1)
+ plt.legend(bbox_to_anchor=(1.00, 1.02), loc="upper left",fontsize = fontsize_val)
+ plt.tight_layout()
+ plt.show()
diff --git a/channel/plot_profiles.py b/channel/plot_profiles.py
new file mode 100644
index 0000000..086c742
--- /dev/null
+++ b/channel/plot_profiles.py
@@ -0,0 +1,163 @@
+# # %%
+# %%html
+#
+
+# %%
+# %matplotlib widget
+# %matplotlib inline
+
+import ipympl
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.style as style
+# from ing_theme_matplotlib import mpl_style
+# from qbstyles import mpl_style
+
+import os
+import glob
+
+from scipy.interpolate import griddata
+
+
+from matplotlib.widgets import Cursor
+
+import h5py
+
+plt.style.use("dark_background")
+# mpl_style(dark=True)
+
+
+#################################################################
+foldername = './output/'
+
+# PARAMETERS:
+
+# fields to plot
+fields = ['theta']
+
+# number of points in each direction
+# Grid parameters (user-defined)
+nx = 256 # number of points in x
+ny = 128 # number of points in y
+nz = 200 # number of points in z
+
+# override: values actually used below
+nx = 2 # number of points in x
+ny = 2 # number of points in y
+nz = 1000 # number of points in z
+
+Lx = 6.0 # length of domain in x
+Ly = 3.0 # length of domain in y
+Lz = 2.0 # length of domain in z
+
+# compute the derivative of the fields (show these instead of the normal fields)
+# 0: no derivative
+# 1: x derivative
+# 2: y derivative
+# 3: z derivative
+# list more flags to compute consecutive derivatives (first-order FD)
+derivative_vec = [0]
+
+# # normal direction of the 2D slice:
+# 1: x-direction
+# 2: y-direction
+# 3: z-direction
+slice_dir = 2
+
+# index to take the slice (from 1 to nx_i, choose -1 for computing the average)
+slice_idx = 0
+
+# slice_idx = 222
+
+# time_steps to plot
+ts_vec = range(0,230000,10000)
+
+# ts_vec = [10000]
+
+# set 1 to compute time averaged quantities, 0 otherwise
+timeaverage = 0
+
+# set 1 to compute fluctuating components, 0 otherwise (expensive)
+fluct = 0
+
+# value for the fontsize:
+fontsize_val = 10
+
+x = np.linspace(0, Lx, nx)
+y = np.linspace(0, Ly, ny)
+z = np.linspace(0, Lz, nz)
+
+# Define the dimensions of the reshaped arrays (nvec)
+
+nvec = (nx, ny, nz) # grid sizes; the raw file is read as (nz, ny, nx)
+
+# nvec = (512, 513, 512) # Update with the actual dimensions
+# nvec = (256, 257, 256)
+# # nvec = (128, 129, 128)
+# nvec = (0, 0, 0)
+
+nx = nvec[0]
+ny = nvec[1]
+nz = nvec[2]
+
+id_fnames = -1
+
+for fld in fields:
+
+ id_fnames = id_fnames+1
+
+
+ plt.figure(figsize=(10, 9))
+
+ for n_step in ts_vec:
+ file_names = []
+
+ file_names.append(fld + '_{:08d}.dat')
+
+
+ # Read the data from each file and reshape
+
+ file_name = f"{fld}_{n_step:08d}.dat"
+ file_path = foldername + file_name
+
+ # Check if file exists
+ if not os.path.exists(file_path):
+ print(f"Warning: File {file_path} not found, skipping...")
+ continue
+
+ with open(file_path, 'rb') as file:
+ total_elements = np.prod(nvec)
+ data = np.memmap(file, dtype=np.float64, mode='r', shape=(total_elements,))
+
+ data = data.reshape(np.flip(nvec))*1.0
+
+ # Validate data
+ print(f"Loaded {file_name}: shape={data.shape}, min={data.min():.6f}, max={data.max():.6f}, mean={data.mean():.6f}")
+
+ prof = np.mean(data, axis=(1, 2)) # average over y and x -> profile along z
+
+ # prof = np.flip(np.mean(data, axis=(0, 2)))
+
+ plt.plot(prof, z, label=f'{fld}_{n_step}', alpha=0.7)
+
+ plt.title(fld+' Profiles',fontsize=fontsize_val)
+ plt.xlabel(fld,fontsize = fontsize_val)
+ plt.ylabel("z",fontsize = fontsize_val)
+ # plt.legend(loc ="best",fontsize = fontsize_val)
+ plt.legend(bbox_to_anchor=(1.00, 1.02), loc="upper left",fontsize = fontsize_val)
+ plt.subplots_adjust(right=0.70)
+ plt.xticks(fontsize=fontsize_val, rotation=0)
+ plt.yticks(fontsize=fontsize_val, rotation=0)
+ plt.show()
diff --git a/channel/post.py b/channel/post.py
new file mode 100644
index 0000000..4bcd91d
--- /dev/null
+++ b/channel/post.py
@@ -0,0 +1,35 @@
+# %%
+import numpy as np
+import matplotlib.pyplot as plt
+
+nx, ny, nz = 256, 128, 200
+
+data = np.fromfile('output/u_00005000.dat', dtype=np.float64) # double-precision output
+print("Data size:", data.size)
+data = data.reshape((nz, ny, nx))
+
+slice_index = ny // 2 # middle of y
+slice_data = data[ :,slice_index, :] # xz slice at fixed y
+
+print("Max:", np.max(data))
+print("Min:", np.min(data))
+plt.figure(figsize=(14,6))
+plt.imshow(slice_data, cmap='jet', origin='lower', aspect=3.14)
+plt.colorbar(label='Velocity')
+plt.xlabel('x')
+plt.ylabel('z')
+plt.axis('scaled')
+plt.show()
+#1D plot
+#line_data = data[1, 1, : ] # middle column
+#plt.subplot(1, 2, 2)
+#plt.plot(line_data, np.arange(ny), color='black')
+#plt.xlabel('Variable')
+#plt.ylabel('y')
+#plt.title('1D Profile along y (x = nx/2)')
+#plt.grid(True)
+
+#plt.tight_layout()
+#plt.show()
+
+# %%
diff --git a/channel/readinput.f90 b/channel/readinput.f90
new file mode 100644
index 0000000..b792e99
--- /dev/null
+++ b/channel/readinput.f90
@@ -0,0 +1,155 @@
+
+!##########################################################################
+!###########################################################################
+subroutine readinput
+use velocity
+use phase
+use temperature
+use param
+use mpivar
+implicit none
+integer :: i,j,k
+double precision :: zk
+
+open(unit=55,file='input.inp',form='formatted',status='old')
+!Time step parameters
+read(55,*) restart
+read(55,*) tstart
+read(55,*) tfin
+read(55,*) dump
+! Domain size
+read(55,*) lx
+read(55,*) ly
+read(55,*) lz
+read(55,*) csi
+!Flow parameters
+read(55,*) inflow
+read(55,*) inphi
+read(55,*) intheta
+read(55,*) dt
+read(55,*) mu
+read(55,*) rho
+! forcing parameters
+read(55,*) gradpx
+read(55,*) gradpy
+! phase-field parameters
+read(55,*) radius
+read(55,*) sigma
+read(55,*) epsr
+! temperature parameters
+read(55,*) kappa
+read(55,*) alphag
+close(55)
+
+! compute pre-defined constants
+twopi=8.0_8*atan(1.0_8)
+pi=twopi/2.d0
+dx = lx/nx
+dy = ly/ny
+dxi = 1.d0/dx
+dyi = 1.d0/dy
+ddxi = 1.d0/dx/dx
+ddyi = 1.d0/dy/dy
+rhoi=1.d0/rho
+eps=epsr*dx
+epsi=1.d0/eps
+enum=1.e-16
+!write(*,*) "Check on stability", dt*mu*dzi*dzi
+
+if (rank .eq. 0) then
+ !enable/disable for debug check parameters
+ write(*,*) "----------------------------------------------"
+ write(*,*) "███ ███ ██ ██ ██ ████████ ██████ ██████"
+ write(*,*) "████ ████ ██ ██ ██ ██ ██ ██"
+ write(*,*) "██ ████ ██ ███████ ██ ██ █████ ███████"
+ write(*,*) "██ ██ ██ ██ ██ ██ ██ ██ ██ ██"
+ write(*,*) "██ ██ ██ ██ ██ ██ ██████ ██████"
+ write(*,*) "----------------------------------------------"
+ write(*,*) "-------------Channel flow setup---------------"
+ write(*,*) 'Grid:', nx, 'x', ny, 'x', nz
+ write(*,*) "Restart ", restart
+ write(*,*) "Tstart ", tstart
+ write(*,*) "Tfin ", tfin
+ write(*,*) "Dump ", dump
+ write(*,*) "Inflow ", inflow
+ write(*,*) "Deltat ", dt
+ write(*,*) "Mu ", mu
+ write(*,*) "Rho ", rho
+ write(*,*) "Gradpx ", gradpx
+ write(*,*) "Gradpy ", gradpy
+ write(*,*) "Radius ", radius
+ write(*,*) "Sigma ", sigma
+ write(*,*) "Eps ", eps
+ write(*,*) "Kappa ", kappa
+ write(*,*) "Alphag ", alphag
+ write(*,*) "Lx ", lx
+ write(*,*) "Ly ", ly
+ write(*,*) "Lz ", lz
+ write(*,*) "Z-stretch ", csi
+! write(*,*) "dx", dx
+! write(*,*) "dxi", dxi
+! write(*,*) "ddxi", ddxi
+! write(*,*) "dy", dx
+! write(*,*) "dyi", dyi
+! write(*,*) "ddyi", ddyi
+! write(*,*) "dz", dz
+! write(*,*) "dzi", dzi
+! write(*,*) "ddzi", ddzi
+endif
+
+
+
+! define wavenumbers and grid points axis
+! define grid (then move in readinput)
+allocate(x(nx),y(ny),z(0:nz+1),dzci(nz),dzi(nz+1),kx(nx),ky(ny))
+! location of the pressure nodes (cell centers)
+x(1)=dx/2
+do i = 2, nx
+ x(i) = x(i-1) + dx
+enddo
+y(1)=dy/2
+do j = 2, ny
+ y(j) = y(j-1) + dy
+enddo
+! stretched grid along z; the z axis also includes the two ghost nodes located half a cell below the bottom wall and above the top wall
+do k = 1, nz
+ zk=(dble(k)-0.5d0)/dble(nz)
+ z(k) = 0.5d0*dble(lz)*(1.d0+tanh(csi*(zk-0.5d0))/tanh(0.5d0*csi))
+enddo
+z(0)=-z(1)
+z(nz+1)= lz+(lz-z(nz))
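+! note: as csi -> 0 the tanh mapping tends to a uniform grid (z ~ lz*zk), while larger csi clusters points near the two walls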
+! compute inverse of dz (between cell faces)
+dzci(1) = 2.d0/(z(1)+ z(2))
+dzci(nz) = 1.d0/(lz-0.5d0*(z(nz)+z(nz-1)))
+do k = 2, nz-1
+ dzci(k) = 2.d0/(z(k+1)-z(k-1))
+enddo
+! compute inverse of dz (between nodes)
+dzi(1)=0.5d0/z(1)
+!dzi(nz+1)=0.5d0/(lz-z(nz))
+do k=1, nz+1
+ dzi(k) = 1.d0/(z(k)-z(k-1))
+enddo
+!write(*,*) "dzi", dzi
+! wavenumber
+do i = 1, nx/2
+ kx(i) = (i-1)*(twopi/lx)
+enddo
+do i = nx/2+1, nx
+ kx(i) = (i-1-nx)*(twopi/lx)
+enddo
+do j = 1, ny/2
+ ky(j) = (j-1)*(twopi/ly)
+enddo
+do j = ny/2+1, ny
+ ky(j) = (j-1-ny)*(twopi/ly)
+enddo
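+! kx and ky follow the standard FFT ordering: non-negative modes 0..n/2-1 first, then the negative modes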
+! allocate kx_d and ky_d on the device
+allocate(kx_d, source=kx)
+allocate(ky_d, source=ky)
+
+
+
+end subroutine
+
+
diff --git a/channel/readwrite.f90 b/channel/readwrite.f90
new file mode 100644
index 0000000..3b54006
--- /dev/null
+++ b/channel/readwrite.f90
@@ -0,0 +1,356 @@
+subroutine writefield(t,fieldn)
+! Output field, file is written in the ./output folder
+
+use velocity
+use phase
+use temperature
+use mpi
+use mpivar
+use param
+use cudecompvar
+
+implicit none
+
+integer :: g_size(3),p_size(3),fstart(3)
+integer :: t,fieldn
+character(len=40) :: namefile
+integer(mpi_offset_kind) :: offset=0
+integer :: f_handle ! file handle
+integer :: ftype
+double precision, allocatable :: out(:,:,:)
+
+! fieldn=1 means u
+! fieldn=2 means v
+! fieldn=3 means w
+! fieldn=4 means p
+! fieldn=5 means phi
+! fieldn=6 means theta
+
+! define basic quantities to be used later (global and pencil size)
+g_size=[nx, ny, nz] ! global size
+p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x
+fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and indices start from 0 (not 1)
+! for debug
+!write(*,*) "g_size", g_size
+!write(*,*) "p_size", p_size
+!write(*,*) "fstart", fstart
+allocate(out(p_size(1),p_size(2),p_size(3))) !<- halo removed
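+! each rank writes its interior pencil into one shared file through an MPI subarray view,
+! so the resulting .dat file is a single Fortran-ordered nx*ny*nz array of float64
+! (the layout assumed by the Python post-processing scripts)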
+
+!write(*,*) "in readwrite"
+
+if (fieldn .eq. 1) then
+ out=u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/u_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 2) then
+ out=v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/v_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 3) then
+ out=w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/w_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 4) then
+ out=p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/p_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 5) then
+ out=phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/phi_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 6) then
+ out=theta(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/theta_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+deallocate(out)
+
+end subroutine
+
+
+
+
+
+
+
+
+
+
+subroutine readfield(fieldn)
+! Input field, file is read from the ./input folder
+
+use velocity
+use phase
+use temperature
+use mpi
+use mpivar
+use param
+use cudecompvar
+
+implicit none
+
+integer :: g_size(3),p_size(3),fstart(3)
+integer :: fieldn
+character(len=40) :: namefile
+integer(mpi_offset_kind) :: offset=0
+integer :: f_handle ! file handle
+integer :: ftype
+double precision, allocatable :: in(:,:,:)
+
+! fieldn=1 means u
+! fieldn=2 means v
+! fieldn=3 means w
+! fieldn=4 means p
+! fieldn=5 means phi
+! fieldn=6 means theta
+
+! define basic quantities to be used later (global and pencil size)
+g_size=[nx, ny, nz] ! global size
+p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x
+fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and indices start from 0 (not 1)
+! for debug
+!write(*,*) "g_size", g_size
+!write(*,*) "p_size", p_size
+!write(*,*) "fstart", fstart
+allocate(in(p_size(1),p_size(2),p_size(3))) !<- no halos read
+
+!write(*,*) "in readwrite"
+
+if (fieldn .eq. 1) then
+ namefile='./input/u.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 2) then
+ namefile='./input/v.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 3) then
+ namefile='./input/w.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 4) then
+ namefile='./input/p.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 5) then
+ namefile='./input/phi.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 6) then
+ namefile='./input/theta.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ theta(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+deallocate(in)
+
+end subroutine
+
+
+
+
+
+
+
+
+
+
+subroutine readfield_restart(t,fieldn)
+! Used in case of restart, file is read from the ./output folder (iteration tstart must be present!)
+
+use velocity
+use phase
+use temperature
+use mpi
+use mpivar
+use param
+use cudecompvar
+
+implicit none
+
+integer :: g_size(3),p_size(3),fstart(3)
+integer :: t,fieldn
+character(len=40) :: namefile
+integer(mpi_offset_kind) :: offset=0
+integer :: f_handle ! file handle
+integer :: ftype
+double precision, allocatable :: in(:,:,:)
+
+! fieldn=1 means u
+! fieldn=2 means v
+! fieldn=3 means w
+! fieldn=4 means p
+! fieldn=5 means phi
+! fieldn=6 means theta
+
+! define basic quantities to be used later (global and pencil size)
+g_size=[nx, ny, nz] ! global size
+p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x
+fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and indices start from 0 (not 1)
+! for debug
+!write(*,*) "g_size", g_size
+!write(*,*) "p_size", p_size
+!write(*,*) "fstart", fstart
+allocate(in(p_size(1),p_size(2),p_size(3))) !<- no halos read
+
+!write(*,*) "in readwrite"
+
+if (fieldn .eq. 1) then
+ write(namefile,'(a,i8.8,a)') './output/u_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 2) then
+ write(namefile,'(a,i8.8,a)') './output/v_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 3) then
+ write(namefile,'(a,i8.8,a)') './output/w_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 4) then
+ write(namefile,'(a,i8.8,a)') './output/p_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 5) then
+ write(namefile,'(a,i8.8,a)') './output/phi_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 6) then
+ write(namefile,'(a,i8.8,a)') './output/theta_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ theta(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+deallocate(in)
+
+end subroutine
\ No newline at end of file
diff --git a/channel/restart_local.sh b/channel/restart_local.sh
new file mode 100755
index 0000000..f761baf
--- /dev/null
+++ b/channel/restart_local.sh
@@ -0,0 +1,10 @@
+NVARCH=Linux_x86_64; export NVARCH
+NVCOMPILERS=/opt/nvidia/hpc_sdk; export NVCOMPILERS
+MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/compilers/man; export MANPATH
+PATH=$NVCOMPILERS/$NVARCH/25.7/compilers/bin:$PATH; export PATH
+export PATH=$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/bin:$PATH
+export MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/man
+export LD_LIBRARY_PATH=/home/milton/MHIT36/cuDecomp/build/lib
+
+
+mpirun -np 2 ./tcf36
\ No newline at end of file
diff --git a/channel/tdma.f90 b/channel/tdma.f90
new file mode 100644
index 0000000..2cde690
--- /dev/null
+++ b/channel/tdma.f90
@@ -0,0 +1,312 @@
+! Subroutines still under development, not included in the makefile at the moment (7/8/25)
+subroutine tdmau
+ ! TDMA for u velocity, solve from 1 to nz where 1 is the u collocation point closest to the bottom wall and nz the one closest to the top wall
+ ! Load all variables via modules
+ ! move pencil information in a module
+ ! Set up tridiagonal system for each i and j
+ ! The system is: a(k)*u(k-1,il,jl) + b(k)*u(k,il,jl) + c(k)*u(k+1,il,jl) = d(k)
+ ! Dirichlet BC: u = 0 at k=1 and k=nz
+ ! a, b, c, d are complex; they could be declared real for optimization, but this may introduce additional data movement; better to always use the same a, b, c, d?
+
+ use velocity
+ implicit none
+ integer :: il, jl, k
+ call c_f_pointer(c_devloc(vel_d), vel3d, piZ%shape)
+
+ !$acc parallel loop collapse(2) gang private(a, b, c, d, factor, sol)
+ do jl = 1, npy
+ do il = 1, npx
+
+ !inner nodes
+ do k = 2, nz-1
+ a(k) = 1.0d0*dzi*dzi
+ b(k) = -2.0d0*dzi*dzi
+ c(k) = 1.0d0*dzi*dzi
+ d(k) = u(k,il,jl) ! check if correct and consistent with CN
+ enddo
+
+ ! Dirilecht BC at k=1
+ a(1) = 0.0d0
+ b(1) = 1.0d0
+ c(1) = 0.0d0
+ d(1) = 0.0d0
+
+ ! Dirichlet BC at k=nz
+ a(nz) = 0.0d0
+ b(nz) = 1.0d0
+ c(nz) = 0.0d0
+ d(nz) = 0.0d0
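+ ! Thomas algorithm below: O(nz) forward sweep plus back substitution for each (il,jl) column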
+
+ ! Forward elimination (Thomas)
+ !$acc loop seq
+ do k = 2, nz
+ factor = a(k)/b(k-1)
+ b(k) = b(k) - factor*c(k-1)
+ d(k) = d(k) - factor*d(k-1)
+ end do
+
+ ! Back substitution
+ sol(nz) = d(nz)/b(nz)
+ ! check on pivot like flutas?
+ !$acc loop seq
+ do k = nz-1, 1, -1
+ sol(k) = (d(k) - c(k)*sol(k+1))/b(k)
+ end do
+
+ ! Store solution in the array used for the back transform
+ do k=1,nz
+ vel3d(k,il,jl) = sol(k)
+ enddo
+ end do
+ end do
+end subroutine
+
+
+
+
+
+
+
+
+
+
+
+
+subroutine tdmav
+ ! TDMA for v velocity, solve from 1 to nz where 1 is the v collocation point on the bottom and nz the upper one
+ ! Load all variables via modules
+ ! move pencil information in a module
+ ! Set up tridiagonal system for each i and j
+ ! The system is: a(k)*v(k-1,il,jl) + b(k)*v(k,il,jl) + c(k)*v(k+1,il,jl) = d(k)
+ ! Dirichlet BC: v = 0 at k=1 and k=nz
+ ! a, b, c, d are complex; they could be declared real for optimization, but this may introduce additional data movement; better to always use the same a, b, c, d?
+ ! Same as u but just the RHS is different
+
+ use velocity
+ implicit none
+ integer :: il, jl, k
+ call c_f_pointer(c_devloc(vel_d), vel3d, piZ%shape)
+
+ !$acc parallel loop collapse(2) gang private(a, b, c, d, factor, sol)
+ do jl = 1, npy
+ do il = 1, npx
+
+ !inner nodes
+ do k = 2, nz-1
+ a(k) = 1.0d0*dzi*dzi
+ b(k) = -2.0d0*dzi*dzi
+ c(k) = 1.0d0*dzi*dzi
+ d(k) = v(k,il,jl) ! check if correct and consistent with CN
+ enddo
+
+ ! Dirichlet BC at k=1
+ a(1) = 0.0d0
+ b(1) = 1.0d0
+ c(1) = 0.0d0
+ d(1) = 0.0d0
+
+ ! Dirichlet BC at k=nz
+ a(nz) = 0.0d0
+ b(nz) = 1.0d0
+ c(nz) = 0.0d0
+ d(nz) = 0.0d0
+
+ ! Forward elimination (Thomas)
+ !$acc loop seq
+ do k = 2, nz
+ factor = a(k)/b(k-1)
+ b(k) = b(k) - factor*c(k-1)
+ d(k) = d(k) - factor*d(k-1)
+ end do
+
+ ! Back substitution
+ sol(nz) = d(nz)/b(nz)
+ ! check on pivot like flutas?
+ !$acc loop seq
+ do k = nz-1, 1, -1
+ sol(k) = (d(k) - c(k)*sol(k+1))/b(k)
+ end do
+
+ ! Store solution in the array used for the back transform
+ do k=1,nz
+ vel3d(k,il,jl) = sol(k)
+ enddo
+ end do
+ end do
+end subroutine
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+subroutine tdmaw
+ ! TDMA for w velocity, solve from 1 to nz+1 where 1 is the w collocation point on the bottom wall and nz+1 the one on the top wall
+ ! Load all variables via modules
+ ! move pencil information in a module
+ ! Set up tridiagonal system for each i and j
+ ! The system is: a(k)*w(k-1,il,jl) + b(k)*w(k,il,jl) + c(k)*w(k+1,il,jl) = d(k)
+ ! Dirichlet BC: w = 0 at k=1 and k=nz+1
+ ! a, b, c, d are complex; they could be declared real for optimization, but this may introduce additional data movement; better to always use the same a, b, c, d?
+
+ use velocity
+ implicit none
+ integer :: il, jl, k
+ call c_f_pointer(c_devloc(vel_d), vel3d, piZ%shape)
+
+ !$acc parallel loop collapse(2) gang private(a, b, c, d, factor, sol)
+ do jl = 1, npy
+ do il = 1, npx
+
+ !inner nodes
+ do k = 2, nz
+ a(k) = 1.0d0*dzi*dzi
+ b(k) = -2.0d0*dzi*dzi
+ c(k) = 1.0d0*dzi*dzi
+ d(k) = w(k,il,jl) ! check if correct and consistent with CN
+ enddo
+
+ ! Dirichlet BC at k=1
+ a(1) = 0.0d0
+ b(1) = 1.0d0
+ c(1) = 0.0d0
+ d(1) = 0.0d0
+
+ ! Dirichlet BC at k=nz+1
+ a(nz+1) = 0.0d0
+ b(nz+1) = 1.0d0
+ c(nz+1) = 0.0d0
+ d(nz+1) = 0.0d0
+
+ ! Forward elimination (Thomas)
+ !$acc loop seq
+ do k = 2, nz+1
+ factor = a(k)/b(k-1)
+ b(k) = b(k) - factor*c(k-1)
+ d(k) = d(k) - factor*d(k-1)
+ end do
+
+ ! Back substitution
+ sol(nz+1) = d(nz+1)/b(nz+1)
+ ! check on pivot like flutas?
+ !$acc loop seq
+ do k = nz, 1, -1
+ sol(k) = (d(k) - c(k)*sol(k+1))/b(k)
+ end do
+
+ ! Store solution in the array used for the back transform
+ ! only up to nz, because the halo is not transposed; it could be eventually
+ do k=1,nz
+ vel3d(k,il,jl) = sol(k)
+ enddo
+ end do
+ end do
+end subroutine
+
+
+
+
+
+
+
+
+subroutine tdmap
+ ! TDMA for pressure, solve from 0 to nz+1 where 0 is the first ghost node and nz+1 the upper ghost node
+ ! When ready, remove it from the main code and just call this function
+ ! Load all variables via modules
+ ! move pencil information in a module
+ ! Set up tridiagonal system for each i and j
+ ! The system is: a(k)*pc(k-1,ky,kx) + b(k)*pc(k,ky,kx) + c(k)*pc(k+1,ky,kx) = rhs(k,ky,kx)
+ ! Neumann BC: d/dz pc = 0 at w collocation points
+ ! Fill diagonals and rhs for each
+ ! 0 and ny+1 are the ghost nodes
+ ! a, b, c and are complex (because of FFT)
+
+ use velocity
+
+ implicit none
+
+ offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1
+ offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1
+ offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1
+ call c_f_pointer(c_devloc(psi_d), psi3d, piZ_d2z%shape)
+
+ xoff = offsets(1)
+ yoff = offsets(2)
+ npx = np(1)
+ npy = np(2)
+
+ !$acc parallel loop collapse(2) gang private(a, b, c, d, factor, sol)
+ do jl = 1, npy
+ do il = 1, npx
+ ! compute index global wavenumber ig and jg
+ jg = yoff + jl
+ ig = xoff + il
+
+ do k = 1, nz
+ a(k) = 1.0d0*dzi*dzi
+ b(k) = -2.0d0*dzi*dzi - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg)
+ c(k) = 1.0d0*dzi*dzi
+ d(k) = psi3d(k,il,jl)
+ enddo
+
+ ! Neumann BC at bottom
+ a(0) = 0.0d0
+ b(0) = -1.0d0*dzi*dzi - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg)
+ c(0) = 1.0d0*dzi*dzi
+ d(0) = 0.0d0
+
+ ! ghost node elimination trick
+ ! Neumann BC at top
+ a(nz+1) = 1.0d0*dzi*dzi
+ b(nz+1) = -1.0d0*dzi*dzi - kx_d(ig)*kx_d(ig) - ky_d(jg)*ky_d(jg)
+ c(nz+1) = 0.0d0
+ d(nz+1) = 0.0d0
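+ ! (ghost rows: the missing-neighbour coefficient is absorbed into the diagonal,
+ ! which enforces a zero normal derivative of the pressure across each wall)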
+
+ ! Enforce pressure at one point? one interior point, avoiding messing up the BC
+ ! need brackets?
+ if (ig == 1 .and. jg == 1) then
+ a(1) = 0.d0
+ b(1) = 1.d0
+ c(1) = 0.d0
+ d(1) = 0.d0
+ end if
+
+ ! Forward elimination (Thomas)
+ !$acc loop seq
+ do k = 1, nz+1
+ factor = a(k)/b(k-1)
+ b(k) = b(k) - factor*c(k-1)
+ d(k) = d(k) - factor*d(k-1)
+ end do
+
+ ! Back substitution
+ sol(nz+1) = d(nz+1)/b(nz+1)
+ ! check on pivot like flutas?
+ !$acc loop seq
+ do k = nz, 0, -1
+ sol(k) = (d(k) - c(k)*sol(k+1))/b(k)
+ end do
+
+ ! Store solution in the array that does the backward FFT
+ do k=1,nz
+ psi3d(k,il,jl) = sol(k)
+ enddo
+ end do
+ end do
+
+
+end subroutine
\ No newline at end of file
diff --git a/channel/testpush.sh b/channel/testpush.sh
new file mode 100644
index 0000000..2e0e21a
--- /dev/null
+++ b/channel/testpush.sh
@@ -0,0 +1,2 @@
+make clean
+make
diff --git a/cuDecomp b/cuDecomp
index 7968153..907b18e 160000
--- a/cuDecomp
+++ b/cuDecomp
@@ -1 +1 @@
-Subproject commit 796815382081da97054602aed68baae88b658ee9
+Subproject commit 907b18e901321c1ca6e6142ae4c2a4cc3ff49030
diff --git a/hit/Makefile b/hit/Makefile
new file mode 100644
index 0000000..0ee5198
--- /dev/null
+++ b/hit/Makefile
@@ -0,0 +1,33 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+
+# Paths (Modify if necessary)
+ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))
+$(info $$ROOT_DIR is [${ROOT_DIR}])
+PARENT_DIR := $(abspath $(ROOT_DIR)/..)
+$(info $$PARENT_DIR is [${PARENT_DIR}])
+CUDECOMP_DIR = $(PARENT_DIR)/cuDecomp/build
+CUDA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none
+EXTRA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none/compat
+
+export $(CUDECOMP_DIR)/lib
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=mem:managed -Minfo=accel -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include/
+LIBS = -L$(CUDECOMP_DIR)/lib/ -L$(CUDA_DIR)/lib64/ -lcudecomp_fort -lcudecomp -cudalib=cufft -lnvToolsExt
+
+# Source and Object Files
+OBJS = module.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+ $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+ $(RM) $(EXEC)
+
+.SUFFIXES: .o
+
+.f90.o:
+ $(FC) -c $(FFLAGS) $<
diff --git a/hit/Makefile_leonardo b/hit/Makefile_leonardo
new file mode 100755
index 0000000..061dbd9
--- /dev/null
+++ b/hit/Makefile_leonardo
@@ -0,0 +1,33 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+
+# Paths (Modify if necessary)
+ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))
+$(info $$ROOT_DIR is [${ROOT_DIR}])
+PARENT_DIR := $(abspath $(ROOT_DIR)/..)
+$(info $$PARENT_DIR is [${PARENT_DIR}])
+CUDECOMP_DIR = $(PARENT_DIR)/cuDecomp/build
+CUDA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none
+EXTRA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none/compat
+
+export $(CUDECOMP_DIR)/lib
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=mem:managed -Minfo=accel -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include/
+LIBS = -L$(CUDECOMP_DIR)/lib/ -L$(CUDA_DIR)/lib64/ -lcudecomp_fort -lcudecomp -cudalib=cufft -lnvToolsExt
+
+# Source and Object Files
+OBJS = module.f90 particles.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+ $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+ $(RM) $(EXEC)
+
+.SUFFIXES: .o
+
+.f90.o:
+ $(FC) -c $(FFLAGS) $<
diff --git a/hit/Makefile_local b/hit/Makefile_local
new file mode 100644
index 0000000..08156c7
--- /dev/null
+++ b/hit/Makefile_local
@@ -0,0 +1,28 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+RM = /bin/rm -f
+
+# Paths (Modify if necessary)
+CUDECOMP_DIR = /home/milton/MHIT36/cuDecomp/build/
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=mem:managed -Minfo=accel -Mfree -Mpreprocess -cpp -Minfo=accel -cuda -I$(CUDECOMP_DIR)/include
+LIBS = -L$(CUDECOMP_DIR)/lib -L/usr/local/cuda/lib64 -lcudecomp_fort -lcudecomp -cudalib=cufft -lcuda -lnvToolsExt
+
+# Source and Object Files
+OBJS = module.f90 particles.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+ $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+ $(RM) $(EXEC)
+
+.SUFFIXES: .o
+
+.f90.o:
+ $(FC) -c $(FFLAGS) $<
+
+
diff --git a/hit/Makefile_mn5 b/hit/Makefile_mn5
new file mode 100644
index 0000000..fe0febe
--- /dev/null
+++ b/hit/Makefile_mn5
@@ -0,0 +1,33 @@
+# Compiler
+FC = mpif90
+LD = $(FC)
+
+# Paths (Modify if necessary)
+ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))
+$(info $$ROOT_DIR is [${ROOT_DIR}])
+PARENT_DIR := $(abspath $(ROOT_DIR)/..)
+$(info $$PARENT_DIR is [${PARENT_DIR}])
+CUDECOMP_DIR = $(PARENT_DIR)/cuDecomp/build
+CUDA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none
+EXTRA_DIR = /leonardo/prod/opt/compilers/cuda/12.3/none/compat
+
+export $(CUDECOMP_DIR)/lib
+
+# Compiler and Linker Flags
+FFLAGS = -fast -acc -gpu=managed -Mfree -Mpreprocess -cpp -cuda -I$(CUDECOMP_DIR)/include/
+LIBS = -L$(CUDECOMP_DIR)/lib/ -L$(CUDA_DIR)/lib64/ -lcudecomp_fort -lcudecomp -cudalib=cufft -lnvToolsExt
+
+# Source and Object Files
+OBJS = module.f90 main.f90 readinput.f90 readwrite.f90
+EXEC = mhit36
+
+$(EXEC): $(OBJS)
+ $(LD) $(FFLAGS) $(OBJS) $(LIBS) -o $@
+
+clean:
+ $(RM) $(EXEC)
+
+.SUFFIXES: .o
+
+.f90.o:
+ $(FC) -c $(FFLAGS) $<
diff --git a/hit/binder.sh b/hit/binder.sh
new file mode 100755
index 0000000..b449dda
--- /dev/null
+++ b/hit/binder.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+case $(( ${OMPI_COMM_WORLD_LOCAL_RANK} )) in
+0) export UCX_NET_DEVICES=mlx5_0:1 ;;
+1) export UCX_NET_DEVICES=mlx5_1:1 ;;
+2) export UCX_NET_DEVICES=mlx5_2:1 ;;
+3) export UCX_NET_DEVICES=mlx5_3:1 ;;
+esac
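+
+# each local rank is pinned to a dedicated IB adapter (GPU-NIC affinity);
+# the actual command, passed as arguments, is then executed below via $*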
+
+echo Launching on $UCX_NET_DEVICES
+
+$*
+
diff --git a/hit/go_leo.sh b/hit/go_leo.sh
new file mode 100644
index 0000000..7545b12
--- /dev/null
+++ b/hit/go_leo.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#SBATCH --account="IscrB_EXCEED"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:05:00
+#SBATCH --nodes=1 ##adjust
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=8
+#SBATCH --output=test.out
+#SBATCH -p boost_usr_prod
+#SBATCH --error=test.err
+#SBATCH --qos=boost_qos_dbg
+
+#module load nvhpc/25.3
+#module load cuda/12.3
+#module load openmpi/4.1.6--nvhpc--24.3
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/aroccon0/MHIT36_cuDecomp/cuDecomp/build/lib:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/lenzenbe/RE95_256_cuDec/cuDecomp/build/lib:$LD_LIBRARY_PATH
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+chmod +x binder.sh
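+# --map-by node:PE=8 reserves 8 cores per rank; binder.sh then pins one NIC per local rank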
+mpirun -np 4 --map-by node:PE=8 --rank-by core ./binder.sh ./mhit36
\ No newline at end of file
diff --git a/hit/go_leo_bprod.sh b/hit/go_leo_bprod.sh
new file mode 100644
index 0000000..15c7b1a
--- /dev/null
+++ b/hit/go_leo_bprod.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#SBATCH --account="IscrB_SONORA"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:10:00
+#SBATCH --nodes=128 ##adjust
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=8 ## overrides the default thread limit
+#SBATCH --output=test.out
+#SBATCH --partition=boost_usr_prod
+#SBATCH --qos=boost_qos_bprod
+#SBATCH --error=test.err
+
+module load nvhpc/24.3
+module load cuda/12.3
+module load openmpi/4.1.6--nvhpc--24.3
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/aroccon0/MHIT36_cuDecomp/cuDecomp/build/lib:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/lenzenbe/RE95_256_cuDec/cuDecomp/build/lib:$LD_LIBRARY_PATH
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+#export OMP_NUM_THREADS=16
+export OMP_PROC_BIND=spread
+export OMP_PLACES=cores
+
+mpirun -n 512 ./mhit36
diff --git a/hit/go_leo_prof.sh b/hit/go_leo_prof.sh
new file mode 100755
index 0000000..ebcb52e
--- /dev/null
+++ b/hit/go_leo_prof.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#SBATCH --account="tra25_openhack"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:10:00
+#SBATCH --nodes=1 ##adjust
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --output=test.out
+#SBATCH -p boost_usr_prod
+#SBATCH --qos=boost_qos_dbg
+#SBATCH --error=test.err
+#### if mapping is on
+#SBATCH --cpus-per-task=8
+
+
+#module load nvhpc/24.3
+#module load cuda/12.3
+#module load openmpi/4.1.6--nvhpc--24.3
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/aroccon0/MHIT36_cuDecomp/cuDecomp/build/lib:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=/leonardo_scratch/large/userexternal/lenzenbe/RE95_256_cuDec/cuDecomp/build/lib:$LD_LIBRARY_PATH
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+# simple run
+#mpirun -n 4 ./mhit36
+
+# profile on single node (all ranks inside one profile)
+#nsys profile -t cuda,nvtx,mpi,openacc mpirun -np 4 ./mhit36
+
+# profile (one report per rank)
+#mpirun -n 4 nsys profile --trace=cuda,nvtx,mpi -o profile_output_%q{SLURM_PROCID} --stats=true ./mhit36
+#mpirun -n 4 nsys profile -t cuda,nvtx,mpi -o report.$SLURM_LOCALID ./mhit36
+#srun -n 4 nsys profile -t cuda,nvtx,mpi --output=nsys_report_rank%t ./mhit36
+#nsys profile --multiprocess=true -t cuda,nvtx,mpi -o report ./mhit36
+
+# profile + node mapping + nic
+mpirun -np 4 --map-by node:PE=8 --rank-by core nsys profile -t cuda,nvtx,mpi,openacc --nic-metrics=true ./binder.sh ./mhit36
+
+# for nsight compute report
+#mpirun -n 4 ncu --kernel-name main_659 --set=full --import-source=yes -o profile -f --launch-skip 3 --launch-count 1 "./mhit36"
+
+# for nsight compute report - all kernels
+# mpirun -n 4 ncu --kernel-name regex:main_ --set=full --import-source=yes --launch-skip 70 --launch-count 18 -o reportall.%p ./mhit36
+
+# for nsight compute report - all kernels + mapping + nic
+# mpirun -np 4 --map-by node:PE=8 --rank-by core ncu --kernel-name regex:main_ --set=full --import-source=yes --launch-skip 70 --launch-count 18 -o reportall.%p ./binder.sh ./mhit36
+
diff --git a/hit/go_mn5.sh b/hit/go_mn5.sh
new file mode 100644
index 0000000..7855336
--- /dev/null
+++ b/hit/go_mn5.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#SBATCH --ntasks=16 # use --ntasks=8 when running on two nodes
+#SBATCH --account="ehpc244"
+#SBATCH --job-name="cudec"
+#SBATCH --time=00:05:00
+#SBATCH --gres=gpu:4
+#SBATCH --cpus-per-task=20
+#SBATCH --output=test.out
+#SBATCH -p boost_usr_prod
+#SBATCH --qos=acc_debug
+#SBATCH --error=test.err
+
+module load nvidia-hpc-sdk/24.3
+CURRENT_DIR="$(pwd)"
+ROOT_DIR="$(dirname "$CURRENT_DIR")/cuDecomp/build/lib"
+echo "Using directory: $ROOT_DIR"
+export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
+
+
+mpirun -n 8 ./mhit36
\ No newline at end of file
diff --git a/hit/input.inp b/hit/input.inp
new file mode 100644
index 0000000..845aab6
--- /dev/null
+++ b/hit/input.inp
@@ -0,0 +1,18 @@
+0 ! fresh start=0, restart=1
+0 ! iteration to start from (0 for a fresh start, otherwise the restart iteration)
+10000 ! Final time step
+100000 ! Saving frequency
+0 ! inflow condition (0=Taylor-Green, 1=read from fields)
+0 ! phase-field initial condition (0=spherical drop, 1=read from fields)
+0.0002 ! dt for the simulation
+0.006 ! mu (viscosity)
+1.000 ! density
+1.0 ! A forcing parameter (ABC forcing)
+1.0 ! B forcing parameter (ABC forcing)
+1.0 ! C forcing parameter (ABC forcing)
+2.0 ! Forced wavenumber (ABC forcing)
+0.5 ! Initial drop radius for phase-field
+1.0d0 ! Surface tension
+1.0 ! Ratio eps/dx
+10000 ! Number of particles
+1 ! Particle initial condition (0 from file, 1 random, 2 random in drops)
diff --git a/hit/leo.sh b/hit/leo.sh
new file mode 100755
index 0000000..364635a
--- /dev/null
+++ b/hit/leo.sh
@@ -0,0 +1,12 @@
+# old modules - if you use these, change -gpu=mem:managed to -gpu=managed in the Makefile
+#module load nvhpc/24.3
+#module load cuda/12.3
+#module load openmpi/4.1.6--nvhpc--24.3
+module load profile/candidate
+module load nvhpc/25.3
+module load hpcx-mpi/2.19
+cp Makefile_leonardo Makefile
+make clean
+make
+mkdir -p output
+# mpirun -np 4 ./mhit36
diff --git a/hit/local.sh b/hit/local.sh
new file mode 100644
index 0000000..4f27984
--- /dev/null
+++ b/hit/local.sh
@@ -0,0 +1,15 @@
+NVARCH=Linux_x86_64; export NVARCH
+NVCOMPILERS=/opt/nvidia/hpc_sdk; export NVCOMPILERS
+MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/compilers/man; export MANPATH
+PATH=$NVCOMPILERS/$NVARCH/25.7/compilers/bin:$PATH; export PATH
+export PATH=$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/bin:$PATH
+export MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/25.7/comm_libs/mpi/man
+export LD_LIBRARY_PATH=/home/milton/MHIT36/cuDecomp/build/lib
+# clean the output folder
+rm -rf output
+mkdir output
+cp Makefile_local Makefile
+#rm *.dat
+make clean
+make
+mpirun -np 2 ./mhit36
diff --git a/hit/main.f90 b/hit/main.f90
new file mode 100644
index 0000000..beb351f
--- /dev/null
+++ b/hit/main.f90
@@ -0,0 +1,1495 @@
+#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1)
+
+program main
+use cudafor
+use cudecomp
+use cufft
+use mpi
+use velocity
+use phase
+use particles
+use param
+use mpivar
+use cudecompvar
+
+use, intrinsic :: ieee_arithmetic
+
+implicit none
+! timer for scaling test
+real :: t_start, t_end, elapsed
+! grid dimensions
+integer :: comm_backend
+integer :: pr, pc
+! cudecomp
+! cuFFT
+integer :: planXf, planXb
+integer :: planY, planZ
+integer :: batchsize
+integer :: status
+integer :: i,j,k,il,jl,kl,ig,jg,kg,t
+integer :: im,ip,jm,jp,km,kp,last
+integer :: inY,enY,inZ,enZ
+!beginDEBUG
+integer, parameter :: Mx = 1, My = 2, Mz = 1
+!endDEBUG
+real(8), device, allocatable :: kx_d(:)
+! working arrays
+complex(8), allocatable :: psi(:)
+real(8), allocatable :: ua(:,:,:)
+real(8), allocatable :: uaa(:,:,:)
+
+real(8), allocatable :: psi_real(:)
+! real(8), device, allocatable :: psi_real_d(:)
+complex(8), device, allocatable :: psi_d(:)
+complex(8), pointer, device, contiguous :: work_d(:), work_halo_d(:), work_d_d2z(:), work_halo_d_d2z(:)
+character(len=40) :: namefile
+character(len=4) :: itcount
+! Code variables
+
+real(8)::err,maxErr
+
+complex(8), device, pointer :: phi3d(:,:,:)
+real(8) :: k2
+!integer :: il, jl, ig, jg
+integer :: offsets(3), xoff, yoff
+integer :: np(3)
+
+! Enable or disable the phase field (enabled by default)
+#define phiflag 1
+! Enable or disable particle Lagrangian tracking (tracers)
+#define partflag 0
+
+!########################################################################################################################################
+! 1. INITIALIZATION OF MPI AND cuDECOMP AUTOTUNING : START
+!########################################################################################################################################
+! MPI initialization: rank holds the local MPI rank, ranks the total number of processes
+! Same procedure as described in the cuDecomp documentation
+call mpi_init(ierr)
+call mpi_comm_rank(MPI_COMM_WORLD, rank, ierr)
+call mpi_comm_size(MPI_COMM_WORLD, ranks, ierr)
+
+call mpi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, localComm, ierr)
+call mpi_comm_rank(localComm, localRank, ierr)
+ierr = cudaSetDevice(localRank) !assign GPU to MPI rank
+
+! Define grid and decomposition
+call readinput
+
+! number of processes along each direction (r means y and c means z)
+pr = 0
+pc = 0
+halo_ext=1
+! comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P
+comm_backend = 0 ! Enable full autotuning
+
+CHECK_CUDECOMP_EXIT(cudecompInit(handle, MPI_COMM_WORLD))
+
+! config is a struct and pr and pc are the number of pencils along the two directions
+! gdims is the global grid
+! create an uninitialized configuration struct and initialize it to defaults using cudecompGridDescConfigSetDefaults.
+! Initializing to default values is required to ensure no entries are left uninitialized.
+CHECK_CUDECOMP_EXIT(cudecompGridDescConfigSetDefaults(config))
+pdims = [pr, pc] ! pr and pc are the number of pencils along the two decomposed directions
+config%pdims = pdims
+! gdims = [nx, ny, nz]
+! config%gdims = gdims
+halo = [0, halo_ext, halo_ext] ! no halo needed along x: it is periodic and x-pencils are used in physical space
+! for transpositions
+config%transpose_comm_backend = comm_backend
+config%transpose_axis_contiguous = .true.
+! for halo exchanges
+config%halo_comm_backend = CUDECOMP_HALO_COMM_MPI
+! Setting for periodic halos in all directions (not required to be in config)
+halo_periods = [.true., .true., .true.]
+
+! create spectral grid descriptor first to select pdims for optimal transposes
+gdims = [nx/2+1, ny, nz]
+config%gdims = gdims
+
+! Set up autotuning options for spectral grid (transpose related settings)
+CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
+options%dtype = CUDECOMP_DOUBLE_COMPLEX
+if (comm_backend == 0) then
+ options%autotune_transpose_backend = .true.
+ options%autotune_halo_backend = .false.
+endif
+options%transpose_use_inplace_buffers = .true.
+options%transpose_input_halo_extents(:, 1) = halo
+options%transpose_output_halo_extents(:, 4) = halo
+
+CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_descD2Z, config, options))
+
+! create physical grid descriptor
+! take previous config and modify the global grid (nx instead of nx/2+1)
+! reset transpose_comm_backend to default value to avoid picking up possible nvshmem
+! transpose backend selection (this impacts how workspaces are allocated)
+gdims = [nx, ny, nz]
+config%gdims = gdims
+config%transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P
+
+! Set up autotuning options for physical grid (halo related settings)
+CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
+options%dtype = CUDECOMP_DOUBLE_COMPLEX
+if (comm_backend == 0) then
+ options%autotune_halo_backend = .true.
+endif
+options%halo_extents(:) = halo
+options%halo_periods(:) = halo_periods
+options%halo_axis = 1
+CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_desc, config, options))
+
+!Print information on configuration
+! issue with NVHPC > 25.X, simply avoid this call (not critical)
+!if (rank == 0) then
+! write(*,"(' Running on ', i0, ' x ', i0, ' process grid ...')") config%pdims(1), config%pdims(2)
+! write(*,"(' Using ', a, ' transpose backend ...')") &
+! cudecompTransposeCommBackendToString(config%transpose_comm_backend)
+! write(*,"(' Using ', a, ' halo backend ...')") &
+! cudecompHaloCommBackendToString(config%halo_comm_backend)
+!endif
+
+! Get pencil info for the grid descriptor in the physical space
+! This function returns a pencil struct (piX, piY or piZ) that contains the shape, global lower and upper index bounds (lo and hi),
+! size of the pencil, and an order array to indicate the memory layout that will be used (to handle permuted, axis-contiguous layouts).
+! Additionally, there is a halo_extents data member that indicates the depth of halos for the pencil, by axis.
+! If no halo regions are necessary, a NULL pointer can be provided in place of this array (or omitted)
+! Pencil info in x-configuration present in PiX (shape,lo,hi,halo_extents,size)
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piX, 1, halo))
+nElemX = piX%size !<- total number of elements in the x-configuration (including halos)
+! Pencil info in Y-configuration present in PiY
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piY, 2))
+nElemY = piY%size
+! Pencil info in Z-configuration present in PiZ
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piZ, 3))
+nElemZ = piZ%size
+
+! Get workspace sizes for transpose (1st row, not used) and halo (2nd row, used)
+CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_desc, nElemWork))
+CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_desc, 1, halo, nElemWork_halo))
+
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piX_d2z, 1,halo))
+nElemX_d2z = piX_d2z%size !<- total number of elements in the x-configuration (including halos)
+! Pencil info in Y-configuration present in PiY
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piY_d2z, 2))
+nElemY_d2z = piY_d2z%size
+! Pencil info in Z-configuration present in PiZ
+CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piZ_d2z, 3))
+nElemZ_d2z = piZ_d2z%size
+! Get workspace sizes for transpose (1st row, used) and halo (2nd row, not used)
+CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_descD2Z, nElemWork_d2z))
+CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_descD2Z, 1, halo, nElemWork_halo_d2z))
+
+! Get the global rank of neighboring processes in PiX config (required only for particles)
+CHECK_CUDECOMP_EXIT(cudecompGetShiftedRank(handle, grid_desc, 1, 2, 1 , .true. , nidp1y))
+CHECK_CUDECOMP_EXIT(cudecompGetShiftedRank(handle, grid_desc, 1, 2, -1 , .true. , nidm1y))
+CHECK_CUDECOMP_EXIT(cudecompGetShiftedRank(handle, grid_desc, 1, 3, 1 , .true. , nidp1z))
+CHECK_CUDECOMP_EXIT(cudecompGetShiftedRank(handle, grid_desc, 1, 3, -1 , .true. , nidm1z))
+
+
+! CUFFT initialization -- Create Plans
+! Forward 1D FFT in X: D2Z
+batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFTs (from the x-pencil dimensions)
+status = cufftPlan1D(planXf, nx, CUFFT_D2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan Forward'
+
+! Backward 1D FFT in X: Z2D
+batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFTs (from the x-pencil dimensions)
+status = cufftPlan1D(planXb, nx, CUFFT_Z2D, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan Backward'
+
+! always shape(2) and shape(3) because y-pencils are stored with coordinates (y,z,x)
+batchSize = piY_d2z%shape(2)*piY_d2z%shape(3)
+status = cufftPlan1D(planY, ny, CUFFT_Z2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Y plan Forward & Backward'
+
+! always shape(2) and shape(3) because z-pencils are stored with coordinates (z,y,x)
+batchSize = piZ_d2z%shape(2)*piZ_d2z%shape(3)
+status = cufftPlan1D(planZ, nz, CUFFT_Z2Z, batchSize)
+if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Z plan Forward & Backward'
+
+
+
+! define grid
+allocate(x(nx),x_ext(nx+1))
+x(1)= 0
+do i = 2, nx
+ x(i) = x(i-1) + dx
+enddo
+
+x_ext(1:nx) = x(1:nx)
+x_ext(nx+1) = lx
+
+! Offsets in the X-pencil decomposition
+pix_yoff = pix%lo(2)-1
+pix_zoff = pix%lo(3)-1
+
+! Boundaries of each PiX pencil
+yinf = x_ext( pix%lo(2) )
+ysup = x_ext( pix%hi(2) + 1 )
+zinf = x_ext( pix%lo(3) )
+zsup = x_ext( pix%hi(3) + 1 )
+
+! Physical size of each PiX pencil
+lyloc = ysup - yinf
+lzloc = zsup - zinf
+
+! define wavenumbers
+allocate(kx(nx))
+do i = 1, nx/2
+ kx(i) = (i-1)*(twoPi/lx)
+enddo
+do i = nx/2+1, nx
+ kx(i) = (i-1-nx)*(twoPi/LX)
+enddo
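+! kx follows the standard FFT ordering: 0, 1, ..., nx/2-1, -nx/2, ..., -1 (in units of 2*pi/lx)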
+! allocate kx_d on the device (later on: remove and use OpenACC + managed memory?)
+allocate(kx_d, source=kx)
+
+allocate(mysin(nx), mycos(nx))
+do i=1,nx
+  ! precompute sin and cos (used by the ABC forcing) to avoid repeated evaluations
+  mysin(i)=sin(k0*x(i)+dx/2)
+  mycos(i)=cos(k0*x(i)+dx/2)
+enddo
+
+! Initial distribution of particles among the processes
+nploc = npart/ranks
+nplocmax = nploc*2
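+! nplocmax gives 2x headroom so that particles migrating in from neighbouring ranks fit without reallocation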
+
+!########################################################################################################################################
+! 1. INITIALIZATION AND cuDECOMP AUTOTUNING : END
+!########################################################################################################################################
+
+
+
+!########################################################################################################################################
+! START STEP 2: ALLOCATE ARRAYS
+!########################################################################################################################################
+
+! allocate arrays
+allocate(psi(max(nElemX, nElemY, nElemZ))) ! largest among the pencils
+allocate(psi_real(max(nElemX, nElemY, nElemZ))) ! largest among the pencils
+allocate(psi_d(max(nElemX_d2z, nElemY_d2z, nElemZ_d2z))) ! psi on the device
+allocate(ua(nx, piX%shape(2), piX%shape(3)))
+
+! Pressure variable
+allocate(rhsp(piX%shape(1), piX%shape(2), piX%shape(3)))
+allocate(p(piX%shape(1), piX%shape(2), piX%shape(3)))
+
+!allocate variables
+!NS variables
+allocate(u(piX%shape(1),piX%shape(2),piX%shape(3)),v(piX%shape(1),piX%shape(2),piX%shape(3)),w(piX%shape(1),piX%shape(2),piX%shape(3))) !velocity vector
+! allocate(ustar(piX%shape(1),piX%shape(2),piX%shape(3)),vstar(piX%shape(1),piX%shape(2),piX%shape(3)),wstar(piX%shape(1),piX%shape(2),piX%shape(3))) ! provisional velocity field
+allocate(rhsu(piX%shape(1),piX%shape(2),piX%shape(3)),rhsv(piX%shape(1),piX%shape(2),piX%shape(3)),rhsw(piX%shape(1),piX%shape(2),piX%shape(3))) ! right hand side u,v,w
+allocate(rhsu_o(piX%shape(1),piX%shape(2),piX%shape(3)),rhsv_o(piX%shape(1),piX%shape(2),piX%shape(3)),rhsw_o(piX%shape(1),piX%shape(2),piX%shape(3))) ! right hand side u,v,w
+allocate(div(piX%shape(1),piX%shape(2),piX%shape(3)))
+!PFM variables
+#if phiflag == 1
+ allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(normx_f(piX%shape(1),piX%shape(2),piX%shape(3)),normy_f(piX%shape(1),piX%shape(2),piX%shape(3)),normz_f(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces
+ allocate(phi_tmp(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik2(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik3(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik4(piX%shape(1),piX%shape(2),piX%shape(3)))
+#endif
+
+! allocate arrays for transpositions and halo exchanges
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_d, nElemWork))
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_halo_d, nElemWork_halo))
+! allocate arrays for transpositions
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_descD2Z, work_d_d2z, nElemWork_d2z))
+CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_descD2Z, work_halo_d_d2z, nElemWork_halo_d2z))
+
+#if partflag == 1
+! Particle variables
+allocate(part(1:nplocmax,1:ninfop))
+allocate(partbuff(1:nplocmax,1:ninfop))
+allocate(vec_p(1:nplocmax))
+allocate(order_p(1:nplocmax))
+allocate(buffvar1(1:ninfop,1:nploc))
+allocate(buffvar2(1:ninfop,1:nploc))
+#endif
+!########################################################################################################################################
+! END STEP2: ALLOCATE ARRAYS
+!########################################################################################################################################
+
+
+
+!########################################################################################################################################
+! START STEP 3: FLOW, PHASE FIELD AND PARTICLES INIT
+!########################################################################################################################################
+! 3.1 Read/initialize the data without halo grid points (avoids out-of-bounds accesses when reading using MPI I/O)
+! 3.2 Call halo exchanges along Y and Z for u,v,w and phi
+if (restart .eq. 0) then !fresh start Taylor Green or read from file in init folder
+ if (rank.eq.0) write(*,*) "Initialize velocity field (fresh start)"
+ if (inflow .eq. 0) then
+ if (rank.eq.0) write(*,*) "Initialize Taylor-green"
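+      ! Taylor-Green vortex: u = sin(x)cos(y)cos(z), v = -cos(x)sin(y)cos(z), w = 0 (the dx/2 shifts place u and v on the staggered faces)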
+ do k = 1+halo_ext, piX%shape(3)-halo_ext
+ kg = piX%lo(3) + k - 1
+ do j = 1+halo_ext, piX%shape(2)-halo_ext
+ jg = piX%lo(2) + j - 1
+ do i = 1, piX%shape(1)
+ u(i,j,k) = sin(x(i)-dx/2)*cos(x(jg))*cos(x(kg))
+ v(i,j,k) = -cos(x(i))*sin(x(jg)-dx/2)*cos(x(kg))
+ w(i,j,k) = 0.d0
+ enddo
+ enddo
+ enddo
+ endif
+ if (inflow .eq. 1) then
+ if (rank.eq.0) write(*,*) "Initialize from data"
+ call readfield(1)
+ call readfield(2)
+ call readfield(3)
+ endif
+endif
+if (restart .eq. 1) then !restart, ignore inflow and read the tstart field
+ if (rank.eq.0) write(*,*) "Initialize velocity field (from output folder), iteration:", tstart
+ call readfield_restart(tstart,1)
+ call readfield_restart(tstart,2)
+ call readfield_restart(tstart,3)
+endif
+
+! update halo cells along y and z directions (strictly needed only when pr and pc are greater than one)
+!$acc host_data use_device(u)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+!$acc host_data use_device(v)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+!$acc host_data use_device(w)
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+!$acc end host_data
+
+! initialize phase-field
+#if phiflag == 1
+ if (restart .eq. 0) then
+ if (rank.eq.0) write(*,*) 'Initialize phase field (fresh start)'
+ if (inphi .eq. 0) then
+ if (rank.eq.0) write(*,*) 'Spherical drop'
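+        ! initial profile: phi = 0.5*(1 - tanh((r - radius)/(2*eps))) -> phi=1 inside the drop, phi=0 outside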
+ do k = 1+halo_ext, piX%shape(3)-halo_ext
+ kg = piX%lo(3) + k - 1
+ do j = 1+halo_ext, piX%shape(2)-halo_ext
+ jg = piX%lo(2) + j - 1
+ do i = 1, piX%shape(1)
+ pos=(x(i)-lx/2)**2d0 + (x(jg)-lx/2)**2d0 + (x(kg)-lx/2)**2d0
+ phi(i,j,k) = 0.5d0*(1.d0-tanh((sqrt(pos)-radius)/2/eps))
+ enddo
+ enddo
+ enddo
+ endif
+ if (inphi .eq. 1) then
+ if (rank.eq.0) write(*,*) "Initialize phase-field from data"
+ call readfield(5)
+ endif
+ endif
+ if (restart .eq. 1) then
+    if (rank.eq.0) write(*,*) "Initialize phase-field (restart, from output folder), iteration:", tstart
+ call readfield_restart(tstart,5)
+ endif
+ ! update halo
+ !$acc host_data use_device(phi)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+#endif
+
+#if partflag == 1
+ if (restart .eq. 0) then
+ if (rank.eq.0) write(*,*) 'Initialize particles (fresh start)'
+ if (inpart .eq. 1) then
+ if (rank.eq.0) write(*,*) 'Random Position whole Domain'
+ call particlegenerator(inpart)
+ endif
+ if (inpart .eq. 2) then
+ if (rank.eq.0) write(*,*) 'Random Position in Drops'
+ call particlegenerator(inpart)
+ endif
+ endif
+ ! if (restart .eq. 1) then
+ ! write(*,*) "Initialize phase-field (restart, from output folder), iteration:", tstart
+ ! call readfield_restart(tstart,5)
+ ! endif
+#endif
+
+!Save initial fields (only for a fresh start)
+if (restart .eq. 0) then
+ if (rank.eq.0) write(*,*) "Save initial fields"
+ call writefield(tstart,1)
+ call writefield(tstart,2)
+ call writefield(tstart,3)
+ call writefield(tstart,4)
+ #if phiflag == 1
+ call writefield(tstart,5)
+ #endif
+endif
+
+!########################################################################################################################################
+! END STEP 3: FLOW, PHASE FIELD AND PARTICLES INIT
+!########################################################################################################################################
+
+
+
+!########################################################################################################################################
+! START TEMPORAL LOOP: STEP 4 to 9 REPEATED AT EVERY TIME STEP
+!########################################################################################################################################
+
+! The first step uses explicit Euler (alpha=1, beta=0)
+alpha=1.0d0
+beta=0.0d0
+gumax=8.0d0
+tstart=tstart+1
+gamma=1.d0*gumax ! initial gamma for phase-field
+!$acc data copyin(piX)
+!$acc data create(rhsu_o, rhsv_o, rhsw_o)
+!$acc data copyin(mysin, mycos)
+#if partflag == 1
+!$acc data copy(part,partbuff)
+!$acc data create(vec_p, order_p)
+!$acc data create(buffvar1,buffvar2)
+#endif
+call cpu_time(t_start)
+
+! Start temporal loop
+do t=tstart,tfin
+ ! Create custom label for each marker (profiling)
+ write(itcount,'(i4)') t
+ ! Range with custom color (uncomment for profiling)
+ ! call nvtxStartRange("Iteration "//itcount,t)
+
+ if (rank.eq.0) write(*,*) "Time step",t,"of",tfin
+ call cpu_time(times)
+
+ !########################################################################################################################################
+ ! START STEP 4: PARTICLES (TRACERS)
+ !########################################################################################################################################
+ #if partflag == 1
+ ! Operations:
+    ! 4.1 Perform interpolation (consider switching to trilinear: more accurate, but considerably more expensive)
+ ! 4.2 Integrate with Adams-Bashforth
+ ! 4.3 Order and transfer in y
+ ! 4.4 Order and transfer in z
+ ! 4.5 Check Leakage of Particles
+
+ call LinearInterpolation()
+ ! Particle Tracker Integration
+    ! Two-step Adams-Bashforth (Euler for the first step)
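+    ! AB2 update: x^{n+1} = x^n + dt*(alpha*u^n - beta*u^{n-1}) with alpha=3/2, beta=1/2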
+ !$acc parallel loop collapse(2) default(present)
+ do j = 0, 2
+ do i = 1, nploc
+ part(i,Ixp+j)=part(i,Ixp+j)+&
+ dt*(alpha*part(i,Iup+j)-beta*part(i,Iup1+j))
+ enddo
+ enddo
+
+ ! Transfer in y
+ call SortPartY()
+ call CountPartTransfY()
+ call SendPartUP(2)
+ call SendPartDOWN(2)
+
+ ! Transfer in z
+ call SortPartZ()
+ call CountPartTransfZ()
+ call SendPartUP(3)
+ call SendPartDOWN(3)
+
+    ! Check for particles escaping the domain (leakage)
+ call ParticlesLeakage()
+ ! Shift data for next step
+ !$acc parallel loop collapse(2) default(present)
+ do j = 0, 2
+ do i = 1, nploc
+ part(i,Iup1+j)=part(i,Iup+j)
+ enddo
+ enddo
+ write(*,*) 'rank',rank, 'nploc',nploc
+ #endif
+ !########################################################################################################################################
+ ! END STEP 4: PARTICLES
+ !########################################################################################################################################
+
+
+ ! (uncomment for profiling)
+ ! call nvtxStartRange("Phase-field")
+ !########################################################################################################################################
+ ! START STEP 5: PHASE-FIELD SOLVER (EXPLICIT)
+ !########################################################################################################################################
+ #if phiflag == 1
+ ! 4.2 Get phi at n+1 using RK4
+ ! first stage of RK4 - saved in rhsphi
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+        val = min(phi(i,j,k),1.0d0) ! clip phi at 1: machine-precision overshoots lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
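+        ! psidi inverts the equilibrium profile phi = 0.5*(1 + tanh(psi/(2*eps))); enum regularizes the log at phi = 0 and 1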
+        ! precompute tanh of the distance function (used in the sharpening term) to avoid repeated tanh evaluations
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+    gamma=1.d0*gumax ! set gamma from the velocity scale gumax (nominally the maximum velocity magnitude)
+ if (rank.eq.0) write(*,*) "gamma:", gamma
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! convective (first six lines) and diffusive (last three lines)
+ ! flux-splitting
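+        ! first-order upwind flux at each face, e.g. along x: F_{i+1/2} = max(u,0)*phi_i + min(u,0)*phi_{i+1}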
+ rhsphi(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi(i,j,k) + min(u(ip,j,k),0.0d0)*phi(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi(i,j,k) - max(u(i,j,k),0.0d0)*phi(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi(i,j,k) + min(v(i,jp,k),0.0d0)*phi(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi(i,j,k) - max(v(i,j,k),0.0d0)*phi(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi(i,j,k) + min(w(i,j,kp),0.0d0)*phi(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi(i,j,k) - max(w(i,j,k),0.0d0)*phi(i,j,km))*dxi &
+ + gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + &
+ eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddxi + &
+ eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
+
+    ! Update normx, normy and normz halos, required before computing the normal derivatives
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+    ! Substep 2: Compute unit normals (enum = 1.e-16 is a numerical tolerance to avoid 0/0) used later for the surface tension force
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ normag = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + enum)
+ normx_f(i,j,k) = normx(i,j,k)*normag
+ normy_f(i,j,k) = normy(i,j,k)*normag
+ normz_f(i,j,k) = normz(i,j,k)*normag
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+    ! Compute the sharpening term with flux splitting, as in CaNS-Fizzy
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
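+        ! per-face sharpening flux: (gamma/4)*(1 - tanh^2(psi_face/(2*eps)))*n_face, with n_face the renormalized face normal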
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphi(i,j,k)=rhsphi(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! second stage of RK4 - saved in rhsphik2
+ !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphi)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ phi_tmp(i,j,k) = phi(i,j,k) + 0.5d0 * dt * rhsphi(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end parallel loop
+ ! 4.3 Call halo exchanges along Y and Z for phi
+ !$acc host_data use_device(phi_tmp)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+        val = min(phi_tmp(i,j,k),1.0d0) ! clip phi at 1: machine-precision overshoots lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+        ! precompute tanh of the distance function (used in the sharpening term) to avoid repeated tanh evaluations
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+        ! convective (first six lines) and diffusive (last three lines)
+ ! flux-splitting
+ rhsphik2(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+ + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+ eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+ eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
+
+    ! Update normx, normy and normz halos, required before computing the normal derivatives
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Compute sharpening term
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphik2(i,j,k)=rhsphik2(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! third stage of RK4 - saved in rhsphik3
+ !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphik2)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ phi_tmp(i,j,k) = phi(i,j,k) + 0.5d0 * dt * rhsphik2(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end parallel loop
+ ! 4.3 Call halo exchanges along Y and Z for phi
+ !$acc host_data use_device(phi_tmp)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+        val = min(phi_tmp(i,j,k),1.0d0) ! clip phi at 1: machine-precision overshoots lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+        ! precompute tanh of the distance function (used in the sharpening term) to avoid repeated tanh evaluations
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+        ! convective (first six lines) and diffusive (last three lines)
+ ! flux-splitting
+ rhsphik3(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+ + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+ eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+ eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
+
+    ! Update normx, normy and normz halos, required before computing the normal derivatives
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Compute sharpening term
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphik3(i,j,k)=rhsphik3(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+    ! fourth stage of RK4 - saved in rhsphik4
+ !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphik3)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ phi_tmp(i,j,k) = phi(i,j,k) + dt * rhsphik3(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end parallel loop
+ ! 4.3 Call halo exchanges along Y and Z for phi
+ !$acc host_data use_device(phi_tmp)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+        val = min(phi_tmp(i,j,k),1.0d0) ! clip phi at 1: machine-precision overshoots lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+        ! precompute tanh of the distance function (used in the sharpening term) to avoid repeated tanh evaluations
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+        ! convective (first six lines) and diffusive (last three lines)
+ ! flux-splitting
+ rhsphik4(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+ + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+ eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+ eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
+
+    ! Update normx, normy and normz halos, required before computing the normal derivatives
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Compute sharpening term
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphik4(i,j,k)=rhsphik4(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Update phi with RK4
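+    ! classical RK4 combination: phi^{n+1} = phi^n + (dt/6)*(k1 + 2*k2 + 2*k3 + k4)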
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ phi(i,j,k) = phi(i,j,k) + dt/6.0d0*(rhsphi(i,j,k) + 2.0d0*rhsphik2(i,j,k) + &
+ 2.0d0*rhsphik3(i,j,k) + rhsphik4(i,j,k))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! 4.3 Call halo exchanges along Y and Z for phi
+ !$acc host_data use_device(phi)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ #endif
+
+ ! (uncomment for profiling)
+ ! call nvtxEndRange
+
+ !########################################################################################################################################
+ ! END STEP 5: PHASE-FIELD SOLVER
+ !########################################################################################################################################
+
+
+ !########################################################################################################################################
+ ! START STEP 6: USTAR COMPUTATION (PROJECTION STEP)
+ !########################################################################################################################################
+ ! 6.1 compute rhs
+ ! 6.2 obtain ustar and store old rhs in rhs_o
+    ! 6.3 Call halo exchanges along Y and Z for u,v,w
+
+ ! (uncomment for profiling)
+ ! call nvtxStartRange("Projection")
+    ! 6.1a Convective and diffusive terms of NS
+ ! Loop on inner nodes
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+        ! Manual periodicity only along x (x-pencil); along y and z the halos are used
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! compute the products (conservative form)
+ h11 = (u(ip,j,k)+u(i,j,k))*(u(ip,j,k)+u(i,j,k)) - (u(i,j,k)+u(im,j,k))*(u(i,j,k)+u(im,j,k))
+ h12 = (u(i,jp,k)+u(i,j,k))*(v(i,jp,k)+v(im,jp,k)) - (u(i,j,k)+u(i,jm,k))*(v(i,j,k)+v(im,j,k))
+ h13 = (u(i,j,kp)+u(i,j,k))*(w(i,j,kp)+w(im,j,kp)) - (u(i,j,k)+u(i,j,km))*(w(i,j,k)+w(im,j,k))
+ h21 = (u(ip,j,k)+u(ip,jm,k))*(v(ip,j,k)+v(i,j,k)) - (u(i,j,k)+u(i,jm,k))*(v(i,j,k)+v(im,j,k))
+ h22 = (v(i,jp,k)+v(i,j,k))*(v(i,jp,k)+v(i,j,k)) - (v(i,j,k)+v(i,jm,k))*(v(i,j,k)+v(i,jm,k))
+ h23 = (w(i,j,kp)+w(i,jm,kp))*(v(i,j,kp)+v(i,j,k)) - (w(i,j,k)+w(i,jm,k))*(v(i,j,k)+v(i,j,km))
+ h31 = (w(ip,j,k)+w(i,j,k))*(u(ip,j,k)+u(ip,j,km)) - (w(i,j,k)+w(im,j,k))*(u(i,j,k)+u(i,j,km))
+ h32 = (v(i,jp,k)+v(i,jp,km))*(w(i,jp,k)+w(i,j,k)) - (v(i,j,k)+v(i,j,km))*(w(i,j,k)+w(i,jm,k))
+ h33 = (w(i,j,kp)+w(i,j,k))*(w(i,j,kp)+w(i,j,k)) - (w(i,j,k)+w(i,j,km))*(w(i,j,k)+w(i,j,km))
+ ! compute the derivative
+ h11=h11*0.25d0*dxi
+ h12=h12*0.25d0*dxi
+ h13=h13*0.25d0*dxi
+ h21=h21*0.25d0*dxi
+ h22=h22*0.25d0*dxi
+ h23=h23*0.25d0*dxi
+ h31=h31*0.25d0*dxi
+ h32=h32*0.25d0*dxi
+ h33=h33*0.25d0*dxi
+ ! add to the rhs
+ rhsu(i,j,k)=-(h11+h12+h13)
+ rhsv(i,j,k)=-(h21+h22+h23)
+ rhsw(i,j,k)=-(h31+h32+h33)
+        ! viscous term
+ h11 = mu*(u(ip,j,k)-2.d0*u(i,j,k)+u(im,j,k))*ddxi
+ h12 = mu*(u(i,jp,k)-2.d0*u(i,j,k)+u(i,jm,k))*ddxi
+ h13 = mu*(u(i,j,kp)-2.d0*u(i,j,k)+u(i,j,km))*ddxi
+ h21 = mu*(v(ip,j,k)-2.d0*v(i,j,k)+v(im,j,k))*ddxi
+ h22 = mu*(v(i,jp,k)-2.d0*v(i,j,k)+v(i,jm,k))*ddxi
+ h23 = mu*(v(i,j,kp)-2.d0*v(i,j,k)+v(i,j,km))*ddxi
+ h31 = mu*(w(ip,j,k)-2.d0*w(i,j,k)+w(im,j,k))*ddxi
+ h32 = mu*(w(i,jp,k)-2.d0*w(i,j,k)+w(i,jm,k))*ddxi
+ h33 = mu*(w(i,j,kp)-2.d0*w(i,j,k)+w(i,j,km))*ddxi
+ rhsu(i,j,k)=rhsu(i,j,k)+(h11+h12+h13)*rhoi
+ rhsv(i,j,k)=rhsv(i,j,k)+(h21+h22+h23)*rhoi
+ rhsw(i,j,k)=rhsw(i,j,k)+(h31+h32+h33)*rhoi
+ ! NS forcing
+ kg = piX%lo(3) + k - 1
+ jg = piX%lo(2) + j - 1
+ ! ABC forcing
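+        ! ABC body force: fx = C*sin(k0*z) + B*cos(k0*y), fy = A*sin(k0*x) + C*cos(k0*z), fz = B*sin(k0*y) + A*cos(k0*x), with (f1,f2,f3) = (A,B,C) from the input file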
+ rhsu(i,j,k)= rhsu(i,j,k) + f3*mysin(kg)+f2*mycos(jg)
+ rhsv(i,j,k)= rhsv(i,j,k) + f1*mysin(i)+f3*mycos(kg)
+ rhsw(i,j,k)= rhsw(i,j,k) + f2*mysin(jg)+f1*mycos(i)
+ enddo
+ enddo
+ enddo
+
+ ! Surface tension forces
+ #if phiflag == 1
+ !$acc kernels
+ !Obtain surface tension forces evaluated at the center of the cell (where phi is located)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! continuum-surface force implementation
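+        ! CSF model: F = -sigma*kappa*grad(phi), with curvature kappa = div(n) computed from the filtered unit normals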
+ curv=0.5d0*(normx_f(ip,j,k)-normx_f(im,j,k))*dxi + 0.5d0*(normy_f(i,jp,k)-normy_f(i,jm,k))*dxi + 0.5d0*(normz_f(i,j,kp)-normz_f(i,j,km))*dxi
+ !compute capillary forces: sigma*curvature*gradphi
+ fxst(i,j,k)= -sigma*curv*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi
+ fyst(i,j,k)= -sigma*curv*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dxi
+ fzst(i,j,k)= -sigma*curv*0.5d0*(phi(i,j,kp)-phi(i,j,km))*dxi
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Update halo of fxst, fyst and fzst (required then to interpolate at velocity points)
+ !$acc host_data use_device(fxst)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(fyst)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(fzst)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Interpolate force at velocity points
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (im .lt. 1) im=nx
+ rhsu(i,j,k)=rhsu(i,j,k) + 0.5d0*(fxst(im,j,k)+fxst(i,j,k))*rhoi
+ rhsv(i,j,k)=rhsv(i,j,k) + 0.5d0*(fyst(i,jm,k)+fyst(i,j,k))*rhoi
+ rhsw(i,j,k)=rhsw(i,j,k) + 0.5d0*(fzst(i,j,km)+fzst(i,j,k))*rhoi
+ u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k))
+ v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k))
+ w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k))
+ rhsu_o(i,j,k)=rhsu(i,j,k)
+ rhsv_o(i,j,k)=rhsv(i,j,k)
+ rhsw_o(i,j,k)=rhsw(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ #else
+ ! 6.2 find u, v and w star only in the inner nodes
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k))
+ v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k))
+ w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k))
+ rhsu_o(i,j,k)=rhsu(i,j,k)
+ rhsv_o(i,j,k)=rhsv(i,j,k)
+ rhsw_o(i,j,k)=rhsw(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ #endif
+
+ ! rhs* has already been stored in rhs*_o above; set the AB2 coefficients
+ ! (the first step uses explicit Euler, all subsequent steps use AB2)
+ alpha=1.5d0
+ beta= 0.5d0
+
+ ! 6.3 update halos (y and z directions), required to then compute the RHS of the Poisson equation because of the staggered grid
+ !$acc host_data use_device(u)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(v)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(w)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ ! (uncomment for profiling)
+ ! call nvtxEndRange
+ !########################################################################################################################################
+ ! END STEP 6: USTAR COMPUTATION
+ !########################################################################################################################################
+
+
+
+ !########################################################################################################################################
+ ! START STEP 7: POISSON SOLVER FOR PRESSURE
+ !########################################################################################################################################
+ ! initialize rhs and analytical solution
+ ! 7.1 Compute rhs of the Poisson equation, div(u*), evaluated as the divergence at the cell center
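+ ! i.e. solve laplacian(p) = (rho/dt)*div(u*) so that the corrected velocity field is divergence-free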
+ ! (uncomment for profiling)
+ ! call nvtxStartRange("Poisson")
+ ! call nvtxStartRange("compute RHS")
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ if (ip > nx) ip=1
+ rhsp(i,j,k) = (rho*dxi/dt)*(u(ip,j,k)-u(i,j,k))
+ rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(v(i,jp,k)-v(i,j,k))
+ rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(w(i,j,kp)-w(i,j,k))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ ! (uncomment for profiling)
+ ! call nvtxEndRange
+ ! call nvtxStartRange("FFT forward w/ transpositions")
+ !$acc host_data use_device(rhsp)
+ status = cufftExecD2Z(planXf, rhsp, psi_d)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'X forward error: ', status
+ !$acc end host_data
+
+ ! psi(kx,y,z) -> psi(y,z,kx)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,piX_d2z%halo_extents, [0,0,0]))
+ ! psi(y,z,kx) -> psi(ky,z,kx)
+ status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_FORWARD)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'Y forward error: ', status
+ ! psi(ky,z,kx) -> psi(z,kx,ky)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX))
+ ! psi(z,kx,ky) -> psi(kz,kx,ky)
+ status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_FORWARD)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'Z forward error: ', status
+ ! END of FFT3D forward
+
+ ! call nvtxEndRange
+ np(piZ_d2z%order(1)) = piZ_d2z%shape(1)
+ np(piZ_d2z%order(2)) = piZ_d2z%shape(2)
+ np(piZ_d2z%order(3)) = piZ_d2z%shape(3)
+ call c_f_pointer(c_devloc(psi_d), phi3d, piZ_d2z%shape)
+ ! divide by -K**2, and normalize
+ offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1
+ offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1
+ offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1
+ xoff = offsets(1)
+ yoff = offsets(2)
+ npx = np(1)
+ npy = np(2)
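+ ! note: cuFFT transforms are unnormalized, hence the division by nx*ny*nz in the loop below after the forward/inverse pair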
+ ! call nvtxStartRange("Solution")
+ !$acc kernels
+ do jl = 1, npy
+ jg = yoff + jl
+ do il = 1, npx
+ ig = xoff + il
+ do k = 1, nz
+ k2 = kx_d(ig)**2 + kx_d(jg)**2 + kx_d(k)**2
+ phi3d(k,il,jl) = -phi3d(k,il,jl)/k2/(int(nx,8)*int(ny,8)*int(nz,8))
+ enddo
+ enddo
+ enddo
+ ! specify mean (corrects division by zero wavenumber above)
+ if (xoff == 0 .and. yoff == 0) phi3d(1,1,1) = 0.d0
+ !$acc end kernels
+ ! call nvtxEndRange
+ ! call nvtxStartRange("FFT backwards w/ transpositions")
+
+ ! psi(kz,kx,ky) -> psi(z,kx,ky)
+ status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_INVERSE)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'Z inverse error: ', status
+ ! psi(z,kx,ky) -> psi(ky,z,kx)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX))
+ ! psi(ky,z,kx) -> psi(y,z,kx)
+ status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_INVERSE)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'Y inverse error: ', status
+ ! psi(y,z,kx) -> psi(kx,y,z)
+ CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,[0,0,0], piX_d2z%halo_extents))
+ !$acc host_data use_device(p)
+ ! psi(kx,y,z) -> psi(x,y,z)
+ status = cufftExecZ2D(planXb, psi_d, p)
+ if (status /= CUFFT_SUCCESS) write(*,*) 'X inverse error: ', status
+ !$acc end host_data
+ ! call nvtxEndRange
+
+ !$acc host_data use_device(p)
+ ! update halo nodes with pressure (needed for the pressure correction step)
+ ! Update X-pencil halos in Y and Z direction
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ ! call nvtxEndRange
+ !########################################################################################################################################
+ ! END STEP 7: POISSON SOLVER FOR PRESSURE
+ !########################################################################################################################################
+
+
+
+ !########################################################################################################################################
+ ! START STEP 8: VELOCITY CORRECTION
+ ! ########################################################################################################################################
+ ! 8.1 Correct velocity
+ ! 8.2 Remove mean velocity if using ABC forcing
+ ! 8.3 Call halo exchanges along Y and Z for u,v,w
+ ! Correct velocity, pressure has also the halo
+ ! call nvtxStartRange("Correction")
+
+ !$acc kernels
+ umean=0.d0
+ vmean=0.d0
+ wmean=0.d0
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i = 1, piX%shape(1) ! equal to nx (no halo on x)
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (im < 1) im=nx
+ u(i,j,k)=u(i,j,k) - dt/rho*(p(i,j,k)-p(im,j,k))*dxi
+ v(i,j,k)=v(i,j,k) - dt/rho*(p(i,j,k)-p(i,jm,k))*dxi
+ w(i,j,k)=w(i,j,k) - dt/rho*(p(i,j,k)-p(i,j,km))*dxi
+ umean=umean + u(i,j,k)
+ vmean=vmean + v(i,j,k)
+ wmean=wmean + w(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Remove mean velocity (get local mean of the rank)
+
+ ! Divide by total number of points in the pencil
+ umean=umean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext)
+ vmean=vmean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext)
+ wmean=wmean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext)
+
+ ! Find global mean (MPI_SUM and then divide by number of ranks)
+ call MPI_Allreduce(umean,gumean,1,MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD, ierr)
+ call MPI_Allreduce(vmean,gvmean,1,MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD, ierr)
+ call MPI_Allreduce(wmean,gwmean,1,MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD, ierr)
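+ ! note: averaging the per-rank means (division by ranks below) is exact only if every pencil holds the same number of inner points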
+
+ ! remove mean value
+ !$acc kernels
+ u=u-(gumean/ranks)
+ v=v-(gvmean/ranks)
+ w=w-(gwmean/ranks)
+ !$acc end kernels
+
+ ! 8.3 update halos (y and z directions), needed by the staggered-grid stencils of the next time step
+ !$acc host_data use_device(u)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, u, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(v)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, v, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(w)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! find local maximum velocity magnitude (abs: negative velocities limit the CFL just as much)
+ uc=maxval(abs(u))
+ vc=maxval(abs(v))
+ wc=maxval(abs(w))
+ umax=max(wc,max(uc,vc))
+ call MPI_Allreduce(umax,gumax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD, ierr)
+ cou=gumax*dt*dxi
+ if (rank.eq.0) then
+ write(*,*) "CFL (max among tasks)", cou
+ write(*,*) "umax", gumax
+ endif
+ ! cou is identical on every rank (gumax comes from an Allreduce), so all ranks stop together;
+ ! stopping inside the rank-0 block would leave the other ranks hanging at the next collective
+ if (cou .gt. 7) stop
+ if (.not. ieee_is_finite(cou)) then
+ if (rank.eq.0) print *, "Error: cou is infinite or NaN!"
+ stop
+ end if
+
+ call cpu_time(timef)
+ if (rank.eq.0) print '(" Time elapsed = ",f8.1," ms")',1000*(timef-times)
+
+ ! (uncomment for profiling)
+ ! call nvtxEndRange
+
+ !########################################################################################################################################
+ ! END STEP 8: VELOCITY CORRECTION
+ !########################################################################################################################################
+
+
+ !########################################################################################################################################
+ ! START STEP 9: OUTPUT FIELDS
+ ! ########################################################################################################################################
+ if (mod(t,dump) .eq. 0) then
+ if (rank .eq. 0) write(*,*) "Saving output files"
+ ! write velocity and pressure fields (1-4)
+ call writefield(t,1)
+ call writefield(t,2)
+ call writefield(t,3)
+ call writefield(t,4)
+ #if phiflag == 1
+ ! write phase-field (5)
+ call writefield(t,5)
+ #endif
+ endif
+ !########################################################################################################################################
+ ! END STEP 9: OUTPUT FIELDS
+ !########################################################################################################################################
+
+! (uncomment for profiling)
+! call nvtxEndRange
+
+enddo
+call cpu_time(t_end)
+elapsed = t_end-t_start
+if (rank .eq. 0) write(*,*) 'Elapsed time (seconds):', elapsed
+#if partflag == 1
+!$acc end data
+!$acc end data
+!$acc end data
+#endif
+!$acc end data
+!$acc end data
+!$acc end data
+
+#if partflag == 1
+! Particle variables
+deallocate(part)
+deallocate(partbuff)
+deallocate(vec_p)
+deallocate(order_p)
+deallocate(buffvar1)
+deallocate(buffvar2)
+#endif
+
+! Deallocate arrays (keep this list in sync with newly added allocations)
+deallocate(x_ext)
+deallocate(u,v,w)
+deallocate(tanh_psi, mysin, mycos)
+deallocate(rhsu,rhsv,rhsw)
+deallocate(rhsu_o,rhsv_o,rhsw_o)
+deallocate(phi,rhsphi,normx,normy,normz)
+deallocate(phi_tmp, rhsphik2, rhsphik3, rhsphik4)
+
+call mpi_finalize(ierr)
+
+end program main
\ No newline at end of file
diff --git a/hit/mn5.sh b/hit/mn5.sh
new file mode 100644
index 0000000..12a62b4
--- /dev/null
+++ b/hit/mn5.sh
@@ -0,0 +1,5 @@
+module purge
+module load nvidia-hpc-sdk/24.3
+cp Makefile_mn5 Makefile
+make clean
+make
\ No newline at end of file
diff --git a/hit/module.f90 b/hit/module.f90
new file mode 100755
index 0000000..1a25def
--- /dev/null
+++ b/hit/module.f90
@@ -0,0 +1,82 @@
+module param
+ integer, parameter :: nx=512
+ integer :: ny=nx,nz=nx
+ double precision :: pi,lx,dx,dxi,ddxi,rhoi,twopi
+ integer :: restart,tstart,tfin,dump
+ double precision :: gamma, normag
+ double precision :: dt,mu,rho !flow parameters
+ integer :: inflow, inphi
+ double precision :: f1,f2,f3,k0 ! forcing parameters
+ double precision :: radius, sigma, epsr, eps, pos, val, epsi, enum ! phase-field parameters
+ double precision :: times,timef
+ ! other variables (wavenumber, grid location)
+ real(8), allocatable :: x(:), kx(:)
+ real(8), allocatable :: x_ext(:)
+ double precision :: yinf, ysup ! Inf and Sup of y in PiX (no halo)
+ double precision :: zinf, zsup ! Inf and Sup of z in PiX (no halo)
+ double precision :: lyloc, lzloc ! Inf and Sup of y in PiX
+
+end module param
+
+
+module mpivar
+ ! MPI variables
+ integer :: rank, ranks, ierr
+ integer :: localRank, localComm
+ integer :: nidp1y, nidm1y, nidp1z, nidm1z
+end module mpivar
+
+
+module cudecompvar
+ use cudecomp
+ integer :: npx, npy, npz
+ type(cudecompHandle) :: handle
+ type(cudecompGridDesc) :: grid_desc,grid_descD2Z
+ type(cudecompGridDescConfig) :: config
+ type(cudecompGridDescAutotuneOptions) :: options
+ integer :: pdims(2) ! pr x pc pencils
+ integer :: gdims(3) ! global grid dimensions
+ integer :: halo(3) ! halo extensions
+ integer :: halo_ext ! 0 no halo, 1 means 1 halo
+ type(cudecompPencilInfo) :: piX, piY, piZ ! size of the pencils in x- y- and z-configuration
+ type(cudecompPencilInfo) :: piX_d2z, piY_d2z, piZ_d2z ! size of the pencils in x- y- and z-configuration for D2Z
+ type(cudecompPencilInfo) :: piX_Poiss
+ integer(8) :: nElemX, nElemY, nElemZ, nElemWork, nElemWork_halo,nElemWork_halo_d2z
+ integer(8) :: nElemX_d2z, nElemY_d2z, nElemZ_d2z, nElemWork_d2z
+ logical :: halo_periods(3)
+ integer :: pix_yoff,pix_zoff
+end module cudecompvar
+
+
+module velocity
+ double precision, allocatable :: u(:,:,:), v(:,:,:), w(:,:,:)
+ double precision, allocatable :: rhsu(:,:,:), rhsv(:,:,:), rhsw(:,:,:)
+ double precision, allocatable :: rhsu_o(:,:,:), rhsv_o(:,:,:), rhsw_o(:,:,:)
+ complex(8), allocatable :: rhsp_complex(:,:,:)
+ double precision, allocatable :: rhsp(:,:,:), p(:,:,:)
+ double precision, allocatable :: rhspp(:,:,:), pp(:,:,:)
+ double precision, allocatable :: div(:,:,:)
+ double precision :: uc, vc, wc, umax, gumax=1.0d0, cou, alpha, beta
+ double precision :: h11, h12, h13, h21, h22, h23, h31, h32, h33
+ double precision :: umean, vmean, wmean, gumean, gvmean, gwmean
+ double precision, allocatable :: mysin(:), mycos(:)
+end module velocity
+
+
+module phase
+ double precision, allocatable :: phi(:,:,:), rhsphi(:,:,:), psidi(:,:,:), tanh_psi(:,:,:)
+ double precision, allocatable :: normx(:,:,:), normy(:,:,:), normz(:,:,:)
+ double precision, allocatable :: normx_f(:,:,:), normy_f(:,:,:), normz_f(:,:,:)
+ double precision :: curv
+ double precision, allocatable :: fxst(:,:,:), fyst(:,:,:), fzst(:,:,:)
+ double precision, allocatable :: rhsphik1(:,:,:),rhsphik2(:,:,:), rhsphik3(:,:,:), rhsphik4(:,:,:), phi_tmp(:,:,:)
+ double precision :: normx_xm,normx_xp,normx_ym,normx_yp,normx_zm,normx_zp
+ double precision :: normy_xm,normy_xp,normy_ym,normy_yp,normy_zm,normy_zp
+ double precision :: normz_xm,normz_xp,normz_ym,normz_yp,normz_zm,normz_zp
+ double precision :: rn_01,rn_11,rn_02,rn_12,rn_03,rn_13
+ double precision :: sharpxm,sharpxp,sharpym,sharpyp,sharpzm,sharpzp
+end module phase
+
+
+
+
diff --git a/hit/particles.f90 b/hit/particles.f90
new file mode 100644
index 0000000..5e7d0ea
--- /dev/null
+++ b/hit/particles.f90
@@ -0,0 +1,382 @@
+module particles
+ use sort
+ implicit none
+ ! Total number of particles
+ integer:: npart
+ ! Local (per process) number of particles
+ integer :: nploc
+ ! Estimated maximum number of particles per process
+ integer :: nplocmax
+ ! Array containing the info of the particles (every column is a different particle)
+ double precision, allocatable :: part(:,:)
+ ! Buffer for particle array
+ double precision, allocatable :: partbuff(:,:)
+ ! Number of Info per particle
+ integer, parameter :: ninfop = 12
+ !....Indices for particle info
+ integer, parameter :: Itag=1 ! Row 1: Tag of the particle
+ integer, parameter :: Ixp=2,Iyp=3,Izp=4 ! Row 2 to 4: Current x,y,z of the particle
+ integer, parameter :: Iup=5,Ivp=6,Iwp=7 ! Row 5 to 7: Current u,v,w of the particle
+ integer, parameter :: Iphip=8 ! Row 8: phi at the current location of the particle
+ integer, parameter :: Iup1=9,Ivp1=10,Iwp1=11 ! Row 9 to 11: Previous (step) u,v,w of the particle
+ integer, parameter :: Iphip1=12 ! Row 12: Previous (step) phi of the particle
+
+ ! Array for particle ordering
+ double precision, allocatable :: vec_p(:)
+ integer, allocatable :: order_p(:)
+ integer :: inpart
+
+ ! Number of Particles to be Transferred
+ integer::Nsendp1,Nsendm1,Nrecvp1,Nrecvm1
+
+ ! Buffer for particle transfer
+ double precision, allocatable,dimension(:,:)::buffvar1,buffvar2
+
+ contains
+!***********************************************************************
+subroutine particlegenerator(inflag)
+ use param
+ use mpivar
+ use cudecompvar
+ implicit none
+ integer :: inflag
+ integer :: i
+
+ !....Particle Tag (globally unique: local index offset by rank*nploc)
+ do i = 1, nploc
+ part(i,Itag) = dble(i) + rank * nploc
+ enddo
+
+
+ if(inflag.eq.1)then
+ !....Generate xp position
+ call RANDOM_SEED()
+ call RANDOM_NUMBER(vec_p)
+ do i = 1, nploc
+ part(i,Ixp) = vec_p(i) * lx
+ enddo
+
+ !....Generate yp position
+ call RANDOM_SEED()
+ call RANDOM_NUMBER(vec_p)
+ do i = 1, nploc
+ part(i,Iyp) = vec_p(i) * lyloc + yinf
+ end do
+
+ !....Generate zp position
+ call RANDOM_SEED()
+ call RANDOM_NUMBER(vec_p)
+ do i = 1, nploc
+ part(i,Izp) = vec_p(i) * lzloc + zinf
+ end do
+ endif
+
+ return
+end subroutine
+!***********************************************************************
+subroutine linearinterpolation
+ use velocity
+ use param
+ use cudecompvar
+ implicit none
+ integer :: i
+ double precision :: xpt, ypt, zpt
+ integer :: ix, iy, iz, ixplus1
+ double precision :: tx, ty, tz
+
+ !$acc parallel loop present(part,u,v,w) private(xpt,ypt,zpt,ix,iy,iz,tx,ty,tz)
+ do i = 1, nploc
+ ! Particle position
+ xpt = part(i,Ixp)
+ ypt = part(i,Iyp)
+ zpt = part(i,Izp)
+
+ ! Global left-cell indices (uniform, fully periodic)
+ ix = floor( xpt / dx )
+ iy = floor( ypt / dx )
+ iz = floor( zpt / dx )
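+ ! (positions are wrapped to [0,lx) in ParticlesLeakage, so these floor() indices are non-negative)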
+
+ ! Local fractions in [0,1)
+ tx = xpt/dx - dble(ix)
+ ty = ypt/dx - dble(iy)
+ tz = zpt/dx - dble(iz)
+
+ ! Map to 1-based global indices and wrap, then to local by offsets
+ ix = 1 + ix
+ ixplus1 = ix + 1
+ ! Periodicity
+ if(ixplus1>nx) ixplus1 = 1
+ ! Periodicity is already accounted by the halos
+ iy = 1 + iy - pix_yoff
+ iz = 1 + iz - pix_zoff
+
+ ! 1-D linear per component (MAC: faces along native dir)
+ part(i,Iup) = (1.0d0 - tx)*u(ix , iy , iz ) + tx*u(ixplus1 , iy , iz )
+ part(i,Ivp) = (1.0d0 - ty)*v(ix , iy , iz ) + ty*v(ix , iy+1 , iz )
+ part(i,Iwp) = (1.0d0 - tz)*w(ix , iy , iz ) + tz*w(ix , iy , iz+1 )
+ end do
+ !$acc end parallel loop
+
+ return
+end subroutine
+!***********************************************************************
+subroutine SortPartY()
+ implicit none
+ integer :: i, j
+
+ !$acc parallel loop collapse(2)
+ do j = 1, ninfop
+ do i = 1, nploc
+ partbuff(i,j) = part(i,j)
+ vec_p(i) = part(i,Iyp)
+ order_p(i) = i
+ enddo
+ enddo
+
+ !$acc host_data use_device(vec_p,order_p)
+ call fsort(vec_p,order_p,nploc,.true.)
+ !$acc end host_data
+
+ !$acc parallel loop collapse(2)
+ do j = 1, ninfop
+ do i = 1, nploc
+ part(i,j) = partbuff(order_p(i),j)
+ end do
+ end do
+
+ return
+end subroutine
+!***********************************************************************
+subroutine SortPartZ()
+ implicit none
+ integer :: i, j
+
+ !$acc parallel loop collapse(2)
+ do j = 1, ninfop
+ do i = 1, nploc
+ partbuff(i,j) = part(i,j)
+ vec_p(i) = part(i,Izp)
+ order_p(i) = i
+ enddo
+ enddo
+
+ !$acc host_data use_device(vec_p,order_p)
+ call fsort(vec_p,order_p,nploc,.true.)
+ !$acc end host_data
+
+ !$acc parallel loop collapse(2)
+ do j = 1, ninfop
+ do i = 1, nploc
+ part(i,j) = partbuff(order_p(i),j)
+ end do
+ end do
+
+ return
+end subroutine
+!***********************************************************************
+subroutine CountPartTransfY()
+ use param
+ implicit none
+ integer :: flag,scanner
+ integer :: i
+ integer :: addp,addm
+
+ Nsendp1=0 !Number of particles to send to nid+1 (in Y)
+ Nrecvm1=0 !Number of particles to receive from nid-1 (in Y)
+ Nsendm1=0 !Number of particles to send to nid-1 (in Y)
+ Nrecvp1=0 !Number of particles to receive from nid+1 (in Y)
+
+
+ !$acc parallel loop default(present) private(addp,addm) reduction(+:Nsendp1,Nsendm1)
+ do i = 1, nploc
+ addp=0
+ addm=0
+ if(part(i,Iyp).GE.ysup) addp=1
+ if(part(i,Iyp).LT.yinf) addm=1
+ Nsendp1=Nsendp1+addp
+ Nsendm1=Nsendm1+addm
+ end do
+
+ return
+end subroutine
+!***********************************************************************
+subroutine CountPartTransfZ()
+ use param
+ implicit none
+ integer :: flag,scanner
+ integer :: i
+ integer :: addp,addm
+
+ Nsendp1=0 !Number of particles to send to nid+1 (in Z)
+ Nrecvm1=0 !Number of particles to receive from nid-1 (in Z)
+ Nsendm1=0 !Number of particles to send to nid-1 (in Z)
+ Nrecvp1=0 !Number of particles to receive from nid+1 (in Z)
+
+
+ !$acc parallel loop default(present) private(addp,addm) reduction(+:Nsendp1,Nsendm1)
+ do i = 1, nploc
+ addp=0
+ addm=0
+ if(part(i,Izp).GE.zsup) addp=1
+ if(part(i,Izp).LT.zinf) addm=1
+ Nsendp1=Nsendp1+addp
+ Nsendm1=Nsendm1+addm
+ end do
+
+ return
+end subroutine
+!***********************************************************************
+subroutine SendPartUP(direction)
+ use mpi
+ use mpivar
+ implicit none
+ integer :: direction
+ integer::NsendMAX
+ integer,dimension(MPI_STATUS_SIZE):: status
+ integer :: ii, jj
+ integer :: nidp1, nidm1
+
+ if(direction.eq.2)then
+ nidp1 = nidp1y
+ nidm1 = nidm1y
+ elseif(direction.eq.3)then
+ nidp1 = nidp1z
+ nidm1 = nidm1z
+ endif
+
+ !...Proc nid sends to proc nid+1 the number of particles
+ !...that nid+1 has to receive from nid
+ call MPI_Sendrecv(Nsendp1,1,MPI_INTEGER, nidp1, nidp1,&
+ Nrecvm1,1,MPI_INTEGER,nidm1,rank,MPI_COMM_WORLD,status,ierr)
+
+ call MPI_Allreduce(Nsendp1,NsendMAX,1,MPI_INTEGER,MPI_MAX,MPI_COMM_WORLD,ierr)
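+ ! the global max lets every rank know whether any transfer happens this step, so the exchange block below is entered (or skipped) uniformly and the Sendrecv pairs always match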
+
+ if(NsendMAX.GT.0)then
+ if(Nsendp1.GT.0)then
+ !$acc parallel loop collapse(2)
+ do jj=1,Nsendp1
+ do ii=1,ninfop
+ buffvar1(ii,jj)=part(nploc-Nsendp1+jj,ii)
+ end do
+ end do
+ end if
+
+ !$acc host_data use_device(buffvar1,buffvar2)
+ !....Proc nid sends to nid+1 and receives from nid-1
+ call MPI_Sendrecv(buffvar1(1,1),int(ninfop*Nsendp1),MPI_DOUBLE_PRECISION,&
+ nidp1, nidp1,buffvar2(1,1),int(ninfop*Nrecvm1),MPI_DOUBLE_PRECISION,&
+ nidm1,rank,MPI_COMM_WORLD,STATUS,ierr)
+ !$acc end host_data
+
+ if(Nrecvm1.GT.0)then
+ !....Each proc place received data in the first block
+ !$acc parallel loop collapse(2)
+ do jj=1,ninfop
+ do ii=1,Nrecvm1
+ partbuff(ii,jj)=buffvar2(jj,ii)
+ end do
+ end do
+ end if
+ end if
+
+
+ !....Adjust central block (not transferred data)
+ !$acc parallel loop collapse(2)
+ do jj=1,ninfop
+ do ii=1,(nploc-Nsendm1-Nsendp1)
+ partbuff(Nrecvm1+ii,jj)=part(Nsendm1+ii,jj)
+ end do
+ end do
+
+ return
+end subroutine
+!***********************************************************************
+subroutine SendPartDOWN(direction)
+ use mpi
+ use mpivar
+ implicit none
+ integer :: direction
+ integer::NsendMAX
+ integer,dimension(MPI_STATUS_SIZE):: status
+ integer :: ii, jj
+ integer :: nidp1, nidm1
+
+
+ if(direction.eq.2)then
+ nidp1 = nidp1y
+ nidm1 = nidm1y
+ elseif(direction.eq.3)then
+ nidp1 = nidp1z
+ nidm1 = nidm1z
+ endif
+
+ !...Proc nid sends to proc nid-1 the number of particles
+ !...that nid-1 has to receive from nid
+ call MPI_Sendrecv(Nsendm1,1,MPI_INTEGER, nidm1, nidm1,&
+ Nrecvp1,1,MPI_INTEGER,nidp1,rank,MPI_COMM_WORLD,STATUS,ierr)
+
+ call MPI_Allreduce(Nsendm1,NsendMAX,1,MPI_INTEGER,MPI_MAX,MPI_Comm_World,ierr)
+
+
+ if(NsendMAX.GT.0)then
+ if(Nsendm1.GT.0)then
+ !$acc parallel loop collapse(2)
+ do jj=1,Nsendm1
+ do ii=1,ninfop
+ buffvar1(ii,jj)=part(jj,ii)
+ end do
+ end do
+ end if
+
+ !$acc host_data use_device(buffvar1,buffvar2)
+ !....Proc nid sends to nid-1 and receives from nid+1
+ call MPI_Sendrecv(buffvar1(1,1),int(ninfop*Nsendm1),MPI_DOUBLE_PRECISION,&
+ nidm1, nidm1,buffvar2(1,1),int(ninfop*Nrecvp1),MPI_DOUBLE_PRECISION,&
+ nidp1, rank ,MPI_COMM_WORLD,STATUS,ierr)
+ !$acc end host_data
+
+ if(Nrecvp1.GT.0)then
+ !....Each proc append received data
+ !$acc parallel loop collapse(2)
+ do jj=1,ninfop
+ do ii=1,Nrecvp1
+ partbuff(ii+Nrecvm1+nploc-Nsendm1-Nsendp1,jj)=buffvar2(jj,ii)
+ end do
+ end do
+ end if
+
+ end if
+
+ !...Update Local Number of Particles
+ nploc=nploc+Nrecvm1+Nrecvp1-Nsendp1-Nsendm1
+ !$acc parallel loop collapse(2)
+ do jj=1,ninfop
+ do ii=1,nploc
+ part(ii,jj)=partbuff(ii,jj)
+ end do
+ end do
+
+
+ return
+end subroutine
+!***********************************************************************
+subroutine ParticlesLeakage()
+ use param
+ !....Particles which escape from the domain along x, y and z must
+ !....be reintroduced
+ implicit none
+
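+ !....The box is cubic (ly = lz = lx), so lx is used to wrap all three directions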
+ !$acc kernels
+ !....Along x
+ part(1:nploc,Ixp)=part(1:nploc,Ixp)-lx*FLOOR(part(1:nploc,Ixp)/lx)
+ !....Along y
+ part(1:nploc,Iyp)=part(1:nploc,Iyp)-lx*FLOOR(part(1:nploc,Iyp)/lx)
+ !....Along z
+ part(1:nploc,Izp)=part(1:nploc,Izp)-lx*FLOOR(part(1:nploc,Izp)/lx)
+ !$acc end kernels
+
+ return
+end subroutine
+!***********************************************************************
+end module particles
\ No newline at end of file
diff --git a/hit/readinput.f90 b/hit/readinput.f90
new file mode 100644
index 0000000..2dabaa2
--- /dev/null
+++ b/hit/readinput.f90
@@ -0,0 +1,83 @@
+
+!##########################################################################
+!###########################################################################
+subroutine readinput
+use velocity
+use phase
+use param
+use mpivar
+use particles
+implicit none
+
+open(unit=55,file='input.inp',form='formatted',status='old')
+!Time step parameters
+read(55,*) restart
+read(55,*) tstart
+read(55,*) tfin
+read(55,*) dump
+!Flow parameters
+read(55,*) inflow
+read(55,*) inphi
+read(55,*) dt
+read(55,*) mu
+read(55,*) rho
+! forcing parameters
+read(55,*) f1
+read(55,*) f2
+read(55,*) f3
+read(55,*) k0
+! phase-field parameters
+read(55,*) radius
+read(55,*) sigma
+read(55,*) epsr
+! particle parameters
+read(55,*) npart
+read(55,*) inpart
+
+
+! compute pre-defined constants
+twopi=8.0_8*atan(1.0_8)
+lx=twopi
+! dx = lx/(nx-1)
+dx = lx/(nx)
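+! periodic domain: nx cells span lx exactly, so dx = lx/nx (the commented lx/(nx-1) would apply to a node-based, non-periodic grid)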
+
+dxi=1.d0/dx
+ddxi=1.d0/dx/dx
+rhoi=1.d0/rho
+eps=epsr*dx
+epsi=1.d0/eps
+enum=1.e-16
+
+if (rank .eq. 0) then
+ ! print run parameters as a debug check (comment out to disable)
+ write(*,*) "----------------------------------------------"
+ write(*,*) "███ ███ ██ ██ ██ ████████ ██████ ██████"
+ write(*,*) "████ ████ ██ ██ ██ ██ ██ ██"
+ write(*,*) "██ ████ ██ ███████ ██ ██ █████ ███████"
+ write(*,*) "██ ██ ██ ██ ██ ██ ██ ██ ██ ██"
+ write(*,*) "██ ██ ██ ██ ██ ██ ██████ ██████"
+ write(*,*) "----------------------------------------------"
+ write(*,*) 'Grid:', nx, 'x', nx, 'x', nx
+ write(*,*) "Restart ", restart
+ write(*,*) "Tstart ", tstart
+ write(*,*) "Tfin ", tfin
+ write(*,*) "Dump ", dump
+ write(*,*) "Inflow ", inflow
+ write(*,*) "Deltat ", dt
+ write(*,*) "Mu ", mu
+ write(*,*) "Rho ", rho
+ write(*,*) "f1,f2,f3,k0", f1,f2,f3,k0
+ write(*,*) "Radius ", radius
+ write(*,*) "Sigma ", sigma
+ write(*,*) "Eps ", eps
+ write(*,*) "Epsi ", epsi
+ write(*,*) "Lx ", lx
+ write(*,*) "dx", dx
+ write(*,*) "dxi", dxi
+ write(*,*) "ddxi", ddxi
+ write(*,*) "rhoi", rhoi
+endif
+
+end subroutine
+
+
diff --git a/hit/readwrite.f90 b/hit/readwrite.f90
new file mode 100644
index 0000000..44df20f
--- /dev/null
+++ b/hit/readwrite.f90
@@ -0,0 +1,318 @@
+subroutine writefield(t,fieldn)
+! Output field, file is written in the /output folder
+
+use velocity
+use phase
+use mpi
+use mpivar
+use param
+use cudecompvar
+
+implicit none
+
+integer :: g_size(3),p_size(3),fstart(3)
+integer :: t,fieldn
+character(len=40) :: namefile
+integer(mpi_offset_kind) :: offset=0
+integer :: f_handle ! file handle
+integer :: ftype
+double precision, allocatable :: out(:,:,:)
+
+! fieldn=1 means u
+! fieldn=2 means v
+! fieldn=3 means w
+! fieldn=4 means p
+! fieldn=5 means phi
+
+! define basic quantities to be used later (global and pencil sizes)
+g_size=[nx, ny, nz] ! global size
+p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x
+fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1]
+! for debug
+!write(*,*) "g_size", g_size
+!write(*,*) "p_size", p_size
+!write(*,*) "fstart", fstart
+allocate(out(p_size(1),p_size(2),p_size(3))) !<- halo removed
+
+!write(*,*) "in readwrite"
+
+if (fieldn .eq. 1) then
+ out=u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/u_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 2) then
+ out=v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/v_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 3) then
+ out=w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/w_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 4) then
+ out=p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/p_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+if (fieldn .eq. 5) then
+ out=phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext) !<- out only the inner parts (no halo)
+ write(namefile,'(a,i8.8,a)') './output/phi_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_create+mpi_mode_rdwr,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ call mpi_file_write_all(f_handle,out,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+endif
+
+deallocate(out)
+
+end subroutine
+
+
+
+
+
+
+
+
+
+
+subroutine readfield(fieldn)
+! Input field, file is read from the /input folder
+
+use velocity
+use phase
+use mpi
+use mpivar
+use param
+use cudecompvar
+
+implicit none
+
+integer :: g_size(3),p_size(3),fstart(3)
+integer :: fieldn
+character(len=40) :: namefile
+integer(mpi_offset_kind) :: offset=0
+integer :: f_handle ! file handle
+integer :: ftype
+double precision, allocatable :: in(:,:,:)
+
+! fieldn=1 means u
+! fieldn=2 means v
+! fieldn=3 means w
+! fieldn=4 means p
+! fieldn=5 means phi
+
+! define basic quantities to be used later (global and pencil sizes)
+g_size=[nx, ny, nz] ! global size
+p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x
+fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and index start from 0 (not 1)
+! for debug
+!write(*,*) "g_size", g_size
+!write(*,*) "p_size", p_size
+!write(*,*) "fstart", fstart
+allocate(in(p_size(1),p_size(2),p_size(3))) !<- no halos read
+
+!write(*,*) "in readwrite"
+
+if (fieldn .eq. 1) then
+ namefile='./input/u.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 2) then
+ namefile='./input/v.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 3) then
+ namefile='./input/w.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 4) then
+ namefile='./input/p.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 5) then
+ namefile='./input/phi.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+deallocate(in)
+
+end subroutine
+
+
+
+
+
+
+
+
+
+
+subroutine readfield_restart(t,fieldn)
+! Used in case of restart: the file is read from the ./output folder (iteration tstart must be present!)
+
+use velocity
+use phase
+use mpi
+use mpivar
+use param
+use cudecompvar
+
+implicit none
+
+integer :: g_size(3),p_size(3),fstart(3)
+integer :: t,fieldn
+character(len=40) :: namefile
+integer(mpi_offset_kind) :: offset=0
+integer :: f_handle ! file handle
+integer :: ftype
+double precision, allocatable :: in(:,:,:)
+
+! fieldn=1 means u
+! fieldn=2 means v
+! fieldn=3 means w
+! fieldn=4 means p
+! fieldn=5 means phi
+
+! define basic quantities to be used later (global and pencil sizes)
+g_size=[nx, ny, nz] ! global size
+p_size=[piX%shape(1), piX%shape(2)-2*halo_ext, piX%shape(3)-2*halo_ext] !<- pencil has no halo along x
+fstart=[piX%lo(1)-1,piX%lo(2)-1,piX%lo(3)-1] !<- MPI is in C and index start from 0 (not 1)
+! for debug
+!write(*,*) "g_size", g_size
+!write(*,*) "p_size", p_size
+!write(*,*) "fstart", fstart
+allocate(in(p_size(1),p_size(2),p_size(3))) !<- no halos read
+
+!write(*,*) "in readwrite"
+
+if (fieldn .eq. 1) then
+ write(namefile,'(a,i8.8,a)') './output/u_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ u(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 2) then
+ write(namefile,'(a,i8.8,a)') './output/v_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ v(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 3) then
+ write(namefile,'(a,i8.8,a)') './output/w_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ w(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 4) then
+ write(namefile,'(a,i8.8,a)') './output/p_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ p(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+if (fieldn .eq. 5) then
+ write(namefile,'(a,i8.8,a)') './output/phi_',t,'.dat'
+ call mpi_file_open(MPI_COMM_WORLD,namefile,mpi_mode_rdonly,mpi_info_null,f_handle,ierr)
+ call mpi_type_create_subarray(3,g_size,p_size,fstart,mpi_order_fortran,mpi_double_precision,ftype,ierr)
+ call mpi_type_commit(ftype,ierr)
+ call mpi_file_set_view(f_handle,offset,mpi_double_precision,ftype,'native',mpi_info_null,ierr)
+ !call mpi_file_read_all(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_read(f_handle,in,p_size(1)*p_size(2)*p_size(3),mpi_double_precision,mpi_status_ignore,ierr)
+ call mpi_file_close(f_handle,ierr)
+ phi(1:nx,1+halo_ext:piX%shape(2)-halo_ext,1+halo_ext:piX%shape(3)-halo_ext)=in !<- read only the inner parts (no halo) u has halos; in no halos
+endif
+
+deallocate(in)
+
+end subroutine
\ No newline at end of file
diff --git a/hit/testpush.sh b/hit/testpush.sh
new file mode 100644
index 0000000..2e0e21a
--- /dev/null
+++ b/hit/testpush.sh
@@ -0,0 +1,2 @@
+make clean
+make
diff --git a/multi/main.f90 b/multi/main.f90
old mode 100755
new mode 100644
index 67dd398..beb351f
--- a/multi/main.f90
+++ b/multi/main.f90
@@ -285,11 +285,13 @@ program main
allocate(div(piX%shape(1),piX%shape(2),piX%shape(3)))
!PFM variables
#if phiflag == 1
- allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi_o(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)))
allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3)))
allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3)))
allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3)))
+ allocate(normx_f(piX%shape(1),piX%shape(2),piX%shape(3)),normy_f(piX%shape(1),piX%shape(2),piX%shape(3)),normz_f(piX%shape(1),piX%shape(2),piX%shape(3)))
allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces
+ allocate(phi_tmp(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik2(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik3(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphik4(piX%shape(1),piX%shape(2),piX%shape(3)))
#endif
! allocate arrays for transpositions and halo exchanges
@@ -439,9 +441,9 @@ program main
! First step use Euler
alpha=1.0d0
beta=0.0d0
-gumax=1.d0
+gumax=8.0d0
tstart=tstart+1
-gamma=1.d0*gumax
+gamma=1.d0*gumax ! initial gamma for phase-field
!$acc data copyin(piX)
!$acc data create(rhsu_o, rhsv_o, rhsw_o)
!$acc data copyin(mysin, mycos)
@@ -462,7 +464,6 @@ program main
if (rank.eq.0) write(*,*) "Time step",t,"of",tfin
call cpu_time(times)
-
!########################################################################################################################################
! START STEP 4: PARTICLES (TRACERS)
!########################################################################################################################################
@@ -519,6 +520,8 @@ program main
! START STEP 5: PHASE-FIELD SOLVER (EXPLICIT)
!########################################################################################################################################
#if phiflag == 1
+ ! 4.2 Get phi at n+1 using RK4
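+ ! classical RK4: four stages k1..k4 are evaluated below from the intermediate states
+ ! phi + dt/2*k1, phi + dt/2*k2, phi + dt*k3, then (standard combination assumed)
+ ! phi^(n+1) = phi^n + dt/6*(k1 + 2*k2 + 2*k3 + k4)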
+ ! first stage of RK4 - saved in rhsphi
!$acc kernels
do k=1, piX%shape(3)
do j=1, piX%shape(2)
@@ -533,7 +536,8 @@ program main
enddo
!$acc end kernels
- gamma=1.d0*gumax
+ gamma=1.d0*gumax ! update gamma every time step with the current max velocity value
+ if (rank.eq.0) write(*,*) "gamma:", gamma
!$acc parallel loop tile(16,4,2)
do k=1+halo_ext, piX%shape(3)-halo_ext
do j=1+halo_ext, piX%shape(2)-halo_ext
@@ -547,14 +551,160 @@ program main
km=k-1
if (ip .gt. nx) ip=1
if (im .lt. 1) im=nx
- ! convective (first three lines) and diffusive (last three lines)
+ ! convective (first six lines) and diffusive (last three lines)
+ ! flux-splitting
rhsphi(i,j,k) = &
- - (u(ip,j,k)*0.5d0*(phi(ip,j,k)+phi(i,j,k)) - u(i,j,k)*0.5d0*(phi(i,j,k)+phi(im,j,k)))*dxi &
- - (v(i,jp,k)*0.5d0*(phi(i,jp,k)+phi(i,j,k)) - v(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,jm,k)))*dxi &
- - (w(i,j,kp)*0.5d0*(phi(i,j,kp)+phi(i,j,k)) - w(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,j,km)))*dxi &
+ - (max(u(ip,j,k),0.0d0)*phi(i,j,k) + min(u(ip,j,k),0.0d0)*phi(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi(i,j,k) - max(u(i,j,k),0.0d0)*phi(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi(i,j,k) + min(v(i,jp,k),0.0d0)*phi(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi(i,j,k) - max(v(i,j,k),0.0d0)*phi(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi(i,j,k) + min(w(i,j,kp),0.0d0)*phi(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi(i,j,k) - max(w(i,j,k),0.0d0)*phi(i,j,km))*dxi &
+ gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + &
eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddxi + &
- eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi)
+ eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
+
+ ! Update normx,normy and normz halos, required to then compute normal derivative
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Substep 2: Compute normals (1.e-16 is a numerical tolerance to avoid 0/0) for the surface tension force computation later
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ normag = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + enum)
+ normx_f(i,j,k) = normx(i,j,k)*normag
+ normy_f(i,j,k) = normy(i,j,k)*normag
+ normz_f(i,j,k) = normz(i,j,k)*normag
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Compute sharpening term flux split like CaNS-Fizzy
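+ ! ACDI sharpening flux on each face: 0.25*gamma*(1 - tanh^2(psi_face/(2*eps)))*n_face, with psi_face the two-point average; differenced with dxi below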
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphi(i,j,k)=rhsphi(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! second stage of RK4 - saved in rhsphik2
+ !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphi)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ phi_tmp(i,j,k) = phi(i,j,k) + 0.5d0 * dt * rhsphi(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end parallel loop
+ ! 4.3 Call halo exchanges along Y and Z for phi
+ !$acc host_data use_device(phi_tmp)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+ val = min(phi_tmp(i,j,k),1.0d0) ! avoid machine-precision overshoots in phi that lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+ ! compute here the tanh of distance function psi (used in the sharpening term) to avoid multiple computations of tanh
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! convective (first six lines) and diffusive (last three lines)
+ ! flux-splitting
+ rhsphik2(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+ + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+ eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+ eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
! 4.1.3. Compute normals for sharpening term (gradient)
normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
@@ -577,20 +727,133 @@ program main
CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
!$acc end host_data
- ! 4.1.3. Compute Sharpening term (gradient)
- ! Substep 2: Compute normals (1.e-16 is a numerical tollerance to avoid 0/0)
+ ! Compute sharpening term
!$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
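+ ! ACDI sharpening flux through each face: 0.25*gamma*(1 - tanh(psi_f/(2*eps))**2)*n_f, where
+ ! psi_f is the face-averaged psidi (assuming epsi = 1/eps) and n_f = rn_* is the normalized
+ ! face-averaged normal; its finite-difference divergence below balances the diffusive term
+ ! gamma*eps*lap(phi) in the RHS above, keeping the interface at its equilibrium thickness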
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphik2(i,j,k)=rhsphik2(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! third stage of RK4 - saved in rhsphik3
+ !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphik2)
do k=1, piX%shape(3)
do j=1, piX%shape(2)
do i=1,nx
- normag = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + enum)
- normx(i,j,k) = normx(i,j,k)*normag
- normy(i,j,k) = normy(i,j,k)*normag
- normz(i,j,k) = normz(i,j,k)*normag
+ phi_tmp(i,j,k) = phi(i,j,k) + 0.5d0 * dt * rhsphik2(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end parallel loop
+ ! Halo exchanges along Y and Z for phi_tmp
+ !$acc host_data use_device(phi_tmp)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+ val = min(phi_tmp(i,j,k),1.0d0) ! clip to avoid machine-precision overshoots in phi that lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+ ! precompute the tanh of the distance function psi here (used in the sharpening term) to avoid repeated tanh evaluations
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
enddo
enddo
enddo
!$acc end kernels
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! convective (first three lines) and diffusive (last three lines)
+ ! flux-splitting
+ rhsphik3(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+ + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+ eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+ eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
+
+ ! Update normx, normy and normz halos: neighbor values are needed below for the face-averaged normals in the sharpening term
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
! Compute sharpening term
!$acc kernels
@@ -605,31 +868,186 @@ program main
km=k-1
if (ip .gt. nx) ip=1
if (im .lt. 1) im=nx
- ! ACDI with pre-computed tanh
- rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-tanh_psi(ip,j,k)*tanh_psi(ip,j,k))*normx(ip,j,k) - &
- 0.25d0*(1.d0-tanh_psi(im,j,k)*tanh_psi(im,j,k))*normx(im,j,k))*0.5d0*dxi + &
- (0.25d0*(1.d0-tanh_psi(i,jp,k)*tanh_psi(i,jp,k))*normy(i,jp,k) - &
- 0.25d0*(1.d0-tanh_psi(i,jm,k)*tanh_psi(i,jm,k))*normy(i,jm,k))*0.5d0*dxi + &
- (0.25d0*(1.d0-tanh_psi(i,j,kp)*tanh_psi(i,j,kp))*normz(i,j,kp) - &
- 0.25d0*(1.d0-tanh_psi(i,j,km)*tanh_psi(i,j,km))*normz(i,j,km))*0.5d0*dxi)
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphik3(i,j,k)=rhsphik3(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+ ! fourth stage of RK4 - saved in rhsphik4
+ !$acc parallel loop collapse(3) present(phi, phi_tmp, rhsphik3)
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ phi_tmp(i,j,k) = phi(i,j,k) + dt * rhsphik3(i,j,k)
+ enddo
+ enddo
+ enddo
+ !$acc end parallel loop
+ ! Halo exchanges along Y and Z for phi_tmp
+ !$acc host_data use_device(phi_tmp)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi_tmp, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc kernels
+ do k=1, piX%shape(3)
+ do j=1, piX%shape(2)
+ do i=1,nx
+ ! compute distance function psi (used to compute normals)
+ val = min(phi_tmp(i,j,k),1.0d0) ! clip to avoid machine-precision overshoots in phi that lead to problems with the log
+ psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+ ! precompute the tanh of the distance function psi here (used in the sharpening term) to avoid repeated tanh evaluations
+ tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
enddo
enddo
enddo
!$acc end kernels
+ !$acc parallel loop tile(16,4,2)
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ! 4.1 RHS computation
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ ! convective (first three lines) and diffusive (last three lines)
+ ! flux-splitting
+ rhsphik4(i,j,k) = &
+ - (max(u(ip,j,k),0.0d0)*phi_tmp(i,j,k) + min(u(ip,j,k),0.0d0)*phi_tmp(ip,j,k) - &
+ min(u(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(u(i,j,k),0.0d0)*phi_tmp(im,j,k))*dxi &
+ - (max(v(i,jp,k),0.0d0)*phi_tmp(i,j,k) + min(v(i,jp,k),0.0d0)*phi_tmp(i,jp,k) - &
+ min(v(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(v(i,j,k),0.0d0)*phi_tmp(i,jm,k))*dxi &
+ - (max(w(i,j,kp),0.0d0)*phi_tmp(i,j,k) + min(w(i,j,kp),0.0d0)*phi_tmp(i,j,kp) - &
+ min(w(i,j,k),0.0d0)*phi_tmp(i,j,k) - max(w(i,j,k),0.0d0)*phi_tmp(i,j,km))*dxi &
+ + gamma*(eps*(phi_tmp(ip,j,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(im,j,k))*ddxi + &
+ eps*(phi_tmp(i,jp,k)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,jm,k))*ddxi + &
+ eps*(phi_tmp(i,j,kp)-2.d0*phi_tmp(i,j,k)+phi_tmp(i,j,km))*ddxi)
+ ! 4.1.3. Compute normals for sharpening term (gradient)
+ normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+ normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+ normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+ enddo
+ enddo
+ enddo
- ! 4.2 Get phi at n+1 using AB2
+ ! Update normx, normy and normz halos: neighbor values are needed below for the face-averaged normals in the sharpening term
+ !$acc host_data use_device(normx)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normy)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+ !$acc host_data use_device(normz)
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+ CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+ !$acc end host_data
+
+ ! Compute sharpening term
+ !$acc kernels
+ do k=1+halo_ext, piX%shape(3)-halo_ext
+ do j=1+halo_ext, piX%shape(2)-halo_ext
+ do i=1,nx
+ ip=i+1
+ jp=j+1
+ kp=k+1
+ im=i-1
+ jm=j-1
+ km=k-1
+ if (ip .gt. nx) ip=1
+ if (im .lt. 1) im=nx
+ normx_xm = 0.5d0*(normx(im,j,k)+normx(i,j,k))
+ normx_xp = 0.5d0*(normx(ip,j,k)+normx(i,j,k))
+ normx_ym = 0.5d0*(normx(i,jm,k)+normx(i,j,k))
+ normx_yp = 0.5d0*(normx(i,jp,k)+normx(i,j,k))
+ normx_zm = 0.5d0*(normx(i,j,km)+normx(i,j,k))
+ normx_zp = 0.5d0*(normx(i,j,kp)+normx(i,j,k))
+ normy_xm = 0.5d0*(normy(im,j,k)+normy(i,j,k))
+ normy_xp = 0.5d0*(normy(ip,j,k)+normy(i,j,k))
+ normy_ym = 0.5d0*(normy(i,jm,k)+normy(i,j,k))
+ normy_yp = 0.5d0*(normy(i,jp,k)+normy(i,j,k))
+ normy_zm = 0.5d0*(normy(i,j,km)+normy(i,j,k))
+ normy_zp = 0.5d0*(normy(i,j,kp)+normy(i,j,k))
+ normz_xm = 0.5d0*(normz(im,j,k)+normz(i,j,k))
+ normz_xp = 0.5d0*(normz(ip,j,k)+normz(i,j,k))
+ normz_ym = 0.5d0*(normz(i,jm,k)+normz(i,j,k))
+ normz_yp = 0.5d0*(normz(i,jp,k)+normz(i,j,k))
+ normz_zm = 0.5d0*(normz(i,j,km)+normz(i,j,k))
+ normz_zp = 0.5d0*(normz(i,j,kp)+normz(i,j,k))
+ ! sharpening term
+ !
+ rn_01 = normx_xm/(sqrt(normx_xm**2.0d0+normy_xm**2.0d0+normz_xm**2.0d0)+enum)
+ rn_11 = normx_xp/(sqrt(normx_xp**2.0d0+normy_xp**2.0d0+normz_xp**2.0d0)+enum)
+ rn_02 = normy_ym/(sqrt(normx_ym**2.0d0+normy_ym**2.0d0+normz_ym**2.0d0)+enum)
+ rn_12 = normy_yp/(sqrt(normx_yp**2.0d0+normy_yp**2.0d0+normz_yp**2.0d0)+enum)
+ rn_03 = normz_zm/(sqrt(normx_zm**2.0d0+normy_zm**2.0d0+normz_zm**2.0d0)+enum)
+ rn_13 = normz_zp/(sqrt(normx_zp**2.0d0+normy_zp**2.0d0+normz_zp**2.0d0)+enum)
+ !
+ sharpxm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(im,j,k))*epsi))**2.0d0)*rn_01)
+ sharpxp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(ip,j,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_11)
+ sharpym = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,jm,k))*epsi))**2.0d0)*rn_02)
+ sharpyp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,jp,k)+psidi(i,j,k))*epsi))**2.0d0)*rn_12)
+ sharpzm = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,k)+psidi(i,j,km))*epsi))**2.0d0)*rn_03)
+ sharpzp = 0.25d0*gamma*((1.0d0-(tanh(0.25d0*(psidi(i,j,kp)+psidi(i,j,k))*epsi))**2.0d0)*rn_13)
+ !
+ rhsphik4(i,j,k)=rhsphik4(i,j,k)-dxi*((sharpxp-sharpxm)+(sharpyp-sharpym)+(sharpzp-sharpzm))
+ enddo
+ enddo
+ enddo
+ !$acc end kernels
+
+ ! Update phi with RK4
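+ ! classical RK4 weights: k1 = rhsphi, k2 = rhsphik2, k3 = rhsphik3, k4 = rhsphik4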
!$acc kernels
do k=1+halo_ext, piX%shape(3)-halo_ext
do j=1+halo_ext, piX%shape(2)-halo_ext
do i=1,nx
- phi(i,j,k) = phi(i,j,k) + dt*(alpha*rhsphi(i,j,k)-beta*rhsphi_o(i,j,k))
- rhsphi_o(i,j,k)=rhsphi(i,j,k)
+ phi(i,j,k) = phi(i,j,k) + dt/6.0d0*(rhsphi(i,j,k) + 2.0d0*rhsphik2(i,j,k) + &
+ 2.0d0*rhsphik3(i,j,k) + rhsphik4(i,j,k))
enddo
enddo
enddo
!$acc end kernels
- ! 4.3 Call halo exchnages along Y and Z for phi
+ ! 4.3 Call halo exchanges along Y and Z for phi
!$acc host_data use_device(phi)
CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
@@ -732,7 +1150,7 @@ program main
if (ip .gt. nx) ip=1
if (im .lt. 1) im=nx
! continuum-surface force implementation
- curv=0.5d0*(normx(ip,j,k)-normx(im,j,k))*dxi + 0.5d0*(normy(i,jp,k)-normy(i,jm,k))*dxi + 0.5d0*(normz(i,j,kp)-normz(i,j,km))*dxi
+ curv=0.5d0*(normx_f(ip,j,k)-normx_f(im,j,k))*dxi + 0.5d0*(normy_f(i,jp,k)-normy_f(i,jm,k))*dxi + 0.5d0*(normz_f(i,j,kp)-normz_f(i,j,km))*dxi
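+ ! CSF: curv is div(n_f) by central differences of the (presumably filtered) unit normals
+ ! normx_f/normy_f/normz_f; with kappa = -div(n), the minus sign is carried in the forces below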
!compute capillary forces: sigma*curvature*gradphi
fxst(i,j,k)= -sigma*curv*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi
fyst(i,j,k)= -sigma*curv*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dxi
@@ -999,6 +1417,7 @@ program main
cou=gumax*dt*dxi
if (rank.eq.0) then
write(*,*) "CFL (max among tasks)", cou
+ write(*,*) "umax", gumax
if (cou .gt. 7) stop
if (.not. ieee_is_finite(cou)) then
print *, "Error: cou is infinite or NaN!"
@@ -1068,9 +1487,9 @@ program main
deallocate(tanh_psi, mysin, mycos)
deallocate(rhsu,rhsv,rhsw)
deallocate(rhsu_o,rhsv_o,rhsw_o)
-deallocate(phi,rhsphi,rhsphi_o,normx,normy,normz)
+deallocate(phi,rhsphi,normx,normy,normz)
+deallocate(phi_tmp, rhsphik2, rhsphik3, rhsphik4)
call mpi_finalize(ierr)
-end program main
-
+end program main
\ No newline at end of file
diff --git a/multi/module.f90 b/multi/module.f90
index 8168c1b..1a25def 100755
--- a/multi/module.f90
+++ b/multi/module.f90
@@ -64,10 +64,17 @@ end module velocity
module phase
- double precision, allocatable :: phi(:,:,:), rhsphi(:,:,:), rhsphi_o(:,:,:), psidi(:,:,:), tanh_psi(:,:,:)
+ double precision, allocatable :: phi(:,:,:), rhsphi(:,:,:), psidi(:,:,:), tanh_psi(:,:,:)
double precision, allocatable :: normx(:,:,:), normy(:,:,:), normz(:,:,:)
+ double precision, allocatable :: normx_f(:,:,:), normy_f(:,:,:), normz_f(:,:,:)
double precision :: curv
double precision, allocatable :: fxst(:,:,:), fyst(:,:,:), fzst(:,:,:)
+ double precision, allocatable :: rhsphik1(:,:,:),rhsphik2(:,:,:), rhsphik3(:,:,:), rhsphik4(:,:,:), phi_tmp(:,:,:)
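+ ! RK4 stage storage: phi_tmp holds the provisional field and the rhsphik* arrays the stage
+ ! RHS (main.f90 keeps k1 in rhsphi and uses rhsphik2..rhsphik4)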
+ double precision :: normx_xm,normx_xp,normx_ym,normx_yp,normx_zm,normx_zp
+ double precision :: normy_xm,normy_xp,normy_ym,normy_yp,normy_zm,normy_zp
+ double precision :: normz_xm,normz_xp,normz_ym,normz_yp,normz_zm,normz_zp
+ double precision :: rn_01,rn_11,rn_02,rn_12,rn_03,rn_13
+ double precision :: sharpxm,sharpxp,sharpym,sharpyp,sharpzm,sharpzp
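+ ! per-point scalars for the sharpening kernels: face-averaged normal components (norm*_*),
+ ! normalized face normals (rn_*) and the six face sharpening fluxes (sharp*)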
end module phase