diff --git a/README.md b/README.md
index e472c2c..66a3b3b 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # MHIT36
 
-Code for direct numerical simulation of Navier-Stokes equation coupled with a phase-field method (ACDI) for interface description.
+Code for direct numerical simulation of Navier-Stokes equations coupled with a phase-field method (ACDI) for interface description.
 
 ~~~text
 ███    ███ ██   ██ ██ ████████ ██████   ██████       
@@ -17,7 +17,7 @@ If you use this code, please cite the following work:
   @article{roccon2025,
   title   = {MHIT36: A Phase-Field Code for Gpu Simulations of Multiphase Homogeneous Isotropic Turbulence},
   author  = {Roccon, A. and Enzenberger, L. and Zaza, D. and Soldati, A.},
-  journal = {SSRN},
+  journal = {Computer Physics Communications (in press)},
   year    = {2025},
   doi     = {http://dx.doi.org/10.2139/ssrn.5264052}
 }
@@ -58,15 +58,36 @@ If you use this code, please cite the following work:
 
 ## Reference performance
 
-Performance (NS only) - from last push
-* 512 x 512 x 512 | 4 x A100@Leonardo  |  19 ms/timestep
-* 1024 x 1024 x 1024 | 4 x GH200@Nvidia | 161.9 ms/timestep (out of date)
-* 1024 x 1024 x 1024 | 8 x A100@Leonardo | 150.1 ms/timestep
-* 512 x 512 x 512 | 4 x H100@MN5-ACC   |  18 ms/timestep (out of date)
+Performance (NS only)
+* 256 x 256 x 256 | 4 x A100@Leonardo  |   16 ms/timestep
+* 512 x 512 x 512 | 4 x A100@Leonardo  |  270 ms/timestep 
+* 1024 x 1024 x 1024 | 32 x A100@Leonardo | 357 ms/timestep 
+* 2048 x 2048 x 2048 | 128 x A100@Leonardo | 820 ms/timestep
+* 256 x 256 x 256 | 4 x H100@MN5-ACC   |   13 ms/timestep
+* 512 x 512 x 512 | 4 x H100@MN5-ACC   |  230 ms/timestep
+* 1024 x 1024 x 1024 | 32 x H100@MN5-ACC  | 320 ms/timestep
+* 2048 x 2048 x 2048 | 512 x H100@MN5-ACC  | 259 ms/timestep
 
-Performance (NS + ACDI) - from last push
-* 512 x 512 x 512 | 4 x A100@Leonardo  |  28 ms/timestep
-* 1024 x 1024 x 1024 | 8 x A100@Leonardo | 288 ms/timestep
+Phase-field introduces about 15% of overhead compared to NS only.
+
+## Scaling
+
+Strong scaling results obtained on Leonardo (4 x A100 64 GB x node) and MN5 (4 x H100 40 GB x node)
+* Tested from 1 node up to 128 nodes (Leonardo)
+* Tested from 1 node up to 256 nodes (MN5-ACC)
+* Grid from 64 x 64 x 64 up to 4096 x 4096 x 4096
+* Very similar scaling for both NS and NS+ACDI
+
+![Scal](val/scaling.png)
+
+
+## Validation
+
+Benchamrk present in "W.M.VanRees, A.Leonard, D.Pullin, P.Koumoutsakos, A comparison of vortex and pseudo-spectral methods for the simulation of periodic vortical flows at high Reynolds numbers,J. Comput. Phys.2 30(8)(2011)2794–2805" and also Used in CaNS.
+
+Time evolution of the viscous dissipation:
+
+![Test](val/val.png)
 
 
 ## Contributing
diff --git a/make_lib_leo.sh b/make_lib_leo.sh
index c942f50..530055f 100755
--- a/make_lib_leo.sh
+++ b/make_lib_leo.sh
@@ -6,4 +6,6 @@ module load profile/candidate
 module load nvhpc/25.3
 module load hpcx-mpi/2.19
 cmake ..
+#enable nvshmem
+#cmake -DCUDECOMP_BUILD_EXTRAS=1 -DCUDECOMP_ENABLE_NVSHMEM=1 ..
 make -j
diff --git a/multi/Makefile_local b/multi/Makefile_local
index d928029..fc154b8 100644
--- a/multi/Makefile_local
+++ b/multi/Makefile_local
@@ -4,7 +4,7 @@ LD	= $(FC)
 RM	= /bin/rm -f
 
 # Paths (Modify if necessary)
-CUDECOMP_DIR = /home/milton/MHIT36_cuDecomp/cuDecomp/build/
+CUDECOMP_DIR = /home/milton/MHIT36/cuDecomp/build/
 
 # Compiler and Linker Flags
 FFLAGS = -fast -acc -gpu=managed -Minfo=accel -Mfree -Mpreprocess -cpp -Minfo=accel -cuda  -I$(CUDECOMP_DIR)/include
diff --git a/multi/PoissonExampleD2Z.f90 b/multi/PoissonExampleD2Z.f90
deleted file mode 100755
index a8b4820..0000000
--- a/multi/PoissonExampleD2Z.f90
+++ /dev/null
@@ -1,339 +0,0 @@
-!Optimized verison compared to the stanard one as it use D2Z in the first step
-! This avoid transpostions with nx x ny x nz complex and instead use nx/2+1 x ny x nz
-
-
-#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1)
-!#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) exit(1)
-
-! Solves poisson equation
-!
-!   u_xx + u_yy + u_zz= phi(x,y,z)
-!
-! where
-!
-!   phi(x,y,z) = sin(Mx*x)*sin(My*y)*sin(Mz*z)
-!
-! on domain 0 <= x <= 2*pi, 0 <= y <= 2*pi, 0 <= z <= 2*pi
-
-program main
-  use cudafor
-  use cudecomp
-  use cufft
-  use mpi
-
-  implicit none
-
-  ! Command line arguments
-  ! grid dimensions
-  integer :: nx, ny, nz
-  integer :: comm_backend
-  integer :: pr, pc
-
-  ! grid size
-  real(8), parameter :: Lx = 1.0, Ly = 1.0, Lz = 1.0
-  ! modes for phi
-  integer, parameter :: Mx = 1, My = 2, Mz = 1
-  real(8), parameter :: twopi = 8.0_8*atan(1.0_8)
-  real(8) :: err, maxErr = -1.0, times, timef
-  character(len=40) :: namefile
-
-
-  ! grid
-  real(8), allocatable :: x(:)
-  real(8) :: hx, hy, hz
-  ! wavenumbers
-  real(8), allocatable :: kx(:)
-  real(8), device, allocatable :: kx_d(:)
-
-  real(8), allocatable :: phi(:), ua(:,:,:), rhsp(:,:,:), p(:,:,:)
-  !complex(8), allocatable :: rhspc(:,:,:)
-  complex(8), device, allocatable :: phi_d(:)
-  complex(8), pointer, device, contiguous :: work_d(:)
-
-  ! MPI
-  integer :: rank, ranks, ierr
-  integer :: localRank, localComm
-
-  ! cudecomp
-  type(cudecompHandle) :: handle
-  type(cudecompGridDesc) :: grid_desc
-  type(cudecompGridDescConfig) :: config
-  type(cudecompGridDescAutotuneOptions) :: options
-
-  integer :: pdims(2)
-  integer :: gdims(3)
-  integer :: npx, npy, npz
-  type(cudecompPencilInfo) :: piX, piY, piZ
-  integer(8) :: nElemX, nElemY, nElemZ, nElemWork
-
-  ! CUFFT
-  integer :: planX, planY, planZ, planXF, planXB
-  integer :: batchsize
-  integer :: status
-
-  logical :: skip_next
-  character(len=16) :: arg
-  integer :: i, j ,k , jl, kl, jg, kg
-
-
-  ! MPI initialization
-
-  call mpi_init(ierr)
-  if (ierr /= MPI_SUCCESS) write(*,*) 'mpi_init failed: ', ierr
-  call mpi_comm_rank(MPI_COMM_WORLD, rank, ierr)
-  if (ierr /= MPI_SUCCESS) write(*,*) 'mpi_comm_rank failed: ', ierr
-  call mpi_comm_size(MPI_COMM_WORLD, ranks, ierr)
-  if (ierr /= MPI_SUCCESS) write(*,*) 'mpi_comm_size failed: ', ierr
-
-  call mpi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, localComm, ierr)
-  if (ierr /= MPI_SUCCESS) write(*,*) 'mpi_comm_split_type failed: ', ierr
-  call mpi_comm_rank(localComm, localRank, ierr)
-  if (ierr /= MPI_SUCCESS) write(*,*) 'mpi_comm_rank on local rank failed: ', ierr
-  ierr = cudaSetDevice(localRank)
-
-  ! Parse command-line arguments
-  nx = 64
-  ny = nx
-  nz = nx
-  pr = 2
-  pc = 1
-  comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P
-
-
-  ! cudecomp initialization
-
-  CHECK_CUDECOMP_EXIT(cudecompInit(handle, MPI_COMM_WORLD))
-
-  CHECK_CUDECOMP_EXIT(cudecompGridDescConfigSetDefaults(config))
-  pdims = [pr, pc]
-  config%pdims = pdims
-  gdims = [ nx/2+1, ny, nz]
-  config%gdims = gdims
-  config%transpose_comm_backend = comm_backend
-  config%transpose_axis_contiguous = .true.
-
-  CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(options))
-  options%dtype = CUDECOMP_DOUBLE_COMPLEX
-  if (comm_backend == 0) then
-    options%autotune_transpose_backend = .true.
-  endif
-
-  CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_desc, config, options))
-
-  if (rank == 0) then
-     write(*,"('Running on ', i0, ' x ', i0, ' process grid ...')") config%pdims(1), config%pdims(2)
-     write(*,"('Using ', a, ' backend ...')") cudecompTransposeCommBackendToString(config%transpose_comm_backend)
-  end if
-
-  ! get pencil info
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piX, 1))
-  nElemX = piX%size
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piY, 2))
-  nElemY = piY%size
-  CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, piZ, 3))
-  nElemZ = piZ%size
-
-  ! get workspace size
-  CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_desc, nElemWork))
-
-  ! CUFFT initialization
-
-  batchSize = piX%shape(2)*piX%shape(3)
-  status = cufftPlan1D(planX, nx, CUFFT_Z2Z, batchSize)
-  if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X plan'
-
-  batchSize = piX%shape(2)*piX%shape(3)
-  status = cufftPlan1D(planXF, nx, CUFFT_D2Z, batchSize)
-  if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X-forward plan'
-
-  batchSize = piX%shape(2)*piX%shape(3)
-  status = cufftPlan1D(planXB, nx, CUFFT_Z2D, batchSize)
-  if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating X-backward plan'
-
-  batchSize = piY%shape(2)*piY%shape(3)
-  status = cufftPlan1D(planY, ny, CUFFT_Z2Z, batchSize)
-  if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Y plan'
-
-  batchSize = piZ%shape(2)*piZ%shape(3)
-  status = cufftPlan1D(planZ, nz, CUFFT_Z2Z, batchSize)
-  if (status /= CUFFT_SUCCESS) write(*,*) rank, ': Error in creating Z plan'
-
-
-  ! Physical grid
-
-  allocate(x(nx))
-
-  hx = twopi/nx
-  do i = 1, nx
-     x(i) = hx*i
-  enddo
-  ! Wavenumbers
-
-  allocate(kx(nx))
-
-  do i = 1, nx/2
-     kx(i) = (i-1)
-  enddo
-  do i = nx/2+1, nx
-     kx(i) = (i-1-nx)
-  enddo
-  allocate(kx_d, source=kx)
-
-  ! allocate arrays
-
-  allocate(phi(max(nElemX, nElemY, nElemZ)))
-  allocate(phi_d, mold=phi)
-  allocate(ua(nx, piX%shape(2), piX%shape(3)))
-  allocate(rhsp(nx, piX%shape(2), piX%shape(3)))
-  !allocate(rhspc(nx, piX%shape(2), piX%shape(3)))
-  allocate(p(nx, piX%shape(2), piX%shape(3)))
-
-
-  CHECK_CUDECOMP_EXIT(cudecompMalloc(handle, grid_desc, work_d, nElemWork))
-
- !initialize rigth hand side + analytic solution
-    do kl = 1, piX%shape(3)
-       kg = piX%lo(3) + kl - 1
-       do jl = 1, piX%shape(2)
-          jg = piX%lo(2) + jl - 1
-          do i = 1, nx
-             rhsp(i,jl,kl) = sin(Mx*x(i))*sin(My*x(jg))*sin(Mz*x(kg))
-             !rhspc(i,jl,kl) = cmplx(rhsp(i,jl,kl),0.d0)
-             ua(i,jl,kl) = -rhsp(i,jl,kl)/(Mx**2 + My**2 + Mz**2)
-          enddo
-       enddo
-  enddo
-
-   write(namefile,'(a,i3.3,a)') 'rhsp_',rank,'.dat'
-   open(unit=55,file=namefile,form='unformatted',position='append',access='stream',status='new')
-   write(55) rhsp
-   close(55)
-
-  call cpu_time(times)
-
-  ! phi(x,y,z) -> phi(kx,y,z)
-  !$acc host_data use_device(rhsp)
-  status = cufftExecD2Z(planXF, rhsp, phi_d)
-  if (status /= CUFFT_SUCCESS) write(*,*) 'X forward error: ', status
-  !$acc end host_data
-  ! phi(kx,y,z) -> phi(y,z,kx)
-  CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_desc, phi_d, phi_d, work_d, CUDECOMP_DOUBLE_COMPLEX))
-  ! phi(y,z,kx) -> phi(ky,z,kx)
-  status = cufftExecZ2Z(planY, phi_d, phi_d, CUFFT_FORWARD)
-  if (status /= CUFFT_SUCCESS) write(*,*) 'Y forward error: ', status
-  ! phi(ky,z,kx) -> phi(z,kx,ky)
-  CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_desc, phi_d, phi_d, work_d, CUDECOMP_DOUBLE_COMPLEX))
-  ! phi(z,kx,ky) -> phi(kz,kx,ky)
-  status = cufftExecZ2Z(planZ, phi_d, phi_d, CUFFT_FORWARD)
-  if (status /= CUFFT_SUCCESS) write(*,*) 'Z forward error: ', status
-
-
-  block
-    complex(8), device, pointer :: phi3d(:,:,:)
-    real(8) :: k2
-    integer :: il, jl, ig, jg
-    integer :: offsets(3), xoff, yoff
-    integer :: np(3)
-    np(piZ%order(1)) = piZ%shape(1)
-    np(piZ%order(2)) = piZ%shape(2)
-    np(piZ%order(3)) = piZ%shape(3)
-    call c_f_pointer(c_devloc(phi_d), phi3d, piZ%shape)
-
-
-    ! divide by -K**2, and normalize
-
-    offsets(piZ%order(1)) = piZ%lo(1) - 1
-    offsets(piZ%order(2)) = piZ%lo(2) - 1
-    offsets(piZ%order(3)) = piZ%lo(3) - 1
-
-    xoff = offsets(1)
-    yoff = offsets(2)
-    npx = np(1)
-    npy = np(2)
-    !$cuf kernel do (2)
-    do jl = 1, npy
-       do il = 1, npx
-          jg = yoff + jl
-          ig = xoff + il
-          do k = 1, nz
-              k2 = kx_d(ig)**2 + kx_d(jg)**2 + kx_d(k)**2
-              !phi3d(k,il,jl) = phi3d(k,il,jl)/(int(nx,8)*int(ny,8)*int(nz,8))    
-              phi3d(k,il,jl) = -phi3d(k,il,jl)/k2/(int(nx,8)*int(ny,8)*int(nz,8))
-          enddo
-       enddo
-    enddo
-
-    ! specify mean (corrects division by zero wavenumber above)
-    if (xoff == 0 .and. yoff == 0) phi3d(1,1,1) = 0.0
-
-  end block
-
-
-  ! phi(kz,kx,ky) -> phi(z,kx,ky)
-  status = cufftExecZ2Z(planZ, phi_d, phi_d, CUFFT_INVERSE)
-  if (status /= CUFFT_SUCCESS) write(*,*) 'Z inverse error: ', status
-  ! phi(z,kx,ky) -> phi(ky,z,kx)
-  CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_desc, phi_d, phi_d, work_d, CUDECOMP_DOUBLE_COMPLEX))
-  ! phi(ky,z,kx) -> phi(y,z,kx)
-  status = cufftExecZ2Z(planY, phi_d, phi_d, CUFFT_INVERSE)
-  if (status /= CUFFT_SUCCESS) write(*,*) 'Y inverse error: ', status
-  ! phi(y,z,kx) -> phi(kx,y,z)
-  CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc, phi_d, phi_d, work_d, CUDECOMP_DOUBLE_COMPLEX))
-  ! phi(kx,y,z) -> phi(x,y,z)
-  !$acc host_data use_device(p)
-  status = cufftExecZ2D(planXB, phi_d, p)
-  if (status /= CUFFT_SUCCESS) write(*,*) 'X inverse error: ', status
-  !$acc end host_data
-
-  err=0.d0
-  maxErr=-1.d0
-  !$acc kernels
-    do kl = 1, piX%shape(3)
-       do jl = 1, piX%shape(2)
-          do i = 1, nx
-             err = abs(ua(i,jl,kl)-p(i,jl,kl))
-             !rhsp(i,jl,kl)=real(rhspc(i,jl,kl))
-             if (err > maxErr) maxErr = err
-          enddo
-       enddo
-    enddo
-    !$acc end kernels
-
-    write(namefile,'(a,i3.3,a)') 'rhspo_',rank,'.dat'
-    open(unit=55,file=namefile,form='unformatted',position='append',access='stream',status='new')
-    write(55) p
-    close(55)
-
-    write(namefile,'(a,i3.3,a)') 'ua_',rank,'.dat'
-    open(unit=55,file=namefile,form='unformatted',position='append',access='stream',status='new')
-    write(55) ua
-    close(55)
-
-
-    write(*,"('[', i0, '] Max Error: ', e12.6)") rank, maxErr
-
-  ! cleanup
-
-  !status = cufftDestroy(planX)
-  !if (status /= CUFFT_SUCCESS) write(*,*) 'X plan destroy: ', status
-  status = cufftDestroy(planXF)
-  if (status /= CUFFT_SUCCESS) write(*,*) 'XF plan destroy: ', status
-  status = cufftDestroy(planXB)
-  if (status /= CUFFT_SUCCESS) write(*,*) 'XB plan destroy: ', status
-  status = cufftDestroy(planY)
-  if (status /= CUFFT_SUCCESS) write(*,*) 'Y plan destroy: ', status
-  status = cufftDestroy(planZ)
-  if (status /= CUFFT_SUCCESS) write(*,*) 'Z plan destroy: ', status
-
-  deallocate(x)
-  deallocate(kx)
-  deallocate(kx_d)
-  deallocate(phi, phi_d, ua)
-
-  CHECK_CUDECOMP_EXIT(cudecompFree(handle, grid_desc, work_d))
-  CHECK_CUDECOMP_EXIT(cudecompGridDescDestroy(handle, grid_desc))
-  CHECK_CUDECOMP_EXIT(cudecompFinalize(handle))
-
-  call mpi_finalize(ierr)
-
-end program main
\ No newline at end of file
diff --git a/multi/go_leo.sh b/multi/go_leo.sh
index 2be0e05..3bb0e99 100644
--- a/multi/go_leo.sh
+++ b/multi/go_leo.sh
@@ -25,7 +25,4 @@ echo "Using directory: $ROOT_DIR"
 export LD_LIBRARY_PATH=$ROOT_DIR:$LD_LIBRARY_PATH
 
 chmod 777 binder.sh
-mpirun -np 4 --map-by node:PE=8 --rank-by core  ./binder.sh ./mhit36
-
-# mpirun -n 4 nsys profile --trace=cuda,nvtx,mpi -o profile_output_%q{SLURM_PROCID} --stats=true ./mhit36
-
+mpirun -np 4 --map-by node:PE=8 --rank-by core  ./binder.sh ./mhit36
\ No newline at end of file
diff --git a/multi/go_leo_prof.sh b/multi/go_leo_prof.sh
index 6368296..ebcb52e 100755
--- a/multi/go_leo_prof.sh
+++ b/multi/go_leo_prof.sh
@@ -49,5 +49,3 @@ mpirun -np 4 --map-by node:PE=8 --rank-by core nsys profile  -t cuda,nvtx,mpi,op
 # for nsight compute report - all kernels + mapping + nic
 # mpirun -np 4 --map-by node:PE=8 --rank-by core ncu --kernel-name regex:main_ --set=full --import-source=yes --launch-skip 70 --launch-count 18 -o reportall.%p ./binder.sh ./mhit36
 
-
-
diff --git a/multi/local.sh b/multi/local.sh
index 6b4e9dc..04116e8 100644
--- a/multi/local.sh
+++ b/multi/local.sh
@@ -4,7 +4,7 @@ MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/24.3/compilers/man; export MANPATH
 PATH=$NVCOMPILERS/$NVARCH/24.3/compilers/bin:$PATH; export PATH
 export PATH=$NVCOMPILERS/$NVARCH/24.3/comm_libs/mpi/bin:$PATH
 export MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/24.3/comm_libs/mpi/man
-LD_LIBRARY_PATH=/home/milton/MHIT36_cuDecomp/cuDecomp/build/lib
+LD_LIBRARY_PATH=/home/milton/MHIT36/cuDecomp/build/lib
 #clean folder output
 rm -rf output
 mkdir output
diff --git a/multi/main.f90 b/multi/main.f90
index 04a2cdf..7b8b1ad 100755
--- a/multi/main.f90
+++ b/multi/main.f90
@@ -10,7 +10,6 @@ program main
 use param
 use mpivar
 use cudecompvar
-use nvtx
 
 
 implicit none
@@ -56,7 +55,7 @@ program main
 integer :: np(3)
 
 ! Enable or disable phase field (acceleration eneabled by default)
-#define phiflag 1
+#define phiflag 0
 
 !########################################################################################################################################
 ! 1. INITIALIZATION OF MPI AND cuDECOMP AUTOTUNING : START
@@ -137,15 +136,14 @@ program main
 options%halo_axis = 1
 CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_desc, config, options))
 
-! Print information on configuration
-!if (rank == 0) then
-!   write(*,"(' Running on ', i0, ' x ', i0, ' process grid ...')") config%pdims(1), config%pdims(2)
-!   write(*,"(' Using ', a, ' transpose backend ...')") &
-!            cudecompTransposeCommBackendToString(config%transpose_comm_backend)
-!   write(*,"(' Using ', a, ' halo backend ...')") &
-!            cudecompHaloCommBackendToString(config%halo_comm_backend)
-!endif
-
+Print information on configuration
+if (rank == 0) then
+   write(*,"(' Running on ', i0, ' x ', i0, ' process grid ...')") config%pdims(1), config%pdims(2)
+   write(*,"(' Using ', a, ' transpose backend ...')") &
+         cudecompTransposeCommBackendToString(config%transpose_comm_backend)
+   write(*,"(' Using ', a, ' halo backend ...')") &
+         cudecompHaloCommBackendToString(config%halo_comm_backend)
+endif
 
 ! Get pencil info for the grid descriptor in the physical space
 ! This function returns a pencil struct (piX, piY or piZ) that contains the shape, global lower and upper index bounds (lo and hi), 
@@ -167,16 +165,6 @@ program main
 CHECK_CUDECOMP_EXIT(cudecompGetTransposeWorkspaceSize(handle, grid_desc, nElemWork))
 CHECK_CUDECOMP_EXIT(cudecompGetHaloWorkspaceSize(handle, grid_desc, 1, halo, nElemWork_halo))
 
-
-
-
-
-
-! Get pencil info for the grid descriptor in the complex space 
-!gdims = [nx/2+1, ny, nz]
-!config%gdims = gdims
-!CHECK_CUDECOMP_EXIT(cudecompGridDescCreate(handle, grid_descD2Z, config, options))
-
 CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_descD2Z, piX_d2z, 1,halo))
 nElemX_d2z = piX_d2z%size !<- number of total elments in x-configuratiion (include halo)
 ! Pencil info in Y-configuration present in PiY
@@ -191,9 +179,6 @@ program main
 
 
 
-
-
-
 ! CUFFT initialization -- Create Plans
 ! Forward 1D FFT in X: D2Z
 batchSize = piX_d2z%shape(2)*piX_d2z%shape(3) !<- number of FFT (from x-pencil dimension)
@@ -239,18 +224,17 @@ program main
    ! compute here the cos to avoid multiple computations of cos
    mycos(i)=cos(k0*x(i)+dx/2)
 enddo
+
 !########################################################################################################################################
 ! 1. INITIALIZATION AND cuDECOMP AUTOTUNING : END
 !########################################################################################################################################
 
 
 
-
-
-
 !########################################################################################################################################
 ! START STEP 2: ALLOCATE ARRAYS
 !########################################################################################################################################
+
 ! allocate arrays
 allocate(psi(max(nElemX, nElemY, nElemZ))) !largest among the pencil
 allocate(psi_real(max(nElemX, nElemY, nElemZ))) !largest among the pencil
@@ -270,11 +254,11 @@ program main
 allocate(div(piX%shape(1),piX%shape(2),piX%shape(3)))
 !PFM variables
 #if phiflag == 1
-allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)))
-allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3)))
-allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3)))
-allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3)))
-allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces
+   allocate(phi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi(piX%shape(1),piX%shape(2),piX%shape(3)),rhsphi_o(piX%shape(1),piX%shape(2),piX%shape(3)))
+   allocate(psidi(piX%shape(1),piX%shape(2),piX%shape(3)))
+   allocate(tanh_psi(piX%shape(1),piX%shape(2),piX%shape(3)))
+   allocate(normx(piX%shape(1),piX%shape(2),piX%shape(3)),normy(piX%shape(1),piX%shape(2),piX%shape(3)),normz(piX%shape(1),piX%shape(2),piX%shape(3)))
+   allocate(fxst(piX%shape(1),piX%shape(2),piX%shape(3)),fyst(piX%shape(1),piX%shape(2),piX%shape(3)),fzst(piX%shape(1),piX%shape(2),piX%shape(3))) ! surface tension forces
 #endif
 
 ! allocate arrays for transpositions and halo exchanges 
@@ -289,18 +273,13 @@ program main
 
 
 
-
-
-
-
-
 !########################################################################################################################################
 ! START STEP 3: FLOW AND PHASE FIELD INIT
 !########################################################################################################################################
 ! 3.1 Read/initialize from data without halo grid points (avoid out-of-bound if reading usin MPI I/O)
 ! 3.2 Call halo exchnages along Y and Z for u,v,w and phi
 if (restart .eq. 0) then !fresh start Taylor Green or read from file in init folder
-if (rank.eq.0) write(*,*) "Initialize velocity field (fresh start)"
+   if (rank.eq.0) write(*,*) "Initialize velocity field (fresh start)"
    if (inflow .eq. 0) then
       if (rank.eq.0) write(*,*) "Initialize Taylor-green"
       do k = 1+halo_ext, piX%shape(3)-halo_ext
@@ -316,12 +295,12 @@ program main
       enddo
    endif
    if (inflow .eq. 1) then
-   if (rank.eq.0)  write(*,*) "Initialize from data"
-         call readfield(1)
-         call readfield(2)
-         call readfield(3)
-      endif
+      if (rank.eq.0)  write(*,*) "Initialize from data"
+      call readfield(1)
+      call readfield(2)
+      call readfield(3)
    endif
+endif
 if (restart .eq. 1) then !restart, ignore inflow and read the tstart field 
    if (rank.eq.0)  write(*,*) "Initialize velocity field (from output folder), iteration:", tstart
    call readfield_restart(tstart,1)
@@ -345,35 +324,35 @@ program main
 
 ! initialize phase-field
 #if phiflag == 1
-if (restart .eq. 0) then
-if (rank.eq.0) write(*,*) 'Initialize phase field (fresh start)'
-   if (inphi .eq. 0) then
-   if (rank.eq.0) write(*,*) 'Spherical drop'
-      do k = 1+halo_ext, piX%shape(3)-halo_ext
-      kg = piX%lo(3) + k - 1 
-         do j = 1+halo_ext, piX%shape(2)-halo_ext
-         jg = piX%lo(2) + j - 1 
-            do i = 1, piX%shape(1)
-                pos=(x(i)-lx/2)**2d0 +  (x(jg)-lx/2)**2d0 + (x(kg)-lx/2)**2d0
-                phi(i,j,k) = 0.5d0*(1.d0-tanh((sqrt(pos)-radius)/2/eps))
+   if (restart .eq. 0) then
+      if (rank.eq.0) write(*,*) 'Initialize phase field (fresh start)'
+      if (inphi .eq. 0) then
+         if (rank.eq.0) write(*,*) 'Spherical drop'
+         do k = 1+halo_ext, piX%shape(3)-halo_ext
+            kg = piX%lo(3) + k - 1 
+            do j = 1+halo_ext, piX%shape(2)-halo_ext
+               jg = piX%lo(2) + j - 1 
+               do i = 1, piX%shape(1)
+                  pos=(x(i)-lx/2)**2d0 +  (x(jg)-lx/2)**2d0 + (x(kg)-lx/2)**2d0
+                  phi(i,j,k) = 0.5d0*(1.d0-tanh((sqrt(pos)-radius)/2/eps))
+               enddo
             enddo
-        enddo
-    enddo
+         enddo
+      endif
+      if (inphi .eq. 1) then
+         if (rank.eq.0)  write(*,*) "Initialize phase-field from data"
+         call readfield(5)
+      endif
    endif
-   if (inphi .eq. 1) then
-      if (rank.eq.0)  write(*,*) "Initialize phase-field from data"
-      call readfield(5)
+   if (restart .eq. 1) then
+      write(*,*) "Initialize phase-field (restart, from output folder), iteration:", tstart
+      call readfield_restart(tstart,5)
    endif
-endif
-if (restart .eq. 1) then
-    write(*,*) "Initialize phase-field (restart, from output folder), iteration:", tstart
-    call readfield_restart(tstart,5)
-endif
-! update halo
-!$acc host_data use_device(phi)
-CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
-CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
-!$acc end host_data 
+   ! update halo
+   !$acc host_data use_device(phi)
+   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+   !$acc end host_data 
 #endif
 
 !Save initial fields (only if a fresh start)
@@ -384,22 +363,20 @@ program main
    call writefield(tstart,3)
    call writefield(tstart,4)
    #if phiflag == 1
-   call writefield(tstart,5)
+      call writefield(tstart,5)
    #endif
 endif
+
 !########################################################################################################################################
 ! END STEP 3: FLOW AND PHASE FIELD INIT
 !########################################################################################################################################
 
 
 
-
-
-
-
-! ########################################################################################################################################
+!########################################################################################################################################
 ! START TEMPORAL LOOP: STEP 4 to 9 REPEATED AT EVERY TIME STEP
-! ########################################################################################################################################
+!########################################################################################################################################
+
 ! First step use Euler
 alpha=1.0d0
 beta=0.0d0
@@ -410,99 +387,103 @@ program main
 !$acc data create(rhsu_o, rhsv_o, rhsw_o)
 !$acc data copyin(mysin, mycos)
 call cpu_time(t_start)
+
 ! Start temporal loop
 do t=tstart,tfin
-    ! Create custom label for each marker
+   ! Create custom label for each marker
     write(itcount,'(i4)') t
-    ! Range with custom  color
-    call nvtxStartRange("Iteration "//itcount,t)
+   ! Range with custom  color (uncomment for profiling)
+   ! call nvtxStartRange("Iteration "//itcount,t)
 
     if (rank.eq.0) write(*,*) "Time step",t,"of",tfin
     call cpu_time(times)
 
-   call nvtxStartRange("Phase-field")
+   ! (uncomment for profiling)
+   ! call nvtxStartRange("Phase-field")
+
    !########################################################################################################################################
    ! START STEP 4: PHASE-FIELD SOLVER (EXPLICIT)
    !########################################################################################################################################
+
    #if phiflag == 1
-   !$acc kernels
-   do k=1, piX%shape(3)
-      do j=1, piX%shape(2)
-         do i=1,nx
-            ! compute distance function psi (used to compute normals)
-            val = min(phi(i,j,k),1.0d0) ! avoid machine precision overshoots in phi that leads to problem with log
-            psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
-            ! compute here the tanh of distance function psi (used in the sharpening term) to avoid multiple computations of tanh
-            tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+      !$acc kernels
+      do k=1, piX%shape(3)
+         do j=1, piX%shape(2)
+            do i=1,nx
+               ! compute distance function psi (used to compute normals)
+               val = min(phi(i,j,k),1.0d0) ! avoid machine precision overshoots in phi that leads to problem with log
+               psidi(i,j,k) = eps*log((val+enum)/(1.d0-val+enum))
+               ! compute here the tanh of distance function psi (used in the sharpening term) to avoid multiple computations of tanh
+               tanh_psi(i,j,k) = tanh(0.5d0*psidi(i,j,k)*epsi)
+            enddo
          enddo
       enddo
-   enddo
-   !$acc end kernels
+      !$acc end kernels
 
-   gamma=1.d0*gumax
-   !$acc parallel loop tile(16,4,2)
-   do k=1+halo_ext, piX%shape(3)-halo_ext
-      do j=1+halo_ext, piX%shape(2)-halo_ext
-         do i=1,nx
-            ! 4.1 RHS computation
-            ip=i+1
-            jp=j+1
-            kp=k+1
-            im=i-1
-            jm=j-1
-            km=k-1
-            if (ip .gt. nx) ip=1
-            if (im .lt. 1) im=nx
-            ! convective (first three lines) and diffusive (last three lines)
-            rhsphi(i,j,k) =   &
-                  - (u(ip,j,k)*0.5d0*(phi(ip,j,k)+phi(i,j,k)) - u(i,j,k)*0.5d0*(phi(i,j,k)+phi(im,j,k)))*dxi  &  
-                  - (v(i,jp,k)*0.5d0*(phi(i,jp,k)+phi(i,j,k)) - v(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,jm,k)))*dxi  &  
-                  - (w(i,j,kp)*0.5d0*(phi(i,j,kp)+phi(i,j,k)) - w(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,j,km)))*dxi  &  
-                        + gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + &                   
-                                 eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddxi + &                   
-                                 eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi)                      
-            ! 4.1.3. Compute normals for sharpening term (gradient)
-            normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
-            normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
-            normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+      gamma=1.d0*gumax
+      !$acc parallel loop tile(16,4,2)
+      do k=1+halo_ext, piX%shape(3)-halo_ext
+         do j=1+halo_ext, piX%shape(2)-halo_ext
+            do i=1,nx
+               ! 4.1 RHS computation
+               ip=i+1
+               jp=j+1
+               kp=k+1
+               im=i-1
+               jm=j-1
+               km=k-1
+               if (ip .gt. nx) ip=1
+               if (im .lt. 1) im=nx
+               ! convective (first three lines) and diffusive (last three lines)
+               rhsphi(i,j,k) =   &
+                     - (u(ip,j,k)*0.5d0*(phi(ip,j,k)+phi(i,j,k)) - u(i,j,k)*0.5d0*(phi(i,j,k)+phi(im,j,k)))*dxi  &  
+                     - (v(i,jp,k)*0.5d0*(phi(i,jp,k)+phi(i,j,k)) - v(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,jm,k)))*dxi  &  
+                     - (w(i,j,kp)*0.5d0*(phi(i,j,kp)+phi(i,j,k)) - w(i,j,k)*0.5d0*(phi(i,j,k)+phi(i,j,km)))*dxi  &  
+                           + gamma*(eps*(phi(ip,j,k)-2.d0*phi(i,j,k)+phi(im,j,k))*ddxi + &                   
+                                    eps*(phi(i,jp,k)-2.d0*phi(i,j,k)+phi(i,jm,k))*ddxi + &                   
+                                    eps*(phi(i,j,kp)-2.d0*phi(i,j,k)+phi(i,j,km))*ddxi)                      
+               ! 4.1.3. Compute normals for sharpening term (gradient)
+               normx(i,j,k) = (psidi(ip,j,k) - psidi(im,j,k))
+               normy(i,j,k) = (psidi(i,jp,k) - psidi(i,jm,k))
+               normz(i,j,k) = (psidi(i,j,kp) - psidi(i,j,km))
+            enddo
          enddo
       enddo
-   enddo
-
-   ! Update normx,normy and normz halos, required to then compute normal derivative
-   !$acc host_data use_device(normx)
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
-   !$acc end host_data 
-   !$acc host_data use_device(normy)
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
-   !$acc end host_data 
-   !$acc host_data use_device(normz)
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
-   !$acc end host_data 
 
-   ! 4.1.3. Compute Sharpening term (gradient)
-   ! Substep 2: Compute normals (1.e-16 is a numerical tollerance to avodi 0/0)
-   !$acc kernels
-   do k=1, piX%shape(3)
-      do j=1, piX%shape(2)
-         do i=1,nx
-            normod = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + 1.0E-16)
-            ! normod = 1.d0/(sqrt(normx(i,j,k)**2d0 + normy(i,j,k)**2d0 + normz(i,j,k)**2d0) + 1.0E-16)
-            normx(i,j,k) = normx(i,j,k)*normod
-            normy(i,j,k) = normy(i,j,k)*normod
-            normz(i,j,k) = normz(i,j,k)*normod
+      ! Update normx,normy and normz halos, required to then compute normal derivative
+      !$acc host_data use_device(normx)
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normx, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+      !$acc end host_data 
+      !$acc host_data use_device(normy)
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normy, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+      !$acc end host_data 
+      !$acc host_data use_device(normz)
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, normz, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+      !$acc end host_data 
+
+      ! 4.1.3. Compute Sharpening term (gradient)
+      ! Substep 2: Compute normals (1.e-16 is a numerical tollerance to avodi 0/0)
+      !$acc kernels
+      do k=1, piX%shape(3)
+         do j=1, piX%shape(2)
+            do i=1,nx
+               normod = 1.d0/(sqrt(normx(i,j,k)*normx(i,j,k) + normy(i,j,k)*normy(i,j,k) + normz(i,j,k)*normz(i,j,k)) + 1.0E-16)
+               ! normod = 1.d0/(sqrt(normx(i,j,k)**2d0 + normy(i,j,k)**2d0 + normz(i,j,k)**2d0) + 1.0E-16)
+               normx(i,j,k) = normx(i,j,k)*normod
+               normy(i,j,k) = normy(i,j,k)*normod
+               normz(i,j,k) = normz(i,j,k)*normod
+            enddo
          enddo
       enddo
-   enddo
-   !$acc end kernels
+      !$acc end kernels
 
-   ! Compute sharpening term
-   !$acc kernels
-   do k=1+halo_ext, piX%shape(3)-halo_ext
-      do j=1+halo_ext, piX%shape(2)-halo_ext
+      ! Compute sharpening term
+      !$acc kernels
+      do k=1+halo_ext, piX%shape(3)-halo_ext
+         do j=1+halo_ext, piX%shape(2)-halo_ext
             do i=1,nx
                ip=i+1
                jp=j+1
@@ -512,61 +493,46 @@ program main
                km=k-1
                if (ip .gt. nx) ip=1
                if (im .lt. 1) im=nx
-               ! OLD CDI
-               !rhsphi(i,j,k)=rhsphi(i,j,k)+gamma*(((phi(ip,j,k)**2d0-phi(ip,j,k))*normx(ip,j,k)-(phi(im,j,k)**2d0-phi(im,j,k))*normx(im,j,k))*0.5d0*dxi + &
-               !                                    ((phi(i,jp,k)**2d0-phi(i,jp,k))*normy(i,jp,k)-(phi(i,jm,k)**2d0-phi(i,jm,k))*normy(i,jm,k))*0.5d0*dxi + &
-               !                                    ((phi(i,j,kp)**2d0-phi(i,j,kp))*normz(i,j,kp)-(phi(i,j,km)**2d0-phi(i,j,km))*normz(i,j,km))*0.5d0*dxi)
-               ! NEW ACDI
-               ! rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-(tanh(0.5d0*psidi(ip,j,k)*epsi))**2)*normx(ip,j,k)- 0.25d0*(1.d0-(tanh(0.5d0*psidi(im,j,k)*epsi))**2)*normx(im,j,k))*0.5*dxi +&
-               !                                    (0.25d0*(1.d0-(tanh(0.5d0*psidi(i,jp,k)*epsi))**2)*normy(i,jp,k)- 0.25d0*(1.d0-(tanh(0.5d0*psidi(i,jm,k)*epsi))**2)*normy(i,jm,k))*0.5*dxi +&
-               !  
-               ! ACDI improved version with pre-computed tanh                                
-                  rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-tanh_psi(ip,j,k)*tanh_psi(ip,j,k))*normx(ip,j,k) - &
-                                                      0.25d0*(1.d0-tanh_psi(im,j,k)*tanh_psi(im,j,k))*normx(im,j,k))*0.5*dxi + &
-                                                     (0.25d0*(1.d0-tanh_psi(i,jp,k)*tanh_psi(i,jp,k))*normy(i,jp,k) - &
-                                                      0.25d0*(1.d0-tanh_psi(i,jm,k)*tanh_psi(i,jm,k))*normy(i,jm,k))*0.5*dxi + &
-                                                     (0.25d0*(1.d0-tanh_psi(i,j,kp)*tanh_psi(i,j,kp))*normz(i,j,kp) - &
-                                                      0.25d0*(1.d0-tanh_psi(i,j,km)*tanh_psi(i,j,km))*normz(i,j,km))*0.5*dxi)
-
-               ! rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-(tanh(0.5d0*psidi(ip,j,k)*epsi))**2)*normx(ip,j,k)- &
-               !                                    0.25d0*(1.d0-(tanh(0.5d0*psidi(im,j,k)*epsi))**2)*normx(im,j,k))*0.5*dxi +&
-               !                                    (0.25d0*(1.d0-(tanh(0.5d0*psidi(i,jp,k)*epsi))**2)*normy(i,jp,k) - &
-                !                                    0.25d0*(1.d0-(tanh(0.5d0*psidi(i,jm,k)*epsi))**2)*normy(i,jm,k))*0.5*dxi +&
-               !  
 
+               ! ACDI with pre-computed tanh                                
+               rhsphi(i,j,k)=rhsphi(i,j,k)-gamma*((0.25d0*(1.d0-tanh_psi(ip,j,k)*tanh_psi(ip,j,k))*normx(ip,j,k) - &
+                                                   0.25d0*(1.d0-tanh_psi(im,j,k)*tanh_psi(im,j,k))*normx(im,j,k))*0.5*dxi + &
+                                                  (0.25d0*(1.d0-tanh_psi(i,jp,k)*tanh_psi(i,jp,k))*normy(i,jp,k) - &
+                                                   0.25d0*(1.d0-tanh_psi(i,jm,k)*tanh_psi(i,jm,k))*normy(i,jm,k))*0.5*dxi + &
+                                                  (0.25d0*(1.d0-tanh_psi(i,j,kp)*tanh_psi(i,j,kp))*normz(i,j,kp) - &
+                                                   0.25d0*(1.d0-tanh_psi(i,j,km)*tanh_psi(i,j,km))*normz(i,j,km))*0.5*dxi)
             enddo
-        enddo
-    enddo
-    !$acc end kernels
+         enddo
+      enddo
+      !$acc end kernels
 
-   ! 4.2 Get phi at n+1 
-   !$acc kernels
-   do k=1+halo_ext, piX%shape(3)-halo_ext
-      do j=1+halo_ext, piX%shape(2)-halo_ext
+      ! 4.2 Get phi at n+1 using AB2
+      !$acc kernels
+      do k=1+halo_ext, piX%shape(3)-halo_ext
+         do j=1+halo_ext, piX%shape(2)-halo_ext
             do i=1,nx
-                phi(i,j,k) = phi(i,j,k) + dt*rhsphi(i,j,k)
+               phi(i,j,k) = phi(i,j,k) + dt*(alpha*rhsphi(i,j,k)-beta*rhsphi_o(i,j,k))
+               rhsphi_o(i,j,k)=rhsphi(i,j,k)
             enddo
-        enddo
-    enddo
-   !$acc end kernels
+         enddo
+      enddo
+      !$acc end kernels
 
-   ! 4.3 Call halo exchnages along Y and Z for phi 
-   !$acc host_data use_device(phi)
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
-   !$acc end host_data 
+      ! 4.3 Call halo exchnages along Y and Z for phi 
+      !$acc host_data use_device(phi)
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, phi, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+      !$acc end host_data 
    #endif
-   !########################################################################################################################################
-   ! END STEP 4: PHASE-FIELD SOLVER (EXPLICIT)
-   !########################################################################################################################################
-   call nvtxEndRange
-
-
 
+   ! (uncomment for profiling)
+   ! call nvtxEndRange
 
+   !########################################################################################################################################
+   ! END STEP 4: PHASE-FIELD SOLVER 
+   !########################################################################################################################################
 
 
-   call nvtxStartRange("Projection")
    !########################################################################################################################################
    ! START STEP 5: USTAR COMPUTATION (PROJECTION STEP)
    !########################################################################################################################################
@@ -574,6 +540,9 @@ program main
    ! 5.2 obtain ustar and store old rhs in rhs_o
    ! 5.3 Call halo exchnages along Y and Z for u,v,w
 
+   ! (uncomment for profiling)
+   ! call nvtxStartRange("Projection")
+
    ! Projection step, convective terms
    ! 5.1a Convective terms NS
    ! Loop on inner nodes
@@ -627,102 +596,103 @@ program main
             rhsu(i,j,k)=rhsu(i,j,k)+(h11+h12+h13)*rhoi
             rhsv(i,j,k)=rhsv(i,j,k)+(h21+h22+h23)*rhoi
             rhsw(i,j,k)=rhsw(i,j,k)+(h31+h32+h33)*rhoi
+
             ! NS forcing
             kg = piX%lo(3) + k - 1 
-            jg = piX%lo(2) + j - 1 
+            jg = piX%lo(2) + j - 1
+
             ! ABC forcing
-            ! rhsu(i,j,k)= rhsu(i,j,k) + f3*sin(k0*x(kg))+f2*cos(k0*x(jg))
             rhsu(i,j,k)= rhsu(i,j,k) + f3*mysin(kg)+f2*mycos(jg)
-            ! rhsv(i,j,k)= rhsv(i,j,k) + f1*sin(k0*x(i))+f3*cos(k0*x(kg))
             rhsv(i,j,k)= rhsv(i,j,k) + f1*mysin(i)+f3*mycos(kg)
-            ! rhsw(i,j,k)= rhsw(i,j,k) + f2*sin(k0*x(jg))+f1*cos(k0*x(i))
             rhsw(i,j,k)= rhsw(i,j,k) + f2*mysin(jg)+f1*mycos(i)
+
             ! TG Forcing
             !rhsu(i,j,k)= rhsu(i,j,k) + f1*sin(k0*x(i))*cos(k0*x(j))*cos(k0*x(k))
             !rhsv(i,j,k)= rhsv(i,j,k) - f1*cos(k0*x(i))*sin(k0*x(j))*sin(k0*x(k))
+
          enddo
       enddo
    enddo
 
    ! Surface tension forces
    #if phiflag == 1
-   !$acc kernels
-   !Obtain surface tension forces evaluated at the center of the cell (same as where phi is located)
-   do k=1+halo_ext, piX%shape(3)-halo_ext
-      do j=1+halo_ext, piX%shape(2)-halo_ext
-         do i=1,nx
-            ip=i+1
-            jp=j+1
-            kp=k+1
-            im=i-1
-            jm=j-1
-            km=k-1
-            if (ip .gt. nx) ip=1
-            if (im .lt. 1) im=nx
-            chempot=phi(i,j,k)*(1.d0-phi(i,j,k))*(1.d0-2.d0*phi(i,j,k))*epsi-eps*(phi(ip,j,k)+phi(im,j,k)+phi(i,jp,k)+phi(i,jm,k)+phi(i,j,kp)+phi(i,j,km)- 6.d0*phi(i,j,k))*ddxi
-            ! chempot*gradphi
-            fxst(i,j,k)=6.d0*sigma*chempot*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi
-            fyst(i,j,k)=6.d0*sigma*chempot*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dxi
-            fzst(i,j,k)=6.d0*sigma*chempot*0.5d0*(phi(i,j,kp)-phi(i,j,km))*dxi
+      !$acc kernels
+      !Obtain surface tension forces evaluated at the center of the cell (same as where phi is located)
+      do k=1+halo_ext, piX%shape(3)-halo_ext
+         do j=1+halo_ext, piX%shape(2)-halo_ext
+            do i=1,nx
+               ip=i+1
+               jp=j+1
+               kp=k+1
+               im=i-1
+               jm=j-1
+               km=k-1
+               if (ip .gt. nx) ip=1
+               if (im .lt. 1) im=nx
+               chempot=phi(i,j,k)*(1.d0-phi(i,j,k))*(1.d0-2.d0*phi(i,j,k))*epsi-eps*(phi(ip,j,k)+phi(im,j,k)+phi(i,jp,k)+phi(i,jm,k)+phi(i,j,kp)+phi(i,j,km)- 6.d0*phi(i,j,k))*ddxi
+               ! chempot*gradphi
+               fxst(i,j,k)=6.d0*sigma*chempot*0.5d0*(phi(ip,j,k)-phi(im,j,k))*dxi
+               fyst(i,j,k)=6.d0*sigma*chempot*0.5d0*(phi(i,jp,k)-phi(i,jm,k))*dxi
+               fzst(i,j,k)=6.d0*sigma*chempot*0.5d0*(phi(i,j,kp)-phi(i,j,km))*dxi
+            enddo
          enddo
       enddo
-   enddo
-   !$acc end kernels
+      !$acc end kernels
 
-   ! Update halo of fxst, fyst and fzst (required then to interpolate at velocity points)
-   !$acc host_data use_device(fxst)
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
-   !$acc end host_data 
-   !$acc host_data use_device(fyst)
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
-   !$acc end host_data 
-   !$acc host_data use_device(fzst)
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
-   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
-   !$acc end host_data 
-   
-   ! Interpolate force at velocity points
-   !$acc kernels
-   do k=1+halo_ext, piX%shape(3)-halo_ext
-      do j=1+halo_ext, piX%shape(2)-halo_ext
-         do i=1,nx
-            im=i-1
-            jm=j-1
-            km=k-1
-            if (im .lt. 1) im=nx
-            rhsu(i,j,k)=rhsu(i,j,k) + 0.5d0*(fxst(im,j,k)+fxst(i,j,k))*rhoi
-            rhsv(i,j,k)=rhsv(i,j,k) + 0.5d0*(fyst(i,jm,k)+fyst(i,j,k))*rhoi
-            rhsw(i,j,k)=rhsw(i,j,k) + 0.5d0*(fzst(i,j,km)+fzst(i,j,k))*rhoi
-            u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k))
-            v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k))
-            w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k))
-            rhsu_o(i,j,k)=rhsu(i,j,k)
-            rhsv_o(i,j,k)=rhsv(i,j,k)
-            rhsw_o(i,j,k)=rhsw(i,j,k)
-          enddo
+      ! Update halo of fxst, fyst and fzst (required then to interpolate at velocity points)
+      !$acc host_data use_device(fxst)
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fxst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+      !$acc end host_data 
+      !$acc host_data use_device(fyst)
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fyst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+      !$acc end host_data 
+      !$acc host_data use_device(fzst)
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+      CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, fzst, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+      !$acc end host_data 
+      
+      ! Interpolate force at velocity points
+      !$acc kernels
+      do k=1+halo_ext, piX%shape(3)-halo_ext
+         do j=1+halo_ext, piX%shape(2)-halo_ext
+            do i=1,nx
+               im=i-1
+               jm=j-1
+               km=k-1
+               if (im .lt. 1) im=nx
+               rhsu(i,j,k)=rhsu(i,j,k) + 0.5d0*(fxst(im,j,k)+fxst(i,j,k))*rhoi
+               rhsv(i,j,k)=rhsv(i,j,k) + 0.5d0*(fyst(i,jm,k)+fyst(i,j,k))*rhoi
+               rhsw(i,j,k)=rhsw(i,j,k) + 0.5d0*(fzst(i,j,km)+fzst(i,j,k))*rhoi
+               u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k))
+               v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k))
+               w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k))
+               rhsu_o(i,j,k)=rhsu(i,j,k)
+               rhsv_o(i,j,k)=rhsv(i,j,k)
+               rhsw_o(i,j,k)=rhsw(i,j,k)
+            enddo
+         enddo
       enddo
-   enddo
-   !$acc end kernels
+      !$acc end kernels
 
    #else
 
-   ! 5.2 find u, v and w star (explicit Eulero), only in the inner nodes 
-   !$acc kernels
-   do k=1+halo_ext, piX%shape(3)-halo_ext
-      do j=1+halo_ext, piX%shape(2)-halo_ext
-          do i=1,nx
-              u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k))
-              v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k))
-              w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k))
-              rhsu_o(i,j,k)=rhsu(i,j,k)
-              rhsv_o(i,j,k)=rhsv(i,j,k)
-              rhsw_o(i,j,k)=rhsw(i,j,k)
-          enddo
+      ! 5.2 find u, v and w star (explicit Eulero), only in the inner nodes 
+      !$acc kernels
+      do k=1+halo_ext, piX%shape(3)-halo_ext
+         do j=1+halo_ext, piX%shape(2)-halo_ext
+            do i=1,nx
+               u(i,j,k) = u(i,j,k) + dt*(alpha*rhsu(i,j,k)-beta*rhsu_o(i,j,k))
+               v(i,j,k) = v(i,j,k) + dt*(alpha*rhsv(i,j,k)-beta*rhsv_o(i,j,k))
+               w(i,j,k) = w(i,j,k) + dt*(alpha*rhsw(i,j,k)-beta*rhsw_o(i,j,k))
+               rhsu_o(i,j,k)=rhsu(i,j,k)
+               rhsv_o(i,j,k)=rhsv(i,j,k)
+               rhsw_o(i,j,k)=rhsw(i,j,k)
+            enddo
+         enddo
       enddo
-   enddo
-   !$acc end kernels
+      !$acc end kernels
 
    #endif
 
@@ -731,11 +701,6 @@ program main
    ! After first step move to AB2 
    alpha=1.5d0
    beta= 0.5d0
-   ! !$acc kernels
-   ! rhsu_o=rhsu
-   ! rhsv_o=rhsv
-   ! rhsw_o=rhsw
-   ! !$acc end kernels
 
    ! 5.3 update halos (y and z directions), required to then compute the RHS of Poisson equation because of staggered grid
    !$acc host_data use_device(u)
@@ -750,43 +715,47 @@ program main
    CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
    CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, w, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
    !$acc end host_data 
+
+   ! (uncomment for profiling)
+   ! call nvtxEndRange
+
    !########################################################################################################################################
    ! END STEP 5: USTAR COMPUTATION 
    !########################################################################################################################################
-   call nvtxEndRange
-
 
 
-
-
-
-
-   call nvtxStartRange("Poisson")
+   
    !########################################################################################################################################
    ! START STEP 6: POISSON SOLVER FOR PRESSURE
    !########################################################################################################################################
    ! initialize rhs and analytical solution
    ! 6.1 Compute rhs of Poisson equation div*ustar: divergence at the cell center 
    ! I've done the halo updates so to compute the divergence at the pencil border i have the *star from the halo
-   call nvtxStartRange("compute RHS")
 
-     !$acc kernels
-     do k=1+halo_ext, piX%shape(3)-halo_ext
-        do j=1+halo_ext, piX%shape(2)-halo_ext
-            do i=1,nx
-                ip=i+1
-                jp=j+1
-                kp=k+1
-                if (ip > nx) ip=1
-                rhsp(i,j,k) =               (rho*dxi/dt)*(u(ip,j,k)-u(i,j,k))
-                rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(v(i,jp,k)-v(i,j,k))
-                rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(w(i,j,kp)-w(i,j,k))
-            enddo
-        enddo
-     enddo
-     !$acc end kernels
+   ! (uncomment for profiling)
+   ! call nvtxStartRange("Poisson")
 
-   call nvtxEndRange
+   ! (uncomment for profiling)
+   ! call nvtxStartRange("compute RHS")
+
+   !$acc kernels
+   do k=1+halo_ext, piX%shape(3)-halo_ext
+      do j=1+halo_ext, piX%shape(2)-halo_ext
+         do i=1,nx
+               ip=i+1
+               jp=j+1
+               kp=k+1
+               if (ip > nx) ip=1
+               rhsp(i,j,k) =               (rho*dxi/dt)*(u(ip,j,k)-u(i,j,k))
+               rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(v(i,jp,k)-v(i,j,k))
+               rhsp(i,j,k) = rhsp(i,j,k) + (rho*dxi/dt)*(w(i,j,kp)-w(i,j,k))
+         enddo
+      enddo
+   enddo
+   !$acc end kernels
+
+   ! (uncomment for profiling)
+   ! call nvtxEndRange
 
    ! ! !beginDEBUG
    ! !$acc kernels
@@ -807,127 +776,106 @@ program main
    ! enddo
    ! !$acc end kernels
 
-
    ! !endDEBUG
 
+   ! (uncomment for profiling)
+   ! call nvtxStartRange("FFT forward w/ transpositions")
 
-   call nvtxStartRange("FFT forward w/ transpositions")
-
-     !$acc host_data use_device(rhsp)
-     status = cufftExecD2Z(planXf, rhsp, psi_d)
-     if (status /= CUFFT_SUCCESS) write(*,*) 'X forward error: ', status
-     !$acc end host_data
-  
-     !!Nyquist to be set to zero?
-     ! block
-     !    complex(8), device, pointer :: psi_dev(:,:,:)
-     !    call c_f_pointer( c_devloc(psi_d), psi_dev,                       &
-     !                       [piX_d2z%shape(1), piX_d2z%shape(2), piX_d2z%shape(3)] )
-     !    !$acc kernels
-     !    do k = 1, piX_d2z%shape(3)
-     !       do j = 1, piX_d2z%shape(2)
-     !          psi_dev(nx/2+1, j, k) = (0.0_8, 0.0_8)
-     !       end do
-     !    end do
-     !    !$acc end kernels
-     ! end block
-  
-  
-     ! psi(kx,y,z) -> psi(y,z,kx)
-     CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,piX_d2z%halo_extents, [0,0,0]))
-     ! psi(y,z,kx) -> psi(ky,z,kx)
-     status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_FORWARD)
-     if (status /= CUFFT_SUCCESS) write(*,*) 'Y forward error: ', status
-     ! psi(ky,z,kx) -> psi(z,kx,ky)
-     CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX)) 
-     ! psi(z,kx,ky) -> psi(kz,kx,ky)
-     status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_FORWARD)
-     if (status /= CUFFT_SUCCESS) write(*,*) 'Z forward error: ', status
-     ! END of FFT3D forward
-
-   call nvtxEndRange
-
-   !block
-    np(piZ_d2z%order(1)) = piZ_d2z%shape(1)
-    np(piZ_d2z%order(2)) = piZ_d2z%shape(2)
-    np(piZ_d2z%order(3)) = piZ_d2z%shape(3)
-    call c_f_pointer(c_devloc(psi_d), phi3d, piZ_d2z%shape)
-
-
-    ! divide by -K**2, and normalize
-
-    offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1
-    offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1
-    offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1
-
-    xoff = offsets(1)
-    yoff = offsets(2)
-    npx = np(1)
-    npy = np(2)
-
-    call nvtxStartRange("Solution")
+   !$acc host_data use_device(rhsp)
+   status = cufftExecD2Z(planXf, rhsp, psi_d)
+   if (status /= CUFFT_SUCCESS) write(*,*) 'X forward error: ', status
+   !$acc end host_data
 
-      !$acc kernels
-      do jl = 1, npy
-        jg = yoff + jl
-         do il = 1, npx
-            ig = xoff + il
-            do k = 1, nz
-                k2 = kx_d(ig)**2 + kx_d(jg)**2 + kx_d(k)**2    
-                phi3d(k,il,jl) = -phi3d(k,il,jl)/k2/(int(nx,8)*int(ny,8)*int(nz,8))
-            enddo
+   ! psi(kx,y,z) -> psi(y,z,kx)
+   CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,piX_d2z%halo_extents, [0,0,0]))
+   ! psi(y,z,kx) -> psi(ky,z,kx)
+   status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_FORWARD)
+   if (status /= CUFFT_SUCCESS) write(*,*) 'Y forward error: ', status
+   ! psi(ky,z,kx) -> psi(z,kx,ky)
+   CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX)) 
+   ! psi(z,kx,ky) -> psi(kz,kx,ky)
+   status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_FORWARD)
+   if (status /= CUFFT_SUCCESS) write(*,*) 'Z forward error: ', status
+   ! END of FFT3D forward
+
+   ! (uncomment for profiling)
+   ! call nvtxEndRange
+
+   np(piZ_d2z%order(1)) = piZ_d2z%shape(1)
+   np(piZ_d2z%order(2)) = piZ_d2z%shape(2)
+   np(piZ_d2z%order(3)) = piZ_d2z%shape(3)
+   call c_f_pointer(c_devloc(psi_d), phi3d, piZ_d2z%shape)
+
+   ! divide by -K**2, and normalize
+
+   offsets(piZ_d2z%order(1)) = piZ_d2z%lo(1) - 1
+   offsets(piZ_d2z%order(2)) = piZ_d2z%lo(2) - 1
+   offsets(piZ_d2z%order(3)) = piZ_d2z%lo(3) - 1
+
+   xoff = offsets(1)
+   yoff = offsets(2)
+   npx = np(1)
+   npy = np(2)
+
+   ! (uncomment for profiling)
+   ! call nvtxStartRange("Solution")
+
+   !$acc kernels
+   do jl = 1, npy
+      jg = yoff + jl
+      do il = 1, npx
+         ig = xoff + il
+         do k = 1, nz
+            k2 = kx_d(ig)**2 + kx_d(jg)**2 + kx_d(k)**2    
+            phi3d(k,il,jl) = -phi3d(k,il,jl)/k2/(int(nx,8)*int(ny,8)*int(nz,8))
          enddo
       enddo
-      ! specify mean (corrects division by zero wavenumber above)
-      if (xoff == 0 .and. yoff == 0) phi3d(1,1,1) = 0.0
-      !$acc end kernels
+   enddo
+   ! specify mean (corrects division by zero wavenumber above)
+   if (xoff == 0 .and. yoff == 0) phi3d(1,1,1) = 0.0
+   !$acc end kernels
 
-    call nvtxEndRange
+   ! (uncomment for profiling)
+   ! call nvtxEndRange
    
-   !end block
-
-   call nvtxStartRange("FFT backwards w/ transpositions")
-
-     ! psi(kz,kx,ky) -> psi(z,kx,ky)
-     status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_INVERSE)
-     if (status /= CUFFT_SUCCESS) write(*,*) 'Z inverse error: ', status
-     ! psi(z,kx,ky) -> psi(ky,z,kx)
-     CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX))
-     ! psi(ky,z,kx) -> psi(y,z,kx)
-     status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_INVERSE)
-     if (status /= CUFFT_SUCCESS) write(*,*) 'Y inverse error: ', status
-     ! psi(y,z,kx) -> psi(kx,y,z)
-     CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,[0,0,0], piX_d2z%halo_extents))
-     !$acc host_data use_device(p)
-      ! psi(kx,y,z) -> psi(x,y,z)
-      status = cufftExecZ2D(planXb, psi_d, p)
-      if (status /= CUFFT_SUCCESS) write(*,*) 'X inverse error: ', status
-     !$acc end host_data
+   ! (uncomment for profiling)
+   ! call nvtxStartRange("FFT backwards w/ transpositions")
+
+   ! psi(kz,kx,ky) -> psi(z,kx,ky)
+   status = cufftExecZ2Z(planZ, psi_d, psi_d, CUFFT_INVERSE)
+   if (status /= CUFFT_SUCCESS) write(*,*) 'Z inverse error: ', status
+   ! psi(z,kx,ky) -> psi(ky,z,kx)
+   CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX))
+   ! psi(ky,z,kx) -> psi(y,z,kx)
+   status = cufftExecZ2Z(planY, psi_d, psi_d, CUFFT_INVERSE)
+   if (status /= CUFFT_SUCCESS) write(*,*) 'Y inverse error: ', status
+   ! psi(y,z,kx) -> psi(kx,y,z)
+   CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_descD2Z, psi_d, psi_d, work_d_d2z, CUDECOMP_DOUBLE_COMPLEX,[0,0,0], piX_d2z%halo_extents))
+   !$acc host_data use_device(p)
+   ! psi(kx,y,z) -> psi(x,y,z)
+   status = cufftExecZ2D(planXb, psi_d, p)
+   if (status /= CUFFT_SUCCESS) write(*,*) 'X inverse error: ', status
+   !$acc end host_data
       
-   call nvtxEndRange
+   ! (uncomment for profiling)
+   ! call nvtxEndRange
 
    !$acc host_data use_device(p)
-    ! update halo nodes with pressure (needed for the pressure correction step), using device variable no need to use host-data
-    ! Update X-pencil halos in Y and Z direction
-    CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
-    CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
+   ! update halo nodes with pressure (needed for the pressure correction step), using device variable no need to use host-data
+   ! Update X-pencil halos in Y and Z direction
+   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 2))
+   CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, p, work_halo_d, CUDECOMP_DOUBLE, piX%halo_extents, halo_periods, 3))
    !$acc end host_data 
 
-   
-
+   ! (uncomment for profiling)
+   ! call nvtxEndRange
 
    !########################################################################################################################################
    ! END STEP 7: POISSON SOLVER FOR PRESSURE
    !########################################################################################################################################
-   call nvtxEndRange
 
 
 
-
-
-
-
-   call nvtxStartRange("Correction")
    !########################################################################################################################################
    ! START STEP 8: VELOCITY CORRECTION
    ! ########################################################################################################################################
@@ -936,6 +884,10 @@ program main
    ! 8.3 Call halo exchnages along Y and Z for u,v,w
    ! Correct velocity, pressure has also the halo
    ! Correction + removal of mean velocity altogether for performance
+
+   ! (uncomment for profiling)
+   ! call nvtxStartRange("Correction")
+
    !$acc kernels 
    umean=0.d0
    vmean=0.d0
@@ -943,16 +895,16 @@ program main
    do k=1+halo_ext, piX%shape(3)-halo_ext
       do j=1+halo_ext, piX%shape(2)-halo_ext
          do i = 1, piX%shape(1) ! equal to nx (no halo on x)
-              im=i-1
-              jm=j-1
-              km=k-1
-              if (im < 1) im=nx
-              u(i,j,k)=u(i,j,k) - dt/rho*(p(i,j,k)-p(im,j,k))*dxi
-              v(i,j,k)=v(i,j,k) - dt/rho*(p(i,j,k)-p(i,jm,k))*dxi
-              w(i,j,k)=w(i,j,k) - dt/rho*(p(i,j,k)-p(i,j,km))*dxi
-              umean=umean + u(i,j,k)
-              vmean=vmean + v(i,j,k)
-              wmean=wmean + w(i,j,k)
+            im=i-1
+            jm=j-1
+            km=k-1
+            if (im < 1) im=nx
+            u(i,j,k)=u(i,j,k) - dt/rho*(p(i,j,k)-p(im,j,k))*dxi
+            v(i,j,k)=v(i,j,k) - dt/rho*(p(i,j,k)-p(i,jm,k))*dxi
+            w(i,j,k)=w(i,j,k) - dt/rho*(p(i,j,k)-p(i,j,km))*dxi
+            umean=umean + u(i,j,k)
+            vmean=vmean + v(i,j,k)
+            wmean=wmean + w(i,j,k)
           enddo
       enddo
    enddo
@@ -960,7 +912,6 @@ program main
 
    ! Remove mean velocity (get local mean of the rank)
 
-
    ! Divide by total number of points in the pencil
    umean=umean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext)
    vmean=vmean/nx/(piX%shape(2)-2*halo_ext)/(piX%shape(3)-2*halo_ext)
@@ -1004,18 +955,15 @@ program main
       if (cou .gt. 7) stop
    endif
 
-   ! beginDZ
-   ! CHECK - To be removed in actual computation
-
-   ! endDZ
+   call cpu_time(timef)
+   if (rank.eq.0) print '(" Time elapsed = ",f8.1," ms")',1000*(timef-times)
 
+   ! (uncomment for profiling)
+   ! call nvtxEndRange
 
-   call cpu_time(timef)
-   if (rank.eq.0) print '(" Time elapsed = ",f6.1," ms")',1000*(timef-times)
    !########################################################################################################################################
    ! END STEP 8: VELOCITY CORRECTION  
    !########################################################################################################################################
-   call nvtxEndRange
 
 
    !########################################################################################################################################
@@ -1023,22 +971,23 @@ program main
    ! ########################################################################################################################################
    if (mod(t,dump) .eq. 0) then
       if (rank .eq. 0) write(*,*) "Saving output files"
-         ! write velocity and pressure fiels (1-4)
-         call writefield(t,1)
-         call writefield(t,2)
-         call writefield(t,3)
-         call writefield(t,4)
-         #if phiflag == 1
+      ! write velocity and pressure fiels (1-4)
+      call writefield(t,1)
+      call writefield(t,2)
+      call writefield(t,3)
+      call writefield(t,4)
+      #if phiflag == 1
          ! write phase-field (5)
          call writefield(t,5)
-         #endif
+      #endif
    endif
    !########################################################################################################################################
    ! END STEP 9: OUTPUT FIELDS N  
    !########################################################################################################################################
 
-call nvtxEndRange
-!call nvtxEndRange
+! (uncomment for profiling)
+! call nvtxEndRange
+
 enddo
 call cpu_time(t_end)
 elapsed = t_end-t_start
@@ -1052,7 +1001,7 @@ program main
 deallocate(tanh_psi, mysin, mycos)
 deallocate(rhsu,rhsv,rhsw)
 deallocate(rhsu_o,rhsv_o,rhsw_o)
-deallocate(phi,rhsphi,normx,normy,normz)
+deallocate(phi,rhsphi,rhsphi_o,normx,normy,normz)
 
 call mpi_finalize(ierr)
 
diff --git a/multi/module.f90 b/multi/module.f90
index 7222f29..32d83c3 100755
--- a/multi/module.f90
+++ b/multi/module.f90
@@ -62,75 +62,5 @@ module phase
 end module phase
 
 
-! added NVTX for profiing from maxcuda/NVTX_example
-module nvtx
-use iso_c_binding
-implicit none
-integer,private :: col(7) = [ int(Z'0000ff00'), int(Z'000000ff'), int(Z'00ffff00'), int(Z'00ff00ff'), int(Z'0000ffff'), int(Z'00ff0000'), int(Z'00ffffff')]
-character,private,target :: tempName(256)
 
-type, bind(C):: nvtxEventAttributes
-  integer(C_INT16_T):: version=1
-  integer(C_INT16_T):: size=48 !
-  integer(C_INT):: category=0
-  integer(C_INT):: colorType=1 ! NVTX_COLOR_ARGB = 1
-  integer(C_INT):: color
-  integer(C_INT):: payloadType=0 ! NVTX_PAYLOAD_UNKNOWN = 0
-  integer(C_INT):: reserved0
-  integer(C_INT64_T):: payload   ! union uint,int,double
-  integer(C_INT):: messageType=1  ! NVTX_MESSAGE_TYPE_ASCII     = 1 
-  type(C_PTR):: message  ! ascii char
-end type
-
-interface nvtxRangePush
-  ! push range with custom label and standard color
-  subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA')
-  use iso_c_binding
-  character(kind=C_CHAR) :: name(256)
-  end subroutine
-
-  ! push range with custom label and custom color
-  subroutine nvtxRangePushEx(event) bind(C, name='nvtxRangePushEx')
-  use iso_c_binding
-  import:: nvtxEventAttributes
-  type(nvtxEventAttributes):: event
-  end subroutine
-end interface
-
-interface nvtxRangePop
-  subroutine nvtxRangePop() bind(C, name='nvtxRangePop')
-  end subroutine
-end interface
-
-contains
-
-subroutine nvtxStartRange(name,id)
-  character(kind=c_char,len=*) :: name
-  integer, optional:: id
-  type(nvtxEventAttributes):: event
-  character(kind=c_char,len=256) :: trimmed_name
-  integer:: i
-
-  trimmed_name=trim(name)//c_null_char
-
-  ! move scalar trimmed_name into character array tempName
-  do i=1,LEN(trim(name)) + 1
-     tempName(i) = trimmed_name(i:i)
-  enddo
-
-
-  if ( .not. present(id)) then
-    call nvtxRangePush(tempName)
-  else
-    event%color=col(mod(id,7)+1)
-    event%message=c_loc(tempName)
-    call nvtxRangePushEx(event)
-  end if
-end subroutine
-
-subroutine nvtxEndRange
-  call nvtxRangePop
-end subroutine
-
-end module nvtx
 
diff --git a/multi/readinput.f90 b/multi/readinput.f90
index b7262e9..0a25a33 100644
--- a/multi/readinput.f90
+++ b/multi/readinput.f90
@@ -53,6 +53,7 @@ subroutine readinput
     write(*,*) "██  ██  ██ ██   ██ ██    ██         ██ ██    ██" 
     write(*,*) "██      ██ ██   ██ ██    ██    ██████   ██████"         
     write(*,*) "----------------------------------------------"
+    write(*,*) 'Grid:', nx, 'x', nx, 'x', nx
     write(*,*) "Restart ", restart
     write(*,*) "Tstart  ", tstart
     write(*,*) "Tfin    ", tfin
diff --git a/multi/testpush.sh b/multi/testpush.sh
deleted file mode 100644
index 2e0e21a..0000000
--- a/multi/testpush.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-make clean
-make
diff --git a/val/.DS_Store b/val/.DS_Store
new file mode 100644
index 0000000..fb0b244
Binary files /dev/null and b/val/.DS_Store differ
diff --git a/vtkwrite.m b/vtkwrite.m
deleted file mode 100644
index 91be51a..0000000
--- a/vtkwrite.m
+++ /dev/null
@@ -1,279 +0,0 @@
-function vtkwrite( filename,dataType,varargin )
-% VTKWRITE Writes 3D Matlab array into VTK file format.
-%  vtkwrite(filename,'structured_grid',x,y,z,'vectors',title,u,v,w) writes
-%  a structured 3D vector data into VTK file, with name specified by the string
-%  filename. (u,v,w) are the vector components at the points (x,y,z). x,y,z
-%  should be 3-D matrices like those generated by meshgrid, where
-%  point(ijk) is specified by x(i,j,k), y(i,j,k) and z(i,j,k).
-%  The matrices x,y,z,u,v,w must all be the same size and contain
-%  corrresponding position and vector component. The string title specifies
-%  the name of the vector field to be saved. 
-%
-%  vtkwrite(filename,'structured_grid',x,y,z,'scalars',title,r) writes a 3D
-%  scalar data into VTK file whose name is specified by the string
-%  filename. r is the scalar value at the points (x,y,z). The matrices
-%  x,y,z,r must all be the same size and contain the corresponding position
-%  and scalar values. 
-%
-%  vtkwrite(filename,'structured_grid',x,y,z,'vectors',title,u,v,w,'scalars',
-%  title2,r) writes a 3D structured grid that contains both vector and scalar values.
-%  x,y,z,u,v,w,r must all be the same size and contain the corresponding
-%  positon, vector and scalar values.
-%
-%  vtkwrite(filename, 'structured_points', title, m) saves matrix m (could
-%  be 1D, 2D or 3D array) into vtk as structured points.
-%
-%  vtkwrite(filename, 'structured_points', title, m, 'spacing', sx, sy, sz)
-%  allows user to specify spacing. (default: 1, 1, 1). This is the aspect
-%  ratio of a single voxel. 
-%
-%  vtkwrite(filename, 'structured_points', title, m, 'origin', ox, oy, oz)
-%  allows user to speicify origin of dataset. (default: 0, 0, 0).
-%
-%  vtkwrite(filename,'unstructured_grid',x,y,z,'vectors',title,u,v,w,'scalars',
-%  title2,r) writes a 3D unstructured grid that contains both vector and scalar values.
-%  x,y,z,u,v,w,r must all be the same size and contain the corresponding
-%  positon, vector and scalar values.
-%  
-%  vtkwrite(filename, 'polydata', 'lines', x, y, z) exports a 3D line where
-%  x,y,z are coordinates of the points that make the line. x, y, z are
-%  vectors containing the coordinates of points of the line, where point(n)
-%  is specified by x(n), y(n) and z(n).
-%
-%  vtkwrite(filename,'polydata','lines',x,y,z,'Precision',n) allows you to
-%  specify precision of the exported number up to n digits after decimal
-%  point. Default precision is 3 digits. 
-%
-%  vtkwrite(filename,'polydata','triangle',x,y,z,tri) exports a list of
-%  triangles where x,y,z are the coordinates of the points and tri is an
-%  m*3 matrix whose rows denote the points of the individual triangles.
-%
-%  vtkwrite(filename,'polydata','tetrahedron',x,y,z,tetra) exports a list
-%  of tetrahedrons where x,y,z are the coordinates of the points
-%  and tetra is an m*4 matrix whose rows denote the points of individual
-%  tetrahedrons. 
-%  
-%  vtkwrite('execute','polydata','lines',x,y,z) will save data with default
-%  filename ''matlab_export.vtk' and automatically loads data into
-%  ParaView. 
-%  
-%  Version 2.3
-%  Copyright, Chaoyuan Yeh, 2016
-%  Codes are modified from William Thielicke and David Gingras's submission.    
-
-if strcmpi(filename,'execute'), filename = 'matlab_export.vtk'; end
-fid = fopen(filename, 'w'); 
-% VTK files contain five major parts
-% 1. VTK DataFile Version
-fprintf(fid, '# vtk DataFile Version 2.0\n');
-% 2. Title
-fprintf(fid, 'VTK from Matlab\n');
-
-
-binaryflag = any(strcmpi(varargin, 'BINARY'));
-if any(strcmpi(varargin, 'PRECISION'))
-    precision = num2str(varargin{find(strcmpi(vin, 'PRECISION'))+1});
-else
-    precision = '2';
-end
-
-switch upper(dataType)
-    case 'STRUCTURED_POINTS'
-        title = varargin{1};
-        m = varargin{2};
-        if any(strcmpi(varargin, 'spacing'))
-            sx = varargin{find(strcmpi(varargin, 'spacing'))+1};
-            sy = varargin{find(strcmpi(varargin, 'spacing'))+2};
-            sz = varargin{find(strcmpi(varargin, 'spacing'))+3};
-        else
-            sx = 1;
-            sy = 1;
-            sz = 1;
-        end
-        if any(strcmpi(varargin, 'origin'))
-            ox = varargin{find(strcmpi(varargin, 'origin'))+1};
-            oy = varargin{find(strcmpi(varargin, 'origin'))+2};
-            oz = varargin{find(strcmpi(varargin, 'origin'))+3};
-        else
-            ox = 0;
-            oy = 0;
-            oz = 0;
-        end
-        [nx, ny, nz] = size(m);
-        setdataformat(fid, binaryflag);
-        
-        fprintf(fid, 'DATASET STRUCTURED_POINTS\n');
-        fprintf(fid, 'DIMENSIONS %d %d %d\n', nx, ny, nz);
-        fprintf(fid, ['SPACING ', num2str(sx), ' ', num2str(sy), ' ',...
-            num2str(sz), '\n']);
-        fprintf(fid, ['ORIGIN ', num2str(ox), ' ', num2str(oy), ' ',...
-            num2str(oz), '\n']); 
-        fprintf(fid, 'POINT_DATA %d\n', nx*ny*nz);
-        fprintf(fid, ['SCALARS ', title, ' float 1\n']);
-        fprintf(fid,'LOOKUP_TABLE default\n');
-        if ~binaryflag 
-            spec = ['%0.', precision, 'f '];
-            fprintf(fid, spec, m(:)');
-        else
-            fwrite(fid, m(:)', 'float', 'b');
-        end
-        
-    case {'STRUCTURED_GRID','UNSTRUCTURED_GRID'}
-        % 3. The format data proper is saved in (ASCII or Binary). Use
-        % fprintf to write data in the case of ASCII and fwrite for binary.
-        if numel(varargin)<6, error('Not enough input arguments'); end
-        setdataformat(fid, binaryflag);
-%         fprintf(fid, 'BINARY\n');
-        x = varargin{1};
-        y = varargin{2};
-        z = varargin{3};
-        if sum(size(x)==size(y) & size(y)==size(z))~=length(size(x))
-            error('Input dimesions do not match')
-        end
-        n_elements = numel(x);
-        % 4. Type of Dataset ( can be STRUCTURED_POINTS, STRUCTURED_GRID,
-        % UNSTRUCTURED_GRID, POLYDATA, RECTILINEAR_GRID or FIELD )
-        % This part, dataset structure, begins with a line containing the
-        % keyword 'DATASET' followed by a keyword describing the type of dataset.
-        % Then the geomettry part describes geometry and topology of the dataset.
-        if strcmpi(dataType,'STRUCTURED_GRID')
-            fprintf(fid, 'DATASET STRUCTURED_GRID\n');
-            fprintf(fid, 'DIMENSIONS %d %d %d\n', size(x,1), size(x,2), size(x,3));
-        else
-            fprintf(fid, 'DATASET UNSTRUCTURED_GRID\n');
-        end
-        fprintf(fid, ['POINTS ' num2str(n_elements) ' float\n']);
-        output = [x(:)'; y(:)'; z(:)'];
-        
-        if ~binaryflag
-            spec = ['%0.', precision, 'f '];
-            fprintf(fid, spec, output);
-        else
-            fwrite(fid, output, 'float', 'b');
-        end
-        % 5.This final part describe the dataset attributes and begins with the
-        % keywords 'POINT_DATA' or 'CELL_DATA', followed by an integer number
-        % specifying the number of points of cells. Other keyword/data combination
-        % then define the actual dataset attribute values.
-        fprintf(fid, ['\nPOINT_DATA ' num2str(n_elements)]);
-        % Parse remaining argument.
-        vidx = find(strcmpi(varargin,'VECTORS'));
-        sidx = find(strcmpi(varargin,'SCALARS'));
-        if vidx~=0
-            for ii = 1:length(vidx)
-                title = varargin{vidx(ii)+1};
-                % Data enteries begin with a keyword specifying data type
-                % and numeric format.
-                fprintf(fid, ['\nVECTORS ', title,' float\n']);
-                output = [varargin{ vidx(ii) + 2 }(:)';...
-                          varargin{ vidx(ii) + 3 }(:)';...
-                          varargin{ vidx(ii) + 4 }(:)'];
-
-                if ~binaryflag
-                    spec = ['%0.', precision, 'f '];
-                    fprintf(fid, spec, output);
-                else
-                    fwrite(fid, output, 'float', 'b');
-                end
-%                 fwrite(fid, [reshape(varargin{vidx(ii)+2},1,n_elements);...
-%                 reshape(varargin{vidx(ii)+3},1,n_elements);...
-%                 reshape(varargin{vidx(ii)+4},1,n_elements)],'float','b');
-            end
-        end
-        if sidx~=0
-            for ii = 1:length(sidx)
-                title = varargin{sidx(ii)+1};
-                fprintf(fid, ['\nSCALARS ', title,' float\n']);
-                fprintf(fid, 'LOOKUP_TABLE default\n');
-                if ~binaryflag
-                    spec = ['%0.', precision, 'f '];
-                    fprintf(fid, spec, varargin{ sidx(ii) + 2});
-                else
-                    fwrite(fid, varargin{ sidx(ii) + 2}, 'float', 'b');
-                end
-%                 fwrite(fid, reshape(varargin{sidx(ii)+2},1,n_elements),'float','b');
-            end
-        end
-        
-    case 'POLYDATA'
-
-        fprintf(fid, 'ASCII\n');
-        if numel(varargin)<4, error('Not enough input arguments'); end
-        x = varargin{2}(:);
-        y = varargin{3}(:);
-        z = varargin{4}(:);
-        if numel(varargin)<4, error('Not enough input arguments'); end
-        if sum(size(x)==size(y) & size(y)==size(z))~= length(size(x))
-            error('Input dimesions do not match')
-        end
-        n_elements = numel(x);
-        fprintf(fid, 'DATASET POLYDATA\n');
-        if mod(n_elements,3)==1
-            x(n_elements+1:n_elements+2,1)=[0;0];
-            y(n_elements+1:n_elements+2,1)=[0;0];
-            z(n_elements+1:n_elements+2,1)=[0;0];
-        elseif mod(n_elements,3)==2
-            x(n_elements+1,1)=0;
-            y(n_elements+1,1)=0;
-            z(n_elements+1,1)=0;
-        end
-        nbpoint = numel(x);
-        fprintf(fid, ['POINTS ' num2str(nbpoint) ' float\n']);
-        
-        spec = [repmat(['%0.', precision, 'f '], 1, 9), '\n'];
-        
-        output = [x(1:3:end-2), y(1:3:end-2), z(1:3:end-2),...
-                  x(2:3:end-1), y(2:3:end-1), z(2:3:end-1),...
-                  x(3:3:end), y(3:3:end), z(3:3:end)]';
-              
-        fprintf(fid, spec, output);
-        
-        switch upper(varargin{1})
-            case 'LINES'
-                if mod(n_elements,2)==0
-                    nbLine = 2*n_elements-2;
-                else
-                    nbLine = 2*(n_elements-1);
-                end
-                conn1 = zeros(nbLine,1);
-                conn2 = zeros(nbLine,1);
-                conn2(1:nbLine/2) = 1:nbLine/2;
-                conn1(1:nbLine/2) = conn2(1:nbLine/2)-1;
-                conn1(nbLine/2+1:end) = 1:nbLine/2;
-                conn2(nbLine/2+1:end) = conn1(nbLine/2+1:end)-1;
-                fprintf(fid,'\nLINES %d %d\n',nbLine,3*nbLine);
-                fprintf(fid,'2 %d %d\n',[conn1';conn2']);
-            case 'TRIANGLE'
-                ntri = length(varargin{5});
-                fprintf(fid,'\nPOLYGONS %d %d\n',ntri,4*ntri);
-                fprintf(fid,'3 %d %d %d\n',(varargin{5}-1)');
-            case 'TETRAHEDRON'
-                ntetra = length(varargin{5});
-                fprintf(fid,'\nPOLYGONS %d %d\n',ntetra,5*ntetra);
-                fprintf(fid,'4 %d %d %d %d\n',(varargin{5}-1)');
-        end     
-end
-fclose(fid);
-if strcmpi(filename,'matlab_export.vtk')
-    switch computer
-        case {'PCWIN','PCWIN64'}
-            !paraview.exe --data='matlab_export.vtk' &
-            % Exclamation point character is a shell escape, the rest of the
-            % input line will be sent to operating system. It can not take
-            % variables, though. The & at the end of line will return control to 
-            % Matlab even when the outside process is still running. 
-        case {'GLNXA64','MACI64'}
-            !paraview --data='matlab_export.vtk' &
-    end
-end
-end
-
-function setdataformat(fid, binaryflag)
-
-if ~binaryflag
-    fprintf(fid, 'ASCII\n');
-else
-    fprintf(fid, 'BINARY\n');
-end
-end
-    
\ No newline at end of file