From d1f40e81bab5799ff1b5d2875e121d162a3ad21d Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Fri, 21 Nov 2025 15:50:03 -0500 Subject: [PATCH 01/17] Implemented kspace_modify isend yes for non-blocking MPI_Isend communication --- src/KOKKOS/fft3d_kokkos.cpp | 15 +++++----- src/KOKKOS/fft3d_kokkos.h | 4 +-- src/KOKKOS/pppm_kokkos.cpp | 7 +++-- src/KOKKOS/remap_kokkos.cpp | 56 +++++++++++++++++++++++++++++-------- src/KOKKOS/remap_kokkos.h | 7 +++-- src/kspace.cpp | 5 ++++ src/kspace.h | 1 + 7 files changed, 70 insertions(+), 25 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index c07f45cc228..99642e27f47 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -35,7 +35,7 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int scaled, int permute, int *nbuf, int usecollective, - int usegpu_aware) : + int useisend, int usegpu_aware) : Pointers(lmp) { int nthreads = lmp->kokkos->nthreads; @@ -81,7 +81,7 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, - scaled,permute,nbuf,usecollective,nthreads,usegpu_aware); + scaled,permute,nbuf,usecollective,useisend,nthreads,usegpu_aware); if (plan == nullptr) error->one(FLERR,"Could not create 3d FFT plan"); } @@ -400,6 +400,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in, 2 = permute twice = slow->fast, fast->mid, mid->slow nbuf returns size of internal storage buffers used by FFT usecollective use collective MPI operations for remapping data + useisend when using point-to-point MPI, use MPI_Isend usegpu_aware use GPU-Aware MPI or not ------------------------------------------------------------------------- */ @@ -411,7 +412,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int scaled, int permute, int *nbuf, int usecollective, - int nthreads, int usegpu_aware) + int useisend, int nthreads, int usegpu_aware) { struct fft_plan_3d_kokkos *plan; int me,nprocs; @@ -468,7 +469,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, first_ilo,first_ihi,first_jlo,first_jhi, first_klo,first_khi,2,0,0,FFT_PRECISION, - usecollective,usegpu_aware); + usecollective,useisend,usegpu_aware); if (plan->pre_plan == nullptr) return nullptr; } @@ -493,7 +494,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl first_klo,first_khi, second_ilo,second_ihi,second_jlo,second_jhi, second_klo,second_khi,2,1,0,FFT_PRECISION, - usecollective,usegpu_aware); + usecollective,useisend,usegpu_aware); if (plan->mid1_plan == nullptr) return nullptr; // 1d FFTs along mid axis @@ -534,7 +535,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl second_ilo,second_ihi, third_jlo,third_jhi,third_klo,third_khi, third_ilo,third_ihi,2,1,0,FFT_PRECISION, - usecollective,usegpu_aware); + usecollective,useisend,usegpu_aware); if (plan->mid2_plan == nullptr) return nullptr; // 1d FFTs along slow axis @@ -562,7 +563,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl third_jlo,third_jhi, out_klo,out_khi,out_ilo,out_ihi, out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION, - usecollective,usegpu_aware); + usecollective,useisend,usegpu_aware); if (plan->post_plan == nullptr) return nullptr; } diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index 6227a1e97b1..33d52e58130 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -97,7 +97,7 @@ class FFT3dKokkos : protected Pointers { FFT3dKokkos(class LAMMPS *, MPI_Comm, int,int,int,int,int,int,int,int,int,int,int,int,int,int,int, - int,int,int *,int,int); + int,int,int *,int,int,int); ~FFT3dKokkos() override; void compute(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, int); void timing1d(typename FFT_AT::t_FFT_SCALAR_1d, int, int); @@ -115,7 +115,7 @@ class FFT3dKokkos : protected Pointers { struct fft_plan_3d_kokkos *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, - int, int, int *, int, int, int); + int, int, int *, int, int, int, int); void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos *); diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index dae53c12d0b..6ed541738e1 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -833,22 +833,23 @@ void PPPMKokkos::allocate() // remap takes data from 3d brick to FFT decomposition int collective_flag = force->kspace->collective_flag; + int isend_flag = force->kspace->isend_flag; int gpu_aware_flag = lmp->kokkos->gpu_aware_flag; int tmp; fft1 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag,gpu_aware_flag); + 0,0,&tmp,collective_flag,isend_flag,gpu_aware_flag); fft2 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag,gpu_aware_flag); + 0,0,&tmp,collective_flag,isend_flag,gpu_aware_flag); remap = new RemapKokkos(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION,collective_flag,gpu_aware_flag); + 1,0,0,FFT_PRECISION,collective_flag,isend_flag,gpu_aware_flag); } /* ---------------------------------------------------------------------- diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index 573f4c2508b..f9ab0f7f475 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -35,13 +35,13 @@ RemapKokkos::RemapKokkos(LAMMPS *lmp, MPI_Comm comm, int out_klo, int out_khi, int nqty, int permute, int memory, int precision, int usecollective, - int usegpu_aware) : Pointers(lmp) + int useisend, int usegpu_aware) : Pointers(lmp) { plan = remap_3d_create_plan_kokkos(comm, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, nqty,permute,memory,precision,usecollective, - usegpu_aware); + useisend, usegpu_aware); if (plan == nullptr) error->one(FLERR,"Could not create 3d remap plan"); } @@ -142,14 +142,26 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d for (isend = 0; isend < plan->nsend; isend++) { int in_offset = plan->send_offset[isend]; - plan->pack(d_in,in_offset, - plan->d_sendbuf,0,&plan->packplan[isend]); + if (plan->useisend) { + plan->pack(d_in,in_offset, + plan->d_sendbuf,plan->send_bufloc[isend], + &plan->packplan[isend]); + } else { + plan->pack(d_in,in_offset, + plan->d_sendbuf,0, + &plan->packplan[isend]); + } if (!plan->usegpu_aware) Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf); - MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR, - plan->send_proc[isend],0,plan->comm); + if (plan->useisend) { + MPI_Isend(v_sendbuf + plan->send_bufloc[isend],plan->send_size[isend],MPI_FFT_SCALAR, + plan->send_proc[isend],0,plan->comm,&plan->isend_reqs[isend]); + } else { + MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR, + plan->send_proc[isend],0,plan->comm); + } } // copy in -> scratch -> out for self data @@ -183,6 +195,11 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d plan->unpack(d_scratch,scratch_offset, d_out,out_offset,&plan->unpackplan[irecv]); } + + if (plan->useisend) { + // finally, wait for all Isends to be done + MPI_Waitall(plan->nsend,plan->isend_reqs,MPI_STATUS_IGNORE); + } } else { if (plan->commringlen > 0) { int isend,irecv; @@ -266,6 +283,7 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d 1 = single precision (4 bytes per datum) 2 = double precision (8 bytes per datum) usecollective whether to use collective MPI or point-to-point + useisend when using point-to-point MPI, use non-blocking MPI_Isend usegpu_aware whether to use GPU-Aware MPI or not ------------------------------------------------------------------------- */ @@ -277,7 +295,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int nqty, int permute, int memory, int /*precision*/, - int usecollective, int usegpu_aware) + int usecollective, int useisend, int usegpu_aware) { struct remap_plan_3d_kokkos *plan; @@ -295,6 +313,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan = new struct remap_plan_3d_kokkos; if (plan == nullptr) return nullptr; plan->usecollective = usecollective; + plan->useisend = useisend; plan->usegpu_aware = usegpu_aware; // store parameters in local data structs @@ -359,6 +378,11 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan->packplan = (struct pack_plan_3d *) malloc(nsend*sizeof(struct pack_plan_3d)); + if (plan->useisend) + plan->isend_reqs = (MPI_Request *) malloc(nsend*sizeof(MPI_Request)); + plan->send_bufloc = (int *) malloc(nsend*sizeof(int)); + if (plan->send_bufloc == nullptr) return nullptr; + if (plan->send_offset == nullptr || plan->send_size == nullptr || plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr; } @@ -400,6 +424,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat nsend = 0; iproc = me; + ibuf = 0; for (i = 0; i < nprocs; i++) { iproc++; if (iproc == nprocs) iproc = 0; @@ -415,6 +440,8 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize; plan->packplan[nsend].nqty = nqty; plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + plan->send_bufloc[nsend] = ibuf; + ibuf += plan->send_size[nsend]; nsend++; } } @@ -497,8 +524,13 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat // find biggest send message (not including self) and malloc space for it size = 0; - for (nsend = 0; nsend < plan->nsend; nsend++) - size = MAX(size,plan->send_size[nsend]); + if (plan->useisend) { + for (nsend = 0; nsend < plan->nsend; nsend++) + size += plan->send_size[nsend]; + } else { + for (nsend = 0; nsend < plan->nsend; nsend++) + size = MAX(size,plan->send_size[nsend]); + } if (size) { plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size); @@ -520,7 +552,6 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat MPI_Comm_dup(comm,&plan->comm); } else { - // Improved approach - use an AllReduce to aggregate which ranks need to be included // To do this, we build the local proc's send/receive list, then do an AllReduce // to create the send/recv count for the Alltoallv @@ -845,7 +876,6 @@ void RemapKokkos::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_ free(plan->unpackplan); } } else { - // free arrays used in pt2pt communication if (plan->nsend || plan->self) { @@ -853,6 +883,10 @@ void RemapKokkos::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_ free(plan->send_size); free(plan->send_proc); free(plan->packplan); + if (plan->useisend) { + free(plan->isend_reqs); + free(plan->send_bufloc); + } } if (plan->nrecv || plan->self) { diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h index e3830eea7d4..a6a15bbf098 100644 --- a/src/KOKKOS/remap_kokkos.h +++ b/src/KOKKOS/remap_kokkos.h @@ -40,12 +40,14 @@ struct remap_plan_3d_kokkos { int *send_offset; // extraction loc for each send int *send_size; // size of each send message int *send_proc; // proc to send each message to + int *send_bufloc; // if useisend, offset in send buf for each isend struct pack_plan_3d *packplan; // pack plan for each send message int *recv_offset; // insertion loc for each recv int *recv_size; // size of each recv message int *recv_proc; // proc to recv each message from int *recv_bufloc; // offset in scratch buf for each recv int *nrecvmap; // maps receive index to rank index + MPI_Request *isend_reqs; // MPI request for each posted isend MPI_Request *request; // MPI request for each posted recv struct pack_plan_3d *unpackplan; // unpack plan for each recv message int nrecv; // # of recvs from other procs @@ -54,6 +56,7 @@ struct remap_plan_3d_kokkos { int memory; // user provides scratch space or not MPI_Comm comm; // group of procs performing remap int usecollective; // use collective or point-to-point MPI + int useisend; // if using point-to-point MPI, use MPI_Isend int usegpu_aware; // use GPU-Aware MPI or not // variables for collective MPI only int commringlen; // length of commringlist @@ -75,7 +78,7 @@ class RemapKokkos : protected Pointers { typedef FFTArrayTypes FFT_AT; RemapKokkos(class LAMMPS *); RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int, - int,int,int,int,int,int,int,int,int,int,int,int); + int,int,int,int,int,int,int,int,int,int,int,int,int); ~RemapKokkos() override; void perform(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d); @@ -85,7 +88,7 @@ class RemapKokkos : protected Pointers { struct remap_plan_3d_kokkos *remap_3d_create_plan_kokkos(MPI_Comm, int, int, int, int, int, int, int, int, int, int, int, int, - int, int, int, int, int, int); + int, int, int, int, int, int, int); void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos *); }; diff --git a/src/kspace.cpp b/src/kspace.cpp index 9cb400c1a72..ed025fd9454 100644 --- a/src/kspace.cpp +++ b/src/kspace.cpp @@ -60,6 +60,7 @@ KSpace::KSpace(LAMMPS *lmp) : #else collective_flag = 0; #endif + isend_flag = 0; kewaldflag = 0; @@ -552,6 +553,10 @@ void KSpace::modify_params(int narg, char **arg) if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); collective_flag = utils::logical(FLERR,arg[iarg+1],false,lmp); iarg += 2; + } else if (strcmp(arg[iarg],"isend") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); + isend_flag = utils::logical(FLERR,arg[iarg+1],false,lmp); + iarg += 2; } else if (strcmp(arg[iarg],"diff") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); if (strcmp(arg[iarg+1],"ad") == 0) differentiation_flag = 1; diff --git a/src/kspace.h b/src/kspace.h index 0a3db05ee7a..f056c25bbb7 100644 --- a/src/kspace.h +++ b/src/kspace.h @@ -131,6 +131,7 @@ class KSpace : protected Pointers { int compute_flag; // 0 if skip compute() int fftbench; // 0 if skip FFT timing int collective_flag; // 1 if use MPI collectives for FFT/remap + int isend_flag; // 1 if use MPI_Isend for FFT/remap int stagger_flag; // 1 if using staggered PPPM grids double splittol; // tolerance for when to truncate splitting From 5c6666a38938e63914e5192ff5a7cd904f8a2703 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Mon, 24 Nov 2025 11:25:39 -0500 Subject: [PATCH 02/17] Change isend keyword to nonblocking for KSPACE --- src/kspace.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kspace.cpp b/src/kspace.cpp index ed025fd9454..f390b6279af 100644 --- a/src/kspace.cpp +++ b/src/kspace.cpp @@ -553,7 +553,7 @@ void KSpace::modify_params(int narg, char **arg) if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); collective_flag = utils::logical(FLERR,arg[iarg+1],false,lmp); iarg += 2; - } else if (strcmp(arg[iarg],"isend") == 0) { + } else if (strcmp(arg[iarg],"nonblocking") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); isend_flag = utils::logical(FLERR,arg[iarg+1],false,lmp); iarg += 2; From 7715f2b8c5a14f4e412ca98e3471c089e67a3b61 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Mon, 24 Nov 2025 11:33:53 -0500 Subject: [PATCH 03/17] Add error if both collective and nonblocking are specified --- src/kspace.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kspace.cpp b/src/kspace.cpp index f390b6279af..9b4bf478bfb 100644 --- a/src/kspace.cpp +++ b/src/kspace.cpp @@ -528,8 +528,7 @@ void KSpace::modify_params(int narg, char **arg) } warn_nonneutral = 0; // can't use wire correction with non-neutral system iarg += 2; - } - else if (strcmp(arg[iarg], "amat") == 0) { + } else if (strcmp(arg[iarg], "amat") == 0) { if (iarg + 2 > narg) error->all(FLERR, "Illegal kspace_modify command"); if (!pppmflag) error->all(FLERR, "Illegal kspace_modify command 'amat'" "available for pppm/conp, only"); @@ -614,6 +613,7 @@ void KSpace::modify_params(int narg, char **arg) iarg += n; } } + if (collective_flag > 0 && isend_flag > 0) error->all(FLERR,"Illegal kspace_modify command, collective and nonblocking cannot both be true."); } /* ---------------------------------------------------------------------- */ From 8ca245b378e4b5774de59678ee450c21443d42ee Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Wed, 26 Nov 2025 14:23:27 -0500 Subject: [PATCH 04/17] Merge Kokkos-based KSPACE comms improvements into non-Kokkos based, add option for nonblocking --- src/KSPACE/fft3d.cpp | 12 +- src/KSPACE/fft3d.h | 2 +- src/KSPACE/fft3d_wrap.cpp | 4 +- src/KSPACE/fft3d_wrap.h | 2 +- src/KSPACE/pppm.cpp | 6 +- src/KSPACE/pppm_dipole.cpp | 6 +- src/KSPACE/pppm_disp.cpp | 12 +- src/KSPACE/remap.cpp | 922 ++++++++++++++++++++++--------------- src/KSPACE/remap.h | 12 +- src/KSPACE/remap_wrap.cpp | 4 +- src/KSPACE/remap_wrap.h | 2 +- 11 files changed, 578 insertions(+), 406 deletions(-) diff --git a/src/KSPACE/fft3d.cpp b/src/KSPACE/fft3d.cpp index 1426b9893ee..f066c7db656 100644 --- a/src/KSPACE/fft3d.cpp +++ b/src/KSPACE/fft3d.cpp @@ -18,6 +18,7 @@ FFTW3, KISS FFT, Dfti/MKL, and ACML Phil Blood (PSC) added single precision FFTs Paul Coffman (IBM) added MPI collectives remap + Nick Hagerty (ORNL) added non-blocking MPI pt2pt remap ------------------------------------------------------------------------- */ #include "fft3d.h" @@ -240,6 +241,7 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan) 2 = permute twice = slow->fast, fast->mid, mid->slow nbuf returns size of internal storage buffers used by FFT usecollective use collective MPI operations for remapping data + useisend use non-blocking or blocking MPI pt2pt operations for remapping data ------------------------------------------------------------------------- */ struct fft_plan_3d *fft_3d_create_plan( @@ -248,7 +250,7 @@ struct fft_plan_3d *fft_3d_create_plan( int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int scaled, int permute, int *nbuf, int usecollective) + int scaled, int permute, int *nbuf, int usecollective, int useisend) { struct fft_plan_3d *plan; int me,nprocs; @@ -313,7 +315,7 @@ struct fft_plan_3d *fft_3d_create_plan( first_khi = (ip2+1)*nslow/np2 - 1; plan->pre_plan = remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, first_ilo,first_ihi,first_jlo,first_jhi, - first_klo,first_khi,2,0,0,FFT_PRECISION,0); + first_klo,first_khi,2,0,0,FFT_PRECISION,0,0); if (plan->pre_plan == nullptr) { free(plan); return nullptr; @@ -338,7 +340,7 @@ struct fft_plan_3d *fft_3d_create_plan( plan->mid1_plan = remap_3d_create_plan(comm, first_ilo,first_ihi,first_jlo,first_jhi, first_klo,first_khi,second_ilo,second_ihi, second_jlo,second_jhi,second_klo,second_khi, - 2,1,0,FFT_PRECISION,usecollective); + 2,1,0,FFT_PRECISION,usecollective,useisend); if (plan->mid1_plan == nullptr) return nullptr; // 1d FFTs along mid axis @@ -379,7 +381,7 @@ struct fft_plan_3d *fft_3d_create_plan( second_jlo,second_jhi,second_klo,second_khi, second_ilo,second_ihi, third_jlo,third_jhi,third_klo,third_khi, - third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective); + third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective,useisend); if (plan->mid2_plan == nullptr) return nullptr; // 1d FFTs along slow axis @@ -408,7 +410,7 @@ struct fft_plan_3d *fft_3d_create_plan( third_klo,third_khi,third_ilo,third_ihi, third_jlo,third_jhi, out_klo,out_khi,out_ilo,out_ihi, - out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0); + out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0,0); if (plan->post_plan == nullptr) return nullptr; } diff --git a/src/KSPACE/fft3d.h b/src/KSPACE/fft3d.h index 00a3332d2e8..28c434565da 100644 --- a/src/KSPACE/fft3d.h +++ b/src/KSPACE/fft3d.h @@ -143,7 +143,7 @@ struct fft_plan_3d { extern "C" { void fft_3d(FFT_DATA *, FFT_DATA *, int, struct fft_plan_3d *); struct fft_plan_3d *fft_3d_create_plan(MPI_Comm, int, int, int, int, int, int, int, int, int, int, - int, int, int, int, int, int, int, int *, int); + int, int, int, int, int, int, int, int *, int, int); void fft_3d_destroy_plan(struct fft_plan_3d *); void factor(int, int *, int *); void bifactor(int, int *, int *); diff --git a/src/KSPACE/fft3d_wrap.cpp b/src/KSPACE/fft3d_wrap.cpp index 7b00543eeab..44d482a721b 100644 --- a/src/KSPACE/fft3d_wrap.cpp +++ b/src/KSPACE/fft3d_wrap.cpp @@ -25,13 +25,13 @@ FFT3d::FFT3d(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow, int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int scaled, int permute, int *nbuf, int usecollective) : Pointers(lmp) + int scaled, int permute, int *nbuf, int usecollective, int useisend) : Pointers(lmp) { #ifndef FFT_HEFFTE plan = fft_3d_create_plan(comm,nfast,nmid,nslow, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, - scaled,permute,nbuf,usecollective); + scaled,permute,nbuf,usecollective,useisend); if (plan == nullptr) error->one(FLERR,"Could not create 3d FFT plan"); #else heffte::plan_options options = heffte::default_options(); diff --git a/src/KSPACE/fft3d_wrap.h b/src/KSPACE/fft3d_wrap.h index 04b828b7de5..e8559cc9641 100644 --- a/src/KSPACE/fft3d_wrap.h +++ b/src/KSPACE/fft3d_wrap.h @@ -37,7 +37,7 @@ class FFT3d : protected Pointers { enum { FORWARD = 1, BACKWARD = -1 }; FFT3d(class LAMMPS *, MPI_Comm, int, int, int, int, int, int, int, int, int, int, int, int, int, - int, int, int, int, int *, int); + int, int, int, int, int *, int, int); ~FFT3d() override; void compute(FFT_SCALAR *, FFT_SCALAR *, int); void timing1d(FFT_SCALAR *, int, int); diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp index bb92208fe92..46232b13533 100644 --- a/src/KSPACE/pppm.cpp +++ b/src/KSPACE/pppm.cpp @@ -837,17 +837,17 @@ void PPPM::allocate() fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag); + 0,0,&tmp,collective_flag,isend_flag); fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag); + 0,0,&tmp,collective_flag,isend_flag); remap = new Remap(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION,collective_flag); + 1,0,0,FFT_PRECISION,collective_flag,isend_flag); } /* ---------------------------------------------------------------------- diff --git a/src/KSPACE/pppm_dipole.cpp b/src/KSPACE/pppm_dipole.cpp index 055e979b28e..193d66f8264 100644 --- a/src/KSPACE/pppm_dipole.cpp +++ b/src/KSPACE/pppm_dipole.cpp @@ -622,17 +622,17 @@ void PPPMDipole::allocate() fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag); + 0,0,&tmp,collective_flag,isend_flag); fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag); + 0,0,&tmp,collective_flag,isend_flag); remap = new Remap(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION,collective_flag); + 1,0,0,FFT_PRECISION,collective_flag,isend_flag); } /* ---------------------------------------------------------------------- diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index cbce62005f8..cf33931ac84 100644 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -1771,17 +1771,17 @@ void _noopt PPPMDisp::allocate() fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag); + 0,0,&tmp,collective_flag,isend_flag); fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag); + 0,0,&tmp,collective_flag,isend_flag); remap = new Remap(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION,collective_flag); + 1,0,0,FFT_PRECISION,collective_flag,isend_flag); } // -------------------------------------- @@ -1848,19 +1848,19 @@ void _noopt PPPMDisp::allocate() new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 0,0,&tmp,collective_flag); + 0,0,&tmp,collective_flag,isend_flag); fft2_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - 0,0,&tmp,collective_flag); + 0,0,&tmp,collective_flag,isend_flag); remap_6 = new Remap(lmp,world, nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 1,0,0,FFT_PRECISION,collective_flag); + 1,0,0,FFT_PRECISION,collective_flag,isend_flag); } // -------------------------------------- diff --git a/src/KSPACE/remap.cpp b/src/KSPACE/remap.cpp index 06b013dbd33..297705217e8 100644 --- a/src/KSPACE/remap.cpp +++ b/src/KSPACE/remap.cpp @@ -63,31 +63,47 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, struct remap_plan_3d *plan) { + int me; + MPI_Comm_rank(plan->comm,&me); + + FFT_SCALAR *scratch; + if (plan->memory == 0) + scratch = buf; + else + scratch = plan->scratch; + // use point-to-point communication if (!plan->usecollective) { int i,isend,irecv; - FFT_SCALAR *scratch; - - if (plan->memory == 0) - scratch = buf; - else - scratch = plan->scratch; - - // post all recvs into scratch space - for (irecv = 0; irecv < plan->nrecv; irecv++) + for (irecv = 0; irecv < plan->nrecv; irecv++) { MPI_Irecv(&scratch[plan->recv_bufloc[irecv]],plan->recv_size[irecv], MPI_FFT_SCALAR,plan->recv_proc[irecv],0, plan->comm,&plan->request[irecv]); + } // send all messages to other procs for (isend = 0; isend < plan->nsend; isend++) { - plan->pack(&in[plan->send_offset[isend]], - plan->sendbuf,&plan->packplan[isend]); - MPI_Send(plan->sendbuf,plan->send_size[isend],MPI_FFT_SCALAR, - plan->send_proc[isend],0,plan->comm); + int in_offset = plan->send_offset[isend]; + if (plan->useisend) { + plan->pack(&in[in_offset], + &plan->sendbuf[plan->send_bufloc[isend]], + &plan->packplan[isend]); + } else { + plan->pack(&in[in_offset], + plan->sendbuf, + &plan->packplan[isend]); + } + + if (plan->useisend) { + MPI_Isend(plan->sendbuf + plan->send_bufloc[isend],plan->send_size[isend],MPI_FFT_SCALAR, + plan->send_proc[isend],0,plan->comm,&plan->isend_reqs[isend]); + } else { + MPI_Send(plan->sendbuf,plan->send_size[isend],MPI_FFT_SCALAR, + plan->send_proc[isend],0,plan->comm); + } } // copy in -> scratch -> out for self data @@ -95,109 +111,84 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, if (plan->self) { isend = plan->nsend; irecv = plan->nrecv; - plan->pack(&in[plan->send_offset[isend]], - &scratch[plan->recv_bufloc[irecv]], + + int in_offset = plan->send_offset[isend]; + int scratch_offset = plan->recv_bufloc[irecv]; + int out_offset = plan->recv_offset[irecv]; + + plan->pack(&in[in_offset], + &scratch[scratch_offset], &plan->packplan[isend]); - plan->unpack(&scratch[plan->recv_bufloc[irecv]], - &out[plan->recv_offset[irecv]],&plan->unpackplan[irecv]); + plan->unpack(&scratch[scratch_offset], + &out[out_offset],&plan->unpackplan[irecv]); } // unpack all messages from scratch -> out for (i = 0; i < plan->nrecv; i++) { MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE); - plan->unpack(&scratch[plan->recv_bufloc[irecv]], - &out[plan->recv_offset[irecv]],&plan->unpackplan[irecv]); - } - // use All2Allv collective for remap communication + int scratch_offset = plan->recv_bufloc[irecv]; + int out_offset = plan->recv_offset[irecv]; + + plan->unpack(&scratch[scratch_offset], + &out[out_offset],&plan->unpackplan[irecv]); + } + if (plan->useisend) { + // finally, wait for all Isends to be done + MPI_Waitall(plan->nsend,plan->isend_reqs,MPI_STATUS_IGNORE); + } } else { if (plan->commringlen > 0) { int isend,irecv; - // create send and recv buffers for alltoallv collective - - int sendBufferSize = 0; - int recvBufferSize = 0; - for (int i=0;insend;i++) - sendBufferSize += plan->send_size[i]; - for (int i=0;inrecv;i++) - recvBufferSize += plan->recv_size[i]; - - auto *packedSendBuffer = (FFT_SCALAR *) malloc(sizeof(FFT_SCALAR) * sendBufferSize + 1); - auto *packedRecvBuffer = (FFT_SCALAR *) malloc(sizeof(FFT_SCALAR) * recvBufferSize + 1); + // populate send data + // buffers are allocated and count/displacement buffers + // are populated in remap_3d_create_plan - int *sendcnts = (int *) malloc(sizeof(int) * plan->commringlen); - int *rcvcnts = (int *) malloc(sizeof(int) * plan->commringlen); - int *sdispls = (int *) malloc(sizeof(int) * plan->commringlen); - int *rdispls = (int *) malloc(sizeof(int) * plan->commringlen); - int *nrecvmap = (int *) malloc(sizeof(int) * plan->commringlen); - - // create and populate send data, count and displacement buffers - - int currentSendBufferOffset = 0; + int numpacked = 0; for (isend = 0; isend < plan->commringlen; isend++) { - sendcnts[isend] = 0; - sdispls[isend] = 0; - int foundentry = 0; - for (int i=0;(insend && !foundentry); i++) { - if (plan->send_proc[i] == plan->commringlist[isend]) { - foundentry = 1; - sendcnts[isend] = plan->send_size[i]; - sdispls[isend] = currentSendBufferOffset; - plan->pack(&in[plan->send_offset[i]], - &packedSendBuffer[currentSendBufferOffset], - &plan->packplan[i]); - currentSendBufferOffset += plan->send_size[i]; - } + if (plan->sendcnts[isend]) { + plan->pack(&in[plan->send_offset[numpacked]], + &plan->sendbuf[plan->sdispls[isend]], + &plan->packplan[numpacked]); + numpacked++; + } + else if (plan->commringlist[isend] == me && plan->self) { + numpacked++; } } - // create and populate recv count and displacement buffers + MPI_Alltoallv(plan->sendbuf, plan->sendcnts, plan->sdispls, + MPI_FFT_SCALAR, scratch, plan->rcvcnts, + plan->rdispls, MPI_FFT_SCALAR, plan->comm); - int currentRecvBufferOffset = 0; - for (irecv = 0; irecv < plan->commringlen; irecv++) { - rcvcnts[irecv] = 0; - rdispls[irecv] = 0; - nrecvmap[irecv] = -1; - int foundentry = 0; - for (int i=0;(inrecv && !foundentry); i++) { - if (plan->recv_proc[i] == plan->commringlist[irecv]) { - foundentry = 1; - rcvcnts[irecv] = plan->recv_size[i]; - rdispls[irecv] = currentRecvBufferOffset; - currentRecvBufferOffset += plan->recv_size[i]; - nrecvmap[irecv] = i; - } - } - } + // copy in -> scratch -> out for self data - MPI_Alltoallv(packedSendBuffer, sendcnts, sdispls, - MPI_FFT_SCALAR, packedRecvBuffer, rcvcnts, - rdispls, MPI_FFT_SCALAR, plan->comm); + if (plan->self) { + plan->pack(&in[plan->send_offset[plan->selfnsendloc]], + &plan->sendbuf[plan->sdispls[plan->selfcommringloc]], + &plan->packplan[plan->selfnsendloc]); + plan->unpack(&plan->sendbuf[plan->sdispls[plan->selfcommringloc]], + &out[plan->recv_offset[plan->selfnrecvloc]], + &plan->unpackplan[plan->selfnrecvloc]); + } // unpack the data from the recv buffer into out - currentRecvBufferOffset = 0; + numpacked = 0; for (irecv = 0; irecv < plan->commringlen; irecv++) { - if (nrecvmap[irecv] > -1) { - plan->unpack(&packedRecvBuffer[currentRecvBufferOffset], - &out[plan->recv_offset[nrecvmap[irecv]]], - &plan->unpackplan[nrecvmap[irecv]]); - currentRecvBufferOffset += plan->recv_size[nrecvmap[irecv]]; + if (plan->rcvcnts[irecv]) { + plan->unpack(&scratch[plan->rdispls[irecv]], + &out[plan->recv_offset[numpacked]], + &plan->unpackplan[numpacked]); + numpacked++; + } + else if (plan->commringlist[irecv] == me && plan->self) { + numpacked++; } } - - // free temporary data structures - - free(sendcnts); - free(rcvcnts); - free(sdispls); - free(rdispls); - free(nrecvmap); - free(packedSendBuffer); - free(packedRecvBuffer); } } } @@ -225,6 +216,7 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, 1 = single precision (4 bytes per datum) 2 = double precision (8 bytes per datum) usecollective whether to use collective MPI or point-to-point + useisend whether to use non-blocking or blocking MPI point-to-point ------------------------------------------------------------------------- */ struct remap_plan_3d *remap_3d_create_plan( @@ -232,8 +224,8 @@ struct remap_plan_3d *remap_3d_create_plan( int in_ilo, int in_ihi, int in_jlo, int in_jhi, int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, - int out_klo, int out_khi, - int nqty, int permute, int memory, int /*precision*/, int usecollective) + int out_klo, int out_khi, int nqty, int permute, + int memory, int /*precision*/, int usecollective, int useisend) { @@ -252,6 +244,7 @@ struct remap_plan_3d *remap_3d_create_plan( plan = (struct remap_plan_3d *) malloc(sizeof(struct remap_plan_3d)); if (plan == nullptr) return nullptr; plan->usecollective = usecollective; + plan->useisend = useisend; // store parameters in local data structs @@ -293,342 +286,488 @@ struct remap_plan_3d *remap_3d_create_plan( return nullptr; } + // combine input & output extents across all procs + + MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE, + inarray,sizeof(struct extent_3d),MPI_BYTE,comm); MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE, outarray,sizeof(struct extent_3d),MPI_BYTE,comm); - // count send collides, including self - - nsend = 0; - iproc = me; - for (i = 0; i < nprocs; i++) { - iproc++; - if (iproc == nprocs) iproc = 0; - nsend += remap_3d_collide(&in,&outarray[iproc],&overlap); - } - - // malloc space for send info - - if (nsend) { - plan->pack = pack_3d; + // for efficiency, handle collective & non-collective setup separately - plan->send_offset = (int *) malloc(nsend*sizeof(int)); - plan->send_size = (int *) malloc(nsend*sizeof(int)); - plan->send_proc = (int *) malloc(nsend*sizeof(int)); - plan->packplan = (struct pack_plan_3d *) - malloc(nsend*sizeof(struct pack_plan_3d)); - - if (plan->send_offset == nullptr || plan->send_size == nullptr || - plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr; - } + if (!plan->usecollective) { + // count send & recv collides, including self - // store send info, with self as last entry - - nsend = 0; - iproc = me; - for (i = 0; i < nprocs; i++) { - iproc++; - if (iproc == nprocs) iproc = 0; - if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { - plan->send_proc[nsend] = iproc; - plan->send_offset[nsend] = nqty * - ((overlap.klo-in.klo)*in.jsize*in.isize + - ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo)); - plan->packplan[nsend].nfast = nqty*overlap.isize; - plan->packplan[nsend].nmid = overlap.jsize; - plan->packplan[nsend].nslow = overlap.ksize; - plan->packplan[nsend].nstride_line = nqty*in.isize; - plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize; - plan->packplan[nsend].nqty = nqty; - plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize; - nsend++; + nsend = 0; + nrecv = 0; + for (i = 0; i < nprocs; i++) { + nsend += remap_3d_collide(&in,&outarray[i],&overlap); + nrecv += remap_3d_collide(&out,&inarray[i],&overlap); } - } - // plan->nsend = # of sends not including self + // malloc space for send & recv info - if (nsend && plan->send_proc[nsend-1] == me) { - if (plan->usecollective) // for collectives include self in nsend list - plan->nsend = nsend; - else - plan->nsend = nsend - 1; - } else - plan->nsend = nsend; + if (nsend) { + plan->pack = pack_3d; - // combine input extents across all procs + plan->send_offset = (int *) malloc(nsend*sizeof(int)); + plan->send_size = (int *) malloc(nsend*sizeof(int)); + plan->send_proc = (int *) malloc(nsend*sizeof(int)); + plan->packplan = (struct pack_plan_3d *) + malloc(nsend*sizeof(struct pack_plan_3d)); - MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE, - inarray,sizeof(struct extent_3d),MPI_BYTE,comm); + if (plan->useisend) + plan->isend_reqs = (MPI_Request *) malloc(nsend*sizeof(MPI_Request)); + plan->send_bufloc = (int *) malloc(nsend*sizeof(int)); + if (plan->send_bufloc == nullptr) return nullptr; - // count recv collides, including self + if (plan->send_offset == nullptr || plan->send_size == nullptr || + plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr; + } - nrecv = 0; - iproc = me; - for (i = 0; i < nprocs; i++) { - iproc++; - if (iproc == nprocs) iproc = 0; - nrecv += remap_3d_collide(&out,&inarray[iproc],&overlap); - } + if (nrecv) { + if (permute == 0) + plan->unpack = unpack_3d; + else if (permute == 1) { + if (nqty == 1) + plan->unpack = unpack_3d_permute1_1; + else if (nqty == 2) + plan->unpack = unpack_3d_permute1_2; + else + plan->unpack = unpack_3d_permute1_n; + } + else if (permute == 2) { + if (nqty == 1) + plan->unpack = unpack_3d_permute2_1; + else if (nqty == 2) + plan->unpack = unpack_3d_permute2_2; + else + plan->unpack = unpack_3d_permute2_n; + } - // malloc space for recv info - - if (nrecv) { - if (permute == 0) - plan->unpack = unpack_3d; - else if (permute == 1) { - if (nqty == 1) - plan->unpack = unpack_3d_permute1_1; - else if (nqty == 2) - plan->unpack = unpack_3d_permute1_2; - else - plan->unpack = unpack_3d_permute1_n; + plan->recv_offset = (int *) malloc(nrecv*sizeof(int)); + plan->recv_size = (int *) malloc(nrecv*sizeof(int)); + plan->recv_proc = (int *) malloc(nrecv*sizeof(int)); + plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int)); + plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request)); + plan->unpackplan = (struct pack_plan_3d *) + malloc(nrecv*sizeof(struct pack_plan_3d)); + + if (plan->recv_offset == nullptr || plan->recv_size == nullptr || + plan->recv_proc == nullptr || plan->recv_bufloc == nullptr || + plan->request == nullptr || plan->unpackplan == nullptr) return nullptr; } - else if (permute == 2) { - if (nqty == 1) - plan->unpack = unpack_3d_permute2_1; - else if (nqty == 2) - plan->unpack = unpack_3d_permute2_2; - else - plan->unpack = unpack_3d_permute2_n; + + // store send info, with self as last entry + + nsend = 0; + iproc = me; + ibuf = 0; + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { + plan->send_proc[nsend] = iproc; + plan->send_offset[nsend] = nqty * + ((overlap.klo-in.klo)*in.jsize*in.isize + + ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo)); + plan->packplan[nsend].nfast = nqty*overlap.isize; + plan->packplan[nsend].nmid = overlap.jsize; + plan->packplan[nsend].nslow = overlap.ksize; + plan->packplan[nsend].nstride_line = nqty*in.isize; + plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize; + plan->packplan[nsend].nqty = nqty; + plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + plan->send_bufloc[nsend] = ibuf; + ibuf += plan->send_size[nsend]; + nsend++; + } } - plan->recv_offset = (int *) malloc(nrecv*sizeof(int)); - plan->recv_size = (int *) malloc(nrecv*sizeof(int)); - plan->recv_proc = (int *) malloc(nrecv*sizeof(int)); - plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int)); - plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request)); - plan->unpackplan = (struct pack_plan_3d *) - malloc(nrecv*sizeof(struct pack_plan_3d)); - - if (plan->recv_offset == nullptr || plan->recv_size == nullptr || - plan->recv_proc == nullptr || plan->recv_bufloc == nullptr || - plan->request == nullptr || plan->unpackplan == nullptr) return nullptr; - } + // plan->nsend = # of sends not including self + + if (nsend && plan->send_proc[nsend-1] == me) plan->nsend = nsend - 1; + else plan->nsend = nsend; + + // store recv info, with self as last entry + + ibuf = 0; + nrecv = 0; + iproc = me; + + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { + plan->recv_proc[nrecv] = iproc; + plan->recv_bufloc[nrecv] = ibuf; + + if (permute == 0) { + plan->recv_offset[nrecv] = nqty * + ((overlap.klo-out.klo)*out.jsize*out.isize + + (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo)); + plan->unpackplan[nrecv].nfast = nqty*overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.isize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize; + plan->unpackplan[nrecv].nqty = nqty; + } + else if (permute == 1) { + plan->recv_offset[nrecv] = nqty * + ((overlap.ilo-out.ilo)*out.ksize*out.jsize + + (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo)); + plan->unpackplan[nrecv].nfast = overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.jsize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize; + plan->unpackplan[nrecv].nqty = nqty; + } + else { + plan->recv_offset[nrecv] = nqty * + ((overlap.jlo-out.jlo)*out.isize*out.ksize + + (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo)); + plan->unpackplan[nrecv].nfast = overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.ksize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize; + plan->unpackplan[nrecv].nqty = nqty; + } - // store recv info, with self as last entry - - ibuf = 0; - nrecv = 0; - iproc = me; - - for (i = 0; i < nprocs; i++) { - iproc++; - if (iproc == nprocs) iproc = 0; - if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { - plan->recv_proc[nrecv] = iproc; - plan->recv_bufloc[nrecv] = ibuf; - - if (permute == 0) { - plan->recv_offset[nrecv] = nqty * - ((overlap.klo-out.klo)*out.jsize*out.isize + - (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo)); - plan->unpackplan[nrecv].nfast = nqty*overlap.isize; - plan->unpackplan[nrecv].nmid = overlap.jsize; - plan->unpackplan[nrecv].nslow = overlap.ksize; - plan->unpackplan[nrecv].nstride_line = nqty*out.isize; - plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize; - plan->unpackplan[nrecv].nqty = nqty; - } - else if (permute == 1) { - plan->recv_offset[nrecv] = nqty * - ((overlap.ilo-out.ilo)*out.ksize*out.jsize + - (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo)); - plan->unpackplan[nrecv].nfast = overlap.isize; - plan->unpackplan[nrecv].nmid = overlap.jsize; - plan->unpackplan[nrecv].nslow = overlap.ksize; - plan->unpackplan[nrecv].nstride_line = nqty*out.jsize; - plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize; - plan->unpackplan[nrecv].nqty = nqty; - } - else { - plan->recv_offset[nrecv] = nqty * - ((overlap.jlo-out.jlo)*out.isize*out.ksize + - (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo)); - plan->unpackplan[nrecv].nfast = overlap.isize; - plan->unpackplan[nrecv].nmid = overlap.jsize; - plan->unpackplan[nrecv].nslow = overlap.ksize; - plan->unpackplan[nrecv].nstride_line = nqty*out.ksize; - plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize; - plan->unpackplan[nrecv].nqty = nqty; + plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + ibuf += plan->recv_size[nrecv]; + nrecv++; } - - plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize; - ibuf += plan->recv_size[nrecv]; - nrecv++; } - } - // create sub-comm rank list + // plan->nrecv = # of recvs not including self - if (plan->usecollective) { - plan->commringlist = nullptr; + if (nrecv && plan->recv_proc[nrecv-1] == me) plan->nrecv = nrecv - 1; + else plan->nrecv = nrecv; - // merge recv and send rank lists - // ask Steve Plimpton about method to more accurately determine - // maximum number of procs contributing to pencil + // init remaining fields in remap plan - int maxcommsize = nprocs; - int *commringlist = (int *) malloc(maxcommsize*sizeof(int)); - int commringlen = 0; + plan->memory = memory; - for (i = 0; i < nrecv; i++) { - commringlist[i] = plan->recv_proc[i]; - commringlen++; + if (nrecv == plan->nrecv) plan->self = 0; + else plan->self = 1; + + // the plan->sendbuf and plan->recvbuf are used by both the + // collective & non-collective implementations. + // For non-collective and blocking, the buffer size is MAX(send_size) for any one send + + // find biggest send message (not including self) and malloc space for it + + size = 0; + if (plan->useisend) { + for (nsend = 0; nsend < plan->nsend; nsend++) + size += plan->send_size[nsend]; + } else { + for (nsend = 0; nsend < plan->nsend; nsend++) + size = MAX(size,plan->send_size[nsend]); } - for (i = 0; i < nsend; i++) { - int foundentry = 0; - for (j = 0; j < commringlen;j++) - if (commringlist[j] == plan->send_proc[i]) foundentry = 1; - if (!foundentry) { - commringlist[commringlen] = plan->send_proc[i]; - commringlen++; - } + if (size) { + plan->sendbuf = (FFT_SCALAR*) malloc(sizeof(FFT_SCALAR) * size); + if (plan->sendbuf == nullptr) return nullptr; } - // sort initial commringlist + // if requested, allocate internal scratch space for recvs, + // only need it if I will receive any data (including self) - int swap = 0; - for (i = 0 ; i < (commringlen - 1); i++) { - for (j = 0 ; j < commringlen - i - 1; j++) { - if (commringlist[j] > commringlist[j+1]) { - swap = commringlist[j]; - commringlist[j] = commringlist[j+1]; - commringlist[j+1] = swap; - } + if (memory == 1) { + if (nrecv > 0) { + plan->scratch = (FFT_SCALAR*) malloc(sizeof(FFT_SCALAR) * nqty*out.isize*out.jsize*out.ksize); + if (plan->scratch == nullptr) return nullptr; } } - // collide all inarray extents for the comm ring with all output - // extents and all outarray extents for the comm ring with all input - // extents - if there is a collison add the rank to the comm ring, - // keep iterating until nothing is added to commring - - int commringappend = 1; - while (commringappend) { - int newcommringlen = commringlen; - commringappend = 0; - for (i = 0; i < commringlen; i++) { - for (j = 0; j < nprocs; j++) { - if (remap_3d_collide(&inarray[commringlist[i]], - &outarray[j],&overlap)) { - int alreadyinlist = 0; - for (int k = 0; k < newcommringlen; k++) { - if (commringlist[k] == j) { - alreadyinlist = 1; - } - } - if (!alreadyinlist) { - commringlist[newcommringlen++] = j; - commringappend = 1; - } - } - if (remap_3d_collide(&outarray[commringlist[i]], - &inarray[j],&overlap)) { - int alreadyinlist = 0; - for (int k = 0 ; k < newcommringlen; k++) { - if (commringlist[k] == j) alreadyinlist = 1; - } - if (!alreadyinlist) { - commringlist[newcommringlen++] = j; - commringappend = 1; - } - } - } - } - commringlen = newcommringlen; + // Non-collectives do not use MPI Communicator Groups + + MPI_Comm_dup(comm,&plan->comm); + } else { + // Improved approach - use an AllReduce to aggregate which ranks need to be included + // To do this, we build the local proc's send/receive list, then do an AllReduce + // to create the send/recv count for the Alltoallv + + // local arrays to be used in the allreduce + // start with max length -- nprocs. Unused entries will be removed later + + int *local_cnts = (int*) malloc(2*nprocs*sizeof(int)); + if (local_cnts == nullptr) return nullptr; + int *local_sendcnts = local_cnts; + int *local_recvcnts = (local_cnts + nprocs); + + // local arrays used to store the results of the allreduce + + int *global_cnts = (int*) malloc(2*nprocs*sizeof(int)); + if (global_cnts == nullptr) return nullptr; + int *global_sendcnts = global_cnts; + int *global_recvcnts = (global_cnts + nprocs); + + // count send & recv collides, including self + + nsend = 0; + nrecv = 0; + for (i = 0; i < nprocs; i++) { + local_sendcnts[i] = remap_3d_collide(&in,&outarray[i],&overlap); + local_recvcnts[i] = remap_3d_collide(&out,&inarray[i],&overlap); + nsend += local_sendcnts[i]; + nrecv += local_recvcnts[i]; } - // sort the final commringlist + // perform an AllReduce to get the counts from all other processors and build sendcnts list - for (i = 0 ; i < ( commringlen - 1 ); i++) { - for (j = 0 ; j < commringlen - i - 1; j++) { - if (commringlist[j] > commringlist[j+1]) { - swap = commringlist[j]; - commringlist[j] = commringlist[j+1]; - commringlist[j+1] = swap; - } + MPI_Allreduce(local_cnts, global_cnts, 2*nprocs, MPI_INT, MPI_SUM, comm); + + // now remove procs that are 0 in send or recv to create minimized sendcnts/recvcnts for AlltoAllv + // also builds commringlist -- which is already sorted + + int *commringlist = (int*) malloc(nprocs * sizeof(int)); + int commringlen = 0; + + for (i = 0; i < nprocs; i++) { + if (global_sendcnts[i] > 0 || global_recvcnts[i] > 0) { + commringlist[commringlen] = i; + commringlen++; } } // resize commringlist to final size - commringlist = (int *) realloc(commringlist, commringlen*sizeof(int) + 1); + commringlist = (int *) realloc(commringlist, commringlen*sizeof(int)); // set the plan->commringlist plan->commringlen = commringlen; plan->commringlist = commringlist; - } - // plan->nrecv = # of recvs not including self - // for collectives include self in the nsend list + // clean up local buffers that are finished - if (nrecv && plan->recv_proc[nrecv-1] == me) { - if (plan->usecollective) plan->nrecv = nrecv; - else plan->nrecv = nrecv - 1; - } else plan->nrecv = nrecv; + local_sendcnts = nullptr; + local_recvcnts = nullptr; + global_recvcnts = nullptr; + global_sendcnts = nullptr; + free(local_cnts); + free(global_cnts); - // init remaining fields in remap plan + // malloc space for send & recv info + // if the current proc is involved in any way in the communication, allocate space + // because of the Alltoallv, both send and recv have to be initialized even if + // only one of those is performed - plan->memory = memory; + if (nsend || nrecv) { - if (nrecv == plan->nrecv) plan->self = 0; - else plan->self = 1; + // send space - // free locally malloced space + plan->selfcommringloc = -1; + plan->selfnsendloc = -1; + plan->selfnrecvloc = -1; - free(inarray); - free(outarray); + plan->nsend = nsend; + plan->pack = pack_3d; - // find biggest send message (not including self) and malloc space for it + plan->send_offset = (int *) malloc(nsend*sizeof(int)); + plan->send_size = (int *) malloc(plan->commringlen*sizeof(int)); - plan->sendbuf = nullptr; + plan->sendcnts = (int *) malloc(plan->commringlen*sizeof(int)); + plan->sdispls = (int *) malloc(plan->commringlen*sizeof(int)); - size = 0; - for (nsend = 0; nsend < plan->nsend; nsend++) - size = MAX(size,plan->send_size[nsend]); + // only used when sendcnt > 0 - if (size) { - plan->sendbuf = (FFT_SCALAR *) malloc(size*sizeof(FFT_SCALAR)); - if (plan->sendbuf == nullptr) return nullptr; - } + plan->packplan = (struct pack_plan_3d *) + malloc(nsend*sizeof(struct pack_plan_3d)); + + if (plan->send_offset == nullptr || plan->send_size == nullptr || + plan->sendcnts == nullptr || plan->sdispls == nullptr || + plan->packplan == nullptr) return nullptr; + + // recv space + + plan->nrecv = nrecv; - // if requested, allocate internal scratch space for recvs, - // only need it if I will receive any data (including self) + if (permute == 0) + plan->unpack = unpack_3d; + else if (permute == 1) { + if (nqty == 1) + plan->unpack = unpack_3d_permute1_1; + else if (nqty == 2) + plan->unpack = unpack_3d_permute1_2; + else + plan->unpack = unpack_3d_permute1_n; + } + else if (permute == 2) { + if (nqty == 1) + plan->unpack = unpack_3d_permute2_1; + else if (nqty == 2) + plan->unpack = unpack_3d_permute2_2; + else + plan->unpack = unpack_3d_permute2_n; + } + + plan->recv_offset = (int *) malloc(nrecv*sizeof(int)); + plan->recv_size = (int *) malloc(plan->commringlen*sizeof(int)); + + plan->rcvcnts = (int *) malloc(plan->commringlen*sizeof(int)); + plan->rdispls = (int *) malloc(plan->commringlen*sizeof(int)); - plan->scratch = nullptr; + // only used when recvcnt > 0 - if (memory == 1) { - if (nrecv > 0) { - plan->scratch = - (FFT_SCALAR *) malloc((size_t)nqty*out.isize*out.jsize*out.ksize * - sizeof(FFT_SCALAR)); - if (plan->scratch == nullptr) return nullptr; + plan->unpackplan = (struct pack_plan_3d *) + malloc(nrecv*sizeof(struct pack_plan_3d)); + + if (plan->recv_offset == nullptr || plan->recv_size == nullptr || + plan->rcvcnts == nullptr || plan->rdispls == nullptr || + plan->unpackplan == nullptr) return nullptr; } - } - // if using collective and the commringlist is NOT empty create a - // communicator for the plan based off an MPI_Group created with - // ranks from the commringlist + // store send info, with self as last entry - if ((plan->usecollective && (plan->commringlen > 0))) { - MPI_Group orig_group, new_group; - MPI_Comm_group(comm, &orig_group); - MPI_Group_incl(orig_group, plan->commringlen, - plan->commringlist, &new_group); - MPI_Comm_create(comm, new_group, &plan->comm); - } + nsend = 0; + ibuf = 0; + int total_send_size = 0; + for (i = 0; i < plan->commringlen; i++) { + iproc = plan->commringlist[i]; + if (iproc == me) { + plan->selfcommringloc = i; + plan->selfnsendloc = nsend; + } + if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { + // number of entries required for this pack's 3-d coords + plan->send_offset[nsend] = nqty * + ((overlap.klo-in.klo)*in.jsize*in.isize + + ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo)); + plan->packplan[nsend].nfast = nqty*overlap.isize; + plan->packplan[nsend].nmid = overlap.jsize; + plan->packplan[nsend].nslow = overlap.ksize; + plan->packplan[nsend].nstride_line = nqty*in.isize; + plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize; + plan->packplan[nsend].nqty = nqty; + // total amount of overlap + plan->send_size[i] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + plan->sendcnts[i] = plan->send_size[i]; + plan->sdispls[i] = ibuf; + ibuf += plan->send_size[i]; + nsend++; + } else { + plan->send_size[i] = 0; + plan->sdispls[i] = ibuf; + plan->sendcnts[i] = 0; + } + total_send_size += plan->send_size[i]; + } + + if (total_send_size) { + plan->sendbuf = (FFT_SCALAR*) malloc(total_send_size * sizeof(FFT_SCALAR)); + if (plan->sendbuf == nullptr) return nullptr; + } - // if using collective and the comm ring list is empty create - // a communicator for the plan with an empty group + // store recv info, with self as last entry - else if ((plan->usecollective) && (plan->commringlen == 0)) { - MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm); + ibuf = 0; + nrecv = 0; + + for (i = 0; i < plan->commringlen; i++) { + iproc = plan->commringlist[i]; + if (iproc == me) { + plan->selfnrecvloc = nrecv; + } + if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { + + if (permute == 0) { + plan->recv_offset[nrecv] = nqty * + ((overlap.klo-out.klo)*out.jsize*out.isize + + (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo)); + plan->unpackplan[nrecv].nfast = nqty*overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.isize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize; + plan->unpackplan[nrecv].nqty = nqty; + } + else if (permute == 1) { + plan->recv_offset[nrecv] = nqty * + ((overlap.ilo-out.ilo)*out.ksize*out.jsize + + (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo)); + plan->unpackplan[nrecv].nfast = overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.jsize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize; + plan->unpackplan[nrecv].nqty = nqty; + } + else { + plan->recv_offset[nrecv] = nqty * + ((overlap.jlo-out.jlo)*out.isize*out.ksize + + (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo)); + plan->unpackplan[nrecv].nfast = overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.ksize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize; + plan->unpackplan[nrecv].nqty = nqty; + } + + plan->recv_size[i] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + plan->rcvcnts[i] = plan->recv_size[i]; + plan->rdispls[i] = ibuf; + ibuf += plan->recv_size[i]; + nrecv++; + } else { + plan->recv_size[i] = 0; + plan->rcvcnts[i] = 0; + plan->rdispls[i] = ibuf; + } + } + + // init remaining fields in remap plan + + plan->memory = memory; + + if (plan->sendcnts[plan->selfcommringloc]) { + plan->self = 1; + plan->sendcnts[plan->selfcommringloc] = 0; + plan->rcvcnts[plan->selfcommringloc] = 0; + } + else { + plan->self = 0; + } + + // if requested, allocate internal scratch space for recvs, + // only need it if I will receive any data (including self) + + if (memory == 1) { + if (nrecv > 0) { + plan->scratch = (FFT_SCALAR*) malloc(nqty*out.isize*out.jsize*out.ksize * sizeof(FFT_SCALAR)); + if (plan->scratch == nullptr) return nullptr; + } + } + + // if using collective and the commringlist is NOT empty create a + // communicator for the plan based off an MPI_Group created with + // ranks from the commringlist + + if (plan->commringlen > 0) { + MPI_Group orig_group, new_group; + MPI_Comm_group(comm, &orig_group); + MPI_Group_incl(orig_group, plan->commringlen, + plan->commringlist, &new_group); + MPI_Comm_create(comm, new_group, &plan->comm); + } + + // if using collective and the comm ring list is empty create + // a communicator for the plan with an empty group + + else + MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm); } - // not using collective - dup comm + // free locally malloced space - else MPI_Comm_dup(comm,&plan->comm); + free(inarray); + free(outarray); // return pointer to plan @@ -641,39 +780,60 @@ struct remap_plan_3d *remap_3d_create_plan( void remap_3d_destroy_plan(struct remap_plan_3d *plan) { + if (plan == nullptr) return; + // free MPI communicator - if (!(plan->usecollective) || (plan->commringlen != 0)) + if (!((plan->usecollective) && (plan->commringlen == 0))) MPI_Comm_free(&plan->comm); if (plan->usecollective) { - if (plan->commringlist != nullptr) + if (plan->commringlist != nullptr) { free(plan->commringlist); - } + free(plan->sendcnts); + free(plan->rcvcnts); + free(plan->sdispls); + free(plan->rdispls); + } - // free internal arrays + if (plan->nsend) { + free(plan->send_offset); + free(plan->send_size); + free(plan->packplan); + } - if (plan->nsend || plan->self) { - free(plan->send_offset); - free(plan->send_size); - free(plan->send_proc); - free(plan->packplan); - if (plan->sendbuf) free(plan->sendbuf); - } + if (plan->nrecv) { + free(plan->recv_offset); + free(plan->recv_size); + free(plan->unpackplan); + } + } else { + // free arrays used in pt2pt communication + + if (plan->nsend || plan->self) { + free(plan->send_offset); + free(plan->send_size); + free(plan->send_proc); + free(plan->packplan); + if (plan->useisend) { + free(plan->isend_reqs); + free(plan->send_bufloc); + } + } - if (plan->nrecv || plan->self) { - free(plan->recv_offset); - free(plan->recv_size); - free(plan->recv_proc); - free(plan->recv_bufloc); - free(plan->request); - free(plan->unpackplan); - if (plan->scratch) free(plan->scratch); + if (plan->nrecv || plan->self) { + free(plan->recv_offset); + free(plan->recv_size); + free(plan->recv_proc); + free(plan->recv_bufloc); + free(plan->request); + free(plan->unpackplan); + } } // free plan itself - free(plan); + delete plan; } /* ---------------------------------------------------------------------- diff --git a/src/KSPACE/remap.h b/src/KSPACE/remap.h index 59d47ede534..c171bbe5a3e 100644 --- a/src/KSPACE/remap.h +++ b/src/KSPACE/remap.h @@ -27,6 +27,8 @@ struct remap_plan_3d { int *send_offset; // extraction loc for each send int *send_size; // size of each send message int *send_proc; // proc to send each message to + int *send_bufloc; // if useisend, offset in send buf for each isend + MPI_Request *isend_reqs; // MPI request for each posted isend struct pack_plan_3d *packplan; // pack plan for each send message int *recv_offset; // insertion loc for each recv int *recv_size; // size of each recv message @@ -40,8 +42,16 @@ struct remap_plan_3d { int memory; // user provides scratch space or not MPI_Comm comm; // group of procs performing remap int usecollective; // use collective or point-to-point MPI + int useisend; // if using point-to-point MPI, use MPI_Isend int commringlen; // length of commringlist int *commringlist; // ranks on communication ring of this plan + int *sendcnts; // # of elements in send buffer for each rank + int *rcvcnts; // # of elements in recv buffer for each rank + int *sdispls; // extraction location in send buffer for each rank + int *rdispls; // extraction location in recv buffer for each rank + int selfcommringloc; // current proc's location in commringlist + int selfnsendloc; // current proc's location in send lists + int selfnrecvloc; // current proc's location in recv lists }; // collision between 2 regions @@ -56,6 +66,6 @@ struct extent_3d { void remap_3d(FFT_SCALAR *, FFT_SCALAR *, FFT_SCALAR *, struct remap_plan_3d *); struct remap_plan_3d *remap_3d_create_plan(MPI_Comm, int, int, int, int, int, int, int, int, int, - int, int, int, int, int, int, int, int); + int, int, int, int, int, int, int, int, int); void remap_3d_destroy_plan(struct remap_plan_3d *); int remap_3d_collide(struct extent_3d *, struct extent_3d *, struct extent_3d *); diff --git a/src/KSPACE/remap_wrap.cpp b/src/KSPACE/remap_wrap.cpp index ca98748011b..31bf6af910e 100644 --- a/src/KSPACE/remap_wrap.cpp +++ b/src/KSPACE/remap_wrap.cpp @@ -26,12 +26,12 @@ Remap::Remap(LAMMPS *lmp, MPI_Comm comm, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int nqty, int permute, int memory, - int precision, int usecollective) : Pointers(lmp) + int precision, int usecollective, int useisend) : Pointers(lmp) { plan = remap_3d_create_plan(comm, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, - nqty,permute,memory,precision,usecollective); + nqty,permute,memory,precision,usecollective,useisend); if (plan == nullptr) error->one(FLERR,"Could not create 3d remap plan"); } diff --git a/src/KSPACE/remap_wrap.h b/src/KSPACE/remap_wrap.h index fc34bc2df17..2cba678c865 100644 --- a/src/KSPACE/remap_wrap.h +++ b/src/KSPACE/remap_wrap.h @@ -22,7 +22,7 @@ namespace LAMMPS_NS { class Remap : protected Pointers { public: Remap(class LAMMPS *, MPI_Comm, int, int, int, int, int, int, int, int, int, int, int, int, int, - int, int, int, int); + int, int, int, int, int); ~Remap() override; void perform(FFT_SCALAR *, FFT_SCALAR *, FFT_SCALAR *); From afc4c2724590d452533af7a0248f12a27b23a196 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Wed, 26 Nov 2025 14:35:40 -0500 Subject: [PATCH 05/17] Add 0 for use_isend parameter of FFT3d init in AMOEBA --- src/AMOEBA/amoeba_convolution.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp index b50fd673373..f9a0e794601 100644 --- a/src/AMOEBA/amoeba_convolution.cpp +++ b/src/AMOEBA/amoeba_convolution.cpp @@ -173,17 +173,17 @@ void AmoebaConvolution::allocate_grid() fft1 = new FFT3d(lmp,world,nx,ny,nz, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,&tmp,0); + 1,0,&tmp,0,0); fft2 = new FFT3d(lmp,world,nx,ny,nz, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,0); + 0,0,&tmp,0,0); remap = new Remap(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - nqty,0,0,FFT_PRECISION,0); + nqty,0,0,FFT_PRECISION,0,0); // memory allocations From 8b8d1baccd9b29973ab043949554bcc5db08f2a9 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Wed, 26 Nov 2025 15:13:47 -0500 Subject: [PATCH 06/17] Added use_isend param=0 for FFT3d init in ELECTRODE and PHONON packages --- src/ELECTRODE/pppm_electrode.cpp | 6 +++--- src/PHONON/fix_phonon.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ELECTRODE/pppm_electrode.cpp b/src/ELECTRODE/pppm_electrode.cpp index 618b3af31c2..9194691b53c 100644 --- a/src/ELECTRODE/pppm_electrode.cpp +++ b/src/ELECTRODE/pppm_electrode.cpp @@ -1069,15 +1069,15 @@ void PPPMElectrode::allocate() fft1 = new FFT3d(lmp, world, nx_pppm, ny_pppm, nz_pppm, nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft, nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft, - 0, 0, &tmp, collective_flag); + 0, 0, &tmp, collective_flag, 0); fft2 = new FFT3d(lmp, world, nx_pppm, ny_pppm, nz_pppm, nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft, nxlo_in, nxhi_in, nylo_in, nyhi_in, nzlo_in, nzhi_in, 0, 0, - &tmp, collective_flag); + &tmp, collective_flag, 0); remap = new Remap(lmp, world, nxlo_in, nxhi_in, nylo_in, nyhi_in, nzlo_in, nzhi_in, nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft, 1, 0, 0, FFT_PRECISION, - collective_flag); + collective_flag, 0); // ELECTRODE specific allocations diff --git a/src/PHONON/fix_phonon.cpp b/src/PHONON/fix_phonon.cpp index 7998395038a..94a86b8aa32 100644 --- a/src/PHONON/fix_phonon.cpp +++ b/src/PHONON/fix_phonon.cpp @@ -159,7 +159,7 @@ FixPhonon::FixPhonon(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) for (int i = 1; i < nprocs; ++i) fft_disp[i] = fft_disp[i-1] + fft_cnts[i-1]; delete []nx_loc; - fft = new FFT3d(lmp,world,nz,ny,nx,0,nz-1,0,ny-1,nxlo,nxhi,0,nz-1,0,ny-1,nxlo,nxhi,0,0,&mysize,0); + fft = new FFT3d(lmp,world,nz,ny,nx,0,nz-1,0,ny-1,nxlo,nxhi,0,nz-1,0,ny-1,nxlo,nxhi,0,0,&mysize,0,0); memory->create(fft_data, MAX(1,mynq)*2, "fix_phonon:fft_data"); // allocate variables; MAX(1,... is used because a null buffer will result in error for MPI From 107c2b9c9344c9feaf9cffc72d8443d81192e179 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Mon, 1 Dec 2025 09:52:32 -0500 Subject: [PATCH 07/17] Added usenonblocking param to FFT3D unit tests --- unittest/utils/test_fft3d.cpp | 33 +++++++++++++------------ unittest/utils/test_fft3d_kokkos.cpp | 36 +++++++++++++++------------- 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/unittest/utils/test_fft3d.cpp b/unittest/utils/test_fft3d.cpp index 7c1948d286e..d650cd64ada 100644 --- a/unittest/utils/test_fft3d.cpp +++ b/unittest/utils/test_fft3d.cpp @@ -116,16 +116,17 @@ class FFT3DTest : public LAMMPSTest { int out_klo = 0, out_khi = nslow - 1; // FFT parameters - int scaled = 0; // No scaling - int permute = 0; // No permutation - int nbuf = 0; // Buffer size (output) - int usecollective = 0; // Use point-to-point communication + int scaled = 0; // No scaling + int permute = 0; // No permutation + int nbuf = 0; // Buffer size (output) + int usecollective = 0; // Use point-to-point communication + int usenonblocking = 0; // Use blocking point-to-point communication // Create FFT3d object BEGIN_HIDE_OUTPUT(); fft = new FFT3d(lmp, MPI_COMM_WORLD, nfast, nmid, nslow, in_ilo, in_ihi, in_jlo, in_jhi, in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, - scaled, permute, &nbuf, usecollective); + scaled, permute, &nbuf, usecollective, usenonblocking); END_HIDE_OUTPUT(); ASSERT_NE(fft, nullptr); @@ -654,16 +655,17 @@ TEST_F(FFT3DTest, RoundTrip_MPI_2proc_32x32x32) std::memset(output_data, 0, 2 * local_size * sizeof(FFT_SCALAR)); // FFT parameters - int scaled = 0; // No scaling - int permute = 0; // No permutation - int nbuf = 0; // Buffer size (output) - int usecollective = 0; // Use point-to-point communication + int scaled = 0; // No scaling + int permute = 0; // No permutation + int nbuf = 0; // Buffer size (output) + int usecollective = 0; // Use point-to-point communication + int usenonblocking = 0; // Use blocking point-to-point communication // Create MPI-aware FFT3d object BEGIN_HIDE_OUTPUT(); fft = new FFT3d(lmp, MPI_COMM_WORLD, nfast, nmid, nslow, in_ilo, in_ihi, in_jlo, in_jhi, in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute, - &nbuf, usecollective); + &nbuf, usecollective, usenonblocking); END_HIDE_OUTPUT(); ASSERT_NE(fft, nullptr); @@ -799,16 +801,17 @@ TEST_F(FFT3DTest, RoundTrip_MPI_4proc_64x64x64) std::memset(output_data, 0, 2 * local_size * sizeof(FFT_SCALAR)); // FFT parameters - int scaled = 0; // No scaling - int permute = 0; // No permutation - int nbuf = 0; // Buffer size (output) - int usecollective = 0; // Use point-to-point communication + int scaled = 0; // No scaling + int permute = 0; // No permutation + int nbuf = 0; // Buffer size (output) + int usecollective = 0; // Use point-to-point communication + int usenonblocking = 0; // Use blocking point-to-point communication // Create MPI-aware FFT3d object BEGIN_HIDE_OUTPUT(); fft = new FFT3d(lmp, MPI_COMM_WORLD, nfast, nmid, nslow, in_ilo, in_ihi, in_jlo, in_jhi, in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute, - &nbuf, usecollective); + &nbuf, usecollective, usenonblocking); END_HIDE_OUTPUT(); ASSERT_NE(fft, nullptr); diff --git a/unittest/utils/test_fft3d_kokkos.cpp b/unittest/utils/test_fft3d_kokkos.cpp index a1b41b669f4..0cd5b723c9a 100644 --- a/unittest/utils/test_fft3d_kokkos.cpp +++ b/unittest/utils/test_fft3d_kokkos.cpp @@ -140,18 +140,19 @@ class FFT3DKokkosTest : public LAMMPSTest { int out_klo = 0, out_khi = nslow - 1; // FFT parameters - int scaled = 0; // No scaling - int permute = 0; // No permutation - int nbuf = 0; // Buffer size (output) - int usecollective = 0; // Use point-to-point communication - int usegpu = 0; // Let KOKKOS decide based on backend + int scaled = 0; // No scaling + int permute = 0; // No permutation + int nbuf = 0; // Buffer size (output) + int usecollective = 0; // Use point-to-point communication + int usenonblocking = 0; // Use blocking point-to-point communication + int usegpu = 0; // Let KOKKOS decide based on backend // Create FFT3dKokkos object BEGIN_HIDE_OUTPUT(); fft = new FFT3dKokkos(lmp, MPI_COMM_WORLD, nfast, nmid, nslow, in_ilo, in_ihi, in_jlo, in_jhi, in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute, &nbuf, - usecollective, usegpu); + usecollective, usenonblocking, usegpu); END_HIDE_OUTPUT(); ASSERT_NE(fft, nullptr); @@ -555,13 +556,13 @@ TEST_F(FFT3DKokkosTest, Threading_OpenMP_Concurrent) int out_jlo = in_jlo, out_jhi = in_jhi; int out_klo = in_klo, out_khi = in_khi; int scaled = 0, permute = 0, nbuf = 0; - int usecollective = 0, usegpu_aware = 0; + int usecollective = 0, usegpu_aware = 0, usenonblocking = 0; BEGIN_HIDE_OUTPUT(); auto fft = new FFT3dKokkos( lmp, MPI_COMM_WORLD, grid_size, grid_size, grid_size, in_ilo, in_ihi, in_jlo, in_jhi, in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute, - &nbuf, usecollective, usegpu_aware); + &nbuf, usecollective, usenonblocking, usegpu_aware); END_HIDE_OUTPUT(); ffts.push_back(fft); @@ -657,13 +658,13 @@ TEST_F(FFT3DKokkosTest, Threading_Threads_Concurrent) int out_jlo = in_jlo, out_jhi = in_jhi; int out_klo = in_klo, out_khi = in_khi; int scaled = 0, permute = 0, nbuf = 0; - int usecollective = 0, usegpu_aware = 0; + int usecollective = 0, usegpu_aware = 0, usenonblocking = 0; BEGIN_HIDE_OUTPUT(); auto fft = new FFT3dKokkos( lmp, MPI_COMM_WORLD, grid_size, grid_size, grid_size, in_ilo, in_ihi, in_jlo, in_jhi, in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute, - &nbuf, usecollective, usegpu_aware); + &nbuf, usecollective, usenonblocking, usegpu_aware); END_HIDE_OUTPUT(); ffts.push_back(fft); @@ -730,13 +731,13 @@ TEST_F(FFT3DKokkosTest, Threading_Safety) int out_jlo = in_jlo, out_jhi = in_jhi; int out_klo = in_klo, out_khi = in_khi; int scaled = 0, permute = 0, nbuf = 0; - int usecollective = 0, usegpu_aware = 0; + int usecollective = 0, usegpu_aware = 0, usenonblocking = 0; BEGIN_HIDE_OUTPUT(); auto fft_device = new FFT3dKokkos( lmp, MPI_COMM_WORLD, grid_size, grid_size, grid_size, in_ilo, in_ihi, in_jlo, in_jhi, in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute, - &nbuf, usecollective, usegpu_aware); + &nbuf, usecollective, usenonblocking, usegpu_aware); END_HIDE_OUTPUT(); ASSERT_NE(fft_device, nullptr); @@ -880,14 +881,14 @@ TEST_F(FFT3DKokkosTest, RoundTrip_Kokkos_MPI_2proc_32x32x32) // FFT parameters int scaled = 0, permute = 0, nbuf = 0; - int usecollective = 0, usegpu_aware = 0; + int usecollective = 0, usegpu_aware = 0, usenonblocking = 0; // Create MPI-aware FFT3dKokkos object BEGIN_HIDE_OUTPUT(); auto fft_mpi = new FFT3dKokkos( lmp, MPI_COMM_WORLD, grid_size, grid_size, grid_size, in_ilo, in_ihi, in_jlo, in_jhi, in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute, - &nbuf, usecollective, usegpu_aware); + &nbuf, usecollective, usenonblocking, usegpu_aware); END_HIDE_OUTPUT(); ASSERT_NE(fft_mpi, nullptr); @@ -1012,14 +1013,14 @@ TEST_F(FFT3DKokkosTest, RoundTrip_Kokkos_MPI_4proc_64x64x64) // FFT parameters int scaled = 0, permute = 0, nbuf = 0; - int usecollective = 0, usegpu_aware = 0; + int usecollective = 0, usegpu_aware = 0, usenonblocking = 0; // Create MPI-aware FFT3dKokkos object BEGIN_HIDE_OUTPUT(); auto fft_mpi = new FFT3dKokkos( lmp, MPI_COMM_WORLD, grid_size, grid_size, grid_size, in_ilo, in_ihi, in_jlo, in_jhi, in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute, - &nbuf, usecollective, usegpu_aware); + &nbuf, usecollective, usenonblocking, usegpu_aware); END_HIDE_OUTPUT(); ASSERT_NE(fft_mpi, nullptr); @@ -1146,6 +1147,7 @@ TEST_F(FFT3DKokkosTest, RoundTrip_Kokkos_MPI_GPU_2proc) // FFT parameters (disable GPU-aware MPI for now) int scaled = 0, permute = 0, nbuf = 0; int usecollective = 0; + int usenonblocking = 0; int usegpu_aware = 0; // Would check lmp->kokkos->gpu_aware_flag if KokkosLMP was complete // Create MPI+GPU FFT3dKokkos object @@ -1153,7 +1155,7 @@ TEST_F(FFT3DKokkosTest, RoundTrip_Kokkos_MPI_GPU_2proc) auto fft_mpi = new FFT3dKokkos( lmp, MPI_COMM_WORLD, grid_size, grid_size, grid_size, in_ilo, in_ihi, in_jlo, in_jhi, in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute, - &nbuf, usecollective, usegpu_aware); + &nbuf, usecollective, usenonblocking, usegpu_aware); END_HIDE_OUTPUT(); ASSERT_NE(fft_mpi, nullptr); From d145b1d6f2a4656d998f08380d7615818046d54c Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Wed, 3 Dec 2025 09:31:52 -0500 Subject: [PATCH 08/17] Fix bug in Kokkos Remap, where conditionals were ordered incorrectly --- src/KOKKOS/remap_kokkos.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index f9ab0f7f475..88f17eb40bd 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -210,15 +210,15 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d int numpacked = 0; for (isend = 0; isend < plan->commringlen; isend++) { - if (plan->sendcnts[isend]) { + if (isend == plan->selfcommringloc && plan->self) { + numpacked++; + } + else if (plan->sendcnts[isend]) { plan->pack(d_in,plan->send_offset[numpacked], plan->d_sendbuf,plan->sdispls[isend], &plan->packplan[numpacked]); numpacked++; } - else if (plan->commringlist[isend] == me && plan->self) { - numpacked++; - } } if (!plan->usegpu_aware) @@ -246,15 +246,15 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d numpacked = 0; for (irecv = 0; irecv < plan->commringlen; irecv++) { - if (plan->rcvcnts[irecv]) { + if (irecv == plan->selfcommringloc && plan->self) { + numpacked++; + } + else if (plan->rcvcnts[irecv]) { plan->unpack(d_scratch,plan->rdispls[irecv], d_out,plan->recv_offset[numpacked], &plan->unpackplan[numpacked]); numpacked++; } - else if (plan->commringlist[irecv] == me && plan->self) { - numpacked++; - } } } } From 9e348d5fd292b7069e762c63cc271c8c133e0d35 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Wed, 3 Dec 2025 09:35:41 -0500 Subject: [PATCH 09/17] Replace Allreduce-based collective commringlist building with C++ set-based --- src/KSPACE/remap.cpp | 106 ++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 57 deletions(-) diff --git a/src/KSPACE/remap.cpp b/src/KSPACE/remap.cpp index 297705217e8..73db03cde83 100644 --- a/src/KSPACE/remap.cpp +++ b/src/KSPACE/remap.cpp @@ -15,6 +15,7 @@ #include "remap.h" #include +#include #define PACK_DATA FFT_SCALAR @@ -149,15 +150,15 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, int numpacked = 0; for (isend = 0; isend < plan->commringlen; isend++) { - if (plan->sendcnts[isend]) { + if (isend == plan->selfcommringloc && plan->self) { + numpacked++; + } + else if (plan->sendcnts[isend]) { plan->pack(&in[plan->send_offset[numpacked]], &plan->sendbuf[plan->sdispls[isend]], &plan->packplan[numpacked]); numpacked++; } - else if (plan->commringlist[isend] == me && plan->self) { - numpacked++; - } } MPI_Alltoallv(plan->sendbuf, plan->sendcnts, plan->sdispls, @@ -179,15 +180,15 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, numpacked = 0; for (irecv = 0; irecv < plan->commringlen; irecv++) { - if (plan->rcvcnts[irecv]) { + if (irecv == plan->selfcommringloc && plan->self) { + numpacked++; + } + else if (plan->rcvcnts[irecv]) { plan->unpack(&scratch[plan->rdispls[irecv]], &out[plan->recv_offset[numpacked]], &plan->unpackplan[numpacked]); numpacked++; } - else if (plan->commringlist[irecv] == me && plan->self) { - numpacked++; - } } } } @@ -488,71 +489,62 @@ struct remap_plan_3d *remap_3d_create_plan( MPI_Comm_dup(comm,&plan->comm); } else { - // Improved approach - use an AllReduce to aggregate which ranks need to be included - // To do this, we build the local proc's send/receive list, then do an AllReduce - // to create the send/recv count for the Alltoallv - - // local arrays to be used in the allreduce - // start with max length -- nprocs. Unused entries will be removed later - - int *local_cnts = (int*) malloc(2*nprocs*sizeof(int)); - if (local_cnts == nullptr) return nullptr; - int *local_sendcnts = local_cnts; - int *local_recvcnts = (local_cnts + nprocs); - - // local arrays used to store the results of the allreduce - - int *global_cnts = (int*) malloc(2*nprocs*sizeof(int)); - if (global_cnts == nullptr) return nullptr; - int *global_sendcnts = global_cnts; - int *global_recvcnts = (global_cnts + nprocs); - - // count send & recv collides, including self + int *commringlist; + int commringlen = 0; + // use a C++ set to organize the commringlist (C++17) + std::set commringset; nsend = 0; nrecv = 0; for (i = 0; i < nprocs; i++) { - local_sendcnts[i] = remap_3d_collide(&in,&outarray[i],&overlap); - local_recvcnts[i] = remap_3d_collide(&out,&inarray[i],&overlap); - nsend += local_sendcnts[i]; - nrecv += local_recvcnts[i]; + if (remap_3d_collide(&in,&outarray[i],&overlap)) { + commringset.insert(i); + nsend++; + } + if (remap_3d_collide(&out,&inarray[i],&overlap)) { + commringset.insert(i); + nrecv++; + } } - // perform an AllReduce to get the counts from all other processors and build sendcnts list - - MPI_Allreduce(local_cnts, global_cnts, 2*nprocs, MPI_INT, MPI_SUM, comm); - - // now remove procs that are 0 in send or recv to create minimized sendcnts/recvcnts for AlltoAllv - // also builds commringlist -- which is already sorted - - int *commringlist = (int*) malloc(nprocs * sizeof(int)); - int commringlen = 0; - - for (i = 0; i < nprocs; i++) { - if (global_sendcnts[i] > 0 || global_recvcnts[i] > 0) { - commringlist[commringlen] = i; - commringlen++; + int commringappend = 1; + while (commringappend) { + commringappend = 0; + for (int setproci : commringset) { + for (j = 0; j < nprocs; j++) { + // short-circuit if already in commring + if (commringset.find(j) != commringset.end()) + continue; + if (remap_3d_collide(&inarray[setproci],&outarray[j],&overlap)) { + auto set_insert_result = commringset.insert(j); + if (set_insert_result.second) { + commringappend++; + } + } + if (remap_3d_collide(&outarray[setproci],&inarray[j],&overlap)) { + auto set_insert_result = commringset.insert(j); + if (set_insert_result.second) { + commringappend++; + } + } + } } } - // resize commringlist to final size + // build already-sorted commringlist as an array + commringlist = (int*) malloc(commringset.size() * sizeof(int)); + commringlen = 0; - commringlist = (int *) realloc(commringlist, commringlen*sizeof(int)); + for (int setproci : commringset) { + commringlist[commringlen] = setproci; + commringlen++; + } // set the plan->commringlist plan->commringlen = commringlen; plan->commringlist = commringlist; - // clean up local buffers that are finished - - local_sendcnts = nullptr; - local_recvcnts = nullptr; - global_recvcnts = nullptr; - global_sendcnts = nullptr; - free(local_cnts); - free(global_cnts); - // malloc space for send & recv info // if the current proc is involved in any way in the communication, allocate space // because of the Alltoallv, both send and recv have to be initialized even if From 1eb68101262a0d75f839f20a228c36e529b126d8 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Wed, 3 Dec 2025 10:26:04 -0500 Subject: [PATCH 10/17] Update remap in Kokkos KSPACE to use set instead of MPI_Allreduce --- src/KOKKOS/remap_kokkos.cpp | 90 +++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 48 deletions(-) diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index 88f17eb40bd..63199e44f00 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -12,6 +12,8 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ +#include + #include "remap_kokkos.h" #include "error.h" @@ -552,71 +554,63 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat MPI_Comm_dup(comm,&plan->comm); } else { - // Improved approach - use an AllReduce to aggregate which ranks need to be included - // To do this, we build the local proc's send/receive list, then do an AllReduce - // to create the send/recv count for the Alltoallv - - // local arrays to be used in the allreduce - // start with max length -- nprocs. Unused entries will be removed later - - int *local_cnts = (int*) malloc(2*nprocs*sizeof(int)); - if (local_cnts == nullptr) return nullptr; - int *local_sendcnts = local_cnts; - int *local_recvcnts = (local_cnts + nprocs); - // local arrays used to store the results of the allreduce - - int *global_cnts = (int*) malloc(2*nprocs*sizeof(int)); - if (global_cnts == nullptr) return nullptr; - int *global_sendcnts = global_cnts; - int *global_recvcnts = (global_cnts + nprocs); - - // count send & recv collides, including self + int *commringlist; + int commringlen = 0; + // use a C++ set to organize the commringlist (C++17) + std::set commringset; nsend = 0; nrecv = 0; for (i = 0; i < nprocs; i++) { - local_sendcnts[i] = remap_3d_collide(&in,&outarray[i],&overlap); - local_recvcnts[i] = remap_3d_collide(&out,&inarray[i],&overlap); - nsend += local_sendcnts[i]; - nrecv += local_recvcnts[i]; + if (remap_3d_collide(&in,&outarray[i],&overlap)) { + commringset.insert(i); + nsend++; + } + if (remap_3d_collide(&out,&inarray[i],&overlap)) { + commringset.insert(i); + nrecv++; + } } - // perform an AllReduce to get the counts from all other processors and build sendcnts list - - MPI_Allreduce(local_cnts, global_cnts, 2*nprocs, MPI_INT, MPI_SUM, comm); - - // now remove procs that are 0 in send or recv to create minimized sendcnts/recvcnts for AlltoAllv - // also builds commringlist -- which is already sorted - - int *commringlist = (int*) malloc(nprocs * sizeof(int)); - int commringlen = 0; - - for (i = 0; i < nprocs; i++) { - if (global_sendcnts[i] > 0 || global_recvcnts[i] > 0) { - commringlist[commringlen] = i; - commringlen++; + int commringappend = 1; + while (commringappend) { + commringappend = 0; + for (int setproci : commringset) { + for (int j = 0; j < nprocs; j++) { + // short-circuit if already in commring + if (commringset.find(j) != commringset.end()) + continue; + if (remap_3d_collide(&inarray[setproci],&outarray[j],&overlap)) { + auto set_insert_result = commringset.insert(j); + if (set_insert_result.second) { + commringappend++; + } + } + if (remap_3d_collide(&outarray[setproci],&inarray[j],&overlap)) { + auto set_insert_result = commringset.insert(j); + if (set_insert_result.second) { + commringappend++; + } + } + } } } - // resize commringlist to final size + // build already-sorted commringlist as an array + commringlist = (int*) malloc(commringset.size() * sizeof(int)); + commringlen = 0; - commringlist = (int *) realloc(commringlist, commringlen*sizeof(int)); + for (int setproci : commringset) { + commringlist[commringlen] = setproci; + commringlen++; + } // set the plan->commringlist plan->commringlen = commringlen; plan->commringlist = commringlist; - // clean up local buffers that are finished - - local_sendcnts = nullptr; - local_recvcnts = nullptr; - global_recvcnts = nullptr; - global_sendcnts = nullptr; - free(local_cnts); - free(global_cnts); - // malloc space for send & recv info // if the current proc is involved in any way in the communication, allocate space // because of the Alltoallv, both send and recv have to be initialized even if From 1ea586c74e87aaa375027860f17a80e1c3ffee04 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Wed, 3 Dec 2025 10:37:12 -0500 Subject: [PATCH 11/17] Added docs for nonblocking keyword for kspace_modify --- doc/src/kspace_modify.rst | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/src/kspace_modify.rst b/doc/src/kspace_modify.rst index b300213a3b1..3a0d3a99420 100644 --- a/doc/src/kspace_modify.rst +++ b/doc/src/kspace_modify.rst @@ -11,11 +11,12 @@ Syntax kspace_modify keyword value ... * one or more keyword/value pairs may be listed -* keyword = *collective* or *compute* or *cutoff/adjust* or *diff* or *disp/auto* or *fftbench* or *force/disp/kspace* or *force/disp/real* or *force* or *gewald/disp* or *gewald* or *kmax/ewald* or *mesh* or *minorder* or *mix/disp* or *order/disp* or *order* or *overlap* or *scafacos* or *slab* or *splittol* or *wire* +* keyword = *collective* or *nonblocking* or *compute* or *cutoff/adjust* or *diff* or *disp/auto* or *fftbench* or *force/disp/kspace* or *force/disp/real* or *force* or *gewald/disp* or *gewald* or *kmax/ewald* or *mesh* or *minorder* or *mix/disp* or *order/disp* or *order* or *overlap* or *scafacos* or *slab* or *splittol* or *wire* .. parsed-literal:: *collective* value = *yes* or *no* + *nonblocking* value = *yes* or *no* *compute* value = *yes* or *no* *cutoff/adjust* value = *yes* or *no* *diff* value = *ad* or *ik* = 2 or 4 FFTs for PPPM in smoothed or non-smoothed mode @@ -86,6 +87,16 @@ collective operations and adequate hardware. ---------- +The *nonblocking* keyword applies only to PPPM. It is set to *no* by +default. If this option is set to *yes*, LAMMPS will use non-blocking +point-to-point MPI operations to remap data for 3d-FFT operations +instead of the default blocking point-to-point communication. This +allows for better utilization of full network bandwidth by overlapping +communication to multiple other ranks at the same time, as well as +overlapping receiving/unpacking data and sending data. + +---------- + The *compute* keyword allows Kspace computations to be turned off, even though a :doc:`kspace_style ` is defined. This is not useful for running a real simulation, but can be useful for From 0df30a38e9599c96352a6b8ec61939ca60261963 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Wed, 3 Dec 2025 10:38:48 -0500 Subject: [PATCH 12/17] Added defaults and restriction for collective/nonblocking keywords --- doc/src/kspace_modify.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/src/kspace_modify.rst b/doc/src/kspace_modify.rst index 3a0d3a99420..efdf0605bba 100644 --- a/doc/src/kspace_modify.rst +++ b/doc/src/kspace_modify.rst @@ -451,7 +451,8 @@ parameters, see the :doc:`Howto dispersion ` doc page. Restrictions """""""""""" -none +The *collective* and *nonblocking* keywords are mutually exclusive and +cannot be enabled at the same time. Related commands """""""""""""""" @@ -463,6 +464,8 @@ Default The option defaults are as follows: +* collective = no +* nonblocking = no * compute = yes * cutoff/adjust = yes (MSM) * diff = ik (PPPM) From 9df185fc149c4ced1e99131c91935ed3e7f004c8 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Tue, 9 Dec 2025 09:54:23 -0500 Subject: [PATCH 13/17] Removed plan->self usage from collectives, since it's faster for self to be included in the collective --- src/KOKKOS/remap_kokkos.cpp | 44 +++---------------------------------- src/KOKKOS/remap_kokkos.h | 3 --- src/KSPACE/remap.cpp | 43 +++--------------------------------- src/KSPACE/remap.h | 3 --- 4 files changed, 6 insertions(+), 87 deletions(-) diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index 63199e44f00..ebdae5d9548 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -212,10 +212,7 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d int numpacked = 0; for (isend = 0; isend < plan->commringlen; isend++) { - if (isend == plan->selfcommringloc && plan->self) { - numpacked++; - } - else if (plan->sendcnts[isend]) { + if (plan->sendcnts[isend]) { plan->pack(d_in,plan->send_offset[numpacked], plan->d_sendbuf,plan->sdispls[isend], &plan->packplan[numpacked]); @@ -235,23 +232,9 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d if (!plan->usegpu_aware) Kokkos::deep_copy(d_scratch,plan->h_scratch); - // copy in -> scratch -> out for self data - - if (plan->self) { - plan->pack(d_in,plan->send_offset[plan->selfnsendloc], - plan->d_sendbuf,plan->sdispls[plan->selfcommringloc], - &plan->packplan[plan->selfnsendloc]); - plan->unpack(plan->d_sendbuf,plan->sdispls[plan->selfcommringloc], - d_out,plan->recv_offset[plan->selfnrecvloc], - &plan->unpackplan[plan->selfnrecvloc]); - } - numpacked = 0; for (irecv = 0; irecv < plan->commringlen; irecv++) { - if (irecv == plan->selfcommringloc && plan->self) { - numpacked++; - } - else if (plan->rcvcnts[irecv]) { + if (plan->rcvcnts[irecv]) { plan->unpack(d_scratch,plan->rdispls[irecv], d_out,plan->recv_offset[numpacked], &plan->unpackplan[numpacked]); @@ -620,10 +603,6 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat // send space - plan->selfcommringloc = -1; - plan->selfnsendloc = -1; - plan->selfnrecvloc = -1; - plan->nsend = nsend; plan->pack = PackKokkos::pack_3d; @@ -688,10 +667,6 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat int total_send_size = 0; for (i = 0; i < plan->commringlen; i++) { iproc = plan->commringlist[i]; - if (iproc == me) { - plan->selfcommringloc = i; - plan->selfnsendloc = nsend; - } if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { //plan->send_proc[nsend] = i; // number of entries required for this pack's 3-d coords @@ -730,11 +705,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat for (i = 0; i < plan->commringlen; i++) { iproc = plan->commringlist[i]; - if (iproc == me) { - plan->selfnrecvloc = nrecv; - } if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { - if (permute == 0) { plan->recv_offset[nrecv] = nqty * ((overlap.klo-out.klo)*out.jsize*out.isize + @@ -784,16 +755,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat // init remaining fields in remap plan plan->memory = memory; - - if (plan->sendcnts[plan->selfcommringloc]) { - plan->self = 1; - plan->sendcnts[plan->selfcommringloc] = 0; - plan->rcvcnts[plan->selfcommringloc] = 0; - } - else { - plan->self = 0; - } - + plan->self = 0; // if requested, allocate internal scratch space for recvs, // only need it if I will receive any data (including self) diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h index a6a15bbf098..6f2f4b4344e 100644 --- a/src/KOKKOS/remap_kokkos.h +++ b/src/KOKKOS/remap_kokkos.h @@ -65,9 +65,6 @@ struct remap_plan_3d_kokkos { int *rcvcnts; // # of elements in recv buffer for each rank int *sdispls; // extraction location in send buffer for each rank int *rdispls; // extraction location in recv buffer for each rank - int selfcommringloc; // current proc's location in commringlist - int selfnsendloc; // current proc's location in send lists - int selfnrecvloc; // current proc's location in recv lists }; template diff --git a/src/KSPACE/remap.cpp b/src/KSPACE/remap.cpp index 73db03cde83..3b925126168 100644 --- a/src/KSPACE/remap.cpp +++ b/src/KSPACE/remap.cpp @@ -150,10 +150,7 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, int numpacked = 0; for (isend = 0; isend < plan->commringlen; isend++) { - if (isend == plan->selfcommringloc && plan->self) { - numpacked++; - } - else if (plan->sendcnts[isend]) { + if (plan->sendcnts[isend]) { plan->pack(&in[plan->send_offset[numpacked]], &plan->sendbuf[plan->sdispls[isend]], &plan->packplan[numpacked]); @@ -165,25 +162,11 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, MPI_FFT_SCALAR, scratch, plan->rcvcnts, plan->rdispls, MPI_FFT_SCALAR, plan->comm); - // copy in -> scratch -> out for self data - - if (plan->self) { - plan->pack(&in[plan->send_offset[plan->selfnsendloc]], - &plan->sendbuf[plan->sdispls[plan->selfcommringloc]], - &plan->packplan[plan->selfnsendloc]); - plan->unpack(&plan->sendbuf[plan->sdispls[plan->selfcommringloc]], - &out[plan->recv_offset[plan->selfnrecvloc]], - &plan->unpackplan[plan->selfnrecvloc]); - } - // unpack the data from the recv buffer into out numpacked = 0; for (irecv = 0; irecv < plan->commringlen; irecv++) { - if (irecv == plan->selfcommringloc && plan->self) { - numpacked++; - } - else if (plan->rcvcnts[irecv]) { + if (plan->rcvcnts[irecv]) { plan->unpack(&scratch[plan->rdispls[irecv]], &out[plan->recv_offset[numpacked]], &plan->unpackplan[numpacked]); @@ -554,10 +537,6 @@ struct remap_plan_3d *remap_3d_create_plan( // send space - plan->selfcommringloc = -1; - plan->selfnsendloc = -1; - plan->selfnrecvloc = -1; - plan->nsend = nsend; plan->pack = pack_3d; @@ -622,10 +601,6 @@ struct remap_plan_3d *remap_3d_create_plan( int total_send_size = 0; for (i = 0; i < plan->commringlen; i++) { iproc = plan->commringlist[i]; - if (iproc == me) { - plan->selfcommringloc = i; - plan->selfnsendloc = nsend; - } if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { // number of entries required for this pack's 3-d coords plan->send_offset[nsend] = nqty * @@ -663,11 +638,7 @@ struct remap_plan_3d *remap_3d_create_plan( for (i = 0; i < plan->commringlen; i++) { iproc = plan->commringlist[i]; - if (iproc == me) { - plan->selfnrecvloc = nrecv; - } if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { - if (permute == 0) { plan->recv_offset[nrecv] = nqty * ((overlap.klo-out.klo)*out.jsize*out.isize + @@ -717,15 +688,7 @@ struct remap_plan_3d *remap_3d_create_plan( // init remaining fields in remap plan plan->memory = memory; - - if (plan->sendcnts[plan->selfcommringloc]) { - plan->self = 1; - plan->sendcnts[plan->selfcommringloc] = 0; - plan->rcvcnts[plan->selfcommringloc] = 0; - } - else { - plan->self = 0; - } + plan->self = 0; // if requested, allocate internal scratch space for recvs, // only need it if I will receive any data (including self) diff --git a/src/KSPACE/remap.h b/src/KSPACE/remap.h index c171bbe5a3e..057c8e8824c 100644 --- a/src/KSPACE/remap.h +++ b/src/KSPACE/remap.h @@ -49,9 +49,6 @@ struct remap_plan_3d { int *rcvcnts; // # of elements in recv buffer for each rank int *sdispls; // extraction location in send buffer for each rank int *rdispls; // extraction location in recv buffer for each rank - int selfcommringloc; // current proc's location in commringlist - int selfnsendloc; // current proc's location in send lists - int selfnrecvloc; // current proc's location in recv lists }; // collision between 2 regions From f89472bc886379e7b1bb8dbed8868270949b40ed Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 10 Dec 2025 09:20:31 -0700 Subject: [PATCH 14/17] Rename flag --- src/KOKKOS/fft3d_kokkos.cpp | 16 ++++++++-------- src/KOKKOS/pppm_kokkos.cpp | 8 ++++---- src/KOKKOS/remap_kokkos.cpp | 22 +++++++++++----------- src/KOKKOS/remap_kokkos.h | 4 ++-- src/KSPACE/fft3d.cpp | 8 ++++---- src/KSPACE/fft3d_wrap.cpp | 4 ++-- src/KSPACE/pppm.cpp | 6 +++--- src/KSPACE/pppm_dipole.cpp | 6 +++--- src/KSPACE/pppm_disp.cpp | 12 ++++++------ src/KSPACE/remap.cpp | 18 +++++++++--------- src/KSPACE/remap.h | 4 ++-- src/KSPACE/remap_wrap.cpp | 4 ++-- src/kspace.cpp | 6 +++--- src/kspace.h | 2 +- 14 files changed, 60 insertions(+), 60 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 99642e27f47..df1380c8853 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -35,7 +35,7 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int scaled, int permute, int *nbuf, int usecollective, - int useisend, int usegpu_aware) : + int usenonblocking, int usegpu_aware) : Pointers(lmp) { int nthreads = lmp->kokkos->nthreads; @@ -81,7 +81,7 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, - scaled,permute,nbuf,usecollective,useisend,nthreads,usegpu_aware); + scaled,permute,nbuf,usecollective,usenonblocking,nthreads,usegpu_aware); if (plan == nullptr) error->one(FLERR,"Could not create 3d FFT plan"); } @@ -400,7 +400,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in, 2 = permute twice = slow->fast, fast->mid, mid->slow nbuf returns size of internal storage buffers used by FFT usecollective use collective MPI operations for remapping data - useisend when using point-to-point MPI, use MPI_Isend + usenonblocking when using point-to-point MPI, use MPI_Isend usegpu_aware use GPU-Aware MPI or not ------------------------------------------------------------------------- */ @@ -412,7 +412,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int scaled, int permute, int *nbuf, int usecollective, - int useisend, int nthreads, int usegpu_aware) + int usenonblocking, int nthreads, int usegpu_aware) { struct fft_plan_3d_kokkos *plan; int me,nprocs; @@ -469,7 +469,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, first_ilo,first_ihi,first_jlo,first_jhi, first_klo,first_khi,2,0,0,FFT_PRECISION, - usecollective,useisend,usegpu_aware); + usecollective,usenonblocking,usegpu_aware); if (plan->pre_plan == nullptr) return nullptr; } @@ -494,7 +494,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl first_klo,first_khi, second_ilo,second_ihi,second_jlo,second_jhi, second_klo,second_khi,2,1,0,FFT_PRECISION, - usecollective,useisend,usegpu_aware); + usecollective,usenonblocking,usegpu_aware); if (plan->mid1_plan == nullptr) return nullptr; // 1d FFTs along mid axis @@ -535,7 +535,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl second_ilo,second_ihi, third_jlo,third_jhi,third_klo,third_khi, third_ilo,third_ihi,2,1,0,FFT_PRECISION, - usecollective,useisend,usegpu_aware); + usecollective,usenonblocking,usegpu_aware); if (plan->mid2_plan == nullptr) return nullptr; // 1d FFTs along slow axis @@ -563,7 +563,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl third_jlo,third_jhi, out_klo,out_khi,out_ilo,out_ihi, out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION, - usecollective,useisend,usegpu_aware); + usecollective,usenonblocking,usegpu_aware); if (plan->post_plan == nullptr) return nullptr; } diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index 6ed541738e1..df00378eac1 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -833,23 +833,23 @@ void PPPMKokkos::allocate() // remap takes data from 3d brick to FFT decomposition int collective_flag = force->kspace->collective_flag; - int isend_flag = force->kspace->isend_flag; + int nonblocking_flag = force->kspace->nonblocking_flag; int gpu_aware_flag = lmp->kokkos->gpu_aware_flag; int tmp; fft1 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag,isend_flag,gpu_aware_flag); + 0,0,&tmp,collective_flag,nonblocking_flag,gpu_aware_flag); fft2 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag,isend_flag,gpu_aware_flag); + 0,0,&tmp,collective_flag,nonblocking_flag,gpu_aware_flag); remap = new RemapKokkos(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION,collective_flag,isend_flag,gpu_aware_flag); + 1,0,0,FFT_PRECISION,collective_flag,nonblocking_flag,gpu_aware_flag); } /* ---------------------------------------------------------------------- diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index ebdae5d9548..9530dc626b9 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -37,13 +37,13 @@ RemapKokkos::RemapKokkos(LAMMPS *lmp, MPI_Comm comm, int out_klo, int out_khi, int nqty, int permute, int memory, int precision, int usecollective, - int useisend, int usegpu_aware) : Pointers(lmp) + int usenonblocking, int usegpu_aware) : Pointers(lmp) { plan = remap_3d_create_plan_kokkos(comm, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, nqty,permute,memory,precision,usecollective, - useisend, usegpu_aware); + usenonblocking, usegpu_aware); if (plan == nullptr) error->one(FLERR,"Could not create 3d remap plan"); } @@ -144,7 +144,7 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d for (isend = 0; isend < plan->nsend; isend++) { int in_offset = plan->send_offset[isend]; - if (plan->useisend) { + if (plan->usenonblocking) { plan->pack(d_in,in_offset, plan->d_sendbuf,plan->send_bufloc[isend], &plan->packplan[isend]); @@ -157,7 +157,7 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d if (!plan->usegpu_aware) Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf); - if (plan->useisend) { + if (plan->usenonblocking) { MPI_Isend(v_sendbuf + plan->send_bufloc[isend],plan->send_size[isend],MPI_FFT_SCALAR, plan->send_proc[isend],0,plan->comm,&plan->isend_reqs[isend]); } else { @@ -198,7 +198,7 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d d_out,out_offset,&plan->unpackplan[irecv]); } - if (plan->useisend) { + if (plan->usenonblocking) { // finally, wait for all Isends to be done MPI_Waitall(plan->nsend,plan->isend_reqs,MPI_STATUS_IGNORE); } @@ -268,7 +268,7 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d 1 = single precision (4 bytes per datum) 2 = double precision (8 bytes per datum) usecollective whether to use collective MPI or point-to-point - useisend when using point-to-point MPI, use non-blocking MPI_Isend + usenonblocking when using point-to-point MPI, use non-blocking MPI_Isend usegpu_aware whether to use GPU-Aware MPI or not ------------------------------------------------------------------------- */ @@ -280,7 +280,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int nqty, int permute, int memory, int /*precision*/, - int usecollective, int useisend, int usegpu_aware) + int usecollective, int usenonblocking, int usegpu_aware) { struct remap_plan_3d_kokkos *plan; @@ -298,7 +298,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan = new struct remap_plan_3d_kokkos; if (plan == nullptr) return nullptr; plan->usecollective = usecollective; - plan->useisend = useisend; + plan->usenonblocking = usenonblocking; plan->usegpu_aware = usegpu_aware; // store parameters in local data structs @@ -363,7 +363,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan->packplan = (struct pack_plan_3d *) malloc(nsend*sizeof(struct pack_plan_3d)); - if (plan->useisend) + if (plan->usenonblocking) plan->isend_reqs = (MPI_Request *) malloc(nsend*sizeof(MPI_Request)); plan->send_bufloc = (int *) malloc(nsend*sizeof(int)); if (plan->send_bufloc == nullptr) return nullptr; @@ -509,7 +509,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat // find biggest send message (not including self) and malloc space for it size = 0; - if (plan->useisend) { + if (plan->usenonblocking) { for (nsend = 0; nsend < plan->nsend; nsend++) size += plan->send_size[nsend]; } else { @@ -839,7 +839,7 @@ void RemapKokkos::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_ free(plan->send_size); free(plan->send_proc); free(plan->packplan); - if (plan->useisend) { + if (plan->usenonblocking) { free(plan->isend_reqs); free(plan->send_bufloc); } diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h index 6f2f4b4344e..6701f1afea2 100644 --- a/src/KOKKOS/remap_kokkos.h +++ b/src/KOKKOS/remap_kokkos.h @@ -40,7 +40,7 @@ struct remap_plan_3d_kokkos { int *send_offset; // extraction loc for each send int *send_size; // size of each send message int *send_proc; // proc to send each message to - int *send_bufloc; // if useisend, offset in send buf for each isend + int *send_bufloc; // if usenonblocking, offset in send buf for each isend struct pack_plan_3d *packplan; // pack plan for each send message int *recv_offset; // insertion loc for each recv int *recv_size; // size of each recv message @@ -56,7 +56,7 @@ struct remap_plan_3d_kokkos { int memory; // user provides scratch space or not MPI_Comm comm; // group of procs performing remap int usecollective; // use collective or point-to-point MPI - int useisend; // if using point-to-point MPI, use MPI_Isend + int usenonblocking; // if using point-to-point MPI, use MPI_Isend int usegpu_aware; // use GPU-Aware MPI or not // variables for collective MPI only int commringlen; // length of commringlist diff --git a/src/KSPACE/fft3d.cpp b/src/KSPACE/fft3d.cpp index f066c7db656..e5063616625 100644 --- a/src/KSPACE/fft3d.cpp +++ b/src/KSPACE/fft3d.cpp @@ -241,7 +241,7 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan) 2 = permute twice = slow->fast, fast->mid, mid->slow nbuf returns size of internal storage buffers used by FFT usecollective use collective MPI operations for remapping data - useisend use non-blocking or blocking MPI pt2pt operations for remapping data + usenonblocking use non-blocking or blocking MPI pt2pt operations for remapping data ------------------------------------------------------------------------- */ struct fft_plan_3d *fft_3d_create_plan( @@ -250,7 +250,7 @@ struct fft_plan_3d *fft_3d_create_plan( int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int scaled, int permute, int *nbuf, int usecollective, int useisend) + int scaled, int permute, int *nbuf, int usecollective, int usenonblocking) { struct fft_plan_3d *plan; int me,nprocs; @@ -340,7 +340,7 @@ struct fft_plan_3d *fft_3d_create_plan( plan->mid1_plan = remap_3d_create_plan(comm, first_ilo,first_ihi,first_jlo,first_jhi, first_klo,first_khi,second_ilo,second_ihi, second_jlo,second_jhi,second_klo,second_khi, - 2,1,0,FFT_PRECISION,usecollective,useisend); + 2,1,0,FFT_PRECISION,usecollective,usenonblocking); if (plan->mid1_plan == nullptr) return nullptr; // 1d FFTs along mid axis @@ -381,7 +381,7 @@ struct fft_plan_3d *fft_3d_create_plan( second_jlo,second_jhi,second_klo,second_khi, second_ilo,second_ihi, third_jlo,third_jhi,third_klo,third_khi, - third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective,useisend); + third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective,usenonblocking); if (plan->mid2_plan == nullptr) return nullptr; // 1d FFTs along slow axis diff --git a/src/KSPACE/fft3d_wrap.cpp b/src/KSPACE/fft3d_wrap.cpp index 44d482a721b..0fe364ac25f 100644 --- a/src/KSPACE/fft3d_wrap.cpp +++ b/src/KSPACE/fft3d_wrap.cpp @@ -25,13 +25,13 @@ FFT3d::FFT3d(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow, int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int scaled, int permute, int *nbuf, int usecollective, int useisend) : Pointers(lmp) + int scaled, int permute, int *nbuf, int usecollective, int usenonblocking) : Pointers(lmp) { #ifndef FFT_HEFFTE plan = fft_3d_create_plan(comm,nfast,nmid,nslow, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, - scaled,permute,nbuf,usecollective,useisend); + scaled,permute,nbuf,usecollective,usenonblocking); if (plan == nullptr) error->one(FLERR,"Could not create 3d FFT plan"); #else heffte::plan_options options = heffte::default_options(); diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp index 46232b13533..c829e04117f 100644 --- a/src/KSPACE/pppm.cpp +++ b/src/KSPACE/pppm.cpp @@ -837,17 +837,17 @@ void PPPM::allocate() fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag,isend_flag); + 0,0,&tmp,collective_flag,nonblocking_flag); fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag,isend_flag); + 0,0,&tmp,collective_flag,nonblocking_flag); remap = new Remap(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION,collective_flag,isend_flag); + 1,0,0,FFT_PRECISION,collective_flag,nonblocking_flag); } /* ---------------------------------------------------------------------- diff --git a/src/KSPACE/pppm_dipole.cpp b/src/KSPACE/pppm_dipole.cpp index 193d66f8264..979f14a3659 100644 --- a/src/KSPACE/pppm_dipole.cpp +++ b/src/KSPACE/pppm_dipole.cpp @@ -622,17 +622,17 @@ void PPPMDipole::allocate() fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag,isend_flag); + 0,0,&tmp,collective_flag,nonblocking_flag); fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag,isend_flag); + 0,0,&tmp,collective_flag,nonblocking_flag); remap = new Remap(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION,collective_flag,isend_flag); + 1,0,0,FFT_PRECISION,collective_flag,nonblocking_flag); } /* ---------------------------------------------------------------------- diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index cf33931ac84..ae23ee873cd 100644 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -1771,17 +1771,17 @@ void _noopt PPPMDisp::allocate() fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag,isend_flag); + 0,0,&tmp,collective_flag,nonblocking_flag); fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag,isend_flag); + 0,0,&tmp,collective_flag,nonblocking_flag); remap = new Remap(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION,collective_flag,isend_flag); + 1,0,0,FFT_PRECISION,collective_flag,nonblocking_flag); } // -------------------------------------- @@ -1848,19 +1848,19 @@ void _noopt PPPMDisp::allocate() new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 0,0,&tmp,collective_flag,isend_flag); + 0,0,&tmp,collective_flag,nonblocking_flag); fft2_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - 0,0,&tmp,collective_flag,isend_flag); + 0,0,&tmp,collective_flag,nonblocking_flag); remap_6 = new Remap(lmp,world, nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 1,0,0,FFT_PRECISION,collective_flag,isend_flag); + 1,0,0,FFT_PRECISION,collective_flag,nonblocking_flag); } // -------------------------------------- diff --git a/src/KSPACE/remap.cpp b/src/KSPACE/remap.cpp index 3b925126168..03e467ca9e8 100644 --- a/src/KSPACE/remap.cpp +++ b/src/KSPACE/remap.cpp @@ -88,7 +88,7 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, for (isend = 0; isend < plan->nsend; isend++) { int in_offset = plan->send_offset[isend]; - if (plan->useisend) { + if (plan->usenonblocking) { plan->pack(&in[in_offset], &plan->sendbuf[plan->send_bufloc[isend]], &plan->packplan[isend]); @@ -98,7 +98,7 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, &plan->packplan[isend]); } - if (plan->useisend) { + if (plan->usenonblocking) { MPI_Isend(plan->sendbuf + plan->send_bufloc[isend],plan->send_size[isend],MPI_FFT_SCALAR, plan->send_proc[isend],0,plan->comm,&plan->isend_reqs[isend]); } else { @@ -136,7 +136,7 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, &out[out_offset],&plan->unpackplan[irecv]); } - if (plan->useisend) { + if (plan->usenonblocking) { // finally, wait for all Isends to be done MPI_Waitall(plan->nsend,plan->isend_reqs,MPI_STATUS_IGNORE); } @@ -200,7 +200,7 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, 1 = single precision (4 bytes per datum) 2 = double precision (8 bytes per datum) usecollective whether to use collective MPI or point-to-point - useisend whether to use non-blocking or blocking MPI point-to-point + usenonblocking whether to use non-blocking or blocking MPI point-to-point ------------------------------------------------------------------------- */ struct remap_plan_3d *remap_3d_create_plan( @@ -209,7 +209,7 @@ struct remap_plan_3d *remap_3d_create_plan( int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int nqty, int permute, - int memory, int /*precision*/, int usecollective, int useisend) + int memory, int /*precision*/, int usecollective, int usenonblocking) { @@ -228,7 +228,7 @@ struct remap_plan_3d *remap_3d_create_plan( plan = (struct remap_plan_3d *) malloc(sizeof(struct remap_plan_3d)); if (plan == nullptr) return nullptr; plan->usecollective = usecollective; - plan->useisend = useisend; + plan->usenonblocking = usenonblocking; // store parameters in local data structs @@ -300,7 +300,7 @@ struct remap_plan_3d *remap_3d_create_plan( plan->packplan = (struct pack_plan_3d *) malloc(nsend*sizeof(struct pack_plan_3d)); - if (plan->useisend) + if (plan->usenonblocking) plan->isend_reqs = (MPI_Request *) malloc(nsend*sizeof(MPI_Request)); plan->send_bufloc = (int *) malloc(nsend*sizeof(int)); if (plan->send_bufloc == nullptr) return nullptr; @@ -445,7 +445,7 @@ struct remap_plan_3d *remap_3d_create_plan( // find biggest send message (not including self) and malloc space for it size = 0; - if (plan->useisend) { + if (plan->usenonblocking) { for (nsend = 0; nsend < plan->nsend; nsend++) size += plan->send_size[nsend]; } else { @@ -770,7 +770,7 @@ void remap_3d_destroy_plan(struct remap_plan_3d *plan) free(plan->send_size); free(plan->send_proc); free(plan->packplan); - if (plan->useisend) { + if (plan->usenonblocking) { free(plan->isend_reqs); free(plan->send_bufloc); } diff --git a/src/KSPACE/remap.h b/src/KSPACE/remap.h index 057c8e8824c..8bc73057f0f 100644 --- a/src/KSPACE/remap.h +++ b/src/KSPACE/remap.h @@ -27,7 +27,7 @@ struct remap_plan_3d { int *send_offset; // extraction loc for each send int *send_size; // size of each send message int *send_proc; // proc to send each message to - int *send_bufloc; // if useisend, offset in send buf for each isend + int *send_bufloc; // if usenonblocking, offset in send buf for each isend MPI_Request *isend_reqs; // MPI request for each posted isend struct pack_plan_3d *packplan; // pack plan for each send message int *recv_offset; // insertion loc for each recv @@ -42,7 +42,7 @@ struct remap_plan_3d { int memory; // user provides scratch space or not MPI_Comm comm; // group of procs performing remap int usecollective; // use collective or point-to-point MPI - int useisend; // if using point-to-point MPI, use MPI_Isend + int usenonblocking; // if using point-to-point MPI, use MPI_Isend int commringlen; // length of commringlist int *commringlist; // ranks on communication ring of this plan int *sendcnts; // # of elements in send buffer for each rank diff --git a/src/KSPACE/remap_wrap.cpp b/src/KSPACE/remap_wrap.cpp index 31bf6af910e..40651381142 100644 --- a/src/KSPACE/remap_wrap.cpp +++ b/src/KSPACE/remap_wrap.cpp @@ -26,12 +26,12 @@ Remap::Remap(LAMMPS *lmp, MPI_Comm comm, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int nqty, int permute, int memory, - int precision, int usecollective, int useisend) : Pointers(lmp) + int precision, int usecollective, int usenonblocking) : Pointers(lmp) { plan = remap_3d_create_plan(comm, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, - nqty,permute,memory,precision,usecollective,useisend); + nqty,permute,memory,precision,usecollective,usenonblocking); if (plan == nullptr) error->one(FLERR,"Could not create 3d remap plan"); } diff --git a/src/kspace.cpp b/src/kspace.cpp index 9b4bf478bfb..249e73b7ae8 100644 --- a/src/kspace.cpp +++ b/src/kspace.cpp @@ -60,7 +60,7 @@ KSpace::KSpace(LAMMPS *lmp) : #else collective_flag = 0; #endif - isend_flag = 0; + nonblocking_flag = 0; kewaldflag = 0; @@ -554,7 +554,7 @@ void KSpace::modify_params(int narg, char **arg) iarg += 2; } else if (strcmp(arg[iarg],"nonblocking") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - isend_flag = utils::logical(FLERR,arg[iarg+1],false,lmp); + nonblocking_flag = utils::logical(FLERR,arg[iarg+1],false,lmp); iarg += 2; } else if (strcmp(arg[iarg],"diff") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); @@ -613,7 +613,7 @@ void KSpace::modify_params(int narg, char **arg) iarg += n; } } - if (collective_flag > 0 && isend_flag > 0) error->all(FLERR,"Illegal kspace_modify command, collective and nonblocking cannot both be true."); + if (collective_flag > 0 && nonblocking_flag > 0) error->all(FLERR,"Illegal kspace_modify command, collective and nonblocking cannot both be true."); } /* ---------------------------------------------------------------------- */ diff --git a/src/kspace.h b/src/kspace.h index 9712f91aa32..eff18cf7960 100644 --- a/src/kspace.h +++ b/src/kspace.h @@ -131,7 +131,7 @@ class KSpace : protected Pointers { int compute_flag; // 0 if skip compute() int fftbench; // 0 if skip FFT timing int collective_flag; // 1 if use MPI collectives for FFT/remap - int isend_flag; // 1 if use MPI_Isend for FFT/remap + int nonblocking_flag; // 1 if use MPI_Isend for FFT/remap int stagger_flag; // 1 if using staggered PPPM grids double splittol; // tolerance for when to truncate splitting From b0e61971a9d0a6dd9e8fd26d610dec79f79125fc Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 10 Dec 2025 12:22:07 -0500 Subject: [PATCH 15/17] simplify handling of collective and nonblocking exclusivity --- doc/src/kspace_modify.rst | 5 +++-- src/kspace.cpp | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/src/kspace_modify.rst b/doc/src/kspace_modify.rst index efdf0605bba..4336b557024 100644 --- a/doc/src/kspace_modify.rst +++ b/doc/src/kspace_modify.rst @@ -451,8 +451,9 @@ parameters, see the :doc:`Howto dispersion ` doc page. Restrictions """""""""""" -The *collective* and *nonblocking* keywords are mutually exclusive and -cannot be enabled at the same time. +The *collective* and *nonblocking* keywords cannot both be enabled +at the same time. Whichever of the two keywords is enabled last will +disable the other. Related commands """""""""""""""" diff --git a/src/kspace.cpp b/src/kspace.cpp index 249e73b7ae8..52e7a65980e 100644 --- a/src/kspace.cpp +++ b/src/kspace.cpp @@ -551,10 +551,12 @@ void KSpace::modify_params(int narg, char **arg) } else if (strcmp(arg[iarg],"collective") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); collective_flag = utils::logical(FLERR,arg[iarg+1],false,lmp); + if (collective_flag) nonblocking_flag = 0; iarg += 2; } else if (strcmp(arg[iarg],"nonblocking") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); nonblocking_flag = utils::logical(FLERR,arg[iarg+1],false,lmp); + if (nonblocking_flag) collective_flag = 0; iarg += 2; } else if (strcmp(arg[iarg],"diff") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); @@ -613,7 +615,6 @@ void KSpace::modify_params(int narg, char **arg) iarg += n; } } - if (collective_flag > 0 && nonblocking_flag > 0) error->all(FLERR,"Illegal kspace_modify command, collective and nonblocking cannot both be true."); } /* ---------------------------------------------------------------------- */ From 033af47e95f638ff479020e8ff9b1f8116add973 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 10 Dec 2025 12:44:32 -0500 Subject: [PATCH 16/17] add version tag --- doc/src/kspace_modify.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/src/kspace_modify.rst b/doc/src/kspace_modify.rst index 4336b557024..869fd13b287 100644 --- a/doc/src/kspace_modify.rst +++ b/doc/src/kspace_modify.rst @@ -87,6 +87,8 @@ collective operations and adequate hardware. ---------- +.. versionadded:: 10Dec2025 + The *nonblocking* keyword applies only to PPPM. It is set to *no* by default. If this option is set to *yes*, LAMMPS will use non-blocking point-to-point MPI operations to remap data for 3d-FFT operations From f98de9e93474beddd325ac90b6e617e40c2d68f5 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 10 Dec 2025 12:45:01 -0500 Subject: [PATCH 17/17] avoid memory leaks and correct call to free plan --- src/KSPACE/remap.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/KSPACE/remap.cpp b/src/KSPACE/remap.cpp index 03e467ca9e8..2fb6679b102 100644 --- a/src/KSPACE/remap.cpp +++ b/src/KSPACE/remap.cpp @@ -229,6 +229,8 @@ struct remap_plan_3d *remap_3d_create_plan( if (plan == nullptr) return nullptr; plan->usecollective = usecollective; plan->usenonblocking = usenonblocking; + plan->scratch = nullptr; + plan->sendbuf = nullptr; // store parameters in local data structs @@ -438,8 +440,7 @@ struct remap_plan_3d *remap_3d_create_plan( if (nrecv == plan->nrecv) plan->self = 0; else plan->self = 1; - // the plan->sendbuf and plan->recvbuf are used by both the - // collective & non-collective implementations. + // plan->sendbuf is used by both the collective & non-collective implementations. // For non-collective and blocking, the buffer size is MAX(send_size) for any one send // find biggest send message (not including self) and malloc space for it @@ -471,6 +472,7 @@ struct remap_plan_3d *remap_3d_create_plan( // Non-collectives do not use MPI Communicator Groups MPI_Comm_dup(comm,&plan->comm); + } else { int *commringlist; int commringlen = 0; @@ -763,6 +765,7 @@ void remap_3d_destroy_plan(struct remap_plan_3d *plan) free(plan->unpackplan); } } else { + // free arrays used in pt2pt communication if (plan->nsend || plan->self) { @@ -786,9 +789,14 @@ void remap_3d_destroy_plan(struct remap_plan_3d *plan) } } + // free buffers, if needed + + if (plan->scratch) free(plan->scratch); + if (plan->sendbuf) free(plan->sendbuf); + // free plan itself - delete plan; + free(plan); } /* ----------------------------------------------------------------------