From d1f40e81bab5799ff1b5d2875e121d162a3ad21d Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Fri, 21 Nov 2025 15:50:03 -0500
Subject: [PATCH 01/17] Implemented kspace_modify isend yes for non-blocking
 MPI_Isend communication

---
 src/KOKKOS/fft3d_kokkos.cpp | 15 +++++-----
 src/KOKKOS/fft3d_kokkos.h   |  4 +--
 src/KOKKOS/pppm_kokkos.cpp  |  7 +++--
 src/KOKKOS/remap_kokkos.cpp | 56 +++++++++++++++++++++++++++++--------
 src/KOKKOS/remap_kokkos.h   |  7 +++--
 src/kspace.cpp              |  5 ++++
 src/kspace.h                |  1 +
 7 files changed, 70 insertions(+), 25 deletions(-)
diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp
index c07f45cc228..99642e27f47 100644
--- a/src/KOKKOS/fft3d_kokkos.cpp
+++ b/src/KOKKOS/fft3d_kokkos.cpp
@@ -35,7 +35,7 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
              int out_ilo, int out_ihi, int out_jlo, int out_jhi,
              int out_klo, int out_khi,
              int scaled, int permute, int *nbuf, int usecollective,
-             int usegpu_aware) :
+             int useisend, int usegpu_aware) :
   Pointers(lmp)
 {
   int nthreads = lmp->kokkos->nthreads;
@@ -81,7 +81,7 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
   plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow,
                             in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                             out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
-                            scaled,permute,nbuf,usecollective,nthreads,usegpu_aware);
+                            scaled,permute,nbuf,usecollective,useisend,nthreads,usegpu_aware);
   if (plan == nullptr) error->one(FLERR,"Could not create 3d FFT plan");
 }
 
@@ -400,6 +400,7 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in,
                           2 = permute twice = slow->fast, fast->mid, mid->slow
    nbuf                 returns size of internal storage buffers used by FFT
    usecollective        use collective MPI operations for remapping data
+   useisend             when using point-to-point MPI, use MPI_Isend
    usegpu_aware         use GPU-Aware MPI or not
 ------------------------------------------------------------------------- */
 
@@ -411,7 +412,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
        int out_ilo, int out_ihi, int out_jlo, int out_jhi,
        int out_klo, int out_khi,
        int scaled, int permute, int *nbuf, int usecollective,
-       int nthreads, int usegpu_aware)
+       int useisend, int nthreads, int usegpu_aware)
 {
   struct fft_plan_3d_kokkos<DeviceType> *plan;
   int me,nprocs;
@@ -468,7 +469,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
       remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                            first_ilo,first_ihi,first_jlo,first_jhi,
                            first_klo,first_khi,2,0,0,FFT_PRECISION,
-                           usecollective,usegpu_aware);
+                           usecollective,useisend,usegpu_aware);
     if (plan->pre_plan == nullptr) return nullptr;
   }
 
@@ -493,7 +494,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
                            first_klo,first_khi,
                            second_ilo,second_ihi,second_jlo,second_jhi,
                            second_klo,second_khi,2,1,0,FFT_PRECISION,
-                           usecollective,usegpu_aware);
+                           usecollective,useisend,usegpu_aware);
   if (plan->mid1_plan == nullptr) return nullptr;
 
   // 1d FFTs along mid axis
@@ -534,7 +535,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
                          second_ilo,second_ihi,
                          third_jlo,third_jhi,third_klo,third_khi,
                          third_ilo,third_ihi,2,1,0,FFT_PRECISION,
-                         usecollective,usegpu_aware);
+                         usecollective,useisend,usegpu_aware);
   if (plan->mid2_plan == nullptr) return nullptr;
 
   // 1d FFTs along slow axis
@@ -562,7 +563,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
                            third_jlo,third_jhi,
                            out_klo,out_khi,out_ilo,out_ihi,
                            out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,
-                           usecollective,usegpu_aware);
+                           usecollective,useisend,usegpu_aware);
     if (plan->post_plan == nullptr) return nullptr;
   }
 
diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h
index 6227a1e97b1..33d52e58130 100644
--- a/src/KOKKOS/fft3d_kokkos.h
+++ b/src/KOKKOS/fft3d_kokkos.h
@@ -97,7 +97,7 @@ class FFT3dKokkos : protected Pointers {
 
   FFT3dKokkos(class LAMMPS *, MPI_Comm,
         int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,
-        int,int,int *,int,int);
+        int,int,int *,int,int,int);
   ~FFT3dKokkos() override;
   void compute(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, int);
   void timing1d(typename FFT_AT::t_FFT_SCALAR_1d, int, int);
@@ -115,7 +115,7 @@ class FFT3dKokkos : protected Pointers {
   struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int,
                                          int, int, int, int, int,
                                          int, int, int, int, int, int, int,
-                                         int, int, int *, int, int, int);
+                                         int, int, int *, int, int, int, int);
 
   void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *);
 
diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp
index dae53c12d0b..6ed541738e1 100644
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@@ -833,22 +833,23 @@ void PPPMKokkos<DeviceType>::allocate()
   // remap takes data from 3d brick to FFT decomposition
 
   int collective_flag = force->kspace->collective_flag;
+  int isend_flag = force->kspace->isend_flag;
   int gpu_aware_flag = lmp->kokkos->gpu_aware_flag;
   int tmp;
 
   fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                          nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                          nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                         0,0,&tmp,collective_flag,gpu_aware_flag);
+                         0,0,&tmp,collective_flag,isend_flag,gpu_aware_flag);
 
   fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                          nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                          nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                         0,0,&tmp,collective_flag,gpu_aware_flag);
+                         0,0,&tmp,collective_flag,isend_flag,gpu_aware_flag);
   remap = new RemapKokkos<DeviceType>(lmp,world,
                           nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                           nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                          1,0,0,FFT_PRECISION,collective_flag,gpu_aware_flag);
+                          1,0,0,FFT_PRECISION,collective_flag,isend_flag,gpu_aware_flag);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp
index 573f4c2508b..f9ab0f7f475 100644
--- a/src/KOKKOS/remap_kokkos.cpp
+++ b/src/KOKKOS/remap_kokkos.cpp
@@ -35,13 +35,13 @@ RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp, MPI_Comm comm,
              int out_klo, int out_khi,
              int nqty, int permute, int memory,
              int precision, int usecollective,
-             int usegpu_aware) : Pointers(lmp)
+             int useisend, int usegpu_aware) : Pointers(lmp)
 {
   plan = remap_3d_create_plan_kokkos(comm,
                               in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                               out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
                               nqty,permute,memory,precision,usecollective,
-                              usegpu_aware);
+                              useisend, usegpu_aware);
   if (plan == nullptr) error->one(FLERR,"Could not create 3d remap plan");
 }
 
@@ -142,14 +142,26 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
 
     for (isend = 0; isend < plan->nsend; isend++) {
       int in_offset = plan->send_offset[isend];
-      plan->pack(d_in,in_offset,
-                 plan->d_sendbuf,0,&plan->packplan[isend]);
+      if (plan->useisend) {
+        plan->pack(d_in,in_offset,
+                  plan->d_sendbuf,plan->send_bufloc[isend],
+                  &plan->packplan[isend]);
+      } else {
+        plan->pack(d_in,in_offset,
+                  plan->d_sendbuf,0,
+                  &plan->packplan[isend]);
+      }
 
       if (!plan->usegpu_aware)
         Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);
 
-      MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
-               plan->send_proc[isend],0,plan->comm);
+      if (plan->useisend) {
+        MPI_Isend(v_sendbuf + plan->send_bufloc[isend],plan->send_size[isend],MPI_FFT_SCALAR,
+                plan->send_proc[isend],0,plan->comm,&plan->isend_reqs[isend]);
+      } else {
+        MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
+                plan->send_proc[isend],0,plan->comm);
+      }
     }
 
     // copy in -> scratch -> out for self data
@@ -183,6 +195,11 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
       plan->unpack(d_scratch,scratch_offset,
                    d_out,out_offset,&plan->unpackplan[irecv]);
     }
+
+    if (plan->useisend) {
+      // finally, wait for all Isends to be done
+      MPI_Waitall(plan->nsend,plan->isend_reqs,MPI_STATUS_IGNORE);
+    }
   } else {
     if (plan->commringlen > 0) {
       int isend,irecv;
@@ -266,6 +283,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
                           1 = single precision (4 bytes per datum)
                           2 = double precision (8 bytes per datum)
    usecollective        whether to use collective MPI or point-to-point
+   useisend             when using point-to-point MPI, use non-blocking MPI_Isend
    usegpu_aware         whether to use GPU-Aware MPI or not
 ------------------------------------------------------------------------- */
 
@@ -277,7 +295,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
   int out_ilo, int out_ihi, int out_jlo, int out_jhi,
   int out_klo, int out_khi,
   int nqty, int permute, int memory, int /*precision*/,
-  int usecollective, int usegpu_aware)
+  int usecollective, int useisend, int usegpu_aware)
 {
 
   struct remap_plan_3d_kokkos<DeviceType> *plan;
@@ -295,6 +313,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
   plan = new struct remap_plan_3d_kokkos<DeviceType>;
   if (plan == nullptr) return nullptr;
   plan->usecollective = usecollective;
+  plan->useisend = useisend;
   plan->usegpu_aware = usegpu_aware;
 
   // store parameters in local data structs
@@ -359,6 +378,11 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
       plan->packplan = (struct pack_plan_3d *)
         malloc(nsend*sizeof(struct pack_plan_3d));
 
+      if (plan->useisend)
+        plan->isend_reqs = (MPI_Request *) malloc(nsend*sizeof(MPI_Request));
+        plan->send_bufloc = (int *) malloc(nsend*sizeof(int));
+        if (plan->send_bufloc == nullptr) return nullptr;
+
       if (plan->send_offset == nullptr || plan->send_size == nullptr ||
           plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr;
     }
@@ -400,6 +424,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
 
     nsend = 0;
     iproc = me;
+    ibuf = 0;
     for (i = 0; i < nprocs; i++) {
       iproc++;
       if (iproc == nprocs) iproc = 0;
@@ -415,6 +440,8 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
         plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
         plan->packplan[nsend].nqty = nqty;
         plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
+        plan->send_bufloc[nsend] = ibuf;
+        ibuf += plan->send_size[nsend];
         nsend++;
       }
     }
@@ -497,8 +524,13 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
     // find biggest send message (not including self) and malloc space for it
 
     size = 0;
-    for (nsend = 0; nsend < plan->nsend; nsend++)
-      size = MAX(size,plan->send_size[nsend]);
+    if (plan->useisend) {
+      for (nsend = 0; nsend < plan->nsend; nsend++)
+        size += plan->send_size[nsend];
+    } else {
+      for (nsend = 0; nsend < plan->nsend; nsend++)
+        size = MAX(size,plan->send_size[nsend]);
+    }
 
     if (size) {
       plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size);
@@ -520,7 +552,6 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
 
     MPI_Comm_dup(comm,&plan->comm);
   } else {
-
     // Improved approach - use an AllReduce to aggregate which ranks need to be included
     // To do this, we build the local proc's send/receive list, then do an AllReduce
     // to create the send/recv count for the Alltoallv
@@ -845,7 +876,6 @@ void RemapKokkos<DeviceType>::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_
       free(plan->unpackplan);
     }
   } else {
-
     // free arrays used in pt2pt communication
 
     if (plan->nsend || plan->self) {
@@ -853,6 +883,10 @@ void RemapKokkos<DeviceType>::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_
       free(plan->send_size);
       free(plan->send_proc);
       free(plan->packplan);
+      if (plan->useisend) {
+        free(plan->isend_reqs);
+        free(plan->send_bufloc);
+      }
     }
 
     if (plan->nrecv || plan->self) {
diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h
index e3830eea7d4..a6a15bbf098 100644
--- a/src/KOKKOS/remap_kokkos.h
+++ b/src/KOKKOS/remap_kokkos.h
@@ -40,12 +40,14 @@ struct remap_plan_3d_kokkos {
   int *send_offset;                 // extraction loc for each send
   int *send_size;                   // size of each send message
   int *send_proc;                   // proc to send each message to
+  int *send_bufloc;                 // if useisend, offset in send buf for each isend
   struct pack_plan_3d *packplan;    // pack plan for each send message
   int *recv_offset;                 // insertion loc for each recv
   int *recv_size;                   // size of each recv message
   int *recv_proc;                   // proc to recv each message from
   int *recv_bufloc;                 // offset in scratch buf for each recv
   int *nrecvmap;                    // maps receive index to rank index
+  MPI_Request *isend_reqs;          // MPI request for each posted isend
   MPI_Request *request;             // MPI request for each posted recv
   struct pack_plan_3d *unpackplan;  // unpack plan for each recv message
   int nrecv;                        // # of recvs from other procs
@@ -54,6 +56,7 @@ struct remap_plan_3d_kokkos {
   int memory;                       // user provides scratch space or not
   MPI_Comm comm;                    // group of procs performing remap
   int usecollective;                // use collective or point-to-point MPI
+  int useisend;                     // if using point-to-point MPI, use MPI_Isend
   int usegpu_aware;                 // use GPU-Aware MPI or not
   // variables for collective MPI only
   int commringlen;                  // length of commringlist
@@ -75,7 +78,7 @@ class RemapKokkos : protected Pointers {
   typedef FFTArrayTypes<DeviceType> FFT_AT;
   RemapKokkos(class LAMMPS *);
   RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
-        int,int,int,int,int,int,int,int,int,int,int,int);
+        int,int,int,int,int,int,int,int,int,int,int,int,int);
   ~RemapKokkos() override;
   void perform(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d);
 
@@ -85,7 +88,7 @@ class RemapKokkos : protected Pointers {
   struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm,
                                              int, int, int, int, int, int,
                                              int, int, int, int, int, int,
-                                             int, int, int, int, int, int);
+                                             int, int, int, int, int, int, int);
   void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *);
 };
 
diff --git a/src/kspace.cpp b/src/kspace.cpp
index 9cb400c1a72..ed025fd9454 100644
--- a/src/kspace.cpp
+++ b/src/kspace.cpp
@@ -60,6 +60,7 @@ KSpace::KSpace(LAMMPS *lmp) :
 #else
   collective_flag = 0;
 #endif
+  isend_flag = 0;
 
   kewaldflag = 0;
 
@@ -552,6 +553,10 @@ void KSpace::modify_params(int narg, char **arg)
       if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command");
       collective_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
       iarg += 2;
+    } else if (strcmp(arg[iarg],"isend") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command");
+      isend_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
+      iarg += 2;
     } else if (strcmp(arg[iarg],"diff") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command");
       if (strcmp(arg[iarg+1],"ad") == 0) differentiation_flag = 1;
diff --git a/src/kspace.h b/src/kspace.h
index 0a3db05ee7a..f056c25bbb7 100644
--- a/src/kspace.h
+++ b/src/kspace.h
@@ -131,6 +131,7 @@ class KSpace : protected Pointers {
   int compute_flag;       // 0 if skip compute()
   int fftbench;           // 0 if skip FFT timing
   int collective_flag;    // 1 if use MPI collectives for FFT/remap
+  int isend_flag;         // 1 if use MPI_Isend for FFT/remap
   int stagger_flag;       // 1 if using staggered PPPM grids
 
   double splittol;    // tolerance for when to truncate splitting

From 5c6666a38938e63914e5192ff5a7cd904f8a2703 Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Mon, 24 Nov 2025 11:25:39 -0500
Subject: [PATCH 02/17] Change isend keyword to nonblocking for KSPACE

---
 src/kspace.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/kspace.cpp b/src/kspace.cpp
index ed025fd9454..f390b6279af 100644
--- a/src/kspace.cpp
+++ b/src/kspace.cpp
@@ -553,7 +553,7 @@ void KSpace::modify_params(int narg, char **arg)
       if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command");
       collective_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"isend") == 0) {
+    } else if (strcmp(arg[iarg],"nonblocking") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command");
       isend_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
       iarg += 2;

From 7715f2b8c5a14f4e412ca98e3471c089e67a3b61 Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Mon, 24 Nov 2025 11:33:53 -0500
Subject: [PATCH 03/17] Add error if both collective and nonblocking are
 specified

---
 src/kspace.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/kspace.cpp b/src/kspace.cpp
index f390b6279af..9b4bf478bfb 100644
--- a/src/kspace.cpp
+++ b/src/kspace.cpp
@@ -528,8 +528,7 @@ void KSpace::modify_params(int narg, char **arg)
       }
       warn_nonneutral = 0; // can't use wire correction with non-neutral system
       iarg += 2;
-    }
-    else if (strcmp(arg[iarg], "amat") == 0) {
+    } else if (strcmp(arg[iarg], "amat") == 0) {
       if (iarg + 2 > narg) error->all(FLERR, "Illegal kspace_modify command");
       if (!pppmflag) error->all(FLERR, "Illegal kspace_modify command 'amat'"
                                       "available for pppm/conp, only");
@@ -614,6 +613,7 @@ void KSpace::modify_params(int narg, char **arg)
       iarg += n;
     }
   }
+  if (collective_flag > 0 && isend_flag > 0) error->all(FLERR,"Illegal kspace_modify command, collective and nonblocking cannot both be true.");
 }
 
 /* ---------------------------------------------------------------------- */

From 8ca245b378e4b5774de59678ee450c21443d42ee Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Wed, 26 Nov 2025 14:23:27 -0500
Subject: [PATCH 04/17] Merge Kokkos-based KSPACE comms improvements into
 non-Kokkos based, add option for nonblocking

---
 src/KSPACE/fft3d.cpp       |  12 +-
 src/KSPACE/fft3d.h         |   2 +-
 src/KSPACE/fft3d_wrap.cpp  |   4 +-
 src/KSPACE/fft3d_wrap.h    |   2 +-
 src/KSPACE/pppm.cpp        |   6 +-
 src/KSPACE/pppm_dipole.cpp |   6 +-
 src/KSPACE/pppm_disp.cpp   |  12 +-
 src/KSPACE/remap.cpp       | 922 ++++++++++++++++++++++---------------
 src/KSPACE/remap.h         |  12 +-
 src/KSPACE/remap_wrap.cpp  |   4 +-
 src/KSPACE/remap_wrap.h    |   2 +-
 11 files changed, 578 insertions(+), 406 deletions(-)

diff --git a/src/KSPACE/fft3d.cpp b/src/KSPACE/fft3d.cpp
index 1426b9893ee..f066c7db656 100644
--- a/src/KSPACE/fft3d.cpp
+++ b/src/KSPACE/fft3d.cpp
@@ -18,6 +18,7 @@
                            FFTW3, KISS FFT, Dfti/MKL, and ACML
                          Phil Blood (PSC) added single precision FFTs
                          Paul Coffman (IBM) added MPI collectives remap
+                         Nick Hagerty (ORNL) added non-blocking MPI pt2pt remap
 ------------------------------------------------------------------------- */
 
 #include "fft3d.h"
@@ -240,6 +241,7 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan)
                           2 = permute twice = slow->fast, fast->mid, mid->slow
    nbuf                 returns size of internal storage buffers used by FFT
    usecollective        use collective MPI operations for remapping data
+   useisend             use non-blocking or blocking MPI pt2pt operations for remapping data
 ------------------------------------------------------------------------- */
 
 struct fft_plan_3d *fft_3d_create_plan(
@@ -248,7 +250,7 @@ struct fft_plan_3d *fft_3d_create_plan(
        int in_klo, int in_khi,
        int out_ilo, int out_ihi, int out_jlo, int out_jhi,
        int out_klo, int out_khi,
-       int scaled, int permute, int *nbuf, int usecollective)
+       int scaled, int permute, int *nbuf, int usecollective, int useisend)
 {
   struct fft_plan_3d *plan;
   int me,nprocs;
@@ -313,7 +315,7 @@ struct fft_plan_3d *fft_3d_create_plan(
     first_khi = (ip2+1)*nslow/np2 - 1;
     plan->pre_plan = remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                                           first_ilo,first_ihi,first_jlo,first_jhi,
-                                          first_klo,first_khi,2,0,0,FFT_PRECISION,0);
+                                          first_klo,first_khi,2,0,0,FFT_PRECISION,0,0);
     if (plan->pre_plan == nullptr) {
       free(plan);
       return nullptr;
@@ -338,7 +340,7 @@ struct fft_plan_3d *fft_3d_create_plan(
   plan->mid1_plan = remap_3d_create_plan(comm, first_ilo,first_ihi,first_jlo,first_jhi,
                                          first_klo,first_khi,second_ilo,second_ihi,
                                          second_jlo,second_jhi,second_klo,second_khi,
-                                         2,1,0,FFT_PRECISION,usecollective);
+                                         2,1,0,FFT_PRECISION,usecollective,useisend);
   if (plan->mid1_plan == nullptr) return nullptr;
 
   // 1d FFTs along mid axis
@@ -379,7 +381,7 @@ struct fft_plan_3d *fft_3d_create_plan(
                          second_jlo,second_jhi,second_klo,second_khi,
                          second_ilo,second_ihi,
                          third_jlo,third_jhi,third_klo,third_khi,
-                         third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective);
+                         third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective,useisend);
   if (plan->mid2_plan == nullptr) return nullptr;
 
   // 1d FFTs along slow axis
@@ -408,7 +410,7 @@ struct fft_plan_3d *fft_3d_create_plan(
                            third_klo,third_khi,third_ilo,third_ihi,
                            third_jlo,third_jhi,
                            out_klo,out_khi,out_ilo,out_ihi,
-                           out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0);
+                           out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0,0);
     if (plan->post_plan == nullptr) return nullptr;
   }
 
diff --git a/src/KSPACE/fft3d.h b/src/KSPACE/fft3d.h
index 00a3332d2e8..28c434565da 100644
--- a/src/KSPACE/fft3d.h
+++ b/src/KSPACE/fft3d.h
@@ -143,7 +143,7 @@ struct fft_plan_3d {
 extern "C" {
 void fft_3d(FFT_DATA *, FFT_DATA *, int, struct fft_plan_3d *);
 struct fft_plan_3d *fft_3d_create_plan(MPI_Comm, int, int, int, int, int, int, int, int, int, int,
-                                       int, int, int, int, int, int, int, int *, int);
+                                       int, int, int, int, int, int, int, int *, int, int);
 void fft_3d_destroy_plan(struct fft_plan_3d *);
 void factor(int, int *, int *);
 void bifactor(int, int *, int *);
diff --git a/src/KSPACE/fft3d_wrap.cpp b/src/KSPACE/fft3d_wrap.cpp
index 7b00543eeab..44d482a721b 100644
--- a/src/KSPACE/fft3d_wrap.cpp
+++ b/src/KSPACE/fft3d_wrap.cpp
@@ -25,13 +25,13 @@ FFT3d::FFT3d(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow,
              int in_klo, int in_khi,
              int out_ilo, int out_ihi, int out_jlo, int out_jhi,
              int out_klo, int out_khi,
-             int scaled, int permute, int *nbuf, int usecollective) : Pointers(lmp)
+             int scaled, int permute, int *nbuf, int usecollective, int useisend) : Pointers(lmp)
 {
   #ifndef FFT_HEFFTE
   plan = fft_3d_create_plan(comm,nfast,nmid,nslow,
                             in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                             out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
-                            scaled,permute,nbuf,usecollective);
+                            scaled,permute,nbuf,usecollective,useisend);
   if (plan == nullptr) error->one(FLERR,"Could not create 3d FFT plan");
   #else
   heffte::plan_options options = heffte::default_options<heffte_backend>();
diff --git a/src/KSPACE/fft3d_wrap.h b/src/KSPACE/fft3d_wrap.h
index 04b828b7de5..e8559cc9641 100644
--- a/src/KSPACE/fft3d_wrap.h
+++ b/src/KSPACE/fft3d_wrap.h
@@ -37,7 +37,7 @@ class FFT3d : protected Pointers {
   enum { FORWARD = 1, BACKWARD = -1 };
 
   FFT3d(class LAMMPS *, MPI_Comm, int, int, int, int, int, int, int, int, int, int, int, int, int,
-        int, int, int, int, int *, int);
+        int, int, int, int, int *, int, int);
   ~FFT3d() override;
   void compute(FFT_SCALAR *, FFT_SCALAR *, int);
   void timing1d(FFT_SCALAR *, int, int);
diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp
index bb92208fe92..46232b13533 100644
--- a/src/KSPACE/pppm.cpp
+++ b/src/KSPACE/pppm.cpp
@@ -837,17 +837,17 @@ void PPPM::allocate()
   fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   0,0,&tmp,collective_flag);
+                   0,0,&tmp,collective_flag,isend_flag);
 
   fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                   0,0,&tmp,collective_flag);
+                   0,0,&tmp,collective_flag,isend_flag);
 
   remap = new Remap(lmp,world,
                     nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                     nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                    1,0,0,FFT_PRECISION,collective_flag);
+                    1,0,0,FFT_PRECISION,collective_flag,isend_flag);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KSPACE/pppm_dipole.cpp b/src/KSPACE/pppm_dipole.cpp
index 055e979b28e..193d66f8264 100644
--- a/src/KSPACE/pppm_dipole.cpp
+++ b/src/KSPACE/pppm_dipole.cpp
@@ -622,17 +622,17 @@ void PPPMDipole::allocate()
   fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   0,0,&tmp,collective_flag);
+                   0,0,&tmp,collective_flag,isend_flag);
 
   fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                   0,0,&tmp,collective_flag);
+                   0,0,&tmp,collective_flag,isend_flag);
 
   remap = new Remap(lmp,world,
                     nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                     nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                    1,0,0,FFT_PRECISION,collective_flag);
+                    1,0,0,FFT_PRECISION,collective_flag,isend_flag);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp
index cbce62005f8..cf33931ac84 100644
--- a/src/KSPACE/pppm_disp.cpp
+++ b/src/KSPACE/pppm_disp.cpp
@@ -1771,17 +1771,17 @@ void _noopt PPPMDisp::allocate()
     fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                      nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                      nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                     0,0,&tmp,collective_flag);
+                     0,0,&tmp,collective_flag,isend_flag);
 
     fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                      nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                      nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                     0,0,&tmp,collective_flag);
+                     0,0,&tmp,collective_flag,isend_flag);
 
     remap = new Remap(lmp,world,
                       nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                       nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                      1,0,0,FFT_PRECISION,collective_flag);
+                      1,0,0,FFT_PRECISION,collective_flag,isend_flag);
   }
 
   // --------------------------------------
@@ -1848,19 +1848,19 @@ void _noopt PPPMDisp::allocate()
       new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6,
                 nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6,
                 nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6,
-                0,0,&tmp,collective_flag);
+                0,0,&tmp,collective_flag,isend_flag);
 
     fft2_6 =
       new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6,
                 nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6,
                 nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6,
-                0,0,&tmp,collective_flag);
+                0,0,&tmp,collective_flag,isend_flag);
 
     remap_6 =
       new Remap(lmp,world,
                 nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6,
                 nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6,
-                1,0,0,FFT_PRECISION,collective_flag);
+                1,0,0,FFT_PRECISION,collective_flag,isend_flag);
   }
 
   // --------------------------------------
diff --git a/src/KSPACE/remap.cpp b/src/KSPACE/remap.cpp
index 06b013dbd33..297705217e8 100644
--- a/src/KSPACE/remap.cpp
+++ b/src/KSPACE/remap.cpp
@@ -63,31 +63,47 @@
 void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf,
               struct remap_plan_3d *plan)
 {
+  int me;
+  MPI_Comm_rank(plan->comm,&me);
+
+  FFT_SCALAR *scratch;
+  if (plan->memory == 0)
+    scratch = buf;
+  else
+    scratch = plan->scratch;
+
   // use point-to-point communication
 
   if (!plan->usecollective) {
     int i,isend,irecv;
-    FFT_SCALAR *scratch;
-
-    if (plan->memory == 0)
-      scratch = buf;
-    else
-      scratch = plan->scratch;
-
-    // post all recvs into scratch space
 
-    for (irecv = 0; irecv < plan->nrecv; irecv++)
+    for (irecv = 0; irecv < plan->nrecv; irecv++) {
       MPI_Irecv(&scratch[plan->recv_bufloc[irecv]],plan->recv_size[irecv],
                 MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
                 plan->comm,&plan->request[irecv]);
+    }
 
     // send all messages to other procs
 
     for (isend = 0; isend < plan->nsend; isend++) {
-      plan->pack(&in[plan->send_offset[isend]],
-                 plan->sendbuf,&plan->packplan[isend]);
-      MPI_Send(plan->sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
-               plan->send_proc[isend],0,plan->comm);
+      int in_offset = plan->send_offset[isend];
+      if (plan->useisend) {
+        plan->pack(&in[in_offset],
+                  &plan->sendbuf[plan->send_bufloc[isend]],
+                  &plan->packplan[isend]);
+      } else {
+        plan->pack(&in[in_offset],
+                  plan->sendbuf,
+                  &plan->packplan[isend]);
+      }
+
+      if (plan->useisend) {
+        MPI_Isend(plan->sendbuf + plan->send_bufloc[isend],plan->send_size[isend],MPI_FFT_SCALAR,
+                plan->send_proc[isend],0,plan->comm,&plan->isend_reqs[isend]);
+      } else {
+        MPI_Send(plan->sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
+                plan->send_proc[isend],0,plan->comm);
+      }
     }
 
     // copy in -> scratch -> out for self data
@@ -95,109 +111,84 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf,
     if (plan->self) {
       isend = plan->nsend;
       irecv = plan->nrecv;
-      plan->pack(&in[plan->send_offset[isend]],
-                 &scratch[plan->recv_bufloc[irecv]],
+
+      int in_offset = plan->send_offset[isend];
+      int scratch_offset = plan->recv_bufloc[irecv];
+      int out_offset = plan->recv_offset[irecv];
+
+      plan->pack(&in[in_offset],
+                 &scratch[scratch_offset],
                  &plan->packplan[isend]);
-      plan->unpack(&scratch[plan->recv_bufloc[irecv]],
-                   &out[plan->recv_offset[irecv]],&plan->unpackplan[irecv]);
+      plan->unpack(&scratch[scratch_offset],
+                   &out[out_offset],&plan->unpackplan[irecv]);
     }
 
     // unpack all messages from scratch -> out
 
     for (i = 0; i < plan->nrecv; i++) {
       MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE);
-      plan->unpack(&scratch[plan->recv_bufloc[irecv]],
-                   &out[plan->recv_offset[irecv]],&plan->unpackplan[irecv]);
-    }
 
-  // use All2Allv collective for remap communication
+      int scratch_offset = plan->recv_bufloc[irecv];
+      int out_offset = plan->recv_offset[irecv];
+
+      plan->unpack(&scratch[scratch_offset],
+                   &out[out_offset],&plan->unpackplan[irecv]);
+    }
 
+    if (plan->useisend) {
+      // finally, wait for all Isends to be done
+      MPI_Waitall(plan->nsend,plan->isend_reqs,MPI_STATUS_IGNORE);
+    }
   } else {
     if (plan->commringlen > 0) {
       int isend,irecv;
 
-      // create send and recv buffers for alltoallv collective
-
-      int sendBufferSize = 0;
-      int recvBufferSize = 0;
-      for (int i=0;i<plan->nsend;i++)
-        sendBufferSize += plan->send_size[i];
-      for (int i=0;i<plan->nrecv;i++)
-        recvBufferSize += plan->recv_size[i];
-
-      auto *packedSendBuffer = (FFT_SCALAR *) malloc(sizeof(FFT_SCALAR) * sendBufferSize + 1);
-      auto *packedRecvBuffer = (FFT_SCALAR *) malloc(sizeof(FFT_SCALAR) * recvBufferSize + 1);
+      // populate send data
+      // buffers are allocated and count/displacement buffers
+      // are populated in remap_3d_create_plan
 
-      int *sendcnts = (int *) malloc(sizeof(int) * plan->commringlen);
-      int *rcvcnts = (int *) malloc(sizeof(int) * plan->commringlen);
-      int *sdispls = (int *) malloc(sizeof(int) * plan->commringlen);
-      int *rdispls = (int *) malloc(sizeof(int) * plan->commringlen);
-      int *nrecvmap = (int *) malloc(sizeof(int) * plan->commringlen);
-
-      // create and populate send data, count and displacement buffers
-
-      int currentSendBufferOffset = 0;
+      int numpacked = 0;
       for (isend = 0; isend < plan->commringlen; isend++) {
-        sendcnts[isend] = 0;
-        sdispls[isend] = 0;
-        int foundentry = 0;
-        for (int i=0;(i<plan->nsend && !foundentry); i++) {
-          if (plan->send_proc[i] == plan->commringlist[isend]) {
-            foundentry = 1;
-            sendcnts[isend] = plan->send_size[i];
-            sdispls[isend] = currentSendBufferOffset;
-            plan->pack(&in[plan->send_offset[i]],
-                       &packedSendBuffer[currentSendBufferOffset],
-                       &plan->packplan[i]);
-            currentSendBufferOffset += plan->send_size[i];
-          }
+        if (plan->sendcnts[isend]) {
+          plan->pack(&in[plan->send_offset[numpacked]],
+                      &plan->sendbuf[plan->sdispls[isend]],
+                      &plan->packplan[numpacked]);
+          numpacked++;
+        }
+        else if (plan->commringlist[isend] == me && plan->self) {
+          numpacked++;
         }
       }
 
-      // create and populate recv count and displacement buffers
+      MPI_Alltoallv(plan->sendbuf, plan->sendcnts, plan->sdispls,
+                    MPI_FFT_SCALAR, scratch, plan->rcvcnts,
+                    plan->rdispls, MPI_FFT_SCALAR, plan->comm);
 
-      int currentRecvBufferOffset = 0;
-      for (irecv = 0; irecv < plan->commringlen; irecv++) {
-        rcvcnts[irecv] = 0;
-        rdispls[irecv] = 0;
-        nrecvmap[irecv] = -1;
-        int foundentry = 0;
-        for (int i=0;(i<plan->nrecv && !foundentry); i++) {
-          if (plan->recv_proc[i] == plan->commringlist[irecv]) {
-            foundentry = 1;
-            rcvcnts[irecv] = plan->recv_size[i];
-            rdispls[irecv] = currentRecvBufferOffset;
-            currentRecvBufferOffset += plan->recv_size[i];
-            nrecvmap[irecv] = i;
-          }
-        }
-      }
+      // copy in -> scratch -> out for self data
 
-      MPI_Alltoallv(packedSendBuffer, sendcnts, sdispls,
-                    MPI_FFT_SCALAR, packedRecvBuffer, rcvcnts,
-                    rdispls, MPI_FFT_SCALAR, plan->comm);
+      if (plan->self) {
+        plan->pack(&in[plan->send_offset[plan->selfnsendloc]],
+                  &plan->sendbuf[plan->sdispls[plan->selfcommringloc]],
+                  &plan->packplan[plan->selfnsendloc]);
+        plan->unpack(&plan->sendbuf[plan->sdispls[plan->selfcommringloc]],
+                    &out[plan->recv_offset[plan->selfnrecvloc]],
+                    &plan->unpackplan[plan->selfnrecvloc]);
+      }
 
       // unpack the data from the recv buffer into out
 
-      currentRecvBufferOffset = 0;
+      numpacked = 0;
       for (irecv = 0; irecv < plan->commringlen; irecv++) {
-        if (nrecvmap[irecv] > -1) {
-          plan->unpack(&packedRecvBuffer[currentRecvBufferOffset],
-                       &out[plan->recv_offset[nrecvmap[irecv]]],
-                       &plan->unpackplan[nrecvmap[irecv]]);
-          currentRecvBufferOffset += plan->recv_size[nrecvmap[irecv]];
+        if (plan->rcvcnts[irecv]) {
+          plan->unpack(&scratch[plan->rdispls[irecv]],
+                       &out[plan->recv_offset[numpacked]],
+                       &plan->unpackplan[numpacked]);
+          numpacked++;
+        }
+        else if (plan->commringlist[irecv] == me && plan->self) {
+          numpacked++;
         }
       }
-
-      // free temporary data structures
-
-      free(sendcnts);
-      free(rcvcnts);
-      free(sdispls);
-      free(rdispls);
-      free(nrecvmap);
-      free(packedSendBuffer);
-      free(packedRecvBuffer);
     }
   }
 }
@@ -225,6 +216,7 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf,
                           1 = single precision (4 bytes per datum)
                           2 = double precision (8 bytes per datum)
    usecollective        whether to use collective MPI or point-to-point
+   useisend             whether to use non-blocking or blocking MPI point-to-point
 ------------------------------------------------------------------------- */
 
 struct remap_plan_3d *remap_3d_create_plan(
@@ -232,8 +224,8 @@ struct remap_plan_3d *remap_3d_create_plan(
   int in_ilo, int in_ihi, int in_jlo, int in_jhi,
   int in_klo, int in_khi,
   int out_ilo, int out_ihi, int out_jlo, int out_jhi,
-  int out_klo, int out_khi,
-  int nqty, int permute, int memory, int /*precision*/, int usecollective)
+  int out_klo, int out_khi, int nqty, int permute,
+  int memory, int /*precision*/, int usecollective, int useisend)
 
 {
 
@@ -252,6 +244,7 @@ struct remap_plan_3d *remap_3d_create_plan(
   plan = (struct remap_plan_3d *) malloc(sizeof(struct remap_plan_3d));
   if (plan == nullptr) return nullptr;
   plan->usecollective = usecollective;
+  plan->useisend = useisend;
 
   // store parameters in local data structs
 
@@ -293,342 +286,488 @@ struct remap_plan_3d *remap_3d_create_plan(
     return nullptr;
   }
 
+  // combine input & output extents across all procs
+
+  MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE,
+                inarray,sizeof(struct extent_3d),MPI_BYTE,comm);
   MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE,
                 outarray,sizeof(struct extent_3d),MPI_BYTE,comm);
 
-  // count send collides, including self
-
-  nsend = 0;
-  iproc = me;
-  for (i = 0; i < nprocs; i++) {
-    iproc++;
-    if (iproc == nprocs) iproc = 0;
-    nsend += remap_3d_collide(&in,&outarray[iproc],&overlap);
-  }
-
-  // malloc space for send info
-
-  if (nsend) {
-    plan->pack = pack_3d;
+  // for efficiency, handle collective & non-collective setup separately
 
-    plan->send_offset = (int *) malloc(nsend*sizeof(int));
-    plan->send_size = (int *) malloc(nsend*sizeof(int));
-    plan->send_proc = (int *) malloc(nsend*sizeof(int));
-    plan->packplan = (struct pack_plan_3d *)
-      malloc(nsend*sizeof(struct pack_plan_3d));
-
-    if (plan->send_offset == nullptr || plan->send_size == nullptr ||
-        plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr;
-  }
+  if (!plan->usecollective) {
+    // count send & recv collides, including self
 
-  // store send info, with self as last entry
-
-  nsend = 0;
-  iproc = me;
-  for (i = 0; i < nprocs; i++) {
-    iproc++;
-    if (iproc == nprocs) iproc = 0;
-    if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
-      plan->send_proc[nsend] = iproc;
-      plan->send_offset[nsend] = nqty *
-        ((overlap.klo-in.klo)*in.jsize*in.isize +
-         ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo));
-      plan->packplan[nsend].nfast = nqty*overlap.isize;
-      plan->packplan[nsend].nmid = overlap.jsize;
-      plan->packplan[nsend].nslow = overlap.ksize;
-      plan->packplan[nsend].nstride_line = nqty*in.isize;
-      plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
-      plan->packplan[nsend].nqty = nqty;
-      plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
-      nsend++;
+    nsend = 0;
+    nrecv = 0;
+    for (i = 0; i < nprocs; i++) {
+      nsend += remap_3d_collide(&in,&outarray[i],&overlap);
+      nrecv += remap_3d_collide(&out,&inarray[i],&overlap);
     }
-  }
 
-  // plan->nsend = # of sends not including self
+    // malloc space for send & recv info
 
-  if (nsend && plan->send_proc[nsend-1] == me) {
-    if (plan->usecollective) // for collectives include self in nsend list
-      plan->nsend = nsend;
-    else
-      plan->nsend = nsend - 1;
-  } else
-    plan->nsend = nsend;
+    if (nsend) {
+      plan->pack = pack_3d;
 
-  // combine input extents across all procs
+      plan->send_offset = (int *) malloc(nsend*sizeof(int));
+      plan->send_size = (int *) malloc(nsend*sizeof(int));
+      plan->send_proc = (int *) malloc(nsend*sizeof(int));
+      plan->packplan = (struct pack_plan_3d *)
+        malloc(nsend*sizeof(struct pack_plan_3d));
 
-  MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE,
-                inarray,sizeof(struct extent_3d),MPI_BYTE,comm);
+      if (plan->useisend)
+        plan->isend_reqs = (MPI_Request *) malloc(nsend*sizeof(MPI_Request));
+        plan->send_bufloc = (int *) malloc(nsend*sizeof(int));
+        if (plan->send_bufloc == nullptr) return nullptr;
 
-  // count recv collides, including self
+      if (plan->send_offset == nullptr || plan->send_size == nullptr ||
+          plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr;
+    }
 
-  nrecv = 0;
-  iproc = me;
-  for (i = 0; i < nprocs; i++) {
-    iproc++;
-    if (iproc == nprocs) iproc = 0;
-    nrecv += remap_3d_collide(&out,&inarray[iproc],&overlap);
-  }
+    if (nrecv) {
+      if (permute == 0)
+        plan->unpack = unpack_3d;
+      else if (permute == 1) {
+        if (nqty == 1)
+          plan->unpack = unpack_3d_permute1_1;
+        else if (nqty == 2)
+          plan->unpack = unpack_3d_permute1_2;
+        else
+          plan->unpack = unpack_3d_permute1_n;
+      }
+      else if (permute == 2) {
+        if (nqty == 1)
+          plan->unpack = unpack_3d_permute2_1;
+        else if (nqty == 2)
+          plan->unpack = unpack_3d_permute2_2;
+        else
+          plan->unpack = unpack_3d_permute2_n;
+      }
 
-  // malloc space for recv info
-
-  if (nrecv) {
-    if (permute == 0)
-      plan->unpack = unpack_3d;
-    else if (permute == 1) {
-      if (nqty == 1)
-        plan->unpack = unpack_3d_permute1_1;
-      else if (nqty == 2)
-        plan->unpack = unpack_3d_permute1_2;
-      else
-        plan->unpack = unpack_3d_permute1_n;
+      plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
+      plan->recv_size = (int *) malloc(nrecv*sizeof(int));
+      plan->recv_proc = (int *) malloc(nrecv*sizeof(int));
+      plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int));
+      plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request));
+      plan->unpackplan = (struct pack_plan_3d *)
+        malloc(nrecv*sizeof(struct pack_plan_3d));
+
+      if (plan->recv_offset == nullptr || plan->recv_size == nullptr ||
+          plan->recv_proc == nullptr || plan->recv_bufloc == nullptr ||
+          plan->request == nullptr || plan->unpackplan == nullptr) return nullptr;
     }
-    else if (permute == 2) {
-      if (nqty == 1)
-        plan->unpack = unpack_3d_permute2_1;
-      else if (nqty == 2)
-        plan->unpack = unpack_3d_permute2_2;
-      else
-        plan->unpack = unpack_3d_permute2_n;
+
+    // store send info, with self as last entry
+
+    nsend = 0;
+    iproc = me;
+    ibuf = 0;
+    for (i = 0; i < nprocs; i++) {
+      iproc++;
+      if (iproc == nprocs) iproc = 0;
+      if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
+        plan->send_proc[nsend] = iproc;
+        plan->send_offset[nsend] = nqty *
+          ((overlap.klo-in.klo)*in.jsize*in.isize +
+          ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo));
+        plan->packplan[nsend].nfast = nqty*overlap.isize;
+        plan->packplan[nsend].nmid = overlap.jsize;
+        plan->packplan[nsend].nslow = overlap.ksize;
+        plan->packplan[nsend].nstride_line = nqty*in.isize;
+        plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
+        plan->packplan[nsend].nqty = nqty;
+        plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
+        plan->send_bufloc[nsend] = ibuf;
+        ibuf += plan->send_size[nsend];
+        nsend++;
+      }
     }
 
-    plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
-    plan->recv_size = (int *) malloc(nrecv*sizeof(int));
-    plan->recv_proc = (int *) malloc(nrecv*sizeof(int));
-    plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int));
-    plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request));
-    plan->unpackplan = (struct pack_plan_3d *)
-      malloc(nrecv*sizeof(struct pack_plan_3d));
-
-    if (plan->recv_offset == nullptr || plan->recv_size == nullptr ||
-        plan->recv_proc == nullptr || plan->recv_bufloc == nullptr ||
-        plan->request == nullptr || plan->unpackplan == nullptr) return nullptr;
-  }
+    // plan->nsend = # of sends not including self
+
+    if (nsend && plan->send_proc[nsend-1] == me) plan->nsend = nsend - 1;
+    else plan->nsend = nsend;
+
+    // store recv info, with self as last entry
+
+    ibuf = 0;
+    nrecv = 0;
+    iproc = me;
+
+    for (i = 0; i < nprocs; i++) {
+      iproc++;
+      if (iproc == nprocs) iproc = 0;
+      if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
+        plan->recv_proc[nrecv] = iproc;
+        plan->recv_bufloc[nrecv] = ibuf;
+
+        if (permute == 0) {
+          plan->recv_offset[nrecv] = nqty *
+            ((overlap.klo-out.klo)*out.jsize*out.isize +
+            (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo));
+          plan->unpackplan[nrecv].nfast = nqty*overlap.isize;
+          plan->unpackplan[nrecv].nmid = overlap.jsize;
+          plan->unpackplan[nrecv].nslow = overlap.ksize;
+          plan->unpackplan[nrecv].nstride_line = nqty*out.isize;
+          plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize;
+          plan->unpackplan[nrecv].nqty = nqty;
+        }
+        else if (permute == 1) {
+          plan->recv_offset[nrecv] = nqty *
+            ((overlap.ilo-out.ilo)*out.ksize*out.jsize +
+            (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo));
+          plan->unpackplan[nrecv].nfast = overlap.isize;
+          plan->unpackplan[nrecv].nmid = overlap.jsize;
+          plan->unpackplan[nrecv].nslow = overlap.ksize;
+          plan->unpackplan[nrecv].nstride_line = nqty*out.jsize;
+          plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize;
+          plan->unpackplan[nrecv].nqty = nqty;
+        }
+        else {
+          plan->recv_offset[nrecv] = nqty *
+            ((overlap.jlo-out.jlo)*out.isize*out.ksize +
+            (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo));
+          plan->unpackplan[nrecv].nfast = overlap.isize;
+          plan->unpackplan[nrecv].nmid = overlap.jsize;
+          plan->unpackplan[nrecv].nslow = overlap.ksize;
+          plan->unpackplan[nrecv].nstride_line = nqty*out.ksize;
+          plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize;
+          plan->unpackplan[nrecv].nqty = nqty;
+        }
 
-  // store recv info, with self as last entry
-
-  ibuf = 0;
-  nrecv = 0;
-  iproc = me;
-
-  for (i = 0; i < nprocs; i++) {
-    iproc++;
-    if (iproc == nprocs) iproc = 0;
-    if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
-      plan->recv_proc[nrecv] = iproc;
-      plan->recv_bufloc[nrecv] = ibuf;
-
-      if (permute == 0) {
-        plan->recv_offset[nrecv] = nqty *
-          ((overlap.klo-out.klo)*out.jsize*out.isize +
-           (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo));
-        plan->unpackplan[nrecv].nfast = nqty*overlap.isize;
-        plan->unpackplan[nrecv].nmid = overlap.jsize;
-        plan->unpackplan[nrecv].nslow = overlap.ksize;
-        plan->unpackplan[nrecv].nstride_line = nqty*out.isize;
-        plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize;
-        plan->unpackplan[nrecv].nqty = nqty;
-      }
-      else if (permute == 1) {
-        plan->recv_offset[nrecv] = nqty *
-          ((overlap.ilo-out.ilo)*out.ksize*out.jsize +
-           (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo));
-        plan->unpackplan[nrecv].nfast = overlap.isize;
-        plan->unpackplan[nrecv].nmid = overlap.jsize;
-        plan->unpackplan[nrecv].nslow = overlap.ksize;
-        plan->unpackplan[nrecv].nstride_line = nqty*out.jsize;
-        plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize;
-        plan->unpackplan[nrecv].nqty = nqty;
-      }
-      else {
-        plan->recv_offset[nrecv] = nqty *
-          ((overlap.jlo-out.jlo)*out.isize*out.ksize +
-           (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo));
-        plan->unpackplan[nrecv].nfast = overlap.isize;
-        plan->unpackplan[nrecv].nmid = overlap.jsize;
-        plan->unpackplan[nrecv].nslow = overlap.ksize;
-        plan->unpackplan[nrecv].nstride_line = nqty*out.ksize;
-        plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize;
-        plan->unpackplan[nrecv].nqty = nqty;
+        plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
+        ibuf += plan->recv_size[nrecv];
+        nrecv++;
       }
-
-      plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
-      ibuf += plan->recv_size[nrecv];
-      nrecv++;
     }
-  }
 
-  // create sub-comm rank list
+    // plan->nrecv = # of recvs not including self
 
-  if (plan->usecollective) {
-    plan->commringlist = nullptr;
+    if (nrecv && plan->recv_proc[nrecv-1] == me) plan->nrecv = nrecv - 1;
+    else plan->nrecv = nrecv;
 
-    // merge recv and send rank lists
-    // ask Steve Plimpton about method to more accurately determine
-    // maximum number of procs contributing to pencil
+    // init remaining fields in remap plan
 
-    int maxcommsize = nprocs;
-    int *commringlist = (int *) malloc(maxcommsize*sizeof(int));
-    int commringlen = 0;
+    plan->memory = memory;
 
-    for (i = 0; i < nrecv; i++) {
-      commringlist[i] = plan->recv_proc[i];
-      commringlen++;
+    if (nrecv == plan->nrecv) plan->self = 0;
+    else plan->self = 1;
+
+    // the plan->sendbuf and plan->recvbuf are used by both the
+    // collective & non-collective implementations.
+    // For non-collective and blocking, the buffer size is MAX(send_size) for any one send
+
+    // find biggest send message (not including self) and malloc space for it
+
+    size = 0;
+    if (plan->useisend) {
+      for (nsend = 0; nsend < plan->nsend; nsend++)
+        size += plan->send_size[nsend];
+    } else {
+      for (nsend = 0; nsend < plan->nsend; nsend++)
+        size = MAX(size,plan->send_size[nsend]);
     }
 
-    for (i = 0; i < nsend; i++) {
-      int foundentry = 0;
-      for (j = 0; j < commringlen;j++)
-        if (commringlist[j] == plan->send_proc[i]) foundentry = 1;
-      if (!foundentry) {
-        commringlist[commringlen] = plan->send_proc[i];
-        commringlen++;
-      }
+    if (size) {
+      plan->sendbuf = (FFT_SCALAR*) malloc(sizeof(FFT_SCALAR) * size);
+      if (plan->sendbuf == nullptr) return nullptr;
     }
 
-    // sort initial commringlist
+    // if requested, allocate internal scratch space for recvs,
+    // only need it if I will receive any data (including self)
 
-    int swap = 0;
-    for (i = 0 ; i < (commringlen - 1); i++) {
-      for (j = 0 ; j < commringlen - i - 1; j++) {
-        if (commringlist[j] > commringlist[j+1]) {
-          swap = commringlist[j];
-          commringlist[j]   = commringlist[j+1];
-          commringlist[j+1] = swap;
-        }
+    if (memory == 1) {
+      if (nrecv > 0) {
+        plan->scratch = (FFT_SCALAR*) malloc(sizeof(FFT_SCALAR) * nqty*out.isize*out.jsize*out.ksize);
+        if (plan->scratch == nullptr) return nullptr;
       }
     }
 
-    // collide all inarray extents for the comm ring with all output
-    // extents and all outarray extents for the comm ring with all input
-    // extents - if there is a collison add the rank to the comm ring,
-    // keep iterating until nothing is added to commring
-
-    int commringappend = 1;
-    while (commringappend) {
-      int newcommringlen = commringlen;
-      commringappend = 0;
-      for (i = 0; i < commringlen; i++) {
-        for (j = 0; j < nprocs; j++) {
-          if (remap_3d_collide(&inarray[commringlist[i]],
-                               &outarray[j],&overlap)) {
-            int alreadyinlist = 0;
-            for (int k = 0; k < newcommringlen; k++) {
-              if (commringlist[k] == j) {
-                alreadyinlist = 1;
-              }
-            }
-            if (!alreadyinlist) {
-              commringlist[newcommringlen++] = j;
-              commringappend = 1;
-            }
-          }
-          if (remap_3d_collide(&outarray[commringlist[i]],
-                               &inarray[j],&overlap)) {
-            int alreadyinlist = 0;
-            for (int k = 0 ; k < newcommringlen; k++) {
-              if (commringlist[k] == j) alreadyinlist = 1;
-            }
-            if (!alreadyinlist) {
-              commringlist[newcommringlen++] = j;
-              commringappend = 1;
-            }
-          }
-        }
-      }
-      commringlen = newcommringlen;
+    // Non-collectives do not use MPI Communicator Groups
+
+    MPI_Comm_dup(comm,&plan->comm);
+  } else {
+    // Improved approach - use an AllReduce to aggregate which ranks need to be included
+    // To do this, we build the local proc's send/receive list, then do an AllReduce
+    // to create the send/recv count for the Alltoallv
+
+    // local arrays to be used in the allreduce
+    // start with max length -- nprocs. Unused entries will be removed later
+
+    int *local_cnts = (int*) malloc(2*nprocs*sizeof(int));
+    if (local_cnts == nullptr) return nullptr;
+    int *local_sendcnts = local_cnts;
+    int *local_recvcnts = (local_cnts + nprocs);
+
+    // local arrays used to store the results of the allreduce
+
+    int *global_cnts = (int*) malloc(2*nprocs*sizeof(int));
+    if (global_cnts == nullptr) return nullptr;
+    int *global_sendcnts = global_cnts;
+    int *global_recvcnts = (global_cnts + nprocs);
+
+    // count send & recv collides, including self
+
+    nsend = 0;
+    nrecv = 0;
+    for (i = 0; i < nprocs; i++) {
+      local_sendcnts[i] = remap_3d_collide(&in,&outarray[i],&overlap);
+      local_recvcnts[i] = remap_3d_collide(&out,&inarray[i],&overlap);
+      nsend += local_sendcnts[i];
+      nrecv += local_recvcnts[i];
     }
 
-    // sort the final commringlist
+    // perform an AllReduce to get the counts from all other processors and build sendcnts list
 
-    for (i = 0 ; i < ( commringlen - 1 ); i++) {
-      for (j = 0 ; j < commringlen - i - 1; j++) {
-        if (commringlist[j] > commringlist[j+1]) {
-          swap = commringlist[j];
-          commringlist[j]   = commringlist[j+1];
-          commringlist[j+1] = swap;
-        }
+    MPI_Allreduce(local_cnts, global_cnts, 2*nprocs, MPI_INT, MPI_SUM, comm);
+
+    // now remove procs that are 0 in send or recv to create minimized sendcnts/recvcnts for AlltoAllv
+    // also builds commringlist -- which is already sorted
+
+    int *commringlist = (int*) malloc(nprocs * sizeof(int));
+    int commringlen = 0;
+
+    for (i = 0; i < nprocs; i++) {
+      if (global_sendcnts[i] > 0 || global_recvcnts[i] > 0) {
+        commringlist[commringlen] = i;
+        commringlen++;
       }
     }
 
     // resize commringlist to final size
 
-    commringlist = (int *) realloc(commringlist, commringlen*sizeof(int) + 1);
+    commringlist = (int *) realloc(commringlist, commringlen*sizeof(int));
 
     // set the plan->commringlist
 
     plan->commringlen = commringlen;
     plan->commringlist = commringlist;
-  }
 
-  // plan->nrecv = # of recvs not including self
-  // for collectives include self in the nsend list
+    // clean up local buffers that are finished
 
-  if (nrecv && plan->recv_proc[nrecv-1] == me) {
-    if (plan->usecollective) plan->nrecv = nrecv;
-    else plan->nrecv = nrecv - 1;
-  } else plan->nrecv = nrecv;
+    local_sendcnts = nullptr;
+    local_recvcnts = nullptr;
+    global_recvcnts = nullptr;
+    global_sendcnts = nullptr;
+    free(local_cnts);
+    free(global_cnts);
 
-  // init remaining fields in remap plan
+    // malloc space for send & recv info
+    // if the current proc is involved in any way in the communication, allocate space
+    // because of the Alltoallv, both send and recv have to be initialized even if
+    // only one of those is performed
 
-  plan->memory = memory;
+    if (nsend || nrecv) {
 
-  if (nrecv == plan->nrecv) plan->self = 0;
-  else plan->self = 1;
+      // send space
 
-  // free locally malloced space
+      plan->selfcommringloc = -1;
+      plan->selfnsendloc = -1;
+      plan->selfnrecvloc = -1;
 
-  free(inarray);
-  free(outarray);
+      plan->nsend = nsend;
+      plan->pack = pack_3d;
 
-  // find biggest send message (not including self) and malloc space for it
+      plan->send_offset = (int *) malloc(nsend*sizeof(int));
+      plan->send_size = (int *) malloc(plan->commringlen*sizeof(int));
 
-  plan->sendbuf = nullptr;
+      plan->sendcnts = (int *) malloc(plan->commringlen*sizeof(int));
+      plan->sdispls = (int *) malloc(plan->commringlen*sizeof(int));
 
-  size = 0;
-  for (nsend = 0; nsend < plan->nsend; nsend++)
-    size = MAX(size,plan->send_size[nsend]);
+      // only used when sendcnt > 0
 
-  if (size) {
-    plan->sendbuf = (FFT_SCALAR *) malloc(size*sizeof(FFT_SCALAR));
-    if (plan->sendbuf == nullptr) return nullptr;
-  }
+      plan->packplan = (struct pack_plan_3d *)
+        malloc(nsend*sizeof(struct pack_plan_3d));
+
+      if (plan->send_offset == nullptr || plan->send_size == nullptr ||
+          plan->sendcnts == nullptr || plan->sdispls == nullptr ||
+          plan->packplan == nullptr) return nullptr;
+
+      // recv space
+
+      plan->nrecv = nrecv;
 
-  // if requested, allocate internal scratch space for recvs,
-  // only need it if I will receive any data (including self)
+      if (permute == 0)
+        plan->unpack = unpack_3d;
+      else if (permute == 1) {
+        if (nqty == 1)
+          plan->unpack = unpack_3d_permute1_1;
+        else if (nqty == 2)
+          plan->unpack = unpack_3d_permute1_2;
+        else
+          plan->unpack = unpack_3d_permute1_n;
+      }
+      else if (permute == 2) {
+        if (nqty == 1)
+          plan->unpack = unpack_3d_permute2_1;
+        else if (nqty == 2)
+          plan->unpack = unpack_3d_permute2_2;
+        else
+          plan->unpack = unpack_3d_permute2_n;
+      }
+
+      plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
+      plan->recv_size = (int *) malloc(plan->commringlen*sizeof(int));
+
+      plan->rcvcnts = (int *) malloc(plan->commringlen*sizeof(int));
+      plan->rdispls = (int *) malloc(plan->commringlen*sizeof(int));
 
-  plan->scratch = nullptr;
+      // only used when recvcnt > 0
 
-  if (memory == 1) {
-    if (nrecv > 0) {
-      plan->scratch =
-        (FFT_SCALAR *) malloc((size_t)nqty*out.isize*out.jsize*out.ksize *
-                              sizeof(FFT_SCALAR));
-      if (plan->scratch == nullptr) return nullptr;
+      plan->unpackplan = (struct pack_plan_3d *)
+        malloc(nrecv*sizeof(struct pack_plan_3d));
+
+      if (plan->recv_offset == nullptr || plan->recv_size == nullptr ||
+          plan->rcvcnts == nullptr || plan->rdispls == nullptr ||
+          plan->unpackplan == nullptr) return nullptr;
     }
-  }
 
-  // if using collective and the commringlist is NOT empty create a
-  // communicator for the plan based off an MPI_Group created with
-  // ranks from the commringlist
+    // store send info, with self as last entry
 
-  if ((plan->usecollective && (plan->commringlen > 0))) {
-    MPI_Group orig_group, new_group;
-    MPI_Comm_group(comm, &orig_group);
-    MPI_Group_incl(orig_group, plan->commringlen,
-                   plan->commringlist, &new_group);
-    MPI_Comm_create(comm, new_group, &plan->comm);
-  }
+    nsend = 0;
+    ibuf = 0;
+    int total_send_size = 0;
+    for (i = 0; i < plan->commringlen; i++) {
+      iproc = plan->commringlist[i];
+      if (iproc == me) {
+        plan->selfcommringloc = i;
+        plan->selfnsendloc = nsend;
+      }
+      if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
+        // number of entries required for this pack's 3-d coords
+        plan->send_offset[nsend] = nqty *
+          ((overlap.klo-in.klo)*in.jsize*in.isize +
+            ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo));
+        plan->packplan[nsend].nfast = nqty*overlap.isize;
+        plan->packplan[nsend].nmid = overlap.jsize;
+        plan->packplan[nsend].nslow = overlap.ksize;
+        plan->packplan[nsend].nstride_line = nqty*in.isize;
+        plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
+        plan->packplan[nsend].nqty = nqty;
+        // total amount of overlap
+        plan->send_size[i] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
+        plan->sendcnts[i] = plan->send_size[i];
+        plan->sdispls[i] = ibuf;
+        ibuf += plan->send_size[i];
+        nsend++;
+      } else {
+        plan->send_size[i] = 0;
+        plan->sdispls[i] = ibuf;
+        plan->sendcnts[i] = 0;
+      }
+      total_send_size += plan->send_size[i];
+    }
+
+    if (total_send_size) {
+      plan->sendbuf = (FFT_SCALAR*) malloc(total_send_size * sizeof(FFT_SCALAR));
+      if (plan->sendbuf == nullptr) return nullptr;
+    }
 
-  // if using collective and the comm ring list is empty create
-  // a communicator for the plan with an empty group
+    // store recv info, with self as last entry
 
-  else if ((plan->usecollective) && (plan->commringlen == 0)) {
-    MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm);
+    ibuf = 0;
+    nrecv = 0;
+
+    for (i = 0; i < plan->commringlen; i++) {
+      iproc = plan->commringlist[i];
+      if (iproc == me) {
+        plan->selfnrecvloc = nrecv;
+      }
+      if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
+
+        if (permute == 0) {
+          plan->recv_offset[nrecv] = nqty *
+            ((overlap.klo-out.klo)*out.jsize*out.isize +
+              (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo));
+          plan->unpackplan[nrecv].nfast = nqty*overlap.isize;
+          plan->unpackplan[nrecv].nmid = overlap.jsize;
+          plan->unpackplan[nrecv].nslow = overlap.ksize;
+          plan->unpackplan[nrecv].nstride_line = nqty*out.isize;
+          plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize;
+          plan->unpackplan[nrecv].nqty = nqty;
+        }
+        else if (permute == 1) {
+          plan->recv_offset[nrecv] = nqty *
+            ((overlap.ilo-out.ilo)*out.ksize*out.jsize +
+              (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo));
+          plan->unpackplan[nrecv].nfast = overlap.isize;
+          plan->unpackplan[nrecv].nmid = overlap.jsize;
+          plan->unpackplan[nrecv].nslow = overlap.ksize;
+          plan->unpackplan[nrecv].nstride_line = nqty*out.jsize;
+          plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize;
+          plan->unpackplan[nrecv].nqty = nqty;
+        }
+        else {
+          plan->recv_offset[nrecv] = nqty *
+            ((overlap.jlo-out.jlo)*out.isize*out.ksize +
+              (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo));
+          plan->unpackplan[nrecv].nfast = overlap.isize;
+          plan->unpackplan[nrecv].nmid = overlap.jsize;
+          plan->unpackplan[nrecv].nslow = overlap.ksize;
+          plan->unpackplan[nrecv].nstride_line = nqty*out.ksize;
+          plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize;
+          plan->unpackplan[nrecv].nqty = nqty;
+        }
+
+        plan->recv_size[i] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
+        plan->rcvcnts[i] = plan->recv_size[i];
+        plan->rdispls[i] = ibuf;
+        ibuf += plan->recv_size[i];
+        nrecv++;
+      } else {
+        plan->recv_size[i] = 0;
+        plan->rcvcnts[i] = 0;
+        plan->rdispls[i] = ibuf;
+      }
+    }
+
+    // init remaining fields in remap plan
+
+    plan->memory = memory;
+
+    if (plan->sendcnts[plan->selfcommringloc]) {
+      plan->self = 1;
+      plan->sendcnts[plan->selfcommringloc] = 0;
+      plan->rcvcnts[plan->selfcommringloc] = 0;
+    }
+    else {
+      plan->self = 0;
+    }
+
+    // if requested, allocate internal scratch space for recvs,
+    // only need it if I will receive any data (including self)
+
+    if (memory == 1) {
+      if (nrecv > 0) {
+        plan->scratch = (FFT_SCALAR*) malloc(nqty*out.isize*out.jsize*out.ksize * sizeof(FFT_SCALAR));
+        if (plan->scratch == nullptr) return nullptr;
+      }
+    }
+
+    // if using collective and the commringlist is NOT empty create a
+    // communicator for the plan based off an MPI_Group created with
+    // ranks from the commringlist
+
+    if (plan->commringlen > 0) {
+      MPI_Group orig_group, new_group;
+      MPI_Comm_group(comm, &orig_group);
+      MPI_Group_incl(orig_group, plan->commringlen,
+                      plan->commringlist, &new_group);
+      MPI_Comm_create(comm, new_group, &plan->comm);
+    }
+
+    // if using collective and the comm ring list is empty create
+    // a communicator for the plan with an empty group
+
+    else
+      MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm);
   }
 
-  // not using collective - dup comm
+  // free locally malloced space
 
-  else MPI_Comm_dup(comm,&plan->comm);
+  free(inarray);
+  free(outarray);
 
   // return pointer to plan
 
@@ -641,39 +780,60 @@ struct remap_plan_3d *remap_3d_create_plan(
 
 void remap_3d_destroy_plan(struct remap_plan_3d *plan)
 {
+  if (plan == nullptr) return;
+
   // free MPI communicator
 
-  if (!(plan->usecollective) || (plan->commringlen != 0))
+  if (!((plan->usecollective) && (plan->commringlen == 0)))
     MPI_Comm_free(&plan->comm);
 
   if (plan->usecollective) {
-    if (plan->commringlist != nullptr)
+    if (plan->commringlist != nullptr) {
       free(plan->commringlist);
-  }
+      free(plan->sendcnts);
+      free(plan->rcvcnts);
+      free(plan->sdispls);
+      free(plan->rdispls);
+    }
 
-  // free internal arrays
+    if (plan->nsend) {
+      free(plan->send_offset);
+      free(plan->send_size);
+      free(plan->packplan);
+    }
 
-  if (plan->nsend || plan->self) {
-    free(plan->send_offset);
-    free(plan->send_size);
-    free(plan->send_proc);
-    free(plan->packplan);
-    if (plan->sendbuf) free(plan->sendbuf);
-  }
+    if (plan->nrecv) {
+      free(plan->recv_offset);
+      free(plan->recv_size);
+      free(plan->unpackplan);
+    }
+  } else {
+    // free arrays used in pt2pt communication
+
+    if (plan->nsend || plan->self) {
+      free(plan->send_offset);
+      free(plan->send_size);
+      free(plan->send_proc);
+      free(plan->packplan);
+      if (plan->useisend) {
+        free(plan->isend_reqs);
+        free(plan->send_bufloc);
+      }
+    }
 
-  if (plan->nrecv || plan->self) {
-    free(plan->recv_offset);
-    free(plan->recv_size);
-    free(plan->recv_proc);
-    free(plan->recv_bufloc);
-    free(plan->request);
-    free(plan->unpackplan);
-    if (plan->scratch) free(plan->scratch);
+    if (plan->nrecv || plan->self) {
+      free(plan->recv_offset);
+      free(plan->recv_size);
+      free(plan->recv_proc);
+      free(plan->recv_bufloc);
+      free(plan->request);
+      free(plan->unpackplan);
+    }
   }
 
   // free plan itself
 
-  free(plan);
+  delete plan;
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KSPACE/remap.h b/src/KSPACE/remap.h
index 59d47ede534..c171bbe5a3e 100644
--- a/src/KSPACE/remap.h
+++ b/src/KSPACE/remap.h
@@ -27,6 +27,8 @@ struct remap_plan_3d {
   int *send_offset;                   // extraction loc for each send
   int *send_size;                     // size of each send message
   int *send_proc;                     // proc to send each message to
+  int *send_bufloc;                   // if useisend, offset in send buf for each isend
+  MPI_Request *isend_reqs;            // MPI request for each posted isend
   struct pack_plan_3d *packplan;      // pack plan for each send message
   int *recv_offset;                   // insertion loc for each recv
   int *recv_size;                     // size of each recv message
@@ -40,8 +42,16 @@ struct remap_plan_3d {
   int memory;                         // user provides scratch space or not
   MPI_Comm comm;                      // group of procs performing remap
   int usecollective;                  // use collective or point-to-point MPI
+  int useisend;                       // if using point-to-point MPI, use MPI_Isend
   int commringlen;                    // length of commringlist
   int *commringlist;                  // ranks on communication ring of this plan
+  int *sendcnts;                    // # of elements in send buffer for each rank
+  int *rcvcnts;                     // # of elements in recv buffer for each rank
+  int *sdispls;                     // extraction location in send buffer for each rank
+  int *rdispls;                     // extraction location in recv buffer for each rank
+  int selfcommringloc;              // current proc's location in commringlist
+  int selfnsendloc;                 // current proc's location in send lists
+  int selfnrecvloc;                 // current proc's location in recv lists
 };
 
 // collision between 2 regions
@@ -56,6 +66,6 @@ struct extent_3d {
 
 void remap_3d(FFT_SCALAR *, FFT_SCALAR *, FFT_SCALAR *, struct remap_plan_3d *);
 struct remap_plan_3d *remap_3d_create_plan(MPI_Comm, int, int, int, int, int, int, int, int, int,
-                                           int, int, int, int, int, int, int, int);
+                                           int, int, int, int, int, int, int, int, int);
 void remap_3d_destroy_plan(struct remap_plan_3d *);
 int remap_3d_collide(struct extent_3d *, struct extent_3d *, struct extent_3d *);
diff --git a/src/KSPACE/remap_wrap.cpp b/src/KSPACE/remap_wrap.cpp
index ca98748011b..31bf6af910e 100644
--- a/src/KSPACE/remap_wrap.cpp
+++ b/src/KSPACE/remap_wrap.cpp
@@ -26,12 +26,12 @@ Remap::Remap(LAMMPS *lmp, MPI_Comm comm,
              int out_ilo, int out_ihi, int out_jlo, int out_jhi,
              int out_klo, int out_khi,
              int nqty, int permute, int memory,
-             int precision, int usecollective) : Pointers(lmp)
+             int precision, int usecollective, int useisend) : Pointers(lmp)
 {
   plan = remap_3d_create_plan(comm,
                               in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                               out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
-                              nqty,permute,memory,precision,usecollective);
+                              nqty,permute,memory,precision,usecollective,useisend);
   if (plan == nullptr) error->one(FLERR,"Could not create 3d remap plan");
 }
 
diff --git a/src/KSPACE/remap_wrap.h b/src/KSPACE/remap_wrap.h
index fc34bc2df17..2cba678c865 100644
--- a/src/KSPACE/remap_wrap.h
+++ b/src/KSPACE/remap_wrap.h
@@ -22,7 +22,7 @@ namespace LAMMPS_NS {
 class Remap : protected Pointers {
  public:
   Remap(class LAMMPS *, MPI_Comm, int, int, int, int, int, int, int, int, int, int, int, int, int,
-        int, int, int, int);
+        int, int, int, int, int);
   ~Remap() override;
   void perform(FFT_SCALAR *, FFT_SCALAR *, FFT_SCALAR *);
 

From afc4c2724590d452533af7a0248f12a27b23a196 Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Wed, 26 Nov 2025 14:35:40 -0500
Subject: [PATCH 05/17] Add 0 for use_isend parameter of FFT3d init in AMOEBA

---
 src/AMOEBA/amoeba_convolution.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp
index b50fd673373..f9a0e794601 100644
--- a/src/AMOEBA/amoeba_convolution.cpp
+++ b/src/AMOEBA/amoeba_convolution.cpp
@@ -173,17 +173,17 @@ void AmoebaConvolution::allocate_grid()
   fft1 = new FFT3d(lmp,world,nx,ny,nz,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   1,0,&tmp,0);
+                   1,0,&tmp,0,0);
 
   fft2 = new FFT3d(lmp,world,nx,ny,nz,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                   0,0,&tmp,0);
+                   0,0,&tmp,0,0);
 
   remap = new Remap(lmp,world,
                     nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                     nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                    nqty,0,0,FFT_PRECISION,0);
+                    nqty,0,0,FFT_PRECISION,0,0);
 
   // memory allocations
 

From 8b8d1baccd9b29973ab043949554bcc5db08f2a9 Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Wed, 26 Nov 2025 15:13:47 -0500
Subject: [PATCH 06/17] Added use_isend param=0 for FFT3d init in ELECTRODE and
 PHONON packages

---
 src/ELECTRODE/pppm_electrode.cpp | 6 +++---
 src/PHONON/fix_phonon.cpp        | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/ELECTRODE/pppm_electrode.cpp b/src/ELECTRODE/pppm_electrode.cpp
index 618b3af31c2..9194691b53c 100644
--- a/src/ELECTRODE/pppm_electrode.cpp
+++ b/src/ELECTRODE/pppm_electrode.cpp
@@ -1069,15 +1069,15 @@ void PPPMElectrode::allocate()
 
   fft1 = new FFT3d(lmp, world, nx_pppm, ny_pppm, nz_pppm, nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft,
                    nzlo_fft, nzhi_fft, nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft,
-                   0, 0, &tmp, collective_flag);
+                   0, 0, &tmp, collective_flag, 0);
 
   fft2 = new FFT3d(lmp, world, nx_pppm, ny_pppm, nz_pppm, nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft,
                    nzlo_fft, nzhi_fft, nxlo_in, nxhi_in, nylo_in, nyhi_in, nzlo_in, nzhi_in, 0, 0,
-                   &tmp, collective_flag);
+                   &tmp, collective_flag, 0);
 
   remap = new Remap(lmp, world, nxlo_in, nxhi_in, nylo_in, nyhi_in, nzlo_in, nzhi_in, nxlo_fft,
                     nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft, 1, 0, 0, FFT_PRECISION,
-                    collective_flag);
+                    collective_flag, 0);
 
   // ELECTRODE specific allocations
 
diff --git a/src/PHONON/fix_phonon.cpp b/src/PHONON/fix_phonon.cpp
index 7998395038a..94a86b8aa32 100644
--- a/src/PHONON/fix_phonon.cpp
+++ b/src/PHONON/fix_phonon.cpp
@@ -159,7 +159,7 @@ FixPhonon::FixPhonon(LAMMPS *lmp,  int narg, char **arg) : Fix(lmp, narg, arg)
   for (int i = 1; i < nprocs; ++i) fft_disp[i] = fft_disp[i-1] + fft_cnts[i-1];
   delete []nx_loc;
 
-  fft = new FFT3d(lmp,world,nz,ny,nx,0,nz-1,0,ny-1,nxlo,nxhi,0,nz-1,0,ny-1,nxlo,nxhi,0,0,&mysize,0);
+  fft = new FFT3d(lmp,world,nz,ny,nx,0,nz-1,0,ny-1,nxlo,nxhi,0,nz-1,0,ny-1,nxlo,nxhi,0,0,&mysize,0,0);
   memory->create(fft_data, MAX(1,mynq)*2, "fix_phonon:fft_data");
 
   // allocate variables; MAX(1,... is used because a null buffer will result in error for MPI

From 107c2b9c9344c9feaf9cffc72d8443d81192e179 Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Mon, 1 Dec 2025 09:52:32 -0500
Subject: [PATCH 07/17] Added usenonblocking param to FFT3D unit tests

---
 unittest/utils/test_fft3d.cpp        | 33 +++++++++++++------------
 unittest/utils/test_fft3d_kokkos.cpp | 36 +++++++++++++++-------------
 2 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/unittest/utils/test_fft3d.cpp b/unittest/utils/test_fft3d.cpp
index 7c1948d286e..d650cd64ada 100644
--- a/unittest/utils/test_fft3d.cpp
+++ b/unittest/utils/test_fft3d.cpp
@@ -116,16 +116,17 @@ class FFT3DTest : public LAMMPSTest {
         int out_klo = 0, out_khi = nslow - 1;
 
         // FFT parameters
-        int scaled        = 0; // No scaling
-        int permute       = 0; // No permutation
-        int nbuf          = 0; // Buffer size (output)
-        int usecollective = 0; // Use point-to-point communication
+        int scaled         = 0; // No scaling
+        int permute        = 0; // No permutation
+        int nbuf           = 0; // Buffer size (output)
+        int usecollective  = 0; // Use point-to-point communication
+        int usenonblocking = 0; // Use blocking point-to-point communication
 
         // Create FFT3d object
         BEGIN_HIDE_OUTPUT();
         fft = new FFT3d(lmp, MPI_COMM_WORLD, nfast, nmid, nslow, in_ilo, in_ihi, in_jlo, in_jhi,
                         in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi,
-                        scaled, permute, &nbuf, usecollective);
+                        scaled, permute, &nbuf, usecollective, usenonblocking);
         END_HIDE_OUTPUT();
 
         ASSERT_NE(fft, nullptr);
@@ -654,16 +655,17 @@ TEST_F(FFT3DTest, RoundTrip_MPI_2proc_32x32x32)
     std::memset(output_data, 0, 2 * local_size * sizeof(FFT_SCALAR));
 
     // FFT parameters
-    int scaled        = 0; // No scaling
-    int permute       = 0; // No permutation
-    int nbuf          = 0; // Buffer size (output)
-    int usecollective = 0; // Use point-to-point communication
+    int scaled         = 0; // No scaling
+    int permute        = 0; // No permutation
+    int nbuf           = 0; // Buffer size (output)
+    int usecollective  = 0; // Use point-to-point communication
+    int usenonblocking = 0; // Use blocking point-to-point communication
 
     // Create MPI-aware FFT3d object
     BEGIN_HIDE_OUTPUT();
     fft = new FFT3d(lmp, MPI_COMM_WORLD, nfast, nmid, nslow, in_ilo, in_ihi, in_jlo, in_jhi, in_klo,
                     in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute,
-                    &nbuf, usecollective);
+                    &nbuf, usecollective, usenonblocking);
     END_HIDE_OUTPUT();
 
     ASSERT_NE(fft, nullptr);
@@ -799,16 +801,17 @@ TEST_F(FFT3DTest, RoundTrip_MPI_4proc_64x64x64)
     std::memset(output_data, 0, 2 * local_size * sizeof(FFT_SCALAR));
 
     // FFT parameters
-    int scaled        = 0; // No scaling
-    int permute       = 0; // No permutation
-    int nbuf          = 0; // Buffer size (output)
-    int usecollective = 0; // Use point-to-point communication
+    int scaled         = 0; // No scaling
+    int permute        = 0; // No permutation
+    int nbuf           = 0; // Buffer size (output)
+    int usecollective  = 0; // Use point-to-point communication
+    int usenonblocking = 0; // Use blocking point-to-point communication
 
     // Create MPI-aware FFT3d object
     BEGIN_HIDE_OUTPUT();
     fft = new FFT3d(lmp, MPI_COMM_WORLD, nfast, nmid, nslow, in_ilo, in_ihi, in_jlo, in_jhi, in_klo,
                     in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute,
-                    &nbuf, usecollective);
+                    &nbuf, usecollective, usenonblocking);
     END_HIDE_OUTPUT();
 
     ASSERT_NE(fft, nullptr);
diff --git a/unittest/utils/test_fft3d_kokkos.cpp b/unittest/utils/test_fft3d_kokkos.cpp
index a1b41b669f4..0cd5b723c9a 100644
--- a/unittest/utils/test_fft3d_kokkos.cpp
+++ b/unittest/utils/test_fft3d_kokkos.cpp
@@ -140,18 +140,19 @@ class FFT3DKokkosTest : public LAMMPSTest {
         int out_klo = 0, out_khi = nslow - 1;
 
         // FFT parameters
-        int scaled        = 0; // No scaling
-        int permute       = 0; // No permutation
-        int nbuf          = 0; // Buffer size (output)
-        int usecollective = 0; // Use point-to-point communication
-        int usegpu        = 0; // Let KOKKOS decide based on backend
+        int scaled         = 0; // No scaling
+        int permute        = 0; // No permutation
+        int nbuf           = 0; // Buffer size (output)
+        int usecollective  = 0; // Use point-to-point communication
+        int usenonblocking = 0; // Use blocking point-to-point communication
+        int usegpu         = 0; // Let KOKKOS decide based on backend
 
         // Create FFT3dKokkos object
         BEGIN_HIDE_OUTPUT();
         fft = new FFT3dKokkos<DeviceType>(lmp, MPI_COMM_WORLD, nfast, nmid, nslow, in_ilo, in_ihi,
                                           in_jlo, in_jhi, in_klo, in_khi, out_ilo, out_ihi, out_jlo,
                                           out_jhi, out_klo, out_khi, scaled, permute, &nbuf,
-                                          usecollective, usegpu);
+                                          usecollective, usenonblocking, usegpu);
         END_HIDE_OUTPUT();
 
         ASSERT_NE(fft, nullptr);
@@ -555,13 +556,13 @@ TEST_F(FFT3DKokkosTest, Threading_OpenMP_Concurrent)
         int out_jlo = in_jlo, out_jhi = in_jhi;
         int out_klo = in_klo, out_khi = in_khi;
         int scaled = 0, permute = 0, nbuf = 0;
-        int usecollective = 0, usegpu_aware = 0;
+        int usecollective = 0, usegpu_aware = 0, usenonblocking = 0;
 
         BEGIN_HIDE_OUTPUT();
         auto fft = new FFT3dKokkos<LMPDeviceType>(
             lmp, MPI_COMM_WORLD, grid_size, grid_size, grid_size, in_ilo, in_ihi, in_jlo, in_jhi,
             in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute,
-            &nbuf, usecollective, usegpu_aware);
+            &nbuf, usecollective, usenonblocking, usegpu_aware);
         END_HIDE_OUTPUT();
 
         ffts.push_back(fft);
@@ -657,13 +658,13 @@ TEST_F(FFT3DKokkosTest, Threading_Threads_Concurrent)
         int out_jlo = in_jlo, out_jhi = in_jhi;
         int out_klo = in_klo, out_khi = in_khi;
         int scaled = 0, permute = 0, nbuf = 0;
-        int usecollective = 0, usegpu_aware = 0;
+        int usecollective = 0, usegpu_aware = 0, usenonblocking = 0;
 
         BEGIN_HIDE_OUTPUT();
         auto fft = new FFT3dKokkos<LMPDeviceType>(
             lmp, MPI_COMM_WORLD, grid_size, grid_size, grid_size, in_ilo, in_ihi, in_jlo, in_jhi,
             in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute,
-            &nbuf, usecollective, usegpu_aware);
+            &nbuf, usecollective, usenonblocking, usegpu_aware);
         END_HIDE_OUTPUT();
 
         ffts.push_back(fft);
@@ -730,13 +731,13 @@ TEST_F(FFT3DKokkosTest, Threading_Safety)
     int out_jlo = in_jlo, out_jhi = in_jhi;
     int out_klo = in_klo, out_khi = in_khi;
     int scaled = 0, permute = 0, nbuf = 0;
-    int usecollective = 0, usegpu_aware = 0;
+    int usecollective = 0, usegpu_aware = 0, usenonblocking = 0;
 
     BEGIN_HIDE_OUTPUT();
     auto fft_device = new FFT3dKokkos<LMPDeviceType>(
         lmp, MPI_COMM_WORLD, grid_size, grid_size, grid_size, in_ilo, in_ihi, in_jlo, in_jhi,
         in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute,
-        &nbuf, usecollective, usegpu_aware);
+        &nbuf, usecollective, usenonblocking, usegpu_aware);
     END_HIDE_OUTPUT();
 
     ASSERT_NE(fft_device, nullptr);
@@ -880,14 +881,14 @@ TEST_F(FFT3DKokkosTest, RoundTrip_Kokkos_MPI_2proc_32x32x32)
 
     // FFT parameters
     int scaled = 0, permute = 0, nbuf = 0;
-    int usecollective = 0, usegpu_aware = 0;
+    int usecollective = 0, usegpu_aware = 0, usenonblocking = 0;
 
     // Create MPI-aware FFT3dKokkos object
     BEGIN_HIDE_OUTPUT();
     auto fft_mpi = new FFT3dKokkos<LMPDeviceType>(
         lmp, MPI_COMM_WORLD, grid_size, grid_size, grid_size, in_ilo, in_ihi, in_jlo, in_jhi,
         in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute,
-        &nbuf, usecollective, usegpu_aware);
+        &nbuf, usecollective, usenonblocking, usegpu_aware);
     END_HIDE_OUTPUT();
 
     ASSERT_NE(fft_mpi, nullptr);
@@ -1012,14 +1013,14 @@ TEST_F(FFT3DKokkosTest, RoundTrip_Kokkos_MPI_4proc_64x64x64)
 
     // FFT parameters
     int scaled = 0, permute = 0, nbuf = 0;
-    int usecollective = 0, usegpu_aware = 0;
+    int usecollective = 0, usegpu_aware = 0, usenonblocking = 0;
 
     // Create MPI-aware FFT3dKokkos object
     BEGIN_HIDE_OUTPUT();
     auto fft_mpi = new FFT3dKokkos<LMPDeviceType>(
         lmp, MPI_COMM_WORLD, grid_size, grid_size, grid_size, in_ilo, in_ihi, in_jlo, in_jhi,
         in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute,
-        &nbuf, usecollective, usegpu_aware);
+        &nbuf, usecollective, usenonblocking, usegpu_aware);
     END_HIDE_OUTPUT();
 
     ASSERT_NE(fft_mpi, nullptr);
@@ -1146,6 +1147,7 @@ TEST_F(FFT3DKokkosTest, RoundTrip_Kokkos_MPI_GPU_2proc)
     // FFT parameters (disable GPU-aware MPI for now)
     int scaled = 0, permute = 0, nbuf = 0;
     int usecollective = 0;
+    int usenonblocking = 0;
     int usegpu_aware  = 0; // Would check lmp->kokkos->gpu_aware_flag if KokkosLMP was complete
 
     // Create MPI+GPU FFT3dKokkos object
@@ -1153,7 +1155,7 @@ TEST_F(FFT3DKokkosTest, RoundTrip_Kokkos_MPI_GPU_2proc)
     auto fft_mpi = new FFT3dKokkos<LMPDeviceType>(
         lmp, MPI_COMM_WORLD, grid_size, grid_size, grid_size, in_ilo, in_ihi, in_jlo, in_jhi,
         in_klo, in_khi, out_ilo, out_ihi, out_jlo, out_jhi, out_klo, out_khi, scaled, permute,
-        &nbuf, usecollective, usegpu_aware);
+        &nbuf, usecollective, usenonblocking, usegpu_aware);
     END_HIDE_OUTPUT();
 
     ASSERT_NE(fft_mpi, nullptr);

From d145b1d6f2a4656d998f08380d7615818046d54c Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Wed, 3 Dec 2025 09:31:52 -0500
Subject: [PATCH 08/17] Fix bug in Kokkos Remap, where conditionals were
 ordered incorrectly

---
 src/KOKKOS/remap_kokkos.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp
index f9ab0f7f475..88f17eb40bd 100644
--- a/src/KOKKOS/remap_kokkos.cpp
+++ b/src/KOKKOS/remap_kokkos.cpp
@@ -210,15 +210,15 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
 
       int numpacked = 0;
       for (isend = 0; isend < plan->commringlen; isend++) {
-        if (plan->sendcnts[isend]) {
+        if (isend == plan->selfcommringloc && plan->self) {
+          numpacked++;
+        }
+        else if (plan->sendcnts[isend]) {
           plan->pack(d_in,plan->send_offset[numpacked],
                       plan->d_sendbuf,plan->sdispls[isend],
                       &plan->packplan[numpacked]);
           numpacked++;
         }
-        else if (plan->commringlist[isend] == me && plan->self) {
-          numpacked++;
-        }
       }
 
       if (!plan->usegpu_aware)
@@ -246,15 +246,15 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
 
       numpacked = 0;
       for (irecv = 0; irecv < plan->commringlen; irecv++) {
-        if (plan->rcvcnts[irecv]) {
+        if (irecv == plan->selfcommringloc && plan->self) {
+          numpacked++;
+        }
+        else if (plan->rcvcnts[irecv]) {
           plan->unpack(d_scratch,plan->rdispls[irecv],
                        d_out,plan->recv_offset[numpacked],
                        &plan->unpackplan[numpacked]);
           numpacked++;
         }
-        else if (plan->commringlist[irecv] == me && plan->self) {
-          numpacked++;
-        }
       }
     }
   }

From 9e348d5fd292b7069e762c63cc271c8c133e0d35 Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Wed, 3 Dec 2025 09:35:41 -0500
Subject: [PATCH 09/17] Replace Allreduce-based collective commringlist
 building with C++ set-based

---
 src/KSPACE/remap.cpp | 106 ++++++++++++++++++++-----------------------
 1 file changed, 49 insertions(+), 57 deletions(-)

diff --git a/src/KSPACE/remap.cpp b/src/KSPACE/remap.cpp
index 297705217e8..73db03cde83 100644
--- a/src/KSPACE/remap.cpp
+++ b/src/KSPACE/remap.cpp
@@ -15,6 +15,7 @@
 #include "remap.h"
 
 #include <cstdlib>
+#include <set>
 
 #define PACK_DATA FFT_SCALAR
 
@@ -149,15 +150,15 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf,
 
       int numpacked = 0;
       for (isend = 0; isend < plan->commringlen; isend++) {
-        if (plan->sendcnts[isend]) {
+        if (isend == plan->selfcommringloc && plan->self) {
+          numpacked++;
+        }
+        else if (plan->sendcnts[isend]) {
           plan->pack(&in[plan->send_offset[numpacked]],
                       &plan->sendbuf[plan->sdispls[isend]],
                       &plan->packplan[numpacked]);
           numpacked++;
         }
-        else if (plan->commringlist[isend] == me && plan->self) {
-          numpacked++;
-        }
       }
 
       MPI_Alltoallv(plan->sendbuf, plan->sendcnts, plan->sdispls,
@@ -179,15 +180,15 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf,
 
       numpacked = 0;
       for (irecv = 0; irecv < plan->commringlen; irecv++) {
-        if (plan->rcvcnts[irecv]) {
+        if (irecv == plan->selfcommringloc && plan->self) {
+          numpacked++;
+        }
+        else if (plan->rcvcnts[irecv]) {
           plan->unpack(&scratch[plan->rdispls[irecv]],
                        &out[plan->recv_offset[numpacked]],
                        &plan->unpackplan[numpacked]);
           numpacked++;
         }
-        else if (plan->commringlist[irecv] == me && plan->self) {
-          numpacked++;
-        }
       }
     }
   }
@@ -488,71 +489,62 @@ struct remap_plan_3d *remap_3d_create_plan(
 
     MPI_Comm_dup(comm,&plan->comm);
   } else {
-    // Improved approach - use an AllReduce to aggregate which ranks need to be included
-    // To do this, we build the local proc's send/receive list, then do an AllReduce
-    // to create the send/recv count for the Alltoallv
-
-    // local arrays to be used in the allreduce
-    // start with max length -- nprocs. Unused entries will be removed later
-
-    int *local_cnts = (int*) malloc(2*nprocs*sizeof(int));
-    if (local_cnts == nullptr) return nullptr;
-    int *local_sendcnts = local_cnts;
-    int *local_recvcnts = (local_cnts + nprocs);
-
-    // local arrays used to store the results of the allreduce
-
-    int *global_cnts = (int*) malloc(2*nprocs*sizeof(int));
-    if (global_cnts == nullptr) return nullptr;
-    int *global_sendcnts = global_cnts;
-    int *global_recvcnts = (global_cnts + nprocs);
-
-    // count send & recv collides, including self
+    int *commringlist;
+    int commringlen = 0;
+    // use a C++ set to organize the commringlist (C++17)
+    std::set<int> commringset;
 
     nsend = 0;
     nrecv = 0;
     for (i = 0; i < nprocs; i++) {
-      local_sendcnts[i] = remap_3d_collide(&in,&outarray[i],&overlap);
-      local_recvcnts[i] = remap_3d_collide(&out,&inarray[i],&overlap);
-      nsend += local_sendcnts[i];
-      nrecv += local_recvcnts[i];
+      if (remap_3d_collide(&in,&outarray[i],&overlap)) {
+        commringset.insert(i);
+        nsend++;
+      }
+      if (remap_3d_collide(&out,&inarray[i],&overlap)) {
+        commringset.insert(i);
+        nrecv++;
+      }
     }
 
-    // perform an AllReduce to get the counts from all other processors and build sendcnts list
-
-    MPI_Allreduce(local_cnts, global_cnts, 2*nprocs, MPI_INT, MPI_SUM, comm);
-
-    // now remove procs that are 0 in send or recv to create minimized sendcnts/recvcnts for AlltoAllv
-    // also builds commringlist -- which is already sorted
-
-    int *commringlist = (int*) malloc(nprocs * sizeof(int));
-    int commringlen = 0;
-
-    for (i = 0; i < nprocs; i++) {
-      if (global_sendcnts[i] > 0 || global_recvcnts[i] > 0) {
-        commringlist[commringlen] = i;
-        commringlen++;
+    int commringappend = 1;
+    while (commringappend) {
+      commringappend = 0;
+      for (int setproci : commringset) {
+        for (j = 0; j < nprocs; j++) {
+          // short-circuit if already in commring
+          if (commringset.find(j) != commringset.end())
+            continue;
+          if (remap_3d_collide(&inarray[setproci],&outarray[j],&overlap)) {
+            auto set_insert_result = commringset.insert(j);
+            if (set_insert_result.second) {
+              commringappend++;
+            }
+          }
+          if (remap_3d_collide(&outarray[setproci],&inarray[j],&overlap)) {
+            auto set_insert_result = commringset.insert(j);
+            if (set_insert_result.second) {
+              commringappend++;
+            }
+          }
+        }
       }
     }
 
-    // resize commringlist to final size
+    // build already-sorted commringlist as an array
+    commringlist = (int*) malloc(commringset.size() * sizeof(int));
+    commringlen = 0;
 
-    commringlist = (int *) realloc(commringlist, commringlen*sizeof(int));
+    for (int setproci : commringset) {
+      commringlist[commringlen] = setproci;
+      commringlen++;
+    }
 
     // set the plan->commringlist
 
     plan->commringlen = commringlen;
     plan->commringlist = commringlist;
 
-    // clean up local buffers that are finished
-
-    local_sendcnts = nullptr;
-    local_recvcnts = nullptr;
-    global_recvcnts = nullptr;
-    global_sendcnts = nullptr;
-    free(local_cnts);
-    free(global_cnts);
-
     // malloc space for send & recv info
     // if the current proc is involved in any way in the communication, allocate space
     // because of the Alltoallv, both send and recv have to be initialized even if

From 1eb68101262a0d75f839f20a228c36e529b126d8 Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Wed, 3 Dec 2025 10:26:04 -0500
Subject: [PATCH 10/17] Update remap in Kokkos KSPACE to use set instead of
 MPI_Allreduce

---
 src/KOKKOS/remap_kokkos.cpp | 90 +++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 48 deletions(-)

diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp
index 88f17eb40bd..63199e44f00 100644
--- a/src/KOKKOS/remap_kokkos.cpp
+++ b/src/KOKKOS/remap_kokkos.cpp
@@ -12,6 +12,8 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
+#include <set>
+
 #include "remap_kokkos.h"
 
 #include "error.h"
@@ -552,71 +554,63 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
 
     MPI_Comm_dup(comm,&plan->comm);
   } else {
-    // Improved approach - use an AllReduce to aggregate which ranks need to be included
-    // To do this, we build the local proc's send/receive list, then do an AllReduce
-    // to create the send/recv count for the Alltoallv
-
-    // local arrays to be used in the allreduce
-    // start with max length -- nprocs. Unused entries will be removed later
-
-    int *local_cnts = (int*) malloc(2*nprocs*sizeof(int));
-    if (local_cnts == nullptr) return nullptr;
-    int *local_sendcnts = local_cnts;
-    int *local_recvcnts = (local_cnts + nprocs);
 
-    // local arrays used to store the results of the allreduce
-
-    int *global_cnts = (int*) malloc(2*nprocs*sizeof(int));
-    if (global_cnts == nullptr) return nullptr;
-    int *global_sendcnts = global_cnts;
-    int *global_recvcnts = (global_cnts + nprocs);
-
-    // count send & recv collides, including self
+    int *commringlist;
+    int commringlen = 0;
+    // use a C++ set to organize the commringlist (C++17)
+    std::set<int> commringset;
 
     nsend = 0;
     nrecv = 0;
     for (i = 0; i < nprocs; i++) {
-      local_sendcnts[i] = remap_3d_collide(&in,&outarray[i],&overlap);
-      local_recvcnts[i] = remap_3d_collide(&out,&inarray[i],&overlap);
-      nsend += local_sendcnts[i];
-      nrecv += local_recvcnts[i];
+      if (remap_3d_collide(&in,&outarray[i],&overlap)) {
+        commringset.insert(i);
+        nsend++;
+      }
+      if (remap_3d_collide(&out,&inarray[i],&overlap)) {
+        commringset.insert(i);
+        nrecv++;
+      }
     }
 
-    // perform an AllReduce to get the counts from all other processors and build sendcnts list
-
-    MPI_Allreduce(local_cnts, global_cnts, 2*nprocs, MPI_INT, MPI_SUM, comm);
-
-    // now remove procs that are 0 in send or recv to create minimized sendcnts/recvcnts for AlltoAllv
-    // also builds commringlist -- which is already sorted
-
-    int *commringlist = (int*) malloc(nprocs * sizeof(int));
-    int commringlen = 0;
-
-    for (i = 0; i < nprocs; i++) {
-      if (global_sendcnts[i] > 0 || global_recvcnts[i] > 0) {
-        commringlist[commringlen] = i;
-        commringlen++;
+    int commringappend = 1;
+    while (commringappend) {
+      commringappend = 0;
+      for (int setproci : commringset) {
+        for (int j = 0; j < nprocs; j++) {
+          // short-circuit if already in commring
+          if (commringset.find(j) != commringset.end())
+            continue;
+          if (remap_3d_collide(&inarray[setproci],&outarray[j],&overlap)) {
+            auto set_insert_result = commringset.insert(j);
+            if (set_insert_result.second) {
+              commringappend++;
+            }
+          }
+          if (remap_3d_collide(&outarray[setproci],&inarray[j],&overlap)) {
+            auto set_insert_result = commringset.insert(j);
+            if (set_insert_result.second) {
+              commringappend++;
+            }
+          }
+        }
       }
     }
 
-    // resize commringlist to final size
+    // build already-sorted commringlist as an array
+    commringlist = (int*) malloc(commringset.size() * sizeof(int));
+    commringlen = 0;
 
-    commringlist = (int *) realloc(commringlist, commringlen*sizeof(int));
+    for (int setproci : commringset) {
+      commringlist[commringlen] = setproci;
+      commringlen++;
+    }
 
     // set the plan->commringlist
 
     plan->commringlen = commringlen;
     plan->commringlist = commringlist;
 
-    // clean up local buffers that are finished
-
-    local_sendcnts = nullptr;
-    local_recvcnts = nullptr;
-    global_recvcnts = nullptr;
-    global_sendcnts = nullptr;
-    free(local_cnts);
-    free(global_cnts);
-
     // malloc space for send & recv info
     // if the current proc is involved in any way in the communication, allocate space
     // because of the Alltoallv, both send and recv have to be initialized even if

From 1ea586c74e87aaa375027860f17a80e1c3ffee04 Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Wed, 3 Dec 2025 10:37:12 -0500
Subject: [PATCH 11/17] Added docs for nonblocking keyword for kspace_modify

---
 doc/src/kspace_modify.rst | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/doc/src/kspace_modify.rst b/doc/src/kspace_modify.rst
index b300213a3b1..3a0d3a99420 100644
--- a/doc/src/kspace_modify.rst
+++ b/doc/src/kspace_modify.rst
@@ -11,11 +11,12 @@ Syntax
    kspace_modify keyword value ...
 
 * one or more keyword/value pairs may be listed
-* keyword = *collective* or *compute* or *cutoff/adjust* or *diff* or *disp/auto* or *fftbench* or *force/disp/kspace* or *force/disp/real* or *force* or *gewald/disp* or *gewald* or *kmax/ewald* or *mesh* or *minorder* or *mix/disp* or *order/disp* or *order* or *overlap* or *scafacos* or *slab* or *splittol* or *wire*
+* keyword = *collective* or *nonblocking* or *compute* or *cutoff/adjust* or *diff* or *disp/auto* or *fftbench* or *force/disp/kspace* or *force/disp/real* or *force* or *gewald/disp* or *gewald* or *kmax/ewald* or *mesh* or *minorder* or *mix/disp* or *order/disp* or *order* or *overlap* or *scafacos* or *slab* or *splittol* or *wire*
 
   .. parsed-literal::
 
        *collective* value = *yes* or *no*
+       *nonblocking* value = *yes* or *no*
        *compute* value = *yes* or *no*
        *cutoff/adjust* value = *yes* or *no*
        *diff* value = *ad* or *ik* = 2 or 4 FFTs for PPPM in smoothed or non-smoothed mode
@@ -86,6 +87,16 @@ collective operations and adequate hardware.
 
 ----------
 
+The *nonblocking* keyword applies only to PPPM.  It is set to *no* by
+default. If this option is set to *yes*, LAMMPS will use non-blocking
+point-to-point MPI operations to remap data for 3d-FFT operations
+instead of the default blocking point-to-point communication. This
+allows for better utilization of full network bandwidth by overlapping
+communication to multiple other ranks at the same time, as well as
+overlapping receiving/unpacking data and sending data.
+
+----------
+
 The *compute* keyword allows Kspace computations to be turned off,
 even though a :doc:`kspace_style <kspace_style>` is defined.  This is
 not useful for running a real simulation, but can be useful for

From 0df30a38e9599c96352a6b8ec61939ca60261963 Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Wed, 3 Dec 2025 10:38:48 -0500
Subject: [PATCH 12/17] Added defaults and restriction for
 collective/nonblocking keywords

---
 doc/src/kspace_modify.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/doc/src/kspace_modify.rst b/doc/src/kspace_modify.rst
index 3a0d3a99420..efdf0605bba 100644
--- a/doc/src/kspace_modify.rst
+++ b/doc/src/kspace_modify.rst
@@ -451,7 +451,8 @@ parameters, see the :doc:`Howto dispersion <Howto_dispersion>` doc page.
 Restrictions
 """"""""""""
 
-none
+The *collective* and *nonblocking* keywords are mutually exclusive and
+cannot be enabled at the same time.
 
 Related commands
 """"""""""""""""
@@ -463,6 +464,8 @@ Default
 
 The option defaults are as follows:
 
+* collective = no
+* nonblocking = no
 * compute = yes
 * cutoff/adjust = yes (MSM)
 * diff = ik (PPPM)

From 9df185fc149c4ced1e99131c91935ed3e7f004c8 Mon Sep 17 00:00:00 2001
From: Nick Hagerty <hagertynl@ornl.gov>
Date: Tue, 9 Dec 2025 09:54:23 -0500
Subject: [PATCH 13/17] Removed plan->self usage from collectives, since it's
 faster for self to be included in the collective

---
 src/KOKKOS/remap_kokkos.cpp | 44 +++----------------------------------
 src/KOKKOS/remap_kokkos.h   |  3 ---
 src/KSPACE/remap.cpp        | 43 +++---------------------------------
 src/KSPACE/remap.h          |  3 ---
 4 files changed, 6 insertions(+), 87 deletions(-)

diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp
index 63199e44f00..ebdae5d9548 100644
--- a/src/KOKKOS/remap_kokkos.cpp
+++ b/src/KOKKOS/remap_kokkos.cpp
@@ -212,10 +212,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
 
       int numpacked = 0;
       for (isend = 0; isend < plan->commringlen; isend++) {
-        if (isend == plan->selfcommringloc && plan->self) {
-          numpacked++;
-        }
-        else if (plan->sendcnts[isend]) {
+        if (plan->sendcnts[isend]) {
           plan->pack(d_in,plan->send_offset[numpacked],
                       plan->d_sendbuf,plan->sdispls[isend],
                       &plan->packplan[numpacked]);
@@ -235,23 +232,9 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
       if (!plan->usegpu_aware)
         Kokkos::deep_copy(d_scratch,plan->h_scratch);
 
-      // copy in -> scratch -> out for self data
-
-      if (plan->self) {
-        plan->pack(d_in,plan->send_offset[plan->selfnsendloc],
-                  plan->d_sendbuf,plan->sdispls[plan->selfcommringloc],
-                  &plan->packplan[plan->selfnsendloc]);
-        plan->unpack(plan->d_sendbuf,plan->sdispls[plan->selfcommringloc],
-                    d_out,plan->recv_offset[plan->selfnrecvloc],
-                    &plan->unpackplan[plan->selfnrecvloc]);
-      }
-
       numpacked = 0;
       for (irecv = 0; irecv < plan->commringlen; irecv++) {
-        if (irecv == plan->selfcommringloc && plan->self) {
-          numpacked++;
-        }
-        else if (plan->rcvcnts[irecv]) {
+        if (plan->rcvcnts[irecv]) {
           plan->unpack(d_scratch,plan->rdispls[irecv],
                        d_out,plan->recv_offset[numpacked],
                        &plan->unpackplan[numpacked]);
@@ -620,10 +603,6 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
 
       // send space
 
-      plan->selfcommringloc = -1;
-      plan->selfnsendloc = -1;
-      plan->selfnrecvloc = -1;
-
       plan->nsend = nsend;
       plan->pack = PackKokkos<DeviceType>::pack_3d;
 
@@ -688,10 +667,6 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
     int total_send_size = 0;
     for (i = 0; i < plan->commringlen; i++) {
       iproc = plan->commringlist[i];
-      if (iproc == me) {
-        plan->selfcommringloc = i;
-        plan->selfnsendloc = nsend;
-      }
       if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
         //plan->send_proc[nsend] = i;
         // number of entries required for this pack's 3-d coords
@@ -730,11 +705,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
 
     for (i = 0; i < plan->commringlen; i++) {
       iproc = plan->commringlist[i];
-      if (iproc == me) {
-        plan->selfnrecvloc = nrecv;
-      }
       if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
-
         if (permute == 0) {
           plan->recv_offset[nrecv] = nqty *
             ((overlap.klo-out.klo)*out.jsize*out.isize +
@@ -784,16 +755,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
     // init remaining fields in remap plan
 
     plan->memory = memory;
-
-    if (plan->sendcnts[plan->selfcommringloc]) {
-      plan->self = 1;
-      plan->sendcnts[plan->selfcommringloc] = 0;
-      plan->rcvcnts[plan->selfcommringloc] = 0;
-    }
-    else {
-      plan->self = 0;
-    }
-
+    plan->self = 0;
 
     // if requested, allocate internal scratch space for recvs,
     // only need it if I will receive any data (including self)
diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h
index a6a15bbf098..6f2f4b4344e 100644
--- a/src/KOKKOS/remap_kokkos.h
+++ b/src/KOKKOS/remap_kokkos.h
@@ -65,9 +65,6 @@ struct remap_plan_3d_kokkos {
   int *rcvcnts;                     // # of elements in recv buffer for each rank
   int *sdispls;                     // extraction location in send buffer for each rank
   int *rdispls;                     // extraction location in recv buffer for each rank
-  int selfcommringloc;              // current proc's location in commringlist
-  int selfnsendloc;                 // current proc's location in send lists
-  int selfnrecvloc;                 // current proc's location in recv lists
 };
 
 template<class DeviceType>
diff --git a/src/KSPACE/remap.cpp b/src/KSPACE/remap.cpp
index 73db03cde83..3b925126168 100644
--- a/src/KSPACE/remap.cpp
+++ b/src/KSPACE/remap.cpp
@@ -150,10 +150,7 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf,
 
       int numpacked = 0;
       for (isend = 0; isend < plan->commringlen; isend++) {
-        if (isend == plan->selfcommringloc && plan->self) {
-          numpacked++;
-        }
-        else if (plan->sendcnts[isend]) {
+        if (plan->sendcnts[isend]) {
           plan->pack(&in[plan->send_offset[numpacked]],
                       &plan->sendbuf[plan->sdispls[isend]],
                       &plan->packplan[numpacked]);
@@ -165,25 +162,11 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf,
                     MPI_FFT_SCALAR, scratch, plan->rcvcnts,
                     plan->rdispls, MPI_FFT_SCALAR, plan->comm);
 
-      // copy in -> scratch -> out for self data
-
-      if (plan->self) {
-        plan->pack(&in[plan->send_offset[plan->selfnsendloc]],
-                  &plan->sendbuf[plan->sdispls[plan->selfcommringloc]],
-                  &plan->packplan[plan->selfnsendloc]);
-        plan->unpack(&plan->sendbuf[plan->sdispls[plan->selfcommringloc]],
-                    &out[plan->recv_offset[plan->selfnrecvloc]],
-                    &plan->unpackplan[plan->selfnrecvloc]);
-      }
-
       // unpack the data from the recv buffer into out
 
       numpacked = 0;
       for (irecv = 0; irecv < plan->commringlen; irecv++) {
-        if (irecv == plan->selfcommringloc && plan->self) {
-          numpacked++;
-        }
-        else if (plan->rcvcnts[irecv]) {
+        if (plan->rcvcnts[irecv]) {
           plan->unpack(&scratch[plan->rdispls[irecv]],
                        &out[plan->recv_offset[numpacked]],
                        &plan->unpackplan[numpacked]);
@@ -554,10 +537,6 @@ struct remap_plan_3d *remap_3d_create_plan(
 
       // send space
 
-      plan->selfcommringloc = -1;
-      plan->selfnsendloc = -1;
-      plan->selfnrecvloc = -1;
-
       plan->nsend = nsend;
       plan->pack = pack_3d;
 
@@ -622,10 +601,6 @@ struct remap_plan_3d *remap_3d_create_plan(
     int total_send_size = 0;
     for (i = 0; i < plan->commringlen; i++) {
       iproc = plan->commringlist[i];
-      if (iproc == me) {
-        plan->selfcommringloc = i;
-        plan->selfnsendloc = nsend;
-      }
       if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
         // number of entries required for this pack's 3-d coords
         plan->send_offset[nsend] = nqty *
@@ -663,11 +638,7 @@ struct remap_plan_3d *remap_3d_create_plan(
 
     for (i = 0; i < plan->commringlen; i++) {
       iproc = plan->commringlist[i];
-      if (iproc == me) {
-        plan->selfnrecvloc = nrecv;
-      }
       if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
-
         if (permute == 0) {
           plan->recv_offset[nrecv] = nqty *
             ((overlap.klo-out.klo)*out.jsize*out.isize +
@@ -717,15 +688,7 @@ struct remap_plan_3d *remap_3d_create_plan(
     // init remaining fields in remap plan
 
     plan->memory = memory;
-
-    if (plan->sendcnts[plan->selfcommringloc]) {
-      plan->self = 1;
-      plan->sendcnts[plan->selfcommringloc] = 0;
-      plan->rcvcnts[plan->selfcommringloc] = 0;
-    }
-    else {
-      plan->self = 0;
-    }
+    plan->self = 0;
 
     // if requested, allocate internal scratch space for recvs,
     // only need it if I will receive any data (including self)
diff --git a/src/KSPACE/remap.h b/src/KSPACE/remap.h
index c171bbe5a3e..057c8e8824c 100644
--- a/src/KSPACE/remap.h
+++ b/src/KSPACE/remap.h
@@ -49,9 +49,6 @@ struct remap_plan_3d {
   int *rcvcnts;                     // # of elements in recv buffer for each rank
   int *sdispls;                     // extraction location in send buffer for each rank
   int *rdispls;                     // extraction location in recv buffer for each rank
-  int selfcommringloc;              // current proc's location in commringlist
-  int selfnsendloc;                 // current proc's location in send lists
-  int selfnrecvloc;                 // current proc's location in recv lists
 };
 
 // collision between 2 regions

From f89472bc886379e7b1bb8dbed8868270949b40ed Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 10 Dec 2025 09:20:31 -0700
Subject: [PATCH 14/17] Rename flag

---
 src/KOKKOS/fft3d_kokkos.cpp | 16 ++++++++--------
 src/KOKKOS/pppm_kokkos.cpp  |  8 ++++----
 src/KOKKOS/remap_kokkos.cpp | 22 +++++++++++-----------
 src/KOKKOS/remap_kokkos.h   |  4 ++--
 src/KSPACE/fft3d.cpp        |  8 ++++----
 src/KSPACE/fft3d_wrap.cpp   |  4 ++--
 src/KSPACE/pppm.cpp         |  6 +++---
 src/KSPACE/pppm_dipole.cpp  |  6 +++---
 src/KSPACE/pppm_disp.cpp    | 12 ++++++------
 src/KSPACE/remap.cpp        | 18 +++++++++---------
 src/KSPACE/remap.h          |  4 ++--
 src/KSPACE/remap_wrap.cpp   |  4 ++--
 src/kspace.cpp              |  6 +++---
 src/kspace.h                |  2 +-
 14 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp
index 99642e27f47..df1380c8853 100644
--- a/src/KOKKOS/fft3d_kokkos.cpp
+++ b/src/KOKKOS/fft3d_kokkos.cpp
@@ -35,7 +35,7 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
              int out_ilo, int out_ihi, int out_jlo, int out_jhi,
              int out_klo, int out_khi,
              int scaled, int permute, int *nbuf, int usecollective,
-             int useisend, int usegpu_aware) :
+             int usenonblocking, int usegpu_aware) :
   Pointers(lmp)
 {
   int nthreads = lmp->kokkos->nthreads;
@@ -81,7 +81,7 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
   plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow,
                             in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                             out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
-                            scaled,permute,nbuf,usecollective,useisend,nthreads,usegpu_aware);
+                            scaled,permute,nbuf,usecollective,usenonblocking,nthreads,usegpu_aware);
   if (plan == nullptr) error->one(FLERR,"Could not create 3d FFT plan");
 }
 
@@ -400,7 +400,7 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in,
                           2 = permute twice = slow->fast, fast->mid, mid->slow
    nbuf                 returns size of internal storage buffers used by FFT
    usecollective        use collective MPI operations for remapping data
-   useisend             when using point-to-point MPI, use MPI_Isend
+   usenonblocking       when using point-to-point MPI, use MPI_Isend
    usegpu_aware         use GPU-Aware MPI or not
 ------------------------------------------------------------------------- */
 
@@ -412,7 +412,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
        int out_ilo, int out_ihi, int out_jlo, int out_jhi,
        int out_klo, int out_khi,
        int scaled, int permute, int *nbuf, int usecollective,
-       int useisend, int nthreads, int usegpu_aware)
+       int usenonblocking, int nthreads, int usegpu_aware)
 {
   struct fft_plan_3d_kokkos<DeviceType> *plan;
   int me,nprocs;
@@ -469,7 +469,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
       remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                            first_ilo,first_ihi,first_jlo,first_jhi,
                            first_klo,first_khi,2,0,0,FFT_PRECISION,
-                           usecollective,useisend,usegpu_aware);
+                           usecollective,usenonblocking,usegpu_aware);
     if (plan->pre_plan == nullptr) return nullptr;
   }
 
@@ -494,7 +494,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
                            first_klo,first_khi,
                            second_ilo,second_ihi,second_jlo,second_jhi,
                            second_klo,second_khi,2,1,0,FFT_PRECISION,
-                           usecollective,useisend,usegpu_aware);
+                           usecollective,usenonblocking,usegpu_aware);
   if (plan->mid1_plan == nullptr) return nullptr;
 
   // 1d FFTs along mid axis
@@ -535,7 +535,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
                          second_ilo,second_ihi,
                          third_jlo,third_jhi,third_klo,third_khi,
                          third_ilo,third_ihi,2,1,0,FFT_PRECISION,
-                         usecollective,useisend,usegpu_aware);
+                         usecollective,usenonblocking,usegpu_aware);
   if (plan->mid2_plan == nullptr) return nullptr;
 
   // 1d FFTs along slow axis
@@ -563,7 +563,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
                            third_jlo,third_jhi,
                            out_klo,out_khi,out_ilo,out_ihi,
                            out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,
-                           usecollective,useisend,usegpu_aware);
+                           usecollective,usenonblocking,usegpu_aware);
     if (plan->post_plan == nullptr) return nullptr;
   }
 
diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp
index 6ed541738e1..df00378eac1 100644
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@@ -833,23 +833,23 @@ void PPPMKokkos<DeviceType>::allocate()
   // remap takes data from 3d brick to FFT decomposition
 
   int collective_flag = force->kspace->collective_flag;
-  int isend_flag = force->kspace->isend_flag;
+  int nonblocking_flag = force->kspace->nonblocking_flag;
   int gpu_aware_flag = lmp->kokkos->gpu_aware_flag;
   int tmp;
 
   fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                          nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                          nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                         0,0,&tmp,collective_flag,isend_flag,gpu_aware_flag);
+                         0,0,&tmp,collective_flag,nonblocking_flag,gpu_aware_flag);
 
   fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                          nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                          nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                         0,0,&tmp,collective_flag,isend_flag,gpu_aware_flag);
+                         0,0,&tmp,collective_flag,nonblocking_flag,gpu_aware_flag);
   remap = new RemapKokkos<DeviceType>(lmp,world,
                           nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                           nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                          1,0,0,FFT_PRECISION,collective_flag,isend_flag,gpu_aware_flag);
+                          1,0,0,FFT_PRECISION,collective_flag,nonblocking_flag,gpu_aware_flag);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp
index ebdae5d9548..9530dc626b9 100644
--- a/src/KOKKOS/remap_kokkos.cpp
+++ b/src/KOKKOS/remap_kokkos.cpp
@@ -37,13 +37,13 @@ RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp, MPI_Comm comm,
              int out_klo, int out_khi,
              int nqty, int permute, int memory,
              int precision, int usecollective,
-             int useisend, int usegpu_aware) : Pointers(lmp)
+             int usenonblocking, int usegpu_aware) : Pointers(lmp)
 {
   plan = remap_3d_create_plan_kokkos(comm,
                               in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                               out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
                               nqty,permute,memory,precision,usecollective,
-                              useisend, usegpu_aware);
+                              usenonblocking, usegpu_aware);
   if (plan == nullptr) error->one(FLERR,"Could not create 3d remap plan");
 }
 
@@ -144,7 +144,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
 
     for (isend = 0; isend < plan->nsend; isend++) {
       int in_offset = plan->send_offset[isend];
-      if (plan->useisend) {
+      if (plan->usenonblocking) {
         plan->pack(d_in,in_offset,
                   plan->d_sendbuf,plan->send_bufloc[isend],
                   &plan->packplan[isend]);
@@ -157,7 +157,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
       if (!plan->usegpu_aware)
         Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);
 
-      if (plan->useisend) {
+      if (plan->usenonblocking) {
         MPI_Isend(v_sendbuf + plan->send_bufloc[isend],plan->send_size[isend],MPI_FFT_SCALAR,
                 plan->send_proc[isend],0,plan->comm,&plan->isend_reqs[isend]);
       } else {
@@ -198,7 +198,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
                    d_out,out_offset,&plan->unpackplan[irecv]);
     }
 
-    if (plan->useisend) {
+    if (plan->usenonblocking) {
       // finally, wait for all Isends to be done
       MPI_Waitall(plan->nsend,plan->isend_reqs,MPI_STATUS_IGNORE);
     }
@@ -268,7 +268,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
                           1 = single precision (4 bytes per datum)
                           2 = double precision (8 bytes per datum)
    usecollective        whether to use collective MPI or point-to-point
-   useisend             when using point-to-point MPI, use non-blocking MPI_Isend
+   usenonblocking       when using point-to-point MPI, use non-blocking MPI_Isend
    usegpu_aware         whether to use GPU-Aware MPI or not
 ------------------------------------------------------------------------- */
 
@@ -280,7 +280,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
   int out_ilo, int out_ihi, int out_jlo, int out_jhi,
   int out_klo, int out_khi,
   int nqty, int permute, int memory, int /*precision*/,
-  int usecollective, int useisend, int usegpu_aware)
+  int usecollective, int usenonblocking, int usegpu_aware)
 {
 
   struct remap_plan_3d_kokkos<DeviceType> *plan;
@@ -298,7 +298,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
   plan = new struct remap_plan_3d_kokkos<DeviceType>;
   if (plan == nullptr) return nullptr;
   plan->usecollective = usecollective;
-  plan->useisend = useisend;
+  plan->usenonblocking = usenonblocking;
   plan->usegpu_aware = usegpu_aware;
 
   // store parameters in local data structs
@@ -363,7 +363,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
       plan->packplan = (struct pack_plan_3d *)
         malloc(nsend*sizeof(struct pack_plan_3d));
 
-      if (plan->useisend)
+      if (plan->usenonblocking)
         plan->isend_reqs = (MPI_Request *) malloc(nsend*sizeof(MPI_Request));
         plan->send_bufloc = (int *) malloc(nsend*sizeof(int));
         if (plan->send_bufloc == nullptr) return nullptr;
@@ -509,7 +509,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
     // find biggest send message (not including self) and malloc space for it
 
     size = 0;
-    if (plan->useisend) {
+    if (plan->usenonblocking) {
       for (nsend = 0; nsend < plan->nsend; nsend++)
         size += plan->send_size[nsend];
     } else {
@@ -839,7 +839,7 @@ void RemapKokkos<DeviceType>::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_
       free(plan->send_size);
       free(plan->send_proc);
       free(plan->packplan);
-      if (plan->useisend) {
+      if (plan->usenonblocking) {
         free(plan->isend_reqs);
         free(plan->send_bufloc);
       }
diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h
index 6f2f4b4344e..6701f1afea2 100644
--- a/src/KOKKOS/remap_kokkos.h
+++ b/src/KOKKOS/remap_kokkos.h
@@ -40,7 +40,7 @@ struct remap_plan_3d_kokkos {
   int *send_offset;                 // extraction loc for each send
   int *send_size;                   // size of each send message
   int *send_proc;                   // proc to send each message to
-  int *send_bufloc;                 // if useisend, offset in send buf for each isend
+  int *send_bufloc;                 // if usenonblocking, offset in send buf for each isend
   struct pack_plan_3d *packplan;    // pack plan for each send message
   int *recv_offset;                 // insertion loc for each recv
   int *recv_size;                   // size of each recv message
@@ -56,7 +56,7 @@ struct remap_plan_3d_kokkos {
   int memory;                       // user provides scratch space or not
   MPI_Comm comm;                    // group of procs performing remap
   int usecollective;                // use collective or point-to-point MPI
-  int useisend;                     // if using point-to-point MPI, use MPI_Isend
+  int usenonblocking;               // if using point-to-point MPI, use MPI_Isend
   int usegpu_aware;                 // use GPU-Aware MPI or not
   // variables for collective MPI only
   int commringlen;                  // length of commringlist
diff --git a/src/KSPACE/fft3d.cpp b/src/KSPACE/fft3d.cpp
index f066c7db656..e5063616625 100644
--- a/src/KSPACE/fft3d.cpp
+++ b/src/KSPACE/fft3d.cpp
@@ -241,7 +241,7 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan)
                           2 = permute twice = slow->fast, fast->mid, mid->slow
    nbuf                 returns size of internal storage buffers used by FFT
    usecollective        use collective MPI operations for remapping data
-   useisend             use non-blocking or blocking MPI pt2pt operations for remapping data
+   usenonblocking       use non-blocking or blocking MPI pt2pt operations for remapping data
 ------------------------------------------------------------------------- */
 
 struct fft_plan_3d *fft_3d_create_plan(
@@ -250,7 +250,7 @@ struct fft_plan_3d *fft_3d_create_plan(
        int in_klo, int in_khi,
        int out_ilo, int out_ihi, int out_jlo, int out_jhi,
        int out_klo, int out_khi,
-       int scaled, int permute, int *nbuf, int usecollective, int useisend)
+       int scaled, int permute, int *nbuf, int usecollective, int usenonblocking)
 {
   struct fft_plan_3d *plan;
   int me,nprocs;
@@ -340,7 +340,7 @@ struct fft_plan_3d *fft_3d_create_plan(
   plan->mid1_plan = remap_3d_create_plan(comm, first_ilo,first_ihi,first_jlo,first_jhi,
                                          first_klo,first_khi,second_ilo,second_ihi,
                                          second_jlo,second_jhi,second_klo,second_khi,
-                                         2,1,0,FFT_PRECISION,usecollective,useisend);
+                                         2,1,0,FFT_PRECISION,usecollective,usenonblocking);
   if (plan->mid1_plan == nullptr) return nullptr;
 
   // 1d FFTs along mid axis
@@ -381,7 +381,7 @@ struct fft_plan_3d *fft_3d_create_plan(
                          second_jlo,second_jhi,second_klo,second_khi,
                          second_ilo,second_ihi,
                          third_jlo,third_jhi,third_klo,third_khi,
-                         third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective,useisend);
+                         third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective,usenonblocking);
   if (plan->mid2_plan == nullptr) return nullptr;
 
   // 1d FFTs along slow axis
diff --git a/src/KSPACE/fft3d_wrap.cpp b/src/KSPACE/fft3d_wrap.cpp
index 44d482a721b..0fe364ac25f 100644
--- a/src/KSPACE/fft3d_wrap.cpp
+++ b/src/KSPACE/fft3d_wrap.cpp
@@ -25,13 +25,13 @@ FFT3d::FFT3d(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow,
              int in_klo, int in_khi,
              int out_ilo, int out_ihi, int out_jlo, int out_jhi,
              int out_klo, int out_khi,
-             int scaled, int permute, int *nbuf, int usecollective, int useisend) : Pointers(lmp)
+             int scaled, int permute, int *nbuf, int usecollective, int usenonblocking) : Pointers(lmp)
 {
   #ifndef FFT_HEFFTE
   plan = fft_3d_create_plan(comm,nfast,nmid,nslow,
                             in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                             out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
-                            scaled,permute,nbuf,usecollective,useisend);
+                            scaled,permute,nbuf,usecollective,usenonblocking);
   if (plan == nullptr) error->one(FLERR,"Could not create 3d FFT plan");
   #else
   heffte::plan_options options = heffte::default_options<heffte_backend>();
diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp
index 46232b13533..c829e04117f 100644
--- a/src/KSPACE/pppm.cpp
+++ b/src/KSPACE/pppm.cpp
@@ -837,17 +837,17 @@ void PPPM::allocate()
   fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   0,0,&tmp,collective_flag,isend_flag);
+                   0,0,&tmp,collective_flag,nonblocking_flag);
 
   fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                   0,0,&tmp,collective_flag,isend_flag);
+                   0,0,&tmp,collective_flag,nonblocking_flag);
 
   remap = new Remap(lmp,world,
                     nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                     nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                    1,0,0,FFT_PRECISION,collective_flag,isend_flag);
+                    1,0,0,FFT_PRECISION,collective_flag,nonblocking_flag);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KSPACE/pppm_dipole.cpp b/src/KSPACE/pppm_dipole.cpp
index 193d66f8264..979f14a3659 100644
--- a/src/KSPACE/pppm_dipole.cpp
+++ b/src/KSPACE/pppm_dipole.cpp
@@ -622,17 +622,17 @@ void PPPMDipole::allocate()
   fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   0,0,&tmp,collective_flag,isend_flag);
+                   0,0,&tmp,collective_flag,nonblocking_flag);
 
   fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                   0,0,&tmp,collective_flag,isend_flag);
+                   0,0,&tmp,collective_flag,nonblocking_flag);
 
   remap = new Remap(lmp,world,
                     nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                     nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                    1,0,0,FFT_PRECISION,collective_flag,isend_flag);
+                    1,0,0,FFT_PRECISION,collective_flag,nonblocking_flag);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp
index cf33931ac84..ae23ee873cd 100644
--- a/src/KSPACE/pppm_disp.cpp
+++ b/src/KSPACE/pppm_disp.cpp
@@ -1771,17 +1771,17 @@ void _noopt PPPMDisp::allocate()
     fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                      nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                      nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                     0,0,&tmp,collective_flag,isend_flag);
+                     0,0,&tmp,collective_flag,nonblocking_flag);
 
     fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
                      nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                      nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                     0,0,&tmp,collective_flag,isend_flag);
+                     0,0,&tmp,collective_flag,nonblocking_flag);
 
     remap = new Remap(lmp,world,
                       nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
                       nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                      1,0,0,FFT_PRECISION,collective_flag,isend_flag);
+                      1,0,0,FFT_PRECISION,collective_flag,nonblocking_flag);
   }
 
   // --------------------------------------
@@ -1848,19 +1848,19 @@ void _noopt PPPMDisp::allocate()
       new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6,
                 nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6,
                 nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6,
-                0,0,&tmp,collective_flag,isend_flag);
+                0,0,&tmp,collective_flag,nonblocking_flag);
 
     fft2_6 =
       new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6,
                 nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6,
                 nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6,
-                0,0,&tmp,collective_flag,isend_flag);
+                0,0,&tmp,collective_flag,nonblocking_flag);
 
     remap_6 =
       new Remap(lmp,world,
                 nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6,
                 nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6,
-                1,0,0,FFT_PRECISION,collective_flag,isend_flag);
+                1,0,0,FFT_PRECISION,collective_flag,nonblocking_flag);
   }
 
   // --------------------------------------
diff --git a/src/KSPACE/remap.cpp b/src/KSPACE/remap.cpp
index 3b925126168..03e467ca9e8 100644
--- a/src/KSPACE/remap.cpp
+++ b/src/KSPACE/remap.cpp
@@ -88,7 +88,7 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf,
 
     for (isend = 0; isend < plan->nsend; isend++) {
       int in_offset = plan->send_offset[isend];
-      if (plan->useisend) {
+      if (plan->usenonblocking) {
         plan->pack(&in[in_offset],
                   &plan->sendbuf[plan->send_bufloc[isend]],
                   &plan->packplan[isend]);
@@ -98,7 +98,7 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf,
                   &plan->packplan[isend]);
       }
 
-      if (plan->useisend) {
+      if (plan->usenonblocking) {
         MPI_Isend(plan->sendbuf + plan->send_bufloc[isend],plan->send_size[isend],MPI_FFT_SCALAR,
                 plan->send_proc[isend],0,plan->comm,&plan->isend_reqs[isend]);
       } else {
@@ -136,7 +136,7 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf,
                    &out[out_offset],&plan->unpackplan[irecv]);
     }
 
-    if (plan->useisend) {
+    if (plan->usenonblocking) {
       // finally, wait for all Isends to be done
       MPI_Waitall(plan->nsend,plan->isend_reqs,MPI_STATUS_IGNORE);
     }
@@ -200,7 +200,7 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf,
                           1 = single precision (4 bytes per datum)
                           2 = double precision (8 bytes per datum)
    usecollective        whether to use collective MPI or point-to-point
-   useisend             whether to use non-blocking or blocking MPI point-to-point
+   usenonblocking       whether to use non-blocking or blocking MPI point-to-point
 ------------------------------------------------------------------------- */
 
 struct remap_plan_3d *remap_3d_create_plan(
@@ -209,7 +209,7 @@ struct remap_plan_3d *remap_3d_create_plan(
   int in_klo, int in_khi,
   int out_ilo, int out_ihi, int out_jlo, int out_jhi,
   int out_klo, int out_khi, int nqty, int permute,
-  int memory, int /*precision*/, int usecollective, int useisend)
+  int memory, int /*precision*/, int usecollective, int usenonblocking)
 
 {
 
@@ -228,7 +228,7 @@ struct remap_plan_3d *remap_3d_create_plan(
   plan = (struct remap_plan_3d *) malloc(sizeof(struct remap_plan_3d));
   if (plan == nullptr) return nullptr;
   plan->usecollective = usecollective;
-  plan->useisend = useisend;
+  plan->usenonblocking = usenonblocking;
 
   // store parameters in local data structs
 
@@ -300,7 +300,7 @@ struct remap_plan_3d *remap_3d_create_plan(
       plan->packplan = (struct pack_plan_3d *)
         malloc(nsend*sizeof(struct pack_plan_3d));
 
-      if (plan->useisend)
+      if (plan->usenonblocking)
         plan->isend_reqs = (MPI_Request *) malloc(nsend*sizeof(MPI_Request));
         plan->send_bufloc = (int *) malloc(nsend*sizeof(int));
         if (plan->send_bufloc == nullptr) return nullptr;
@@ -445,7 +445,7 @@ struct remap_plan_3d *remap_3d_create_plan(
     // find biggest send message (not including self) and malloc space for it
 
     size = 0;
-    if (plan->useisend) {
+    if (plan->usenonblocking) {
       for (nsend = 0; nsend < plan->nsend; nsend++)
         size += plan->send_size[nsend];
     } else {
@@ -770,7 +770,7 @@ void remap_3d_destroy_plan(struct remap_plan_3d *plan)
       free(plan->send_size);
       free(plan->send_proc);
       free(plan->packplan);
-      if (plan->useisend) {
+      if (plan->usenonblocking) {
         free(plan->isend_reqs);
         free(plan->send_bufloc);
       }
diff --git a/src/KSPACE/remap.h b/src/KSPACE/remap.h
index 057c8e8824c..8bc73057f0f 100644
--- a/src/KSPACE/remap.h
+++ b/src/KSPACE/remap.h
@@ -27,7 +27,7 @@ struct remap_plan_3d {
   int *send_offset;                   // extraction loc for each send
   int *send_size;                     // size of each send message
   int *send_proc;                     // proc to send each message to
-  int *send_bufloc;                   // if useisend, offset in send buf for each isend
+  int *send_bufloc;                   // if usenonblocking, offset in send buf for each isend
   MPI_Request *isend_reqs;            // MPI request for each posted isend
   struct pack_plan_3d *packplan;      // pack plan for each send message
   int *recv_offset;                   // insertion loc for each recv
@@ -42,7 +42,7 @@ struct remap_plan_3d {
   int memory;                         // user provides scratch space or not
   MPI_Comm comm;                      // group of procs performing remap
   int usecollective;                  // use collective or point-to-point MPI
-  int useisend;                       // if using point-to-point MPI, use MPI_Isend
+  int usenonblocking;                 // if using point-to-point MPI, use MPI_Isend
   int commringlen;                    // length of commringlist
   int *commringlist;                  // ranks on communication ring of this plan
   int *sendcnts;                    // # of elements in send buffer for each rank
diff --git a/src/KSPACE/remap_wrap.cpp b/src/KSPACE/remap_wrap.cpp
index 31bf6af910e..40651381142 100644
--- a/src/KSPACE/remap_wrap.cpp
+++ b/src/KSPACE/remap_wrap.cpp
@@ -26,12 +26,12 @@ Remap::Remap(LAMMPS *lmp, MPI_Comm comm,
              int out_ilo, int out_ihi, int out_jlo, int out_jhi,
              int out_klo, int out_khi,
              int nqty, int permute, int memory,
-             int precision, int usecollective, int useisend) : Pointers(lmp)
+             int precision, int usecollective, int usenonblocking) : Pointers(lmp)
 {
   plan = remap_3d_create_plan(comm,
                               in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
                               out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
-                              nqty,permute,memory,precision,usecollective,useisend);
+                              nqty,permute,memory,precision,usecollective,usenonblocking);
   if (plan == nullptr) error->one(FLERR,"Could not create 3d remap plan");
 }
 
diff --git a/src/kspace.cpp b/src/kspace.cpp
index 9b4bf478bfb..249e73b7ae8 100644
--- a/src/kspace.cpp
+++ b/src/kspace.cpp
@@ -60,7 +60,7 @@ KSpace::KSpace(LAMMPS *lmp) :
 #else
   collective_flag = 0;
 #endif
-  isend_flag = 0;
+  nonblocking_flag = 0;
 
   kewaldflag = 0;
 
@@ -554,7 +554,7 @@ void KSpace::modify_params(int narg, char **arg)
       iarg += 2;
     } else if (strcmp(arg[iarg],"nonblocking") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command");
-      isend_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
+      nonblocking_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
       iarg += 2;
     } else if (strcmp(arg[iarg],"diff") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command");
@@ -613,7 +613,7 @@ void KSpace::modify_params(int narg, char **arg)
       iarg += n;
     }
   }
-  if (collective_flag > 0 && isend_flag > 0) error->all(FLERR,"Illegal kspace_modify command, collective and nonblocking cannot both be true.");
+  if (collective_flag > 0 && nonblocking_flag > 0) error->all(FLERR,"Illegal kspace_modify command, collective and nonblocking cannot both be true.");
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/kspace.h b/src/kspace.h
index 9712f91aa32..eff18cf7960 100644
--- a/src/kspace.h
+++ b/src/kspace.h
@@ -131,7 +131,7 @@ class KSpace : protected Pointers {
   int compute_flag;       // 0 if skip compute()
   int fftbench;           // 0 if skip FFT timing
   int collective_flag;    // 1 if use MPI collectives for FFT/remap
-  int isend_flag;         // 1 if use MPI_Isend for FFT/remap
+  int nonblocking_flag;   // 1 if use MPI_Isend for FFT/remap
   int stagger_flag;       // 1 if using staggered PPPM grids
 
   double splittol;    // tolerance for when to truncate splitting

From b0e61971a9d0a6dd9e8fd26d610dec79f79125fc Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Wed, 10 Dec 2025 12:22:07 -0500
Subject: [PATCH 15/17] simplify handling of collective and nonblocking
 exclusivity

---
 doc/src/kspace_modify.rst | 5 +++--
 src/kspace.cpp            | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/doc/src/kspace_modify.rst b/doc/src/kspace_modify.rst
index efdf0605bba..4336b557024 100644
--- a/doc/src/kspace_modify.rst
+++ b/doc/src/kspace_modify.rst
@@ -451,8 +451,9 @@ parameters, see the :doc:`Howto dispersion <Howto_dispersion>` doc page.
 Restrictions
 """"""""""""
 
-The *collective* and *nonblocking* keywords are mutually exclusive and
-cannot be enabled at the same time.
+The *collective* and *nonblocking* keywords cannot both be enabled
+at the same time.  Whichever of the two keywords is enabled last will
+disable the other.
 
 Related commands
 """"""""""""""""
diff --git a/src/kspace.cpp b/src/kspace.cpp
index 249e73b7ae8..52e7a65980e 100644
--- a/src/kspace.cpp
+++ b/src/kspace.cpp
@@ -551,10 +551,12 @@ void KSpace::modify_params(int narg, char **arg)
     } else if (strcmp(arg[iarg],"collective") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command");
       collective_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
+      if (collective_flag) nonblocking_flag = 0;
       iarg += 2;
     } else if (strcmp(arg[iarg],"nonblocking") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command");
       nonblocking_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
+      if (nonblocking_flag) collective_flag = 0;
       iarg += 2;
     } else if (strcmp(arg[iarg],"diff") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command");
@@ -613,7 +615,6 @@ void KSpace::modify_params(int narg, char **arg)
       iarg += n;
     }
   }
-  if (collective_flag > 0 && nonblocking_flag > 0) error->all(FLERR,"Illegal kspace_modify command, collective and nonblocking cannot both be true.");
 }
 
 /* ---------------------------------------------------------------------- */

From 033af47e95f638ff479020e8ff9b1f8116add973 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Wed, 10 Dec 2025 12:44:32 -0500
Subject: [PATCH 16/17] add version tag

---
 doc/src/kspace_modify.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/src/kspace_modify.rst b/doc/src/kspace_modify.rst
index 4336b557024..869fd13b287 100644
--- a/doc/src/kspace_modify.rst
+++ b/doc/src/kspace_modify.rst
@@ -87,6 +87,8 @@ collective operations and adequate hardware.
 
 ----------
 
+.. versionadded:: 10Dec2025
+
 The *nonblocking* keyword applies only to PPPM.  It is set to *no* by
 default. If this option is set to *yes*, LAMMPS will use non-blocking
 point-to-point MPI operations to remap data for 3d-FFT operations

From f98de9e93474beddd325ac90b6e617e40c2d68f5 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Wed, 10 Dec 2025 12:45:01 -0500
Subject: [PATCH 17/17] avoid memory leaks and correct call to free plan

---
 src/KSPACE/remap.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/KSPACE/remap.cpp b/src/KSPACE/remap.cpp
index 03e467ca9e8..2fb6679b102 100644
--- a/src/KSPACE/remap.cpp
+++ b/src/KSPACE/remap.cpp
@@ -229,6 +229,8 @@ struct remap_plan_3d *remap_3d_create_plan(
   if (plan == nullptr) return nullptr;
   plan->usecollective = usecollective;
   plan->usenonblocking = usenonblocking;
+  plan->scratch = nullptr;
+  plan->sendbuf = nullptr;
 
   // store parameters in local data structs
 
@@ -438,8 +440,7 @@ struct remap_plan_3d *remap_3d_create_plan(
     if (nrecv == plan->nrecv) plan->self = 0;
     else plan->self = 1;
 
-    // the plan->sendbuf and plan->recvbuf are used by both the
-    // collective & non-collective implementations.
+    // plan->sendbuf is used by both the collective & non-collective implementations.
     // For non-collective and blocking, the buffer size is MAX(send_size) for any one send
 
     // find biggest send message (not including self) and malloc space for it
@@ -471,6 +472,7 @@ struct remap_plan_3d *remap_3d_create_plan(
     // Non-collectives do not use MPI Communicator Groups
 
     MPI_Comm_dup(comm,&plan->comm);
+
   } else {
     int *commringlist;
     int commringlen = 0;
@@ -763,6 +765,7 @@ void remap_3d_destroy_plan(struct remap_plan_3d *plan)
       free(plan->unpackplan);
     }
   } else {
+
     // free arrays used in pt2pt communication
 
     if (plan->nsend || plan->self) {
@@ -786,9 +789,14 @@ void remap_3d_destroy_plan(struct remap_plan_3d *plan)
     }
   }
 
+  // free buffers, if needed
+
+  if (plan->scratch) free(plan->scratch);
+  if (plan->sendbuf) free(plan->sendbuf);
+
   // free plan itself
 
-  delete plan;
+  free(plan);
 }
 
 /* ----------------------------------------------------------------------