
WIP: Make thread safe #1045

Open
wants to merge 35 commits into base: develop

Changes from 1 commit (of 35 commits)
bee5684
trying to use new tasks
jdolence Dec 11, 2023
e881ad9
Merge branch 'lroberts36/bugfix-sparse-cache' into jdolence/new_tasking
jdolence Dec 14, 2023
90f3e59
remove debugging
jdolence Dec 14, 2023
92564e1
formatting
jdolence Dec 14, 2023
6fde57d
remove raw mpi.hpp include
jdolence Dec 14, 2023
2320c0e
style
jdolence Dec 14, 2023
95818ba
more style
jdolence Dec 14, 2023
d602a35
and more style
jdolence Dec 14, 2023
10a67f1
ok thats enough
jdolence Dec 14, 2023
23803d0
actually remove the old task stuff
jdolence Dec 14, 2023
a4db040
formatting
jdolence Dec 14, 2023
8b7d42a
maybe last style commit...
jdolence Dec 14, 2023
52f0d5a
oops, includes inside parthenon namespace
jdolence Dec 14, 2023
e6eb2e3
update TaskID unit test
jdolence Dec 14, 2023
ce7a6bb
missing header
jdolence Dec 14, 2023
1ddc2e0
port the poisson examples
jdolence Dec 15, 2023
0bd54cf
try to fix serial builds
jdolence Dec 15, 2023
6082812
clean up branching in `|` operator of TaskID
jdolence Dec 15, 2023
07ae71a
rename Queue ThreadQueue
jdolence Dec 15, 2023
c1dbcb3
formatting
jdolence Dec 15, 2023
fbbe02a
try to fix builds with threads
jdolence Dec 15, 2023
d39a31a
update tasking docs
jdolence Dec 18, 2023
b074ee6
formatting and update changelog
jdolence Dec 18, 2023
829e047
address review comments
jdolence Jan 9, 2024
fc16f0f
merge develop
jdolence Jan 9, 2024
b400c11
style
jdolence Jan 9, 2024
9957538
add a comment about the dependent variable in Task
jdolence Jan 9, 2024
4842676
add locks to sparse pack caching
jdolence Jan 12, 2024
7faf25b
merge develop
jdolence Jan 12, 2024
7ce9ed5
move thread pool to utils
jdolence Jan 12, 2024
97874e0
add thread pool to driver/mesh
jdolence Jan 12, 2024
e9630b7
random intermediate commit
jdolence Mar 22, 2024
3e7ea4b
merge develop
jdolence Apr 4, 2024
e51f4db
seems to be thread safe -- advection example works
jdolence Apr 5, 2024
be1f029
crazy state
jdolence Apr 23, 2024
merge develop
jdolence committed Apr 4, 2024
commit 3e7ea4b4ab81c57e4ce1309e12751e92a614689e
4 changes: 2 additions & 2 deletions .github/workflows/ci-extended.yml
@@ -60,15 +60,15 @@ jobs:
cd build
# Pick GPU with most available memory
export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
ctest -L performance -LE perf-reg
ctest -L performance -LE perf-reg -E gmg
# run regression tests
- name: Regression tests
run: |
cd build
# Pick GPU with most available memory
export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600
ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg -E gmg --timeout 3600
# Test Ascent integration (only most complex setup with MPI and on device)
- name: Ascent tests
38 changes: 36 additions & 2 deletions CHANGELOG.md
@@ -3,29 +3,63 @@
## Current develop

### Added (new features/APIs/variables/...)


### Changed (changing behavior/API/variables/...)


### Fixed (not changing behavior/API/variables/...)
- [[PR 1031]](https://github.com/parthenon-hpc-lab/parthenon/pull/1031) Fix bug in non-cell centered AMR

### Infrastructure (changes irrelevant to downstream codes)
- [[PR 1009]](https://github.com/parthenon-hpc-lab/parthenon/pull/1009) Move from a single octree to a forest of octrees


### Removed (removing behavior/API/variables/...)


### Incompatibilities (i.e. breaking changes)


## Release 24.03
Date: 2024-03-21

### Added (new features/APIs/variables/...)
- [[PR 852]](https://github.com/parthenon-hpc-lab/parthenon/pull/852) Add Mesh version of UserWorkBeforeOutput
- [[PR 998]](https://github.com/parthenon-hpc-lab/parthenon/pull/998) tensor indices added to sparse pack
- [[PR 999]](https://github.com/parthenon-hpc-lab/parthenon/pull/999) Add a post-initialization hook
- [[PR 987]](https://github.com/parthenon-hpc-lab/parthenon/pull/987) New tasking infrastructure and capabilities
- [[PR 969]](https://github.com/parthenon-hpc-lab/parthenon/pull/969) New macro-based auto-naming of profiling regions and kernels
- [[PR 981]](https://github.com/parthenon-hpc-lab/parthenon/pull/981) Add IndexSplit
- [[PR 983]](https://github.com/parthenon-hpc-lab/parthenon/pull/983) Add Contains to SparsePack
- [[PR 968]](https://github.com/parthenon-hpc-lab/parthenon/pull/968) Add per package registration of boundary conditions
- [[PR 948]](https://github.com/parthenon-hpc-lab/parthenon/pull/948) Add solver interface and update Poisson geometric multi-grid example
- [[PR 996]](https://github.com/parthenon-hpc-lab/parthenon/pull/996) Remove dynamic allocations from swarm particle creation

### Changed (changing behavior/API/variables/...)
- [[PR 1019]](https://github.com/parthenon-hpc-lab/parthenon/pull/1019) Enable output for non-cell-centered variables
- [[PR 973]](https://github.com/parthenon-hpc-lab/parthenon/pull/973) Multigrid performance upgrades

### Fixed (not changing behavior/API/variables/...)
- [[PR 1023]](https://github.com/parthenon-hpc-lab/parthenon/pull/1023) Fix broken param of a scalar bool
- [[PR 1012]](https://github.com/parthenon-hpc-lab/parthenon/pull/1012) Remove accidentally duplicated code
- [[PR 992]](https://github.com/parthenon-hpc-lab/parthenon/pull/992) Allow custom PR ops with sparse pools
- [[PR 988]](https://github.com/parthenon-hpc-lab/parthenon/pull/988) Fix bug in neighbor finding routine for small, periodic, refined meshes
- [[PR 986]](https://github.com/parthenon-hpc-lab/parthenon/pull/986) Fix bug in sparse boundary communication BndInfo caching
- [[PR 978]](https://github.com/parthenon-hpc-lab/parthenon/pull/978) Remove erroneous sparse check

### Infrastructure (changes irrelevant to downstream codes)
- [[PR 1027]](https://github.com/parthenon-hpc-lab/parthenon/pull/1027) Refactor RestartReader as abstract class
- [[PR 1017]](https://github.com/parthenon-hpc-lab/parthenon/pull/1017) Make regression tests more verbose on failure
- [[PR 1007]](https://github.com/parthenon-hpc-lab/parthenon/pull/1007) Split template instantiations for HDF5 Read/Write attributes to speed up compile times
- [[PR 990]](https://github.com/parthenon-hpc-lab/parthenon/pull/990) Partial refactor of HDF5 I/O code for readability/extendability
- [[PR 982]](https://github.com/parthenon-hpc-lab/parthenon/pull/982) add some gut check testing for parthenon-VIBE

### Removed (removing behavior/API/variables/...)

### Incompatibilities (i.e. breaking changes)
- [[PR 1019]](https://github.com/parthenon-hpc-lab/parthenon/pull/1019) Remove support for file formats < 3
- [[PR 987]](https://github.com/parthenon-hpc-lab/parthenon/pull/987) Change the API for what was IterativeTasks
- [[PR 974]](https://github.com/parthenon-hpc-lab/parthenon/pull/974) Change GetParentPointer to always return T*
- [[PR 996]](https://github.com/parthenon-hpc-lab/parthenon/pull/996) Remove dynamic allocations from swarm particle creation


## Release 23.11
8 changes: 4 additions & 4 deletions CMakeLists.txt
@@ -1,6 +1,6 @@
#=========================================================================================
# Parthenon performance portable AMR framework
# Copyright(C) 2020-2023 The Parthenon collaboration
# Copyright(C) 2020-2024 The Parthenon collaboration
# Licensed under the 3-clause BSD License, see LICENSE file for details
#=========================================================================================
# (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved.
@@ -20,7 +20,7 @@ cmake_minimum_required(VERSION 3.16)
# Imports machine-specific configuration
include(cmake/MachineCfg.cmake)

project(parthenon VERSION 23.11 LANGUAGES C CXX)
project(parthenon VERSION 24.03 LANGUAGES C CXX)

if (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.19.0)
cmake_policy(SET CMP0110 NEW)
@@ -62,9 +62,9 @@ include(cmake/Format.cmake)
include(cmake/Lint.cmake)

# regression test reference data
set(REGRESSION_GOLD_STANDARD_VER 20 CACHE STRING "Version of gold standard to download and use")
set(REGRESSION_GOLD_STANDARD_VER 21 CACHE STRING "Version of gold standard to download and use")
set(REGRESSION_GOLD_STANDARD_HASH
"SHA512=e5e421f3c0be01e4708965542bb8b1b79b5c96de97091e46972e375c7616588d026a9a8e29226d9c7ef75346bc859fd9af72acdc7e95e0d783b5ef29aa4630b1"
"SHA512=e16b14272915b4607965e5900961402f6da96dc13da8ea3c3d213d61f82d3a1dded08c40a9ab644aa3409d93a045bba360a90a43dc289b24f525878f9ba50890"
CACHE STRING "Hash of default gold standard file to download")
option(REGRESSION_GOLD_STANDARD_SYNC "Automatically sync gold standard files." ON)

3 changes: 2 additions & 1 deletion README.md
@@ -13,11 +13,12 @@ Parthenon -- a performance portable block-structured adaptive mesh refinement fr
* High performance by
* device first/device resident approach (work data only in device memory to prevent expensive transfers between host and device)
* transparent packing of data across blocks (to reduce/hide kernel launch latency)
* direct device-to-device communication via asynchronous, one-sided MPI communication
* direct device-to-device communication via asynchronous MPI communication
* Intermediate abstraction layer to hide complexity of device kernel launches
* Flexible, plug-in package system
* Abstract variables controlled via metadata flags
* Support for particles
* Support for cell-, node-, face-, and edge-centered fields
* Multi-stage drivers/integrators with support for task-based parallelism

# Community
3 changes: 2 additions & 1 deletion doc/sphinx/index.rst
@@ -15,11 +15,12 @@ Key Features

* Device first/device resident approach (work data only in device memory to prevent expensive transfers between host and device)
* Transparent packing of data across blocks (to reduce/hide kernel launch latency)
* Direct device-to-device communication via asynchronous, one-sided MPI communication
* Direct device-to-device communication via asynchronous MPI communication
* Intermediate abstraction layer to hide complexity of device kernel launches
* Flexible, plug-in package system
* Abstract variables controlled via metadata flags
* Support for particles
* Support for cell-, node-, face-, and edge-centered fields
* Multi-stage drivers/integrators with support for task-based parallelism

Community
16 changes: 10 additions & 6 deletions doc/sphinx/src/README.rst
@@ -178,29 +178,33 @@ Mesh
^^^^

- ``InitUserMeshData``
- ``ProblemGenerator``
- ``PostInitialization``
- ``PreStepUserWorkInLoop``
- ``PostStepUserWorkInLoop``
- ``UserWorkAfterLoop``
- ``UserMeshWorkBeforeOutput``

MeshBlock
^^^^^^^^^

- ``InitApplicationMeshBlockData``
- ``InitMeshBlockUserData``
- ``ProblemGenerator``
- ``PostInitialization``
- ``UserWorkBeforeOutput``

To redefine these functions, the user sets the respective function
pointers in the ApplicationInput member app_input of the
ParthenonManager class prior to calling ``ParthenonInit``. This is
demonstrated in the ``main()`` functions in the examples.

Note that the ``ProblemGenerator``\ s of ``Mesh`` and ``MeshBlock`` are
mutually exclusive. Moreover, the ``Mesh`` one requires
``parthenon/mesh/pack_size=-1`` during initialization, i.e., all blocks
on a rank need to be in a single pack. This allows to use MPI reductions
inside the function, for example, to globally normalize quantities. The
``parthenon/mesh/pack_size=-1`` exists only during problem
Note that the ``ProblemGenerator``\ s (and ``PostInitialization``\ s) of
``Mesh`` and ``MeshBlock`` are mutually exclusive. Moreover, the ``Mesh``
ones require ``parthenon/mesh/pack_size=-1`` during initialization, i.e.,
all blocks on a rank need to be in a single pack. This allows the use of MPI
reductions inside the function, for example, to globally normalize quantities.
The ``parthenon/mesh/pack_size=-1`` requirement exists only during problem
initialization, i.e., simulations can be restarted with an arbitrary
``pack_size``. For an example of the ``Mesh`` version, see the `Poisson
example <https://github.com/parthenon-hpc-lab/parthenon/blob/develop/example/poisson/parthenon_app_inputs.cpp>`__.
12 changes: 1 addition & 11 deletions doc/sphinx/src/boundary_communication.rst
@@ -155,17 +155,7 @@ In practice, we denote each channel by a unique key
so that the ``Mesh`` can contain a map from these keys to communication
channels. Then, at each remesh, sending blocks and blocks that are
receiving from blocks on a different rank can create new communication
channels and register them in this map. *Implementation detail:* To
build these keys, we currently rely on the
``MeshBlock::std::unique_ptr<BoundaryValues> pbval`` object to get
information about the neighboring blocks and build the channels and
keys. ``BoundaryValues`` has its own communication methods defined, but
none of these are used for the sparse communication. We really only rely
on the information stored in ``BoundaryBase`` (which contains general
information about all of the neighboring blocks on the mesh), which
``BoundaryValues`` inherits from. Eventually, I think ``pbval`` should
be turned into a ``BoundaryBase`` object and ``BoundaryValues`` should
be removed from the code base.
channels and register them in this map.

MPI Communication IDs
~~~~~~~~~~~~~~~~~~~~~
50 changes: 0 additions & 50 deletions doc/sphinx/src/interface/boundary.rst
@@ -12,53 +12,3 @@ BoundaryCommunication

Pure abstract base class, defines interfaces for managing
``BoundaryStatus`` flags and MPI requests

BoundaryBuffer
--------------

Pure abstract base class, defines interfaces for managing MPI
send/receive and loading/storing data from communication buffers.

BoundaryVariable
----------------

**Derived from**: ``BoundaryCommunication`` and ``BoundaryBuffer``

**Contains**: ``BoundaryData`` for variable and flux correction.

**Knows about**: ``MeshBlock`` and ``Mesh``

Still abstract base class, but implements some methods for sending and
receiving buffers.

BoundaryBase
------------

**Contains**: ``NeighborIndexes`` and ``NeighborBlock`` PER NEIGHBOR,
number of neighbors, neighbor levels

**Knows about**: ``MeshBlock``

Implements ``SearchAndSetNeighbors``

BoundaryValues
--------------

**Derived from**: ``BoundaryBase`` and ``BoundaryCommunication``

**Knows about**: ``MeshBlock``, all the ``BoundaryVariable`` connected
to variables of this block

Central class to interact with individual variable boundary data. Owned
by ``MeshBlock``.

CellCenteredBoundaryVariable
----------------------------

**Derived from**: ``BoundaryVariable``

**Contains**: Shallow copies of variable data, coarse buffer, and fluxes
(owned by ``Variable``)

Owned by ``Variable``, implements loading and setting boundary data,
sending and receiving flux corrections, and more.
19 changes: 13 additions & 6 deletions doc/sphinx/src/interface/state.rst
@@ -113,12 +113,19 @@ several useful features and functions.
if set (defaults to ``nullptr`` and therefore a no-op) to print
diagnostics after the time-integration advance
- ``void UserWorkBeforeLoopMesh(Mesh *, ParameterInput *pin, SimTime
&tm)`` performs a per-package, mesh-wide calculation after the mesh
has been generated, and problem generators called, but before any
time evolution. This work is done both on first initialization and
on restart. If you would like to avoid doing the work upon restart,
you can check for the const ``is_restart`` member field of the ``Mesh``
object.
&tm)`` performs a per-package, mesh-wide calculation after (1) the mesh
has been generated, (2) problem generators are called, and (3) comms
are executed, but before any time evolution. This work is done both on
first initialization and on restart. If you would like to avoid doing the
work upon restart, you can check for the const ``is_restart`` member
field of the ``Mesh`` object. It is worth drawing a clear distinction
between ``UserWorkBeforeLoopMesh`` and ``ApplicationInput``'s
``PostInitialization``: ``PostInitialization`` is tied strictly to
initialization and will not be called upon restarts. ``PostInitialization``
is also carefully positioned after ``ProblemGenerator`` and before
``PreCommFillDerived`` (and hence communications). In practice, when
additional granularity is required between initialization and communication,
``PostInitialization`` may be the desired hook.

The reasoning for providing ``FillDerived*`` and ``EstimateTimestep*``
function pointers appropriate for usage with both ``MeshData`` and
5 changes: 5 additions & 0 deletions doc/sphinx/src/mesh/mesh.rst
@@ -50,6 +50,11 @@ member.
time-integration advance. The default behavior calls to each package's
(StateDescriptor's) ``PreStepDiagnostics`` method which, in turn,
delegates to a ``std::function`` member that defaults to a no-op.
- ``UserMeshWorkBeforeOutput(Mesh*, ParameterInput*, SimTime const&)``
is called to perform mesh-wide work immediately before writing an output
(the default is a no-op). The most likely use case is to fill derived
fields with updated values before writing them out to disk (or passing
them to Ascent for in-situ analysis).

Multi-grid Grids Stored in ``Mesh``
-----------------------------------
6 changes: 6 additions & 0 deletions doc/sphinx/src/outputs.rst
@@ -46,6 +46,7 @@ look like

<parthenon/output1>
file_type = hdf5
write_xdmf = true # Determines whether xdmf annotations are output
# nonexistent variables/swarms are ignored
variables = density, velocity, & # comments are still ok
energy # notice the & continuation character
@@ -397,6 +398,11 @@ capable of opening and visualizing Parthenon graphics dumps. In both
cases, the ``.xdmf`` files should be opened. In ParaView, select the
“XDMF Reader” when prompted.

.. warning::
Currently, Parthenon face- and edge-centered data are not supported
by ParaView and VisIt. However, our Python tooling does support
all mesh locations.

Preparing outputs for ``yt``
----------------------------

13 changes: 9 additions & 4 deletions doc/sphinx/src/parthenon_manager.rst
@@ -31,12 +31,16 @@ runtimes. The function
Calls the ``Initialize(ParameterInput *pin)`` function of all packages
to be utilized and creates the grid hierarchy, including the ``Mesh``
and ``MeshBlock`` objects, and calls the ``ProblemGenerator``
initialization routines.
and ``MeshBlock`` objects, and calls the ``ProblemGenerator`` (and
``PostInitialization``) routines.

The reason these functions are split out is to enable decisions to be
made by the application between reading the input deck and setting up
the grid. For example, a common use-case is:
the grid. For example, during problem initialization, ``ProblemGenerator``
may serve as the user-facing API for describing initial conditions,
whereas ``PostInitialization`` could use those user-specified fields
to sync *all* fields prior to entering communication routines. A common
use-case is:

.. code:: cpp
@@ -53,13 +57,14 @@ the grid. For example, a common use-case is:
if (manager_status == ParthenonStatus::error) {
pman.ParthenonFinalize();
return 1;
}
}
// Redefine parthenon defaults
pman.app_input->ProcessPackages = MyProcessPackages;
std::string prob = pman.pin->GetString("app", "problem");
if (prob == "problem1") {
pman.app_input->ProblemGenerator = Problem1Generator;
pman.app_input->PostInitialization = Problem1PostInitialization;
} else {
pman.app_input->ProblemGenerator = Problem2Generator;
}
10 changes: 5 additions & 5 deletions doc/sphinx/src/particles.rst
@@ -42,13 +42,13 @@ To add particles to a ``Swarm``, one calls

.. code:: cpp
ParArray1D<bool> new_particles_mask = swarm->AddEmptyParticles(num_to_add, new_indices)
NewParticlesContext context = swarm->AddEmptyParticles(num_to_add);
This call automatically resizes the memory pools as necessary and
returns a ``ParArray1D<bool>`` mask indicating which indices in the
``ParticleVariable``\ s are newly available. ``new_indices`` is a
reference to a ``ParArrayND<int>`` of size ``num_to_add`` which contains
the indices of each newly added particle.
returns a ``NewParticlesContext`` object that provides the methods
``int GetNewParticlesMaxIndex()`` to get the max index of the contiguous block
of indices into the swarm, and ``int GetNewParticleIndex(const int n)`` to
convert a new particle index into the swarm index.

To remove particles from a ``Swarm``, one first calls

17 changes: 9 additions & 8 deletions doc/sphinx/src/tasks.rst
which it's called. The principal use case for this is to add iterative cycles
to the graph, allowing one to execute a series of tasks repeatedly until some
criteria are satisfied. The call takes as arguments the dependencies (via
``TaskID``s combined with ``|``) that must be complete before the sublist
executes and, optionally, a ``std::pair<int, int>`` specifying the minimum
executes and a ``std::pair<int, int>`` specifying the minimum
and maximum number of times the sublist should execute. Passing something like
``{min_iters, max_iters}`` as the second argument should suffice. ``AddSublist``
``{min_iters, max_iters}`` as the second argument should suffice, with ``{1, 1}``
leading to a sublist that never cycles. ``AddSublist``
returns a ``std::pair<TaskList&, TaskID>`` which is conveniently accessed via
a structured binding, e.g.
.. code:: cpp
@@ -137,9 +138,9 @@ with one thread.
TaskQualifier
-------------

``TaskQualifier``s provide a mechanism for downstream codes to alter the default
``TaskQualifier`` s provide a mechanism for downstream codes to alter the default
behavior of specific tasks in certain ways. The qualifiers are described below:
- ``TaskQualifier::local_sync``: Tasks marked with ``local_sync`` synchronize across
- ``TaskQualifier::local_sync`` : Tasks marked with ``local_sync`` synchronize across
lists in a region on a given MPI rank. Tasks that depend on a ``local_sync``
marked task gain dependencies from the corresponding task on all lists within
a region. A typical use for this qualifier is to do a rank-local reduction, for
@@ -148,22 +149,22 @@ per rank, not once per ``TaskList``). Note that Parthenon links tasks across
lists in the order they are added to each list, i.e. the ``n``th ``local_sync`` task
in a list is assumed to be associated with the ``n``th ``local_sync`` task in all
lists in the region.
- ``TaskQualifier::global_sync``: Tasks marked with ``global_sync`` implicitly have
- ``TaskQualifier::global_sync`` : Tasks marked with ``global_sync`` implicitly have
the same semantics as ``local_sync``, but additionally do a global reduction on the
``TaskStatus`` to determine if/when execution can proceed on to dependent tasks.
- ``TaskQualifier::completion``: Tasks marked with ``completion`` can lead to exiting
- ``TaskQualifier::completion`` : Tasks marked with ``completion`` can lead to exiting
execution of the owning ``TaskList``. If these tasks return ``TaskStatus::complete``
and the minimum number of iterations of the list have been completed, the remainder
of the task list will be skipped (or the iteration stopped). Returning
``TaskList::iterate`` leads to continued execution/iteration, unless the maximum
number of iterations has been reached.
- ``TaskQualifier::once_per_region``: Tasks with the ``once_per_region`` qualifier
- ``TaskQualifier::once_per_region`` : Tasks with the ``once_per_region`` qualifier
will only execute once (per iteration, if relevant) regardless of the number of
``TaskList``s in the region. This can be useful when, for example, doing MPI
reductions, printing out some rank-wide state, or calling a ``completion`` task
that depends on some global condition where all lists would evaluate identical code.

``TaskQualifier``s can be combined via the ``|`` operator and all combinations are
``TaskQualifier`` s can be combined via the ``|`` operator and all combinations are
supported. For example, you might mark a task ``global_sync | completion | once_per_region``
if it were a task to determine whether an iteration should continue that depended
on some previously reduced quantity.
1 change: 1 addition & 0 deletions example/advection/advection_driver.hpp
@@ -39,6 +39,7 @@ class AdvectionDriver : public MultiStageDriver {
void ProblemGenerator(MeshBlock *pmb, parthenon::ParameterInput *pin);
void UserWorkAfterLoop(Mesh *mesh, parthenon::ParameterInput *pin,
parthenon::SimTime &tm);
void UserMeshWorkBeforeOutput(Mesh *pmb, ParameterInput *pin, parthenon::SimTime const &);
void PostStepMeshUserWorkInLoop(Mesh *mesh, parthenon::ParameterInput *pin,
parthenon::SimTime const &tm);
parthenon::Packages_t ProcessPackages(std::unique_ptr<parthenon::ParameterInput> &pin);
4 changes: 4 additions & 0 deletions example/advection/advection_package.cpp
@@ -192,6 +192,10 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin) {
pkg->AddSparsePool(field_name, m, std::vector<int>{12, 37});
}

// add derived output variable
m = Metadata({Metadata::Cell, Metadata::OneCopy}, std::vector<int>({1}));
pkg->AddField("my_derived_var", m);

// List (vector) of HistoryOutputVar that will all be enrolled as output variables
parthenon::HstVar_list hst_vars = {};
// Now we add a couple of callback functions
7 changes: 6 additions & 1 deletion example/advection/custom_ascent_actions.yaml
@@ -11,4 +11,9 @@
plt2:
type: "mesh"
image_prefix: "ascent_render_%02d"

scene2:
plots:
plt2:
type: "pseudocolor"
field: "my_derived_var"
image_prefix: "derived_render_%02d"
1 change: 1 addition & 0 deletions example/advection/main.cpp
@@ -24,6 +24,7 @@ int main(int argc, char *argv[]) {
pman.app_input->ProcessPackages = advection_example::ProcessPackages;
pman.app_input->ProblemGenerator = advection_example::ProblemGenerator;
pman.app_input->UserWorkAfterLoop = advection_example::UserWorkAfterLoop;
pman.app_input->UserMeshWorkBeforeOutput = advection_example::UserMeshWorkBeforeOutput;

// call ParthenonInit to initialize MPI and Kokkos, parse the input deck, and set up
auto manager_status = pman.ParthenonInitEnv(argc, argv);
21 changes: 21 additions & 0 deletions example/advection/parthenon_app_inputs.cpp
@@ -21,6 +21,8 @@
#include "config.hpp"
#include "defs.hpp"
#include "interface/variable_pack.hpp"
#include "kokkos_abstraction.hpp"
#include "parameter_input.hpp"
#include "utils/error_checking.hpp"

using namespace parthenon::package::prelude;
@@ -248,6 +250,25 @@ void UserWorkAfterLoop(Mesh *mesh, ParameterInput *pin, SimTime &tm) {
return;
}

void UserMeshWorkBeforeOutput(Mesh *mesh, ParameterInput *pin, SimTime const &) {
// loop over blocks
for (auto &pmb : mesh->block_list) {
auto rc = pmb->meshblock_data.Get(); // get base container
auto q = rc->Get("advected").data;
auto deriv = rc->Get("my_derived_var").data;

IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior);
IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::interior);

pmb->par_for(
"Advection::FillDerived", 0, 0, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
deriv(0, k, j, i) = std::log10(q(0, k, j, i) + 1.0e-5);
});
}
}

Packages_t ProcessPackages(std::unique_ptr<ParameterInput> &pin) {
Packages_t packages;
auto pkg = advection_package::Initialize(pin.get());
3 changes: 2 additions & 1 deletion example/advection/parthinput.advection
@@ -75,7 +75,8 @@ dt = 0.05
variables = advected, advected_1, & # comments are ok
one_minus_advected, &
one_minus_advected_sq, & # on every (& characters are ok in comments)
one_minus_sqrt_one_minus_advected_sq # line
one_minus_sqrt_one_minus_advected_sq, & # line
my_derived_var

<parthenon/output3>
file_type = hst
12 changes: 6 additions & 6 deletions example/particle_leapfrog/particle_leapfrog.cpp
@@ -1,9 +1,9 @@
//========================================================================================
// Parthenon performance portable AMR framework
// Copyright(C) 2021-2022 The Parthenon collaboration
// Copyright(C) 2021-2024 The Parthenon collaboration
// Licensed under the 3-clause BSD License, see LICENSE file for details
//========================================================================================
// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved.
// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved.
//
// This program was produced under U.S. Government contract 89233218CNA000001 for Los
// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -175,9 +175,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {

Kokkos::deep_copy(pmb->exec_space, ids_this_block, ids_this_block_h);

ParArrayND<int> new_indices;
const auto new_particles_mask =
swarm->AddEmptyParticles(num_particles_this_block, new_indices);
auto new_particles_context = swarm->AddEmptyParticles(num_particles_this_block);

auto &id = swarm->Get<int>("id").Get();
auto &x = swarm->Get<Real>("x").Get();
@@ -189,7 +187,9 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
// This hardcoded implementation should only be used in PGEN and not during runtime
// addition of particles as indices need to be taken into account.
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, num_particles_this_block - 1, KOKKOS_LAMBDA(const int n) {
PARTHENON_AUTO_LABEL, 0, new_particles_context.GetNewParticlesMaxIndex(),
KOKKOS_LAMBDA(const int new_n) {
const int n = new_particles_context.GetNewParticleIndex(new_n);
const auto &m = ids_this_block(n);

id(n) = m; // global unique id
13 changes: 6 additions & 7 deletions example/particle_tracers/particle_tracers.cpp
@@ -1,9 +1,9 @@
//========================================================================================
// Parthenon performance portable AMR framework
// Copyright(C) 2021-2022 The Parthenon collaboration
// Copyright(C) 2021-2024 The Parthenon collaboration
// Licensed under the 3-clause BSD License, see LICENSE file for details
//========================================================================================
// (C) (or copyright) 2021-2022. Triad National Security, LLC. All rights reserved.
// (C) (or copyright) 2021-2024. Triad National Security, LLC. All rights reserved.
//
// This program was produced under U.S. Government contract 89233218CNA000001 for Los
// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -375,19 +375,18 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
int num_tracers_meshblock = std::round(num_tracers * number_meshblock / number_mesh);
int gid = pmb->gid;

ParArrayND<int> new_indices;
swarm->AddEmptyParticles(num_tracers_meshblock, new_indices);
auto new_particles_context = swarm->AddEmptyParticles(num_tracers_meshblock);

auto &x = swarm->Get<Real>("x").Get();
auto &y = swarm->Get<Real>("y").Get();
auto &z = swarm->Get<Real>("z").Get();
auto &id = swarm->Get<int>("id").Get();

auto swarm_d = swarm->GetDeviceContext();
// This hardcoded implementation should only be used in PGEN and not during runtime
// addition of particles, as indices need to be taken into account.
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, num_tracers_meshblock - 1, KOKKOS_LAMBDA(const int n) {
PARTHENON_AUTO_LABEL, 0, new_particles_context.GetNewParticlesMaxIndex(),
KOKKOS_LAMBDA(const int new_n) {
const int n = new_particles_context.GetNewParticleIndex(new_n);
auto rng_gen = rng_pool.get_state();

// Rejection sample the x position
126 changes: 63 additions & 63 deletions example/particles/particles.cpp
@@ -1,5 +1,5 @@
//========================================================================================
// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved.
// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved.
//
// This program was produced under U.S. Government contract 89233218CNA000001 for Los
// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -250,8 +250,8 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {
auto vel = pkg->Param<Real>("particle_speed");
const auto orbiting_particles = pkg->Param<bool>("orbiting_particles");

ParArrayND<int> new_indices;
const auto new_particles_mask = swarm->AddEmptyParticles(num_particles, new_indices);
// Create new particles and get accessor
auto newParticlesContext = swarm->AddEmptyParticles(num_particles);

// Meshblock geometry
const IndexRange &ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
@@ -278,73 +278,73 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {

if (orbiting_particles) {
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
if (new_particles_mask(n)) {
auto rng_gen = rng_pool.get_state();

// Randomly sample in space in this meshblock while staying within 0.5 of
// origin
Real r;
do {
x(n) = minx_i + nx_i * dx_i * rng_gen.drand();
y(n) = minx_j + nx_j * dx_j * rng_gen.drand();
z(n) = minx_k + nx_k * dx_k * rng_gen.drand();
r = sqrt(x(n) * x(n) + y(n) * y(n) + z(n) * z(n));
} while (r > 0.5);

// Randomly sample direction perpendicular to origin
Real theta = acos(2. * rng_gen.drand() - 1.);
Real phi = 2. * M_PI * rng_gen.drand();
v(0, n) = sin(theta) * cos(phi);
v(1, n) = sin(theta) * sin(phi);
v(2, n) = cos(theta);
// Project v onto plane normal to sphere
Real vdN = v(0, n) * x(n) + v(1, n) * y(n) + v(2, n) * z(n);
Real NdN = r * r;
v(0, n) = v(0, n) - vdN / NdN * x(n);
v(1, n) = v(1, n) - vdN / NdN * y(n);
v(2, n) = v(2, n) - vdN / NdN * z(n);

// Normalize
Real v_tmp = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n));
PARTHENON_DEBUG_REQUIRE(v_tmp > 0., "Speed must be > 0!");
for (int ii = 0; ii < 3; ii++) {
v(ii, n) *= vel / v_tmp;
}
PARTHENON_AUTO_LABEL, 0, newParticlesContext.GetNewParticlesMaxIndex(),
KOKKOS_LAMBDA(const int new_n) {
const int n = newParticlesContext.GetNewParticleIndex(new_n);
auto rng_gen = rng_pool.get_state();

// Create particles at the beginning of the timestep
t(n) = t0;
// Randomly sample in space in this meshblock while staying within 0.5 of
// origin
Real r;
do {
x(n) = minx_i + nx_i * dx_i * rng_gen.drand();
y(n) = minx_j + nx_j * dx_j * rng_gen.drand();
z(n) = minx_k + nx_k * dx_k * rng_gen.drand();
r = sqrt(x(n) * x(n) + y(n) * y(n) + z(n) * z(n));
} while (r > 0.5);

// Randomly sample direction perpendicular to origin
Real theta = acos(2. * rng_gen.drand() - 1.);
Real phi = 2. * M_PI * rng_gen.drand();
v(0, n) = sin(theta) * cos(phi);
v(1, n) = sin(theta) * sin(phi);
v(2, n) = cos(theta);
// Project v onto plane normal to sphere
Real vdN = v(0, n) * x(n) + v(1, n) * y(n) + v(2, n) * z(n);
Real NdN = r * r;
v(0, n) = v(0, n) - vdN / NdN * x(n);
v(1, n) = v(1, n) - vdN / NdN * y(n);
v(2, n) = v(2, n) - vdN / NdN * z(n);

// Normalize
Real v_tmp = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n));
PARTHENON_DEBUG_REQUIRE(v_tmp > 0., "Speed must be > 0!");
for (int ii = 0; ii < 3; ii++) {
v(ii, n) *= vel / v_tmp;
}

weight(n) = 1.0;
// Create particles at the beginning of the timestep
t(n) = t0;

rng_pool.free_state(rng_gen);
}
weight(n) = 1.0;

rng_pool.free_state(rng_gen);
});
} else {
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
if (new_particles_mask(n)) {
auto rng_gen = rng_pool.get_state();
PARTHENON_AUTO_LABEL, 0, newParticlesContext.GetNewParticlesMaxIndex(),
KOKKOS_LAMBDA(const int new_n) {
const int n = newParticlesContext.GetNewParticleIndex(new_n);
auto rng_gen = rng_pool.get_state();

// Randomly sample in space in this meshblock
x(n) = minx_i + nx_i * dx_i * rng_gen.drand();
y(n) = minx_j + nx_j * dx_j * rng_gen.drand();
z(n) = minx_k + nx_k * dx_k * rng_gen.drand();
// Randomly sample in space in this meshblock
x(n) = minx_i + nx_i * dx_i * rng_gen.drand();
y(n) = minx_j + nx_j * dx_j * rng_gen.drand();
z(n) = minx_k + nx_k * dx_k * rng_gen.drand();

// Randomly sample direction on the unit sphere, fixing speed
Real theta = acos(2. * rng_gen.drand() - 1.);
Real phi = 2. * M_PI * rng_gen.drand();
v(0, n) = vel * sin(theta) * cos(phi);
v(1, n) = vel * sin(theta) * sin(phi);
v(2, n) = vel * cos(theta);
// Randomly sample direction on the unit sphere, fixing speed
Real theta = acos(2. * rng_gen.drand() - 1.);
Real phi = 2. * M_PI * rng_gen.drand();
v(0, n) = vel * sin(theta) * cos(phi);
v(1, n) = vel * sin(theta) * sin(phi);
v(2, n) = vel * cos(theta);

// Create particles at the beginning of the timestep
t(n) = t0;
// Create particles at the beginning of the timestep
t(n) = t0;

weight(n) = 1.0;
weight(n) = 1.0;

rng_pool.free_state(rng_gen);
}
rng_pool.free_state(rng_gen);
});
}

@@ -529,8 +529,8 @@ TaskStatus StopCommunicationMesh(const BlockList_t &blocks) {
#ifdef MPI_PARALLEL
for (auto &block : blocks) {
auto swarm = block->swarm_data.Get()->Get("my_particles");
for (int n = 0; n < block->pbval->nneighbor; n++) {
NeighborBlock &nb = block->pbval->neighbor[n];
for (int n = 0; n < block->neighbors.size(); n++) {
NeighborBlock &nb = block->neighbors[n];
// TODO(BRR) May want logic like this if we have non-blocking TaskRegions
// if (nb.snb.rank != Globals::my_rank) {
// if (swarm->vbswarm->bd_var_.flag[nb.bufid] != BoundaryStatus::completed) {
@@ -563,8 +563,8 @@ TaskStatus StopCommunicationMesh(const BlockList_t &blocks) {
auto &pmb = block;
auto sc = pmb->swarm_data.Get();
auto swarm = sc->Get("my_particles");
for (int n = 0; n < swarm->vbswarm->bd_var_.nbmax; n++) {
auto &nb = pmb->pbval->neighbor[n];
for (int n = 0; n < pmb->neighbors.size(); n++) {
auto &nb = block->neighbors[n];
swarm->vbswarm->bd_var_.flag[nb.bufid] = BoundaryStatus::waiting;
}
}
2 changes: 0 additions & 2 deletions example/poisson/poisson_driver.cpp
@@ -70,9 +70,7 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) {
// and a kokkos view just for fun
AllReduce<HostArray1D<Real>> *pview_reduce =
pkg->MutableParam<AllReduce<HostArray1D<Real>>>("view_reduce");
int reg_dep_id;
for (int i = 0; i < num_partitions; i++) {
reg_dep_id = 0;
// make/get a mesh_data container for the state
auto &md = pmesh->mesh_data.GetOrAdd("base", i);
auto &mdelta = pmesh->mesh_data.GetOrAdd("delta", i);
2 changes: 2 additions & 0 deletions example/poisson_gmg/main.cpp
@@ -51,6 +51,8 @@ int main(int argc, char *argv[]) {
if (driver_status != parthenon::DriverStatus::complete ||
driver.final_rms_residual > 1.e-10 || driver.final_rms_error > 1.e-12)
success = false;
if (driver.final_rms_residual != driver.final_rms_residual) success = false;
if (driver.final_rms_error != driver.final_rms_error) success = false;
}
// call MPI_Finalize and Kokkos::finalize if necessary
pman.ParthenonFinalize();
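The two new checks in `main.cpp` compare `final_rms_residual` and `final_rms_error` against themselves, relying on the IEEE-754 rule that NaN is the only value unequal to itself; this flags a diverged solve without a separate `isnan` call. The same self-comparison trick, sketched in Python:

```python
def diverged(residual):
    # IEEE-754: NaN is the only floating-point value for which x != x holds,
    # so this returns True exactly when the solve produced NaN
    return residual != residual
```

`math.isnan` is the more explicit spelling, but the self-comparison works in any IEEE-754 language without an extra include.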
7 changes: 4 additions & 3 deletions example/poisson_gmg/poisson_driver.cpp
@@ -79,7 +79,6 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) {

const int num_partitions = pmesh->DefaultNumPartitions();
TaskRegion &region = tc.AddRegion(num_partitions);
int reg_dep_id = 0;
for (int i = 0; i < num_partitions; ++i) {
TaskList &tl = region[i];
auto &md = pmesh->mesh_data.GetOrAdd("base", i);
@@ -100,9 +99,11 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) {

auto solve = zero_u;
if (solver == "BiCGSTAB") {
solve = bicgstab_solver->AddTasks(tl, zero_u, pmesh, i);
auto setup = bicgstab_solver->AddSetupTasks(tl, zero_u, i, pmesh);
solve = bicgstab_solver->AddTasks(tl, setup, pmesh, i);
} else if (solver == "MG") {
solve = mg_solver->AddTasks(tl, zero_u, pmesh, i);
auto setup = mg_solver->AddSetupTasks(tl, zero_u, i, pmesh);
solve = mg_solver->AddTasks(tl, setup, pmesh, i);
} else {
PARTHENON_FAIL("Unknown solver type.");
}
2 changes: 1 addition & 1 deletion example/poisson_gmg/poisson_driver.hpp
@@ -1,5 +1,5 @@
//========================================================================================
// (C) (or copyright) 2021-2023. Triad National Security, LLC. All rights reserved.
// (C) (or copyright) 2021-2024. Triad National Security, LLC. All rights reserved.
//
// This program was produced under U.S. Government contract 89233218CNA000001 for Los
// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
3 changes: 2 additions & 1 deletion example/poisson_gmg/poisson_package.cpp
@@ -1,5 +1,5 @@
//========================================================================================
// (C) (or copyright) 2021-2023. Triad National Security, LLC. All rights reserved.
// (C) (or copyright) 2021-2024. Triad National Security, LLC. All rights reserved.
//
// This program was produced under U.S. Government contract 89233218CNA000001 for Los
// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -147,6 +147,7 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin) {
bicgstab_params.max_iters = max_poisson_iterations;
bicgstab_params.residual_tolerance = res_tol;
bicgstab_params.precondition = precondition;
bicgstab_params.print_per_step = true;
parthenon::solvers::BiCGSTABSolver<u, rhs, PoissonEquation> bicg_solver(
pkg.get(), bicgstab_params, eq);
pkg->AddParam<>("MGBiCGSTABsolver", bicg_solver,
4 changes: 2 additions & 2 deletions example/sparse_advection/sparse_advection_driver.cpp
@@ -122,12 +122,12 @@ TaskCollection SparseAdvectionDriver::MakeTaskCollection(BlockList_t &blocks,
mdudt.get(), beta * dt, mc1.get());

// do boundary exchange
auto restrict =
auto boundary =
parthenon::AddBoundaryExchangeTasks(update, tl, mc1, pmesh->multilevel);

// if this is the last stage, check if we can deallocate any sparse variables
if (stage == integrator->nstages) {
tl.AddTask(restrict, SparseDealloc, mc1.get());
tl.AddTask(boundary, SparseDealloc, mc1.get());
}
}

@@ -76,7 +76,6 @@ def printTree(self):


class GitHubApp:

"""
GitHubApp Class
114 changes: 69 additions & 45 deletions scripts/python/packages/parthenon_tools/parthenon_tools/movie2d.py
@@ -21,7 +21,12 @@

from argparse import ArgumentParser
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from concurrent.futures import (
ThreadPoolExecutor,
ProcessPoolExecutor,
wait,
ALL_COMPLETED,
)

import matplotlib as mpl
import matplotlib.pyplot as plt
@@ -208,17 +213,26 @@ def plot_dump(
ye = yf

# get tensor components
if len(q.shape) > 6:
raise ValueError(
"Tensor rank is higher than I can handle. "
+ "Please revise the movie2d script"
)
if len(q.shape) == 6:
q = q[:, components[0], components[1], 0, :, :]
if len(q.shape) == 5:
q = q[:, components[-1], 0, :, :]
if len(q.shape) == 4:
q = q[:, 0, :, :]
ntensors = len(q.shape[1:-3])
if components:
if len(components) != ntensors:
raise ValueError(
"Tensor rank not the same as number of specified components: {}, {}, {}".format(
len(components), ntensors, q.shape
)
)
# The first index of q is block index. Here we walk through
# the tensor components, slowest-first and, iteratively, fix
# q's slowest moving non-block index to the fixed tensor
# component. Then we move to the next index.
for c in components:
if c > (q.shape[1] - 1):
raise ValueError(
"Component {} out of bounds. Shape = {}".format(c, q.shape)
)
q = q[:, c]
# move to 2d
q = q[..., 0, :, :]
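The rewritten component handling in `movie2d.py` generalizes from hardcoded ranks 4-6 to any tensor rank: count the tensor axes between the block index and the trailing `(z, y, x)`, then repeatedly fix the slowest remaining tensor axis to the requested component before collapsing `z`. A shape-only Python model of that loop (array shapes here are illustrative):

```python
def slice_components(shape, components):
    """Toy model of the generalized component selection:
    shape = (block, *tensor_dims, z, y, x). Each component fixes the
    slowest remaining tensor axis (q = q[:, c]); finally z is fixed
    (q = q[..., 0, :, :]) to reach a 2D field per block."""
    ntensors = len(shape[1:-3])  # axes between block index and (z, y, x)
    if components:
        if len(components) != ntensors:
            raise ValueError("rank mismatch")
        for c in components:
            if c > shape[1] - 1:
                raise ValueError("component out of bounds")
            shape = shape[:1] + shape[2:]  # q = q[:, c] drops one tensor axis
    return shape[:-3] + shape[-2:]         # q = q[..., 0, :, :] drops z
```

For a rank-2 tensor field with shape `(nblocks, 3, 3, nz, ny, nx)` and `components = [1, 2]`, this yields `(nblocks, ny, nx)`, ready for `pcolormesh`.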

fig = plt.figure()
p = fig.add_subplot(111, aspect=1)
@@ -299,15 +313,16 @@ def plot_dump(
args.output_directory.mkdir(0o755, True, True)
logger.info(f"Total files to process: {len(args.files)}")

components = [0, 0]
components = []
if args.tc is not None:
components = args.tc
if args.vc is not None:
components = [0, args.vc]
components = [args.vc]
do_swarm = args.swarm is not None

_x = ProcessPoolExecutor if args.worker_type == "process" else ThreadPoolExecutor
with _x(max_workers=args.workers) as pool:
futures = []
for frame_id, file_name in enumerate(args.files):
data = phdf(file_name)

@@ -376,40 +391,49 @@ def plot_dump(
# NOTE: After doing 5 tests on different precisions, keeping 2 looks more promising
current_time = format(round(data.Time, 2), ".2f")
if args.debug_plot:
pool.submit(
plot_dump,
data.xg,
data.yg,
q,
current_time,
output_file,
True,
data.gid,
data.xig,
data.yig,
data.xeg,
data.yeg,
components,
swarmx,
swarmy,
swarmcolor,
particlesize,
futures.append(
pool.submit(
plot_dump,
data.xg,
data.yg,
q,
current_time,
output_file,
True,
data.gid,
data.xig,
data.yig,
data.xeg,
data.yeg,
components,
swarmx,
swarmy,
swarmcolor,
particlesize,
)
)
else:
pool.submit(
plot_dump,
data.xng,
data.yng,
q,
current_time,
output_file,
True,
components=components,
swarmx=swarmx,
swarmy=swarmy,
swarmcolor=swarmcolor,
particlesize=particlesize,
futures.append(
pool.submit(
plot_dump,
data.xng,
data.yng,
q,
current_time,
output_file,
True,
components=components,
swarmx=swarmx,
swarmy=swarmy,
swarmcolor=swarmcolor,
particlesize=particlesize,
)
)
wait(futures, return_when=ALL_COMPLETED)
for future in futures:
exception = future.exception()
if exception:
raise exception

if not ERROR_FLAG:
logger.info("All frames produced.")
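Collecting the futures and re-raising their exceptions is the substantive fix in this hunk: a bare `pool.submit(...)` silently swallows any exception raised inside `plot_dump`, so failed frames previously went unnoticed. A self-contained sketch of the same submit/wait/re-raise pattern (the `render` worker is a stand-in for `plot_dump`):

```python
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED


def render(frame_id):
    # Stand-in for plot_dump; a real worker could raise on a bad frame
    if frame_id < 0:
        raise ValueError("bad frame")
    return frame_id * frame_id


futures = []
with ThreadPoolExecutor(max_workers=2) as pool:
    for frame_id in range(4):
        futures.append(pool.submit(render, frame_id))

# Block until every frame finishes, then surface any worker failure
wait(futures, return_when=ALL_COMPLETED)
for future in futures:
    exception = future.exception()  # non-None iff the worker raised
    if exception:
        raise exception

results = [f.result() for f in futures]
```

Without the final loop, a raising worker would only mark its future as failed; the driver script would exit successfully with missing frames.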
310 changes: 161 additions & 149 deletions scripts/python/packages/parthenon_tools/parthenon_tools/phdf.py

Large diffs are not rendered by default.

27 changes: 19 additions & 8 deletions src/CMakeLists.txt
@@ -108,14 +108,11 @@ add_library(parthenon
bvals/boundary_conditions_generic.hpp
bvals/boundary_conditions.cpp
bvals/boundary_conditions.hpp

bvals/bvals.cpp
bvals/bvals.hpp
bvals/bvals_base.cpp
bvals/bvals_interfaces.hpp
bvals/neighbor_block.cpp
bvals/neighbor_block.hpp
bvals/boundary_flag.cpp
bvals/bvals_var.cpp
bvals/bvals_swarm.cpp

coordinates/coordinates.hpp
coordinates/uniform_cartesian.hpp
@@ -131,6 +128,7 @@ add_library(parthenon
interface/mesh_data.hpp
interface/meshblock_data.cpp
interface/meshblock_data.hpp
interface/swarm_comms.cpp
interface/swarm_container.cpp
interface/swarm.cpp
interface/swarm.hpp
@@ -160,6 +158,12 @@ add_library(parthenon

mesh/amr_loadbalance.cpp
mesh/domain.hpp
mesh/forest/forest.cpp
mesh/forest/forest.hpp
mesh/forest/relative_orientation.hpp
mesh/forest/relative_orientation.cpp
mesh/forest/tree.hpp
mesh/forest/tree.cpp
mesh/logical_location.cpp
mesh/logical_location.hpp
mesh/mesh_refinement.cpp
@@ -169,8 +173,6 @@ add_library(parthenon
mesh/mesh.hpp
mesh/meshblock.hpp
mesh/meshblock_pack.hpp
mesh/meshblock_tree.cpp
mesh/meshblock_tree.hpp
mesh/meshblock.cpp

outputs/ascent.cpp
@@ -183,10 +185,16 @@ add_library(parthenon
outputs/outputs.cpp
outputs/outputs.hpp
outputs/parthenon_hdf5.cpp
outputs/parthenon_hdf5_attributes.cpp
outputs/parthenon_hdf5_attributes_read.cpp
outputs/parthenon_hdf5_attributes_write.cpp
outputs/parthenon_hdf5_types.hpp
outputs/parthenon_xdmf.cpp
outputs/parthenon_hdf5.hpp
outputs/parthenon_xdmf.hpp
outputs/restart.cpp
outputs/restart.hpp
outputs/restart_hdf5.cpp
outputs/restart_hdf5.hpp
outputs/vtk.cpp

parthenon/driver.hpp
@@ -226,6 +234,7 @@ add_library(parthenon
utils/bit_hacks.hpp
utils/buffer_utils.cpp
utils/buffer_utils.hpp
utils/cell_center_offsets.hpp
utils/change_rundir.cpp
utils/communication_buffer.hpp
utils/cleantypes.hpp
@@ -264,6 +273,8 @@ add_library(parthenon
parameter_input.cpp
parameter_input.hpp
parthenon_array_generic.hpp
parthenon_arrays.cpp
parthenon_arrays.hpp
parthenon_manager.cpp
parthenon_manager.hpp
parthenon_mpi.hpp
8 changes: 7 additions & 1 deletion src/application_input.hpp
@@ -1,5 +1,5 @@
//========================================================================================
// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved.
// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved.
//
// This program was produced under U.S. Government contract 89233218CNA000001 for Los
// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -36,12 +36,17 @@ struct ApplicationInput {
std::function<void(Mesh *, ParameterInput *)> InitUserMeshData = nullptr;
std::function<void(Mesh *, ParameterInput *, MeshData<Real> *)> MeshProblemGenerator =
nullptr;
std::function<void(Mesh *, ParameterInput *, MeshData<Real> *)> MeshPostInitialization =
nullptr;

std::function<void(Mesh *, ParameterInput *, SimTime &)> PreStepMeshUserWorkInLoop =
nullptr;
std::function<void(Mesh *, ParameterInput *, SimTime const &)>
PostStepMeshUserWorkInLoop = nullptr;

std::function<void(Mesh *, ParameterInput *, SimTime const &)>
UserMeshWorkBeforeOutput = nullptr;

std::function<void(Mesh *, ParameterInput *, SimTime const &)>
PreStepDiagnosticsInLoop = nullptr;
std::function<void(Mesh *, ParameterInput *, SimTime const &)>
@@ -57,6 +62,7 @@ struct ApplicationInput {
InitApplicationMeshBlockData = nullptr;
std::function<void(MeshBlock *, ParameterInput *)> InitMeshBlockUserData = nullptr;
std::function<void(MeshBlock *, ParameterInput *)> ProblemGenerator = nullptr;
std::function<void(MeshBlock *, ParameterInput *)> PostInitialization = nullptr;
std::function<void(MeshBlock *, ParameterInput *)> MeshBlockUserWorkBeforeOutput =
nullptr;
};
14 changes: 9 additions & 5 deletions src/basic_types.hpp
@@ -1,5 +1,5 @@
//========================================================================================
// (C) (or copyright) 2021-2023. Triad National Security, LLC. All rights reserved.
// (C) (or copyright) 2021-2024. Triad National Security, LLC. All rights reserved.
//
// This program was produced under U.S. Government contract 89233218CNA000001 for Los
// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -15,6 +15,7 @@

#include <limits>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>

@@ -49,7 +50,7 @@ enum class TaskStatus { complete, incomplete, iterate };

enum class AmrTag : int { derefine = -1, same = 0, refine = 1 };
enum class RefinementOp_t { Prolongation, Restriction, None };

enum class CellLevel : int { coarse = -1, same = 0, fine = 1 };
// JMM: Not clear this is the best place for this but it minimizes
// circular dependency nonsense.
constexpr int NUM_BNDRY_TYPES = 10;
@@ -152,16 +153,17 @@ inline std::vector<TopologicalElement> GetTopologicalElements(TopologicalType tt
if (tt == TT::Face) return {TE::F1, TE::F2, TE::F3};
return {TE::CC};
}

using TE = TopologicalElement;
// Returns one if the I coordinate of el is offset from the zone center coordinates,
// and zero otherwise
KOKKOS_INLINE_FUNCTION int TopologicalOffsetI(TE el) noexcept {
inline constexpr int TopologicalOffsetI(TE el) {
return (el == TE::F1 || el == TE::E2 || el == TE::E3 || el == TE::NN);
}
KOKKOS_INLINE_FUNCTION int TopologicalOffsetJ(TE el) noexcept {
inline constexpr int TopologicalOffsetJ(TE el) {
return (el == TE::F2 || el == TE::E3 || el == TE::E1 || el == TE::NN);
}
KOKKOS_INLINE_FUNCTION int TopologicalOffsetK(TE el) noexcept {
inline constexpr int TopologicalOffsetK(TE el) {
return (el == TE::F3 || el == TE::E2 || el == TE::E1 || el == TE::NN);
}

@@ -207,6 +209,8 @@ struct SimTime {
template <typename T>
using Dictionary = std::unordered_map<std::string, T>;

template <typename T>
using Triple_t = std::tuple<T, T, T>;
} // namespace parthenon

#endif // BASIC_TYPES_HPP_
2 changes: 1 addition & 1 deletion src/bvals/boundary_conditions.cpp
@@ -16,7 +16,7 @@

#include "bvals/boundary_conditions.hpp"
#include "bvals/boundary_conditions_generic.hpp"
#include "bvals/bvals_interfaces.hpp"
#include "bvals/neighbor_block.hpp"
#include "defs.hpp"
#include "interface/meshblock_data.hpp"
#include "mesh/domain.hpp"
48 changes: 36 additions & 12 deletions src/bvals/boundary_conditions_generic.hpp
@@ -1,5 +1,5 @@
//========================================================================================
// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved.
// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved.
//
// This program was produced under U.S. Government contract 89233218CNA000001 for Los
// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -18,6 +18,9 @@
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>

#include "basic_types.hpp"
@@ -34,6 +37,35 @@ namespace BoundaryFunction {
enum class BCSide { Inner, Outer };
enum class BCType { Outflow, Reflect, ConstantDeriv, Fixed, FixedFace };

namespace impl {
using desc_key_t = std::tuple<bool, TopologicalType>;
template <class... var_ts>
using map_bc_pack_descriptor_t =
std::unordered_map<desc_key_t, typename SparsePack<var_ts...>::Descriptor,
tuple_hash<desc_key_t>>;

template <class... var_ts>
map_bc_pack_descriptor_t<var_ts...>
GetPackDescriptorMap(std::shared_ptr<MeshBlockData<Real>> &rc) {
std::vector<std::pair<TopologicalType, MetadataFlag>> elements{
{TopologicalType::Cell, Metadata::Cell},
{TopologicalType::Face, Metadata::Face},
{TopologicalType::Edge, Metadata::Edge},
{TopologicalType::Node, Metadata::Node}};
map_bc_pack_descriptor_t<var_ts...> my_map;
for (auto [tt, md] : elements) {
std::vector<MetadataFlag> flags{Metadata::FillGhost};
flags.push_back(md);
std::set<PDOpt> opts{PDOpt::Coarse};
my_map.emplace(std::make_pair(desc_key_t{true, tt},
MakePackDescriptor<var_ts...>(rc.get(), flags, opts)));
my_map.emplace(std::make_pair(desc_key_t{false, tt},
MakePackDescriptor<var_ts...>(rc.get(), flags)));
}
return my_map;
}
} // namespace impl

template <CoordinateDirection DIR, BCSide SIDE, BCType TYPE, class... var_ts>
void GenericBC(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse,
TopologicalElement el, Real val) {
@@ -46,17 +78,9 @@ void GenericBC(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse,
constexpr bool X3 = (DIR == X3DIR);
constexpr bool INNER = (SIDE == BCSide::Inner);

std::vector<MetadataFlag> flags{Metadata::FillGhost};
if (GetTopologicalType(el) == TopologicalType::Cell) flags.push_back(Metadata::Cell);
if (GetTopologicalType(el) == TopologicalType::Face) flags.push_back(Metadata::Face);
if (GetTopologicalType(el) == TopologicalType::Edge) flags.push_back(Metadata::Edge);
if (GetTopologicalType(el) == TopologicalType::Node) flags.push_back(Metadata::Node);

std::set<PDOpt> opts;
if (coarse) opts = {PDOpt::Coarse};
auto desc = MakePackDescriptor<var_ts...>(
rc->GetBlockPointer()->pmy_mesh->resolved_packages.get(), flags, opts);
auto q = desc.GetPack(rc.get());
static auto descriptors = impl::GetPackDescriptorMap<var_ts...>(rc);
auto q =
descriptors[impl::desc_key_t{coarse, GetTopologicalType(el)}].GetPack(rc.get());
const int b = 0;
const int lstart = q.GetLowerBoundHost(b);
const int lend = q.GetUpperBoundHost(b);
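Here `GenericBC` stops rebuilding a pack descriptor on every boundary call: a function-local `static` map, keyed by `(coarse, topological type)`, is constructed once and reused. A lazy Python model of the same build-once cache (key names and the `build` callback are illustrative, not Parthenon's API):

```python
_descriptor_cache = {}


def get_descriptor(coarse, topological_type, build):
    """Build each (coarse, topological_type) descriptor at most once,
    then serve the cached object on every later call."""
    key = (coarse, topological_type)
    if key not in _descriptor_cache:
        _descriptor_cache[key] = build(key)
    return _descriptor_cache[key]


calls = []


def build(key):
    calls.append(key)  # record how often construction actually happens
    return f"descriptor{key}"


d1 = get_descriptor(True, "Cell", build)
d2 = get_descriptor(True, "Cell", build)  # cache hit: build not called again
```

The C++ version builds all eight `(coarse, type)` entries eagerly in the static initializer rather than lazily, but the payoff is the same: descriptor construction cost is paid once per template instantiation instead of once per boundary application.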
5 changes: 2 additions & 3 deletions src/bvals/boundary_flag.cpp
@@ -90,9 +90,8 @@ std::string GetBoundaryString(BoundaryFlag input_flag) {
//! \fn CheckBoundaryFlag(BoundaryFlag block_flag, CoordinateDirection dir)
// \brief Called in each MeshBlock's BoundaryValues() constructor. Mesh() ctor only
// checks the validity of user's input mesh/ixn_bc, oxn_bc string values corresponding to
// a BoundaryFlag enumerator before passing it to a MeshBlock and then BoundaryBase
// object. However, not all BoundaryFlag enumerators can be used in all directions as a
// valid MeshBlock boundary.
// a BoundaryFlag enumerator before passing it to a MeshBlock. However, not all
// BoundaryFlag enumerators can be used in all directions as a valid MeshBlock boundary.

void CheckBoundaryFlag(BoundaryFlag block_flag, CoordinateDirection dir) {
std::stringstream msg;
181 changes: 110 additions & 71 deletions src/bvals/bvals.cpp
@@ -14,8 +14,6 @@
// license in this material to reproduce, prepare derivative works, distribute copies to
// the public, perform publicly and display publicly, and to permit others to do so.
//========================================================================================
//! \file bvals.cpp
// \brief constructor/destructor and utility functions for BoundaryValues class

#include "bvals/bvals.hpp"

@@ -45,93 +43,131 @@

namespace parthenon {

// BoundaryValues constructor (the first object constructed inside the MeshBlock()
// constructor): sets functions for the appropriate boundary conditions at each of the 6
// dirs of a MeshBlock
BoundaryValues::BoundaryValues(std::weak_ptr<MeshBlock> wpmb, BoundaryFlag *input_bcs,
ParameterInput *pin)
: BoundaryBase(wpmb.lock()->pmy_mesh, wpmb.lock()->loc, wpmb.lock()->block_size,
input_bcs),
pmy_block_(wpmb) {
// Check BC functions for each of the 6 boundaries in turn ---------------------
for (int i = 0; i < 6; i++) {
switch (block_bcs[i]) {
case BoundaryFlag::reflect:
case BoundaryFlag::outflow:
apply_bndry_fn_[i] = true;
break;
default: // already initialized to false in class
break;
}
}
// Inner x1
nface_ = 2;
nedge_ = 0;
CheckBoundaryFlag(block_bcs[BoundaryFace::inner_x1], CoordinateDirection::X1DIR);
CheckBoundaryFlag(block_bcs[BoundaryFace::outer_x1], CoordinateDirection::X1DIR);

std::shared_ptr<MeshBlock> pmb = GetBlockPointer();
if (!pmb->block_size.symmetry(X2DIR)) {
nface_ = 4;
nedge_ = 4;
CheckBoundaryFlag(block_bcs[BoundaryFace::inner_x2], CoordinateDirection::X2DIR);
CheckBoundaryFlag(block_bcs[BoundaryFace::outer_x2], CoordinateDirection::X2DIR);
}
BoundarySwarm::BoundarySwarm(std::weak_ptr<MeshBlock> pmb, const std::string &label)
: bswarm_index(), pmy_block(pmb), pmy_mesh_(pmb.lock()->pmy_mesh) {
#ifdef MPI_PARALLEL
swarm_comm = pmy_mesh_->GetMPIComm(label);
#endif
InitBoundaryData(bd_var_);
}

if (!pmb->block_size.symmetry(X3DIR)) {
nface_ = 6;
nedge_ = 12;
CheckBoundaryFlag(block_bcs[BoundaryFace::inner_x3], CoordinateDirection::X3DIR);
CheckBoundaryFlag(block_bcs[BoundaryFace::outer_x3], CoordinateDirection::X3DIR);
void BoundarySwarm::InitBoundaryData(BoundaryData<> &bd) {
auto pmb = GetBlockPointer();
BufferID buffer_id(pmb->pmy_mesh->ndim, pmb->pmy_mesh->multilevel);
bd.nbmax = buffer_id.size();

for (int n = 0; n < bd.nbmax; n++) {
bd.flag[n] = BoundaryStatus::waiting;
#ifdef MPI_PARALLEL
bd.req_send[n] = MPI_REQUEST_NULL;
bd.req_recv[n] = MPI_REQUEST_NULL;
#endif
}

// prevent reallocation of contiguous memory space for each of 4x possible calls to
// std::vector<BoundaryVariable *>.push_back() in Field, PassiveScalars
bvars.reserve(3);
}

// destructor

//----------------------------------------------------------------------------------------
//! \fn void BoundaryValues::SetupPersistentMPI()
// \brief Setup persistent MPI requests to be reused throughout the entire simulation
void BoundarySwarm::SetupPersistentMPI() {
#ifdef MPI_PARALLEL
std::shared_ptr<MeshBlock> pmb = GetBlockPointer();

void BoundaryValues::SetupPersistentMPI() {
for (auto bvars_it = bvars.begin(); bvars_it != bvars.end(); ++bvars_it) {
(*bvars_it).second->SetupPersistentMPI();
// Initialize neighbor communications to other ranks
for (int n = 0; n < pmb->neighbors.size(); n++) {
NeighborBlock &nb = pmb->neighbors[n];
// Neighbor on different MPI process
if (nb.snb.rank != Globals::my_rank) {
send_tag[nb.bufid] = pmb->pmy_mesh->tag_map.GetTag(pmb.get(), nb);
recv_tag[nb.bufid] = pmb->pmy_mesh->tag_map.GetTag(pmb.get(), nb);
if (bd_var_.req_send[nb.bufid] != MPI_REQUEST_NULL) {
MPI_Request_free(&bd_var_.req_send[nb.bufid]);
}
if (bd_var_.req_recv[nb.bufid] != MPI_REQUEST_NULL) {
MPI_Request_free(&bd_var_.req_recv[nb.bufid]);
}
}
}
#endif
}

//----------------------------------------------------------------------------------------
//! \fn void BoundaryValues::StartReceiving(BoundaryCommSubset phase)
// \brief initiate MPI_Irecv()

void BoundaryValues::StartReceiving(BoundaryCommSubset phase) {
for (auto bvars_it = bvars.begin(); bvars_it != bvars.end(); ++bvars_it) {
(*bvars_it).second->StartReceiving(phase);
// Send particle buffers across meshblocks. If different MPI ranks, use MPI, if same rank,
// do a deep copy on device.
void BoundarySwarm::Send(BoundaryCommSubset phase) {
std::shared_ptr<MeshBlock> pmb = GetBlockPointer();
// Fence to make sure buffers are loaded before sending
pmb->exec_space.fence();
for (int n = 0; n < pmb->neighbors.size(); n++) {
NeighborBlock &nb = pmb->neighbors[n];
if (nb.snb.rank != Globals::my_rank) {
#ifdef MPI_PARALLEL
PARTHENON_REQUIRE(bd_var_.req_send[nb.bufid] == MPI_REQUEST_NULL,
"Trying to create a new send before previous send completes!");
PARTHENON_MPI_CHECK(MPI_Isend(bd_var_.send[nb.bufid].data(), send_size[nb.bufid],
MPI_PARTHENON_REAL, nb.snb.rank, send_tag[nb.bufid],
swarm_comm, &(bd_var_.req_send[nb.bufid])));
#endif // MPI_PARALLEL
} else {
MeshBlock &target_block = *pmy_mesh_->FindMeshBlock(nb.snb.gid);
std::shared_ptr<BoundarySwarm> ptarget_bswarm =
target_block.pbswarm->bswarms[bswarm_index];
if (send_size[nb.bufid] > 0) {
// Ensure target buffer is large enough
if (bd_var_.send[nb.bufid].extent(0) >
ptarget_bswarm->bd_var_.recv[nb.targetid].extent(0)) {
ptarget_bswarm->bd_var_.recv[nb.targetid] =
BufArray1D<Real>("Buffer", (bd_var_.send[nb.bufid].extent(0)));
}

target_block.deep_copy(ptarget_bswarm->bd_var_.recv[nb.targetid],
bd_var_.send[nb.bufid]);
ptarget_bswarm->recv_size[nb.targetid] = send_size[nb.bufid];
ptarget_bswarm->bd_var_.flag[nb.targetid] = BoundaryStatus::arrived;
} else {
ptarget_bswarm->recv_size[nb.targetid] = 0;
ptarget_bswarm->bd_var_.flag[nb.targetid] = BoundaryStatus::completed;
}
}
}
}
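The same-rank branch above reduces to a resize-then-copy handshake on the target's receive buffer: grow the buffer only when the payload would not fit, copy, then record the size and status. A minimal sketch of that control flow in plain C++, using `std::vector` and a simplified `BoundaryStatus` as hypothetical stand-ins for `BufArray1D<Real>` and the real flag handling:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Simplified stand-in for the boundary status flags used above.
enum class BoundaryStatus { waiting, arrived, completed };

// Stand-in for the target block's swarm receive-side state.
struct SwarmBuffer {
  std::vector<double> recv;  // stand-in for BufArray1D<Real>
  int recv_size = 0;
  BoundaryStatus flag = BoundaryStatus::waiting;
};

// Mirrors the same-rank branch of BoundarySwarm::Send: grow the target
// buffer only when the payload is larger, copy the data, and mark it
// arrived (or completed when there is nothing to send).
void LocalSend(const std::vector<double> &send, int send_size,
               SwarmBuffer &target) {
  if (send_size > 0) {
    if (send.size() > target.recv.size()) {
      target.recv.resize(send.size());
    }
    std::copy(send.begin(), send.end(), target.recv.begin());
    target.recv_size = send_size;
    target.flag = BoundaryStatus::arrived;
  } else {
    target.recv_size = 0;
    target.flag = BoundaryStatus::completed;
  }
}
```

In the real code the copy happens on device via `target_block.deep_copy`; only the branching logic is reproduced here.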

//----------------------------------------------------------------------------------------
//! \fn void BoundaryValues::ClearBoundary(BoundaryCommSubset phase)
// \brief clean up the boundary flags after each loop

void BoundaryValues::ClearBoundary(BoundaryCommSubset phase) {
// Note BoundaryCommSubset::mesh_init corresponds to initial exchange of conserved fluid
// variables and magnetic fields
for (auto bvars_it = bvars.begin(); bvars_it != bvars.end(); ++bvars_it) {
(*bvars_it).second->ClearBoundary(phase);
void BoundarySwarm::Receive(BoundaryCommSubset phase) {
#ifdef MPI_PARALLEL
std::shared_ptr<MeshBlock> pmb = GetBlockPointer();
const int &mylevel = pmb->loc.level();
for (int n = 0; n < pmb->neighbors.size(); n++) {
NeighborBlock &nb = pmb->neighbors[n];
if (nb.snb.rank != Globals::my_rank) {
// Check to see if we got a message
int test;
MPI_Status status;

if (bd_var_.flag[nb.bufid] != BoundaryStatus::completed) {
PARTHENON_MPI_CHECK(
MPI_Iprobe(nb.snb.rank, recv_tag[nb.bufid], swarm_comm, &test, &status));
if (!static_cast<bool>(test)) {
bd_var_.flag[nb.bufid] = BoundaryStatus::waiting;
} else {
bd_var_.flag[nb.bufid] = BoundaryStatus::arrived;

// If message is available, receive it
PARTHENON_MPI_CHECK(
MPI_Get_count(&status, MPI_PARTHENON_REAL, &(recv_size[nb.bufid])));
if (recv_size[nb.bufid] > bd_var_.recv[nb.bufid].extent(0)) {
bd_var_.recv[nb.bufid] = BufArray1D<Real>("Buffer", recv_size[nb.bufid]);
}
PARTHENON_MPI_CHECK(MPI_Recv(bd_var_.recv[nb.bufid].data(), recv_size[nb.bufid],
MPI_PARTHENON_REAL, nb.snb.rank,
recv_tag[nb.bufid], swarm_comm, &status));
}
}
}
}
#endif
}

// BoundarySwarms constructor (the first object constructed inside the MeshBlock()
// constructor): sets functions for the appropriate boundary conditions at each of the 6
// dirs of a MeshBlock
BoundarySwarms::BoundarySwarms(std::weak_ptr<MeshBlock> wpmb, BoundaryFlag *input_bcs,
ParameterInput *pin)
: BoundaryBase(wpmb.lock()->pmy_mesh, wpmb.lock()->loc, wpmb.lock()->block_size,
input_bcs),
pmy_block_(wpmb) {
: pmy_block_(wpmb) {
// Check BC functions for each of the 6 boundaries in turn ---------------------
// TODO(BRR) Add physical particle boundary conditions, maybe using the below code
/*for (int i = 0; i < 6; i++) {
@@ -144,6 +180,9 @@ BoundarySwarms::BoundarySwarms(std::weak_ptr<MeshBlock> wpmb, BoundaryFlag *inpu
break;
}
}*/
for (int i = 0; i < 6; ++i)
block_bcs[i] = input_bcs[i];

// Inner x1
nface_ = 2;
nedge_ = 0;
@@ -167,7 +206,7 @@ BoundarySwarms::BoundarySwarms(std::weak_ptr<MeshBlock> wpmb, BoundaryFlag *inpu
}

//----------------------------------------------------------------------------------------
//! \fn void BoundaryValues::SetupPersistentMPI()
//! \fn void BoundarySwarms::SetupPersistentMPI()
// \brief Setup persistent MPI requests to be reused throughout the entire simulation

void BoundarySwarms::SetupPersistentMPI() {
177 changes: 79 additions & 98 deletions src/bvals/bvals.hpp
@@ -17,7 +17,7 @@
#ifndef BVALS_BVALS_HPP_
#define BVALS_BVALS_HPP_
//! \file bvals.hpp
// \brief defines BoundaryBase, BoundaryValues classes used for setting BCs on all data
// \brief defines BoundarySwarms

#include <memory>
#include <string>
@@ -27,9 +27,9 @@
#include "basic_types.hpp"
#include "parthenon_mpi.hpp"

#include "bvals/bvals_interfaces.hpp"
#include "bvals/comms/bnd_info.hpp"
#include "bvals/comms/bvals_in_one.hpp"
#include "bvals/neighbor_block.hpp"
#include "defs.hpp"
#include "mesh/domain.hpp"
#include "parthenon_arrays.hpp"
@@ -39,7 +39,7 @@ namespace parthenon {

// forward declarations
// TODO(felker): how many of these forward declarations are actually needed now?
// Can #include "./bvals_interfaces.hpp" suffice?
// Can #include "./neighbor_block.hpp" suffice?
template <typename T>
class Variable;
class Mesh;
@@ -54,58 +54,95 @@ std::string GetBoundaryString(BoundaryFlag input_flag);
// + confirming that the MeshBlock's boundaries are all valid selections
void CheckBoundaryFlag(BoundaryFlag block_flag, CoordinateDirection dir);

// identifiers for status of MPI boundary communications
enum class BoundaryStatus { waiting, arrived, completed };

//----------------------------------------------------------------------------------------
//! \struct BoundaryData
// \brief structure storing boundary information

template <int n = NMAX_NEIGHBORS>
struct BoundaryData { // aggregate and POD (even when MPI_PARALLEL is defined)
static constexpr int kMaxNeighbor = n;
// KGF: "nbmax" only used in bvals_var.cpp, Init/DestroyBoundaryData()
int nbmax; // actual maximum number of neighboring MeshBlocks
// currently, sflag[] is only used by Multigrid (send buffers are reused each stage in
// red-black comm. pattern; need to check if they are available)
BoundaryStatus flag[kMaxNeighbor], sflag[kMaxNeighbor];
BufArray1D<Real> buffers;
BufArray1D<Real> send[kMaxNeighbor], recv[kMaxNeighbor];
// host mirror view of recv
BufArray1D<Real>::host_mirror_type recv_h[kMaxNeighbor];
int recv_size[kMaxNeighbor];
#ifdef MPI_PARALLEL
MPI_Request req_send[kMaxNeighbor], req_recv[kMaxNeighbor];
#endif
};

//----------------------------------------------------------------------------------------
//! \class BoundaryBase
// \brief Base class for all BoundaryValues classes (BoundaryValues and MGBoundaryValues)
//! \class BoundaryCommunication
// \brief contains methods for managing BoundaryStatus flags and MPI requests

class BoundaryBase {
class BoundaryCommunication {
public:
BoundaryBase(Mesh *pm, LogicalLocation iloc, RegionSize isize, BoundaryFlag *input_bcs);
virtual ~BoundaryBase() = default;
// 1x pair (neighbor index, buffer ID) per entire SET of separate variable buffers
// (Field, Passive Scalar, etc.). Greedy allocation for worst-case
// of refined 3D; only 26 entries needed/initialized if unrefined 3D, e.g.
static NeighborIndexes ni[NMAX_NEIGHBORS];
static int bufid[NMAX_NEIGHBORS];

NeighborBlock neighbor[NMAX_NEIGHBORS];
int nneighbor;
int nblevel[3][3][3];
LogicalLocation loc;
BoundaryFlag block_bcs[6];
BoundaryCommunication() {}
virtual ~BoundaryCommunication() {}
// create unique tags for each MeshBlock/buffer/quantity and initialize MPI requests:
virtual void SetupPersistentMPI() = 0;
// call MPI_Start() on req_recv[]
virtual void StartReceiving(BoundaryCommSubset phase) = 0;
// call MPI_Wait() on req_send[] and set flag[] to BoundaryStatus::waiting
virtual void ClearBoundary(BoundaryCommSubset phase) = 0;
};

static int CreateBvalsMPITag(int lid, int bufid);
static int CreateBufferID(int ox1, int ox2, int ox3, int fi1, int fi2);
static int BufferID(int dim, bool multilevel);
static int FindBufferID(int ox1, int ox2, int ox3, int fi1, int fi2);
//----------------------------------------------------------------------------------------
//! \class BoundarySwarm
class BoundarySwarm : public BoundaryCommunication {
public:
explicit BoundarySwarm(std::weak_ptr<MeshBlock> pmb, const std::string &label);
~BoundarySwarm() = default;

void
SearchAndSetNeighbors(Mesh *mesh, MeshBlockTree &tree, int *ranklist, int *nslist,
const std::unordered_set<LogicalLocation> &newly_refined = {});
std::vector<ParArrayND<int>> vars_int;
std::vector<ParArrayND<Real>> vars_real;

protected:
// 1D refined or unrefined=2
// 2D refined=12, unrefined=8
// 3D refined=56, unrefined=26. Refinement adds: 3*6 faces + 1*12 edges = +30 neighbors
static int maxneighbor_;
// (usually the std::size_t unsigned integer type)
std::vector<BoundaryCommunication *>::size_type bswarm_index;

// BoundaryCommunication
void SetupPersistentMPI() final;
void StartReceiving(BoundaryCommSubset phase) final{};
void ClearBoundary(BoundaryCommSubset phase) final{};
void Receive(BoundaryCommSubset phase);
void Send(BoundaryCommSubset phase);

BoundaryData<> bd_var_;
std::weak_ptr<MeshBlock> pmy_block;
Mesh *pmy_mesh_;
RegionSize block_size_;
ParArrayND<Real> sarea_[2];
int send_tag[NMAX_NEIGHBORS], recv_tag[NMAX_NEIGHBORS];
int particle_size, send_size[NMAX_NEIGHBORS], recv_size[NMAX_NEIGHBORS];

void
SetNeighborOwnership(const std::unordered_set<LogicalLocation> &newly_refined = {});
protected:
int nl_, nu_;
void InitBoundaryData(BoundaryData<> &bd);

private:
// calculate 3x shared static data members when constructing only the 1st class instance
// int maxneighbor_=BufferID() computes ni[] and then calls bufid[]=CreateBufferID()
static bool called_;
std::shared_ptr<MeshBlock> GetBlockPointer() {
if (pmy_block.expired()) {
PARTHENON_THROW("Invalid pointer to MeshBlock!");
}
return pmy_block.lock();
}

#ifdef MPI_PARALLEL
// Unique communicator for this swarm.
MPI_Comm swarm_comm;
#endif
};
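`GetBlockPointer` above is a small but important pattern: the swarm observes its MeshBlock through a `std::weak_ptr` (so block and boundary object do not keep each other alive in a `shared_ptr` cycle), and every access locks the pointer with a liveness check. A standalone sketch, with a hypothetical `MeshBlock` stand-in and a plain exception in place of `PARTHENON_THROW`:

```cpp
#include <cassert>
#include <memory>
#include <stdexcept>

// Hypothetical stand-in for the Parthenon MeshBlock class.
struct MeshBlock {
  int gid = 0;
};

struct SwarmLike {
  // weak_ptr: observe the block without owning it, so
  // MeshBlock -> swarm -> MeshBlock is not an ownership cycle.
  std::weak_ptr<MeshBlock> pmy_block;

  std::shared_ptr<MeshBlock> GetBlockPointer() const {
    if (pmy_block.expired()) {
      throw std::runtime_error("Invalid pointer to MeshBlock!");
    }
    return pmy_block.lock();
  }
};
```

The lock-and-check keeps a stale swarm from silently dereferencing a destroyed block, turning a use-after-free into a loud error.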

//----------------------------------------------------------------------------------------
//! \class BoundarySwarms
// \brief centralized class for interacting with each individual swarm boundary data
class BoundarySwarms : public BoundaryBase, BoundaryCommunication {
class BoundarySwarms : public BoundaryCommunication {
public:
BoundarySwarms(std::weak_ptr<MeshBlock> pmb, BoundaryFlag *input_bcs,
ParameterInput *pin);
@@ -119,7 +156,7 @@ class BoundarySwarms : public BoundaryBase, BoundaryCommunication {
}
}

// inherited functions (interface shared with BoundaryVariable objects):
// inherited functions:
// ------
// called before time-stepper:
void SetupPersistentMPI() final; // setup MPI requests
@@ -129,9 +166,10 @@ class BoundarySwarms : public BoundaryBase, BoundaryCommunication {
void ClearBoundary(BoundaryCommSubset phase) final {}

private:
// ptr to MeshBlock containing this BoundaryValues
// ptr to MeshBlock containing this BoundarySwarms
std::weak_ptr<MeshBlock> pmy_block_;
int nface_, nedge_;
BoundaryFlag block_bcs[6];

// if a BoundaryPhysics or user fn should be applied at each MeshBlock boundary
// false --> e.g. block, polar, periodic boundaries
@@ -152,63 +190,6 @@ class BoundarySwarms : public BoundaryBase, BoundaryCommunication {
friend class BoundarySwarm;
};

//----------------------------------------------------------------------------------------
//! \class BoundaryValues
// \brief centralized class for interacting with each individual variable boundary data
// (design pattern ~ mediator)

class BoundaryValues : public BoundaryBase, // public BoundaryPhysics,
public BoundaryCommunication {
public:
BoundaryValues(std::weak_ptr<MeshBlock> pmb, BoundaryFlag *input_bcs,
ParameterInput *pin);

// Dictionary of boundary variable pointers indexed by the variable label
Dictionary<std::shared_ptr<BoundaryVariable>> bvars;

void SetBoundaryFlags(BoundaryFlag bc_flag[]) {
for (int i = 0; i < 6; i++)
bc_flag[i] = block_bcs[i];
}

// inherited functions (interface shared with BoundaryVariable objects):
// ------
// called before time-stepper:
void SetupPersistentMPI() final; // setup MPI requests

// called before and during time-stepper:
void StartReceiving(BoundaryCommSubset phase) final;
void ClearBoundary(BoundaryCommSubset phase) final;

private:
// ptr to MeshBlock containing this BoundaryValues
std::weak_ptr<MeshBlock> pmy_block_;
int nface_, nedge_; // used only in fc/flux_correction_fc.cpp calculations

// if a BoundaryPhysics or user fn should be applied at each MeshBlock boundary
// false --> e.g. block, polar, periodic boundaries
bool apply_bndry_fn_[6]{}; // C++11: in-class initializer of non-static member
// C++11: direct-list-initialization -> value init of array -> zero init of each scalar

/// Returns shared pointer to a block
std::shared_ptr<MeshBlock> GetBlockPointer() {
if (pmy_block_.expired()) {
PARTHENON_THROW("Invalid pointer to MeshBlock!");
}
return pmy_block_.lock();
}

// temporary--- Added by @tomidakn on 2015-11-27 in f0f989f85f
// TODO(KGF): consider removing this friendship designation
friend class Mesh;
// currently, this class friendship is required for copying send/recv buffers between
// BoundaryVariable objects within different MeshBlocks on the same MPI rank:
friend class BoundaryVariable;
friend class FaceCenteredBoundaryVariable; // needs nface_, nedge_, num_north/south_...
// TODO(KGF): consider removing these friendship designations:
friend class BoundarySwarm;
};

} // namespace parthenon

#endif // BVALS_BVALS_HPP_
682 changes: 0 additions & 682 deletions src/bvals/bvals_base.cpp

This file was deleted.

341 changes: 0 additions & 341 deletions src/bvals/bvals_interfaces.hpp

This file was deleted.

162 changes: 0 additions & 162 deletions src/bvals/bvals_swarm.cpp

This file was deleted.

214 changes: 0 additions & 214 deletions src/bvals/bvals_var.cpp

This file was deleted.

10 changes: 3 additions & 7 deletions src/bvals/comms/bnd_info.cpp
@@ -22,8 +22,8 @@
#include <vector>

#include "basic_types.hpp"
#include "bvals/bvals_interfaces.hpp"
#include "bvals/comms/bnd_info.hpp"
#include "bvals/neighbor_block.hpp"
#include "config.hpp"
#include "globals.hpp"
#include "interface/state_descriptor.hpp"
@@ -408,12 +408,8 @@ ProResInfo ProResInfo::GetSet(MeshBlock *pmb, const NeighborBlock &nb,
// regions that were filled by coarser neighbors
bool restricted = false;
if (mylevel > 0) {
for (int k = 0; k < 3; ++k) {
for (int j = 0; j < 3; ++j) {
for (int i = 0; i < 3; ++i) {
restricted = restricted || (pmb->pbval->nblevel[k][j][i] == (mylevel - 1));
}
}
for (const auto &nb : pmb->neighbors) {
restricted = restricted || (nb.loc.level() == (mylevel - 1));
}
}

2 changes: 1 addition & 1 deletion src/bvals/comms/bnd_info.hpp
@@ -23,7 +23,7 @@
#include <vector>

#include "basic_types.hpp"
#include "bvals/bvals_interfaces.hpp"
#include "bvals/neighbor_block.hpp"
#include "coordinates/coordinates.hpp"
#include "interface/variable_state.hpp"
#include "mesh/domain.hpp"
2 changes: 1 addition & 1 deletion src/bvals/comms/bvals_in_one.hpp
@@ -23,7 +23,7 @@
#include <vector>

#include "basic_types.hpp"
#include "bvals/bvals_interfaces.hpp"
#include "bvals/neighbor_block.hpp"
#include "coordinates/coordinates.hpp"

#include "tasks/tasks.hpp"
42 changes: 34 additions & 8 deletions src/bvals/comms/tag_map.cpp
@@ -35,13 +35,36 @@ TagMap::rank_pair_t TagMap::MakeChannelPair(const MeshBlock *pmb,
}
template <BoundaryType BOUND>
void TagMap::AddMeshDataToMap(std::shared_ptr<MeshData<Real>> &md) {
ForEachBoundary<BOUND>(md, [&](auto pmb, sp_mbd_t rc, nb_t &nb, const sp_cv_t v) {
const int other_rank = nb.snb.rank;
if (map_.count(other_rank) < 1) map_[other_rank] = rank_pair_map_t();
auto &pair_map = map_[other_rank];
// Add channel key with an invalid tag
pair_map[MakeChannelPair(pmb, nb)] = -1;
});
for (int block = 0; block < md->NumBlocks(); ++block) {
auto &rc = md->GetBlockData(block);
auto pmb = rc->GetBlockPointer();
// The "type_t var = []{...}()" pattern defines and immediately invokes a
// lambda, so var is initialized exactly once from the branch that applies.
auto *neighbors = [&pmb, &md] {
if constexpr (BOUND == BoundaryType::gmg_restrict_send)
return &(pmb->gmg_coarser_neighbors);
if constexpr (BOUND == BoundaryType::gmg_restrict_recv)
return &(pmb->gmg_finer_neighbors);
if constexpr (BOUND == BoundaryType::gmg_prolongate_send)
return &(pmb->gmg_finer_neighbors);
if constexpr (BOUND == BoundaryType::gmg_prolongate_recv)
return &(pmb->gmg_coarser_neighbors);
if constexpr (BOUND == BoundaryType::gmg_same)
return pmb->loc.level() == md->grid.logical_level
? &(pmb->gmg_same_neighbors)
: &(pmb->gmg_composite_finer_neighbors);
return &(pmb->neighbors);
}();
for (auto &nb : *neighbors) {
const int other_rank = nb.snb.rank;
if (map_.count(other_rank) < 1) map_[other_rank] = rank_pair_map_t();
auto &pair_map = map_[other_rank];
// Add channel key with an invalid tag
pair_map[MakeChannelPair(pmb, nb)] = -1;
}
}
}
template void
TagMap::AddMeshDataToMap<BoundaryType::any>(std::shared_ptr<MeshData<Real>> &md);
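The immediately-invoked lambda above lets `if constexpr` select the right neighbor list at compile time while still initializing the pointer exactly once. The same idiom in a compact, self-contained form, with simplified stand-in types in place of `MeshBlock` and the GMG neighbor lists:

```cpp
#include <cassert>
#include <vector>

// Simplified stand-ins for BoundaryType and the per-block neighbor lists.
enum class BoundaryType { any, gmg_restrict_send };

struct BlockLike {
  std::vector<int> neighbors{1, 2, 3};
  std::vector<int> gmg_coarser_neighbors{4};
};

template <BoundaryType BOUND>
const std::vector<int> *SelectNeighbors(const BlockLike &pmb) {
  // Immediately-invoked lambda: the discarded if constexpr branches
  // never compile against the wrong list, and the pointer is
  // initialized once rather than reassigned in a chain of ifs.
  const auto *neighbors = [&pmb] {
    if constexpr (BOUND == BoundaryType::gmg_restrict_send)
      return &pmb.gmg_coarser_neighbors;
    return &pmb.neighbors;
  }();
  return neighbors;
}
```

The alternative (declare the pointer, then assign in runtime branches) would lose both the single-initialization guarantee and the compile-time pruning.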
@@ -80,7 +103,10 @@ void TagMap::ResolveMap() {
int TagMap::GetTag(const MeshBlock *pmb, const NeighborBlock &nb) {
const int other_rank = nb.snb.rank;
auto &pair_map = map_[other_rank];
return pair_map[MakeChannelPair(pmb, nb)];
auto cpair = MakeChannelPair(pmb, nb);
PARTHENON_REQUIRE(pair_map.count(cpair) == 1,
"Trying to get tag for key that hasn't been entered.\n");
return pair_map[cpair];
}

} // namespace parthenon
166 changes: 166 additions & 0 deletions src/bvals/neighbor_block.cpp
@@ -0,0 +1,166 @@
//========================================================================================
// Athena++ astrophysical MHD code
// Copyright(C) 2014 James M. Stone <jmstone@princeton.edu> and other code contributors
// Licensed under the 3-clause BSD License, see LICENSE file for details
//========================================================================================
// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved.
//
// This program was produced under U.S. Government contract 89233218CNA000001 for Los
// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
// for the U.S. Department of Energy/National Nuclear Security Administration. All rights
// in the program are reserved by Triad National Security, LLC, and the U.S. Department
// of Energy/National Nuclear Security Administration. The Government is granted for
// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
// license in this material to reproduce, prepare derivative works, distribute copies to
// the public, perform publicly and display publicly, and to permit others to do so.
//========================================================================================
//! \file neighbor_block.cpp
// \brief utility functions for neighbors and buffers

#include "bvals/neighbor_block.hpp"

#include <cmath>
#include <cstdlib>
#include <cstring> // memcpy()
#include <iomanip>
#include <iostream> // endl
#include <sstream> // stringstream
#include <stdexcept> // runtime_error
#include <string> // c_str()
#include <unordered_set>

#include "globals.hpp"
#include "mesh/logical_location.hpp"
#include "mesh/mesh.hpp"
#include "utils/buffer_utils.hpp"
#include "utils/error_checking.hpp"

namespace parthenon {

//----------------------------------------------------------------------------------------
//! \fn void NeighborBlock::SetNeighbor(int irank, int ilevel, int igid, int ilid,
// int iox1, int iox2, int iox3, NeighborConnect itype,
// int ibid, int itargetid, int ifi1=0, int ifi2=0)
// \brief Set neighbor information

void NeighborBlock::SetNeighbor(LogicalLocation inloc, int irank, int ilevel, int igid,
int ilid, int iox1, int iox2, int iox3,
NeighborConnect itype, int ibid, int itargetid,
int ifi1, // =0
int ifi2 // =0
) {
snb.rank = irank;
snb.level = ilevel;
snb.gid = igid;
snb.lid = ilid;
ni.ox1 = iox1;
ni.ox2 = iox2;
ni.ox3 = iox3;
ni.type = itype;
ni.fi1 = ifi1;
ni.fi2 = ifi2;
bufid = ibid;
targetid = itargetid;
loc = inloc;
if (ni.type == NeighborConnect::face) {
if (ni.ox1 == -1)
fid = BoundaryFace::inner_x1;
else if (ni.ox1 == 1)
fid = BoundaryFace::outer_x1;
else if (ni.ox2 == -1)
fid = BoundaryFace::inner_x2;
else if (ni.ox2 == 1)
fid = BoundaryFace::outer_x2;
else if (ni.ox3 == -1)
fid = BoundaryFace::inner_x3;
else if (ni.ox3 == 1)
fid = BoundaryFace::outer_x3;
}
if (ni.type == NeighborConnect::edge) {
if (ni.ox3 == 0)
eid = ((((ni.ox1 + 1) >> 1) | ((ni.ox2 + 1) & 2)));
else if (ni.ox2 == 0)
eid = (4 + (((ni.ox1 + 1) >> 1) | ((ni.ox3 + 1) & 2)));
else if (ni.ox1 == 0)
eid = (8 + (((ni.ox2 + 1) >> 1) | ((ni.ox3 + 1) & 2)));
}
return;
}
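The `eid` expressions pack the two in-plane offsets, each in {-1, +1}, into a two-bit index before a per-axis base of 0, 4, or 8 is added: `((ox + 1) >> 1)` maps -1 to 0 and +1 to 1 (the low bit), while `((oy + 1) & 2)` maps -1 to 0 and +1 to 2 (the high bit). A tiny sketch of just that bit packing:

```cpp
#include <cassert>

// Packs two offsets in {-1, +1} into 0..3, exactly as the eid
// expressions in SetNeighbor do before the 0/4/8 axis base is added.
int EdgeBits(int oxa, int oxb) {
  return ((oxa + 1) >> 1) | ((oxb + 1) & 2);
}
```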

NeighborConnect NCFromOffsets(const std::array<int, 3> offsets) {
int connect_indicator =
std::abs(offsets[0]) + std::abs(offsets[1]) + std::abs(offsets[2]);
NeighborConnect nc = NeighborConnect::none;
if (connect_indicator == 1) {
nc = NeighborConnect::face;
} else if (connect_indicator == 2) {
nc = NeighborConnect::edge;
} else if (connect_indicator == 3) {
nc = NeighborConnect::corner;
}
return nc;
}
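`NCFromOffsets` classifies a neighbor purely by how many offset components are nonzero: one means the blocks share a face, two an edge, three a corner. A standalone copy of that classification (local `NeighborConnect` stand-in, since the real enum lives in the header):

```cpp
#include <array>
#include <cassert>
#include <cstdlib>

// Local stand-in for the NeighborConnect enum in neighbor_block.hpp.
enum class NeighborConnect { none, face, edge, corner };

// Number of nonzero offset components determines the shared geometry:
// 1 -> face, 2 -> edge, 3 -> corner, 0 -> the block itself.
NeighborConnect Classify(const std::array<int, 3> &offsets) {
  const int n =
      std::abs(offsets[0]) + std::abs(offsets[1]) + std::abs(offsets[2]);
  if (n == 1) return NeighborConnect::face;
  if (n == 2) return NeighborConnect::edge;
  if (n == 3) return NeighborConnect::corner;
  return NeighborConnect::none;
}
```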

NeighborBlock::NeighborBlock(Mesh *mesh, LogicalLocation loc, int rank, int gid,
std::array<int, 3> offsets, int ibid, int itargetid, int fi1,
int fi2)
: NeighborBlock(mesh, loc, rank, gid, 0, offsets, NCFromOffsets(offsets), ibid,
itargetid, fi1, fi2) {}

NeighborBlock::NeighborBlock(Mesh *mesh, LogicalLocation loc, int rank, int gid, int lid,
std::array<int, 3> offsets, NeighborConnect type, int bid,
int target_id, int fi1, int fi2)
: snb{rank, loc.level(), lid, gid}, ni{offsets[0], offsets[1], offsets[2],
fi1, fi2, type},
bufid{bid}, eid{0}, targetid{target_id}, fid{BoundaryFace::undef}, loc{loc},
ownership(true), block_size(mesh->GetBlockSize(loc)) {
// TODO(LFR): Look and see if this stuff gets used anywhere
if (ni.type == NeighborConnect::face) {
if (ni.ox1 == -1)
fid = BoundaryFace::inner_x1;
else if (ni.ox1 == 1)
fid = BoundaryFace::outer_x1;
else if (ni.ox2 == -1)
fid = BoundaryFace::inner_x2;
else if (ni.ox2 == 1)
fid = BoundaryFace::outer_x2;
else if (ni.ox3 == -1)
fid = BoundaryFace::inner_x3;
else if (ni.ox3 == 1)
fid = BoundaryFace::outer_x3;
}
if (ni.type == NeighborConnect::edge) {
if (ni.ox3 == 0)
eid = ((((ni.ox1 + 1) >> 1) | ((ni.ox2 + 1) & 2)));
else if (ni.ox2 == 0)
eid = (4 + (((ni.ox1 + 1) >> 1) | ((ni.ox3 + 1) & 2)));
else if (ni.ox1 == 0)
eid = (8 + (((ni.ox2 + 1) >> 1) | ((ni.ox3 + 1) & 2)));
}
}

BufferID::BufferID(int dim, bool multilevel) {
std::vector<int> x1offsets = dim > 0 ? std::vector<int>{0, -1, 1} : std::vector<int>{0};
std::vector<int> x2offsets = dim > 1 ? std::vector<int>{0, -1, 1} : std::vector<int>{0};
std::vector<int> x3offsets = dim > 2 ? std::vector<int>{0, -1, 1} : std::vector<int>{0};
for (auto ox3 : x3offsets) {
for (auto ox2 : x2offsets) {
for (auto ox1 : x1offsets) {
const int type = std::abs(ox1) + std::abs(ox2) + std::abs(ox3);
if (type == 0) continue;
std::vector<int> f1s =
(dim - type) > 0 && multilevel ? std::vector<int>{0, 1} : std::vector<int>{0};
std::vector<int> f2s =
(dim - type) > 1 && multilevel ? std::vector<int>{0, 1} : std::vector<int>{0};
for (auto f1 : f1s) {
for (auto f2 : f2s) {
NeighborIndexes ni{ox1, ox2, ox3, f1, f2, NeighborConnect::face};
nis.push_back(ni);
}
}
}
}
}
}
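The nested loops in the `BufferID` constructor emit one `NeighborIndexes` entry per geometric neighbor, with extra fine-index multiplicity on faces and edges when the mesh is refined. Counting the entries reproduces the neighbor totals quoted in the `bvals.hpp` comments (2D: 8 unrefined / 12 refined; 3D: 26 unrefined / 56 refined). A minimal sketch that performs only the counting, mirroring the loop structure above:

```cpp
#include <cassert>
#include <cstdlib>
#include <vector>

// Counts the buffer slots the BufferID loops would enumerate for a given
// dimensionality, with and without mesh refinement (multilevel).
int CountBuffers(int dim, bool multilevel) {
  auto offsets = [&](int d) {
    return dim > d ? std::vector<int>{0, -1, 1} : std::vector<int>{0};
  };
  int count = 0;
  for (int ox3 : offsets(2)) {
    for (int ox2 : offsets(1)) {
      for (int ox1 : offsets(0)) {
        const int type = std::abs(ox1) + std::abs(ox2) + std::abs(ox3);
        if (type == 0) continue;  // skip the block itself
        // Refinement doubles the fine-index choices f1 (faces and edges)
        // and f2 (faces only), matching the f1s/f2s vectors above.
        const int nf1 = (dim - type) > 0 && multilevel ? 2 : 1;
        const int nf2 = (dim - type) > 1 && multilevel ? 2 : 1;
        count += nf1 * nf2;
      }
    }
  }
  return count;
}
```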

} // namespace parthenon