diff --git a/model/atmosphere/diffusion/tests/diffusion_tests/test_diffusion.py b/model/atmosphere/diffusion/tests/diffusion_tests/test_diffusion.py
index 44774ac60..8be96302f 100644
--- a/model/atmosphere/diffusion/tests/diffusion_tests/test_diffusion.py
+++ b/model/atmosphere/diffusion/tests/diffusion_tests/test_diffusion.py
@@ -5,6 +5,8 @@
 #
 # Please, refer to the LICENSE file in the root directory.
 # SPDX-License-Identifier: BSD-3-Clause
+import functools
+
 import pytest
 
 import icon4py.model.common.dimension as dims
@@ -428,9 +430,15 @@ def test_run_diffusion_single_step(
     ndyn_substeps,
     backend,
     orchestration,
+    benchmark,
 ):
     if orchestration and not helpers.is_dace(backend):
         pytest.skip("Orchestration test requires a dace backend.")
+
+    if experiment == dt_utils.REGIONAL_EXPERIMENT:
+        # Skip benchmarks for this experiment
+        benchmark = None
+
     grid = get_grid_for_experiment(experiment, backend)
     cell_geometry = get_cell_geometry_for_experiment(experiment, backend)
     edge_geometry = get_edge_geometry_for_experiment(experiment, backend)
@@ -504,14 +512,23 @@ def test_run_diffusion_single_step(
     verify_diffusion_fields(config, diagnostic_state, prognostic_state, savepoint_diffusion_init)
     assert savepoint_diffusion_init.fac_bdydiff_v() == diffusion_granule.fac_bdydiff_v
-    diffusion_granule.run(
-        diagnostic_state=diagnostic_state,
-        prognostic_state=prognostic_state,
-        dtime=dtime,
+    helpers.run_verify_and_benchmark(
+        functools.partial(
+            diffusion_granule.run,
+            diagnostic_state=diagnostic_state,
+            prognostic_state=prognostic_state,
+            dtime=dtime,
+        ),
+        functools.partial(
+            verify_diffusion_fields,
+            config=config,
+            diagnostic_state=diagnostic_state,
+            prognostic_state=prognostic_state,
+            diffusion_savepoint=savepoint_diffusion_exit,
+        ),
+        benchmark,
     )
-    verify_diffusion_fields(config, diagnostic_state, prognostic_state, savepoint_diffusion_exit)
-
 
 @pytest.mark.datatest
 @pytest.mark.parametrize(
diff --git a/model/atmosphere/dycore/src/icon4py/model/atmosphere/dycore/solve_nonhydro.py b/model/atmosphere/dycore/src/icon4py/model/atmosphere/dycore/solve_nonhydro.py
index 06b62413e..a858fb2cf 100644
--- a/model/atmosphere/dycore/src/icon4py/model/atmosphere/dycore/solve_nonhydro.py
+++ b/model/atmosphere/dycore/src/icon4py/model/atmosphere/dycore/solve_nonhydro.py
@@ -404,7 +404,10 @@ def _validate(self):
             raise NotImplementedError("divdamp_order can only be 24")
 
         if self.divdamp_type == DivergenceDampingType.TWO_DIMENSIONAL:
-            raise NotImplementedError("`DivergenceDampingType.TWO_DIMENSIONAL` (2) is not yet implemented")
+            raise NotImplementedError(
+                "`DivergenceDampingType.TWO_DIMENSIONAL` (2) is not yet implemented"
+            )
+
 
 class NonHydrostaticParams:
     """Calculates derived quantities depending on the NonHydrostaticConfig."""
diff --git a/model/atmosphere/dycore/tests/dycore_stencil_tests/test_compute_advection_in_horizontal_momentum_equation.py b/model/atmosphere/dycore/tests/dycore_stencil_tests/test_compute_advection_in_horizontal_momentum_equation.py
index 6bc5d3482..f9ecd14b5 100644
--- a/model/atmosphere/dycore/tests/dycore_stencil_tests/test_compute_advection_in_horizontal_momentum_equation.py
+++ b/model/atmosphere/dycore/tests/dycore_stencil_tests/test_compute_advection_in_horizontal_momentum_equation.py
@@ -64,7 +64,9 @@ def reference(
     normal_wind_advective_tendency_cp = normal_wind_advective_tendency.copy()
    k = np.arange(nlev)
 
-    upward_vorticity_at_vertices = mo_math_divrot_rot_vertex_ri_dsl_numpy(connectivities, vn, geofac_rot)
+    upward_vorticity_at_vertices = mo_math_divrot_rot_vertex_ri_dsl_numpy(
+        connectivities, vn, geofac_rot
+    )
 
     normal_wind_advective_tendency = compute_advective_normal_wind_tendency_numpy(
         connectivities,
@@ -79,7 +81,7 @@ def reference(
         vn_on_half_levels,
         ddqz_z_full_e,
     )
-    
+
     condition = (np.maximum(3, nrdmax - 2) - 1 <= k) & (k < nlev - 4)
     normal_wind_advective_tendency_extra_diffu = (
         add_extra_diffusion_for_normal_wind_tendency_approaching_cfl_numpy(
diff --git a/model/testing/src/icon4py/model/testing/helpers.py b/model/testing/src/icon4py/model/testing/helpers.py
index 206ded6e0..80ad63d5e 100644
--- a/model/testing/src/icon4py/model/testing/helpers.py
+++ b/model/testing/src/icon4py/model/testing/helpers.py
@@ -6,10 +6,11 @@
 # Please, refer to the LICENSE file in the root directory.
 # SPDX-License-Identifier: BSD-3-Clause
 
+import functools
 import hashlib
 import typing
 from dataclasses import dataclass, field
-from typing import ClassVar
+from typing import Callable, ClassVar, Optional
 
 import gt4py.next as gtx
 import numpy as np
@@ -23,12 +24,6 @@
 from icon4py.model.common.utils import data_allocation as data_alloc
 
 
-try:
-    import pytest_benchmark
-except ModuleNotFoundError:
-    pytest_benchmark = None
-
-
 @pytest.fixture(scope="session")
 def connectivities_as_numpy(grid, backend) -> dict[gtx.Dimension, np.ndarray]:
     return {dim: data_alloc.as_numpy(table) for dim, table in grid.connectivities.items()}
@@ -110,28 +105,34 @@ class Output:
     gtslice: tuple[slice, ...] = field(default_factory=lambda: (slice(None),))
 
 
-def _test_validation(
-    self,
-    grid: base.BaseGrid,
-    backend: gtx_backend.Backend,
-    connectivities_as_numpy: dict,
-    input_data: dict,
-):
-    if self.MARKERS is not None:
-        apply_markers(self.MARKERS, grid, backend)
+def run_verify_and_benchmark(
+    test_func: Callable[[], None],
+    verification_func: Callable[[], None],
+    benchmark_fixture: Optional[pytest.FixtureRequest],
+) -> None:
+    """
+    Run test_func, verify its results with verification_func, and benchmark it if requested.
 
-    connectivities = connectivities_as_numpy
-    reference_outputs = self.reference(
-        connectivities,
-        **{k: v.asnumpy() if isinstance(v, gtx.Field) else v for k, v in input_data.items()},
-    )
+    Args:
+        test_func: Function to be run, verified, and benchmarked.
+        verification_func: Function used to verify the results of test_func.
+        benchmark_fixture: pytest-benchmark fixture; pass None to skip benchmarking.
 
-    input_data = allocate_data(backend, input_data)
+    Note:
+        - test_func and verification_func should be provided with bound arguments, e.g. via functools.partial.
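+
+    Example:
+        A minimal usage sketch (``my_run`` and ``my_verify`` are hypothetical callables)::
+
+            run_verify_and_benchmark(
+                functools.partial(my_run, arg=value),
+                functools.partial(my_verify, arg=value),
+                benchmark,  # or None to skip benchmarking
+            )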
+    """
+    test_func()
+    verification_func()
 
-    self.PROGRAM.with_backend(backend)(
-        **input_data,
-        offset_provider=grid.offset_providers,
-    )
+    if benchmark_fixture is not None and benchmark_fixture.enabled:
+        benchmark_fixture(test_func)
+
+
+def _verify_stencil_test(
+    self,
+    input_data: dict[str, gtx.Field],
+    reference_outputs: dict[str, np.ndarray],
+) -> None:
     for out in self.OUTPUTS:
         name, refslice, gtslice = (
             (out.name, out.refslice, out.gtslice)
@@ -143,32 +144,43 @@ def _test_validation(
             input_data[name].asnumpy()[gtslice],
             reference_outputs[name][refslice],
             equal_nan=True,
-            err_msg=f"Validation failed for '{name}'",
+            err_msg=f"Verification failed for '{name}'",
         )
 
 
-if pytest_benchmark:
-
-    def _test_execution_benchmark(self, pytestconfig, grid, backend, input_data, benchmark):
-        if self.MARKERS is not None:
-            apply_markers(self.MARKERS, grid, backend)
+def _test_and_benchmark(
+    self,
+    grid: base.BaseGrid,
+    backend: gtx_backend.Backend,
+    connectivities_as_numpy: dict[gtx.Dimension, np.ndarray],
+    input_data: dict[str, gtx.Field],
+    benchmark: pytest.FixtureRequest,
+):
+    if self.MARKERS is not None:
+        apply_markers(self.MARKERS, grid, backend)
 
-        if pytestconfig.getoption(
-            "--benchmark-disable"
-        ):  # skipping as otherwise program calls are duplicated in tests.
-            pytest.skip("Test skipped due to 'benchmark-disable' option.")
-        else:
-            input_data = allocate_data(backend, input_data)
-            benchmark(
-                self.PROGRAM.with_backend(backend),
-                **input_data,
-                offset_provider=grid.offset_providers,
-            )
+    connectivities = connectivities_as_numpy
+    reference_outputs = self.reference(
+        connectivities,
+        **{k: v.asnumpy() if isinstance(v, gtx.Field) else v for k, v in input_data.items()},
+    )
 
-else:
+    input_data = allocate_data(backend, input_data)
 
-    def _test_execution_benchmark(self, pytestconfig):
-        pytest.skip("Test skipped as `pytest-benchmark` is not installed.")
+    run_verify_and_benchmark(
+        functools.partial(
+            self.PROGRAM.with_backend(backend),
+            **input_data,
+            offset_provider=grid.offset_providers,
+        ),
+        functools.partial(
+            _verify_stencil_test,
+            self=self,
+            input_data=input_data,
+            reference_outputs=reference_outputs,
+        ),
+        benchmark,
+    )
 
 
 class StencilTest:
@@ -199,8 +211,7 @@ def __init_subclass__(cls, **kwargs):
         # reflect the name of the test we do this dynamically here instead of using regular
         # inheritance.
         super().__init_subclass__(**kwargs)
-        setattr(cls, f"test_{cls.__name__}", _test_validation)
-        setattr(cls, f"test_{cls.__name__}_benchmark", _test_execution_benchmark)
+        setattr(cls, f"test_{cls.__name__}", _test_and_benchmark)
 
 
 def reshape(arr: np.ndarray, shape: tuple[int, ...]):
diff --git a/model/testing/src/icon4py/model/testing/pytest_config.py b/model/testing/src/icon4py/model/testing/pytest_config.py
index fe030fcc5..0b7f6c3d6 100644
--- a/model/testing/src/icon4py/model/testing/pytest_config.py
+++ b/model/testing/src/icon4py/model/testing/pytest_config.py
@@ -171,4 +171,4 @@ def pytest_benchmark_update_json(output_json):
     "Replace 'fullname' of pytest benchmarks with a shorter name for better readability in bencher."
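+    # e.g. a benchmark collected as "test_my_stencil" is reported with the
+    # fullname "my_stencil" ("my_stencil" is an illustrative name).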
     for bench in output_json["benchmarks"]:
         # Replace fullname with name and filter unnecessary prefix and suffix
-        bench["fullname"] = bench["name"].replace("test_", "").replace("_benchmark", "")
+        bench["fullname"] = bench["name"].replace("test_", "")
diff --git a/model/testing/tests/conftest.py b/model/testing/tests/conftest.py
new file mode 100644
index 000000000..fd53adee9
--- /dev/null
+++ b/model/testing/tests/conftest.py
@@ -0,0 +1,16 @@
+# ICON4Py - ICON inspired code in Python and GT4Py
+#
+# Copyright (c) 2022-2024, ETH Zurich and MeteoSwiss
+# All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+
+# ruff: noqa: F405
+# Make sure custom icon4py pytest hooks are loaded
+try:
+    import sys
+
+    _ = sys.modules["icon4py.model.testing.pytest_config"]
+except KeyError:
+    from icon4py.model.testing.pytest_config import *  # noqa: F403 [undefined-local-with-import-star]
diff --git a/model/testing/tests/test_verification_benchmarking.py b/model/testing/tests/test_verification_benchmarking.py
new file mode 100644
index 000000000..ecb459a1c
--- /dev/null
+++ b/model/testing/tests/test_verification_benchmarking.py
@@ -0,0 +1,52 @@
+# ICON4Py - ICON inspired code in Python and GT4Py
+#
+# Copyright (c) 2022-2024, ETH Zurich and MeteoSwiss
+# All rights reserved.
+#
+# Please, refer to the LICENSE file in the root directory.
+# SPDX-License-Identifier: BSD-3-Clause
+import functools
+
+import numpy as np
+from numpy.typing import NDArray
+
+from icon4py.model.testing import helpers
+
+
+BASE_DTYPE = np.int64
+
+
+def incr_func(
+    field: NDArray[BASE_DTYPE],
+    increment: int,
+):
+    field += increment
+
+
+def verify_field(
+    field: NDArray[BASE_DTYPE],
+    increment: int,
+    base_value: int,
+):
+    np.testing.assert_allclose(field, base_value + increment)
+
+
+def test_verification_benchmarking_infrastructure(benchmark):
+    base_value = 1
+    field = base_value * np.ones((), dtype=BASE_DTYPE)
+
+    increment = 6
+
+    helpers.run_verify_and_benchmark(
+        functools.partial(incr_func, field=field, increment=increment),
+        functools.partial(verify_field, field=field, increment=increment, base_value=base_value),
+        None,  # no need to benchmark this test
+    )
+
+    current_base_value = field[()]
+    assert (
+        current_base_value != base_value
+    ), "Field was not incremented: run_verify_and_benchmark did not execute incr_func/verify_field."
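+
+    # Final sanity check: call incr_func and verify_field directly, outside
+    # run_verify_and_benchmark, to confirm they behave the same on their own.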
+    incr_func(field, increment)
+    verify_field(field, increment, current_base_value)
diff --git a/noxfile.py b/noxfile.py
index 953a3232e..d91c3ab34 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -9,8 +9,6 @@
 from __future__ import annotations
 
 import os
-import json
-import glob
 import re
 from collections.abc import Sequence
 from typing import Final, Literal, TypeAlias
@@ -58,6 +56,7 @@ def benchmark_model(session: nox.Session) -> None:
         *f"pytest \
             -v \
             --benchmark-only \
+            --datatest \
             --benchmark-warmup=on \
             --benchmark-warmup-iterations=30 \
             --benchmark-json=pytest_benchmark_results_{session.python}.json \
@@ -139,7 +138,7 @@ def test_model(session: nox.Session, selection: ModelTestsSubset, subpackage: Mo
     pytest_args = _selection_to_pytest_args(selection)
     with session.chdir(f"model/{subpackage}"):
         session.run(
-            *f"pytest -sv --benchmark-skip -n {os.environ.get('NUM_PROCESSES', 'auto')}".split(),
+            *f"pytest -sv --benchmark-disable -n {os.environ.get('NUM_PROCESSES', 'auto')}".split(),
            *pytest_args,
             *session.posargs,
             success_codes=[0, NO_TESTS_COLLECTED_EXIT_CODE],
@@ -166,7 +165,7 @@ def test_tools(session: nox.Session, datatest: bool) -> None:
 
     with session.chdir("tools"):
         session.run(
-            *f"pytest -sv --benchmark-skip -n {os.environ.get('NUM_PROCESSES', 'auto')} {'--datatest' if datatest else ''}".split(),
+            *f"pytest -sv --benchmark-disable -n {os.environ.get('NUM_PROCESSES', 'auto')} {'--datatest' if datatest else ''}".split(),
             *session.posargs
         )