Commit 39d35b5

Move zarr-specific tests to test_zarr.py (#715)
1 parent 7611fe1 commit 39d35b5

File tree

3 files changed: +74 −38
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+"""zarr
+
+Revision ID: 2381a77e8487
+Revises: d58983739401
+Create Date: 2023-03-13 14:57:02.474967
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '2381a77e8487'
+down_revision = 'd58983739401'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.execute(
+        """
+        update test_run
+        set path = 'benchmarks/test_zarr.py'
+        where path = 'benchmarks/test_array.py'
+        and originalname in (
+            'test_filter_then_average',
+            'test_access_slices',
+            'test_sum_residuals'
+        )
+        """
+    )
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
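
This migration rewrites historical rows in the test_run results table so stored paths follow the three tests to their new file; it applies with the usual alembic upgrade head. The commit leaves downgrade() as an auto-generated no-op. If a reversible migration were wanted, a minimal sketch (not part of this commit) would simply invert the UPDATE:

def downgrade() -> None:
    # Hypothetical inverse of upgrade(), not part of this commit:
    # move the three renamed test paths back to benchmarks/test_array.py.
    op.execute(
        """
        update test_run
        set path = 'benchmarks/test_array.py'
        where path = 'benchmarks/test_zarr.py'
        and originalname in (
            'test_filter_then_average',
            'test_access_slices',
            'test_sum_residuals'
        )
        """
    )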

tests/benchmarks/test_array.py

Lines changed: 0 additions & 35 deletions
@@ -21,15 +21,6 @@
 )


-@pytest.fixture(scope="module")
-def zarr_dataset():
-    s3_uri = (
-        "s3://coiled-runtime-ci/synthetic-zarr/"
-        "synth_random_int_array_2000_cubed.zarr"
-    )
-    return da.from_zarr(s3_uri)
-
-
 def test_anom_mean(small_client):
     # From https://github.com/dask/distributed/issues/2602#issuecomment-498718651

@@ -244,32 +235,6 @@ def test_map_overlap_sample(small_client):
244235
y[5000:5010, 5000:5010].compute()
245236

246237

247-
@run_up_to_nthreads("small_cluster", 100, reason="fixed dataset")
248-
@pytest.mark.parametrize("threshold", [50, 100, 200, 255])
249-
def test_filter_then_average(threshold, zarr_dataset, small_client):
250-
"""
251-
Compute the mean for increasingly sparse boolean filters of an array
252-
"""
253-
zarr_dataset[zarr_dataset > threshold].mean().compute()
254-
255-
256-
@run_up_to_nthreads("small_cluster", 50, reason="fixed dataset")
257-
@pytest.mark.parametrize("N", [700, 75, 1])
258-
def test_access_slices(N, zarr_dataset, small_client):
259-
"""
260-
Accessing just a few chunks of a zarr array should be quick
261-
"""
262-
distributed.wait(zarr_dataset[:N, :N, :N].persist())
263-
264-
265-
@run_up_to_nthreads("small_cluster", 50, reason="fixed dataset")
266-
def test_sum_residuals(zarr_dataset, small_client):
267-
"""
268-
Simnple test to that computes as reduction, the array op, the reduction again
269-
"""
270-
(zarr_dataset - zarr_dataset.mean(axis=0)).sum().compute()
271-
272-
273238
@run_up_to_nthreads("small_cluster", 50, reason="fixed dataset")
274239
def test_rechunk_in_memory(small_client, configure_rechunking):
275240
x = da.random.random((50000, 50000))
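
For context, the removed tests drove work with bare compute() and persist() calls; their replacements in test_zarr.py below go through a wait helper with a timeout instead. A minimal standalone sketch of the old distributed.wait-on-persist pattern, using a small local cluster and a random array as stand-ins for the 2000-cubed S3 zarr store:

import dask.array as da
import distributed

# Stand-ins for illustration; the real test sliced a zarr-backed array
# of shape (2000, 2000, 2000) stored in chunks of (200, 200, 200).
client = distributed.Client(processes=False)
x = da.random.randint(0, 255, size=(400, 400, 400), chunks=(100, 100, 100))
subset = x[:100, :100, :100].persist()  # submit the slice to the cluster
distributed.wait(subset)  # block until the persisted chunks are in memory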

tests/benchmarks/test_zarr.py

Lines changed: 37 additions & 3 deletions
@@ -1,16 +1,50 @@
 from __future__ import annotations

+import dask.array as da
 import pytest
 import xarray

-from ..utils_test import run_up_to_nthreads
+from ..utils_test import run_up_to_nthreads, wait
+
+
+@pytest.fixture(scope="module")
+def zarr_dataset():
+    # shape = (2000, 2000, 2000)
+    # chunks = (200, 200, 200)
+    # Compresses to ~42% of its original size (tested on lz4 4.0)
+    store = (
+        "s3://coiled-runtime-ci/synthetic-zarr/synth_random_int_array_2000_cubed.zarr"
+    )
+    return da.from_zarr(store)


 @pytest.fixture(scope="module")
 def cmip6():
     store = "s3://coiled-runtime-ci/CMIP6/CMIP/AS-RCEC/TaiESM1/1pctCO2/r1i1p1f1/Amon/zg/gn/v20200225/"
-    ds = xarray.open_dataset(store, engine="zarr", chunks={})
-    yield ds
+    return xarray.open_dataset(store, engine="zarr", chunks={})
+
+
+@run_up_to_nthreads("small_cluster", 100, reason="fixed dataset")
+@pytest.mark.parametrize("threshold", [50, 100, 200, 255])
+def test_filter_then_average(small_client, zarr_dataset, threshold):
+    """Compute the mean for increasingly sparse boolean filters of an array"""
+    a = zarr_dataset[zarr_dataset > threshold].mean()
+    wait(a, small_client, 300)
+
+
+@run_up_to_nthreads("small_cluster", 50, reason="fixed dataset")
+@pytest.mark.parametrize("N", [700, 75, 1])
+def test_access_slices(small_client, zarr_dataset, N):
+    """Accessing just a few chunks of a zarr array should be quick"""
+    a = zarr_dataset[:N, :N, :N]
+    wait(a, small_client, 300)
+
+
+@run_up_to_nthreads("small_cluster", 50, reason="fixed dataset")
+def test_sum_residuals(small_client, zarr_dataset):
+    """Compute reduce, then map, then reduce again"""
+    a = (zarr_dataset - zarr_dataset.mean(axis=0)).sum()
+    wait(a, small_client, 300)


 @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset")
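
Each new test builds a lazy result and hands it to wait(a, small_client, 300), imported from ..utils_test. That helper's implementation is not part of this diff; a plausible minimal sketch, assuming it persists the collection on the given client and blocks with a timeout in seconds:

import distributed

def wait(collection, client, timeout):
    # Hypothetical sketch of the utils_test helper (not shown in this
    # commit): persist the lazy collection on the cluster and block
    # until it completes, raising if `timeout` seconds elapse first.
    p = client.persist(collection)
    distributed.wait(p, timeout=timeout)

With the tests moved and the migration above applied, historical results stay attached to the same benchmarks, and the zarr suite can be run on its own with pytest tests/benchmarks/test_zarr.py.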
