Commit 2c51a60

Merge branch 'tickets/DM-51326'
2 parents: f528f30 + a3df4cd

2 files changed: +14 -3 lines

python/activator/caching.py (5 additions, 0 deletions)
@@ -145,6 +145,11 @@ def __len__(self):
             total += len(cache)
         return total
 
+    def __str__(self):
+        summary = ", ".join(f"{type_name}: {len(cache)}/{cache.max_size}"
+                            for type_name, cache in sorted(self._caches.items()))
+        return "{%s}" % summary
+
     def _merge_into_cache(self, inputs: Mapping[str, Set[daf_butler.DatasetRef]]) \
             -> [Set[daf_butler.DatasetRef], Set[daf_butler.DatasetRef], Mapping[str, EvictingSet]]:
         """Compute a bulk update of caches for multiple dataset types.

python/activator/middleware_interface.py (9 additions, 3 deletions)
@@ -1767,16 +1767,19 @@ def clean_local_repo(self, exposure_ids: set[int]) -> None:
             instrument=self.visit.instrument,
             detector=self.visit.detector,
         )
+        _log_trace.debug("Removing %d raws for exposures %s.", len(raws), exposure_ids)
         self.butler.pruneDatasets(raws, disassociate=True, unstore=True, purge=True)
         # Outputs are all in their own runs, so just drop them.
         preload_run = runs.get_preload_run(self.instrument, self._deployment, self._day_obs)
         _remove_run_completely(self.butler, preload_run)
         for pipeline_file in self._get_combined_pipeline_files():
             output_run = runs.get_output_run(self.instrument, self._deployment, pipeline_file,
                                              self._day_obs)
+            _log_trace.debug("Removing run %s.", output_run)
             _remove_run_completely(self.butler, output_run)
 
         # Clean out calibs, templates, and other preloaded datasets
+        _log_trace.debug("Cache contents: %s", self.cache)
         excess_datasets = set()
         for dataset_type in self.butler.registry.queryDatasetTypes(...):
             excess_datasets |= set(self.butler.query_datasets(
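The new messages go through the module's trace logger, so they stay silent unless that logger is enabled at DEBUG level. A minimal sketch of turning it on with the stdlib, assuming `_log_trace` wraps an ordinary `logging` logger (the logger name below is illustrative, not confirmed by this diff):

```python
import logging

# Illustrative only: the real logger name is whatever _log_trace was
# created with in middleware_interface.py.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger("activator.middleware_interface").setLevel(logging.DEBUG)
```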
@@ -1834,15 +1837,15 @@ def _filter_datasets(src_repo: Butler,
     -------
     datasets : iterable [`lsst.daf.butler.DatasetRef`]
         The datasets that exist in ``src_repo`` but not ``dest_repo``.
-        datasetRefs are guaranteed to be fully expanded if any only if
+        datasetRefs are guaranteed to be fully expanded if and only if
         ``query`` guarantees it.
 
     Raises
     ------
     _MissingDatasetError
         Raised if the query on ``src_repo`` failed to find any datasets.
     """
-    known_datasets = query(dest_repo, "known datasets")
+    known_datasets = set(query(dest_repo, "known datasets"))
 
     # Let exceptions from src_repo query raise: if it fails, that invalidates
     # this operation.
@@ -1851,7 +1854,10 @@ def _filter_datasets(src_repo: Butler,
         raise _MissingDatasetError("Source repo query found no matches.")
     if all_callback:
         all_callback(src_datasets)
-    return itertools.filterfalse(lambda ref: ref in known_datasets, src_datasets)
+    missing = src_datasets - known_datasets
+    _log_trace.debug("Found %d matching datasets. %d present locally, %d to download.",
+                     len(src_datasets), len(src_datasets & known_datasets), len(missing))
+    return missing
 
 
 def _generic_query(dataset_types: collections.abc.Iterable[str | lsst.daf.butler.DatasetType],
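Besides adding counts to the trace log, this hunk changes the return value from a lazy `itertools.filterfalse` iterator to an eager set difference: a lazy iterator cannot be counted without consuming it, while sets support `len()`, `&`, and `-` cheaply (which is also why `known_datasets` is now wrapped in `set()` above). A self-contained sketch of the before/after behavior, with strings standing in for `DatasetRef` objects:

```python
import itertools

src_datasets = {"ref1", "ref2", "ref3"}  # stand-ins for DatasetRef objects
known_datasets = {"ref2"}

# Before: lazy; iterating to count it would exhaust it for the caller.
lazy = itertools.filterfalse(lambda ref: ref in known_datasets, src_datasets)

# After: eager; counting and intersecting are cheap and non-destructive.
missing = src_datasets - known_datasets
print(len(src_datasets & known_datasets), len(missing))  # prints: 1 2
```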
