From 31d624567d3da476dd864e88f3a7f2d811aae277 Mon Sep 17 00:00:00 2001
From: "coderabbitai[bot]"
 <136622811+coderabbitai[bot]@users.noreply.github.com>
Date: Tue, 26 May 2026 16:27:12 +0000
Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20CodeRabbit=20Chat:=20Implement?=
 =?UTF-8?q?=20requested=20code=20changes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/sampleworks/eval/generate_synthetic_sf.py | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/sampleworks/eval/generate_synthetic_sf.py b/src/sampleworks/eval/generate_synthetic_sf.py
index 06a4c79..040c81e 100644
--- a/src/sampleworks/eval/generate_synthetic_sf.py
+++ b/src/sampleworks/eval/generate_synthetic_sf.py
@@ -420,7 +420,9 @@ def process_batch(
     seed
         Optional seed for reproducible R-free flag assignment
     device
-        PyTorch device for computation
+        PyTorch device for computation. NOTE: every parallel worker receives
+        this same device, so with a CUDA device and n_jobs != 1 (including
+        -1) all jobs share a single GPU. See the TODO in the function body.
     n_jobs
         Number of parallel jobs. -1 means use all available CPUs.
     strip_hydrogens
@@ -434,10 +436,27 @@ def process_batch(
     save_structure
         If True, save each processed structure as mmCIF to output_dir.
     """
+    # TODO(engineers): GPU device distribution in parallel batch mode
+    #
+    # Currently all joblib worker processes receive the same `device` argument
+    # (e.g. cuda:0), so whenever n_jobs != 1 (including -1) every parallel job
+    # competes for the same GPU. On a multi-GPU machine this leaves the other
+    # GPUs' memory and compute unused. The fix requires distributing workers
+    # across GPUs, for example by:
+    #   - Accepting a list of GPU IDs (e.g. --gpus 0,1,2,3) and assigning
+    #     worker i to cuda:(gpu_ids[i % len(gpu_ids)]).
+    #   - Or detecting torch.cuda.device_count() and cycling workers across
+    #     cuda:0 … cuda:(N-1).
+    #   - Or letting callers control placement via CUDA_VISIBLE_DEVICES per
+    #     subprocess (requires spawning processes manually rather than using
+    #     joblib with the loky backend).
+    #
+    # Until this is resolved, prefer running with n_jobs=1 (or a small count
+    # that fits on one GPU) when using CUDA, or use CPU-only mode.
     from joblib import delayed, Parallel
 
     rows = load_batch_csv(csv_path)
-    logger.info(f"Processing {len(rows)} structures from {csv_path} using {n_jobs} jobs")
+
 
     Parallel(n_jobs=n_jobs, backend="loky")(
         delayed(_process_single_row)(