From 31d624567d3da476dd864e88f3a7f2d811aae277 Mon Sep 17 00:00:00 2001 From: "coderabbitai[bot]" <136622811+coderabbitai[bot]@users.noreply.github.com> Date: Tue, 26 May 2026 16:27:12 +0000 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20CodeRabbit=20Chat:=20Implement?= =?UTF-8?q?=20requested=20code=20changes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sampleworks/eval/generate_synthetic_sf.py | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/sampleworks/eval/generate_synthetic_sf.py b/src/sampleworks/eval/generate_synthetic_sf.py index 06a4c79..040c81e 100644 --- a/src/sampleworks/eval/generate_synthetic_sf.py +++ b/src/sampleworks/eval/generate_synthetic_sf.py @@ -420,7 +420,9 @@ def process_batch( seed Optional seed for reproducible R-free flag assignment device - PyTorch device for computation + PyTorch device for computation. NOTE: all parallel workers currently + receive the same device, so with n_jobs > 1 all jobs will run on a + single GPU. See TODO below. n_jobs Number of parallel jobs. -1 means use all available CPUs. strip_hydrogens @@ -434,10 +436,27 @@ def process_batch( save_structure If True, save each processed structure as mmCIF to output_dir. """ + # TODO(engineers): GPU device distribution in parallel batch mode + # + # Currently all joblib worker processes receive the same `device` argument + # (e.g. cuda:0), so when n_jobs > 1 every parallel job competes for the + # same GPU. On a multi-GPU machine this wastes available GPU memory and + # compute. The fix requires distributing workers across GPUs, for example + # by: + # - Accepting a list of GPU IDs (e.g. --gpus 0,1,2,3) and assigning + # worker i to cuda:(gpu_ids[i % len(gpu_ids)]). + # - Or detecting torch.cuda.device_count() and cycling workers across + # cuda:0 … cuda:(N-1). + # - Or letting callers control placement via CUDA_VISIBLE_DEVICES per + # subprocess (requires spawning processes manually rather than using + # joblib with the loky backend). + # + # Until this is resolved, prefer running with n_jobs=1 (or a small count + # that fits on one GPU) when using CUDA, or use CPU-only mode. from joblib import delayed, Parallel rows = load_batch_csv(csv_path) - logger.info(f"Processing {len(rows)} structures from {csv_path} using {n_jobs} jobs") + Parallel(n_jobs=n_jobs, backend="loky")( delayed(_process_single_row)(