diff --git a/README.md b/README.md index b818c42a..c7604afb 100644 --- a/README.md +++ b/README.md @@ -88,8 +88,8 @@ Output files appear in `output/boltz2_pure_guidance/`: `refined.cif` (final ense ```bash pixi run -e boltz python run_grid_search.py \ --proteins proteins.csv \ - --models boltz2 \ # options: boltz1, boltz2, protenix, rf3 (make sure env aligns!) - --methods "X-RAY DIFFRACTION" \ # only useful for Boltz-2, ignored otherwise + --model boltz2 \ # options: boltz1, boltz2, protenix, rf3 (make sure env aligns!) + --method "X-RAY DIFFRACTION" \ # only useful for Boltz-2, ignored otherwise --scalers pure_guidance \ # options: pure_guidance, fk_steering, or both as space-separated list --ensemble-sizes "1 4" \ --gradient-weights "0.1 0.2" \ @@ -113,11 +113,11 @@ name,structure,density,resolution | Argument | Description | Default | |---|---|---| | `--proteins` | CSV with structure/density/resolution columns | required | -| `--models` | Model to run. One of `boltz1`, `boltz2`, `protenix`, `rf3` | required | +| `--model` | Model to run. One of `boltz1`, `boltz2`, `protenix`, `rf3` | `boltz2` | | `--scalers` | Guidance method(s) to sweep | `pure_guidance fk_steering` | | `--ensemble-sizes` | Space-separated values, e.g. `"1 4"` | `"1 2 4 8"` | | `--gradient-weights` | Space-separated values, e.g. `"0.1 0.2"` | `"0.01 0.1 0.2"` | -| `--methods` | Boltz-2 sampling method (required for boltz2) | `X-RAY DIFFRACTION` | +| `--method` | Boltz-2 sampling method | `X-RAY DIFFRACTION` | | `--max-parallel` | Parallel workers (default: number of GPUs) | `auto` | | `--dry-run` | Print jobs without running them | off | | `--force-all` | Re-run including already-successful jobs | off | @@ -126,7 +126,7 @@ name,structure,density,resolution Output layout: `grid_search_results//[_]//ens_gw/` -> **Note**: Jobs are skipped if a `refined.cif` file already exists in the output directory. Some flags (e.g., `--use-tweedie`, `--gradient-normalization`) are not reflected in the directory structure, so changing them alone won't trigger a re-run. Use `--force-all` to re-run all jobs regardless. This is under active development and will likely change soon. +> **Note**: Jobs are skipped if a `refined.cif` file already exists in the output directory. Some flags (e.g., `--step-scaler-type`, `--gradient-normalization`) are not reflected in the directory structure, so changing them alone won't trigger a re-run. Use `--force-all` to re-run all jobs regardless. This is under active development and will likely change soon. Instructions for running evaluation and metrics scripts are coming soon. @@ -170,4 +170,4 @@ To develop on OS X, ensure you have [homebrew](https://brew.sh/) installed and r There are different (and as yet untested) environments for `boltz`. `protenix` won't currently work on a Mac due to the strict requirement of `triton` which requires an NVIDIA GPU. You may find similar issues with other environments. -Debug as needed. +Debug as needed. \ No newline at end of file diff --git a/run_all_models.sh b/run_all_models.sh old mode 100755 new mode 100644 index c958a7cf..58c96ddb --- a/run_all_models.sh +++ b/run_all_models.sh @@ -1,156 +1,119 @@ #!/bin/bash -# Run all 4 model grid searches in parallel, 2 GPUs each -# Total: 8 GPUs used (4 jobs x 2 GPUs each) +# Run all model grid searches in parallel: Boltz1, Boltz2, Protenix, and RF3 +# Total: 16 GPUs used (4 jobs x 4 GPUs each) # -# Models: -# - Boltz2 X-ray diffraction (GPUs 0,1) -# - Boltz2 MD (GPUs 2,3) -# - RosettaFold3 (GPUs 4,5) -# - Protenix (GPUs 6,7) -# -# Checkpoints are BAKED INTO the Docker image at /checkpoints/. -# If missing, the code auto-falls back to mounted paths. +# Checkpoints are BAKED INTO the Docker image - no need to mount them! # # Usage: # ./run_all_models.sh set -e -# Configuration -DATA_DIR="/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps" -RESULTS_DIR="${RESULTS_DIR:-/data/sampleworks-exp/occ_sweep/grid_search_results}" -MSA_CACHE_DIR="${MSA_CACHE_DIR:-/data/sampleworks-exp/msa_cache}" +# Configuration - uses absolute path to data +DATA_DIR="/mnt/diffuse-private/raw/sampleworks/initial_dataset_40" +RESULTS_DIR="${RESULTS_DIR:-$HOME/sampleworks-exp/grid_search_results}" +# Docker image to use (override with IMAGE env var) +IMAGE="${IMAGE:-diffuseproject/sampleworks:latest}" -# Create directories +# Create output directory mkdir -p "$RESULTS_DIR" -mkdir -p "$MSA_CACHE_DIR" - -# Pull latest image (no-op if already up to date) -echo "Pulling latest Docker image..." -docker pull diffuseproject/sampleworks:latest # Common docker options DOCKER_OPTS="--rm --shm-size=16g" echo "==========================================" -echo "Starting all model grid searches (4 jobs x 2 GPUs)" +echo "Starting all model grid searches" +echo "Models: boltz1, boltz2, protenix, rf3" echo "Data: $DATA_DIR" echo "Results: $RESULTS_DIR" -echo "MSA Cache: $MSA_CACHE_DIR" -echo "Checkpoints: BAKED INTO IMAGE (with mount fallback)" -echo "" -echo "Models:" -echo " - Boltz2 X-ray (GPUs 0,1)" -echo " - Boltz2 MD (GPUs 2,3)" -echo " - RF3 (GPUs 4,5)" -echo " - Protenix (GPUs 6,7)" +echo "Image: $IMAGE" +echo "Checkpoints: BAKED INTO IMAGE" echo "==========================================" -PIDS=() - -# --- Boltz2 X-ray Diffraction (GPUs 0,1) --- -echo "[$(date)] Starting Boltz2 X-ray on GPUs 0,1" -docker run $DOCKER_OPTS \ - --gpus '"device=0,1"' \ - -v "$DATA_DIR:/data/inputs:ro" \ - -v "$RESULTS_DIR:/data/results" \ - -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \ - diffuseproject/sampleworks:latest \ - -e boltz run_grid_search.py \ - --proteins "/data/inputs/proteins.csv" \ - --models boltz2 \ - --methods "X-RAY DIFFRACTION" \ - --scalers pure_guidance \ - --partial-diffusion-step 120 \ - --ensemble-sizes "8" \ - --gradient-weights "0.1 0.2 0.5" \ - --gradient-normalization --augmentation --align-to-input \ - --output-dir /data/results \ - 2>&1 | tee "$RESULTS_DIR/boltz2_xrd_run.log" & -PIDS+=($!) -echo "[$(date)] Boltz2 X-ray job started (PID: ${PIDS[-1]})" - -# --- Boltz2 MD (GPUs 2,3) --- -echo "[$(date)] Starting Boltz2 MD on GPUs 2,3" -docker run $DOCKER_OPTS \ - --gpus '"device=2,3"' \ - -v "$DATA_DIR:/data/inputs:ro" \ - -v "$RESULTS_DIR:/data/results" \ - -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \ - diffuseproject/sampleworks:latest \ - -e boltz run_grid_search.py \ - --proteins "/data/inputs/proteins.csv" \ - --models boltz2 \ - --methods "MD" \ - --scalers pure_guidance \ - --partial-diffusion-step 120 \ - --ensemble-sizes "8" \ - --gradient-weights "0.1 0.2 0.5" \ - --gradient-normalization --augmentation --align-to-input \ - --output-dir /data/results \ - 2>&1 | tee "$RESULTS_DIR/boltz2_md_run.log" & -PIDS+=($!) -echo "[$(date)] Boltz2 MD job started (PID: ${PIDS[-1]})" - -# --- RosettaFold3 (GPUs 4,5) --- -echo "[$(date)] Starting RosettaFold3 on GPUs 4,5" -docker run $DOCKER_OPTS \ - --gpus '"device=4,5"' \ - -v "$DATA_DIR:/data/inputs:ro" \ - -v "$RESULTS_DIR:/data/results" \ - -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \ - diffuseproject/sampleworks:latest \ - -e rf3 run_grid_search.py \ - --proteins "/data/inputs/proteins.csv" \ - --models rf3 \ - --partial-diffusion-step 120 \ - --scalers pure_guidance \ - --ensemble-sizes "8" \ - --gradient-weights "0.01 0.02 0.05" \ - --gradient-normalization --augmentation --align-to-input \ - --output-dir /data/results \ - 2>&1 | tee "$RESULTS_DIR/rf3_run.log" & -PIDS+=($!) -echo "[$(date)] RosettaFold3 job started (PID: ${PIDS[-1]})" - -# --- Protenix (GPUs 6,7) --- -echo "[$(date)] Starting Protenix on GPUs 6,7" -docker run $DOCKER_OPTS \ - --gpus '"device=6,7"' \ - -v "$DATA_DIR:/data/inputs:ro" \ - -v "$RESULTS_DIR:/data/results" \ - -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \ - diffuseproject/sampleworks:latest \ - -e protenix run_grid_search.py \ - --proteins "/data/inputs/proteins.csv" \ - --models protenix \ - --scalers pure_guidance \ - --partial-diffusion-step 120 \ - --ensemble-sizes "8" \ - --gradient-weights "0.1 0.2 0.5" \ - --gradient-normalization --augmentation --align-to-input \ - --output-dir /data/results \ - 2>&1 | tee "$RESULTS_DIR/protenix_run.log" & -PIDS+=($!) -echo "[$(date)] Protenix job started (PID: ${PIDS[-1]})" +# Track background job PIDs +declare -a PIDS=() +declare -a PID_NAMES=() + +# Function to run a model with specific GPUs +# Usage: run_model [extra_args...] +run_model() { + local model=$1 + local env=$2 + local gpus=$3 + shift 3 + local extra_args=("$@") + + echo "[$(date)] Starting $model on GPUs $gpus" + + docker run $DOCKER_OPTS \ + --gpus "\"device=$gpus\"" \ + -v /mnt/diffuse-private:/mnt/diffuse-private:ro \ + -v "$RESULTS_DIR:/data/results" \ + "$IMAGE" \ + -e "$env" run_grid_search.py \ + --proteins "$DATA_DIR/proteins.csv" \ + --model "$model" \ + --scalers "pure_guidance" \ + --ensemble-sizes "1 4" \ + --gradient-weights "0.1 0.2" \ + --gradient-normalization --augmentation --align-to-input \ + --output-dir /data/results \ + "${extra_args[@]}" \ + 2>&1 | tee "$RESULTS_DIR/${model}_run.log" & + + PIDS+=($!) + PID_NAMES+=("$model") + echo "[$(date)] $model job started (PID: $!)" +} + +# Run all four models in parallel with 4 GPUs each: +# - boltz1: GPUs 0,1,2,3 +# - boltz2: GPUs 4,5,6,7 +# - protenix: GPUs 8,9,10,11 +# - rf3: GPUs 12,13,14,15 + +# Boltz1 (GPUs 0-3) - checkpoints baked in, uses defaults +run_model "boltz1" "boltz" "0,1,2,3" + +# Boltz2 (GPUs 4-7) - needs --method flag +run_model "boltz2" "boltz" "4,5,6,7" --method "X-RAY DIFFRACTION" + +# Protenix (GPUs 8-11) +run_model "protenix" "protenix" "8,9,10,11" + +# RF3 (GPUs 12-15) +run_model "rf3" "rf3" "12,13,14,15" echo "" echo "==========================================" -echo "All 4 jobs launched! PIDs: ${PIDS[*]}" +echo "All model jobs launched!" echo "Logs:" -echo " - $RESULTS_DIR/boltz2_xrd_run.log" -echo " - $RESULTS_DIR/boltz2_md_run.log" -echo " - $RESULTS_DIR/rf3_run.log" +echo " - $RESULTS_DIR/boltz1_run.log" +echo " - $RESULTS_DIR/boltz2_run.log" echo " - $RESULTS_DIR/protenix_run.log" +echo " - $RESULTS_DIR/rf3_run.log" echo "" echo "Monitor GPU usage: nvidia-smi -l 1" echo "Waiting for all jobs to complete..." echo "==========================================" -# Wait for all background jobs -wait +# Wait for all background jobs and check exit codes +overall_exit=0 +for i in "${!PIDS[@]}"; do + if wait "${PIDS[$i]}"; then + echo "[$(date)] ${PID_NAMES[$i]} completed successfully" + else + echo "[$(date)] ${PID_NAMES[$i]} FAILED (exit code: $?)" + overall_exit=1 + fi +done echo "" echo "==========================================" -echo "[$(date)] All jobs completed!" +if [ $overall_exit -eq 0 ]; then + echo "[$(date)] All jobs completed successfully!" +else + echo "[$(date)] Some jobs FAILED — check logs above" +fi echo "==========================================" +exit $overall_exit \ No newline at end of file diff --git a/scripts/eval/rscc_grid_search_script.py b/scripts/eval/rscc_grid_search_script.py index 1819c5f1..a2eeede7 100644 --- a/scripts/eval/rscc_grid_search_script.py +++ b/scripts/eval/rscc_grid_search_script.py @@ -322,4 +322,4 @@ def main(args: argparse.Namespace): if __name__ == "__main__": args = parse_args("Evaluate RSCC on grid search results.") - main(args) + main(args) \ No newline at end of file diff --git a/tests/integration/test_mismatch_integration.py b/tests/integration/test_mismatch_integration.py index 00bdc9fd..dde08e21 100644 --- a/tests/integration/test_mismatch_integration.py +++ b/tests/integration/test_mismatch_integration.py @@ -12,7 +12,7 @@ from atomworks.io.transforms.atom_array import ensure_atom_array_stack from biotite.structure import AtomArray from sampleworks.core.rewards.protocol import RewardInputs -from sampleworks.core.samplers.edm import AF3EDMSampler, EDMSamplerConfig +from sampleworks.core.samplers.edm import AF3EDMSampler from sampleworks.core.samplers.protocol import StepParams from sampleworks.core.scalers.fk_steering import FKSteering from sampleworks.core.scalers.pure_guidance import PureGuidance @@ -720,14 +720,13 @@ class TestSamplerStep: @pytest.fixture def sampler(self) -> AF3EDMSampler: """Sampler configured for deterministic mismatch tests.""" - config = EDMSamplerConfig( + return AF3EDMSampler( augmentation=False, align_to_input=True, alignment_reverse_diffusion=False, scale_guidance_to_diffusion=True, device="cpu", ) - return AF3EDMSampler(config) def _context_with_reference( self, @@ -783,14 +782,13 @@ def test_alignment_reduces_rmsd(self, mismatch_case: MismatchCase, sampler: AF3E state = torch.randn(1, mismatch_case.n_model, 3) context = self._context_with_reference(reconciler, reference) - config_no_align = EDMSamplerConfig( + sampler_no_align = AF3EDMSampler( augmentation=False, align_to_input=False, alignment_reverse_diffusion=False, scale_guidance_to_diffusion=True, device="cpu", ) - sampler_no_align = AF3EDMSampler(config_no_align) torch.manual_seed(42) output_aligned = sampler.step(state.clone(), wrapper, context, features=features) @@ -879,8 +877,7 @@ def _run_scaler(self, case: MismatchCase, scaler_type: str, reward) -> Any: "asym_unit": case.struct_atom_array.copy(), "metadata": {"id": case.id}, } - config = EDMSamplerConfig(augmentation=False, align_to_input=True, device="cpu") - sampler = AF3EDMSampler(config) + sampler = AF3EDMSampler(augmentation=False, align_to_input=True, device="cpu") step_scaler = DataSpaceDPSScaler(step_size=0.01) if scaler_type == "pure_guidance": @@ -1023,4 +1020,4 @@ def test_save_with_model_template(self, tmp_path: Path): ) assert (tmp_path / "refined.cif").exists() - assert (tmp_path / "losses.txt").exists() + assert (tmp_path / "losses.txt").exists() \ No newline at end of file