3 changes: 3 additions & 0 deletions .github/workflows/docker.yml
@@ -41,6 +41,9 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

# The Dockerfile uses COPY --from=diffuseproject/sampleworks-checkpoints:latest
# which Docker automatically pulls from Docker Hub during the build.
# No checkpoint files are needed in the CI build context.
- name: Build and push Docker image
id: build-push
uses: docker/build-push-action@v5
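For local verification of what this CI step produces, a rough plain-docker equivalent is sketched below; the local tag and the final ls check are illustrative assumptions, while the image name, build context, and /checkpoints/ path come from the workflow comment and the Dockerfile.

# Local sketch of the CI build (assumes the repo root as build context).
# Docker pulls diffuseproject/sampleworks-checkpoints:latest automatically when it
# resolves the COPY --from line; the explicit pull just makes that step visible.
docker pull diffuseproject/sampleworks-checkpoints:latest
docker build -t sampleworks:local .
docker run --rm sampleworks:local ls -lh /checkpoints/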
25 changes: 16 additions & 9 deletions Dockerfile
@@ -2,7 +2,7 @@
# Sampleworks - Protein structure prediction with diffusion model guidance
#
# This container includes all three model environments: boltz, protenix, rf3
# Checkpoints are baked into the image at /checkpoints/
# Checkpoints are baked into the image at /checkpoints/ via a pre-built base image.
#
# Build:
# docker build -t sampleworks .
@@ -48,12 +48,16 @@
# docker run --gpus all -it sampleworks bash
#
# Baked-in checkpoints (from diffuseproject/sampleworks-checkpoints:latest):
# /checkpoints/boltz1_conf.ckpt - Boltz1 model
# /checkpoints/boltz2_conf.ckpt - Boltz2 model
# /checkpoints/ccd.pkl - Chemical Component Dictionary (required for Boltz)
# /checkpoints/mols/ - Boltz2 molecules data
# /checkpoints/rf3_foundry_01_24_latest.ckpt - RF3 model
# /checkpoints/protenix_base_default_v0.5.0.pt - Protenix model
# /checkpoints/boltz1_conf.ckpt - Boltz1 model (~3.5GB)
# /checkpoints/boltz2_conf.ckpt - Boltz2 model (~2.3GB)
# /checkpoints/ccd.pkl - Chemical Component Dictionary (~345MB)
# /checkpoints/mols/ - Boltz2 molecule data (~2GB)
# /checkpoints/rf3_foundry_01_24_latest.ckpt - RF3 model (~2.9GB)
# /checkpoints/protenix_base_default_v0.5.0.pt - Protenix model (~1.4GB)
#
# Checkpoints base image:
# All checkpoints live in diffuseproject/sampleworks-checkpoints:latest on Docker Hub.
# To rebuild that image, see /data/users/diffuse/checkpoint-build/ on the GPU server.

# ============================================================================
# Base stage: CUDA + Pixi + common system dependencies
@@ -116,10 +120,13 @@ print('CUDA extensions compiled successfully')" || echo "CUDA extension pre-comp

# ============================================================================
# Bake in model checkpoints from pre-built base image on Docker Hub
# This image contains: boltz1, boltz2, ccd, mols/, rf3, protenix checkpoints
# ============================================================================
# All checkpoints (Boltz1, Boltz2, CCD, mols, RF3, Protenix) are pre-built
# into diffuseproject/sampleworks-checkpoints:latest on Docker Hub.
# This avoids downloading ~6GB from HuggingFace during build and removes the
# need to have RF3/Protenix checkpoints in the build context.
# Rebuild with: docker build -t diffuseproject/sampleworks-checkpoints:latest
# docker push diffuseproject/sampleworks-checkpoints:latest
# ============================================================================
COPY --from=diffuseproject/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/

# Set default checkpoint paths via environment variables
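The rebuild commands in the Dockerfile comment above leave out the build context; a hedged sketch of the full flow, assuming /data/users/diffuse/checkpoint-build/ holds a Dockerfile that copies the checkpoint files listed above into /checkpoints/:

# Sketch only: the contents of the checkpoint-build directory are an assumption.
cd /data/users/diffuse/checkpoint-build/
docker build -t diffuseproject/sampleworks-checkpoints:latest .
docker push diffuseproject/sampleworks-checkpoints:latest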
210 changes: 123 additions & 87 deletions run_all_models.sh
@@ -1,120 +1,156 @@
#!/bin/bash
# Run all model grid searches in parallel: Boltz1, Boltz2, Protenix, and RF3
# Total: 16 GPUs used (4 jobs x 4 GPUs each)
# Run all 4 model grid searches in parallel, 2 GPUs each
# Total: 8 GPUs used (4 jobs x 2 GPUs each)
#
# Checkpoints are BAKED INTO the Docker image - no need to mount them!
# Models:
# - Boltz2 X-ray diffraction (GPUs 0,1)
# - Boltz2 MD (GPUs 2,3)
# - RosettaFold3 (GPUs 4,5)
# - Protenix (GPUs 6,7)
#
# Checkpoints are BAKED INTO the Docker image at /checkpoints/.
# If missing, the code auto-falls back to mounted paths.
#
# Usage:
# ./run_all_models.sh

set -e

# Configuration - uses absolute path to data
DATA_DIR="/mnt/diffuse-private/raw/sampleworks/initial_dataset_40"
RESULTS_DIR="${RESULTS_DIR:-$HOME/sampleworks-exp/grid_search_results}"
# Docker image to use (override with IMAGE env var)
IMAGE="${IMAGE:-diffuseproject/sampleworks:latest}"
# Configuration
DATA_DIR="/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
RESULTS_DIR="${RESULTS_DIR:-/data/sampleworks-exp/occ_sweep/grid_search_results}"
MSA_CACHE_DIR="${MSA_CACHE_DIR:-/data/sampleworks-exp/msa_cache}"

# Create output directory
# Create directories
mkdir -p "$RESULTS_DIR"
mkdir -p "$MSA_CACHE_DIR"

# Pull latest image (no-op if already up to date)
echo "Pulling latest Docker image..."
docker pull diffuseproject/sampleworks:latest

# Common docker options
DOCKER_OPTS="--rm --shm-size=16g"

echo "=========================================="
echo "Starting all model grid searches"
echo "Models: boltz1, boltz2, protenix, rf3"
echo "Starting all model grid searches (4 jobs x 2 GPUs)"
echo "Data: $DATA_DIR"
echo "Results: $RESULTS_DIR"
echo "Image: $IMAGE"
echo "Checkpoints: BAKED INTO IMAGE"
echo "MSA Cache: $MSA_CACHE_DIR"
echo "Checkpoints: BAKED INTO IMAGE (with mount fallback)"
echo ""
echo "Models:"
echo " - Boltz2 X-ray (GPUs 0,1)"
echo " - Boltz2 MD (GPUs 2,3)"
echo " - RF3 (GPUs 4,5)"
echo " - Protenix (GPUs 6,7)"
echo "=========================================="

# Track background job PIDs
declare -a PIDS=()
declare -a PID_NAMES=()

# Function to run a model with specific GPUs
# Usage: run_model <model> <env> <gpus> [extra_args...]
run_model() {
local model=$1
local env=$2
local gpus=$3
shift 3
local extra_args=("$@")

echo "[$(date)] Starting $model on GPUs $gpus"

docker run $DOCKER_OPTS \
--gpus "\"device=$gpus\"" \
-v /mnt/diffuse-private:/mnt/diffuse-private:ro \
-v "$RESULTS_DIR:/data/results" \
"$IMAGE" \
-e "$env" run_grid_search.py \
--proteins "$DATA_DIR/proteins.csv" \
--models "$model" \
--scalers "pure_guidance" \
--ensemble-sizes "1 4" \
--gradient-weights "0.1 0.2" \
--gradient-normalization --augmentation --align-to-input \
--use-tweedie \
--output-dir /data/results \
"${extra_args[@]}" \
2>&1 | tee "$RESULTS_DIR/${model}_run.log" &

PIDS+=($!)
PID_NAMES+=("$model")
echo "[$(date)] $model job started (PID: $!)"
}

# Run all four models in parallel with 4 GPUs each:
# - boltz1: GPUs 0,1,2,3
# - boltz2: GPUs 4,5,6,7
# - protenix: GPUs 8,9,10,11
# - rf3: GPUs 12,13,14,15

# Boltz1 (GPUs 0-3) - checkpoints baked in, uses defaults
run_model "boltz1" "boltz" "0,1,2,3"

# Boltz2 (GPUs 4-7) - needs --methods flag
run_model "boltz2" "boltz" "4,5,6,7" --methods "X-RAY DIFFRACTION"

# Protenix (GPUs 8-11)
run_model "protenix" "protenix" "8,9,10,11"

# RF3 (GPUs 12-15)
run_model "rf3" "rf3" "12,13,14,15"
PIDS=()

# --- Boltz2 X-ray Diffraction (GPUs 0,1) ---
echo "[$(date)] Starting Boltz2 X-ray on GPUs 0,1"
docker run $DOCKER_OPTS \
--gpus '"device=0,1"' \
-v "$DATA_DIR:/data/inputs:ro" \
-v "$RESULTS_DIR:/data/results" \
-v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
diffuseproject/sampleworks:latest \
-e boltz run_grid_search.py \
--proteins "/data/inputs/proteins.csv" \
--models boltz2 \
--methods "X-RAY DIFFRACTION" \
--scalers pure_guidance \
--partial-diffusion-step 120 \
--ensemble-sizes "8" \
--gradient-weights "0.1 0.2 0.5" \
--gradient-normalization --augmentation --align-to-input \
--output-dir /data/results \
2>&1 | tee "$RESULTS_DIR/boltz2_xrd_run.log" &
Comment on lines +53 to +69

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

printf 'pipeline status without pipefail: '
bash -lc 'false | tee /dev/null >/dev/null; echo $?'

printf 'wait on a failed background pipeline without pipefail: '
bash -lc 'false | tee /dev/null >/dev/null & pid=$!; wait "$pid"; echo $?'

printf 'plain wait after a failed background pipeline: '
bash -lc 'false | tee /dev/null >/dev/null & wait; echo $?'



🏁 Script executed:

find . -name "run_all_models.sh" -type f



🏁 Script executed:

head -150 run_all_models.sh | cat -n



Background pipelines are masking docker run failures.

Each of the 4 job launches uses docker run ... | tee ... & without pipefail. Because set -e only applies to the final command in a pipeline, the exit status comes from tee (which always succeeds), not from docker run. Then the bare wait on line 146 returns success after all children exit, regardless of their actual exit codes. This means the script will report "All jobs completed!" even when one or more containers have failed.

Wrap all four launch blocks (lines 49–65, 71–87, 93–107, 113–128) in subshells with set -o pipefail, then replace the bare wait with explicit PID-based waits that check exit codes:

Suggested change
+(
+    set -o pipefail
     docker run $DOCKER_OPTS \
         --gpus '"device=0,1"' \
         -v "$DATA_DIR:/data/inputs:ro" \
         -v "$RESULTS_DIR:/data/results" \
         -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
         diffuseproject/sampleworks:latest \
         -e boltz run_grid_search.py \
         --proteins "/data/inputs/proteins.csv" \
         --models boltz2 \
         --methods "X-RAY DIFFRACTION" \
         --scalers pure_guidance \
         --partial-diffusion-step 120 \
         --ensemble-sizes "8" \
         --gradient-weights "0.1 0.2 0.5" \
         --gradient-normalization --augmentation --align-to-input \
         --output-dir /data/results \
         2>&1 | tee "$RESULTS_DIR/boltz2_xrd_run.log"
+) &
 # Wait for all background jobs
-wait
+failed=0
+for pid in "${PIDS[@]}"; do
+    if ! wait "$pid"; then
+        failed=1
+    fi
+done
+if (( failed )); then
+    exit 1
+fi

Apply the same subshell + pipefail pattern to all four job launches (lines 71–87, 93–107, 113–128).

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@run_all_models.sh` around lines 49 - 65, The backgrounded docker run
pipelines (each using "docker run ... | tee ... &") are masking failures because
tee hides docker's exit code; wrap each launch pipeline in a subshell that sets
pipefail (e.g., ( set -o pipefail; docker run ... | tee ... ) & ), capture each
background PID (use $! into an array or variables) and then replace the single
bare wait with explicit waits that iterate over those PIDs, check each wait's
exit status, and exit nonzero if any container failed; apply this
subshell+pipefail+PID-capture pattern to all four launches using the existing
"docker run" and "tee" invocations and then perform PID-based waits instead of
the bare "wait".
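A compact sketch of the pattern this review asks for, applied to one launch; the run_job helper and its calling convention are illustrative, not part of the script:

# Illustrative helper: subshell + pipefail so the backgrounded job's status reflects
# docker run rather than tee.
run_job() {
    local log="$1"; shift
    (
        set -o pipefail
        docker run "$@" 2>&1 | tee "$log"
    ) &
    PIDS+=($!)
}

# ...launch the four jobs through run_job with their GPU sets and arguments...

# Wait on each PID so any failed container fails the whole script.
failed=0
for pid in "${PIDS[@]}"; do
    wait "$pid" || failed=1
done
exit "$failed"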

PIDS+=($!)
echo "[$(date)] Boltz2 X-ray job started (PID: ${PIDS[-1]})"

# --- Boltz2 MD (GPUs 2,3) ---
echo "[$(date)] Starting Boltz2 MD on GPUs 2,3"
docker run $DOCKER_OPTS \
--gpus '"device=2,3"' \
-v "$DATA_DIR:/data/inputs:ro" \
-v "$RESULTS_DIR:/data/results" \
-v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
diffuseproject/sampleworks:latest \
-e boltz run_grid_search.py \
--proteins "/data/inputs/proteins.csv" \
--models boltz2 \
--methods "MD" \
--scalers pure_guidance \
--partial-diffusion-step 120 \
--ensemble-sizes "8" \
--gradient-weights "0.1 0.2 0.5" \
--gradient-normalization --augmentation --align-to-input \
--output-dir /data/results \
2>&1 | tee "$RESULTS_DIR/boltz2_md_run.log" &
PIDS+=($!)
echo "[$(date)] Boltz2 MD job started (PID: ${PIDS[-1]})"

# --- RosettaFold3 (GPUs 4,5) ---
echo "[$(date)] Starting RosettaFold3 on GPUs 4,5"
docker run $DOCKER_OPTS \
--gpus '"device=4,5"' \
-v "$DATA_DIR:/data/inputs:ro" \
-v "$RESULTS_DIR:/data/results" \
-v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
diffuseproject/sampleworks:latest \
-e rf3 run_grid_search.py \
--proteins "/data/inputs/proteins.csv" \
--models rf3 \
--partial-diffusion-step 120 \
--scalers pure_guidance \
--ensemble-sizes "8" \
--gradient-weights "0.01 0.02 0.05" \
--gradient-normalization --augmentation --align-to-input \
--output-dir /data/results \
2>&1 | tee "$RESULTS_DIR/rf3_run.log" &
PIDS+=($!)
echo "[$(date)] RosettaFold3 job started (PID: ${PIDS[-1]})"

# --- Protenix (GPUs 6,7) ---
echo "[$(date)] Starting Protenix on GPUs 6,7"
docker run $DOCKER_OPTS \
--gpus '"device=6,7"' \
-v "$DATA_DIR:/data/inputs:ro" \
-v "$RESULTS_DIR:/data/results" \
-v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
diffuseproject/sampleworks:latest \
-e protenix run_grid_search.py \
--proteins "/data/inputs/proteins.csv" \
--models protenix \
--scalers pure_guidance \
--partial-diffusion-step 120 \
--ensemble-sizes "8" \
--gradient-weights "0.1 0.2 0.5" \
--gradient-normalization --augmentation --align-to-input \
--output-dir /data/results \
2>&1 | tee "$RESULTS_DIR/protenix_run.log" &
PIDS+=($!)
echo "[$(date)] Protenix job started (PID: ${PIDS[-1]})"

echo ""
echo "=========================================="
echo "All model jobs launched!"
echo "All 4 jobs launched! PIDs: ${PIDS[*]}"
echo "Logs:"
echo " - $RESULTS_DIR/boltz1_run.log"
echo " - $RESULTS_DIR/boltz2_run.log"
echo " - $RESULTS_DIR/protenix_run.log"
echo " - $RESULTS_DIR/boltz2_xrd_run.log"
echo " - $RESULTS_DIR/boltz2_md_run.log"
echo " - $RESULTS_DIR/rf3_run.log"
echo " - $RESULTS_DIR/protenix_run.log"
echo ""
echo "Monitor GPU usage: nvidia-smi -l 1"
echo "Waiting for all jobs to complete..."
echo "=========================================="

# Wait for all background jobs and check exit codes
overall_exit=0
for i in "${!PIDS[@]}"; do
if wait "${PIDS[$i]}"; then
echo "[$(date)] ${PID_NAMES[$i]} completed successfully"
else
echo "[$(date)] ${PID_NAMES[$i]} FAILED (exit code: $?)"
overall_exit=1
fi
done
# Wait for all background jobs
wait

echo ""
echo "=========================================="
Comment on lines +150 to 154

Copilot AI Mar 10, 2026


The script now uses a bare wait under set -e, which does not reliably propagate failures from background jobs (it returns the status of the last job waited on). This can report success even if one of the earlier model runs failed. Restore per-PID waiting and aggregate exit codes (as the previous version did) so CI/automation can detect partial failures.

Suggested change
# Wait for all background jobs
wait
echo ""
echo "=========================================="
# Wait for all background jobs individually and aggregate exit codes
overall_status=0
for pid in "${PIDS[@]}"; do
if ! wait "$pid"; then
exit_code=$?
echo "[$(date)] Job with PID $pid failed with exit code $exit_code"
overall_status=$exit_code
else
echo "[$(date)] Job with PID $pid completed successfully"
fi
done
echo ""
echo "=========================================="
if [ "$overall_status" -ne 0 ]; then
echo "[$(date)] One or more jobs failed."
echo "=========================================="
exit "$overall_status"
fi

if [ $overall_exit -eq 0 ]; then
echo "[$(date)] All jobs completed successfully!"
else
echo "[$(date)] Some jobs FAILED — check logs above"
fi
echo "[$(date)] All jobs completed!"
echo "=========================================="
exit $overall_exit