3 changes: 3 additions & 0 deletions .github/workflows/docker.yml
@@ -41,6 +41,9 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

# The Dockerfile uses COPY --from=diffuseproject/sampleworks-checkpoints:latest
# which Docker automatically pulls from Docker Hub during the build.
# No checkpoint files are needed in the CI build context.
- name: Build and push Docker image
id: build-push
uses: docker/build-push-action@v5
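For local verification of what this CI step produces, a rough plain-docker equivalent is sketched below; the local tag and the final ls check are illustrative assumptions, while the image name, build context, and /checkpoints/ path come from the workflow comment and the Dockerfile.

# Local sketch of the CI build (assumes the repo root as build context).
# Docker pulls diffuseproject/sampleworks-checkpoints:latest automatically when it
# resolves the COPY --from line; the explicit pull just makes that step visible.
docker pull diffuseproject/sampleworks-checkpoints:latest
docker build -t sampleworks:local .
docker run --rm sampleworks:local ls -lh /checkpoints/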
25 changes: 16 additions & 9 deletions Dockerfile
@@ -2,7 +2,7 @@
# Sampleworks - Protein structure prediction with diffusion model guidance
#
# This container includes all three model environments: boltz, protenix, rf3
# Checkpoints are baked into the image at /checkpoints/
# Checkpoints are baked into the image at /checkpoints/ via a pre-built base image.
#
# Build:
# docker build -t sampleworks .
@@ -48,12 +48,16 @@
# docker run --gpus all -it sampleworks bash
#
# Baked-in checkpoints (from diffuseproject/sampleworks-checkpoints:latest):
# /checkpoints/boltz1_conf.ckpt - Boltz1 model
# /checkpoints/boltz2_conf.ckpt - Boltz2 model
# /checkpoints/ccd.pkl - Chemical Component Dictionary (required for Boltz)
# /checkpoints/mols/ - Boltz2 molecules data
# /checkpoints/rf3_foundry_01_24_latest.ckpt - RF3 model
# /checkpoints/protenix_base_default_v0.5.0.pt - Protenix model
# /checkpoints/boltz1_conf.ckpt - Boltz1 model (~3.5GB)
# /checkpoints/boltz2_conf.ckpt - Boltz2 model (~2.3GB)
# /checkpoints/ccd.pkl - Chemical Component Dictionary (~345MB)
# /checkpoints/mols/ - Boltz2 molecule data (~2GB)
# /checkpoints/rf3_foundry_01_24_latest.ckpt - RF3 model (~2.9GB)
# /checkpoints/protenix_base_default_v0.5.0.pt - Protenix model (~1.4GB)
#
# Checkpoints base image:
# All checkpoints live in diffuseproject/sampleworks-checkpoints:latest on Docker Hub.
# To rebuild that image, see /data/users/diffuse/checkpoint-build/ on the GPU server.

# ============================================================================
# Base stage: CUDA + Pixi + common system dependencies
@@ -116,10 +120,13 @@ print('CUDA extensions compiled successfully')" || echo "CUDA extension pre-comp

# ============================================================================
# Bake in model checkpoints from pre-built base image on Docker Hub
# This image contains: boltz1, boltz2, ccd, mols/, rf3, protenix checkpoints
# ============================================================================
# All checkpoints (Boltz1, Boltz2, CCD, mols, RF3, Protenix) are pre-built
# into diffuseproject/sampleworks-checkpoints:latest on Docker Hub.
# This avoids downloading ~6GB from HuggingFace during build and removes the
# need to have RF3/Protenix checkpoints in the build context.
# Rebuild with: docker build -t diffuseproject/sampleworks-checkpoints:latest
# docker push diffuseproject/sampleworks-checkpoints:latest
# ============================================================================
COPY --from=diffuseproject/sampleworks-checkpoints:latest /checkpoints/ /checkpoints/

# Set default checkpoint paths via environment variables
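The rebuild commands in the Dockerfile comment above leave out the build context; a hedged sketch of the full flow, assuming /data/users/diffuse/checkpoint-build/ holds a Dockerfile that copies the checkpoint files listed above into /checkpoints/:

# Sketch only: the contents of the checkpoint-build directory are an assumption.
cd /data/users/diffuse/checkpoint-build/
docker build -t diffuseproject/sampleworks-checkpoints:latest .
docker push diffuseproject/sampleworks-checkpoints:latest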
210 changes: 123 additions & 87 deletions run_all_models.sh
@@ -1,120 +1,156 @@
#!/bin/bash
# Run all model grid searches in parallel: Boltz1, Boltz2, Protenix, and RF3
# Total: 16 GPUs used (4 jobs x 4 GPUs each)
# Run all 4 model grid searches in parallel, 2 GPUs each
# Total: 8 GPUs used (4 jobs x 2 GPUs each)
#
# Checkpoints are BAKED INTO the Docker image - no need to mount them!
# Models:
# - Boltz2 X-ray diffraction (GPUs 0,1)
# - Boltz2 MD (GPUs 2,3)
# - RosettaFold3 (GPUs 4,5)
# - Protenix (GPUs 6,7)
#
# Checkpoints are BAKED INTO the Docker image at /checkpoints/.
# If missing, the code auto-falls back to mounted paths.
#
# Usage:
# ./run_all_models.sh

set -e

# Configuration - uses absolute path to data
DATA_DIR="/mnt/diffuse-private/raw/sampleworks/initial_dataset_40"
RESULTS_DIR="${RESULTS_DIR:-$HOME/sampleworks-exp/grid_search_results}"
# Docker image to use (override with IMAGE env var)
IMAGE="${IMAGE:-diffuseproject/sampleworks:latest}"
# Configuration
DATA_DIR="/mnt/diffuse-private/raw/sampleworks/initial_dataset_40_occ_sweeps"
RESULTS_DIR="${RESULTS_DIR:-/data/sampleworks-exp/occ_sweep/grid_search_results}"
MSA_CACHE_DIR="${MSA_CACHE_DIR:-/data/sampleworks-exp/msa_cache}"

# Create output directory
# Create directories
mkdir -p "$RESULTS_DIR"
mkdir -p "$MSA_CACHE_DIR"

# Pull latest image (no-op if already up to date)
echo "Pulling latest Docker image..."
docker pull diffuseproject/sampleworks:latest

# Common docker options
DOCKER_OPTS="--rm --shm-size=16g"

echo "=========================================="
echo "Starting all model grid searches"
echo "Models: boltz1, boltz2, protenix, rf3"
echo "Starting all model grid searches (4 jobs x 2 GPUs)"
echo "Data: $DATA_DIR"
echo "Results: $RESULTS_DIR"
echo "Image: $IMAGE"
echo "Checkpoints: BAKED INTO IMAGE"
echo "MSA Cache: $MSA_CACHE_DIR"
echo "Checkpoints: BAKED INTO IMAGE (with mount fallback)"
echo ""
echo "Models:"
echo " - Boltz2 X-ray (GPUs 0,1)"
echo " - Boltz2 MD (GPUs 2,3)"
echo " - RF3 (GPUs 4,5)"
echo " - Protenix (GPUs 6,7)"
echo "=========================================="

# Track background job PIDs
declare -a PIDS=()
declare -a PID_NAMES=()

# Function to run a model with specific GPUs
# Usage: run_model <model> <env> <gpus> [extra_args...]
run_model() {
local model=$1
local env=$2
local gpus=$3
shift 3
local extra_args=("$@")

echo "[$(date)] Starting $model on GPUs $gpus"

docker run $DOCKER_OPTS \
--gpus "\"device=$gpus\"" \
-v /mnt/diffuse-private:/mnt/diffuse-private:ro \
-v "$RESULTS_DIR:/data/results" \
"$IMAGE" \
-e "$env" run_grid_search.py \
--proteins "$DATA_DIR/proteins.csv" \
--models "$model" \
--scalers "pure_guidance" \
--ensemble-sizes "1 4" \
--gradient-weights "0.1 0.2" \
--gradient-normalization --augmentation --align-to-input \
--use-tweedie \
--output-dir /data/results \
"${extra_args[@]}" \
2>&1 | tee "$RESULTS_DIR/${model}_run.log" &

PIDS+=($!)
PID_NAMES+=("$model")
echo "[$(date)] $model job started (PID: $!)"
}

# Run all four models in parallel with 4 GPUs each:
# - boltz1: GPUs 0,1,2,3
# - boltz2: GPUs 4,5,6,7
# - protenix: GPUs 8,9,10,11
# - rf3: GPUs 12,13,14,15

# Boltz1 (GPUs 0-3) - checkpoints baked in, uses defaults
run_model "boltz1" "boltz" "0,1,2,3"

# Boltz2 (GPUs 4-7) - needs --methods flag
run_model "boltz2" "boltz" "4,5,6,7" --methods "X-RAY DIFFRACTION"

# Protenix (GPUs 8-11)
run_model "protenix" "protenix" "8,9,10,11"

# RF3 (GPUs 12-15)
run_model "rf3" "rf3" "12,13,14,15"
PIDS=()

# --- Boltz2 X-ray Diffraction (GPUs 0,1) ---
echo "[$(date)] Starting Boltz2 X-ray on GPUs 0,1"
docker run $DOCKER_OPTS \
--gpus '"device=0,1"' \
-v "$DATA_DIR:/data/inputs:ro" \
-v "$RESULTS_DIR:/data/results" \
-v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
diffuseproject/sampleworks:latest \
-e boltz run_grid_search.py \
--proteins "/data/inputs/proteins.csv" \
--models boltz2 \
--methods "X-RAY DIFFRACTION" \
--scalers pure_guidance \
--partial-diffusion-step 120 \
--ensemble-sizes "8" \
--gradient-weights "0.1 0.2 0.5" \
--gradient-normalization --augmentation --align-to-input \
--output-dir /data/results \
2>&1 | tee "$RESULTS_DIR/boltz2_xrd_run.log" &
Comment on lines +53 to +69

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

printf 'pipeline status without pipefail: '
bash -lc 'false | tee /dev/null >/dev/null; echo $?'

printf 'wait on a failed background pipeline without pipefail: '
bash -lc 'false | tee /dev/null >/dev/null & pid=$!; wait "$pid"; echo $?'

printf 'plain wait after a failed background pipeline: '
bash -lc 'false | tee /dev/null >/dev/null & wait; echo $?'



🏁 Script executed:

find . -name "run_all_models.sh" -type f



🏁 Script executed:

head -150 run_all_models.sh | cat -n



Background pipelines are masking docker run failures.

Each of the 4 job launches uses docker run ... | tee ... & without pipefail. Because set -e only applies to the final command in a pipeline, the exit status comes from tee (which always succeeds), not from docker run. Then the bare wait on line 146 returns success after all children exit, regardless of their actual exit codes. This means the script will report "All jobs completed!" even when one or more containers have failed.

Wrap all four launch blocks (lines 49–65, 71–87, 93–107, 113–128) in subshells with set -o pipefail, then replace the bare wait with explicit PID-based waits that check exit codes:

Suggested change
+(
+    set -o pipefail
     docker run $DOCKER_OPTS \
         --gpus '"device=0,1"' \
         -v "$DATA_DIR:/data/inputs:ro" \
         -v "$RESULTS_DIR:/data/results" \
         -v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
         diffuseproject/sampleworks:latest \
         -e boltz run_grid_search.py \
         --proteins "/data/inputs/proteins.csv" \
         --models boltz2 \
         --methods "X-RAY DIFFRACTION" \
         --scalers pure_guidance \
         --partial-diffusion-step 120 \
         --ensemble-sizes "8" \
         --gradient-weights "0.1 0.2 0.5" \
         --gradient-normalization --augmentation --align-to-input \
         --output-dir /data/results \
         2>&1 | tee "$RESULTS_DIR/boltz2_xrd_run.log"
+) &
 # Wait for all background jobs
-wait
+failed=0
+for pid in "${PIDS[@]}"; do
+    if ! wait "$pid"; then
+        failed=1
+    fi
+done
+if (( failed )); then
+    exit 1
+fi

Apply the same subshell + pipefail pattern to all four job launches (lines 71–87, 93–107, 113–128).

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@run_all_models.sh` around lines 49 - 65, The backgrounded docker run
pipelines (each using "docker run ... | tee ... &") are masking failures because
tee hides docker's exit code; wrap each launch pipeline in a subshell that sets
pipefail (e.g., ( set -o pipefail; docker run ... | tee ... ) & ), capture each
background PID (use $! into an array or variables) and then replace the single
bare wait with explicit waits that iterate over those PIDs, check each wait's
exit status, and exit nonzero if any container failed; apply this
subshell+pipefail+PID-capture pattern to all four launches using the existing
"docker run" and "tee" invocations and then perform PID-based waits instead of
the bare "wait".
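A compact sketch of the pattern this review asks for, applied to one launch; the run_job helper and its calling convention are illustrative, not part of the script:

# Illustrative helper: subshell + pipefail so the backgrounded job's status reflects
# docker run rather than tee.
run_job() {
    local log="$1"; shift
    (
        set -o pipefail
        docker run "$@" 2>&1 | tee "$log"
    ) &
    PIDS+=($!)
}

# ...launch the four jobs through run_job with their GPU sets and arguments...

# Wait on each PID so any failed container fails the whole script.
failed=0
for pid in "${PIDS[@]}"; do
    wait "$pid" || failed=1
done
exit "$failed"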

PIDS+=($!)
echo "[$(date)] Boltz2 X-ray job started (PID: ${PIDS[-1]})"

# --- Boltz2 MD (GPUs 2,3) ---
echo "[$(date)] Starting Boltz2 MD on GPUs 2,3"
docker run $DOCKER_OPTS \
--gpus '"device=2,3"' \
-v "$DATA_DIR:/data/inputs:ro" \
-v "$RESULTS_DIR:/data/results" \
-v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
diffuseproject/sampleworks:latest \
-e boltz run_grid_search.py \
--proteins "/data/inputs/proteins.csv" \
--models boltz2 \
--methods "MD" \
--scalers pure_guidance \
--partial-diffusion-step 120 \
--ensemble-sizes "8" \
--gradient-weights "0.1 0.2 0.5" \
--gradient-normalization --augmentation --align-to-input \
--output-dir /data/results \
2>&1 | tee "$RESULTS_DIR/boltz2_md_run.log" &
PIDS+=($!)
echo "[$(date)] Boltz2 MD job started (PID: ${PIDS[-1]})"

# --- RosettaFold3 (GPUs 4,5) ---
echo "[$(date)] Starting RosettaFold3 on GPUs 4,5"
docker run $DOCKER_OPTS \
--gpus '"device=4,5"' \
-v "$DATA_DIR:/data/inputs:ro" \
-v "$RESULTS_DIR:/data/results" \
-v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
diffuseproject/sampleworks:latest \
-e rf3 run_grid_search.py \
--proteins "/data/inputs/proteins.csv" \
--models rf3 \
--partial-diffusion-step 120 \
--scalers pure_guidance \
--ensemble-sizes "8" \
--gradient-weights "0.01 0.02 0.05" \
--gradient-normalization --augmentation --align-to-input \
--output-dir /data/results \
2>&1 | tee "$RESULTS_DIR/rf3_run.log" &
PIDS+=($!)
echo "[$(date)] RosettaFold3 job started (PID: ${PIDS[-1]})"

# --- Protenix (GPUs 6,7) ---
echo "[$(date)] Starting Protenix on GPUs 6,7"
docker run $DOCKER_OPTS \
--gpus '"device=6,7"' \
-v "$DATA_DIR:/data/inputs:ro" \
-v "$RESULTS_DIR:/data/results" \
-v "$MSA_CACHE_DIR:/root/.sampleworks/msa" \
diffuseproject/sampleworks:latest \
-e protenix run_grid_search.py \
--proteins "/data/inputs/proteins.csv" \
--models protenix \
--scalers pure_guidance \
--partial-diffusion-step 120 \
--ensemble-sizes "8" \
--gradient-weights "0.1 0.2 0.5" \
--gradient-normalization --augmentation --align-to-input \
--output-dir /data/results \
2>&1 | tee "$RESULTS_DIR/protenix_run.log" &
PIDS+=($!)
echo "[$(date)] Protenix job started (PID: ${PIDS[-1]})"

echo ""
echo "=========================================="
echo "All model jobs launched!"
echo "All 4 jobs launched! PIDs: ${PIDS[*]}"
echo "Logs:"
echo " - $RESULTS_DIR/boltz1_run.log"
echo " - $RESULTS_DIR/boltz2_run.log"
echo " - $RESULTS_DIR/protenix_run.log"
echo " - $RESULTS_DIR/boltz2_xrd_run.log"
echo " - $RESULTS_DIR/boltz2_md_run.log"
echo " - $RESULTS_DIR/rf3_run.log"
echo " - $RESULTS_DIR/protenix_run.log"
echo ""
echo "Monitor GPU usage: nvidia-smi -l 1"
echo "Waiting for all jobs to complete..."
echo "=========================================="

# Wait for all background jobs and check exit codes
overall_exit=0
for i in "${!PIDS[@]}"; do
if wait "${PIDS[$i]}"; then
echo "[$(date)] ${PID_NAMES[$i]} completed successfully"
else
echo "[$(date)] ${PID_NAMES[$i]} FAILED (exit code: $?)"
overall_exit=1
fi
done
# Wait for all background jobs
wait

echo ""
echo "=========================================="
Comment on lines +150 to 154

Copilot AI Mar 10, 2026


The script now uses a bare wait under set -e, which does not reliably propagate failures from background jobs (it returns the status of the last job waited on). This can report success even if one of the earlier model runs failed. Restore per-PID waiting and aggregate exit codes (as the previous version did) so CI/automation can detect partial failures.

Suggested change
# Wait for all background jobs
wait
echo ""
echo "=========================================="
# Wait for all background jobs individually and aggregate exit codes
overall_status=0
for pid in "${PIDS[@]}"; do
if ! wait "$pid"; then
exit_code=$?
echo "[$(date)] Job with PID $pid failed with exit code $exit_code"
overall_status=$exit_code
else
echo "[$(date)] Job with PID $pid completed successfully"
fi
done
echo ""
echo "=========================================="
if [ "$overall_status" -ne 0 ]; then
echo "[$(date)] One or more jobs failed."
echo "=========================================="
exit "$overall_status"
fi

if [ $overall_exit -eq 0 ]; then
echo "[$(date)] All jobs completed successfully!"
else
echo "[$(date)] Some jobs FAILED — check logs above"
fi
echo "[$(date)] All jobs completed!"
echo "=========================================="
exit $overall_exit