sampleworks/docker-entrypoint.sh at main · diff-use/sampleworks · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
#!/bin/bash
# Sampleworks Docker Entrypoint
#
# Usage:
#   docker run sampleworks -e <pixi_env> <script> [args...]
#   docker run sampleworks -e boltz run_grid_search.py --proteins /data/proteins.csv ...
#   docker run sampleworks bash  # interactive shell
#
# Available pixi environments: boltz, protenix, rf3
#
# Examples:
#   # Run grid search with RF3
#   docker run --gpus all --shm-size=16g -v /data:/data sampleworks \
#     -e rf3 run_grid_search.py \
#     --proteins /data/proteins.csv \
#     --models rf3 \
#     --scalers pure_guidance \
#     --ensemble-sizes "1 4" \
#     --gradient-weights "0.1 0.2" \
#     --output-dir /data/results \
#     --use-tweedie \
#     --rf3-checkpoint /data/checkpoints/rf3.ckpt

# Abort the entrypoint as soon as any unguarded command returns non-zero.
set -o errexit

# Print the full usage/help text for the container entrypoint to stdout.
#
# Arguments: none
# Outputs:   the help text (usage, environments, examples, grid-search
#            arguments, baked-in checkpoints, metadata path remapping and
#            the proteins CSV format)
#
# The here-doc delimiter is quoted ('EOF') so that nothing inside the text
# ($DATA, $RESULTS, backslashes, ...) is expanded by the shell.
#
# All argument groups of run_grid_search.py (including "FK steering options"
# and "Advanced") are kept together under GRID SEARCH ARGUMENTS, ahead of the
# BAKED-IN CHECKPOINTS section.
show_help() {
    cat << 'EOF'
Sampleworks - Protein structure prediction with diffusion model guidance

USAGE:
    docker run --gpus all --shm-size=16g sampleworks -e <environment> <script> [arguments...]
    docker run sampleworks bash
    docker run sampleworks --help

IMPORTANT:
    Always use --shm-size=16g (or larger) to avoid shared memory errors with DataLoaders.

OPTIONS:
    -e, --env <env>     Pixi environment to use (boltz, protenix, rf3)
    -h, --help          Show this help message
    bash                Start an interactive shell

ENVIRONMENTS:
    boltz       For boltz1 and boltz2 models
    protenix    For protenix model
    rf3         For RF3 model

EXAMPLES:
    # Run grid search with RF3 model
    docker run --gpus all --shm-size=16g -v /data:/data sampleworks \
      -e rf3 run_grid_search.py \
      --proteins /data/proteins.csv \
      --models rf3 \
      --scalers pure_guidance \
      --ensemble-sizes "1 4" \
      --gradient-weights "0.1 0.2" \
      --output-dir /data/results \
      --use-tweedie \
      --gradient-normalization \
      --augmentation \
      --align-to-input \
      --rf3-checkpoint /data/checkpoints/rf3_foundry_01_24_latest.ckpt

    # Run grid search with Boltz1 model
    docker run --gpus all --shm-size=16g -v /data:/data sampleworks \
      -e boltz run_grid_search.py \
      --proteins /data/proteins.csv \
      --models boltz1 \
      --scalers pure_guidance \
      --ensemble-sizes "1 4" \
      --gradient-weights "0.1 0.2" \
      --output-dir /data/results \
      --use-tweedie \
      --boltz1-checkpoint /data/checkpoints/boltz1_conf.ckpt

    # Run grid search with Boltz2 model
    docker run --gpus all --shm-size=16g -v /data:/data sampleworks \
      -e boltz run_grid_search.py \
      --proteins /data/proteins.csv \
      --models boltz2 \
      --scalers pure_guidance \
      --methods "X-RAY DIFFRACTION" \
      --ensemble-sizes "1 4" \
      --gradient-weights "0.1 0.2" \
      --output-dir /data/results \
      --use-tweedie \
      --boltz2-checkpoint /data/checkpoints/boltz2_conf.ckpt

    # Run grid search with Protenix model
    docker run --gpus all --shm-size=16g -v /data:/data sampleworks \
      -e protenix run_grid_search.py \
      --proteins /data/proteins.csv \
      --models protenix \
      --scalers pure_guidance \
      --ensemble-sizes "1 4" \
      --gradient-weights "0.1 0.2" \
      --output-dir /data/results \
      --use-tweedie \
      --protenix-checkpoint /data/checkpoints/protenix_base_default_v0.5.0.pt

    # Interactive shell
    docker run --gpus all --shm-size=16g -it sampleworks bash

    # Run a custom script
    docker run --gpus all --shm-size=16g -v /data:/data sampleworks \
      -e boltz scripts/boltz2_pure_guidance.py \
      --structure /data/structure.cif \
      --density /data/density.ccp4 \
      --resolution 1.8

GRID SEARCH ARGUMENTS (run_grid_search.py):
    Required:
      --proteins FILE             CSV file with columns: structure,density,resolution,name

    Model selection:
      --models MODEL              Model to use (boltz1, boltz2, protenix, rf3)
                                  Note: Only one model per run currently supported

    Guidance configuration:
      --scalers SCALER            Guidance method (pure_guidance, fk_steering)
      --ensemble-sizes "N M..."   Space-separated ensemble sizes (e.g., "1 4")
      --gradient-weights "X Y..." Space-separated gradient weights (e.g., "0.1 0.2")
      --use-tweedie               Use Tweedie's formula for gradient computation
      --gradient-normalization    Enable gradient normalization
      --augmentation              Enable data augmentation
      --align-to-input            Enable alignment to input structure

    Output:
      --output-dir DIR            Output directory for results
      --dry-run                   Print commands without executing

    Job control:
      --force-all                 Re-run all jobs including successful ones
      --only-failed               Run only failed jobs
      --only-missing              Run only un-run jobs
      --max-parallel N            Max parallel jobs (default: auto = number of GPUs)

    Model-specific options:
      --boltz1-checkpoint PATH    Path to Boltz1 checkpoint (default: /checkpoints/boltz1_conf.ckpt - BAKED IN)
      --boltz2-checkpoint PATH    Path to Boltz2 checkpoint (default: /checkpoints/boltz2_conf.ckpt - BAKED IN)
      --protenix-checkpoint PATH  Path to Protenix checkpoint
      --rf3-checkpoint PATH       Path to RF3 checkpoint
      --methods METHOD            Boltz2 sampling method (default: "X-RAY DIFFRACTION")

    FK steering options:
      --num-gd-steps "N M..."     Space-separated GD steps (FK steering only)
      --num-particles N           Number of particles for FK steering (default: 3)
      --fk-lambda FLOAT           Weighting factor for resampling (default: 0.5)
      --fk-resampling-interval N  How often to apply resampling (default: 1)

    Advanced:
      --partial-diffusion-step N  Diffusion step to start from (default: 0)
      --loss-order N              L1 (1) or L2 (2) loss (default: 2)

BAKED-IN CHECKPOINTS:
    The following checkpoints are pre-installed in the image:
      /checkpoints/boltz1_conf.ckpt                   - Boltz1 model (~3.5GB)
      /checkpoints/boltz2_conf.ckpt                   - Boltz2 model (~2.3GB)
      /checkpoints/ccd.pkl                             - Chemical Component Dictionary (~345MB)
      /checkpoints/rf3_foundry_01_24_latest.ckpt       - RF3 model (~2.9GB)
      /checkpoints/protenix_base_default_v0.5.0.pt     - Protenix model (~1.4GB)

METADATA HOST PATH REMAPPING:
    By default, job_metadata.json records container-internal paths (e.g.
    /data/inputs/..., /data/results/...). To record host paths instead, pass
    one or more of these environment variables:

    Split mounts (-v $DATA:/data/inputs -v $RESULTS:/data/results):
      SAMPLEWORKS_HOST_INPUT_DIR       Host path mounted at /data/inputs
      SAMPLEWORKS_HOST_RESULTS_DIR    Host path mounted at /data/results

    Single mount (-v /host/path:/data):
      SAMPLEWORKS_HOST_DIR            Host path mounted at /data

    Example (split mounts):
      docker run -v /mnt/data:/data/inputs:ro -v /results:/data/results \
        -e SAMPLEWORKS_HOST_INPUT_DIR=/mnt/data \
        -e SAMPLEWORKS_HOST_RESULTS_DIR=/results \
        sampleworks -e boltz run_grid_search.py ...

PROTEINS CSV FORMAT:
    The --proteins CSV file must have the following columns:
      name        - Protein identifier
      structure   - Path to input structure file (.cif, .pdb)
      density     - Path to density map file (.ccp4, .mrc, .map)
      resolution  - Map resolution in Angstroms

    Example:
      name,structure,density,resolution
      1abc,/data/structures/1abc.cif,/data/maps/1abc.ccp4,2.0
      2xyz,/data/structures/2xyz.cif,/data/maps/2xyz.mrc,1.8

For full argument details, run:
    docker run sampleworks -e boltz run_grid_search.py --help
EOF
}

# ---------------------------------------------------------------------------
# Entrypoint dispatch. The first argument selects the mode:
#
#   (no args) | -h | --help        print the help text and exit 0
#   bash | sh [args...]            exec the shell directly (no pixi wrapper)
#   -e | --env <env> <cmd> [args]  run <cmd> inside the named pixi environment
#
# For the -e/--env mode, <cmd> is handled as follows:
#   python    -> pixi run -e <env> python [args...]
#   *.py      -> pixi run -e <env> python <resolved path> [args...]
#                (a name that is not an existing file is looked up in /app,
#                then /app/scripts; if neither exists it is passed unchanged)
#   otherwise -> pixi run -e <env> <cmd> [args...]
#
# Every error path writes its message to stderr and exits with status 1.
#
# Working variables are lower-case (pixi_env, script) on purpose: assigning to
# a generic upper-case name that happens to be inherited from the container
# environment would overwrite the value seen by the launched program.
# ---------------------------------------------------------------------------

# Print each argument on its own line to stderr, then exit with status 1.
fail() {
    printf '%s\n' "$@" >&2
    exit 1
}

# Help: no arguments at all, or an explicit -h/--help as the first argument.
if [[ $# -eq 0 || "$1" == "-h" || "$1" == "--help" ]]; then
    show_help
    exit 0
fi

# Interactive shell: hand over to bash/sh with all arguments untouched.
if [[ "$1" == "bash" || "$1" == "sh" ]]; then
    exec "$@"
fi

# Anything else must start with -e/--env followed by an environment name.
pixi_env=""
case "$1" in
    -e|--env)
        # A missing value, an empty value, or something that looks like
        # another option is rejected.
        if [[ -z "${2:-}" || "${2:-}" == -* ]]; then
            fail "Error: -e/--env requires an environment name (boltz, protenix, rf3)"
        fi
        pixi_env="$2"
        shift 2
        ;;
    *)
        fail "Error: First argument must be -e <environment>, bash, or --help" \
            "" \
            "Usage: docker run sampleworks -e <env> <script> [args...]" \
            "       docker run sampleworks bash" \
            "       docker run sampleworks --help"
        ;;
esac

# Only the known pixi environments are accepted.
case "$pixi_env" in
    boltz|protenix|rf3)
        ;;
    *)
        fail "Error: Invalid environment '$pixi_env'. Must be one of: boltz, protenix, rf3"
        ;;
esac

# The next argument is the script/command to run; the rest are its arguments.
if [[ $# -eq 0 ]]; then
    fail "Error: No script specified" \
        "Usage: docker run sampleworks -e <env> <script> [args...]"
fi

script="$1"
shift

# "python" on its own: run the environment's interpreter with the remaining args.
if [[ "$script" == "python" ]]; then
    exec pixi run -e "$pixi_env" python "$@"
fi

# Python scripts: resolve bare names (like run_grid_search.py) against /app
# and /app/scripts, then run them with python.
if [[ "$script" == *.py ]]; then
    if [[ ! -f "$script" ]]; then
        if [[ -f "/app/$script" ]]; then
            script="/app/$script"
        elif [[ -f "/app/scripts/$script" ]]; then
            script="/app/scripts/$script"
        fi
    fi
    exec pixi run -e "$pixi_env" python "$script" "$@"
fi

# Otherwise, run the command directly via pixi.
exec pixi run -e "$pixi_env" "$script" "$@"