Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
66 commits
Select commit Hold shift + click to select a range
c77f9c2
change toml files
Jul 25, 2025
8418cf1
delete asap- mentions in docstrings
Jul 25, 2025
be98b46
Merge branch 'main' of github.com:choderalab/drugforge
Jul 30, 2025
88aa7f9
Update guides
Jul 30, 2025
7bbd638
Merge branch 'main' of github.com:choderalab/drugforge into update-do…
Jul 30, 2025
674cd19
update main readme
apayne97 Jul 30, 2025
2932183
Merge branch 'update-readme' into update-documentation
apayne97 Jul 30, 2025
3755ba1
update docs readme
apayne97 Jul 30, 2025
338e050
update a few of the tutorials
apayne97 Jul 30, 2025
573e89a
formatting
apayne97 Jul 30, 2025
f43cf73
update documentation on spectrum pt.1
Jul 31, 2025
71e3958
add all docs building requirements to yaml file
apayne97 Jul 31, 2025
e80b7c4
fix requirements name and docs
apayne97 Jul 31, 2025
8651706
add sphinx reqs back to requirements.yaml
apayne97 Jul 31, 2025
b6d1749
update docs landing page
apayne97 Jul 31, 2025
a70d65a
Merge remote-tracking branch 'origin/update-documentation' into updat…
apayne97 Jul 31, 2025
698e94d
add logo image
apayne97 Jul 31, 2025
78991d3
Add files for tutorial to aws
Jul 31, 2025
7b9044f
Merge branch 'update-documentation' of github.com:choderalab/drugforg…
Aug 1, 2025
d54d47c
update spectrum tutorial 2
Aug 1, 2025
ed60cd4
add alignment image for tutorial
Aug 1, 2025
7b26d87
changes for tutorial
Aug 5, 2025
9d1ef96
corrected mentions of bespoke fit
Aug 6, 2025
392807a
show blast score in html
Aug 25, 2025
9a5bd25
Add option to calculate bsite similarity
Aug 26, 2025
adba7b0
Merge branch 'main' of github.com:choderalab/drugforge into add-spect…
Aug 26, 2025
3b4e9b4
Merge branch 'main' into add-spectrum-improvements
mariacm12 Sep 2, 2025
562dc1d
Merge branch 'main' into add-spectrum-improvements
ijpulidos Sep 23, 2025
b61e2e3
Merge branch 'main' into add-spectrum-improvements
ijpulidos Oct 14, 2025
384f077
Merge branch 'main' into add-spectrum-improvements
mariacm12 Dec 10, 2025
860c1d7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 10, 2025
0feca65
Fix pre-commit error
Dec 10, 2025
396655f
get rid of mgltool dependency
Dec 11, 2025
4387ca3
Finish removing mgl dependency
Dec 11, 2025
fb15f77
Merge branch 'add-spectrum-improvements' of github.com:choderalab/dru…
Dec 12, 2025
fa96c8d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 12, 2025
a3f85bd
Merge branch 'main' into add-spectrum-improvements
apayne97 Feb 24, 2026
461f7cc
Merge branch 'main' into add-spectrum-improvements
apayne97 Feb 27, 2026
5eb1f25
Merge branch 'main' into add-spectrum-improvements
apayne97 Mar 3, 2026
8a49ec4
add openbabel req to spectrum
apayne97 Mar 3, 2026
94d09d3
Merge remote-tracking branch 'origin/add-spectrum-improvements' into …
apayne97 Mar 3, 2026
22c9fa3
add example test for convert_to_pdbqt but it doesn't work yet
apayne97 Mar 3, 2026
41f8131
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2026
5416fd3
Merge branch 'main' into add-spectrum-improvements
apayne97 Mar 3, 2026
ea44131
Merge branch 'main' into add-spectrum-improvements
apayne97 Mar 4, 2026
b9663db
add initial pdbqt conversion test
apayne97 Mar 4, 2026
5145160
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2026
e67fedc
test whether file generated by convert_to_pdbqt can be used by vina
apayne97 Mar 4, 2026
eb37101
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 4, 2026
f0938f2
add probably redundant and unnecessary spectrum schema
apayne97 Mar 4, 2026
ae48d45
add some more basic tests
apayne97 Mar 5, 2026
71d8d50
add some more CLI commands
apayne97 Mar 5, 2026
58829c2
add independent spectrum cli with alphafold support
apayne97 Mar 5, 2026
2abbb2f
fix logging file handling :(
apayne97 Mar 5, 2026
e9ecb34
change default alphafold json version to version 2
apayne97 Mar 5, 2026
530aa79
change default alphafold json version to version 2
apayne97 Mar 5, 2026
5e604f9
add structure alignment command for alphafold generated data
apayne97 Mar 5, 2026
667de72
update file handling for alphafold
apayne97 Mar 5, 2026
e1d37d0
add option to run boltz
apayne97 Mar 5, 2026
633df61
make directory before running logging
apayne97 Mar 5, 2026
7691688
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 5, 2026
02692bc
Merge branch 'main' into add-spectrum-workflow
apayne97 Mar 5, 2026
25d1091
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 5, 2026
41b1eaf
add more minimal ligand transfer docking implementation
apayne97 Mar 5, 2026
66c062e
Merge remote-tracking branch 'origin/add-spectrum-workflow' into add-…
apayne97 Mar 5, 2026
a18f00b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 5, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
331 changes: 331 additions & 0 deletions drugforge-spectrum/drugforge/spectrum/alphafold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,331 @@
"""
drugforge/spectrum/alphafold.py
================================
Pydantic models and helper functions for building AlphaFold 3 JSON input files
and for selecting an AF3 output model to align against a reference structure.

Public API
----------
Af3ProteinChain – AF3 JSON representation of a single protein chain
Af3Input – top-level AF3 JSON input (Pydantic model)
make_msa_inputs – FASTA → list of MSA-stage Af3Input objects
make_fold_inputs – MSA output dir → list of fold-stage Af3Input objects
select_best_af3 – pick the first-sorted AF3 model CIF and align it to a reference PDB
"""

import json
import logging
from pathlib import Path
from typing import Optional

from drugforge.spectrum.schema import ProteinSequence, SequenceList
from pydantic import BaseModel, Field

# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------


class Af3ProteinChain(BaseModel):
    """One protein chain as it appears under ``"protein"`` in an AF3 JSON input.

    The three MSA/template fields distinguish between ``None`` and an empty
    value; see the individual field descriptions for what each setting is
    meant to request from AlphaFold 3.
    """

    id: str = Field(default="A", description="Chain ID letter used in the AF3 JSON.")
    sequence: str = Field(..., description="One-letter amino acid sequence.")
    description: Optional[str] = Field(
        default=None, description="Human-readable label."
    )
    unpairedMsa: Optional[str] = Field(
        default=None,
        description=(
            "Pre-computed unpaired MSA in A3M format. "
            "Set to None to let AF3 build the MSA (data pipeline). "
            "Set to '' to run MSA-free."
        ),
    )
    pairedMsa: Optional[str] = Field(
        default=None,
        description=(
            "Pre-computed paired MSA in A3M format. "
            "Set to None alongside unpairedMsa=None so AF3 builds both. "
            "Set to '' when providing a custom unpairedMsa for a single chain."
        ),
    )
    templates: Optional[list] = Field(
        default=None,
        description=(
            "List of structural templates. None → AF3 searches for templates. "
            "[] → run template-free."
        ),
    )

    def to_af3_dict(self, version: int = 2) -> dict:
        """Return this chain as a plain dict for the AF3 ``"protein"`` entry.

        Parameters
        ----------
        version:
            AF3 JSON format version being targeted. The ``description`` key
            is only emitted when this is 4 or higher.

        Returns
        -------
        dict
            Keys in order: ``id``, ``sequence``, optionally ``description``,
            then ``unpairedMsa``, ``pairedMsa`` and ``templates``.
        """
        payload: dict = {"id": self.id, "sequence": self.sequence}

        # The label is dropped for format versions below 4.
        if self.description is not None and version >= 4:
            payload["description"] = self.description

        # These three keys are always present, even when their value is None,
        # so the written JSON states the intended MSA/template mode explicitly.
        payload.update(
            unpairedMsa=self.unpairedMsa,
            pairedMsa=self.pairedMsa,
            templates=self.templates,
        )
        return payload


class Af3Input(BaseModel):
    """Top-level AF3 JSON input for a single folding job.

    Holds the job name, the model seeds, the protein chains and the
    ``dialect`` / ``version`` header values, and knows how to serialise
    itself to the AF3 JSON layout and write that JSON to disk.
    """

    # NOTE(review): under pydantic v2 a field whose name starts with "model_"
    # may trigger a protected-namespace warning — confirm against the pinned
    # pydantic version before renaming or reconfiguring.
    name: str = Field(..., description="Job name; used to name output files.")
    model_seeds: list[int] = Field(
        default_factory=lambda: [1, 2, 5, 10],
        description="List of integer random seeds. At least one required.",
    )
    chains: list[Af3ProteinChain] = Field(
        ..., description="Protein chains to include in the folding job."
    )
    dialect: str = Field("alphafold3", description="Must be 'alphafold3'.")
    version: int = Field(2, description="AF3 JSON format version.")

    def to_af3_dict(self) -> dict:
        """Serialise to the full AF3 JSON structure.

        Returns
        -------
        dict
            Mapping with the keys ``name``, ``modelSeeds``, ``sequences``,
            ``dialect`` and ``version``. Each entry of ``sequences`` is
            ``{"protein": <chain dict>}``.
        """
        return {
            "name": self.name,
            "modelSeeds": self.model_seeds,
            # Forward this input's format version so version-gated chain
            # fields (e.g. ``description``) agree with the declared
            # ``version`` instead of always using the chain default.
            "sequences": [
                {"protein": chain.to_af3_dict(version=self.version)}
                for chain in self.chains
            ],
            "dialect": self.dialect,
            "version": self.version,
        }

    def write(self, output_dir: str | Path) -> Path:
        """Write this input to ``<output_dir>/<name>.json`` and return the path.

        Parameters
        ----------
        output_dir:
            Destination directory; created (including parents) if missing.

        Returns
        -------
        Path
            Path of the JSON file that was written.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        out_path = output_dir / f"{self.name}.json"
        # Explicit encoding keeps the output independent of the host locale.
        with open(out_path, "w", encoding="utf-8") as fh:
            json.dump(self.to_af3_dict(), fh, indent=2)
        return out_path


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _read_msa_output(msa_output_dir: Path, name: str) -> tuple[str, list, str]:
    """Extract ``(unpairedMsa, templates, sequence)`` from an AF3 MSA output.

    The data file is looked up at
    ``<msa_output_dir>/<name.lower()>/<name.lower()>_data.json``; *name* is
    lower-cased before building the path. Only the first entry of the
    ``sequences`` list in that JSON is read.

    Parameters
    ----------
    msa_output_dir:
        Root directory of AF3 MSA outputs.
    name:
        Sequence / job name (case-insensitive; lower-cased for the lookup).

    Returns
    -------
    tuple[str, list, str]
        ``(unpairedMsa, templates, sequence)``

    Raises
    ------
    FileNotFoundError
        If the expected ``*_data.json`` file does not exist.
    KeyError, IndexError
        If the JSON lacks ``sequences[0]["protein"]`` or one of the three
        expected keys inside it.
    """
    job_name = name.lower()
    # resolve() follows symlinks – Nextflow stages inputs as symlinks into
    # the process work directory.
    data_json = Path(msa_output_dir).resolve() / job_name / f"{job_name}_data.json"
    if not data_json.exists():
        raise FileNotFoundError(
            f"MSA output not found for '{name}': expected {data_json}"
        )
    # Read as UTF-8 explicitly rather than relying on the locale default.
    with open(data_json, encoding="utf-8") as fh:
        data = json.load(fh)
    protein = data["sequences"][0]["protein"]
    return protein["unpairedMsa"], protein["templates"], protein["sequence"]


# ---------------------------------------------------------------------------
# Core functions
# ---------------------------------------------------------------------------


def make_msa_inputs(
    fasta_path: str | Path,
    seeds: list[int] | None = None,
    description_prefix: str = "",
) -> list[Af3Input]:
    """Create one MSA-stage :class:`Af3Input` per sequence in a FASTA file.

    Every generated chain has ``unpairedMsa``, ``pairedMsa`` and ``templates``
    set to ``None``, which is the setting intended to make AlphaFold 3 run its
    own data pipeline. The resulting JSONs are meant to be run with
    ``--norun_inference``.

    Parameters
    ----------
    fasta_path:
        Path to a FASTA file containing protein sequences.
    seeds:
        Integer model seeds. Defaults to ``[1, 2, 5, 10]`` when ``None``.
    description_prefix:
        Optional text placed before the sequence ID in each chain
        description (joined with `` – ``). When empty, the description is
        just the sequence ID.

    Returns
    -------
    list[Af3Input]
        One input per FASTA record, in the order the records are iterated.
    """
    model_seeds = [1, 2, 5, 10] if seeds is None else seeds
    records = SequenceList.from_fasta(fasta_path, aligned=False)

    msa_jobs: list[Af3Input] = []
    for record in records:
        if description_prefix:
            label = f"{description_prefix} – {record.seq_id}"
        else:
            label = record.seq_id

        # All MSA/template fields are left as None for the MSA stage.
        chain = Af3ProteinChain(
            id="A",
            sequence=record.sequence,
            description=label,
            unpairedMsa=None,
            pairedMsa=None,
            templates=None,
        )
        msa_jobs.append(
            Af3Input(name=record.seq_id, model_seeds=model_seeds, chains=[chain])
        )
    return msa_jobs


def make_fold_inputs(
    msa_output_dir: str | Path,
    seeds: list[int] | None = None,
    fasta_path: str | Path | None = None,
) -> list[Af3Input]:
    """Create fold-stage :class:`Af3Input` objects from existing MSA outputs.

    For each job name, the MSA-stage data JSON is read via
    ``_read_msa_output`` and its ``unpairedMsa``, ``templates`` and
    ``sequence`` are embedded in a single-chain input with ``pairedMsa=""``.
    The resulting JSONs are meant to be run with ``--norun_data_pipeline``.

    Parameters
    ----------
    msa_output_dir:
        Directory containing one sub-directory per sequence, each holding the
        AF3 data-pipeline output JSON.
    seeds:
        Integer model seeds. Defaults to ``[1, 2, 5, 10]`` when ``None``.
    fasta_path:
        Optional FASTA whose sequence IDs determine which jobs are processed
        and in what order. When omitted, every sub-directory of
        *msa_output_dir* is used, sorted by name.

    Returns
    -------
    list[Af3Input]
        One input per job name.

    Raises
    ------
    ValueError
        If no job names were found.
    """
    model_seeds = seeds if seeds is not None else [1, 2, 5, 10]

    # resolve() follows symlinks – important when the directory is staged
    # by Nextflow as a symlink pointing to another process's work directory.
    msa_root = Path(msa_output_dir).resolve()

    if fasta_path is None:
        # is_dir() follows symlinks, so staged symlink dirs are included.
        job_names = sorted(
            entry.name for entry in msa_root.iterdir() if entry.is_dir()
        )
    else:
        records = SequenceList.from_fasta(fasta_path, aligned=False)
        job_names = [record.seq_id for record in records]

    if not job_names:
        raise ValueError(f"No sequence directories found in {msa_root}")

    def _fold_job(job_name: str) -> Af3Input:
        # Build the single-chain fold input for one MSA-stage result.
        msa_text, template_list, residues = _read_msa_output(msa_root, job_name)
        chain = Af3ProteinChain(
            id="A",
            sequence=residues,
            description=job_name,
            unpairedMsa=msa_text,
            pairedMsa="",  # single chain – no inter-chain pairing
            templates=template_list,
        )
        return Af3Input(name=job_name, model_seeds=model_seeds, chains=[chain])

    return [_fold_job(job_name) for job_name in job_names]


def select_best_af3(
    af3_output_dir: str | Path,
    seq_name: str,
    ref_pdb: str | Path,
    chain: str = "A",
    final_pdb: str | Path = "aligned_protein.pdb",
) -> tuple[float, str]:
    """Pick an AF3 model CIF for a sequence and align it to a reference PDB.

    Looks in ``<af3_output_dir>/<seq_name.lower()>`` for files matching
    ``<seq_name.lower()>*model*.cif``, takes the first one in sorted order
    (expected to be the top-ranked model), and passes it to
    ``rmsd_alignment`` together with *ref_pdb* and *final_pdb*.

    Parameters
    ----------
    af3_output_dir:
        Root directory of AF3 fold outputs (one sub-directory per sequence).
    seq_name:
        Name of the sequence / job; lower-cased to find the sub-directory
        and the model files.
    ref_pdb:
        Path to the reference PDB to align against.
    chain:
        Chain ID passed to the alignment for both structures, by default
        ``"A"``.
    final_pdb:
        Output path handed to the alignment for the aligned PDB.

    Returns
    -------
    tuple[float, str]
        ``(rmsd, path_to_aligned_pdb)``

    Raises
    ------
    FileNotFoundError
        If the sequence sub-directory does not exist, or if it contains no
        matching CIF model files.
    """
    # Defer import to avoid circular dependency with calculate_rmsd
    from drugforge.spectrum.calculate_rmsd import rmsd_alignment

    # AF3 lowercases job names, so both the directory and files use this form.
    job_name = seq_name.lower()
    job_dir = Path(af3_output_dir) / job_name
    if not job_dir.exists():
        raise FileNotFoundError(
            f"AF3 output directory for '{seq_name}' not found: {job_dir}"
        )

    cif_models = sorted(job_dir.glob(f"{job_name}*model*.cif"))
    if not cif_models:
        raise FileNotFoundError(
            f"No AF3 CIF model files found for '{seq_name}' in {job_dir}"
        )

    # First entry in name-sorted order is treated as the rank-0 model.
    top_model, *_ = cif_models
    logging.info(f"Selected AF3 model for '{seq_name}': {top_model.name}")

    rmsd, aligned_pdb = rmsd_alignment(
        str(top_model),
        str(ref_pdb),
        str(final_pdb),
        chain,
        chain,
    )
    logging.info(f"RMSD for '{seq_name}' vs reference: {rmsd:.3f} Å")

    return rmsd, str(aligned_pdb)
Loading
Loading