Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# CI workflow: lint (ruff), typecheck (ty), and CPU tests, each driven
# through pixi environments. Triggered on pushes/PRs touching source,
# tests, or dependency/config files, plus manual dispatch.
name: CI

on:
  push:
    branches: [main]
    paths:
      - 'src/**'
      - 'tests/**'
      - 'pyproject.toml'
      - 'pixi.lock'
      - '.github/workflows/ci.yml'
      - '.pre-commit-config.yaml'
  pull_request:
    branches: [main]
    paths:
      - 'src/**'
      - 'tests/**'
      - 'pyproject.toml'
      - 'pixi.lock'
      - '.github/workflows/ci.yml'
      - '.pre-commit-config.yaml'
  workflow_dispatch:

# Cancel superseded in-flight runs for the same ref to save CI minutes.
concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true

jobs:
  lint:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    permissions:
      contents: read

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install pixi
        # NOTE(review): pinned to the same commit SHA as gpu-tests.yml
        # (v0.8.8). A mutable "setup-pixi@v0.x" tag is not used because
        # this repo pins actions by SHA — confirm the SHA matches the
        # intended release.
        uses: prefix-dev/setup-pixi@19eac09b398e3d0c747adc7921926a6d802df4da # v0.8.8
        with:
          environments: boltz-dev

      - name: Ruff lint
        run: pixi run -e boltz-dev ruff check .

      - name: Ruff format check
        run: pixi run -e boltz-dev ruff format --check .

  typecheck:
    runs-on: ubuntu-latest
    timeout-minutes: 15
    permissions:
      contents: read
    strategy:
      # Run all matrix entries to completion so one failing environment
      # does not hide failures in the others.
      fail-fast: false
      matrix:
        environment: [boltz-dev, protenix-dev, rf3-dev]

    name: typecheck (${{ matrix.environment }})

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install pixi
        uses: prefix-dev/setup-pixi@19eac09b398e3d0c747adc7921926a6d802df4da # v0.8.8
        with:
          environments: ${{ matrix.environment }}

      - name: Run ty
        run: pixi run -e ${{ matrix.environment }} ty check

  cpu-tests:
    runs-on: ubuntu-latest
    timeout-minutes: 20
    permissions:
      contents: read
    strategy:
      fail-fast: false
      matrix:
        environment: [boltz-dev, protenix-dev, rf3-dev]

    name: tests (${{ matrix.environment }})

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install pixi
        uses: prefix-dev/setup-pixi@19eac09b398e3d0c747adc7921926a6d802df4da # v0.8.8
        with:
          environments: ${{ matrix.environment }}

      - name: Run CPU tests
        run: pixi run -e ${{ matrix.environment }} cpu-tests
2 changes: 2 additions & 0 deletions .github/workflows/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ jobs:

- name: Install pixi
uses: prefix-dev/setup-pixi@19eac09b398e3d0c747adc7921926a6d802df4da # v0.8.8
with:
cache: false # NFS-backed cache on self-hosted runner handles this

- name: Build CUDA extensions
run: pixi run -e ${{ matrix.environment }} python3 -c "from sampleworks.core.forward_models.xray.real_space_density_deps.ops.csrc import dilate_points_cuda"
Expand Down
4 changes: 2 additions & 2 deletions pixi.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -177,4 +177,19 @@ include = ["src/sampleworks/eval/bond_angle_and_length_outlier_eval_script.py"]
possibly-missing-attribute = "ignore"

[tool.ty.rules]
# Pre-existing type issues across the codebase; warn instead of error
# so ty runs in CI without blocking PRs while the team fixes them.
unresolved-import = "ignore"
unknown-argument = "warn"
unresolved-attribute = "warn"
invalid-argument-type = "warn"
invalid-assignment = "warn"
invalid-method-override = "warn"
invalid-parameter-default = "warn"
no-matching-overload = "warn"
not-iterable = "warn"
not-subscriptable = "warn"
too-many-positional-arguments = "warn"
unsupported-operator = "warn"
unused-ignore-comment = "warn"
unused-type-ignore-comment = "warn"
5 changes: 2 additions & 3 deletions scripts/eval/bond_geometry_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def bond_length_violations(pose: AtomArray, tolerance: float = 0.1) -> tuple[flo
"""
try:
bounds = check_pose_and_get_bounds(pose)
except (ValueError, BadStructureError) as e:
except (ValueError, BadStructureError):
return np.nan, pd.DataFrame()

bond_indices = np.sort(pose.bonds.as_array()[:, :2], axis=1)
Expand Down Expand Up @@ -97,13 +97,12 @@ def check_pose_and_get_bounds(pose: AtomArray):
"`biotite.structure.io.pdbx.get_structure(..., include_bonds=True)`"
)
raise ValueError("The structure does not have bonds.")

# this fetches values from RDKit, raises BadStructureError if the structure is bad
bounds = get_distance_bounds(pose)
return bounds



def bond_angle_violations(pose: AtomArray, tolerance: float = 0.1) -> tuple[float, pd.DataFrame]:
"""
Calculate the percentage of bonds that are outside acceptable ranges.
Expand Down
4 changes: 1 addition & 3 deletions scripts/eval/run_and_process_phenix_clashscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,7 @@ def main(args) -> None:
return

clashscore_df = pd.concat(clashscore_metrics, ignore_index=True)
clashscore_df.to_csv(
args.grid_search_results_path / "clashscore_metrics.csv", index=False
)
clashscore_df.to_csv(args.grid_search_results_path / "clashscore_metrics.csv", index=False)


def process_one_trial(trial: Trial) -> pd.DataFrame:
Expand Down
21 changes: 10 additions & 11 deletions scripts/eval/run_and_process_tortoize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import pandas as pd
from loguru import logger
from pandas import DataFrame

from sampleworks.eval.grid_search_eval_utils import parse_eval_args, setup_evaluation_parameters


Expand All @@ -27,9 +26,7 @@ def main(args: argparse.Namespace) -> None:
try:
subprocess.call("tortoize", stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
except FileNotFoundError:
raise RuntimeError(
"tortoize is not available, make sure you have installed it."
) from None
raise RuntimeError("tortoize is not available, make sure you have installed it.") from None
# The dropped variable is a list of ProteinConfigs, not used yet in this script
all_trials, _ = setup_evaluation_parameters(args)

Expand Down Expand Up @@ -122,13 +119,15 @@ def get_protein_level_z_scores(tortoize_json: dict[str, Any]) -> pd.DataFrame:
out: list[dict[str, Any]] = []
model_block = tortoize_json.get("model", {})
for model_id, model_data in model_block.items():
out.append({
"model": str(model_id),
"ramachandran_z_score": model_data.get("ramachandran-z", None),
"ramachandran_jackknife_sd": model_data.get("ramachandran-jackknife-sd", None),
"torsion_z_score": model_data.get("torsion-z", None),
"torsion_jackknife_sd": model_data.get("torsion-jackknife-sd", None)
})
out.append(
{
"model": str(model_id),
"ramachandran_z_score": model_data.get("ramachandran-z", None),
"ramachandran_jackknife_sd": model_data.get("ramachandran-jackknife-sd", None),
"torsion_z_score": model_data.get("torsion-z", None),
"torsion_jackknife_sd": model_data.get("torsion-jackknife-sd", None),
}
)
return pd.DataFrame(out)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,5 @@ def _ensure_toolchain_env() -> None:
CUDA_AVAILABLE = True
except Exception as e:
print(f"CUDA extension loading failed: {e}")
dilate_points_cuda = None
CUDA_AVAILABLE = False
14 changes: 6 additions & 8 deletions src/sampleworks/eval/grid_search_eval_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from loguru import logger
from sampleworks.eval.constants import OCCUPANCY_LEVELS
from sampleworks.eval.eval_dataclasses import Trial, TrialList, ProteinConfig
from sampleworks.eval.eval_dataclasses import ProteinConfig, Trial, TrialList
from sampleworks.eval.occupancy_utils import extract_protein_and_occupancy
from sampleworks.utils.guidance_constants import StructurePredictor

Expand Down Expand Up @@ -175,22 +175,22 @@ def parse_eval_args(description: str | None = None):
type=Path,
required=True,
help="Path to the top-level grid search results directory, usu. called "
"``grid_search_results``",
"``grid_search_results``",
)
# not technically used everywhere yet, but requiring it future-proofs.
parser.add_argument(
"--grid-search-inputs-path",
type=Path,
required=True,
help="Path to the directory containing the grid search inputs, in particular "
"the protein configuration CSV file, maps, and reference structures.",
"the protein configuration CSV file, maps, and reference structures.",
default=None,
)
parser.add_argument(
"--protein-configs-csv",
type=Path,
help="Path to the CSV file containing protein configurations, like "
"``${HOME}/configs.csv``. Defaults to sampleworks/data/protein_configs.csv",
"``${HOME}/configs.csv``. Defaults to sampleworks/data/protein_configs.csv",
default=files("sampleworks.data") / "protein_configs.csv",
)
parser.add_argument(
Expand All @@ -215,7 +215,7 @@ def parse_eval_args(description: str | None = None):


def setup_evaluation_parameters(
args: argparse.Namespace
args: argparse.Namespace,
) -> tuple[TrialList, dict[str, ProteinConfig]]:
grid_search_dir = Path(args.grid_search_results_path)

Expand All @@ -227,9 +227,7 @@ def setup_evaluation_parameters(
logger.info(f"Proteins configured: {list(protein_configs.keys())}")

# Scan for experiments (look for refined.cif files)
all_trials = scan_grid_search_results(
grid_search_dir, target_filename=args.target_filename
)
all_trials = scan_grid_search_results(grid_search_dir, target_filename=args.target_filename)
logger.info(f"Found {len(all_trials)} experiments with refined.cif files")

if all_trials:
Expand Down
10 changes: 5 additions & 5 deletions src/sampleworks/utils/msa.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def _validate_msa_cache_contents(msa_hash: str, msa_dir: Path) -> None:
raise FileNotFoundError(f"No A3M files found for hash {msa_hash} in {msa_dir}")

# Validate that we have matching pairs
csv_indices = {int(f.stem.split('_')[-1]) for f in csv_files}
a3m_indices = {int(f.stem.split('_')[-1]) for f in a3m_files}
csv_indices = {int(f.stem.split("_")[-1]) for f in csv_files}
a3m_indices = {int(f.stem.split("_")[-1]) for f in a3m_files}

if csv_indices != a3m_indices:
raise ValueError(
Expand All @@ -67,16 +67,16 @@ def _validate_msa_cache_contents(msa_hash: str, msa_dir: Path) -> None:
a3m_path = msa_dir / f"{msa_hash}_{idx}.a3m"

# Read CSV sequences (skip header, take second column)
with csv_path.open('r') as f:
with csv_path.open("r") as f:
csv_lines = f.readlines()

if not csv_lines or csv_lines[0].strip() != "key,sequence":
raise ValueError(f"Invalid CSV header in {csv_path}")

csv_sequences = [line.strip().split(',', 1)[1] for line in csv_lines[1:] if line.strip()]
csv_sequences = [line.strip().split(",", 1)[1] for line in csv_lines[1:] if line.strip()]

# Read A3M sequences (every other line, skipping headers)
with a3m_path.open('r') as f:
with a3m_path.open("r") as f:
a3m_lines = f.readlines()

# A3M format: header lines start with '>', sequences on alternating lines
Expand Down
20 changes: 15 additions & 5 deletions tests/eval/test_structure_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ def mock_protein_config(tmp_path: Path) -> ProteinConfig:
return ProteinConfig(
protein="test",
base_map_dir=tmp_path,
selection=["chain A and resi 1-10", ],
selection=[
"chain A and resi 1-10",
],
resolution=2.0,
map_pattern="{occ_str}.ccp4",
structure_pattern="{occ_str}.cif",
Expand Down Expand Up @@ -256,7 +258,9 @@ def test_converts_atomarray_to_stack(self, tmp_path, basic_atom_array_multichain
config = ProteinConfig(
protein="test",
base_map_dir=tmp_path,
selection=["chain A", ],
selection=[
"chain A",
],
resolution=2.0,
map_pattern="{occ_str}.ccp4",
structure_pattern="{occ_str}.cif",
Expand All @@ -272,7 +276,9 @@ def test_with_real_structure(self, resources_dir):
config = ProteinConfig(
protein="6b8x",
base_map_dir=resources_dir / "6b8x",
selection=["chain A", ],
selection=[
"chain A",
],
resolution=1.74,
map_pattern="{occ_str}.ccp4",
structure_pattern="6b8x_final.pdb",
Expand All @@ -299,7 +305,9 @@ def test_handles_exceptions_gracefully(self, tmp_path):
config = ProteinConfig(
protein="test",
base_map_dir=tmp_path,
selection=["chain Z and resi 999", ],
selection=[
"chain Z and resi 999",
],
resolution=2.0,
map_pattern="{occ_str}.ccp4",
structure_pattern="{occ_str}.cif",
Expand All @@ -314,7 +322,9 @@ def test_with_real_structure(self, resources_dir):
config = ProteinConfig(
protein="6b8x",
base_map_dir=resources_dir / "6b8x",
selection=[selection_string, ],
selection=[
selection_string,
],
resolution=1.74,
map_pattern="{occ_str}.ccp4",
structure_pattern="6b8x_final.pdb",
Expand Down
18 changes: 15 additions & 3 deletions tests/models/protenix/test_ccd_expansion.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@ class TestExpandTildeCCDCode:

def test_unique_match_expands(self):
"""~QS should expand uniquely to A1AQS."""
result = _expand_tilde_ccd_code("~QS")
fake_codes = ["A1AQS", "GLY", "ALA"]
_build_ccd_suffix_map.cache_clear()
with patch("protenix.data.ccd.get_all_ccd_code", return_value=fake_codes):
result = _expand_tilde_ccd_code("~QS")
_build_ccd_suffix_map.cache_clear()
assert result == "A1AQS"

def test_ambiguous_match_raises(self):
Expand All @@ -37,7 +41,11 @@ def test_ambiguous_match_raises(self):

def test_no_match_returns_original(self):
"""When no code matches the suffix, return the truncated code."""
result = _expand_tilde_ccd_code("~$$")
fake_codes = ["GLY", "ALA"]
_build_ccd_suffix_map.cache_clear()
with patch("protenix.data.ccd.get_all_ccd_code", return_value=fake_codes):
result = _expand_tilde_ccd_code("~$$")
_build_ccd_suffix_map.cache_clear()
assert result == "~$$"


Expand All @@ -46,7 +54,11 @@ class TestStructureToProtenixJsonCCDExpansion:

def test_9bn8_ligand_expanded(self, structure_9bn8):
"""9BN8 structure with ~QS ligand should produce CCD_A1AQS in JSON."""
json_dict = structure_to_protenix_json(structure_9bn8)
_build_ccd_suffix_map.cache_clear()
fake_codes = ["A1AQS", "GLY", "ALA"]
with patch("protenix.data.ccd.get_all_ccd_code", return_value=fake_codes):
json_dict = structure_to_protenix_json(structure_9bn8)
_build_ccd_suffix_map.cache_clear()

ligand_entries = [
entry["ligand"]["ligand"] for entry in json_dict["sequences"] if "ligand" in entry
Expand Down
Loading
Loading