Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
@AGENTS.md
1,405 changes: 738 additions & 667 deletions pixi.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies = [
"python-dotenv",
"jaxtyping",
"jax",
"joblib",
"einx<0.4",
"hydra-core",
"loguru",
Expand Down
59 changes: 11 additions & 48 deletions scripts/eval/classify_altloc_regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@

import numpy as np
import pandas as pd
from biotite.structure import AtomArray
from biotite.structure import AtomArray, AtomArrayStack
from loguru import logger
from sampleworks.eval.grid_search_eval_utils import resolve_cif_path
from sampleworks.eval.structure_utils import (
Expand All @@ -59,10 +59,9 @@
from sampleworks.utils.atom_array_utils import (
BACKBONE_ATOM_TYPES,
BLANK_ALTLOC_IDS,
build_pairwise_altloc_arrays,
detect_altlocs,
filter_to_common_atoms,
load_structure_with_altlocs,
select_altloc,
)


Expand Down Expand Up @@ -111,47 +110,9 @@ def _chain_from_selection(selection: str) -> str | None:
return chain_id


def _build_pairwise_altloc_arrays(
atom_array, altloc_ids: list[str]
) -> dict[tuple[str, str], tuple[AtomArray, AtomArray]]:
"""Return ``{(id_i, id_j): (array_i, array_j)}`` pre-filtered to common atoms.

For each unordered altloc pair we build the two per-altloc AtomArrays
(via ``select_altloc(return_full_array=True)``, which includes blank-altloc
atoms as shared context) and then run ``filter_to_common_atoms`` so the two
inputs have identical atom order and count.

We build per-pair rather than using ``map_altlocs_to_stack`` so residues whose
altloc set is a subset of those in the whole structure (e.g. 2YL0 res 60–64
carry only altlocs A and B, not C) still get scored for the pairs where they
exist. A stack level ``filter_to_common_atoms`` would drop them entirely.

TODO: this helper hits the broader issue in how we
handle structures with >2 altlocs.
Fixing that upstream would let us replace this helper
with a direct ``map_altlocs_to_stack`` call and remove a source of
duplication.
"""
pairs: dict[tuple[str, str], tuple[AtomArray, AtomArray]] = {}
for i in range(len(altloc_ids)):
for j in range(i + 1, len(altloc_ids)):
a_i = select_altloc(atom_array, altloc_ids[i], return_full_array=True)
a_j = select_altloc(atom_array, altloc_ids[j], return_full_array=True)
try:
f_i, f_j = filter_to_common_atoms(a_i, a_j)
except RuntimeError as e:
logger.warning(
f"could not match atoms between altlocs "
f"{altloc_ids[i]} and {altloc_ids[j]}: {e}"
)
continue
pairs[(altloc_ids[i], altloc_ids[j])] = (f_i, f_j)
return pairs


def _mean_residue_lddt_for_pair(
gt_array: AtomArray,
pred_array: AtomArray,
gt_array: AtomArrayStack | AtomArray | None,
pred_array: AtomArrayStack | AtomArray | None,
chain: str,
residues: list[int],
) -> float:
Expand Down Expand Up @@ -187,7 +148,7 @@ def _mean_residue_lddt_for_pair(

def _classify_selection(
atom_array: AtomArray,
pair_arrays: dict[tuple[str, str], tuple[AtomArray, AtomArray]],
pair_arrays: dict[tuple[str, str], tuple[AtomArrayStack, AtomArrayStack]],
altloc_ids: list[str],
selection_str: str,
protein: str,
Expand Down Expand Up @@ -345,7 +306,7 @@ def _process_structure(
)
return []

pair_arrays = _build_pairwise_altloc_arrays(atom_array, altloc_info.altloc_ids)
pair_arrays = build_pairwise_altloc_arrays(atom_array, altloc_info.altloc_ids)

structure_altloc_mask = ~np.isin(atom_array.altloc_id, list(BLANK_ALTLOC_IDS))
structure_backbone_mask = np.isin(atom_array.atom_name, BACKBONE_ATOM_TYPES)
Expand All @@ -356,6 +317,8 @@ def _process_structure(
# find_altloc_selections.py appends a combined all altloc selection
# (atomworks-style with " or " clauses) at the end of each row. That one is
# a union over every span we already processed individually, so skip it.
# NOTE: This will need to be addressed when we
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What will need to be addressed?

# migrate to atomworks-style selections for everything
if " or " in selection_str:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably want to allow selections that have "or" in them, presumably there will be some that have discontinuous but physically contacting residues that we care about?

continue
out = _classify_selection(
Expand All @@ -371,8 +334,8 @@ def _process_structure(
)
if out is None:
continue
row, covered = out
rows.append(row)
classified_row, covered = out
rows.append(classified_row)
classified_res_ids.update(covered)

# residues across all classified spans should equal total unique
Expand Down Expand Up @@ -415,7 +378,7 @@ def main(args: argparse.Namespace) -> None:
)
)

out_df = pd.DataFrame(all_rows, columns=OUTPUT_COLUMNS)
out_df = pd.DataFrame(all_rows, columns=pd.Index(OUTPUT_COLUMNS))
args.output_file.parent.mkdir(parents=True, exist_ok=True)
out_df.to_csv(args.output_file, index=False)
logger.info(f"Wrote {len(out_df)} classified spans to {args.output_file}")
Expand Down
269 changes: 269 additions & 0 deletions scripts/eval/find_max_rmsd_subsegment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,269 @@
"""Find the maximum RMSD subsegment within each altloc selection.

This script consumes the output of ``scripts/eval/find_altloc_selections.py``
and, for each contiguous altloc span longer than ``--window-size`` residues,
identifies the contiguous subsegment of that size with the highest
RMSD between any pair of alternate conformations.

Only residues with identical residue names across altlocs are considered.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should fix this. Maybe make an issue.

Windows containing compositional heterogeneity (different residue names, e.g. CYS vs CSO)
are skipped.

The primary output CSV preserves the setup expected by
``rscc_grid_search_script.py`` (one row per protein, semicolon joined
selections). An optional diagnostic CSV provides per selection detail.

Usage: find_altloc_selections.py -> this script -> rscc_grid_search_script.py
"""

import argparse
from pathlib import Path

import numpy as np
import pandas as pd
from biotite.structure import AtomArrayStack, get_residues, rmsd as biotite_rmsd
from loguru import logger
from sampleworks.eval.grid_search_eval_utils import resolve_cif_path
from sampleworks.eval.structure_utils import (
ATOMWORKS_COMPARISON_OPS,
parse_selection_string,
)
from sampleworks.utils.atom_array_utils import (
build_pairwise_altloc_arrays,
detect_altlocs,
load_structure_with_altlocs,
)


def _has_compositional_heterogeneity(arr_i, arr_j, mask: np.ndarray) -> bool:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A couple things:

  1. Didn't one of us create a method for this somewhere else?

  2. This will return True if there are simply different sequences that are the same length, which I wouldn't think of as compositional heterogeneity. Do you want to check for modified residues like CYS->CSO?

"""Check if residues under *mask* have different names between two altloc arrays."""
_, names_i = get_residues(arr_i[mask])
_, names_j = get_residues(arr_j[mask])
return not np.array_equal(names_i, names_j)


def _find_max_rmsd_window(
pair_arrays: dict[tuple[str, str], tuple[AtomArrayStack, AtomArrayStack]],
chain: str,
residues: list[int],
window_size: int = 3,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this the value you usually use? 3 seems very short to me. Can we go for 5 or more?

) -> tuple[list[int], float, str] | None:
"""Slide a window over residues and find the subsegment with maximum all atom RMSD.

Parameters
----------
pair_arrays
Output of ``build_pairwise_altloc_arrays``.
chain
Chain ID to filter on.
residues
Sorted list of actual residue IDs in the span.
window_size
Number of consecutive residues per window.

Returns
-------
tuple or None
``(best_window_residues, best_rmsd, best_pair_str)`` or ``None``
if no valid RMSD could be computed for any window.
"""
max_rmsd = -np.inf
max_window: list[int] | None = None
max_pair = ""

for w in range(len(residues) - window_size + 1):
window_res = residues[w : w + window_size]

for (alt_i, alt_j), (stack_i, stack_j) in pair_arrays.items():
arr_i = stack_i[0]
arr_j = stack_j[0]

mask = (arr_i.chain_id == chain) & np.isin(arr_i.res_id, window_res)
if mask.sum() == 0:
continue

if _has_compositional_heterogeneity(arr_i, arr_j, mask):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this to prevent an error for biotite_rmsd? If so, again I'd think about whether it might make sense to check for modified but otherwise the same residues (e.g. sulfonates). You could take the rmsd on only the common atoms.

continue

rmsd_val = float(biotite_rmsd(arr_i[mask], arr_j[mask]))
Comment on lines +81 to +88
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Require complete residue coverage before scoring a window.

Line 82 only checks for any atoms, so a window with missing residues in either altloc can still be scored and later emitted as the full best_res[0]-best_res[-1] range. Skip windows unless both masked arrays contain exactly window_res.

🐛 Proposed fix
             mask = (arr_i.chain_id == chain) & np.isin(arr_i.res_id, window_res)
             if mask.sum() == 0:
                 continue
 
+            res_ids_i, _ = get_residues(arr_i[mask])
+            res_ids_j, _ = get_residues(arr_j[mask])
+            if list(map(int, res_ids_i)) != window_res or list(map(int, res_ids_j)) != window_res:
+                continue
+
             if _has_compositional_heterogeneity(arr_i, arr_j, mask):
                 continue
 
             rmsd_val = float(biotite_rmsd(arr_i[mask], arr_j[mask]))
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/eval/find_max_rmsd_subsegment.py` around lines 81 - 88, The current
check only ensures arr_i has some atoms for the window, allowing partial residue
coverage to be scored; change the logic to require complete residue coverage for
both altloc arrays before computing RMSD: compute masks for both arr_i and arr_j
(e.g., mask_i = (arr_i.chain_id == chain) & np.isin(arr_i.res_id, window_res)
and mask_j = (arr_j.chain_id == chain) & np.isin(arr_j.res_id, window_res)),
then verify that the set/unique res_id values present in arr_i[mask_i] and
arr_j[mask_j] exactly match window_res (or that the count of unique res_ids
equals len(window_res)); if either side is incomplete, continue and do not call
_has_compositional_heterogeneity or biotite_rmsd on that window.

if np.isfinite(rmsd_val) and rmsd_val > max_rmsd:
max_rmsd = rmsd_val
max_window = window_res
max_pair = f"{alt_i}-{alt_j}"

if max_window is None:
return None
return max_window, float(max_rmsd), max_pair


def _process_structure(
row: pd.Series,
cif_root: Path | None,
window_size: int = 3,
) -> list[dict]:
"""Load a structure and narrow all its selections to max RMSD subsegments."""
protein = str(row["protein"])
cif_path = resolve_cif_path(row, cif_root)
if not cif_path.exists():
logger.error(f"[{protein}] CIF file not found: {cif_path}")
return []

selection_field = row.get("selection", "")
if not isinstance(selection_field, str) or not selection_field.strip():
logger.warning(f"[{protein}] no selections in CSV row")
return []

logger.info(f"[{protein}] loading {cif_path}")
atom_array = load_structure_with_altlocs(cif_path)
altloc_info = detect_altlocs(atom_array)
if len(altloc_info.altloc_ids) < 2:
logger.warning(
f"[{protein}] structure has <2 altloc IDs ({altloc_info.altloc_ids}); skipping"
)
return []

pair_arrays = build_pairwise_altloc_arrays(atom_array, altloc_info.altloc_ids)
if not pair_arrays:
logger.warning(f"[{protein}] no valid altloc pairs could be built; skipping")
return []

output_rows: list[dict] = []
for sel_str in [s.strip() for s in selection_field.split(";") if s.strip()]:
# Skip atomworks-style catchall selections # NOTE: This will need to be addressed when we
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's just fix this now and make it work for atomworks style selections.

# migrate to atomworks-style selections for everything, similar to classify_altloc_regions
if any(op in sel_str for op in ATOMWORKS_COMPARISON_OPS):
continue

chain, start, end = parse_selection_string(sel_str)
if chain is None or start is None or end is None:
logger.warning(f"[{protein}] cannot parse selection: {sel_str}")
continue

# Get resids present in the structure for this range
range_mask = (
(atom_array.chain_id == chain)
& (atom_array.res_id >= start)
& (atom_array.res_id <= end)
)
actual_res_ids = sorted(set(int(r) for r in atom_array.res_id[range_mask]))

if not actual_res_ids:
logger.warning(f"[{protein}] selection matched no residues: {sel_str}")
continue

out: dict = {
"protein": protein,
"structure_pattern": row.get("structure_pattern", ""),
"map_pattern": row.get("map_pattern", ""),
"base_map_dir": row.get("base_map_dir", ""),
"resolution": row.get("resolution", ""),
"original_selection": sel_str,
}

if len(actual_res_ids) <= window_size:
out["selection"] = sel_str
out["max_rmsd"] = float("nan")
out["altloc_pair"] = ""
else:
result = _find_max_rmsd_window(pair_arrays, chain, actual_res_ids, window_size)
if result is None:
logger.warning(f"[{protein}] no valid RMSD window for {sel_str}; keeping original")
out["selection"] = sel_str
out["max_rmsd"] = float("nan")
out["altloc_pair"] = ""
else:
max_res, max_rmsd, max_pair = result
out["selection"] = f"chain {chain} and resi {max_res[0]}-{max_res[-1]}"
out["max_rmsd"] = max_rmsd
out["altloc_pair"] = max_pair

output_rows.append(out)

return output_rows


def main(args: argparse.Namespace) -> None:
input_df = pd.read_csv(args.input_csv)
required = {"protein", "selection"}
missing = required - set(input_df.columns)
if missing:
raise ValueError(f"Input CSV missing required columns: {missing}")

all_rows: list[dict] = []
for _, row in input_df.iterrows():
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a separate module for parallelizing things over rows of a dataframe, I forget what it is called, but it can speed things up a lot if _process_structure is a slow step.

all_rows.extend(
_process_structure(row=row, cif_root=args.cif_root, window_size=args.window_size)
)

detail_df = pd.DataFrame(all_rows)

# Write diagnostic csv
if args.diagnostic_file:
args.diagnostic_file.parent.mkdir(parents=True, exist_ok=True)
detail_df.to_csv(args.diagnostic_file, index=False)
logger.info(f"Wrote {len(detail_df)} rows to {args.diagnostic_file}")

if detail_df.empty:
final_df = pd.DataFrame(
columns=pd.Index(
[
"protein",
"selection",
"structure_pattern",
"map_pattern",
"base_map_dir",
"resolution",
]
)
)
else:
final_rows = []
for protein, group in detail_df.groupby("protein", sort=False):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should use a DataFrame.groupby.agg pattern here. No reason to use a for loop.

final_rows.append(
{
"protein": protein,
"selection": ";".join(group["selection"]),
"structure_pattern": group["structure_pattern"].iloc[0],
"map_pattern": group["map_pattern"].iloc[0],
"base_map_dir": group["base_map_dir"].iloc[0],
"resolution": group["resolution"].iloc[0],
}
)
final_df = pd.DataFrame(final_rows)

args.output_file.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(args.output_file, index=False)
logger.info(f"Wrote {len(final_df)} proteins to {args.output_file}")


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=(
"For each altloc selection spanning more than --window-size residues, "
"find the contiguous subsegment with maximum pairwise all atom RMSD "
"between altloc conformations. Narrows selections for downstream RSCC evaluation."
)
)
parser.add_argument(
"--input-csv",
type=Path,
required=True,
help="Output CSV from find_altloc_selections.py (columns: protein, selection, "
"structure_pattern, map_pattern, base_map_dir, resolution).",
)
parser.add_argument(
"--cif-root",
type=Path,
default=None,
help="Root directory to resolve structure_pattern entries against.",
)
parser.add_argument("--output-file", type=Path, required=True)
parser.add_argument(
"--diagnostic-file",
type=Path,
default=None,
help="Optional per-selection diagnostic CSV with RMSD details.",
)
parser.add_argument("--window-size", type=int, default=3)
args = parser.parse_args()
main(args)
Comment on lines +267 to +269
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Reject non-positive --window-size values.

0 or negative values make the sliding-window calculation degenerate and can silently keep original selections instead of producing a valid max-RMSD subsegment.

🛡️ Proposed fix
     parser.add_argument("--window-size", type=int, default=3)
     args = parser.parse_args()
+    if args.window_size <= 0:
+        parser.error("--window-size must be a positive integer")
     main(args)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
parser.add_argument("--window-size", type=int, default=3)
args = parser.parse_args()
main(args)
parser.add_argument("--window-size", type=int, default=3)
args = parser.parse_args()
if args.window_size <= 0:
parser.error("--window-size must be a positive integer")
main(args)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/eval/find_max_rmsd_subsegment.py` around lines 265 - 267, The CLI
currently accepts non-positive --window-size values which break the
sliding-window logic; add a validation after parsing (before calling main(args))
that checks args.window_size is an integer > 0 and rejects otherwise (raise
SystemExit or parser.error with a clear message). Reference the parser variable
and the main(args) call in find_max_rmsd_subsegment.py so the guard runs
immediately after parser.parse_args() and prevents running the sliding-window
routine with 0 or negative sizes.

Loading
Loading