diff-use · k-chrispens · Apr 16, 2026 · Apr 16, 2026 · Apr 20, 2026 · Apr 23, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1 @@
+@AGENTS.md
diff --git a/pixi.lock b/pixi.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
   "python-dotenv",
   "jaxtyping",
   "jax",
+  "joblib",
   "einx<0.4",
   "hydra-core",
   "loguru",

diff --git a/scripts/eval/classify_altloc_regions.py b/scripts/eval/classify_altloc_regions.py
@@ -47,7 +47,7 @@
 
 import numpy as np
 import pandas as pd
-from biotite.structure import AtomArray
+from biotite.structure import AtomArray, AtomArrayStack
 from loguru import logger
 from sampleworks.eval.grid_search_eval_utils import resolve_cif_path
 from sampleworks.eval.structure_utils import (
@@ -59,10 +59,9 @@
 from sampleworks.utils.atom_array_utils import (
     BACKBONE_ATOM_TYPES,
     BLANK_ALTLOC_IDS,
+    build_pairwise_altloc_arrays,
     detect_altlocs,
-    filter_to_common_atoms,
     load_structure_with_altlocs,
-    select_altloc,
 )
 
 
@@ -111,47 +110,9 @@ def _chain_from_selection(selection: str) -> str | None:
     return chain_id
 
 
-def _build_pairwise_altloc_arrays(
-    atom_array, altloc_ids: list[str]
-) -> dict[tuple[str, str], tuple[AtomArray, AtomArray]]:
-    """Return ``{(id_i, id_j): (array_i, array_j)}`` pre-filtered to common atoms.
-
-    For each unordered altloc pair we build the two per-altloc AtomArrays
-    (via ``select_altloc(return_full_array=True)``, which includes blank-altloc
-    atoms as shared context) and then run ``filter_to_common_atoms`` so the two
-    inputs have identical atom order and count.
-
-    We build per-pair rather than using ``map_altlocs_to_stack`` so residues whose
-    altloc set is a subset of those in the whole structure (e.g. 2YL0 res 60–64
-    carry only altlocs A and B, not C) still get scored for the pairs where they
-    exist. A stack level ``filter_to_common_atoms`` would drop them entirely.
-
-    TODO: this helper hits the broader issue in how we
-    handle structures with >2 altlocs.
-    Fixing that upstream would let us replace this helper
-    with a direct ``map_altlocs_to_stack`` call and remove a source of
-    duplication.
-    """
-    pairs: dict[tuple[str, str], tuple[AtomArray, AtomArray]] = {}
-    for i in range(len(altloc_ids)):
-        for j in range(i + 1, len(altloc_ids)):
-            a_i = select_altloc(atom_array, altloc_ids[i], return_full_array=True)
-            a_j = select_altloc(atom_array, altloc_ids[j], return_full_array=True)
-            try:
-                f_i, f_j = filter_to_common_atoms(a_i, a_j)
-            except RuntimeError as e:
-                logger.warning(
-                    f"could not match atoms between altlocs "
-                    f"{altloc_ids[i]} and {altloc_ids[j]}: {e}"
-                )
-                continue
-            pairs[(altloc_ids[i], altloc_ids[j])] = (f_i, f_j)
-    return pairs
-
-
 def _mean_residue_lddt_for_pair(
-    gt_array: AtomArray,
-    pred_array: AtomArray,
+    gt_array: AtomArrayStack | AtomArray | None,
+    pred_array: AtomArrayStack | AtomArray | None,
     chain: str,
     residues: list[int],
 ) -> float:
@@ -187,7 +148,7 @@ def _mean_residue_lddt_for_pair(
 
 def _classify_selection(
     atom_array: AtomArray,
-    pair_arrays: dict[tuple[str, str], tuple[AtomArray, AtomArray]],
+    pair_arrays: dict[tuple[str, str], tuple[AtomArrayStack, AtomArrayStack]],
     altloc_ids: list[str],
     selection_str: str,
     protein: str,
@@ -345,7 +306,7 @@ def _process_structure(
         )
         return []
 
-    pair_arrays = _build_pairwise_altloc_arrays(atom_array, altloc_info.altloc_ids)
+    pair_arrays = build_pairwise_altloc_arrays(atom_array, altloc_info.altloc_ids)
 
     structure_altloc_mask = ~np.isin(atom_array.altloc_id, list(BLANK_ALTLOC_IDS))
     structure_backbone_mask = np.isin(atom_array.atom_name, BACKBONE_ATOM_TYPES)
@@ -356,6 +317,8 @@ def _process_structure(
         # find_altloc_selections.py appends a combined all altloc selection
         # (atomworks-style with " or " clauses) at the end of each row. That one is
         # a union over every span we already processed individually, so skip it.
+        # NOTE: This will need to be addressed when we
+        # migrate to atomworks-style selections for everything
         if " or " in selection_str:
             continue
         out = _classify_selection(
@@ -371,8 +334,8 @@ def _process_structure(
         )
         if out is None:
             continue
-        row, covered = out
-        rows.append(row)
+        classified_row, covered = out
+        rows.append(classified_row)
         classified_res_ids.update(covered)
 
     # residues across all classified spans should equal total unique
@@ -415,7 +378,7 @@ def main(args: argparse.Namespace) -> None:
             )
         )
 
-    out_df = pd.DataFrame(all_rows, columns=OUTPUT_COLUMNS)
+    out_df = pd.DataFrame(all_rows, columns=pd.Index(OUTPUT_COLUMNS))
     args.output_file.parent.mkdir(parents=True, exist_ok=True)
     out_df.to_csv(args.output_file, index=False)
     logger.info(f"Wrote {len(out_df)} classified spans to {args.output_file}")

diff --git a/scripts/eval/find_max_rmsd_subsegment.py b/scripts/eval/find_max_rmsd_subsegment.py
@@ -0,0 +1,269 @@
+"""Find the maximum RMSD subsegment within each altloc selection.
+
+This script consumes the output of ``scripts/eval/find_altloc_selections.py``
+and, for each contiguous altloc span longer than ``--window-size`` residues,
+identifies the contiguous subsegment of that size with the highest
+RMSD between any pair of alternate conformations.
+
+Only residues with identical residue names across altlocs are considered.
+Windows containing compositional heterogeneity (different residue names, e.g. CYS vs CSO)
+are skipped.
+
+The primary output CSV preserves the layout expected by
+``rscc_grid_search_script.py`` (one row per protein, semicolon-joined
+selections).  An optional diagnostic CSV provides per-selection detail.
+
+Usage: find_altloc_selections.py -> this script -> rscc_grid_search_script.py
+"""
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from biotite.structure import AtomArrayStack, get_residues, rmsd as biotite_rmsd
+from loguru import logger
+from sampleworks.eval.grid_search_eval_utils import resolve_cif_path
+from sampleworks.eval.structure_utils import (
+    ATOMWORKS_COMPARISON_OPS,
+    parse_selection_string,
+)
+from sampleworks.utils.atom_array_utils import (
+    build_pairwise_altloc_arrays,
+    detect_altlocs,
+    load_structure_with_altlocs,
+)
+
+
+def _has_compositional_heterogeneity(arr_i, arr_j, mask: np.ndarray) -> bool:
+    """Return True if residues under *mask* are named differently in the two altloc arrays.
+
+    The same *mask* is applied to both arrays. Only the residue names reported
+    by ``get_residues`` are compared; the residue IDs it also returns are ignored.
+    """
+    names_i, names_j = (get_residues(arr[mask])[1] for arr in (arr_i, arr_j))
+    return not np.array_equal(names_i, names_j)
+
+
+def _find_max_rmsd_window(
+    pair_arrays: dict[tuple[str, str], tuple[AtomArrayStack, AtomArrayStack]],
+    chain: str,
+    residues: list[int],
+    window_size: int = 3,
+) -> tuple[list[int], float, str] | None:
+    """Slide a window over residues and find the subsegment with maximum all atom RMSD.
+
+    Each window of ``window_size`` consecutive entries of ``residues`` is scored
+    against every altloc pair, using only the first model of each stack. Pairs
+    with no atoms in the window, or whose residue names differ inside the
+    window, are skipped for that window. On ties the first (window, pair)
+    combination encountered is kept.
+
+    Parameters
+    ----------
+    pair_arrays
+        Output of ``build_pairwise_altloc_arrays``.
+    chain
+        Chain ID to filter on.
+    residues
+        Sorted list of actual residue IDs in the span. Windows are slices of
+        this list, so a gap in the IDs makes a window span non-adjacent residues.
+    window_size
+        Number of consecutive residues per window. Must be positive.
+
+    Returns
+    -------
+    tuple or None
+        ``(best_window_residues, best_rmsd, best_pair_str)`` or ``None``
+        if no valid RMSD could be computed for any window.
+
+    Raises
+    ------
+    ValueError
+        If ``window_size`` is not positive.
+    """
+    # A zero window selects nothing and a negative one turns the slice below
+    # into a wrapped-around range, so reject both instead of scoring garbage.
+    if window_size <= 0:
+        raise ValueError(f"window_size must be a positive integer, got {window_size}")
+
+    # The first model of each stack and its restriction to ``chain`` do not
+    # depend on the window, so build them once per pair rather than per window.
+    chain_pairs = []
+    for (alt_i, alt_j), (stack_i, stack_j) in pair_arrays.items():
+        arr_i = stack_i[0]
+        arr_j = stack_j[0]
+        # The mask is derived from arr_i and reused on arr_j, which requires both
+        # arrays to share atom order and count.
+        # NOTE(review): presumably guaranteed by build_pairwise_altloc_arrays -- confirm.
+        chain_mask = arr_i.chain_id == chain
+        if not chain_mask.any():
+            # No atom of this chain: the pair can never contribute a window.
+            continue
+        chain_pairs.append((f"{alt_i}-{alt_j}", arr_i[chain_mask], arr_j[chain_mask]))
+
+    max_rmsd = -np.inf
+    max_window: list[int] | None = None
+    max_pair = ""
+
+    for w in range(len(residues) - window_size + 1):
+        window_res = residues[w : w + window_size]
+
+        for pair_label, arr_i, arr_j in chain_pairs:
+            mask = np.isin(arr_i.res_id, window_res)
+            if not mask.any():
+                continue
+
+            if _has_compositional_heterogeneity(arr_i, arr_j, mask):
+                continue
+
+            rmsd_val = float(biotite_rmsd(arr_i[mask], arr_j[mask]))
+            # Strict ">" keeps the earliest (window, pair) on ties; non-finite
+            # values never become the maximum.
+            if np.isfinite(rmsd_val) and rmsd_val > max_rmsd:
+                max_rmsd = rmsd_val
+                max_window = window_res
+                max_pair = pair_label
+
+    if max_window is None:
+        return None
+    return max_window, float(max_rmsd), max_pair
+
+
+def _process_structure(
+    row: pd.Series,
+    cif_root: Path | None,
+    window_size: int = 3,
+) -> list[dict]:
+    """Narrow every selection of one input row to its maximum RMSD subsegment.
+
+    Parameters
+    ----------
+    row
+        One row of the input CSV. ``protein`` is required; ``selection`` and the
+        metadata columns are read with ``row.get`` and default to ``""``.
+    cif_root
+        Optional root directory passed to ``resolve_cif_path``.
+    window_size
+        Number of residues in the subsegment each selection is narrowed to.
+        Selections covering at most this many residues are kept unchanged.
+
+    Returns
+    -------
+    list[dict]
+        One record per processed selection. Empty when the CIF file is missing,
+        the row carries no selections, fewer than two altloc IDs are detected,
+        or no altloc pair could be built.
+    """
+    protein = str(row["protein"])
+    cif_path = resolve_cif_path(row, cif_root)
+    if not cif_path.exists():
+        logger.error(f"[{protein}] CIF file not found: {cif_path}")
+        return []
+
+    raw_selections = row.get("selection", "")
+    if not (isinstance(raw_selections, str) and raw_selections.strip()):
+        logger.warning(f"[{protein}] no selections in CSV row")
+        return []
+
+    logger.info(f"[{protein}] loading {cif_path}")
+    atom_array = load_structure_with_altlocs(cif_path)
+    altloc_info = detect_altlocs(atom_array)
+    if len(altloc_info.altloc_ids) < 2:
+        logger.warning(
+            f"[{protein}] structure has <2 altloc IDs ({altloc_info.altloc_ids}); skipping"
+        )
+        return []
+
+    pair_arrays = build_pairwise_altloc_arrays(atom_array, altloc_info.altloc_ids)
+    if not pair_arrays:
+        logger.warning(f"[{protein}] no valid altloc pairs could be built; skipping")
+        return []
+
+    # These fields are the same for every selection of this row.
+    shared_fields = {
+        "protein": protein,
+        "structure_pattern": row.get("structure_pattern", ""),
+        "map_pattern": row.get("map_pattern", ""),
+        "base_map_dir": row.get("base_map_dir", ""),
+        "resolution": row.get("resolution", ""),
+    }
+
+    records: list[dict] = []
+    for piece in raw_selections.split(";"):
+        sel_str = piece.strip()
+        if not sel_str:
+            continue
+
+        # Selections containing an atomworks comparison operator are skipped.
+        # NOTE: this will need revisiting once everything migrates to
+        # atomworks-style selections, as in classify_altloc_regions.
+        if any(op in sel_str for op in ATOMWORKS_COMPARISON_OPS):
+            continue
+
+        chain, start, end = parse_selection_string(sel_str)
+        if chain is None or start is None or end is None:
+            logger.warning(f"[{protein}] cannot parse selection: {sel_str}")
+            continue
+
+        # Residue IDs actually present in the structure within the requested range.
+        in_range = (
+            (atom_array.chain_id == chain)
+            & (atom_array.res_id >= start)
+            & (atom_array.res_id <= end)
+        )
+        actual_res_ids = sorted({int(r) for r in atom_array.res_id[in_range]})
+        if not actual_res_ids:
+            logger.warning(f"[{protein}] selection matched no residues: {sel_str}")
+            continue
+
+        # Start from the "keep the original selection" record; it is only
+        # overwritten when a window search succeeds.
+        record: dict = {
+            **shared_fields,
+            "original_selection": sel_str,
+            "selection": sel_str,
+            "max_rmsd": float("nan"),
+            "altloc_pair": "",
+        }
+
+        if len(actual_res_ids) > window_size:
+            result = _find_max_rmsd_window(pair_arrays, chain, actual_res_ids, window_size)
+            if result is None:
+                logger.warning(f"[{protein}] no valid RMSD window for {sel_str}; keeping original")
+            else:
+                max_res, best_rmsd, best_pair = result
+                record["selection"] = f"chain {chain} and resi {max_res[0]}-{max_res[-1]}"
+                record["max_rmsd"] = best_rmsd
+                record["altloc_pair"] = best_pair
+
+        records.append(record)
+
+    return records
+
+
+def main(args: argparse.Namespace) -> None:
+    """Narrow the selections of every input row and write the result CSVs.
+
+    Reads ``args.input_csv``, runs ``_process_structure`` on each row, optionally
+    writes the per-selection records to ``args.diagnostic_file``, and writes one
+    row per protein (selections joined with ``;``) to ``args.output_file``.
+
+    Raises
+    ------
+    ValueError
+        If the input CSV lacks the ``protein`` or ``selection`` column.
+    """
+    input_df = pd.read_csv(args.input_csv)
+    missing = {"protein", "selection"} - set(input_df.columns)
+    if missing:
+        raise ValueError(f"Input CSV missing required columns: {missing}")
+
+    all_rows: list[dict] = []
+    for _, row in input_df.iterrows():
+        all_rows += _process_structure(
+            row=row, cif_root=args.cif_root, window_size=args.window_size
+        )
+
+    detail_df = pd.DataFrame(all_rows)
+
+    # The diagnostic CSV holds the per-selection records as they were produced.
+    if args.diagnostic_file:
+        args.diagnostic_file.parent.mkdir(parents=True, exist_ok=True)
+        detail_df.to_csv(args.diagnostic_file, index=False)
+        logger.info(f"Wrote {len(detail_df)} rows to {args.diagnostic_file}")
+
+    # Metadata columns copied from the first record of each protein.
+    passthrough = ["structure_pattern", "map_pattern", "base_map_dir", "resolution"]
+
+    if detail_df.empty:
+        # Nothing was processed: still emit a CSV carrying the expected header.
+        final_df = pd.DataFrame(columns=pd.Index(["protein", "selection", *passthrough]))
+    else:
+        final_df = pd.DataFrame(
+            [
+                {
+                    "protein": protein,
+                    "selection": ";".join(group["selection"]),
+                    **{col: group[col].iloc[0] for col in passthrough},
+                }
+                for protein, group in detail_df.groupby("protein", sort=False)
+            ]
+        )
+
+    args.output_file.parent.mkdir(parents=True, exist_ok=True)
+    final_df.to_csv(args.output_file, index=False)
+    logger.info(f"Wrote {len(final_df)} proteins to {args.output_file}")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: build the parser, validate arguments, run main().
+    parser = argparse.ArgumentParser(
+        description=(
+            "For each altloc selection spanning more than --window-size residues, "
+            "find the contiguous subsegment with maximum pairwise all atom RMSD "
+            "between altloc conformations. Narrows selections for downstream RSCC evaluation."
+        )
+    )
+    parser.add_argument(
+        "--input-csv",
+        type=Path,
+        required=True,
+        help="Output CSV from find_altloc_selections.py (columns: protein, selection, "
+        "structure_pattern, map_pattern, base_map_dir, resolution).",
+    )
+    parser.add_argument(
+        "--cif-root",
+        type=Path,
+        default=None,
+        help="Root directory to resolve structure_pattern entries against.",
+    )
+    parser.add_argument("--output-file", type=Path, required=True)
+    parser.add_argument(
+        "--diagnostic-file",
+        type=Path,
+        default=None,
+        help="Optional per-selection diagnostic CSV with RMSD details.",
+    )
+    parser.add_argument(
+        "--window-size",
+        type=int,
+        default=3,
+        help="Number of consecutive residues per narrowed subsegment (must be positive).",
+    )
+    args = parser.parse_args()
+    # A non-positive window size is meaningless for the sliding-window search,
+    # so fail with a usage error before any input is read.
+    if args.window_size <= 0:
+        parser.error("--window-size must be a positive integer")
+    main(args)
-    parser.add_argument("--window-size", type=int, default=3)
-    args = parser.parse_args()
-    main(args)
+    parser.add_argument("--window-size", type=int, default=3)
+    args = parser.parse_args()
+    if args.window_size <= 0:
+        parser.error("--window-size must be a positive integer")
+    main(args)
-    parser.add_argument("--window-size", type=int, default=3)
-    args = parser.parse_args()
-    main(args)
+    parser.add_argument("--window-size", type=int, default=3)
+    args = parser.parse_args()
+    if args.window_size <= 0:
+        parser.error("--window-size must be a positive integer")
+    main(args)