-
Notifications
You must be signed in to change notification settings - Fork 4
feat: RSCC eval updates #224
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| @AGENTS.md |
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ dependencies = [ | |
| "python-dotenv", | ||
| "jaxtyping", | ||
| "jax", | ||
| "joblib", | ||
| "einx<0.4", | ||
| "hydra-core", | ||
| "loguru", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,7 +47,7 @@ | |
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
| from biotite.structure import AtomArray | ||
| from biotite.structure import AtomArray, AtomArrayStack | ||
| from loguru import logger | ||
| from sampleworks.eval.grid_search_eval_utils import resolve_cif_path | ||
| from sampleworks.eval.structure_utils import ( | ||
|
|
@@ -59,10 +59,9 @@ | |
| from sampleworks.utils.atom_array_utils import ( | ||
| BACKBONE_ATOM_TYPES, | ||
| BLANK_ALTLOC_IDS, | ||
| build_pairwise_altloc_arrays, | ||
| detect_altlocs, | ||
| filter_to_common_atoms, | ||
| load_structure_with_altlocs, | ||
| select_altloc, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -111,47 +110,9 @@ def _chain_from_selection(selection: str) -> str | None: | |
| return chain_id | ||
|
|
||
|
|
||
| def _build_pairwise_altloc_arrays( | ||
| atom_array, altloc_ids: list[str] | ||
| ) -> dict[tuple[str, str], tuple[AtomArray, AtomArray]]: | ||
| """Return ``{(id_i, id_j): (array_i, array_j)}`` pre-filtered to common atoms. | ||
|
|
||
| For each unordered altloc pair we build the two per-altloc AtomArrays | ||
| (via ``select_altloc(return_full_array=True)``, which includes blank-altloc | ||
| atoms as shared context) and then run ``filter_to_common_atoms`` so the two | ||
| inputs have identical atom order and count. | ||
|
|
||
| We build per-pair rather than using ``map_altlocs_to_stack`` so residues whose | ||
| altloc set is a subset of those in the whole structure (e.g. 2YL0 res 60–64 | ||
| carry only altlocs A and B, not C) still get scored for the pairs where they | ||
| exist. A stack level ``filter_to_common_atoms`` would drop them entirely. | ||
|
|
||
| TODO: this helper hits the broader issue in how we | ||
| handle structures with >2 altlocs. | ||
| Fixing that upstream would let us replace this helper | ||
| with a direct ``map_altlocs_to_stack`` call and remove a source of | ||
| duplication. | ||
| """ | ||
| pairs: dict[tuple[str, str], tuple[AtomArray, AtomArray]] = {} | ||
| for i in range(len(altloc_ids)): | ||
| for j in range(i + 1, len(altloc_ids)): | ||
| a_i = select_altloc(atom_array, altloc_ids[i], return_full_array=True) | ||
| a_j = select_altloc(atom_array, altloc_ids[j], return_full_array=True) | ||
| try: | ||
| f_i, f_j = filter_to_common_atoms(a_i, a_j) | ||
| except RuntimeError as e: | ||
| logger.warning( | ||
| f"could not match atoms between altlocs " | ||
| f"{altloc_ids[i]} and {altloc_ids[j]}: {e}" | ||
| ) | ||
| continue | ||
| pairs[(altloc_ids[i], altloc_ids[j])] = (f_i, f_j) | ||
| return pairs | ||
|
|
||
|
|
||
| def _mean_residue_lddt_for_pair( | ||
| gt_array: AtomArray, | ||
| pred_array: AtomArray, | ||
| gt_array: AtomArrayStack | AtomArray | None, | ||
| pred_array: AtomArrayStack | AtomArray | None, | ||
| chain: str, | ||
| residues: list[int], | ||
| ) -> float: | ||
|
|
@@ -187,7 +148,7 @@ def _mean_residue_lddt_for_pair( | |
|
|
||
| def _classify_selection( | ||
| atom_array: AtomArray, | ||
| pair_arrays: dict[tuple[str, str], tuple[AtomArray, AtomArray]], | ||
| pair_arrays: dict[tuple[str, str], tuple[AtomArrayStack, AtomArrayStack]], | ||
| altloc_ids: list[str], | ||
| selection_str: str, | ||
| protein: str, | ||
|
|
@@ -345,7 +306,7 @@ def _process_structure( | |
| ) | ||
| return [] | ||
|
|
||
| pair_arrays = _build_pairwise_altloc_arrays(atom_array, altloc_info.altloc_ids) | ||
| pair_arrays = build_pairwise_altloc_arrays(atom_array, altloc_info.altloc_ids) | ||
|
|
||
| structure_altloc_mask = ~np.isin(atom_array.altloc_id, list(BLANK_ALTLOC_IDS)) | ||
| structure_backbone_mask = np.isin(atom_array.atom_name, BACKBONE_ATOM_TYPES) | ||
|
|
@@ -356,6 +317,8 @@ def _process_structure( | |
| # find_altloc_selections.py appends a combined all altloc selection | ||
| # (atomworks-style with " or " clauses) at the end of each row. That one is | ||
| # a union over every span we already processed individually, so skip it. | ||
| # NOTE: This will need to be addressed when we | ||
| # migrate to atomworks-style selections for everything | ||
| if " or " in selection_str: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We probably want to allow selections that have "or" in them, presumably there will be some that have discontinuous but physically contacting residues that we care about? |
||
| continue | ||
| out = _classify_selection( | ||
|
|
@@ -371,8 +334,8 @@ def _process_structure( | |
| ) | ||
| if out is None: | ||
| continue | ||
| row, covered = out | ||
| rows.append(row) | ||
| classified_row, covered = out | ||
| rows.append(classified_row) | ||
| classified_res_ids.update(covered) | ||
|
|
||
| # residues across all classified spans should equal total unique | ||
|
|
@@ -415,7 +378,7 @@ def main(args: argparse.Namespace) -> None: | |
| ) | ||
| ) | ||
|
|
||
| out_df = pd.DataFrame(all_rows, columns=OUTPUT_COLUMNS) | ||
| out_df = pd.DataFrame(all_rows, columns=pd.Index(OUTPUT_COLUMNS)) | ||
| args.output_file.parent.mkdir(parents=True, exist_ok=True) | ||
| out_df.to_csv(args.output_file, index=False) | ||
| logger.info(f"Wrote {len(out_df)} classified spans to {args.output_file}") | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,269 @@ | ||||||||||||||||||
| """Find the maximum RMSD subsegment within each altloc selection. | ||||||||||||||||||
|
|
||||||||||||||||||
| This script consumes the output of ``scripts/eval/find_altloc_selections.py`` | ||||||||||||||||||
| and, for each contiguous altloc span longer than ``--window-size`` residues, | ||||||||||||||||||
| identifies the contiguous subsegment of that size with the highest | ||||||||||||||||||
| RMSD between any pair of alternate conformations. | ||||||||||||||||||
|
|
||||||||||||||||||
| Only residues with identical residue names across altlocs are considered. | ||||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should fix this. Maybe make an issue. |
||||||||||||||||||
| Windows containing compositional heterogeneity (different residue names, e.g. CYS vs CSO) | ||||||||||||||||||
| are skipped. | ||||||||||||||||||
|
|
||||||||||||||||||
| The primary output CSV preserves the setup expected by | ||||||||||||||||||
| ``rscc_grid_search_script.py`` (one row per protein, semicolon joined | ||||||||||||||||||
| selections). An optional diagnostic CSV provides per selection detail. | ||||||||||||||||||
|
|
||||||||||||||||||
| Usage: find_altloc_selections.py -> this script -> rscc_grid_search_script.py | ||||||||||||||||||
| """ | ||||||||||||||||||
|
|
||||||||||||||||||
| import argparse | ||||||||||||||||||
| from pathlib import Path | ||||||||||||||||||
|
|
||||||||||||||||||
| import numpy as np | ||||||||||||||||||
| import pandas as pd | ||||||||||||||||||
| from biotite.structure import AtomArrayStack, get_residues, rmsd as biotite_rmsd | ||||||||||||||||||
| from loguru import logger | ||||||||||||||||||
| from sampleworks.eval.grid_search_eval_utils import resolve_cif_path | ||||||||||||||||||
| from sampleworks.eval.structure_utils import ( | ||||||||||||||||||
| ATOMWORKS_COMPARISON_OPS, | ||||||||||||||||||
| parse_selection_string, | ||||||||||||||||||
| ) | ||||||||||||||||||
| from sampleworks.utils.atom_array_utils import ( | ||||||||||||||||||
| build_pairwise_altloc_arrays, | ||||||||||||||||||
| detect_altlocs, | ||||||||||||||||||
| load_structure_with_altlocs, | ||||||||||||||||||
| ) | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
||||||||||||||||||
| def _has_compositional_heterogeneity(arr_i, arr_j, mask: np.ndarray) -> bool: | ||||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A couple things:
|
||||||||||||||||||
| """Check if residues under *mask* have different names between two altloc arrays.""" | ||||||||||||||||||
| _, names_i = get_residues(arr_i[mask]) | ||||||||||||||||||
| _, names_j = get_residues(arr_j[mask]) | ||||||||||||||||||
| return not np.array_equal(names_i, names_j) | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
||||||||||||||||||
| def _find_max_rmsd_window( | ||||||||||||||||||
| pair_arrays: dict[tuple[str, str], tuple[AtomArrayStack, AtomArrayStack]], | ||||||||||||||||||
| chain: str, | ||||||||||||||||||
| residues: list[int], | ||||||||||||||||||
| window_size: int = 3, | ||||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this the value you usually use? 3 seems very short to me. Can we go for 5 or more? |
||||||||||||||||||
| ) -> tuple[list[int], float, str] | None: | ||||||||||||||||||
| """Slide a window over residues and find the subsegment with maximum all atom RMSD. | ||||||||||||||||||
|
|
||||||||||||||||||
| Parameters | ||||||||||||||||||
| ---------- | ||||||||||||||||||
| pair_arrays | ||||||||||||||||||
| Output of ``build_pairwise_altloc_arrays``. | ||||||||||||||||||
| chain | ||||||||||||||||||
| Chain ID to filter on. | ||||||||||||||||||
| residues | ||||||||||||||||||
| Sorted list of actual residue IDs in the span. | ||||||||||||||||||
| window_size | ||||||||||||||||||
| Number of consecutive residues per window. | ||||||||||||||||||
|
|
||||||||||||||||||
| Returns | ||||||||||||||||||
| ------- | ||||||||||||||||||
| tuple or None | ||||||||||||||||||
| ``(best_window_residues, best_rmsd, best_pair_str)`` or ``None`` | ||||||||||||||||||
| if no valid RMSD could be computed for any window. | ||||||||||||||||||
| """ | ||||||||||||||||||
| max_rmsd = -np.inf | ||||||||||||||||||
| max_window: list[int] | None = None | ||||||||||||||||||
| max_pair = "" | ||||||||||||||||||
|
|
||||||||||||||||||
| for w in range(len(residues) - window_size + 1): | ||||||||||||||||||
| window_res = residues[w : w + window_size] | ||||||||||||||||||
|
|
||||||||||||||||||
| for (alt_i, alt_j), (stack_i, stack_j) in pair_arrays.items(): | ||||||||||||||||||
| arr_i = stack_i[0] | ||||||||||||||||||
| arr_j = stack_j[0] | ||||||||||||||||||
|
|
||||||||||||||||||
| mask = (arr_i.chain_id == chain) & np.isin(arr_i.res_id, window_res) | ||||||||||||||||||
| if mask.sum() == 0: | ||||||||||||||||||
| continue | ||||||||||||||||||
|
|
||||||||||||||||||
| if _has_compositional_heterogeneity(arr_i, arr_j, mask): | ||||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this to prevent an error for biotite_rmsd? If so, again I'd think about whether it might make sense to check for modified but otherwise the same residues (e.g. sulfonates). You could take the rmsd on only the common atoms. |
||||||||||||||||||
| continue | ||||||||||||||||||
|
|
||||||||||||||||||
| rmsd_val = float(biotite_rmsd(arr_i[mask], arr_j[mask])) | ||||||||||||||||||
|
Comment on lines
+81
to
+88
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Require complete residue coverage before scoring a window. Line 82 only checks for any atoms, so a window with missing residues in either altloc can still be scored and later emitted as the full 🐛 Proposed fix mask = (arr_i.chain_id == chain) & np.isin(arr_i.res_id, window_res)
if mask.sum() == 0:
continue
+ res_ids_i, _ = get_residues(arr_i[mask])
+ res_ids_j, _ = get_residues(arr_j[mask])
+ if list(map(int, res_ids_i)) != window_res or list(map(int, res_ids_j)) != window_res:
+ continue
+
if _has_compositional_heterogeneity(arr_i, arr_j, mask):
continue
rmsd_val = float(biotite_rmsd(arr_i[mask], arr_j[mask]))🤖 Prompt for AI Agents |
||||||||||||||||||
| if np.isfinite(rmsd_val) and rmsd_val > max_rmsd: | ||||||||||||||||||
| max_rmsd = rmsd_val | ||||||||||||||||||
| max_window = window_res | ||||||||||||||||||
| max_pair = f"{alt_i}-{alt_j}" | ||||||||||||||||||
|
|
||||||||||||||||||
| if max_window is None: | ||||||||||||||||||
| return None | ||||||||||||||||||
| return max_window, float(max_rmsd), max_pair | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
||||||||||||||||||
| def _process_structure( | ||||||||||||||||||
| row: pd.Series, | ||||||||||||||||||
| cif_root: Path | None, | ||||||||||||||||||
| window_size: int = 3, | ||||||||||||||||||
| ) -> list[dict]: | ||||||||||||||||||
| """Load a structure and narrow all its selections to max RMSD subsegments.""" | ||||||||||||||||||
| protein = str(row["protein"]) | ||||||||||||||||||
| cif_path = resolve_cif_path(row, cif_root) | ||||||||||||||||||
| if not cif_path.exists(): | ||||||||||||||||||
| logger.error(f"[{protein}] CIF file not found: {cif_path}") | ||||||||||||||||||
| return [] | ||||||||||||||||||
|
|
||||||||||||||||||
| selection_field = row.get("selection", "") | ||||||||||||||||||
| if not isinstance(selection_field, str) or not selection_field.strip(): | ||||||||||||||||||
| logger.warning(f"[{protein}] no selections in CSV row") | ||||||||||||||||||
| return [] | ||||||||||||||||||
|
|
||||||||||||||||||
| logger.info(f"[{protein}] loading {cif_path}") | ||||||||||||||||||
| atom_array = load_structure_with_altlocs(cif_path) | ||||||||||||||||||
| altloc_info = detect_altlocs(atom_array) | ||||||||||||||||||
| if len(altloc_info.altloc_ids) < 2: | ||||||||||||||||||
| logger.warning( | ||||||||||||||||||
| f"[{protein}] structure has <2 altloc IDs ({altloc_info.altloc_ids}); skipping" | ||||||||||||||||||
| ) | ||||||||||||||||||
| return [] | ||||||||||||||||||
|
|
||||||||||||||||||
| pair_arrays = build_pairwise_altloc_arrays(atom_array, altloc_info.altloc_ids) | ||||||||||||||||||
| if not pair_arrays: | ||||||||||||||||||
| logger.warning(f"[{protein}] no valid altloc pairs could be built; skipping") | ||||||||||||||||||
| return [] | ||||||||||||||||||
|
|
||||||||||||||||||
| output_rows: list[dict] = [] | ||||||||||||||||||
| for sel_str in [s.strip() for s in selection_field.split(";") if s.strip()]: | ||||||||||||||||||
| # Skip atomworks-style catchall selections # NOTE: This will need to be addressed when we | ||||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's just fix this now and make it work for atomworks style selections. |
||||||||||||||||||
| # migrate to atomworks-style selections for everything, similar to classify_altloc_regions | ||||||||||||||||||
| if any(op in sel_str for op in ATOMWORKS_COMPARISON_OPS): | ||||||||||||||||||
| continue | ||||||||||||||||||
|
|
||||||||||||||||||
| chain, start, end = parse_selection_string(sel_str) | ||||||||||||||||||
| if chain is None or start is None or end is None: | ||||||||||||||||||
| logger.warning(f"[{protein}] cannot parse selection: {sel_str}") | ||||||||||||||||||
| continue | ||||||||||||||||||
|
|
||||||||||||||||||
| # Get resids present in the structure for this range | ||||||||||||||||||
| range_mask = ( | ||||||||||||||||||
| (atom_array.chain_id == chain) | ||||||||||||||||||
| & (atom_array.res_id >= start) | ||||||||||||||||||
| & (atom_array.res_id <= end) | ||||||||||||||||||
| ) | ||||||||||||||||||
| actual_res_ids = sorted(set(int(r) for r in atom_array.res_id[range_mask])) | ||||||||||||||||||
|
|
||||||||||||||||||
| if not actual_res_ids: | ||||||||||||||||||
| logger.warning(f"[{protein}] selection matched no residues: {sel_str}") | ||||||||||||||||||
| continue | ||||||||||||||||||
|
|
||||||||||||||||||
| out: dict = { | ||||||||||||||||||
| "protein": protein, | ||||||||||||||||||
| "structure_pattern": row.get("structure_pattern", ""), | ||||||||||||||||||
| "map_pattern": row.get("map_pattern", ""), | ||||||||||||||||||
| "base_map_dir": row.get("base_map_dir", ""), | ||||||||||||||||||
| "resolution": row.get("resolution", ""), | ||||||||||||||||||
| "original_selection": sel_str, | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| if len(actual_res_ids) <= window_size: | ||||||||||||||||||
| out["selection"] = sel_str | ||||||||||||||||||
| out["max_rmsd"] = float("nan") | ||||||||||||||||||
| out["altloc_pair"] = "" | ||||||||||||||||||
| else: | ||||||||||||||||||
| result = _find_max_rmsd_window(pair_arrays, chain, actual_res_ids, window_size) | ||||||||||||||||||
| if result is None: | ||||||||||||||||||
| logger.warning(f"[{protein}] no valid RMSD window for {sel_str}; keeping original") | ||||||||||||||||||
| out["selection"] = sel_str | ||||||||||||||||||
| out["max_rmsd"] = float("nan") | ||||||||||||||||||
| out["altloc_pair"] = "" | ||||||||||||||||||
| else: | ||||||||||||||||||
| max_res, max_rmsd, max_pair = result | ||||||||||||||||||
| out["selection"] = f"chain {chain} and resi {max_res[0]}-{max_res[-1]}" | ||||||||||||||||||
| out["max_rmsd"] = max_rmsd | ||||||||||||||||||
| out["altloc_pair"] = max_pair | ||||||||||||||||||
|
|
||||||||||||||||||
| output_rows.append(out) | ||||||||||||||||||
|
|
||||||||||||||||||
| return output_rows | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
||||||||||||||||||
| def main(args: argparse.Namespace) -> None: | ||||||||||||||||||
| input_df = pd.read_csv(args.input_csv) | ||||||||||||||||||
| required = {"protein", "selection"} | ||||||||||||||||||
| missing = required - set(input_df.columns) | ||||||||||||||||||
| if missing: | ||||||||||||||||||
| raise ValueError(f"Input CSV missing required columns: {missing}") | ||||||||||||||||||
|
|
||||||||||||||||||
| all_rows: list[dict] = [] | ||||||||||||||||||
| for _, row in input_df.iterrows(): | ||||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's a separate module for parallelizing things over rows of a dataframe, I forget what it is called, but it can speed things up a lot if _process_structure is a slow step. |
||||||||||||||||||
| all_rows.extend( | ||||||||||||||||||
| _process_structure(row=row, cif_root=args.cif_root, window_size=args.window_size) | ||||||||||||||||||
| ) | ||||||||||||||||||
|
|
||||||||||||||||||
| detail_df = pd.DataFrame(all_rows) | ||||||||||||||||||
|
|
||||||||||||||||||
| # Write diagnostic csv | ||||||||||||||||||
| if args.diagnostic_file: | ||||||||||||||||||
| args.diagnostic_file.parent.mkdir(parents=True, exist_ok=True) | ||||||||||||||||||
| detail_df.to_csv(args.diagnostic_file, index=False) | ||||||||||||||||||
| logger.info(f"Wrote {len(detail_df)} rows to {args.diagnostic_file}") | ||||||||||||||||||
|
|
||||||||||||||||||
| if detail_df.empty: | ||||||||||||||||||
| final_df = pd.DataFrame( | ||||||||||||||||||
| columns=pd.Index( | ||||||||||||||||||
| [ | ||||||||||||||||||
| "protein", | ||||||||||||||||||
| "selection", | ||||||||||||||||||
| "structure_pattern", | ||||||||||||||||||
| "map_pattern", | ||||||||||||||||||
| "base_map_dir", | ||||||||||||||||||
| "resolution", | ||||||||||||||||||
| ] | ||||||||||||||||||
| ) | ||||||||||||||||||
| ) | ||||||||||||||||||
| else: | ||||||||||||||||||
| final_rows = [] | ||||||||||||||||||
| for protein, group in detail_df.groupby("protein", sort=False): | ||||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should use a DataFrame.groupby.agg pattern here. No reason to use a for loop. |
||||||||||||||||||
| final_rows.append( | ||||||||||||||||||
| { | ||||||||||||||||||
| "protein": protein, | ||||||||||||||||||
| "selection": ";".join(group["selection"]), | ||||||||||||||||||
| "structure_pattern": group["structure_pattern"].iloc[0], | ||||||||||||||||||
| "map_pattern": group["map_pattern"].iloc[0], | ||||||||||||||||||
| "base_map_dir": group["base_map_dir"].iloc[0], | ||||||||||||||||||
| "resolution": group["resolution"].iloc[0], | ||||||||||||||||||
| } | ||||||||||||||||||
| ) | ||||||||||||||||||
| final_df = pd.DataFrame(final_rows) | ||||||||||||||||||
|
|
||||||||||||||||||
| args.output_file.parent.mkdir(parents=True, exist_ok=True) | ||||||||||||||||||
| final_df.to_csv(args.output_file, index=False) | ||||||||||||||||||
| logger.info(f"Wrote {len(final_df)} proteins to {args.output_file}") | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
||||||||||||||||||
| if __name__ == "__main__": | ||||||||||||||||||
| parser = argparse.ArgumentParser( | ||||||||||||||||||
| description=( | ||||||||||||||||||
| "For each altloc selection spanning more than --window-size residues, " | ||||||||||||||||||
| "find the contiguous subsegment with maximum pairwise all atom RMSD " | ||||||||||||||||||
| "between altloc conformations. Narrows selections for downstream RSCC evaluation." | ||||||||||||||||||
| ) | ||||||||||||||||||
| ) | ||||||||||||||||||
| parser.add_argument( | ||||||||||||||||||
| "--input-csv", | ||||||||||||||||||
| type=Path, | ||||||||||||||||||
| required=True, | ||||||||||||||||||
| help="Output CSV from find_altloc_selections.py (columns: protein, selection, " | ||||||||||||||||||
| "structure_pattern, map_pattern, base_map_dir, resolution).", | ||||||||||||||||||
| ) | ||||||||||||||||||
| parser.add_argument( | ||||||||||||||||||
| "--cif-root", | ||||||||||||||||||
| type=Path, | ||||||||||||||||||
| default=None, | ||||||||||||||||||
| help="Root directory to resolve structure_pattern entries against.", | ||||||||||||||||||
| ) | ||||||||||||||||||
| parser.add_argument("--output-file", type=Path, required=True) | ||||||||||||||||||
| parser.add_argument( | ||||||||||||||||||
| "--diagnostic-file", | ||||||||||||||||||
| type=Path, | ||||||||||||||||||
| default=None, | ||||||||||||||||||
| help="Optional per-selection diagnostic CSV with RMSD details.", | ||||||||||||||||||
| ) | ||||||||||||||||||
| parser.add_argument("--window-size", type=int, default=3) | ||||||||||||||||||
| args = parser.parse_args() | ||||||||||||||||||
| main(args) | ||||||||||||||||||
|
Comment on lines
+267
to
+269
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reject non-positive
🛡️ Proposed fix parser.add_argument("--window-size", type=int, default=3)
args = parser.parse_args()
+ if args.window_size <= 0:
+ parser.error("--window-size must be a positive integer")
main(args)📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What will need to be addressed?