plinder-org · OleinikovasV · Mar 27, 2025 · Apr 8, 2025 · Apr 8, 2025 · Apr 8, 2025
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -28,7 +28,7 @@ jobs:
         uses: mamba-org/setup-micromamba@v1
         with:
           environment-file: environment.yml
-          create-args: python=3.10
+          create-args: python=3.12
           init-shell: bash
           cache-downloads: true
           cache-environment: true

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -22,7 +22,7 @@ jobs:
       - name: Setup python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.10"
+          python-version: "3.12"
       - name: Configure docker
         run: echo ${{ secrets.GITHUB_TOKEN }} | docker login ghcr.io -u ${{ github.repository_owner }} --password-stdin
       - name: Install build and tag requirements

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -23,18 +23,18 @@ jobs:
       - name: Setup python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.10"
+          python-version: "3.12"
       - name: Install tox
         run: pip install tox
       - name: Run quality checks
-        run: tox -e py310-lint,py310-type
+        run: tox -e py312-lint,py312-type
       - name: Directory Cache
         uses: actions/cache@v4
         with:
           path: .tox
-          key: tox-${{ runner.os }}-3.10-${{ hashFiles('tox.ini') }}
+          key: tox-${{ runner.os }}-3.12-${{ hashFiles('tox.ini') }}
           restore-keys: |
-            tox-${{ runner.os }}-3.10-
+            tox-${{ runner.os }}-3.12-
 
   test:
     name: Build and test docker image
@@ -101,7 +101,7 @@ jobs:
         uses: mamba-org/setup-micromamba@v1
         with:
           environment-file: environment.yml
-          create-args: python=3.10
+          create-args: python=3.12
           init-shell: bash
           cache-downloads: true
           cache-environment: true

diff --git a/.gitignore b/.gitignore
@@ -149,6 +149,7 @@ venv/
 ENV/
 env.bak/
 venv.bak/
+.claude
 
 # Spyder project settings
 .spyderproject
@@ -186,9 +187,10 @@ cython_debug/
 */*/DS_Store
 
 src/plinder-data/plinder/data/artifacts
-#src/plinder-core
 *.bak*
 *.1.*
 tests/xx
 tests/test_data/plinder/mount/systems/*/
+tmp_foldseek
+tmp_mmseqs
 artifacts
diff --git a/LICENSE.txt b/LICENSE.txt
diff --git a/README.md b/README.md
@@ -35,9 +35,7 @@ The *PLINDER* project is a community effort, launched by the University of Basel
 SIB Swiss Institute of Bioinformatics, Proxima (formerly VantAI), NVIDIA, MIT CSAIL,
 and will be regularly updated.
 
-To accelerate community adoption, PLINDER will be used as the field’s new Protein-Ligand
-interaction dataset standard as part of an exciting competition at the upcoming 2024
-[Machine Learning in Structural Biology (MLSB)](https://mlsb.io#challenge) Workshop at NeurIPS, one of the field's premiere academic gatherings.
+PLINDER sets a new standard for Protein-Ligand interaction datasets. It was first introduced as part of the 2024 Machine Learning in Structural Biology (MLSB) [Workshop challenge](https://www.mlsb.io/index_2024.html#challenge) at NeurIPS, one of the field's premier academic gatherings.
 More details about the competition and other helpful practical tips can be found at our recent workshop repo:
 [Moving Beyond Memorization](https://github.com/plinder-org/moving_beyond_memorisation).
 
@@ -58,13 +56,31 @@ release and the `plinder.core` package makes it easy to interact
 with the dataset.
 
 #### 🐛🐛🐛 Known bugs:
-- Source dataset contains incorrect `entry_release_date` dates, please, use `query_index` to get correct dates patched.
-- Complexes containing nucleic acid receptors may [not be saved corectly](https://github.com/plinder-org/plinder/issues/61).
-- `ligand_binding_affinity` queries have been disabled due to a [bug found parsing BindingDB](https://github.com/plinder-org/plinder/issues/94)
+- ~~Source dataset contains incorrect `entry_release_date` dates, please, use `query_index` to get correct dates patched.~~
+- ~~Complexes containing nucleic acid receptors may [not be saved correctly](https://github.com/plinder-org/plinder/issues/61).~~
+- ~~`ligand_binding_affinity` queries have been disabled due to a [bug found parsing BindingDB](https://github.com/plinder-org/plinder/issues/94)~~
+All of the above are fixed in the unreleased WIP version; the fixes will take effect once the dataset is regenerated.
 
 #### Changelog:
 
-- 2024-06/v2 (Current):
+- WIP (Current — unreleased):
+    - **Major backend refactor**: replaced OST, gemmi, plip, openbabel with biotite + peppr for data generation; removed 6 dependencies from ingest pipeline
+    - **Nucleic acid support**: DNA/RNA chains now correctly included as receptor neighbors, mainchain/sidechain detection works for both protein and nucleic acids ([#61](https://github.com/plinder-org/plinder/issues/61))
+    - **Custom CIF support**: new `Entry.from_custom_cif_file` for structure-prediction outputs (Boltz, AlphaFold3, Chai-1) that ship CIFs without `_chem_comp_bond` ([#117](https://github.com/plinder-org/plinder/issues/117)). Bond orders come from `ligand_smiles_dict` via positional atom-order match (the convention these tools follow); element/count mismatches raise with the offending position, `force_substructure_match=True` opts into substructure matching when atom order isn't preserved. User SMILES win over CCD for both `smiles` and `resolved_stereo_matches_template` — closes a silent gap where biotite's `LIG` placeholder would pass any 3D conformer. Input CIFs are never mutated; optional `save_fixed_cif` persists the enriched copy.
+    - **Stricter CIF ingest**: H/D/T filtered consistently (`is_hydrogen_isotope`); multi-model CIFs warn and use model 1; multi-instance custom comp_ids must share heavy-atom naming (since `_chem_comp_bond` is comp_id-keyed); silent `connect_via_residue_names` and half-sanitized substructure fallbacks replaced with `ValueError` so corrupt inputs fail loudly.
+    - **Stereochemistry**: CCD ideal 3D coordinates used as stereo ground truth; new `resolved_stereo_matches_template` flag validates resolved structure chirality against CCD template (handles partial resolution via MCS trimming)
+    - **Interactions**: water bridge and metal bridge detection via peppr; halogen bond sidechain flag now computed (was hardcoded)
+    - **Binding affinity**: fixed BindingDB matching — target sequence now validated against PDB SEQRES with 100% core identity, terminal tags/truncations tolerated ([#94](https://github.com/plinder-org/plinder/issues/94)); updated to BindingDB 2026-04
+    - **Optional eval**: OpenStructure and posebusters moved to `pip install plinder[eval]`; base install is numpy 2 compatible; posebusters no longer runs during ingest
+    - **PlinderSystem API**: new `receptor_structure` (biotite AtomArray) and `ligand_mols` (RDKit Mol) properties; OST properties (`receptor_entity`, `ligand_views`) kept for eval but require `plinder[eval]`
+    - **Chain type support**: `Chain.from_cif_data` now assigns proper one-letter codes and chem_types for nucleotides (`RNA Linking`, `DNA Linking`); new `Residue.is_modified` property covers both protein PTMs and modified nucleotide bases
+    - **Save utils**: receptor/ligand chain naming generalized (`PDB_RECEPTOR_CHAINS`); system saving works for protein, NA, and mixed complexes
+    - **System definition**: unified `min_polymer_size=12` replaces separate `min_polymer_size`/`max_non_small_mol_ligand_length` — polymers with ≥ 12 residues are receptors, shorter ones are ligands (threshold matches the minimum MMseqs2/Foldseek search length); molecules with a BIRD annotation are ligands irrespective of size; ligand chains no longer appear in both receptor and ligand parts of system IDs.
+    - **System grouping**: pocket-based grouping (≥ 3 shared receptor residues on the same chain instance) for adjacent binding sites (e.g. orthosteric + allosteric, cofactor + substrate in same active site); artifacts attach only via 4 Å proximity
+    - **Dead code removal**: removed unused OST-based functions, PDB string roundtrips, duplicate SMILES derivation paths, v1 template matching (consolidated to Rascal MCES `get_matched_template`)
+    - **License**: changed from GPL-2.0 to Apache-2.0 (GPL was only required by PLIP, now removed)
+
+- 2024-06/v2:
     - New systems added based on the 2024-06 RCSB sync
     - Updated system definition to be more stable and depend only on ligand distance rather than PLIP
     - Added annotations for crystal contacts
@@ -124,6 +140,12 @@ For details on the sub-directories, see [Documentation](https://plinder-org.gith
 pip install plinder
 ```
 
+To enable evaluation scoring (lDDT and RMSD via OpenStructure), install the `eval` extra:
+
+```
+pip install "plinder[eval]"
+```
+
 ## License
 Data curated by PLINDER are made available under the Apache License 2.0.
 All data curated by BindingDB staff are provided under the Creative Commons Attribution 4.0 License. Data imported from ChEMBL are provided under their Creative Commons Attribution-Share Alike 4.0 Unported License.

diff --git a/dockerfiles/base/env.yml b/dockerfiles/base/env.yml
@@ -13,6 +13,5 @@ dependencies:
   - openstructure
   - mmseqs2
   - foldseek
-  - plip=2.3.0
   - pip:
     - keyrings.google-artifactregistry-auth==1.1.2
diff --git a/docs/conf.py b/docs/conf.py
@@ -5,7 +5,9 @@
 import plinder
 
 DOC_PATH = Path(__file__).parent
-COLUMN_REFERENCE_PATH = DOC_PATH.parent / "src" / "plinder" / "data" / "column_descriptions"
+COLUMN_REFERENCE_PATH = (
+    DOC_PATH.parent / "src" / "plinder" / "data" / "column_descriptions"
+)
 
 # Avoid verbose logs in rendered notebooks
 os.environ["PLINDER_LOG_LEVEL"] = "0"

diff --git a/docs/contribution/development.md b/docs/contribution/development.md
@@ -16,46 +16,50 @@ $ git clone https://github.com/plinder-org/plinder.git
 
 ### Creating the Conda environment
 
-The `plinder` subpackages beside `plinder.core` require dependencies that are not
-installable via `pip`.
-The most convenient way to install the aforementioned extra dependencies is a _Conda_
-environment.
+The data generation pipeline (`plinder.data`) requires a few tools that are only
+available via _Conda_ (mmseqs2, foldseek, reduce).
 If you have not _Conda_ installed yet, we recommend its installation via
 [miniforge](https://github.com/conda-forge/miniforge).
-Afterwards the environment can be created from the `environment.yml` in the local
-repository clone.
 
-:::{note}
-Currently only a Linux environment is fully supported, although the base
-environment also installs to MacOS.
-`plinder.data` uses a number of dependencies which are not simply pip-installable.
-Several dependencies which are referenced by a GitHub link directly, make
-a pip-installable package problematic.
-This includes Linux pytorch, which will not work in MacOS.
-These additional dependencies can be installed by running:
+```console
+$ mamba env create -f environment.yml
+$ mamba activate plinder
+```
+
+### Installing `plinder`
+
+The base install covers data generation and the core library (numpy 2 compatible):
 
 ```console
-$ pip install -r requirements_data.txt
+$ pip install -e ".[dev]"
 ```
 
-`plinder.eval` also relies on `openstructure` for metrics
-calculations. For Windows and MacOS users, please see the relevant
-[_Docker_](#docker-target) resources.
-:::
+### Evaluation scoring (optional)
+
+`plinder.eval` requires [OpenStructure](https://openstructure.org/) for
+lDDT/RMSD scoring metrics. OpenStructure currently requires numpy<2, so it
+is kept as an optional dependency:
 
 ```console
-$ mamba env create -f environment.yml
-$ mamba activate plinder
+$ pip install -e ".[eval]"
 ```
 
-### Installing `plinder`
+:::{note}
+The `eval` extra installs OpenStructure, posebusters and plotly.
+Data generation (`plinder.data`) does **not** require OpenStructure and
+works with numpy 2.
 
-Now `plinder` can be installed into the created environment:
+For the full data pipeline, additional dependencies are needed:
 
 ```console
-$ pip install -e ".[dev]"
+$ pip install -r requirements_data.txt
 ```
 
+This includes PyTorch (a CPU-only wheel on Linux, used by the loader) and pipeline-specific tools.
+For Windows and macOS users, please see the relevant
+[_Docker_](#docker-target) resources.
+:::
+
 ### Enabling Pre-commit hooks
 
 Please install pre-commit hooks, that will run the same code quality checks as the CI:

diff --git a/docs/tablegen.py b/docs/tablegen.py
@@ -56,6 +56,7 @@ def generate_table(description_dir: Path, output_html_path: Path) -> None:
 #         ~column_descriptions["Name"].str.contains("Kinase")
 #     ]
 
+    # TODO: update release/version after next dataset regeneration
     annotation_table = _get_annotation_table("2024-06", "v2", Path(CACHE_FILE))
 
     is_mandatory = np.zeros(column_descriptions.shape[0], dtype=bool)
@@ -66,9 +67,9 @@ def generate_table(description_dir: Path, output_html_path: Path) -> None:
         try:
             column = annotation_table[column_name]
         except KeyError:
-            logger.warning(
+            logger.debug(
                 f"Column '{column_name}' is in column descriptions, "
-                "but not found in annotation table."
+                "but not found in annotation table (expected for unreleased columns)."
             )
             continue
         is_value = _is_value(column, data_type)

diff --git a/environment.yml b/environment.yml
@@ -1,17 +1,22 @@
 #
 # Conda environment definition with dependencies
 #
+# For data generation only (no eval/scoring):
+#   conda env create -f environment.yml
+#   pip install -e .
+#
+# For eval/scoring (adds OpenStructure, requires numpy<2):
+#   pip install -e ".[eval]"
+#
 name: plinder
 channels:
   - conda-forge
   - defaults
   - bioconda
 dependencies:
-  - python=3.10.*
+  - python=3.12.*
   - reduce
-  - openstructure
   - mmseqs2
   - foldseek
-  - plip=2.3.0
   - pip:
     - keyrings.google-artifactregistry-auth==1.1.2
diff --git a/pyproject.toml b/pyproject.toml
@@ -2,26 +2,21 @@
 name = "plinder"
 dynamic = ["version"]
 dependencies = [
-    "biotite >= 1.0",
+    "biotite >= 1.2",
     "numpy",
     "pandas",
     "typing_extensions",
     "pydantic",
     "tqdm",
-    "plotly",
     "nbformat",
     "google-cloud-storage",
     "gcsfs",
-    "gemmi",
+    "peppr>=0.13",
     "rdkit>=2024.03.6",
     "pyarrow",
     "omegaconf",
-    "mmcif",
-    "eval_type_backport",
-    "posebusters",
     "duckdb",
     "cloudpathlib",
-    "mols2grid",
     "six",
 ]
 description = "PLINDER: The Protein-Ligand INteraction Dataset and Evaluation Resource"
@@ -67,6 +62,11 @@ dev = [
 loader = [
     "torch",
 ]
+eval = [
+    "openstructure",
+    "posebusters>=0.6.4",
+    "plotly",
+]
 plots = [
     "matplotlib",
     "seaborn",

diff --git a/requirements_data.txt b/requirements_data.txt
@@ -2,4 +2,5 @@
   tabulate
   pdb-validation @ git+https://git.scicore.unibas.ch/schwede/ligand-validation.git
   mmpdb @ git+https://github.com/rdkit/mmpdb.git
-  https://download.pytorch.org/whl/cpu/torch-2.5.1%2Bcpu-cp312-cp312-linux_x86_64.whl#sha256=4856f9d6925121d13c2df07aa7580b767f449dfe71ae5acde9c27535d5da4840
+  torch @ https://download.pytorch.org/whl/cpu/torch-2.5.1%2Bcpu-cp312-cp312-linux_x86_64.whl#sha256=4856f9d6925121d13c2df07aa7580b767f449dfe71ae5acde9c27535d5da4840 ; sys_platform == "linux"
+  torch >= 2.5 ; sys_platform == "darwin"
diff --git a/src/plinder/__init__.py b/src/plinder/__init__.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, Plinder Development Team
 # Distributed under the terms of the Apache License 2.0
 """plinder"""
+
 from pathlib import Path
 
 from ._version import _get_version

diff --git a/src/plinder/core/__init__.py b/src/plinder/core/__init__.py
@@ -13,6 +13,7 @@
 You can disable the MD5 checksum comparison between local files and remote files
 by setting the environment variable `PLINDER_OFFLINE=true`.
 """
+
 from plinder.core.index.system import PlinderSystem
 from plinder.core.index.utils import get_manifest, get_plindex
 from plinder.core.split.utils import get_split