Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Cif reader #4681

Open
wants to merge 12 commits into
base: develop
Choose a base branch
from
89 changes: 89 additions & 0 deletions package/MDAnalysis/coordinates/CIF.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#

"""
PDBx (mmcif) files in MDAnalysis --- :mod:`MDAnalysis.coordinates.PDBx`
=======================================================================

Reads coordinates from a PDBx_ (mmcif) format file. Will populate the Universe positions from the
``_atom_site.Cartn_x`` field in the PDBx file. Will populate the unitcell dimensions from the ``_cell`` section.


.. _PDBx:
https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/beginner’s-guide-to-pdb-structures-and-the-pdbx-mmcif-format
"""
import gemmi
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for clarity, this is a new MPL-2.0 license dependency we'd be adding.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We would need to make this optional.

import numpy as np

from . import base




class PDBxReader(base.SingleFrameReaderBase):
format = ['cif', 'pdbx']
units = {'time': None, 'length': 'Angstrom'}

def _read_first_frame(self):
doc = gemmi.cif.read(self.filename)

block = doc.sole_block()

coords = block.find('_atom_site.', ['Cartn_x', 'Cartn_y', 'Cartn_z'])
self.natoms = len(coords)

xyz = np.zeros((self.natoms, 3), dtype=np.float32)

for i, (x, y, z) in enumerate(coords):
xyz[i, :] = x, y, z
Copy link
Member

@tylerjereddy tylerjereddy Feb 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A bit unfortunate we need a manual CPython loop over each coordinate. I don't see an obvious alternative from poking around. Gemmi is 90 % C++ it seems, so one might think they could return an appropriate buffer somehow, but that's probably an upstream matter and would likely want to focus on correctness here for now.


ts = self.ts = base.Timestep.from_coordinates(xyz, **self._ts_kwargs)
ts.frame = 0

box = block.find('_cell.', ['length_a', 'length_b', 'length_c',
'angle_alpha', 'angle_beta', 'angle_gamma'])
if box:
unitcell = np.zeros(6, dtype=np.float64)
unitcell[:] = box[0]

ts.dimensions = unitcell

if self.convert_units:
# in-place !
self.convert_pos_from_native(self.ts._pos)
if self.ts.dimensions is not None:
self.convert_pos_from_native(self.ts.dimensions[:3])



@staticmethod
def parse_n_atoms(filename, **kwargs):
doc = gemmi.cif.read(self.filename)
block = doc.sole_block()
coords = block.find('_atom_site.', ['Cartn_x', 'Cartn_y', 'Cartn_z'])
natoms = len(coords)
del doc
return n_atoms


101 changes: 101 additions & 0 deletions package/MDAnalysis/coordinates/PDBx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#

"""
PDBx (mmcif) files in MDAnalysis --- :mod:`MDAnalysis.coordinates.PDBx`
=======================================================================

Reads coordinates from a PDBx_ (mmcif) format file. Will populate the Universe positions from the
``_atom_site.Cartn_x`` field in the PDBx file. Will populate the unitcell dimensions from the ``_cell`` section.


.. _PDBx:
https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/beginner’s-guide-to-pdb-structures-and-the-pdbx-mmcif-format
"""
import gemmi
import numpy as np
from gemmi import FractionalBox

from . import base


class PDBxReader(base.SingleFrameReaderBase):
format = ['cif', 'pdbx']
units = {'time': None, 'length': 'Angstrom'}


def __init__(self, filename, convert_units=True, **kwargs):
super().__init__(filename, convert_units=convert_units, **kwargs)

def _read_first_frame(self):
doc = gemmi.cif.read(self.filename)

block = doc.sole_block()

# PDBx/mmCIF with _cell. sections
box = block.find('_cell.', ['length_a', 'length_b', 'length_c',
'angle_alpha', 'angle_beta', 'angle_gamma'])
# CIF file with _cell_ sections
if not box:
box = block.find('_cell_', ['length_a', 'length_b', 'length_c',
'angle_alpha', 'angle_beta', 'angle_gamma'])

if box:
unitcell = np.zeros(6, dtype=np.float64)
unitcell[:] = box[0]

# PDBx/mmCIF with _cell. sections
coords = block.find('_atom_site.', ['Cartn_x', 'Cartn_y', 'Cartn_z'])
fractional = lambda xyz: xyz

# CIF file with _cell_ sections
if not coords:
coords = block.find('_atom_site_', ['fract_x', 'fract_y', 'fract_z'])
fractional = lambda xyz: np.multiply(xyz, unitcell[:3])

if not coords:
raise ValueError("No coordinates found in the file")

self.n_atoms = len(coords)

xyz = np.zeros((self.n_atoms, 3), dtype=np.float32)

for i, (x, y, z) in enumerate(coords):
xyz[i, :] = x, y, z

# for CIF: multiply fractional coordinates by unitcell lengths
xyz = fractional(xyz)

ts = self.ts = base.Timestep.from_coordinates(xyz, **self._ts_kwargs)
ts.frame = 0
ts.dimensions = unitcell

if self.convert_units:
# in-place !
self.convert_pos_from_native(self.ts._pos)
if self.ts.dimensions is not None:
self.convert_pos_from_native(self.ts.dimensions[:3])

return ts


1 change: 1 addition & 0 deletions package/MDAnalysis/coordinates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,3 +791,4 @@ class can choose an appropriate reader automatically.
from . import NAMDBIN
from . import FHIAIMS
from . import TNG
from . import PDBx
124 changes: 124 additions & 0 deletions package/MDAnalysis/topology/PDBxParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#
"""
PDBx topology parser
====================


See Also
--------
:class:`MDAnalysis.coordinates.PDBx`

"""
import gemmi
import numpy as np

from .base import TopologyReaderBase, change_squash
from ..core.topology import Topology
from ..core.topologyattrs import (
Atomnames,
Atomids,
AltLocs,
Elements,
ICodes,
RecordTypes,
Resids,
Resnames,
Segids,
)


class PDBxParser(TopologyReaderBase):
"""Read a Topology from a PDBx file

Creates the following attributes from these "_atom_site" PDBx loop entries
- "group_PDB" RecordType
- "id" AtomId
- "label_alt_id" AltLoc
- "label_type_symbol" Element
- "label_atom_id" AtomName
- "auth_seq_id" Resid
- "auth_comp_id" Resname
- "pdbx_PDB_ins_code" ICode
- "auth_asym_id" ChainID
"""
format = ['PDBx', 'cif']

def parse(self, **kwargs) -> Topology:
doc = gemmi.cif.read(self.filename)
block = doc.sole_block()

attrs = []

def objarr(x):
return np.array(x, dtype=object)

# hierarchy correspondence:
# seq_id -> residues
# entity_id -> chains
if recordtypes := block.find('_atom_site.', ['group_PDB']):
attrs.append(RecordTypes(recordtypes))
ids = block.find_loop('_atom_site.id')
n_atoms = len(ids)
attrs.append(Atomids(ids))
if altlocs := block.find_loop('_atom_site.label_alt_id'):
altlocs = np.array(altlocs, dtype=object)
altlocs[altlocs == '.'] = ''
attrs.append(AltLocs(altlocs))
if elements_loop := block.find_loop('_atom_site.type_symbol'):
attrs.append(Elements(objarr(elements_loop)))
if names_loop := block.find_loop('_atom_site.label_atom_id'):
attrs.append(Atomnames(objarr(names_loop)))

# sort out residues/segments
# label_seq_id seems to not cover entire model unlike author versions
resids = np.array(block.find_loop('_entity_poly_seq.num'))
resnames = np.array(block.find_loop('_entity_poly_seq.mon_id'))
icodes = np.array(block.find_loop('_atom_site.pdbx_PDB_ins_code'))
chainids = np.array(block.find_loop('_atom_site.auth_asym_id'))

try:
residx, (resids, icodes, resnames, chainids) = change_squash(
(resids, icodes), (resids, icodes, resnames, chainids)
)
segidx, (chainids,) = change_squash((chainids,), (chainids,))
except IndexError:
...
attrs.extend((
Resids(resids),
Resnames(objarr(resnames)),
ICodes(objarr(icodes)),
Segids(chainids),
))

n_residues = len(resids)
n_segments = len(chainids)

return Topology(
n_atoms=n_atoms,
n_res=n_residues,
n_seg=n_segments,
attrs=attrs,
atom_resindex=residx,
residue_segindex=segidx,
)
1 change: 1 addition & 0 deletions package/MDAnalysis/topology/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,4 @@
from . import MinimalParser
from . import ITPParser
from . import FHIAIMSParser
from . import PDBxParser
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.. automodule:: MDAnalysis.coordinates.PDBx
:members:
8 changes: 8 additions & 0 deletions testsuite/MDAnalysisTests/coordinates/test_cif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import pytest
from MDAnalysisTests.datafiles import PDBX, CIF, MMCIF



def test_pdbx()

def test_
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't be too hard to draft tests based on other coordinate readers and some small test files.

Loading