-
Notifications
You must be signed in to change notification settings - Fork 670
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP] Cif reader #4681
base: develop
Are you sure you want to change the base?
[WIP] Cif reader #4681
Changes from all commits
46d174f
63f3f49
4000fe8
ad24371
9f42c5f
f063b14
837b861
4831d45
aeefd56
30e4ffa
b60b57b
5cdcf92
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
|
||
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- | ||
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 | ||
# | ||
# MDAnalysis --- https://www.mdanalysis.org | ||
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors | ||
# (see the file AUTHORS for the full list of names) | ||
# | ||
# Released under the GNU Public Licence, v2 or any higher version | ||
# | ||
# Please cite your use of MDAnalysis in published work: | ||
# | ||
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler, | ||
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein. | ||
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics | ||
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th | ||
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy. | ||
# doi: 10.25080/majora-629e541a-00e | ||
# | ||
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein. | ||
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. | ||
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 | ||
# | ||
|
||
""" | ||
PDBx (mmcif) files in MDAnalysis --- :mod:`MDAnalysis.coordinates.PDBx` | ||
======================================================================= | ||
|
||
Reads coordinates from a PDBx_ (mmcif) format file. Will populate the Universe positions from the | ||
``_atom_site.Cartn_x`` field in the PDBx file. Will populate the unitcell dimensions from the ``_cell`` section. | ||
|
||
|
||
.. _PDBx: | ||
https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/beginner’s-guide-to-pdb-structures-and-the-pdbx-mmcif-format | ||
""" | ||
import gemmi | ||
import numpy as np | ||
|
||
from . import base | ||
|
||
|
||
|
||
|
||
class PDBxReader(base.SingleFrameReaderBase): | ||
format = ['cif', 'pdbx'] | ||
units = {'time': None, 'length': 'Angstrom'} | ||
|
||
def _read_first_frame(self): | ||
doc = gemmi.cif.read(self.filename) | ||
|
||
block = doc.sole_block() | ||
|
||
coords = block.find('_atom_site.', ['Cartn_x', 'Cartn_y', 'Cartn_z']) | ||
self.natoms = len(coords) | ||
|
||
xyz = np.zeros((self.natoms, 3), dtype=np.float32) | ||
|
||
for i, (x, y, z) in enumerate(coords): | ||
xyz[i, :] = x, y, z | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A bit unfortunate we need a manual CPython loop over each coordinate. I don't see an obvious alternative from poking around. Gemmi is 90 % C++ it seems, so one might think they could return an appropriate buffer somehow, but that's probably an upstream matter and would likely want to focus on correctness here for now. |
||
|
||
ts = self.ts = base.Timestep.from_coordinates(xyz, **self._ts_kwargs) | ||
ts.frame = 0 | ||
|
||
box = block.find('_cell.', ['length_a', 'length_b', 'length_c', | ||
'angle_alpha', 'angle_beta', 'angle_gamma']) | ||
if box: | ||
unitcell = np.zeros(6, dtype=np.float64) | ||
unitcell[:] = box[0] | ||
|
||
ts.dimensions = unitcell | ||
|
||
if self.convert_units: | ||
# in-place ! | ||
self.convert_pos_from_native(self.ts._pos) | ||
if self.ts.dimensions is not None: | ||
self.convert_pos_from_native(self.ts.dimensions[:3]) | ||
|
||
|
||
|
||
@staticmethod | ||
def parse_n_atoms(filename, **kwargs): | ||
doc = gemmi.cif.read(self.filename) | ||
block = doc.sole_block() | ||
coords = block.find('_atom_site.', ['Cartn_x', 'Cartn_y', 'Cartn_z']) | ||
natoms = len(coords) | ||
del doc | ||
return n_atoms | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- | ||
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 | ||
# | ||
# MDAnalysis --- https://www.mdanalysis.org | ||
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors | ||
# (see the file AUTHORS for the full list of names) | ||
# | ||
# Released under the GNU Public Licence, v2 or any higher version | ||
# | ||
# Please cite your use of MDAnalysis in published work: | ||
# | ||
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler, | ||
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein. | ||
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics | ||
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th | ||
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy. | ||
# doi: 10.25080/majora-629e541a-00e | ||
# | ||
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein. | ||
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. | ||
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 | ||
# | ||
|
||
""" | ||
PDBx (mmcif) files in MDAnalysis --- :mod:`MDAnalysis.coordinates.PDBx` | ||
======================================================================= | ||
|
||
Reads coordinates from a PDBx_ (mmcif) format file. Will populate the Universe positions from the | ||
``_atom_site.Cartn_x`` field in the PDBx file. Will populate the unitcell dimensions from the ``_cell`` section. | ||
|
||
|
||
.. _PDBx: | ||
https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/beginner’s-guide-to-pdb-structures-and-the-pdbx-mmcif-format | ||
""" | ||
import gemmi | ||
import numpy as np | ||
from gemmi import FractionalBox | ||
|
||
from . import base | ||
|
||
|
||
class PDBxReader(base.SingleFrameReaderBase): | ||
format = ['cif', 'pdbx'] | ||
units = {'time': None, 'length': 'Angstrom'} | ||
|
||
|
||
def __init__(self, filename, convert_units=True, **kwargs): | ||
super().__init__(filename, convert_units=convert_units, **kwargs) | ||
|
||
def _read_first_frame(self): | ||
doc = gemmi.cif.read(self.filename) | ||
|
||
block = doc.sole_block() | ||
|
||
# PDBx/mmCIF with _cell. sections | ||
box = block.find('_cell.', ['length_a', 'length_b', 'length_c', | ||
'angle_alpha', 'angle_beta', 'angle_gamma']) | ||
# CIF file with _cell_ sections | ||
if not box: | ||
box = block.find('_cell_', ['length_a', 'length_b', 'length_c', | ||
'angle_alpha', 'angle_beta', 'angle_gamma']) | ||
|
||
if box: | ||
unitcell = np.zeros(6, dtype=np.float64) | ||
unitcell[:] = box[0] | ||
|
||
# PDBx/mmCIF with _cell. sections | ||
coords = block.find('_atom_site.', ['Cartn_x', 'Cartn_y', 'Cartn_z']) | ||
fractional = lambda xyz: xyz | ||
|
||
# CIF file with _cell_ sections | ||
if not coords: | ||
coords = block.find('_atom_site_', ['fract_x', 'fract_y', 'fract_z']) | ||
fractional = lambda xyz: np.multiply(xyz, unitcell[:3]) | ||
|
||
if not coords: | ||
raise ValueError("No coordinates found in the file") | ||
|
||
self.n_atoms = len(coords) | ||
|
||
xyz = np.zeros((self.n_atoms, 3), dtype=np.float32) | ||
|
||
for i, (x, y, z) in enumerate(coords): | ||
xyz[i, :] = x, y, z | ||
|
||
# for CIF: multiply fractional coordinates by unitcell lengths | ||
xyz = fractional(xyz) | ||
|
||
ts = self.ts = base.Timestep.from_coordinates(xyz, **self._ts_kwargs) | ||
ts.frame = 0 | ||
ts.dimensions = unitcell | ||
|
||
if self.convert_units: | ||
# in-place ! | ||
self.convert_pos_from_native(self.ts._pos) | ||
if self.ts.dimensions is not None: | ||
self.convert_pos_from_native(self.ts.dimensions[:3]) | ||
|
||
return ts | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- | ||
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 | ||
# | ||
# MDAnalysis --- https://www.mdanalysis.org | ||
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors | ||
# (see the file AUTHORS for the full list of names) | ||
# | ||
# Released under the GNU Public Licence, v2 or any higher version | ||
# | ||
# Please cite your use of MDAnalysis in published work: | ||
# | ||
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler, | ||
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein. | ||
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics | ||
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th | ||
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy. | ||
# doi: 10.25080/majora-629e541a-00e | ||
# | ||
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein. | ||
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. | ||
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 | ||
# | ||
""" | ||
PDBx topology parser | ||
==================== | ||
|
||
|
||
See Also | ||
-------- | ||
:class:`MDAnalysis.coordinates.PDBx` | ||
|
||
""" | ||
import gemmi | ||
import numpy as np | ||
|
||
from .base import TopologyReaderBase, change_squash | ||
from ..core.topology import Topology | ||
from ..core.topologyattrs import ( | ||
Atomnames, | ||
Atomids, | ||
AltLocs, | ||
Elements, | ||
ICodes, | ||
RecordTypes, | ||
Resids, | ||
Resnames, | ||
Segids, | ||
) | ||
|
||
|
||
class PDBxParser(TopologyReaderBase): | ||
"""Read a Topology from a PDBx file | ||
|
||
Creates the following attributes from these "_atom_site" PDBx loop entries | ||
- "group_PDB" RecordType | ||
- "id" AtomId | ||
- "label_alt_id" AltLoc | ||
- "label_type_symbol" Element | ||
- "label_atom_id" AtomName | ||
- "auth_seq_id" Resid | ||
- "auth_comp_id" Resname | ||
- "pdbx_PDB_ins_code" ICode | ||
- "auth_asym_id" ChainID | ||
""" | ||
format = ['PDBx', 'cif'] | ||
|
||
def parse(self, **kwargs) -> Topology: | ||
doc = gemmi.cif.read(self.filename) | ||
block = doc.sole_block() | ||
|
||
attrs = [] | ||
|
||
def objarr(x): | ||
return np.array(x, dtype=object) | ||
|
||
# hierarchy correspondence: | ||
# seq_id -> residues | ||
# entity_id -> chains | ||
if recordtypes := block.find('_atom_site.', ['group_PDB']): | ||
attrs.append(RecordTypes(recordtypes)) | ||
ids = block.find_loop('_atom_site.id') | ||
n_atoms = len(ids) | ||
attrs.append(Atomids(ids)) | ||
if altlocs := block.find_loop('_atom_site.label_alt_id'): | ||
altlocs = np.array(altlocs, dtype=object) | ||
altlocs[altlocs == '.'] = '' | ||
attrs.append(AltLocs(altlocs)) | ||
if elements_loop := block.find_loop('_atom_site.type_symbol'): | ||
attrs.append(Elements(objarr(elements_loop))) | ||
if names_loop := block.find_loop('_atom_site.label_atom_id'): | ||
attrs.append(Atomnames(objarr(names_loop))) | ||
|
||
# sort out residues/segments | ||
# label_seq_id seems to not cover entire model unlike author versions | ||
resids = np.array(block.find_loop('_entity_poly_seq.num')) | ||
resnames = np.array(block.find_loop('_entity_poly_seq.mon_id')) | ||
icodes = np.array(block.find_loop('_atom_site.pdbx_PDB_ins_code')) | ||
chainids = np.array(block.find_loop('_atom_site.auth_asym_id')) | ||
|
||
try: | ||
residx, (resids, icodes, resnames, chainids) = change_squash( | ||
(resids, icodes), (resids, icodes, resnames, chainids) | ||
) | ||
segidx, (chainids,) = change_squash((chainids,), (chainids,)) | ||
except IndexError: | ||
... | ||
attrs.extend(( | ||
Resids(resids), | ||
Resnames(objarr(resnames)), | ||
ICodes(objarr(icodes)), | ||
Segids(chainids), | ||
)) | ||
|
||
n_residues = len(resids) | ||
n_segments = len(chainids) | ||
|
||
return Topology( | ||
n_atoms=n_atoms, | ||
n_res=n_residues, | ||
n_seg=n_segments, | ||
attrs=attrs, | ||
atom_resindex=residx, | ||
residue_segindex=segidx, | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -333,3 +333,4 @@ | |
from . import MinimalParser | ||
from . import ITPParser | ||
from . import FHIAIMSParser | ||
from . import PDBxParser |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
.. automodule:: MDAnalysis.coordinates.PDBx | ||
:members: |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
import pytest | ||
from MDAnalysisTests.datafiles import PDBX, CIF, MMCIF | ||
|
||
|
||
|
||
def test_pdbx() | ||
|
||
def test_ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't be too hard to draft tests based on other coordinate readers and some small test files. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for clarity, this is a new MPL-2.0 license dependency we'd be adding.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We would need to make this optional.