diff --git a/changelog.rst b/changelog.rst
index 41d6019..507d552 100644
--- a/changelog.rst
+++ b/changelog.rst
@@ -4,6 +4,18 @@ Changelog
The format is based on `Keep a Changelog `__.
This project adheres to `Semantic Versioning `__.
+v0.X.X - 20XX-XX-XX
+-------------------
+
+Added
+~~~~~
+- Additional testpath flag to conftest
+
+Fixed
+~~~~~
+- Accessing data pairs with ``get_from_pairs`` or ``__getitem__`` now allows for case-insensitive searches
+- Quote-delimited strings containing the delimiting character are now parsed properly
+
v0.2.1 - 2025-03-12
-------------------
diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py
index 982c6a7..cd9d5d2 100644
--- a/parsnip/parsnip.py
+++ b/parsnip/parsnip.py
@@ -71,6 +71,7 @@
import warnings
from collections.abc import Iterable
from fnmatch import filter as fnfilter
+from fnmatch import fnmatch
from pathlib import Path
from typing import ClassVar
@@ -244,6 +245,7 @@ def get_from_pairs(self, index: str | Iterable[str]):
This method supports a few unix-style wildcards. Use ``*`` to match any
number of any character, and ``?`` to match any single character. If a
wildcard matches more than one key, a list is returned for that index.
+ Lookups using this method are case-insensitive, per the CIF spec.
Indexing with a string returns the value from the :meth:`~.pairs` dict. Indexing
with an Iterable of strings returns a list of values, with ``None`` as a
@@ -286,14 +288,28 @@ def get_from_pairs(self, index: str | Iterable[str]):
if isinstance(index, str): # Escape brackets with []
index = re.sub(_bracket_pattern, r"[\1]", index)
return _flatten_or_none(
- [self.pairs.get(k) for k in fnfilter(self.pairs, index)]
+ [
+ v
+ for (k, v) in self.pairs.items()
+ if fnmatch(k.lower(), index.lower())
+ ]
)
# Escape all brackets in all indices
index = [re.sub(_bracket_pattern, r"[\1]", i) for i in index]
- matches = [fnfilter(self.pairs, pat) for pat in index]
-
- return [_flatten_or_none([self.pairs.get(k, None) for k in m]) for m in matches]
+ matches = [
+ [
+ _flatten_or_none(
+ [
+ v
+ for (k, v) in self.pairs.items()
+ if fnmatch(k.lower(), pat.lower())
+ ]
+ )
+ ]
+ for pat in index
+ ]
+ return [_flatten_or_none(m) for m in matches]
def get_from_loops(self, index: str | Iterable[str]):
"""Return a column or columns from the matching table in :attr:`~.loops`.
@@ -838,7 +854,7 @@ def _parse(self, data_iter: Iterable):
data_iter, _strip_comments(next(data_iter))
)
parsed_line = self._cpat["space_delimited_data"].findall(line)
- parsed_line = [m for m in parsed_line if m != ""]
+ parsed_line = [m for m in parsed_line if m != "" and m != ","]
loop_data.extend([parsed_line] if parsed_line else [])
n_elements, n_cols = (
@@ -870,7 +886,6 @@ def _parse(self, data_iter: Iterable):
stacklevel=2,
)
continue
-
rectable = np.atleast_2d(loop_data)
rectable.dtype = [*zip(loop_keys, [dt] * n_cols)]
rectable = rectable.reshape(rectable.shape, order="F")
@@ -889,7 +904,14 @@ def __repr__(self):
"loop_delimiter": r"([Ll][Oo][Oo][Pp]_)[ |\t]*([^\n]*)",
"block_delimiter": r"([Dd][Aa][Tt][Aa]_)[ |\t]*([^\n]*)",
"key_list": r"_[\w_\.*]+[\[\d\]]*",
- "space_delimited_data": r"(\;[^\;]*\;|\'[^\']*\'|\"[^\"]*\"]|[^\'\"\;\s]*)\s*",
+ "space_delimited_data": (
+ r"("
+ r"\;[^\;]*\;|" # Non-semicolon data bracketed by semicolons
+ r"\'(?:'[^\s]|[^'])*\'|" # Data with single quotes not followed by \s
+ r"\"[^\"]*\"|" # Data with double quotes
+ r"[^\'\"\;\s]*" # Additional non-bracketed data
+ r")[\s]*"
+ ),
}
"""Regex patterns used when parsing files.
diff --git a/tests/conftest.py b/tests/conftest.py
index 9ef1bd7..e95b077 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,17 +4,49 @@
import os
import re
from dataclasses import dataclass
+from glob import glob
import numpy as np
import pytest
+from CifFile import CifFile as pycifRW
+from CifFile import StarError
+from gemmi import cif
from parsnip import CifFile
+ADDITIONAL_TEST_FILES_PATH = ""
+
rng = np.random.default_rng(seed=161181914916)
data_file_path = os.path.dirname(__file__) + "/sample_data/"
+def pycifrw_or_xfail(cif_data):
+ try:
+ return pycifRW(cif_data.filename).first_block()
+ except StarError:
+ pytest.xfail("pycifRW failed to read the file!")
+
+
+def remove_invalid(s):
+ """Our parser strips newlines and carriage returns.
+ TODO: newlines should be retained
+ """
+ if s is None:
+ return None
+ return s.replace("\r", "")
+
+
+def _gemmi_read_keys(filename, keys, as_number=True):
+ try:
+ file_block = cif.read_file(filename).sole_block()
+ except (RuntimeError, ValueError):
+ pytest.xfail("Gemmi failed to read file!")
+ if as_number:
+ return np.array([cif.as_number(file_block.find_value(key)) for key in keys])
+ return np.array([remove_invalid(file_block.find_value(key)) for key in keys])
+
+
def _arrstrip(arr: np.ndarray, pattern: str):
return np.vectorize(lambda x: re.sub(pattern, "", x))(arr)
@@ -34,6 +66,20 @@ class CifData:
assorted_keys = np.loadtxt(data_file_path + "cif_file_keys.txt", dtype=str)
+def combine_marks(*marks, argnames="cif_data"):
+ combinedargvalues = []
+ combinedids = []
+ for mark in marks:
+ argvalues, ids = mark.kwargs["argvalues"], mark.kwargs["ids"]
+ combinedargvalues.extend(argvalues)
+ combinedids.extend(ids)
+ return pytest.mark.parametrize(
+ argnames=argnames,
+ argvalues=combinedargvalues,
+ ids=combinedids,
+ )
+
+
def generate_random_key_sequences(arr, n_samples, seed=42):
rng = np.random.default_rng(seed)
return [
@@ -171,71 +217,18 @@ def random_keys_mark(n_samples=10):
argvalues=cif_data_array,
ids=[cif.filename.split("/")[-1] for cif in cif_data_array],
)
-LINE_TEST_CASES = [
- None,
- "_key",
- "__key",
- "_key.loop_",
- "asdf ",
- "loop_",
- "",
- " ",
- "# comment",
- "_key#comment_ # 2",
- "loop_##com",
- "'my quote' # abc",
- "\"malformed ''#",
- ";oddness\"'\n;asdf",
- "_key.loop.inner_",
- "normal_case",
- "multi.periods....",
- "__underscore__",
- "_key_with_numbers123",
- "test#hash",
- "#standalone",
- "'quote_in_single'",
- '"quote_in_double"',
- " \"mismatched_quotes' ",
- ";semicolon_in_text",
- ";;double_semicolon",
- "trailing_space ",
- " leading_space",
- "_key.with#hash.loop",
- "__double#hash#inside__",
- "single'; quote",
- 'double;"quote',
- "#comment;inside",
- "_tricky'combination;#",
- ";'#another#combo;'",
- '"#edge_case"',
- 'loop;"#complex"case',
- "_'_weird_key'_",
- "semi;;colon_and_hash#",
- "_odd.key_with#hash",
- "__leading_double_underscore__",
- "middle;;semicolon",
- "#just_a_comment",
- '"escaped "quote"',
- "'single_quote_with_hash#'",
- "_period_end.",
- "loop_.trailing_",
- "escaped\\nnewline",
- "#escaped\\twith_tab ",
- "only;semicolon",
- "trailing_semicolon;",
- "leading_semicolon;",
- "_key;.semicolon",
- "semicolon;hash#",
- "complex\"';hash#loop",
- "just_text",
- 'loop#weird"text;',
- "nested'quotes\"here ",
- "normal_case2",
- "__underscored_case__",
- 'escaped\\"quotes#',
- ";semicolon#hash;",
- "double#hash_inside##key",
- "__double..periods__",
- "key#comment ; and_more",
- "_weird_;;#combo ",
+additional_data_array = [
+ CifData(
+ filename=fn,
+ file=CifFile(fn),
+ symop_keys=("_space_group_symop_operation_xyz", "_symmetry_equiv_pos_as_xyz"),
+ )
+ for fn in glob(ADDITIONAL_TEST_FILES_PATH)
]
+additional_files_mark = pytest.mark.parametrize(
+ argnames="cif_data",
+ argvalues=additional_data_array,
+ ids=[cif.filename.split("/")[-1] for cif in additional_data_array],
+)
+
+all_files_mark = combine_marks(cif_files_mark, additional_files_mark)
diff --git a/tests/test_key_reader.py b/tests/test_key_reader.py
index 9785d37..d962be6 100644
--- a/tests/test_key_reader.py
+++ b/tests/test_key_reader.py
@@ -1,49 +1,50 @@
import numpy as np
-from CifFile import CifFile as pycifRW
-from conftest import bad_cif, cif_files_mark, random_keys_mark
-from gemmi import cif
+from conftest import (
+ _gemmi_read_keys,
+ all_files_mark,
+ bad_cif,
+ pycifrw_or_xfail,
+ random_keys_mark,
+)
from more_itertools import flatten
-def remove_invalid(s):
- """Our parser strips newlines and carriage returns.
- TODO: newlines should be retained
- """
- if s is None:
- return None
- return s.replace("\r", "")
+def _array_assertion_verbose(keys, test_data, real_data):
+ keys = np.asarray(keys)
+ test_data = np.asarray(test_data)
+ real_data = np.asarray(real_data)
+ msg = (
+ f"Key(s) {keys[test_data != real_data]} did not match:\n"
+ f"{test_data[test_data != real_data]}!="
+ f"{real_data[test_data != real_data]}\n"
+ )
+ np.testing.assert_equal(test_data, real_data, err_msg=msg)
-def _gemmi_read_keys(filename, keys, as_number=True):
- file_block = cif.read_file(filename).sole_block()
- if as_number:
- return np.array([cif.as_number(file_block.find_value(key)) for key in keys])
- return np.array([remove_invalid(file_block.find_value(key)) for key in keys])
-
-
-@cif_files_mark
+@all_files_mark
def test_read_key_value_pairs(cif_data):
- pycif = pycifRW(cif_data.filename).first_block()
+ pycif = pycifrw_or_xfail(cif_data)
+
invalid = [*flatten(pycif.loops.values()), *cif_data.failing]
all_keys = [key for key in pycif.true_case.values() if key.lower() not in invalid]
parsnip_data = cif_data.file[all_keys]
for i, value in enumerate(parsnip_data):
- assert cif_data.file[all_keys[i]] == value
- assert cif_data.file[all_keys[i]] == cif_data.file.get_from_pairs(all_keys[i])
+ np.testing.assert_equal(cif_data.file[all_keys[i]], value)
+ np.testing.assert_equal(
+ cif_data.file[all_keys[i]], cif_data.file.get_from_pairs(all_keys[i])
+ )
gemmi_data = _gemmi_read_keys(cif_data.filename, keys=all_keys, as_number=False)
- np.testing.assert_equal(parsnip_data, gemmi_data, verbose=True)
+ _array_assertion_verbose(all_keys, parsnip_data, gemmi_data)
-@cif_files_mark
+@all_files_mark
@random_keys_mark(n_samples=20)
def test_read_key_value_pairs_random(cif_data, keys):
- parsnip_data = cif_data.file[keys]
- np.testing.assert_equal(parsnip_data, cif_data.file.get_from_pairs(keys))
- for i, value in enumerate(parsnip_data):
- assert cif_data.file.pairs.get(keys[i], None) == value
+ parsnip_data = np.asarray(cif_data.file[keys])
+ _array_assertion_verbose(keys, parsnip_data, cif_data.file.get_from_pairs(keys))
gemmi_data = _gemmi_read_keys(cif_data.filename, keys=keys, as_number=False)
- np.testing.assert_equal(parsnip_data, gemmi_data, verbose=True)
+ _array_assertion_verbose(keys, parsnip_data, gemmi_data)
def test_read_key_value_pairs_badcif(cif_data=bad_cif):
@@ -59,4 +60,4 @@ def test_read_key_value_pairs_badcif(cif_data=bad_cif):
r"45.6a/\s",
None,
]
- np.testing.assert_array_equal(parsnip_data, correct_data)
+ _array_assertion_verbose(cif_data.manual_keys, parsnip_data, correct_data)
diff --git a/tests/test_patterns.py b/tests/test_patterns.py
index 93e4f46..7966458 100644
--- a/tests/test_patterns.py
+++ b/tests/test_patterns.py
@@ -15,6 +15,8 @@
_write_debug_output,
)
+BOX_ATOL = 8e-7
+
TEST_CASES = [
None,
"_key",
@@ -222,13 +224,13 @@ def test_box(cif_data):
freud_box_2 = freud.Box(*cif_data.file.box)
parsnip_box = _box_from_lengths_and_angles(*cif_box)
- np.testing.assert_allclose(parsnip_box[:3], freud_box.L, atol=1e-15)
+ np.testing.assert_allclose(parsnip_box[:3], freud_box.L, atol=BOX_ATOL)
np.testing.assert_allclose(
- parsnip_box[3:], [freud_box.xy, freud_box.xz, freud_box.yz], atol=1e-15
+ parsnip_box[3:], [freud_box.xy, freud_box.xz, freud_box.yz], atol=BOX_ATOL
)
if "PDB" not in cif_data.filename:
np.testing.assert_allclose(
[*freud_box.L, freud_box.xy, freud_box.xz, freud_box.yz],
[*freud_box_2.L, freud_box_2.xy, freud_box_2.xz, freud_box_2.yz],
- atol=1e-8,
+ atol=BOX_ATOL,
)
diff --git a/tests/test_table_reader.py b/tests/test_table_reader.py
index d12cea9..7f39429 100644
--- a/tests/test_table_reader.py
+++ b/tests/test_table_reader.py
@@ -3,8 +3,13 @@
import numpy as np
import pytest
from ase.io import cif as asecif
-from CifFile import CifFile as pycifRW
-from conftest import _arrstrip, bad_cif, cif_files_mark
+from conftest import (
+ _arrstrip,
+ all_files_mark,
+ bad_cif,
+ cif_files_mark,
+ pycifrw_or_xfail,
+)
from gemmi import cif
from more_itertools import flatten
@@ -17,18 +22,25 @@
def _gemmi_read_table(filename, keys):
- return np.array(cif.read_file(filename).sole_block().find(keys))
+ try:
+ return np.array(cif.read_file(filename).sole_block().find(keys))
+ except (RuntimeError, ValueError):
+ pytest.xfail("Gemmi failed to read file!")
-@cif_files_mark
+@all_files_mark
def test_reads_all_keys(cif_data):
- pycif = pycifRW(cif_data.filename).first_block()
+ pycif = pycifrw_or_xfail(cif_data)
loop_keys = [*flatten(pycif.loops.values())]
all_keys = [key for key in pycif.true_case.values() if key.lower() in loop_keys]
found_labels = [*flatten(cif_data.file.loop_labels)]
for key in all_keys:
- assert key in found_labels, f"Missing label: {found_labels}"
+ assert key in found_labels, f"Missing label: {key}"
+
+ if "A2BC_tP16" in cif_data.filename:
+ print(cif_data.filename)
+ pytest.xfail("Double single quote at EOL is not supported.")
for loop in pycif.loops.values():
loop = [pycif.true_case[key] for key in loop]
@@ -59,7 +71,9 @@ def test_read_atom_sites(cif_data):
return
warnings.filterwarnings("ignore", category=UserWarning)
+
atoms = asecif.read_cif(cif_data.filename)
+
ase_data = [
occ for site in atoms.info["occupancy"].values() for occ in site.values()
]
@@ -118,7 +132,6 @@ def test_bad_cif_atom_sites(cif_data=bad_cif):
np.testing.assert_array_equal(
parsnip_data[:, 3], ["0.00000(1)", "0.00000", "0.19180", "0.09390"]
)
-
# "_atom_site_fract_z"
np.testing.assert_array_equal(
parsnip_data[:, 4], ["0.25000", "0.(28510)", "0.05170", "0.41220"]
diff --git a/tests/test_unitcells.py b/tests/test_unitcells.py
index 1088c84..43677b6 100644
--- a/tests/test_unitcells.py
+++ b/tests/test_unitcells.py
@@ -1,27 +1,34 @@
+import re
import warnings
from contextlib import nullcontext
+from glob import glob
import numpy as np
import pytest
from ase import io
from ase.build import supercells
-from conftest import _arrstrip, box_keys, cif_files_mark
+from conftest import (
+ ADDITIONAL_TEST_FILES_PATH,
+ _arrstrip,
+ _gemmi_read_keys,
+ all_files_mark,
+ box_keys,
+ cif_files_mark,
+)
from gemmi import cif
from more_itertools import flatten
-
-def _gemmi_read_table(filename, keys):
- return np.array(cif.read_file(filename).sole_block().find(keys))
+from parsnip import CifFile
-def _gemmi_read_keys(filename, keys, as_number=True):
- file_block = cif.read_file(filename).sole_block()
- if as_number:
- return np.array([cif.as_number(file_block.find_value(key)) for key in keys])
- return np.array([file_block.find_value(key) for key in keys])
+def _gemmi_read_table(filename, keys):
+ try:
+ return np.array(cif.read_file(filename).sole_block().find(keys))
+ except (RuntimeError, ValueError):
+ pytest.xfail("Gemmi failed to read file!")
-@cif_files_mark # TODO: test with conversions to numeric as well
+@all_files_mark # TODO: test with conversions to numeric as well
def test_read_wyckoff_positions(cif_data):
if "PDB_4INS_head.cif" in cif_data.filename:
return
@@ -32,7 +39,7 @@ def test_read_wyckoff_positions(cif_data):
np.testing.assert_array_equal(parsnip_data, gemmi_data)
-@cif_files_mark
+@all_files_mark
def test_read_cell_params(cif_data, keys=box_keys):
if "PDB_4INS_head.cif" in cif_data.filename:
keys = (key[0] + key[1:].replace("_", ".", 1) for key in keys)
@@ -45,13 +52,17 @@ def test_read_cell_params(cif_data, keys=box_keys):
assert min(normalized[:3]) == 1
-@cif_files_mark
+@all_files_mark
def test_read_symmetry_operations(cif_data):
if "PDB_4INS_head.cif" in cif_data.filename:
return # Excerpt of PDB file does not contain symmetry information
parsnip_data = cif_data.file.symops
- gemmi_data = _gemmi_read_table(filename=cif_data.filename, keys=cif_data.symop_keys)
+ gemmi_data = [
+ _gemmi_read_table(filename=cif_data.filename, keys=[k])
+ for k in cif_data.symop_keys
+ ]
+ gemmi_data = gemmi_data[0] if len(gemmi_data[0]) != 0 else gemmi_data[1]
np.testing.assert_array_equal(parsnip_data, gemmi_data)
@@ -144,3 +155,15 @@ def test_invalid_unit_cell(cif_data):
# with pytest.raises(ValueError, match="did not return any data"):
# cif_data.file.build_unit_cell()
# cif_data.file._pairs["_cell_angle_alpha"] = previous_alpha
+
+
+@pytest.mark.skipif(ADDITIONAL_TEST_FILES_PATH == "", reason="No test path provided.")
+@pytest.mark.parametrize("filename", glob(ADDITIONAL_TEST_FILES_PATH))
+def test_build_accuracy(filename):
+ def n_from_pearson(p: str) -> int:
+ return int(re.sub(r"[^\w]", "", p)[2:])
+
+ cif = CifFile(filename)
+ n, uc = n_from_pearson(cif["*Pearson"]), cif.build_unit_cell()
+ uc = np.array(sorted(uc, key=lambda x: tuple(x)))
+ np.testing.assert_equal(len(uc), n, err_msg="cell does not match Pearson symbol!")