Allow non-delimiting quotes in space-delimited data #43

Merged: 31 commits, Mar 17, 2025

Commits
57bd867
Fix aflow accents
janbridley Mar 17, 2025
1b2591c
1 fail
janbridley Mar 17, 2025
4bf81d5
Remove unused fstring
janbridley Mar 17, 2025
5bdd9ac
Simplify nondelimiting quote parsing
janbridley Mar 17, 2025
ac89d4f
Fix debug statement
janbridley Mar 17, 2025
ad7c679
Ensure delimiting quotes are handled properly
janbridley Mar 17, 2025
6743b9e
Allow lastnames with single quote
janbridley Mar 17, 2025
c1bac3f
Standardize to rstring
janbridley Mar 17, 2025
5fefec1
Enclose pattern in group
janbridley Mar 17, 2025
a6f0bf1
Generalize pattern to arbitrary nondelimiting quotes
janbridley Mar 17, 2025
1a0b2f8
WIP: On case insensitivity
janbridley Mar 17, 2025
8f49724
Working code for bulk aflow tests
janbridley Mar 17, 2025
6c6462a
Proper case insensitivity
janbridley Mar 17, 2025
b5b0c6d
Clean up case insensitive code
janbridley Mar 17, 2025
f54dd15
Remove unused code from conftest
janbridley Mar 17, 2025
cae4faf
Standardize box testing atol
janbridley Mar 17, 2025
7be7798
Lint parsnip
janbridley Mar 17, 2025
74b8ee4
Move additional test configuration to conftest.py
janbridley Mar 17, 2025
22c57e0
Excuse StarErrors in table reader tests
janbridley Mar 17, 2025
3f3ef74
Refactor marks to skip nongeneral tests
janbridley Mar 17, 2025
2f9e50e
Simplify regex pattern
janbridley Mar 17, 2025
26f2985
Move pycifrw_or_xfail to conftest
janbridley Mar 17, 2025
cf99650
Lint
janbridley Mar 17, 2025
a243e61
Move gemmi_read_keys to conftest
janbridley Mar 17, 2025
742fffc
Fix tests for symops
janbridley Mar 17, 2025
7c7d5b8
Lint
janbridley Mar 17, 2025
211c33d
Remove duplicate test
janbridley Mar 17, 2025
026a4c8
Clean up
janbridley Mar 17, 2025
e6ed120
Simplify regex further
janbridley Mar 17, 2025
a21abcf
Remove comment code
janbridley Mar 17, 2025
667c8d1
Update changelog
janbridley Mar 17, 2025
12 changes: 12 additions & 0 deletions changelog.rst
@@ -4,6 +4,18 @@ Changelog
The format is based on `Keep a Changelog <http://keepachangelog.com/en/1.1.0/>`__.
This project adheres to `Semantic Versioning <http://semver.org/spec/v2.0.0.html>`__.

v0.X.X - 20XX-XX-XX
-------------------

Added
~~~~~
- Additional testpath flag to conftest

Fixed
~~~~~
- Accessing data pairs with ``get_from_pairs`` or ``__getitem__`` now allows for case-insensitive searches
- Quote-delimited strings containing the delimiting character are now parsed properly

v0.2.1 - 2025-03-12
-------------------

36 changes: 29 additions & 7 deletions parsnip/parsnip.py
@@ -71,6 +71,7 @@
import warnings
from collections.abc import Iterable
from fnmatch import filter as fnfilter
from fnmatch import fnmatch
from pathlib import Path
from typing import ClassVar

@@ -244,6 +245,7 @@ def get_from_pairs(self, index: str | Iterable[str]):
This method supports a few unix-style wildcards. Use ``*`` to match any
number of any character, and ``?`` to match any single character. If a
wildcard matches more than one key, a list is returned for that index.
Lookups using this method are case-insensitive, per the CIF spec.

Indexing with a string returns the value from the :meth:`~.pairs` dict. Indexing
with an Iterable of strings returns a list of values, with ``None`` as a
@@ -286,14 +288,28 @@ def get_from_pairs(self, index: str | Iterable[str]):
if isinstance(index, str): # Escape brackets with []
index = re.sub(_bracket_pattern, r"[\1]", index)
return _flatten_or_none(
[self.pairs.get(k) for k in fnfilter(self.pairs, index)]
[
v
for (k, v) in self.pairs.items()
if fnmatch(k.lower(), index.lower())
]
)

# Escape all brackets in all indices
index = [re.sub(_bracket_pattern, r"[\1]", i) for i in index]
matches = [fnfilter(self.pairs, pat) for pat in index]

return [_flatten_or_none([self.pairs.get(k, None) for k in m]) for m in matches]
matches = [
[
_flatten_or_none(
[
v
for (k, v) in self.pairs.items()
if fnmatch(k.lower(), pat.lower())
]
)
]
for pat in index
]
return [_flatten_or_none(m) for m in matches]
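For illustration only (not part of the diff), a minimal sketch of how the case-insensitive wildcard lookup behaves after this change; the filename and keys below are hypothetical:

from parsnip import CifFile

cif = CifFile("example.cif")  # hypothetical file

# CIF data names are case-insensitive, so either spelling returns the same value.
cif.get_from_pairs("_CELL_LENGTH_A")
cif["_cell_length_a"]

# Unix-style wildcards: "*" matches any run of characters, "?" one character.
# A pattern that matches several keys returns a list for that index.
cif.get_from_pairs("_cell_length_?")

# An Iterable of patterns returns one entry per pattern.
cif.get_from_pairs(["_cell_angle_alpha", "_cell_angle_beta"])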

def get_from_loops(self, index: str | Iterable[str]):
"""Return a column or columns from the matching table in :attr:`~.loops`.
@@ -838,7 +854,7 @@ def _parse(self, data_iter: Iterable):
data_iter, _strip_comments(next(data_iter))
)
parsed_line = self._cpat["space_delimited_data"].findall(line)
parsed_line = [m for m in parsed_line if m != ""]
parsed_line = [m for m in parsed_line if m != "" and m != ","]
loop_data.extend([parsed_line] if parsed_line else [])

n_elements, n_cols = (
@@ -870,7 +886,6 @@ def _parse(self, data_iter: Iterable):
stacklevel=2,
)
continue

rectable = np.atleast_2d(loop_data)
rectable.dtype = [*zip(loop_keys, [dt] * n_cols)]
rectable = rectable.reshape(rectable.shape, order="F")
@@ -889,7 +904,14 @@ def __repr__(self):
"loop_delimiter": r"([Ll][Oo][Oo][Pp]_)[ |\t]*([^\n]*)",
"block_delimiter": r"([Dd][Aa][Tt][Aa]_)[ |\t]*([^\n]*)",
"key_list": r"_[\w_\.*]+[\[\d\]]*",
"space_delimited_data": r"(\;[^\;]*\;|\'[^\']*\'|\"[^\"]*\"]|[^\'\"\;\s]*)\s*",
"space_delimited_data": (
r"("
r"\;[^\;]*\;|" # Non-semicolon data bracketed by semicolons
r"\'(?:'[^\s]|[^'])*\'|" # Data with single quotes not followed by \s
r"\"[^\"]*\"|" # Data with double quotes
r"[^\'\"\;\s]*" # Additional non-bracketed data
r")[\s]*"
),
}
"""Regex patterns used when parsing files.

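As a standalone sketch (not part of the diff) of what the new space_delimited_data pattern buys: the single-quote branch only treats a quote as a closing delimiter when it is not immediately followed by a non-whitespace character, so tokens such as 'O'Brien' survive as one field.

import re

# Standalone copy of the new pattern, assembled from the pieces above, for illustration.
space_delimited_data = re.compile(
    r"(\;[^\;]*\;|\'(?:'[^\s]|[^'])*\'|\"[^\"]*\"|[^\'\"\;\s]*)[\s]*"
)

line = "'C1' 0.25 'O'Brien' \"value with spaces\""
tokens = [m for m in space_delimited_data.findall(line) if m != ""]
# tokens == ["'C1'", "0.25", "'O'Brien'", '"value with spaces"']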
127 changes: 60 additions & 67 deletions tests/conftest.py
@@ -4,17 +4,49 @@
import os
import re
from dataclasses import dataclass
from glob import glob

import numpy as np
import pytest
from CifFile import CifFile as pycifRW
from CifFile import StarError
from gemmi import cif

from parsnip import CifFile

ADDITIONAL_TEST_FILES_PATH = ""

rng = np.random.default_rng(seed=161181914916)

data_file_path = os.path.dirname(__file__) + "/sample_data/"


def pycifrw_or_xfail(cif_data):
try:
return pycifRW(cif_data.filename).first_block()
except StarError:
pytest.xfail("pycifRW failed to read the file!")


def remove_invalid(s):
"""Our parser strips newlines and carriage returns.
TODO: newlines should be retained
"""
if s is None:
return None
return s.replace("\r", "")


def _gemmi_read_keys(filename, keys, as_number=True):
try:
file_block = cif.read_file(filename).sole_block()
except (RuntimeError, ValueError):
pytest.xfail("Gemmi failed to read file!")
if as_number:
return np.array([cif.as_number(file_block.find_value(key)) for key in keys])
return np.array([remove_invalid(file_block.find_value(key)) for key in keys])


def _arrstrip(arr: np.ndarray, pattern: str):
return np.vectorize(lambda x: re.sub(pattern, "", x))(arr)

@@ -34,6 +66,20 @@ class CifData:
assorted_keys = np.loadtxt(data_file_path + "cif_file_keys.txt", dtype=str)


def combine_marks(*marks, argnames="cif_data"):
combinedargvalues = []
combinedids = []
for mark in marks:
argvalues, ids = mark.kwargs["argvalues"], mark.kwargs["ids"]
combinedargvalues.extend(argvalues)
combinedids.extend(ids)
return pytest.mark.parametrize(
argnames=argnames,
argvalues=combinedargvalues,
ids=combinedids,
)


def generate_random_key_sequences(arr, n_samples, seed=42):
rng = np.random.default_rng(seed)
return [
@@ -171,71 +217,18 @@ def random_keys_mark(n_samples=10):
argvalues=cif_data_array,
ids=[cif.filename.split("/")[-1] for cif in cif_data_array],
)
LINE_TEST_CASES = [
None,
"_key",
"__key",
"_key.loop_",
"asdf ",
"loop_",
"",
" ",
"# comment",
"_key#comment_ # 2",
"loop_##com",
"'my quote' # abc",
"\"malformed ''#",
";oddness\"'\n;asdf",
"_key.loop.inner_",
"normal_case",
"multi.periods....",
"__underscore__",
"_key_with_numbers123",
"test#hash",
"#standalone",
"'quote_in_single'",
'"quote_in_double"',
" \"mismatched_quotes' ",
";semicolon_in_text",
";;double_semicolon",
"trailing_space ",
" leading_space",
"_key.with#hash.loop",
"__double#hash#inside__",
"single'; quote",
'double;"quote',
"#comment;inside",
"_tricky'combination;#",
";'#another#combo;'",
'"#edge_case"',
'loop;"#complex"case',
"_'_weird_key'_",
"semi;;colon_and_hash#",
"_odd.key_with#hash",
"__leading_double_underscore__",
"middle;;semicolon",
"#just_a_comment",
'"escaped "quote"',
"'single_quote_with_hash#'",
"_period_end.",
"loop_.trailing_",
"escaped\\nnewline",
"#escaped\\twith_tab ",
"only;semicolon",
"trailing_semicolon;",
"leading_semicolon;",
"_key;.semicolon",
"semicolon;hash#",
"complex\"';hash#loop",
"just_text",
'loop#weird"text;',
"nested'quotes\"here ",
"normal_case2",
"__underscored_case__",
'escaped\\"quotes#',
";semicolon#hash;",
"double#hash_inside##key",
"__double..periods__",
"key#comment ; and_more",
"_weird_;;#combo ",
additional_data_array = [
CifData(
filename=fn,
file=CifFile(fn),
symop_keys=("_space_group_symop_operation_xyz", "_symmetry_equiv_pos_as_xyz"),
)
for fn in glob(ADDITIONAL_TEST_FILES_PATH)
]
additional_files_mark = pytest.mark.parametrize(
argnames="cif_data",
argvalues=additional_data_array,
ids=[cif.filename.split("/")[-1] for cif in additional_data_array],
)

all_files_mark = combine_marks(cif_files_mark, additional_files_mark)
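An aside on combine_marks (illustrative names, not part of the diff): it pulls argvalues and ids out of mark.kwargs, so it only applies to parametrize marks built with keyword arguments, as cif_files_mark and additional_files_mark are above.

import pytest
from conftest import combine_marks  # test modules import conftest helpers this way

# Hypothetical marks; keyword arguments are required so that combine_marks can
# read mark.kwargs["argvalues"] and mark.kwargs["ids"].
mark_a = pytest.mark.parametrize(argnames="case", argvalues=[1, 2], ids=["one", "two"])
mark_b = pytest.mark.parametrize(argnames="case", argvalues=[3], ids=["three"])

@combine_marks(mark_a, mark_b, argnames="case")
def test_cases(case):  # collected three times, with ids one, two, and three
    assert case > 0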
59 changes: 30 additions & 29 deletions tests/test_key_reader.py
@@ -1,49 +1,50 @@
import numpy as np
from CifFile import CifFile as pycifRW
from conftest import bad_cif, cif_files_mark, random_keys_mark
from gemmi import cif
from conftest import (
_gemmi_read_keys,
all_files_mark,
bad_cif,
pycifrw_or_xfail,
random_keys_mark,
)
from more_itertools import flatten


def remove_invalid(s):
"""Our parser strips newlines and carriage returns.
TODO: newlines should be retained
"""
if s is None:
return None
return s.replace("\r", "")
def _array_assertion_verbose(keys, test_data, real_data):
keys = np.asarray(keys)
test_data = np.asarray(test_data)
real_data = np.asarray(real_data)
msg = (
f"Key(s) {keys[test_data != real_data]} did not match:\n"
f"{test_data[test_data != real_data]}!="
f"{real_data[test_data != real_data]}\n"
)
np.testing.assert_equal(test_data, real_data, err_msg=msg)


def _gemmi_read_keys(filename, keys, as_number=True):
file_block = cif.read_file(filename).sole_block()
if as_number:
return np.array([cif.as_number(file_block.find_value(key)) for key in keys])
return np.array([remove_invalid(file_block.find_value(key)) for key in keys])


@cif_files_mark
@all_files_mark
def test_read_key_value_pairs(cif_data):
pycif = pycifRW(cif_data.filename).first_block()
pycif = pycifrw_or_xfail(cif_data)

invalid = [*flatten(pycif.loops.values()), *cif_data.failing]
all_keys = [key for key in pycif.true_case.values() if key.lower() not in invalid]

parsnip_data = cif_data.file[all_keys]
for i, value in enumerate(parsnip_data):
assert cif_data.file[all_keys[i]] == value
assert cif_data.file[all_keys[i]] == cif_data.file.get_from_pairs(all_keys[i])
np.testing.assert_equal(cif_data.file[all_keys[i]], value)
np.testing.assert_equal(
cif_data.file[all_keys[i]], cif_data.file.get_from_pairs(all_keys[i])
)
gemmi_data = _gemmi_read_keys(cif_data.filename, keys=all_keys, as_number=False)
np.testing.assert_equal(parsnip_data, gemmi_data, verbose=True)
_array_assertion_verbose(all_keys, parsnip_data, gemmi_data)


@cif_files_mark
@all_files_mark
@random_keys_mark(n_samples=20)
def test_read_key_value_pairs_random(cif_data, keys):
parsnip_data = cif_data.file[keys]
np.testing.assert_equal(parsnip_data, cif_data.file.get_from_pairs(keys))
for i, value in enumerate(parsnip_data):
assert cif_data.file.pairs.get(keys[i], None) == value
parsnip_data = np.asarray(cif_data.file[keys])
_array_assertion_verbose(keys, parsnip_data, cif_data.file.get_from_pairs(keys))
gemmi_data = _gemmi_read_keys(cif_data.filename, keys=keys, as_number=False)
np.testing.assert_equal(parsnip_data, gemmi_data, verbose=True)
_array_assertion_verbose(keys, parsnip_data, gemmi_data)


def test_read_key_value_pairs_badcif(cif_data=bad_cif):
@@ -59,4 +60,4 @@ def test_read_key_value_pairs_badcif(cif_data=bad_cif):
r"45.6a/\s",
None,
]
np.testing.assert_array_equal(parsnip_data, correct_data)
_array_assertion_verbose(cif_data.manual_keys, parsnip_data, correct_data)
8 changes: 5 additions & 3 deletions tests/test_patterns.py
@@ -15,6 +15,8 @@
_write_debug_output,
)

BOX_ATOL = 8e-7

TEST_CASES = [
None,
"_key",
@@ -222,13 +224,13 @@ def test_box(cif_data):
freud_box_2 = freud.Box(*cif_data.file.box)
parsnip_box = _box_from_lengths_and_angles(*cif_box)

np.testing.assert_allclose(parsnip_box[:3], freud_box.L, atol=1e-15)
np.testing.assert_allclose(parsnip_box[:3], freud_box.L, atol=BOX_ATOL)
np.testing.assert_allclose(
parsnip_box[3:], [freud_box.xy, freud_box.xz, freud_box.yz], atol=1e-15
parsnip_box[3:], [freud_box.xy, freud_box.xz, freud_box.yz], atol=BOX_ATOL
)
if "PDB" not in cif_data.filename:
np.testing.assert_allclose(
[*freud_box.L, freud_box.xy, freud_box.xz, freud_box.yz],
[*freud_box_2.L, freud_box_2.xy, freud_box_2.xz, freud_box_2.yz],
atol=1e-8,
atol=BOX_ATOL,
)