Allow non-delimiting quotes in space-delimited data #43

Merged: 31 commits, Mar 17, 2025

Commits
57bd867
Fix aflow accents
janbridley Mar 17, 2025
1b2591c
1 fail
janbridley Mar 17, 2025
4bf81d5
Remove unused fstring
janbridley Mar 17, 2025
5bdd9ac
Simplify nondelimiting quote parsing
janbridley Mar 17, 2025
ac89d4f
Fix debug statement
janbridley Mar 17, 2025
ad7c679
Ensure delimiting quotes are handled properly
janbridley Mar 17, 2025
6743b9e
Allow lastnames with single quote
janbridley Mar 17, 2025
c1bac3f
Standardize to rstring
janbridley Mar 17, 2025
5fefec1
Enclose pattern in group
janbridley Mar 17, 2025
a6f0bf1
Generalize pattern to arbitrary nondelimiting quotes
janbridley Mar 17, 2025
1a0b2f8
WIP: On case insensitivity
janbridley Mar 17, 2025
8f49724
Working code for bulk aflow tests
janbridley Mar 17, 2025
6c6462a
Proper case insensitivity
janbridley Mar 17, 2025
b5b0c6d
Clean up case insensitive code
janbridley Mar 17, 2025
f54dd15
Remove unused code from conftest
janbridley Mar 17, 2025
cae4faf
Standardize box testing atol
janbridley Mar 17, 2025
7be7798
Lint parsnip
janbridley Mar 17, 2025
74b8ee4
Move additional test configuration to conftest.py
janbridley Mar 17, 2025
22c57e0
Excuse StarErrors in table reader tests
janbridley Mar 17, 2025
3f3ef74
Refactor marks to skip nongeneral tests
janbridley Mar 17, 2025
2f9e50e
Simplify regex pattern
janbridley Mar 17, 2025
26f2985
Move pycifrw_or_xfail to conftest
janbridley Mar 17, 2025
cf99650
Lint
janbridley Mar 17, 2025
a243e61
Move gemmi_read_keys to conftest
janbridley Mar 17, 2025
742fffc
Fix tests for symops
janbridley Mar 17, 2025
7c7d5b8
Lint
janbridley Mar 17, 2025
211c33d
Remove duplicate test
janbridley Mar 17, 2025
026a4c8
Clean up
janbridley Mar 17, 2025
e6ed120
Simplify regex further
janbridley Mar 17, 2025
a21abcf
Remove comment code
janbridley Mar 17, 2025
667c8d1
Update changelog
janbridley Mar 17, 2025
12 changes: 12 additions & 0 deletions changelog.rst
@@ -4,6 +4,18 @@ Changelog
The format is based on `Keep a Changelog <http://keepachangelog.com/en/1.1.0/>`__.
This project adheres to `Semantic Versioning <http://semver.org/spec/v2.0.0.html>`__.

v0.X.X - 20XX-XX-XX
-------------------

Added
~~~~~
- Additional testpath flag to conftest

Fixed
~~~~~
- Accessing data pairs with ``get_from_pairs`` or ``__getitem__`` now allows for case-insensitive searches
- Quote-delimited strings containing the delimiting character are now parsed properly

v0.2.1 - 2025-03-12
-------------------

36 changes: 29 additions & 7 deletions parsnip/parsnip.py
@@ -71,6 +71,7 @@
import warnings
from collections.abc import Iterable
from fnmatch import filter as fnfilter
from fnmatch import fnmatch
from pathlib import Path
from typing import ClassVar

@@ -244,6 +245,7 @@ def get_from_pairs(self, index: str | Iterable[str]):
This method supports a few unix-style wildcards. Use ``*`` to match any
number of any character, and ``?`` to match any single character. If a
wildcard matches more than one key, a list is returned for that index.
Lookups using this method are case-insensitive, per the CIF spec.

Indexing with a string returns the value from the :meth:`~.pairs` dict. Indexing
with an Iterable of strings returns a list of values, with ``None`` as a
@@ -286,14 +288,28 @@ def get_from_pairs(self, index: str | Iterable[str]):
if isinstance(index, str): # Escape brackets with []
index = re.sub(_bracket_pattern, r"[\1]", index)
return _flatten_or_none(
[self.pairs.get(k) for k in fnfilter(self.pairs, index)]
[
v
for (k, v) in self.pairs.items()
if fnmatch(k.lower(), index.lower())
]
)

# Escape all brackets in all indices
index = [re.sub(_bracket_pattern, r"[\1]", i) for i in index]
matches = [fnfilter(self.pairs, pat) for pat in index]

return [_flatten_or_none([self.pairs.get(k, None) for k in m]) for m in matches]
matches = [
[
_flatten_or_none(
[
v
for (k, v) in self.pairs.items()
if fnmatch(k.lower(), pat.lower())
]
)
]
for pat in index
]
return [_flatten_or_none(m) for m in matches]
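For illustration only (not part of the diff), a minimal sketch of how the case-insensitive wildcard lookup behaves after this change; the filename and keys below are hypothetical:

from parsnip import CifFile

cif = CifFile("example.cif")  # hypothetical file

# CIF data names are case-insensitive, so either spelling returns the same value.
cif.get_from_pairs("_CELL_LENGTH_A")
cif["_cell_length_a"]

# Unix-style wildcards: "*" matches any run of characters, "?" one character.
# A pattern that matches several keys returns a list for that index.
cif.get_from_pairs("_cell_length_?")

# An Iterable of patterns returns one entry per pattern.
cif.get_from_pairs(["_cell_angle_alpha", "_cell_angle_beta"])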

def get_from_loops(self, index: str | Iterable[str]):
"""Return a column or columns from the matching table in :attr:`~.loops`.
@@ -838,7 +854,7 @@ def _parse(self, data_iter: Iterable):
data_iter, _strip_comments(next(data_iter))
)
parsed_line = self._cpat["space_delimited_data"].findall(line)
parsed_line = [m for m in parsed_line if m != ""]
parsed_line = [m for m in parsed_line if m != "" and m != ","]
loop_data.extend([parsed_line] if parsed_line else [])

n_elements, n_cols = (
@@ -870,7 +886,6 @@ def _parse(self, data_iter: Iterable):
stacklevel=2,
)
continue

rectable = np.atleast_2d(loop_data)
rectable.dtype = [*zip(loop_keys, [dt] * n_cols)]
rectable = rectable.reshape(rectable.shape, order="F")
@@ -889,7 +904,14 @@ def __repr__(self):
"loop_delimiter": r"([Ll][Oo][Oo][Pp]_)[ |\t]*([^\n]*)",
"block_delimiter": r"([Dd][Aa][Tt][Aa]_)[ |\t]*([^\n]*)",
"key_list": r"_[\w_\.*]+[\[\d\]]*",
"space_delimited_data": r"(\;[^\;]*\;|\'[^\']*\'|\"[^\"]*\"]|[^\'\"\;\s]*)\s*",
"space_delimited_data": (
r"("
r"\;[^\;]*\;|" # Non-semicolon data bracketed by semicolons
r"\'(?:'[^\s]|[^'])*\'|" # Data with single quotes not followed by \s
r"\"[^\"]*\"|" # Data with double quotes
r"[^\'\"\;\s]*" # Additional non-bracketed data
r")[\s]*"
),
}
"""Regex patterns used when parsing files.

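As a standalone sketch (not part of the diff) of what the new space_delimited_data pattern buys: the single-quote branch only treats a quote as a closing delimiter when it is not immediately followed by a non-whitespace character, so tokens such as 'O'Brien' survive as one field.

import re

# Standalone copy of the new pattern, assembled from the pieces above, for illustration.
space_delimited_data = re.compile(
    r"(\;[^\;]*\;|\'(?:'[^\s]|[^'])*\'|\"[^\"]*\"|[^\'\"\;\s]*)[\s]*"
)

line = "'C1' 0.25 'O'Brien' \"value with spaces\""
tokens = [m for m in space_delimited_data.findall(line) if m != ""]
# tokens == ["'C1'", "0.25", "'O'Brien'", '"value with spaces"']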
127 changes: 60 additions & 67 deletions tests/conftest.py
@@ -4,17 +4,49 @@
import os
import re
from dataclasses import dataclass
from glob import glob

import numpy as np
import pytest
from CifFile import CifFile as pycifRW
from CifFile import StarError
from gemmi import cif

from parsnip import CifFile

ADDITIONAL_TEST_FILES_PATH = ""

rng = np.random.default_rng(seed=161181914916)

data_file_path = os.path.dirname(__file__) + "/sample_data/"


def pycifrw_or_xfail(cif_data):
try:
return pycifRW(cif_data.filename).first_block()
except StarError:
pytest.xfail("pycifRW failed to read the file!")


def remove_invalid(s):
"""Our parser strips newlines and carriage returns.
TODO: newlines should be retained
"""
if s is None:
return None
return s.replace("\r", "")


def _gemmi_read_keys(filename, keys, as_number=True):
try:
file_block = cif.read_file(filename).sole_block()
except (RuntimeError, ValueError):
pytest.xfail("Gemmi failed to read file!")
if as_number:
return np.array([cif.as_number(file_block.find_value(key)) for key in keys])
return np.array([remove_invalid(file_block.find_value(key)) for key in keys])


def _arrstrip(arr: np.ndarray, pattern: str):
return np.vectorize(lambda x: re.sub(pattern, "", x))(arr)

@@ -34,6 +66,20 @@ class CifData:
assorted_keys = np.loadtxt(data_file_path + "cif_file_keys.txt", dtype=str)


def combine_marks(*marks, argnames="cif_data"):
combinedargvalues = []
combinedids = []
for mark in marks:
argvalues, ids = mark.kwargs["argvalues"], mark.kwargs["ids"]
combinedargvalues.extend(argvalues)
combinedids.extend(ids)
return pytest.mark.parametrize(
argnames=argnames,
argvalues=combinedargvalues,
ids=combinedids,
)


def generate_random_key_sequences(arr, n_samples, seed=42):
rng = np.random.default_rng(seed)
return [
@@ -171,71 +217,18 @@ def random_keys_mark(n_samples=10):
argvalues=cif_data_array,
ids=[cif.filename.split("/")[-1] for cif in cif_data_array],
)
LINE_TEST_CASES = [
None,
"_key",
"__key",
"_key.loop_",
"asdf ",
"loop_",
"",
" ",
"# comment",
"_key#comment_ # 2",
"loop_##com",
"'my quote' # abc",
"\"malformed ''#",
";oddness\"'\n;asdf",
"_key.loop.inner_",
"normal_case",
"multi.periods....",
"__underscore__",
"_key_with_numbers123",
"test#hash",
"#standalone",
"'quote_in_single'",
'"quote_in_double"',
" \"mismatched_quotes' ",
";semicolon_in_text",
";;double_semicolon",
"trailing_space ",
" leading_space",
"_key.with#hash.loop",
"__double#hash#inside__",
"single'; quote",
'double;"quote',
"#comment;inside",
"_tricky'combination;#",
";'#another#combo;'",
'"#edge_case"',
'loop;"#complex"case',
"_'_weird_key'_",
"semi;;colon_and_hash#",
"_odd.key_with#hash",
"__leading_double_underscore__",
"middle;;semicolon",
"#just_a_comment",
'"escaped "quote"',
"'single_quote_with_hash#'",
"_period_end.",
"loop_.trailing_",
"escaped\\nnewline",
"#escaped\\twith_tab ",
"only;semicolon",
"trailing_semicolon;",
"leading_semicolon;",
"_key;.semicolon",
"semicolon;hash#",
"complex\"';hash#loop",
"just_text",
'loop#weird"text;',
"nested'quotes\"here ",
"normal_case2",
"__underscored_case__",
'escaped\\"quotes#',
";semicolon#hash;",
"double#hash_inside##key",
"__double..periods__",
"key#comment ; and_more",
"_weird_;;#combo ",
additional_data_array = [
CifData(
filename=fn,
file=CifFile(fn),
symop_keys=("_space_group_symop_operation_xyz", "_symmetry_equiv_pos_as_xyz"),
)
for fn in glob(ADDITIONAL_TEST_FILES_PATH)
]
additional_files_mark = pytest.mark.parametrize(
argnames="cif_data",
argvalues=additional_data_array,
ids=[cif.filename.split("/")[-1] for cif in additional_data_array],
)

all_files_mark = combine_marks(cif_files_mark, additional_files_mark)
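An aside on combine_marks (illustrative names, not part of the diff): it pulls argvalues and ids out of mark.kwargs, so it only applies to parametrize marks built with keyword arguments, as cif_files_mark and additional_files_mark are above.

import pytest
from conftest import combine_marks  # test modules import conftest helpers this way

# Hypothetical marks; keyword arguments are required so that combine_marks can
# read mark.kwargs["argvalues"] and mark.kwargs["ids"].
mark_a = pytest.mark.parametrize(argnames="case", argvalues=[1, 2], ids=["one", "two"])
mark_b = pytest.mark.parametrize(argnames="case", argvalues=[3], ids=["three"])

@combine_marks(mark_a, mark_b, argnames="case")
def test_cases(case):  # collected three times, with ids one, two, and three
    assert case > 0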
59 changes: 30 additions & 29 deletions tests/test_key_reader.py
@@ -1,49 +1,50 @@
import numpy as np
from CifFile import CifFile as pycifRW
from conftest import bad_cif, cif_files_mark, random_keys_mark
from gemmi import cif
from conftest import (
_gemmi_read_keys,
all_files_mark,
bad_cif,
pycifrw_or_xfail,
random_keys_mark,
)
from more_itertools import flatten


def remove_invalid(s):
"""Our parser strips newlines and carriage returns.
TODO: newlines should be retained
"""
if s is None:
return None
return s.replace("\r", "")
def _array_assertion_verbose(keys, test_data, real_data):
keys = np.asarray(keys)
test_data = np.asarray(test_data)
real_data = np.asarray(real_data)
msg = (
f"Key(s) {keys[test_data != real_data]} did not match:\n"
f"{test_data[test_data != real_data]}!="
f"{real_data[test_data != real_data]}\n"
)
np.testing.assert_equal(test_data, real_data, err_msg=msg)


def _gemmi_read_keys(filename, keys, as_number=True):
file_block = cif.read_file(filename).sole_block()
if as_number:
return np.array([cif.as_number(file_block.find_value(key)) for key in keys])
return np.array([remove_invalid(file_block.find_value(key)) for key in keys])


@cif_files_mark
@all_files_mark
def test_read_key_value_pairs(cif_data):
pycif = pycifRW(cif_data.filename).first_block()
pycif = pycifrw_or_xfail(cif_data)

invalid = [*flatten(pycif.loops.values()), *cif_data.failing]
all_keys = [key for key in pycif.true_case.values() if key.lower() not in invalid]

parsnip_data = cif_data.file[all_keys]
for i, value in enumerate(parsnip_data):
assert cif_data.file[all_keys[i]] == value
assert cif_data.file[all_keys[i]] == cif_data.file.get_from_pairs(all_keys[i])
np.testing.assert_equal(cif_data.file[all_keys[i]], value)
np.testing.assert_equal(
cif_data.file[all_keys[i]], cif_data.file.get_from_pairs(all_keys[i])
)
gemmi_data = _gemmi_read_keys(cif_data.filename, keys=all_keys, as_number=False)
np.testing.assert_equal(parsnip_data, gemmi_data, verbose=True)
_array_assertion_verbose(all_keys, parsnip_data, gemmi_data)


@cif_files_mark
@all_files_mark
@random_keys_mark(n_samples=20)
def test_read_key_value_pairs_random(cif_data, keys):
parsnip_data = cif_data.file[keys]
np.testing.assert_equal(parsnip_data, cif_data.file.get_from_pairs(keys))
for i, value in enumerate(parsnip_data):
assert cif_data.file.pairs.get(keys[i], None) == value
parsnip_data = np.asarray(cif_data.file[keys])
_array_assertion_verbose(keys, parsnip_data, cif_data.file.get_from_pairs(keys))
gemmi_data = _gemmi_read_keys(cif_data.filename, keys=keys, as_number=False)
np.testing.assert_equal(parsnip_data, gemmi_data, verbose=True)
_array_assertion_verbose(keys, parsnip_data, gemmi_data)


def test_read_key_value_pairs_badcif(cif_data=bad_cif):
@@ -59,4 +60,4 @@ def test_read_key_value_pairs_badcif(cif_data=bad_cif):
r"45.6a/\s",
None,
]
np.testing.assert_array_equal(parsnip_data, correct_data)
_array_assertion_verbose(cif_data.manual_keys, parsnip_data, correct_data)
8 changes: 5 additions & 3 deletions tests/test_patterns.py
@@ -15,6 +15,8 @@
_write_debug_output,
)

BOX_ATOL = 8e-7

TEST_CASES = [
None,
"_key",
@@ -222,13 +224,13 @@ def test_box(cif_data):
freud_box_2 = freud.Box(*cif_data.file.box)
parsnip_box = _box_from_lengths_and_angles(*cif_box)

np.testing.assert_allclose(parsnip_box[:3], freud_box.L, atol=1e-15)
np.testing.assert_allclose(parsnip_box[:3], freud_box.L, atol=BOX_ATOL)
np.testing.assert_allclose(
parsnip_box[3:], [freud_box.xy, freud_box.xz, freud_box.yz], atol=1e-15
parsnip_box[3:], [freud_box.xy, freud_box.xz, freud_box.yz], atol=BOX_ATOL
)
if "PDB" not in cif_data.filename:
np.testing.assert_allclose(
[*freud_box.L, freud_box.xy, freud_box.xz, freud_box.yz],
[*freud_box_2.L, freud_box_2.xy, freud_box_2.xz, freud_box_2.yz],
atol=1e-8,
atol=BOX_ATOL,
)