From 09e90f43c7746899c994ed7115db31bf77a69d7a Mon Sep 17 00:00:00 2001 From: janbridley Date: Tue, 11 Mar 2025 14:58:06 -0400 Subject: [PATCH 01/19] Return additional data with unit cell --- parsnip/parsnip.py | 104 +++++++++++++++++++++++++++++---------------- 1 file changed, 67 insertions(+), 37 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index 0f7e72e..029003b 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -88,7 +88,6 @@ _flatten_or_none, _is_data, _is_key, - _matrix_from_lengths_and_angles, _safe_eval, _strip_comments, _strip_quotes, @@ -387,6 +386,9 @@ def get_from_loops(self, index: ArrayLike): return None return result[0] if len(result) == 1 else result + if isinstance(index, (set, frozenset)): + index = list(index) # TODO: add to changelog + result, index = [], np.atleast_1d(index) for table in self.loops: matches = index[np.any(index[:, None] == table.dtype.names, axis=1)] @@ -459,6 +461,8 @@ def angle_is_invalid(x: float): def build_unit_cell( self, n_decimal_places: int = 4, + additional_cols: ArrayLike | None = None, + # additional_cols: ArrayLike | None = "_atom_site_label", verbose: bool = False, ): """Reconstruct fractional atomic positions from Wyckoff sites and symops. @@ -496,51 +500,73 @@ def build_unit_cell( ------ ValueError If the stored data cannot form a valid box. - """ - fractional_positions = self.wyckoff_positions + If the `additional_cols` is not properly associated with the unit cell. + """ # TODO: does this format correctly? + symops, fractional_positions = self.symops, self.wyckoff_positions + + if additional_cols is not None: + # Find the table of Wyckoff positions and compare to keys in additional_data + invalid_keys = next( + set(np.atleast_1d(additional_cols)) - set(labels) + for labels in self.loop_labels + if set(labels) & set(self.__class__._WYCKOFF_KEYS) + ) + print(invalid_keys) + if invalid_keys: + msg = ( + f"Requested keys {invalid_keys} are not included in the `_atom_site" + "_fract_[xyz]` loop and cannot be included in the unit cell." + ) + raise ValueError(msg) # Read the cell params and convert to a matrix of basis vectors - cell = self.read_cell_params(degrees=False) - cell_matrix = _matrix_from_lengths_and_angles(*cell) + # cell = self.read_cell_params(degrees=False) + # cell_matrix = _matrix_from_lengths_and_angles(*cell) symops_str = np.array2string( - self.symops, + symops, separator=",", # Place a comma after each line in the array for eval threshold=np.inf, # Ensure that every line is included in the string floatmode="unique", # Ensures strings can uniquely represent each float ) + # print((symops_str,), "\n\n") + # print(symops_str, "\n") all_frac_positions = [ _safe_eval(symops_str, *xyz) for xyz in fractional_positions - ] + ] # Compute N_symmetry_elements coordinates for each Wyckoff site pos = np.vstack(all_frac_positions) pos %= 1 # Wrap particles into the box + # auxiliary_data = + # print([self.symops]) + print(fractional_positions) + # print(pos[:8]) + # print(pos.shape, fractional_positions.shape) + print(self.get_from_loops("_atom_site_label")) + # Filter unique points. This takes some time but makes the method faster overall _, unique_indices, unique_counts = np.unique( - pos.round(n_decimal_places), return_index=True, return_counts=True, axis=0 + # pos.round(n_decimal_places), return_index=True, return_counts=True, axis=0 + pos.round(n_decimal_places), + return_index=True, + return_counts=True, + axis=0, ) + unique_indices.sort() # TODO: is this correct? if verbose: _write_debug_output(unique_indices, unique_counts, pos, check="Initial") - # Remove initial duplicates, then map to real space for a second check - pos = pos[unique_indices] + if additional_cols is None: + return pos[unique_indices] - real_space_positions = pos @ cell_matrix - - _, unique_indices, unique_counts = np.unique( - real_space_positions.round(n_decimal_places), - return_index=True, - return_counts=True, - axis=0, + tiled_data = np.repeat( + self.get_from_loops(additional_data), len(symops), axis=0 ) - if verbose: - _write_debug_output(unique_indices, unique_counts, pos, check="Secondary") - - return pos[unique_indices] + return [*zip(tiled_data[unique_indices], pos[unique_indices])] @property def box(self): @@ -646,13 +672,9 @@ def symops(self): .. _`parsable algebraic form`: https://www.iucr.org/__data/iucr/cifdic_html/1/cif_core.dic/Ispace_group_symop_operation_xyz.html """ - symmetry_keys = ( - "_symmetry_equiv_pos_as_xyz", - "_space_group_symop_operation_xyz", - ) - # Only one key is valid in each standard, so we only ever get one match. - return self.get_from_loops(symmetry_keys) + return self.get_from_loops(self.__class__._SYMOP_KEYS) + # TODO: shape is (N, 1), not (N, ) @property def wyckoff_positions(self): @@ -665,16 +687,10 @@ def wyckoff_positions(self): .. _`fractional coordinates`: https://www.iucr.org/__data/iucr/cifdic_html/1/cif_core.dic/Iatom_site_fract_.html """ - xyz_keys = ( - "_atom_site_fract_x", - "_atom_site_fract_y", - "_atom_site_fract_z", - "_atom_site_Cartn_x", - "_atom_site_Cartn_y", - "_atom_site_Cartn_z", - ) # Only one set should be stored at a time - - return cast_array_to_float(arr=self.get_from_loops(xyz_keys), dtype=float) + # TODO: add additional checking in get_from_loops to verify correctness + return cast_array_to_float( + arr=self.get_from_loops(self.__class__._WYCKOFF_KEYS), dtype=float + ) @property def cast_values(self): @@ -842,3 +858,17 @@ def __repr__(self): This dictionary can be modified to change parsing behavior, although doing is not recommended. Changes to this variable are shared across all instances of the class. """ + + _SYMOP_KEYS = ( + "_symmetry_equiv_pos_as_xyz", + "_space_group_symop_operation_xyz", + ) + # TODO: cannot be set/frozenset: order matters! + _WYCKOFF_KEYS = ( + "_atom_site_fract_x", + "_atom_site_fract_y", + "_atom_site_fract_z", + "_atom_site_Cartn_x", + "_atom_site_Cartn_y", + "_atom_site_Cartn_z", + ) # Only one set should be stored at a time From e64efb344315ca3f8bba673155744d56c0524ae9 Mon Sep 17 00:00:00 2001 From: janbridley Date: Tue, 11 Mar 2025 15:02:09 -0400 Subject: [PATCH 02/19] Clean up build_unit_cell --- parsnip/parsnip.py | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index 029003b..4d06a30 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -228,7 +228,6 @@ def __getitem__(self, index: str | Iterable[str]): for key in index: pairs_match = self.get_from_pairs(key) loops_match = self.get_from_loops(key) - # print(pairs_match if pairs_match is not None else loops_match) output.append(pairs_match if pairs_match is not None else loops_match) return output[0] if len(output) == 1 else output @@ -462,7 +461,6 @@ def build_unit_cell( self, n_decimal_places: int = 4, additional_cols: ArrayLike | None = None, - # additional_cols: ArrayLike | None = "_atom_site_label", verbose: bool = False, ): """Reconstruct fractional atomic positions from Wyckoff sites and symops. @@ -500,8 +498,9 @@ def build_unit_cell( ------ ValueError If the stored data cannot form a valid box. - If the `additional_cols` is not properly associated with the unit cell. - """ # TODO: does this format correctly? + ValueError + If the `additional_cols` are not properly associated with the unit cell. + """ symops, fractional_positions = self.symops, self.wyckoff_positions if additional_cols is not None: @@ -511,7 +510,6 @@ def build_unit_cell( for labels in self.loop_labels if set(labels) & set(self.__class__._WYCKOFF_KEYS) ) - print(invalid_keys) if invalid_keys: msg = ( f"Requested keys {invalid_keys} are not included in the `_atom_site" @@ -529,8 +527,6 @@ def build_unit_cell( threshold=np.inf, # Ensure that every line is included in the string floatmode="unique", # Ensures strings can uniquely represent each float ) - # print((symops_str,), "\n\n") - # print(symops_str, "\n") all_frac_positions = [ _safe_eval(symops_str, *xyz) for xyz in fractional_positions @@ -539,20 +535,9 @@ def build_unit_cell( pos = np.vstack(all_frac_positions) pos %= 1 # Wrap particles into the box - # auxiliary_data = - # print([self.symops]) - print(fractional_positions) - # print(pos[:8]) - # print(pos.shape, fractional_positions.shape) - print(self.get_from_loops("_atom_site_label")) - # Filter unique points. This takes some time but makes the method faster overall _, unique_indices, unique_counts = np.unique( - # pos.round(n_decimal_places), return_index=True, return_counts=True, axis=0 - pos.round(n_decimal_places), - return_index=True, - return_counts=True, - axis=0, + pos.round(n_decimal_places), return_index=True, return_counts=True, axis=0 ) unique_indices.sort() # TODO: is this correct? From 3c09e61a731caab8c274beab75adc2f52773cb9e Mon Sep 17 00:00:00 2001 From: janbridley Date: Tue, 11 Mar 2025 15:46:48 -0400 Subject: [PATCH 03/19] Test structured array --- parsnip/parsnip.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index 4d06a30..6b1efac 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -460,7 +460,10 @@ def angle_is_invalid(x: float): def build_unit_cell( self, n_decimal_places: int = 4, - additional_cols: ArrayLike | None = None, + additional_columns: ArrayLike | None = [ + "_atom_site_label", + "_atom_site_U_iso_or_equiv", + ], verbose: bool = False, ): """Reconstruct fractional atomic positions from Wyckoff sites and symops. @@ -499,14 +502,14 @@ def build_unit_cell( ValueError If the stored data cannot form a valid box. ValueError - If the `additional_cols` are not properly associated with the unit cell. + If the `additional_columns` are not properly associated with the unit cell. """ symops, fractional_positions = self.symops, self.wyckoff_positions - if additional_cols is not None: + if additional_columns is not None: # Find the table of Wyckoff positions and compare to keys in additional_data invalid_keys = next( - set(np.atleast_1d(additional_cols)) - set(labels) + set(np.atleast_1d(additional_columns)) - set(labels) for labels in self.loop_labels if set(labels) & set(self.__class__._WYCKOFF_KEYS) ) @@ -544,14 +547,26 @@ def build_unit_cell( if verbose: _write_debug_output(unique_indices, unique_counts, pos, check="Initial") - if additional_cols is None: + if additional_columns is None: return pos[unique_indices] tiled_data = np.repeat( - self.get_from_loops(additional_data), len(symops), axis=0 + self.get_from_loops(additional_columns), len(symops), axis=0 ) - return [*zip(tiled_data[unique_indices], pos[unique_indices])] + # return tiled_data[unique_indices], pos[unique_indices] + return np.rec.array( + list(zip(*tiled_data.T, *pos.T)), + dtype=[ + *[ + (label, tiled_data.dtype) + for label in np.atleast_1d(additional_columns) + ], + ("x", float), + ("y", float), + ("z", float), + ], + ) @property def box(self): From 706225eb1e90bdf3cf1a6fd99491cd266275b638 Mon Sep 17 00:00:00 2001 From: janbridley Date: Tue, 11 Mar 2025 15:47:34 -0400 Subject: [PATCH 04/19] Swap back to multi-array return --- parsnip/parsnip.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index 6b1efac..b9b6127 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -554,19 +554,7 @@ def build_unit_cell( self.get_from_loops(additional_columns), len(symops), axis=0 ) - # return tiled_data[unique_indices], pos[unique_indices] - return np.rec.array( - list(zip(*tiled_data.T, *pos.T)), - dtype=[ - *[ - (label, tiled_data.dtype) - for label in np.atleast_1d(additional_columns) - ], - ("x", float), - ("y", float), - ("z", float), - ], - ) + return tiled_data[unique_indices], pos[unique_indices] @property def box(self): From 3a53b217c1ad80d0d39be92f00b8c410763d4ea2 Mon Sep 17 00:00:00 2001 From: janbridley Date: Tue, 11 Mar 2025 15:53:34 -0400 Subject: [PATCH 05/19] Reset default for additional_columns --- parsnip/parsnip.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index b9b6127..41dbae2 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -460,10 +460,7 @@ def angle_is_invalid(x: float): def build_unit_cell( self, n_decimal_places: int = 4, - additional_columns: ArrayLike | None = [ - "_atom_site_label", - "_atom_site_U_iso_or_equiv", - ], + additional_columns: ArrayLike | None = None, verbose: bool = False, ): """Reconstruct fractional atomic positions from Wyckoff sites and symops. From c88799be3443429f977c533b3f3aae1cec546e58 Mon Sep 17 00:00:00 2001 From: janbridley Date: Tue, 11 Mar 2025 16:05:34 -0400 Subject: [PATCH 06/19] Re-enable realspace duplicate check --- parsnip/parsnip.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index 41dbae2..f611cc5 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -88,6 +88,7 @@ _flatten_or_none, _is_data, _is_key, + _matrix_from_lengths_and_angles, _safe_eval, _strip_comments, _strip_quotes, @@ -518,8 +519,8 @@ def build_unit_cell( raise ValueError(msg) # Read the cell params and convert to a matrix of basis vectors - # cell = self.read_cell_params(degrees=False) - # cell_matrix = _matrix_from_lengths_and_angles(*cell) + cell = self.read_cell_params(degrees=False) + cell_matrix = _matrix_from_lengths_and_angles(*cell) symops_str = np.array2string( symops, @@ -536,13 +537,32 @@ def build_unit_cell( pos %= 1 # Wrap particles into the box # Filter unique points. This takes some time but makes the method faster overall - _, unique_indices, unique_counts = np.unique( + _, unique_fractional, unique_counts = np.unique( pos.round(n_decimal_places), return_index=True, return_counts=True, axis=0 ) - unique_indices.sort() # TODO: is this correct? if verbose: - _write_debug_output(unique_indices, unique_counts, pos, check="Initial") + _write_debug_output( + unique_fractional, unique_counts, pos, check="Fractional" + ) + + # Double-check for duplicates with real space coordinates + real_space_positions = pos @ cell_matrix + + _, unique_realspace, unique_counts = np.unique( + real_space_positions.round(n_decimal_places), + return_index=True, + return_counts=True, + axis=0, + ) + + # Merge unique points from realspace and fractional calculations + unique_indices = sorted({*unique_fractional} & {*unique_realspace}) + + if verbose: + _write_debug_output( + unique_fractional, unique_counts, pos, check="Realspace" + ) if additional_columns is None: return pos[unique_indices] From 172ab00b7eeafef72f96845c6ee15b0f8eb45a8b Mon Sep 17 00:00:00 2001 From: janbridley Date: Tue, 11 Mar 2025 16:05:43 -0400 Subject: [PATCH 07/19] Fix format error --- parsnip/parsnip.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index f611cc5..150af4f 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -818,10 +818,10 @@ def _parse(self, data_iter: Iterable): if n_elements % n_cols != 0: warnings.warn( - f"Parsed data for table {len(self.loops) + 1} cannot be resolved" - f" into a table of the expected size and will be ignored. " - f"Got n={n_elements} items, expected c={n_cols} columns: " - f"n%c={n_elements % n_cols}).", + f"Parsed data for table {len(self.loops) + 1} cannot be" + f" resolved into a table of the expected size and will be" + f"ignored. Got n={n_elements} items, expected c={n_cols}" + f"columns: n%c={n_elements % n_cols}).", category=ParseWarning, stacklevel=2, ) From 686b831096923b8c858210db88a9b34a0673c2e6 Mon Sep 17 00:00:00 2001 From: janbridley Date: Tue, 11 Mar 2025 16:09:41 -0400 Subject: [PATCH 08/19] Remove unnecessary TODO --- parsnip/parsnip.py | 1 - 1 file changed, 1 deletion(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index 150af4f..8237c25 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -868,7 +868,6 @@ def __repr__(self): "_symmetry_equiv_pos_as_xyz", "_space_group_symop_operation_xyz", ) - # TODO: cannot be set/frozenset: order matters! _WYCKOFF_KEYS = ( "_atom_site_fract_x", "_atom_site_fract_y", From 2c7c9025ee4b07db96e0d2c8fabefe484a679a0c Mon Sep 17 00:00:00 2001 From: janbridley Date: Wed, 12 Mar 2025 10:20:15 -0400 Subject: [PATCH 09/19] Fix type hinting in read_cell_params --- parsnip/parsnip.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index 8237c25..dc686b1 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -410,11 +410,11 @@ def read_cell_params(self, degrees: bool = True, normalize: bool = False): Parameters ---------- degrees : bool, optional - When True, angles are returned in degrees (as per the CIF spec). When - False, angles are converted to radians. Default value = ``True`` - normalize: (bool, optional) + When ``True``, angles are returned in degrees (as in the CIF spec). When + ``False``, angles are converted to radians. Default value = ``True`` + normalize: bool, optional Whether to scale the unit cell such that the smallest lattice parameter - is `1.0`. + is ``1.0``. Default value = ``False`` Returns From c2a926b0f62ad2754580d6f604df87b254ea9072 Mon Sep 17 00:00:00 2001 From: janbridley Date: Wed, 12 Mar 2025 10:20:41 -0400 Subject: [PATCH 10/19] Document additional_columns --- parsnip/parsnip.py | 59 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index dc686b1..3f1c4e4 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -461,32 +461,55 @@ def angle_is_invalid(x: float): def build_unit_cell( self, n_decimal_places: int = 4, - additional_columns: ArrayLike | None = None, + additional_columns: str | Iterable[str] | None = None, verbose: bool = False, ): """Reconstruct fractional atomic positions from Wyckoff sites and symops. Rather than storing an entire unit cell's atomic positions, CIF files instead include the data required to recreate those positions based on symmetry rules. - Symmetry operations (stored as strings of x,y,z position permutations) are + Symmetry operations (stored as strings of `x,y,z` position permutations) are applied to the Wyckoff (symmetry irreducible) positions to create a list of possible atomic sites. These are then wrapped into the unit cell and filtered for uniqueness to yield the final crystal. - .. warning:: - - Reconstructing positions requires several floating point calculations that - can be impacted by low-precision data in CIF files. Typically, at least four - decimal places are required to accurately reconstruct complicated unit - cells: less precision than this can yield cells with duplicate or missing - positions. + Example + ------- + Construct the atomic positions of the FCC unit cell from its Wyckoff sites: + + >>> pos = cif.build_unit_cell() + >>> pos + array([[0. , 0. , 0. ], + [0. , 0.5, 0.5], + [0.5, 0. , 0.5], + [0.5, 0.5, 0. ]]) + + Reconstruct a unit cell with its associated atomic labels: + + >>> atoms, pos = cif.build_unit_cell(additional_columns=["_atom_site_label"]) + >>> atoms + array([['Cu1'], + ['Cu1'], + ['Cu1'], + ['Cu1']], dtype='>> pos + array([[0. , 0. , 0. ], + [0. , 0.5, 0.5], + [0.5, 0. , 0.5], + [0.5, 0.5, 0. ]]) - Args: - n_decimal_places : (int, optional) + Parameters + ---------- + n_decimal_places : int, optional The number of decimal places to round each position to for the uniqueness comparison. Values higher than 4 may not work for all CIF files. Default value = ``4`` - verbose : (bool, optional) + additional_columns : str | Iterable[str] | None, optional + A column name or list of column names from the loop containing + the Wyckoff site positions. This data is replicated alongside the atomic + coordinates and returned in an auxiliary array. + Default value = ``None`` + verbose : bool, optional Whether to print debug information about the uniqueness checks. Default value = ``False`` @@ -500,7 +523,17 @@ def build_unit_cell( ValueError If the stored data cannot form a valid box. ValueError - If the `additional_columns` are not properly associated with the unit cell. + If the ``additional_columns`` are not properly associated with the Wyckoff + positions. + + + .. warning:: + + Reconstructing positions requires several floating point calculations that + can be impacted by low-precision data in CIF files. Typically, at least four + decimal places are required to accurately reconstruct complicated unit + cells: less precision than this can yield cells with duplicate or missing + positions. """ symops, fractional_positions = self.symops, self.wyckoff_positions From 8c090043b9d78181821b66bceeaca8799ee96478 Mon Sep 17 00:00:00 2001 From: janbridley Date: Wed, 12 Mar 2025 10:20:54 -0400 Subject: [PATCH 11/19] Clean up comments --- parsnip/parsnip.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index 3f1c4e4..01a5e2c 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -556,10 +556,7 @@ def build_unit_cell( cell_matrix = _matrix_from_lengths_and_angles(*cell) symops_str = np.array2string( - symops, - separator=",", # Place a comma after each line in the array for eval - threshold=np.inf, # Ensure that every line is included in the string - floatmode="unique", # Ensures strings can uniquely represent each float + symops, separator=",", threshold=np.inf, floatmode="unique" ) all_frac_positions = [ @@ -591,6 +588,7 @@ def build_unit_cell( # Merge unique points from realspace and fractional calculations unique_indices = sorted({*unique_fractional} & {*unique_realspace}) + # TODO: Reintroduce AFLOW test suite if verbose: _write_debug_output( From 5f1fe420b9b8b014de507590b29541936cec2581 Mon Sep 17 00:00:00 2001 From: janbridley Date: Wed, 12 Mar 2025 10:57:25 -0400 Subject: [PATCH 12/19] Clean up docstrings --- parsnip/parsnip.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index 01a5e2c..0d89573 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -77,7 +77,6 @@ import numpy as np from more_itertools import flatten, peekable from numpy.lib.recfunctions import structured_to_unstructured -from numpy.typing import ArrayLike from parsnip._errors import ParseWarning from parsnip.patterns import ( @@ -182,7 +181,7 @@ def loops(self): Returns ------- - list[:class:`numpy.ndarray[str]`]: + list[numpy.ndarray[str]] A list of structured arrays containing table data from the file. """ return self._loops @@ -223,6 +222,11 @@ def __getitem__(self, index: str | Iterable[str]): >>> cif[["_journal*", "_atom_site_fract_?"]] [['1999', '0', '123'], ...array([['0.0000000000', '0.0000000000', '0.0000000000']], dtype='>> atoms, pos = cif.build_unit_cell(additional_columns=["_atom_site_label"]) >>> atoms @@ -504,7 +509,7 @@ def build_unit_cell( The number of decimal places to round each position to for the uniqueness comparison. Values higher than 4 may not work for all CIF files. Default value = ``4`` - additional_columns : str | Iterable[str] | None, optional + additional_columns : str | typing.Iterable[str] | None, optional A column name or list of column names from the loop containing the Wyckoff site positions. This data is replicated alongside the atomic coordinates and returned in an auxiliary array. @@ -527,7 +532,7 @@ def build_unit_cell( positions. - .. warning:: + .. caution:: Reconstructing positions requires several floating point calculations that can be impacted by low-precision data in CIF files. Typically, at least four @@ -702,7 +707,7 @@ def symops(self): Returns ------- - :math:`(N,)` :class:`numpy.ndarray[str]`: + :math:`(N,1)` numpy.ndarray[str]: An array of strings containing the symmetry operations in a `parsable algebraic form`_. @@ -710,7 +715,6 @@ def symops(self): """ # Only one key is valid in each standard, so we only ever get one match. return self.get_from_loops(self.__class__._SYMOP_KEYS) - # TODO: shape is (N, 1), not (N, ) @property def wyckoff_positions(self): @@ -718,7 +722,7 @@ def wyckoff_positions(self): Returns ------- - :math:`(N, 3)` :class:`numpy.ndarray[float]`: + :math:`(N, 3)` :class:`numpy.ndarray`: Symmetry-irreducible positions of atoms in `fractional coordinates`_. .. _`fractional coordinates`: https://www.iucr.org/__data/iucr/cifdic_html/1/cif_core.dic/Iatom_site_fract_.html From 40603a69e85191f633f4562bb9f24c04fb0c6c08 Mon Sep 17 00:00:00 2001 From: janbridley Date: Wed, 12 Mar 2025 13:09:11 -0400 Subject: [PATCH 13/19] Add tests for new functionality --- tests/test_unitcells.py | 57 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/tests/test_unitcells.py b/tests/test_unitcells.py index 6f536fb..b7c747e 100644 --- a/tests/test_unitcells.py +++ b/tests/test_unitcells.py @@ -1,4 +1,6 @@ +import re import warnings +from contextlib import nullcontext import numpy as np import pytest @@ -6,6 +8,7 @@ from ase.build import supercells from conftest import box_keys, cif_files_mark from gemmi import cif +from more_itertools import flatten def _gemmi_read_table(filename, keys): @@ -19,6 +22,10 @@ def _gemmi_read_keys(filename, keys, as_number=True): return np.array([file_block.find_value(key) for key in keys]) +def _arrstrip(arr: np.ndarray, pattern: str): + return np.vectorize(lambda x: re.sub(pattern, "", x))(arr) + + @cif_files_mark # TODO: test with conversions to numeric as well def test_read_wyckoff_positions(cif_data): if "PDB_4INS_head.cif" in cif_data.filename: @@ -54,17 +61,47 @@ def test_read_symmetry_operations(cif_data): @cif_files_mark -@pytest.mark.parametrize("n_decimal_places", [3, 4, 5, 6, 9]) -def test_build_unit_cell(cif_data, n_decimal_places): +# @pytest.mark.parametrize("n_decimal_places", [3, 4, 5, 6, 9]) +@pytest.mark.parametrize("n_decimal_places", [3]) +@pytest.mark.parametrize( + "cols", + [ + None, + "_atom_site_type_symbol", + ["_atom_site_type_symbol", "_atom_site_occupancy"], + ], +) +def test_build_unit_cell(cif_data, n_decimal_places, cols): warnings.filterwarnings("ignore", "crystal system", category=UserWarning) if "PDB_4INS_head.cif" in cif_data.filename: return - parsnip_positions = ( - cif_data.file.build_unit_cell(n_decimal_places=n_decimal_places) - @ cif_data.file.lattice_vectors.T + should_raise = cols is not None and any( + k not in flatten(cif_data.file.loop_labels) for k in np.atleast_1d(cols) ) + occupancies, read_data = None, None + with ( + pytest.raises(ValueError, match=r"not included in the `_atom_site_fract_\[xyz") + if should_raise + else nullcontext() + ): + read_data = cif_data.file.build_unit_cell( + n_decimal_places=n_decimal_places, additional_columns=cols + ) + + if read_data is None: + return # ValueError was raised + + if cols is None: + parsnip_positions = read_data @ cif_data.file.lattice_vectors.T + else: + auxiliary_arr, parsnip_positions = read_data + parsnip_positions @= cif_data.file.lattice_vectors.T + + che_symbols = _arrstrip(auxiliary_arr[:, 0], r"[^A-Za-z]+") + if isinstance(cols, list): + occupancies = _arrstrip(auxiliary_arr[:, 1], r"[^\d\.]+").astype(float) # Read the structure, then extract to Python builtin types. Then, wrap into the box ase_file = io.read(cif_data.filename) @@ -77,11 +114,21 @@ def test_build_unit_cell(cif_data, n_decimal_places): ase_positions = np.array( sorted(ase_data.get_positions(), key=lambda x: (x[0], x[1], x[2])) ) + ase_symbols = np.array(ase_data.get_chemical_symbols()) parsnip_minmax = [parsnip_positions.min(axis=0), parsnip_positions.max(axis=0)] ase_minmax = [ase_positions.min(axis=0), ase_positions.max(axis=0)] np.testing.assert_allclose(parsnip_minmax, ase_minmax, atol=1e-12) + if cols is not None: + # NOTE: ASE saves the occupancies of the most dominant species! + # Parsnip makes no assumptions regarding the correct occupancy + # Check all if full occupancy, partial if occ is ndarray and None if occ is None + mask = ( + ... if cif_data.file["_atom_site_occupancy"] is None else occupancies == 1 + ) + np.testing.assert_equal(che_symbols[mask], ase_symbols[mask]) + if "zeolite" in cif_data.filename: return # Four decimal places not sufficient to reconstruct this structure np.testing.assert_allclose(parsnip_positions, ase_positions, atol=1e-12) From 937ad1f67fa052f26ec4e34cfa73448b30200d52 Mon Sep 17 00:00:00 2001 From: janbridley Date: Wed, 12 Mar 2025 13:10:53 -0400 Subject: [PATCH 14/19] Fix None formatting in docstrings --- parsnip/parsnip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index 0d89573..f6557e1 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -190,7 +190,7 @@ def __getitem__(self, index: str | Iterable[str]): """Return an item or list of items from :meth:`~.pairs` and :meth:`~.loops`. This getter searches the entire CIF state to identify the input keys, returning - `None` if the key does not match any data. Matching columns from `loop` tables + ``None`` if the key does not match any data. Matching columns from `loop` tables are returned as 1D arrays. .. tip:: @@ -246,7 +246,7 @@ def get_from_pairs(self, index: str | Iterable[str]): wildcard matches more than one key, a list is returned for that index. Indexing with a string returns the value from the :meth:`~.pairs` dict. Indexing - with an Iterable of strings returns a list of values, with `None` as a + with an Iterable of strings returns a list of values, with ``None`` as a placeholder for keys that did not match any data. Example From 2c863a845f8a5b3fe685bc43db1b55c34c2c004d Mon Sep 17 00:00:00 2001 From: janbridley Date: Wed, 12 Mar 2025 13:11:16 -0400 Subject: [PATCH 15/19] Clean up other docstrings --- parsnip/parsnip.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index f6557e1..36bec81 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -407,7 +407,7 @@ def get_from_loops(self, index: str | Iterable[str]): return _flatten_or_none(result) def read_cell_params(self, degrees: bool = True, normalize: bool = False): - r"""Read the `unit cell parameters`_ (lengths and angles) from a CIF file. + r"""Read the `unit cell parameters`_ (lengths and angles). .. _`unit cell parameters`: https://www.iucr.org/__data/iucr/cifdic_html/1/cif_core.dic/Ccell.html @@ -695,7 +695,7 @@ def loop_labels(self): @property def symops(self): - r"""Extract the symmetry operations from a CIF file. + r"""Extract the symmetry operations in a `parsable algebraic form`_. Example ------- @@ -708,8 +708,7 @@ def symops(self): Returns ------- :math:`(N,1)` numpy.ndarray[str]: - An array of strings containing the symmetry operations in a - `parsable algebraic form`_. + An array containing the symmetry operations. .. _`parsable algebraic form`: https://www.iucr.org/__data/iucr/cifdic_html/1/cif_core.dic/Ispace_group_symop_operation_xyz.html """ @@ -718,7 +717,7 @@ def symops(self): @property def wyckoff_positions(self): - r"""Extract symmetry-irreducible, fractional x,y,z coordinates from a CIF file. + r"""Extract symmetry-irreducible, fractional `x,y,z` coordinates. Returns ------- From 67cb6784194e5aca8ddbe142203e0073a4f58525 Mon Sep 17 00:00:00 2001 From: janbridley Date: Wed, 12 Mar 2025 13:11:41 -0400 Subject: [PATCH 16/19] Clean up examples and verbose logging --- parsnip/parsnip.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index 36bec81..d4008dd 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -491,17 +491,18 @@ def build_unit_cell( Reconstruct a unit cell with its associated atomic labels. The ordering of the auxiliary data array will match the ordering of the atomic positions: - >>> atoms, pos = cif.build_unit_cell(additional_columns=["_atom_site_label"]) - >>> atoms - array([['Cu1'], - ['Cu1'], - ['Cu1'], - ['Cu1']], dtype='>> pos + >>> data = cif.build_unit_cell(additional_columns=["_atom_site_type_symbol"]) + >>> data[0] # Chemical symbol for the atoms at each lattice site + array([['Cu'], + ['Cu'], + ['Cu'], + ['Cu']], dtype='>> data[1] # Lattice positions array([[0. , 0. , 0. ], [0. , 0.5, 0.5], [0.5, 0. , 0.5], [0.5, 0.5, 0. ]]) + >>> assert (pos==data[1]).all() Parameters ---------- @@ -545,7 +546,7 @@ def build_unit_cell( if additional_columns is not None: # Find the table of Wyckoff positions and compare to keys in additional_data invalid_keys = next( - set(np.atleast_1d(additional_columns)) - set(labels) + set(map(str, np.atleast_1d(additional_columns))) - set(labels) for labels in self.loop_labels if set(labels) & set(self.__class__._WYCKOFF_KEYS) ) @@ -571,16 +572,11 @@ def build_unit_cell( pos = np.vstack(all_frac_positions) pos %= 1 # Wrap particles into the box - # Filter unique points. This takes some time but makes the method faster overall + # Filter unique points. TODO: support arbitrary precision, fractional sites _, unique_fractional, unique_counts = np.unique( pos.round(n_decimal_places), return_index=True, return_counts=True, axis=0 ) - if verbose: - _write_debug_output( - unique_fractional, unique_counts, pos, check="Fractional" - ) - # Double-check for duplicates with real space coordinates real_space_positions = pos @ cell_matrix @@ -596,6 +592,9 @@ def build_unit_cell( # TODO: Reintroduce AFLOW test suite if verbose: + _write_debug_output( + unique_fractional, unique_counts, pos, check="Fractional" + ) _write_debug_output( unique_fractional, unique_counts, pos, check="Realspace" ) From bc2c1b3974706b979396dafc7a75c8a2f8ad8de1 Mon Sep 17 00:00:00 2001 From: janbridley Date: Wed, 12 Mar 2025 13:23:33 -0400 Subject: [PATCH 17/19] Fix for python 3.7 --- tests/test_unitcells.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_unitcells.py b/tests/test_unitcells.py index b7c747e..29c4085 100644 --- a/tests/test_unitcells.py +++ b/tests/test_unitcells.py @@ -97,7 +97,7 @@ def test_build_unit_cell(cif_data, n_decimal_places, cols): parsnip_positions = read_data @ cif_data.file.lattice_vectors.T else: auxiliary_arr, parsnip_positions = read_data - parsnip_positions @= cif_data.file.lattice_vectors.T + parsnip_positions = parsnip_positions @ cif_data.file.lattice_vectors.T che_symbols = _arrstrip(auxiliary_arr[:, 0], r"[^A-Za-z]+") if isinstance(cols, list): From 793a43b7210efd3c1734f594364db5dc2772e2f9 Mon Sep 17 00:00:00 2001 From: janbridley Date: Wed, 12 Mar 2025 15:05:30 -0400 Subject: [PATCH 18/19] Re-enable parameterized rounding --- tests/test_unitcells.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_unitcells.py b/tests/test_unitcells.py index 29c4085..17c03bb 100644 --- a/tests/test_unitcells.py +++ b/tests/test_unitcells.py @@ -61,8 +61,7 @@ def test_read_symmetry_operations(cif_data): @cif_files_mark -# @pytest.mark.parametrize("n_decimal_places", [3, 4, 5, 6, 9]) -@pytest.mark.parametrize("n_decimal_places", [3]) +@pytest.mark.parametrize("n_decimal_places", [3, 4, 6, 9]) @pytest.mark.parametrize( "cols", [ From 953a8d6eecf83dd369ed3be1b366d331d937c41d Mon Sep 17 00:00:00 2001 From: janbridley Date: Wed, 12 Mar 2025 16:01:47 -0400 Subject: [PATCH 19/19] Update ChangeLog --- changelog.rst | 13 +++++++++++++ parsnip/parsnip.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/changelog.rst b/changelog.rst index 7f7a3c7..41e1c29 100644 --- a/changelog.rst +++ b/changelog.rst @@ -4,6 +4,19 @@ Changelog The format is based on `Keep a Changelog `__. This project adheres to `Semantic Versioning `__. +v0.2.1 - 20XX-XX-XX +------------------- + +Added +~~~~~ +- New ``additional_columns`` parameter for ``build_unit_cell`` that allows the return of + atom site labels and similar data alongside unit cell positions. +- Ensured consistent ordering of lattice positions returned from ``build_unit_cell``. + +Fixed +~~~~~ +- Type hints now properly link to their associated documentation. + v0.2.0 - 2025-02-19 ------------------- diff --git a/parsnip/parsnip.py b/parsnip/parsnip.py index d4008dd..982c6a7 100644 --- a/parsnip/parsnip.py +++ b/parsnip/parsnip.py @@ -391,7 +391,7 @@ def get_from_loops(self, index: str | Iterable[str]): return result[0] if len(result) == 1 else result if isinstance(index, (set, frozenset)): - index = list(index) # TODO: add to changelog + index = list(index) result, index = [], np.atleast_1d(index) for table in self.loops: