"""Configuration classes for directory content comparison.

This module contains the configuration classes and validation functions.
"""
| 5 | + |
| 6 | +# LICENSE HEADER MANAGED BY add-license-header |
| 7 | +# Copyright (c) 2023-2025 Blue Brain Project, EPFL. |
| 8 | +# |
| 9 | +# This file is part of dir-content-diff. |
| 10 | +# See https://github.com/BlueBrain/dir-content-diff for further info. |
| 11 | +# |
| 12 | +# SPDX-License-Identifier: Apache-2.0 |
| 13 | +# LICENSE HEADER MANAGED BY add-license-header |
| 14 | + |
| 15 | +import re |
| 16 | +from collections.abc import Callable |
| 17 | +from typing import Any |
| 18 | +from typing import Dict |
| 19 | +from typing import Iterable |
| 20 | +from typing import Literal |
| 21 | +from typing import Optional |
| 22 | +from typing import Pattern |
| 23 | +from typing import Tuple |
| 24 | +from typing import Union |
| 25 | + |
| 26 | +import attrs |
| 27 | + |
| 28 | +from dir_content_diff.base_comparators import BaseComparator |
| 29 | +from dir_content_diff.registry import get_comparators |
| 30 | + |
| 31 | +# Type alias for comparators |
| 32 | +ComparatorType = Union[BaseComparator, Callable] |
| 33 | + |
| 34 | + |
| 35 | +def _convert_iterable_to_tuple( |
| 36 | + x: Optional[Iterable[str]], |
| 37 | +) -> Optional[Tuple[str, ...]]: |
| 38 | + """Convert an iterable to a tuple, or return None.""" |
| 39 | + if x is None: |
| 40 | + return None |
| 41 | + return tuple(x) |
| 42 | + |
| 43 | + |
| 44 | +def _validate_specific_args(instance, attribute, value): # pylint: disable=unused-argument |
| 45 | + """Validate specific_args structure.""" |
| 46 | + for file_path, args in value.items(): |
| 47 | + if not isinstance(args, dict): |
| 48 | + raise ValueError(f"specific_args['{file_path}'] must be a dictionary") |
| 49 | + # Note: regex patterns in specific_args will be validated during compilation |
| 50 | + # in __attrs_post_init__, so no need to validate them here |
| 51 | + |
| 52 | + |
| 53 | +def _validate_export_formatted_files(instance, attribute, value): # pylint: disable=unused-argument |
| 54 | + """Validate export_formatted_files is either bool or non-empty string.""" |
| 55 | + if isinstance(value, str) and len(value.strip()) == 0: |
| 56 | + raise ValueError( |
| 57 | + "export_formatted_files must be a non-empty string when provided as string" |
| 58 | + ) |
| 59 | + |
| 60 | + |
| 61 | +def _validate_comparators(instance, attribute, value): # pylint: disable=unused-argument |
| 62 | + """Validate comparators are either BaseComparator instances or callable.""" |
| 63 | + for ext, comparator in value.items(): |
| 64 | + if not (isinstance(comparator, BaseComparator) or callable(comparator)): |
| 65 | + raise ValueError( |
| 66 | + f"Comparator for extension '{ext}' must be a BaseComparator instance " |
| 67 | + "or callable" |
| 68 | + ) |
| 69 | + |
| 70 | + |
@attrs.frozen
class ComparisonConfig:
    """Configuration class to store comparison settings.

    The regular expressions given in ``include_patterns``, ``exclude_patterns`` and in the
    ``"patterns"`` entries of ``specific_args`` are compiled once, when the instance is
    created. The mapping passed as ``specific_args`` is not modified by this step: an entry
    holding a ``"patterns"`` key is copied before that key is removed, so the same mapping
    can safely be reused to build several configurations.

    Attributes:
        include_patterns: A list of regular expression patterns. If the relative path of a
            file does not match any of these patterns, it is ignored during the comparison. Note
            that this means that any specific arguments for that file will also be ignored.
        exclude_patterns: A list of regular expression patterns. If the relative path of a
            file matches any of these patterns, it is ignored during the comparison. Note that
            this means that any specific arguments for that file will also be ignored.
        comparators: A ``dict`` to override the registered comparators.
        specific_args: A ``dict`` with the args/kwargs that should be given to the
            comparator for a given file. This ``dict`` should be like the following:

            .. code-block:: Python

                {
                    <relative_file_path>: {
                        comparator: ComparatorInstance,
                        args: [arg1, arg2, ...],
                        kwargs: {
                            kwarg_name_1: kwarg_value_1,
                            kwarg_name_2: kwarg_value_2,
                        }
                    },
                    <another_file_path>: {...},
                    <a name for this category>: {
                        "patterns": ["regex1", "regex2", ...],
                        ... (other arguments)
                    }
                }

            If the "patterns" entry is present, then the name is not considered and is only used as
            a helper for the user. When a "patterns" entry is detected, the other arguments are
            applied to all files whose relative name matches one of the given regular expression
            patterns. If a file could match multiple patterns of different groups, only the first
            one is considered.

            Note that all entries in this ``dict`` are optional.
        return_raw_diffs: If set to ``True``, only the raw differences are returned instead
            of a formatted report.
        export_formatted_files: If set to ``True`` or a non-empty string, create a
            new directory with formatted compared data files. If a string is passed, this string is
            used as suffix for the new directory. If ``True`` is passed, the suffix is
            ``_FORMATTED``.
        max_workers: Maximum number of worker threads/processes for parallel execution. If None,
            defaults to min(32, (os.cpu_count() or 1) + 4) as per executor default.
        executor_type: Type of executor to use for parallel execution. 'thread' uses
            ThreadPoolExecutor (better for I/O-bound tasks), 'process' uses ProcessPoolExecutor
            (better for CPU-bound tasks), 'sequential' disables parallel execution.
        compiled_include_patterns: The compiled version of ``include_patterns`` (computed).
        compiled_exclude_patterns: The compiled version of ``exclude_patterns`` (computed).
        pattern_specific_args: Maps each compiled pattern found in a ``"patterns"`` entry of
            ``specific_args`` to the other arguments of that entry (computed).
    """

    include_patterns: Optional[Iterable[str]] = attrs.field(
        default=None, converter=_convert_iterable_to_tuple
    )
    exclude_patterns: Optional[Iterable[str]] = attrs.field(
        default=None, converter=_convert_iterable_to_tuple
    )
    comparators: Optional[Dict[Optional[str], ComparatorType]] = attrs.field(
        default=None, validator=attrs.validators.optional(_validate_comparators)
    )
    specific_args: Optional[Dict[str, Dict[str, Any]]] = attrs.field(
        default=None, validator=attrs.validators.optional(_validate_specific_args)
    )
    return_raw_diffs: bool = attrs.field(default=False)
    export_formatted_files: Union[bool, str] = attrs.field(
        default=False, validator=_validate_export_formatted_files
    )
    executor_type: Literal["sequential", "thread", "process"] = attrs.field(
        default="sequential"
    )
    max_workers: Optional[int] = attrs.field(default=None)

    # Compiled patterns - computed once in __attrs_post_init__, not accepted by __init__
    compiled_include_patterns: Tuple[Pattern[str], ...] = attrs.field(init=False)
    compiled_exclude_patterns: Tuple[Pattern[str], ...] = attrs.field(init=False)
    pattern_specific_args: Dict[Pattern[str], Dict[str, Any]] = attrs.field(
        init=False, repr=False
    )

    def __attrs_post_init__(self):
        """Initialize computed fields after attrs initialization.

        Raises:
            ValueError: If one of the given regular expressions can not be compiled.
        """
        # The class is frozen, so every field is set through object.__setattr__
        try:
            compiled_include = self._compile_patterns(self.include_patterns)
        except ValueError as e:
            raise ValueError(f"Error in include_patterns: {e}") from e
        object.__setattr__(self, "compiled_include_patterns", compiled_include)

        try:
            compiled_exclude = self._compile_patterns(self.exclude_patterns)
        except ValueError as e:
            raise ValueError(f"Error in exclude_patterns: {e}") from e
        object.__setattr__(self, "compiled_exclude_patterns", compiled_exclude)

        # Setup specific args and pattern specific args. A new mapping is built so that the
        # mapping given by the caller keeps its "patterns" entries and can be reused.
        specific_args = {}
        pattern_specific_args = {}
        for file_path, entry in (self.specific_args or {}).items():
            if "patterns" not in entry:
                specific_args[file_path] = entry
                continue

            # Copy the entry without its "patterns" key instead of popping it in place
            entry_args = {key: val for key, val in entry.items() if key != "patterns"}
            specific_args[file_path] = entry_args
            for pattern in entry["patterns"]:
                try:
                    compiled_pattern = self._compile_pattern(pattern)
                except ValueError as e:
                    raise ValueError(
                        f"Error in specific_args['{file_path}']['patterns']: {e}"
                    ) from e
                pattern_specific_args[compiled_pattern] = entry_args

        object.__setattr__(self, "specific_args", specific_args)
        object.__setattr__(self, "pattern_specific_args", pattern_specific_args)

        # Setup comparators: use the registered ones when no override is given
        if self.comparators is None:
            object.__setattr__(self, "comparators", get_comparators())

    def _compile_pattern(self, pattern: str) -> Pattern[str]:
        """Compile a regex pattern.

        Raises:
            ValueError: If ``pattern`` is not a valid regular expression.
        """
        try:
            return re.compile(pattern)
        except re.error as e:
            raise ValueError(f"Invalid regex pattern: '{pattern}'") from e

    def _compile_patterns(
        self, patterns: Optional[Iterable[str]]
    ) -> Tuple[Pattern[str], ...]:
        """Compile regex patterns from any iterable to tuple (empty if ``patterns`` is None)."""
        if patterns is None:
            return ()
        return tuple(self._compile_pattern(pattern) for pattern in patterns)

    def should_ignore_file(self, relative_path: str) -> bool:
        """Check if a file should be ignored.

        A file is ignored when include patterns exist and none of them matches its relative
        path, or when at least one exclude pattern matches it. Patterns are applied with
        ``match``, i.e. they are anchored at the beginning of the relative path.
        """
        # Check inclusion patterns first
        if self.compiled_include_patterns:
            included = any(
                pattern.match(relative_path)
                for pattern in self.compiled_include_patterns
            )
            if not included:
                return True

        # Check exclusion patterns
        return any(
            pattern.match(relative_path) for pattern in self.compiled_exclude_patterns
        )
0 commit comments