Skip to content

Commit 1725f45

Browse files
Refactor: Reorganize the code (#90)
1 parent c0e7a83 commit 1725f45

File tree

15 files changed

+1047
-900
lines changed

15 files changed

+1047
-900
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
This project provides simple tools to compare the content of a directory against a reference
1111
directory.
1212

13-
This is useful to check the results of a process that generate several files, like a luigi
13+
This is useful to check the results of a process that generates several files, like a luigi
1414
workflow for example.
1515

1616

dir_content_diff/__init__.py

Lines changed: 31 additions & 800 deletions
Large diffs are not rendered by default.

dir_content_diff/base_comparators.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -513,7 +513,7 @@ def _cast_from_attribute(text, attr):
513513
elif str(text).lower() == "false":
514514
res = False
515515
else:
516-
raise ValueError("Bool attributes expect 'true' or 'false'.")
516+
raise ValueError("Boolean attributes expect 'true' or 'false'.")
517517
elif value_type == "list":
518518
res = []
519519
elif value_type == "dict":

dir_content_diff/cli/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@
1919
import click
2020
from yaml import safe_load
2121

22-
from dir_content_diff import _DEFAULT_EXPORT_SUFFIX
2322
from dir_content_diff import compare_files
2423
from dir_content_diff import compare_trees
2524
from dir_content_diff import export_formatted_file
2625
from dir_content_diff import pick_comparator
26+
from dir_content_diff.core import _DEFAULT_EXPORT_SUFFIX
2727
from dir_content_diff.util import LOGGER
2828

2929

dir_content_diff/config.py

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
"""Configuration classes for directory content comparison.
2+
3+
This module contains the configuration classes and validation functions.
4+
"""
5+
6+
# LICENSE HEADER MANAGED BY add-license-header
7+
# Copyright (c) 2023-2025 Blue Brain Project, EPFL.
8+
#
9+
# This file is part of dir-content-diff.
10+
# See https://github.com/BlueBrain/dir-content-diff for further info.
11+
#
12+
# SPDX-License-Identifier: Apache-2.0
13+
# LICENSE HEADER MANAGED BY add-license-header
14+
15+
import re
16+
from collections.abc import Callable
17+
from typing import Any
18+
from typing import Dict
19+
from typing import Iterable
20+
from typing import Literal
21+
from typing import Optional
22+
from typing import Pattern
23+
from typing import Tuple
24+
from typing import Union
25+
26+
import attrs
27+
28+
from dir_content_diff.base_comparators import BaseComparator
29+
from dir_content_diff.registry import get_comparators
30+
31+
# Type alias for comparators
32+
ComparatorType = Union[BaseComparator, Callable]
33+
34+
35+
def _convert_iterable_to_tuple(
36+
x: Optional[Iterable[str]],
37+
) -> Optional[Tuple[str, ...]]:
38+
"""Convert an iterable to a tuple, or return None."""
39+
if x is None:
40+
return None
41+
return tuple(x)
42+
43+
44+
def _validate_specific_args(instance, attribute, value): # pylint: disable=unused-argument
45+
"""Validate specific_args structure."""
46+
for file_path, args in value.items():
47+
if not isinstance(args, dict):
48+
raise ValueError(f"specific_args['{file_path}'] must be a dictionary")
49+
# Note: regex patterns in specific_args will be validated during compilation
50+
# in __attrs_post_init__, so no need to validate them here
51+
52+
53+
def _validate_export_formatted_files(instance, attribute, value): # pylint: disable=unused-argument
54+
"""Validate export_formatted_files is either bool or non-empty string."""
55+
if isinstance(value, str) and len(value.strip()) == 0:
56+
raise ValueError(
57+
"export_formatted_files must be a non-empty string when provided as string"
58+
)
59+
60+
61+
def _validate_comparators(instance, attribute, value): # pylint: disable=unused-argument
62+
"""Validate comparators are either BaseComparator instances or callable."""
63+
for ext, comparator in value.items():
64+
if not (isinstance(comparator, BaseComparator) or callable(comparator)):
65+
raise ValueError(
66+
f"Comparator for extension '{ext}' must be a BaseComparator instance "
67+
"or callable"
68+
)
69+
70+
71+
@attrs.frozen
class ComparisonConfig:
    """Configuration class to store comparison settings.

    Attributes:
        include_patterns: A list of regular expression patterns. If the relative path of a
            file does not match any of these patterns, it is ignored during the comparison. Note
            that this means that any specific arguments for that file will also be ignored.
        exclude_patterns: A list of regular expression patterns. If the relative path of a
            file matches any of these patterns, it is ignored during the comparison. Note that
            this means that any specific arguments for that file will also be ignored.
        comparators: A ``dict`` to override the registered comparators.
        specific_args: A ``dict`` with the args/kwargs that should be given to the
            comparator for a given file. This ``dict`` should be like the following:

            .. code-block:: Python

                {
                    <relative_file_path>: {
                        comparator: ComparatorInstance,
                        args: [arg1, arg2, ...],
                        kwargs: {
                            kwarg_name_1: kwarg_value_1,
                            kwarg_name_2: kwarg_value_2,
                        }
                    },
                    <another_file_path>: {...},
                    <a name for this category>: {
                        "patterns": ["regex1", "regex2", ...],
                        ... (other arguments)
                    }
                }

            If the "patterns" entry is present, then the name is not considered and is only used as
            a helper for the user. When a "patterns" entry is detected, the other arguments are
            applied to all files whose relative name matches one of the given regular expression
            patterns. If a file could match multiple patterns of different groups, only the first
            one is considered.

            Note that all entries in this ``dict`` are optional. The given ``dict`` is not
            modified: the configuration stores shallow copies of its entries.
        return_raw_diffs: If set to ``True``, only the raw differences are returned instead
            of a formatted report.
        export_formatted_files: If set to ``True`` or a non-empty string, create a
            new directory with formatted compared data files. If a string is passed, this string is
            used as suffix for the new directory. If `True` is passed, the suffix is
            ``_FORMATTED``.
        max_workers: Maximum number of worker threads/processes for parallel execution. If None,
            defaults to min(32, (os.cpu_count() or 1) + 4) as per executor default.
        executor_type: Type of executor to use for parallel execution. 'thread' uses
            ThreadPoolExecutor (better for I/O-bound tasks), 'process' uses ProcessPoolExecutor
            (better for CPU-bound tasks), 'sequential' disables parallel execution.
        compiled_include_patterns: Compiled version of ``include_patterns`` (computed).
        compiled_exclude_patterns: Compiled version of ``exclude_patterns`` (computed).
        pattern_specific_args: Mapping from each compiled pattern found in a "patterns" entry
            of ``specific_args`` to the other arguments of that entry (computed).
    """

    include_patterns: Optional[Iterable[str]] = attrs.field(
        default=None, converter=_convert_iterable_to_tuple
    )
    exclude_patterns: Optional[Iterable[str]] = attrs.field(
        default=None, converter=_convert_iterable_to_tuple
    )
    comparators: Optional[Dict[Optional[str], ComparatorType]] = attrs.field(
        default=None, validator=attrs.validators.optional(_validate_comparators)
    )
    specific_args: Optional[Dict[str, Dict[str, Any]]] = attrs.field(
        default=None, validator=attrs.validators.optional(_validate_specific_args)
    )
    return_raw_diffs: bool = attrs.field(default=False)
    export_formatted_files: Union[bool, str] = attrs.field(
        default=False, validator=_validate_export_formatted_files
    )
    executor_type: Literal["sequential", "thread", "process"] = attrs.field(
        default="sequential"
    )
    max_workers: Optional[int] = attrs.field(default=None)

    # Computed once in __attrs_post_init__ and stored as plain attributes
    compiled_include_patterns: Tuple[Pattern[str], ...] = attrs.field(init=False)
    compiled_exclude_patterns: Tuple[Pattern[str], ...] = attrs.field(init=False)
    pattern_specific_args: Dict[Pattern[str], Dict[str, Any]] = attrs.field(
        init=False, repr=False
    )

    def __attrs_post_init__(self):
        """Initialize computed fields after attrs initialization.

        Compiles the include/exclude patterns, extracts the "patterns" groups of
        ``specific_args`` into ``pattern_specific_args`` and falls back to the registered
        comparators when none are given.

        Raises:
            ValueError: If any regular expression fails to compile. The message names the
                setting that holds the invalid pattern.
        """
        # The class is frozen, so computed values are stored with object.__setattr__.
        try:
            compiled_include = self._compile_patterns(self.include_patterns)
        except ValueError as e:
            raise ValueError(f"Error in include_patterns: {e}") from e
        object.__setattr__(self, "compiled_include_patterns", compiled_include)

        try:
            compiled_exclude = self._compile_patterns(self.exclude_patterns)
        except ValueError as e:
            raise ValueError(f"Error in exclude_patterns: {e}") from e
        object.__setattr__(self, "compiled_exclude_patterns", compiled_exclude)

        # Work on shallow copies of the entries: removing the "patterns" key below must not
        # alter the dict given by the caller, otherwise reusing that dict to build another
        # configuration would silently lose the pattern groups.
        specific_args = {
            file_path: dict(args) for file_path, args in (self.specific_args or {}).items()
        }

        pattern_specific_args = {}
        for file_path, args in specific_args.items():
            if "patterns" not in args:
                continue
            patterns = args.pop("patterns")
            for pattern in patterns:
                try:
                    compiled_pattern = self._compile_pattern(pattern)
                except ValueError as e:
                    raise ValueError(
                        f"Error in specific_args['{file_path}']['patterns']: {e}"
                    ) from e
                pattern_specific_args[compiled_pattern] = args

        object.__setattr__(self, "specific_args", specific_args)
        object.__setattr__(self, "pattern_specific_args", pattern_specific_args)

        # Setup comparators
        if self.comparators is None:
            object.__setattr__(self, "comparators", get_comparators())

    def _compile_pattern(self, pattern: str) -> Pattern[str]:
        """Compile a regex pattern.

        Raises:
            ValueError: If ``pattern`` is not a valid regular expression.
        """
        try:
            return re.compile(pattern)
        except re.error as e:
            raise ValueError(f"Invalid regex pattern: '{pattern}'") from e

    def _compile_patterns(
        self, patterns: Optional[Iterable[str]]
    ) -> Tuple[Pattern[str], ...]:
        """Compile regex patterns from any iterable to tuple (empty when ``None``)."""
        if patterns is None:
            return ()
        return tuple(self._compile_pattern(pattern) for pattern in patterns)

    def should_ignore_file(self, relative_path: str) -> bool:
        """Check if a file should be ignored.

        Patterns are tested with ``Pattern.match``, i.e. anchored at the start of
        ``relative_path``.

        Args:
            relative_path: The path of the file relative to the compared directory.

        Returns:
            ``True`` if include patterns exist and none of them matches, or if any exclude
            pattern matches; ``False`` otherwise.
        """
        # Check inclusion patterns first
        if self.compiled_include_patterns:
            included = any(
                pattern.match(relative_path)
                for pattern in self.compiled_include_patterns
            )
            if not included:
                return True

        # Check exclusion patterns
        return any(
            pattern.match(relative_path) for pattern in self.compiled_exclude_patterns
        )

0 commit comments

Comments
 (0)