Skip to content

Commit 7a49424

Browse files
Fix: Add an auto_format parameter to the ValidationResultSet constructor (#62)
1 parent 08dfd07 commit 7a49424

File tree

6 files changed

+46 additions
-15 deletions

data_validation_framework/result.py

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -61,14 +61,9 @@ class ValidationResultSet(pd.DataFrame):
6161
"exception": None,
6262
}
6363

64-
def __init__(self, *args, output_columns=None, **kwargs):
64+
def __init__(self, *args, output_columns=None, auto_format=False, **kwargs):
6565
super().__init__(*args, **kwargs)
66-
67-
# Skip weird states due to Pandas and tqdm interactions
68-
data = kwargs.get("data", args[0] if len(args) > 0 else None)
69-
if not isinstance(
70-
data, (ValidationResultSet, pd.core.internals.managers.BlockManager)
71-
) and not set(self.index).intersection(set(ValidationResultSet.out_cols.keys())):
66+
if auto_format:
7267
self.format_data(output_columns)
7368

7469
@property

data_validation_framework/task.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -573,7 +573,7 @@ def run(self):
573573
)
574574

575575
# Format the DataFrame
576-
df = ValidationResultSet(new_df, output_columns=self.output_columns)
576+
df = ValidationResultSet(new_df, output_columns=self.output_columns, auto_format=True)
577577

578578
# Copy current index
579579
index = df.index.copy()
@@ -703,7 +703,7 @@ def _process(self, df, *args, **kwargs):
703703
df.loc[df["comment"].isnull(), "comment"] = ""
704704
df.loc[df["exception"].isnull(), "exception"] = ""
705705

706-
return df
706+
return ValidationResultSet(df, auto_format=True)
707707

708708

709709
class ValidationWorkflow(SetValidationTask, luigi.WrapperTask):

setup.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
"tqdm>=4.40",
1515
]
1616
doc_reqs = [
17+
"docutils<0.21", # Temporary fix for m2r2
1718
"m2r2",
1819
"sphinx",
1920
"sphinx-bluebrain-theme",

tests/test_result.py

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ class TestValidationResultSet:
3434

3535
def test_defaults(self):
3636
"""Check defaults."""
37-
df = result.ValidationResultSet(index=["a", "b"])
37+
df = result.ValidationResultSet(index=["a", "b"], auto_format=True)
3838
assert df.to_dict() == {
3939
"is_valid": {"a": True, "b": True},
4040
"ret_code": {"a": 0, "b": 0},
@@ -44,7 +44,7 @@ def test_defaults(self):
4444

4545
def test_defaults_with_extra_data(self):
4646
"""Check defaults with extra data."""
47-
df = result.ValidationResultSet({"a": [1, 2], "b": [3, 4]})
47+
df = result.ValidationResultSet({"a": [1, 2], "b": [3, 4]}, auto_format=True)
4848
assert df.to_dict() == {
4949
"is_valid": {0: True, 1: True},
5050
"ret_code": {0: 0, 1: 0},
@@ -56,7 +56,7 @@ def test_defaults_with_extra_data(self):
5656

5757
def test_default_ret_code(self):
5858
"""Check default return code."""
59-
df = result.ValidationResultSet({"is_valid": [True, False, True, False]})
59+
df = result.ValidationResultSet({"is_valid": [True, False, True, False]}, auto_format=True)
6060
assert df.to_dict() == {
6161
"is_valid": {0: True, 1: False, 2: True, 3: False},
6262
"ret_code": {0: 0, 1: 1, 2: 0, 3: 1},
@@ -67,7 +67,9 @@ def test_default_ret_code(self):
6767
def test_output_columns(self):
6868
"""Check output_columns argument."""
6969
df = result.ValidationResultSet(
70-
index=["a", "b"], output_columns={"col1": "val1", "col2": "val2"}
70+
index=["a", "b"],
71+
output_columns={"col1": "val1", "col2": "val2"},
72+
auto_format=True,
7173
)
7274
assert df.to_dict() == {
7375
"is_valid": {"a": True, "b": True},
@@ -84,6 +86,7 @@ def test_output_columns_with_existing_columns(self):
8486
{"is_valid": [True, False], "col1": ["test1", "test2"]},
8587
index=["a", "b"],
8688
output_columns={"col1": "val1", "col2": "val2"},
89+
auto_format=True,
8790
)
8891
assert df.to_dict() == {
8992
"is_valid": {"a": True, "b": False},
@@ -126,9 +129,11 @@ def test_column_order(self):
126129
{"is_valid": [True, False], "col1": ["test1", "test2"]},
127130
index=["a", "b"],
128131
output_columns={"col1": "val1", "col2": "val2"},
132+
auto_format=True,
129133
)
130134
df_order = result.ValidationResultSet(
131-
pd.DataFrame(df[["col2", "comment", "is_valid", "col1", "exception", "ret_code"]])
135+
pd.DataFrame(df[["col2", "comment", "is_valid", "col1", "exception", "ret_code"]]),
136+
auto_format=True,
132137
)
133138
assert df_order.columns.tolist() == [
134139
"is_valid",
@@ -153,7 +158,8 @@ def test_column_order(self):
153158
"ret_code",
154159
]
155160
]
156-
)
161+
),
162+
auto_format=True,
157163
)
158164
assert df_order.columns.tolist() == [
159165
"is_valid",
@@ -171,6 +177,7 @@ def test_pandas_method(self):
171177
{"is_valid": [True, False], "col1": ["test1", "test2"]},
172178
index=["a", "b"],
173179
output_columns={"col1": "val1", "col2": "val2"},
180+
auto_format=True,
174181
)
175182
new_df = result.ValidationResultSet(df)
176183
assert new_df.apply(lambda x: x.ret_code * 5, axis=1).to_dict() == {"a": 0, "b": 5}

tests/test_task.py

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1975,6 +1975,33 @@ def inputs(self):
19751975
assert (tmpdir / "TestWorkflow" / "report.csv").exists()
19761976
assert not (tmpdir / "report.pdf").exists()
19771977

1978+
def test_recursive_error(self, tmpdir, dataset_df_path):
1979+
"""Check using as a pandas.DataFrame."""
1980+
1981+
class TestTask(task.ElementValidationTask):
1982+
output_columns = {"transformed_morph_path": None}
1983+
1984+
@staticmethod
1985+
def validation_function(row, data_dir):
1986+
return result.ValidationResult(
1987+
is_valid=True, transformed_morph_path="/tmp/test_path"
1988+
)
1989+
1990+
class TestWorkflow(task.ValidationWorkflow):
1991+
args = ["transformed_dataset.csv", 2]
1992+
1993+
@staticmethod
1994+
def validation_function(df, data_dir, df_path="reduced_df.csv", parents=None):
1995+
print(df) # The recursive error was happening here
1996+
1997+
def inputs(self):
1998+
return {TestTask: {}}
1999+
2000+
assert luigi.build(
2001+
[TestWorkflow(dataset_df=dataset_df_path, result_path=str(tmpdir))],
2002+
local_scheduler=True,
2003+
)
2004+
19782005
class TestReport:
19792006
"""Test the report generation after workflow run."""
19802007

tests/test_util.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,7 @@ def test_apply_to_df(nb_processes, redirect_stdout):
8484
}
8585
),
8686
output_columns={"new_data": None, "pid": None},
87+
auto_format=True,
8788
)
8889

8990
res = util.apply_to_df(

0 commit comments

Comments
 (0)