Fix: Add an auto_format parameter to the ValidationResultSet constructor (#62)

adrien-berchet · web-flow · commit 7a4942473714 · 2024-04-26T15:27:05.000+02:00
diff --git a/data_validation_framework/result.py b/data_validation_framework/result.py
@@ -61,14 +61,9 @@ class ValidationResultSet(pd.DataFrame):
         "exception": None,
     }
 
-    def __init__(self, *args, output_columns=None, **kwargs):
+    def __init__(self, *args, output_columns=None, auto_format=False, **kwargs):
         super().__init__(*args, **kwargs)
-
-        # Skip weird states due to Pandas and tqdm interactions
-        data = kwargs.get("data", args[0] if len(args) > 0 else None)
-        if not isinstance(
-            data, (ValidationResultSet, pd.core.internals.managers.BlockManager)
-        ) and not set(self.index).intersection(set(ValidationResultSet.out_cols.keys())):
+        if auto_format:
             self.format_data(output_columns)
 
     @property
diff --git a/data_validation_framework/task.py b/data_validation_framework/task.py
@@ -573,7 +573,7 @@ def run(self):
             )
 
         # Format the DataFrame
-        df = ValidationResultSet(new_df, output_columns=self.output_columns)
+        df = ValidationResultSet(new_df, output_columns=self.output_columns, auto_format=True)
 
         # Copy current index
         index = df.index.copy()
@@ -703,7 +703,7 @@ def _process(self, df, *args, **kwargs):
         df.loc[df["comment"].isnull(), "comment"] = ""
         df.loc[df["exception"].isnull(), "exception"] = ""
 
-        return df
+        return ValidationResultSet(df, auto_format=True)
 
 
 class ValidationWorkflow(SetValidationTask, luigi.WrapperTask):
diff --git a/setup.py b/setup.py
@@ -14,6 +14,7 @@
     "tqdm>=4.40",
 ]
 doc_reqs = [
+    "docutils<0.21",  # Temporary fix for m2r2
     "m2r2",
     "sphinx",
     "sphinx-bluebrain-theme",
diff --git a/tests/test_result.py b/tests/test_result.py
@@ -34,7 +34,7 @@ class TestValidationResultSet:
 
     def test_defaults(self):
         """Check defaults."""
-        df = result.ValidationResultSet(index=["a", "b"])
+        df = result.ValidationResultSet(index=["a", "b"], auto_format=True)
         assert df.to_dict() == {
             "is_valid": {"a": True, "b": True},
             "ret_code": {"a": 0, "b": 0},
@@ -44,7 +44,7 @@ def test_defaults(self):
 
     def test_defaults_with_extra_data(self):
         """Check defaults with extra data."""
-        df = result.ValidationResultSet({"a": [1, 2], "b": [3, 4]})
+        df = result.ValidationResultSet({"a": [1, 2], "b": [3, 4]}, auto_format=True)
         assert df.to_dict() == {
             "is_valid": {0: True, 1: True},
             "ret_code": {0: 0, 1: 0},
@@ -56,7 +56,7 @@ def test_defaults_with_extra_data(self):
 
     def test_default_ret_code(self):
         """Check default return code."""
-        df = result.ValidationResultSet({"is_valid": [True, False, True, False]})
+        df = result.ValidationResultSet({"is_valid": [True, False, True, False]}, auto_format=True)
         assert df.to_dict() == {
             "is_valid": {0: True, 1: False, 2: True, 3: False},
             "ret_code": {0: 0, 1: 1, 2: 0, 3: 1},
@@ -67,7 +67,9 @@ def test_default_ret_code(self):
     def test_output_columns(self):
         """Check output_columns argument."""
         df = result.ValidationResultSet(
-            index=["a", "b"], output_columns={"col1": "val1", "col2": "val2"}
+            index=["a", "b"],
+            output_columns={"col1": "val1", "col2": "val2"},
+            auto_format=True,
         )
         assert df.to_dict() == {
             "is_valid": {"a": True, "b": True},
@@ -84,6 +86,7 @@ def test_output_columns_with_existing_columns(self):
             {"is_valid": [True, False], "col1": ["test1", "test2"]},
             index=["a", "b"],
             output_columns={"col1": "val1", "col2": "val2"},
+            auto_format=True,
         )
         assert df.to_dict() == {
             "is_valid": {"a": True, "b": False},
@@ -126,9 +129,11 @@ def test_column_order(self):
             {"is_valid": [True, False], "col1": ["test1", "test2"]},
             index=["a", "b"],
             output_columns={"col1": "val1", "col2": "val2"},
+            auto_format=True,
         )
         df_order = result.ValidationResultSet(
-            pd.DataFrame(df[["col2", "comment", "is_valid", "col1", "exception", "ret_code"]])
+            pd.DataFrame(df[["col2", "comment", "is_valid", "col1", "exception", "ret_code"]]),
+            auto_format=True,
         )
         assert df_order.columns.tolist() == [
             "is_valid",
@@ -153,7 +158,8 @@ def test_column_order(self):
                         "ret_code",
                     ]
                 ]
-            )
+            ),
+            auto_format=True,
         )
         assert df_order.columns.tolist() == [
             "is_valid",
@@ -171,6 +177,7 @@ def test_pandas_method(self):
             {"is_valid": [True, False], "col1": ["test1", "test2"]},
             index=["a", "b"],
             output_columns={"col1": "val1", "col2": "val2"},
+            auto_format=True,
         )
         new_df = result.ValidationResultSet(df)
         assert new_df.apply(lambda x: x.ret_code * 5, axis=1).to_dict() == {"a": 0, "b": 5}
diff --git a/tests/test_task.py b/tests/test_task.py
@@ -1975,6 +1975,33 @@ def inputs(self):
         assert (tmpdir / "TestWorkflow" / "report.csv").exists()
         assert not (tmpdir / "report.pdf").exists()
 
+    def test_recursive_error(self, tmpdir, dataset_df_path):
+        """Check using as a pandas.DataFrame."""
+
+        class TestTask(task.ElementValidationTask):
+            output_columns = {"transformed_morph_path": None}
+
+            @staticmethod
+            def validation_function(row, data_dir):
+                return result.ValidationResult(
+                    is_valid=True, transformed_morph_path="/tmp/test_path"
+                )
+
+        class TestWorkflow(task.ValidationWorkflow):
+            args = ["transformed_dataset.csv", 2]
+
+            @staticmethod
+            def validation_function(df, data_dir, df_path="reduced_df.csv", parents=None):
+                print(df)  # The recursive error was happening here
+
+            def inputs(self):
+                return {TestTask: {}}
+
+        assert luigi.build(
+            [TestWorkflow(dataset_df=dataset_df_path, result_path=str(tmpdir))],
+            local_scheduler=True,
+        )
+
     class TestReport:
         """Test the report generation after workflow run."""
 
diff --git a/tests/test_util.py b/tests/test_util.py
@@ -84,6 +84,7 @@ def test_apply_to_df(nb_processes, redirect_stdout):
             }
         ),
         output_columns={"new_data": None, "pid": None},
+        auto_format=True,
     )
 
     res = util.apply_to_df(

Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,7 @@`
`14`	`14`	`"tqdm>=4.40",`
`15`	`15`	`]`
`16`	`16`	`doc_reqs = [`
	`17`	`+ "docutils<0.21", # Temporary fix for m2r2`
`17`	`18`	`"m2r2",`
`18`	`19`	`"sphinx",`
`19`	`20`	`"sphinx-bluebrain-theme",`
Original file line number	Diff line number	Diff line change
`@@ -84,6 +84,7 @@ def test_apply_to_df(nb_processes, redirect_stdout):`
`84`	`84`	`}`
`85`	`85`	`),`
`86`	`86`	`output_columns={"new_data": None, "pid": None},`
	`87`	`+ auto_format=True,`
`87`	`88`	`)`
`88`	`89`
`89`	`90`	`res = util.apply_to_df(`