Skip to content

Commit 0df4c01

Browse files
committed
[SPARK-43502][PYTHON][CONNECT] `DataFrame.drop` should accept empty column
### What changes were proposed in this pull request? Make `DataFrame.drop` accept empty column ### Why are the changes needed? to be consistent with vanilla PySpark ### Does this PR introduce _any_ user-facing change? yes ``` In [1]: df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age")) In [2]: df.drop() ``` before: ``` In [2]: df.drop() --------------------------------------------------------------------------- PySparkValueError Traceback (most recent call last) Cell In[2], line 1 ----> 1 df.drop() File ~/Dev/spark/python/pyspark/sql/connect/dataframe.py:449, in DataFrame.drop(self, *cols) 444 raise PySparkTypeError( 445 error_class="NOT_COLUMN_OR_STR", 446 message_parameters={"arg_name": "cols", "arg_type": type(cols).__name__}, 447 ) 448 if len(_cols) == 0: --> 449 raise PySparkValueError( 450 error_class="CANNOT_BE_EMPTY", 451 message_parameters={"item": "cols"}, 452 ) 454 return DataFrame.withPlan( 455 plan.Drop( 456 child=self._plan, (...) 459 session=self._session, 460 ) PySparkValueError: [CANNOT_BE_EMPTY] At least one cols must be specified. ``` after ``` In [2]: df.drop() Out[2]: DataFrame[id: bigint, age: bigint] ``` ### How was this patch tested? enabled UT Closes #41180 from zhengruifeng/connect_drop_empty_col. Authored-by: Ruifeng Zheng <[email protected]> Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent 4bf979c commit 0df4c01

File tree

3 files changed

+2
-11
lines changed

3 files changed

+2
-11
lines changed

python/pyspark/sql/connect/dataframe.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -445,11 +445,6 @@ def drop(self, *cols: "ColumnOrName") -> "DataFrame":
445445
error_class="NOT_COLUMN_OR_STR",
446446
message_parameters={"arg_name": "cols", "arg_type": type(cols).__name__},
447447
)
448-
if len(_cols) == 0:
449-
raise PySparkValueError(
450-
error_class="CANNOT_BE_EMPTY",
451-
message_parameters={"item": "cols"},
452-
)
453448

454449
return DataFrame.withPlan(
455450
plan.Drop(

python/pyspark/sql/connect/plan.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -664,7 +664,8 @@ def __init__(
664664
columns: List[Union[Column, str]],
665665
) -> None:
666666
super().__init__(child)
667-
assert len(columns) > 0 and all(isinstance(c, (Column, str)) for c in columns)
667+
if len(columns) > 0:
668+
assert all(isinstance(c, (Column, str)) for c in columns)
668669
self._columns = columns
669670

670671
def plan(self, session: "SparkConnectClient") -> proto.Relation:

python/pyspark/sql/tests/connect/test_parity_dataframe.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,6 @@ def test_to_pandas_with_duplicated_column_names(self):
8484
def test_to_pandas_from_mixed_dataframe(self):
8585
self.check_to_pandas_from_mixed_dataframe()
8686

87-
# TODO(SPARK-43502): DataFrame.drop should support empty column
88-
@unittest.skip("Fails in Spark Connect, should enable.")
89-
def test_drop_empty_column(self):
90-
super().test_drop_empty_column()
91-
9287

9388
if __name__ == "__main__":
9489
import unittest

0 commit comments

Comments
 (0)