[SPARK-54478][SPARK-54479][SPARK-54480][SPARK-54484] Re-enable streaming tests for connect compat test CI

HeartSaVioR · HeartSaVioR · commit aecd932c34d0 · 2025-12-01T17:00:28.000+09:00
### What changes were proposed in this pull request?

This PR proposes to re-enable streaming tests for connect compatibility test CI.

### Why are the changes needed?

These tests were disabled due to failures, but I can't reproduce those failures either locally or in CI after installing zstandard.

Code change to trigger the compatibility test CI against the test branch and install zstandard: master...HeartSaVioR:spark:WIP-investigate-ss-spark-connect-compat-test-failures-master-and-4.0

Code change to re-enable these tests while reproducing: branch-4.0...HeartSaVioR:spark:branch-4.0-SC-213385

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

GA run with the above reproducer setup: https://github.com/HeartSaVioR/spark/actions/runs/19807973545/job/56745231698

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #53266 from HeartSaVioR/reenable-streaming-connect-tests.

Authored-by: Jungtaek Lim <kabhwan.opensource@gmail.com>
Signed-off-by: Jungtaek Lim <kabhwan.opensource@gmail.com>
diff --git a/python/pyspark/sql/streaming/readwriter.py b/python/pyspark/sql/streaming/readwriter.py
@@ -1568,7 +1568,6 @@ def foreach(self, f: Union[Callable[[Row], None], "SupportsProcess"]) -> "DataSt
         self._jwrite.foreach(jForeachWriter)
         return self
 
-    # SPARK-54478: Reenable doctest
     def foreachBatch(self, func: Callable[["DataFrame", int], None]) -> "DataStreamWriter":
         """
         Sets the output of the streaming query to be processed using the provided
@@ -1601,9 +1600,9 @@ def foreachBatch(self, func: Callable[["DataFrame", int], None]) -> "DataStreamW
         ...     my_value = 100
         ...     batch_df.collect()
         ...
-        >>> q = df.writeStream.foreachBatch(func).start()  # doctest: +SKIP
-        >>> time.sleep(3)  # doctest: +SKIP
-        >>> q.stop()  # doctest: +SKIP
+        >>> q = df.writeStream.foreachBatch(func).start()
+        >>> time.sleep(3)
+        >>> q.stop()
         >>> # if in Spark Connect, my_value = -1, else my_value = 100
         """
         from py4j.java_gateway import java_import
diff --git a/python/pyspark/sql/tests/connect/pandas/streaming/test_parity_pandas_grouped_map_with_state.py b/python/pyspark/sql/tests/connect/pandas/streaming/test_parity_pandas_grouped_map_with_state.py
@@ -15,17 +15,13 @@
 # limitations under the License.
 #
 import unittest
-import os
 
 from pyspark.sql.tests.pandas.test_pandas_grouped_map_with_state import (
     GroupedApplyInPandasWithStateTestsMixin,
 )
 from pyspark.testing.connectutils import ReusedConnectTestCase
 
 
-@unittest.skipIf(
-    os.environ.get("SPARK_SKIP_CONNECT_COMPAT_TESTS") == "1", "SPARK-54479: To be reenabled"
-)
 class GroupedApplyInPandasWithStateTests(
     GroupedApplyInPandasWithStateTestsMixin, ReusedConnectTestCase
 ):
diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py b/python/pyspark/sql/tests/connect/streaming/test_parity_foreach_batch.py
@@ -16,7 +16,6 @@
 #
 
 import unittest
-import os
 
 from pyspark.sql.tests.streaming.test_streaming_foreach_batch import StreamingTestsForeachBatchMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase, should_test_connect
@@ -26,9 +25,6 @@
     from pyspark.errors.exceptions.connect import StreamingPythonRunnerInitializationException
 
 
-@unittest.skipIf(
-    os.environ.get("SPARK_SKIP_CONNECT_COMPAT_TESTS") == "1", "SPARK-54480: To be reenabled"
-)
 class StreamingForeachBatchParityTests(StreamingTestsForeachBatchMixin, ReusedConnectTestCase):
     def test_streaming_foreach_batch_propagates_python_errors(self):
         super().test_streaming_foreach_batch_propagates_python_errors()
diff --git a/python/pyspark/sql/tests/test_python_streaming_datasource.py b/python/pyspark/sql/tests/test_python_streaming_datasource.py
@@ -139,9 +139,6 @@ def streamWriter(self, schema, overwrite):
 
         return TestDataSource
 
-    @unittest.skipIf(
-        os.environ.get("SPARK_SKIP_CONNECT_COMPAT_TESTS") == "1", "SPARK-54484: To be reenabled"
-    )
     def test_stream_reader(self):
         self.spark.dataSource.register(self._get_test_data_source())
         df = self.spark.readStream.format("TestDataSource").load()
@@ -216,7 +213,6 @@ def streamReader(self, schema):
 
         assertDataFrameEqual(df, expected_data)
 
-    @unittest.skipIf(os.environ.get("SPARK_SKIP_CONNECT_COMPAT_TESTS") == "1", "To be reenabled")
     def test_simple_stream_reader(self):
         class SimpleStreamReader(SimpleDataSourceStreamReader):
             def initialOffset(self):