
Commit 46bd9cc

wengh authored and allisonwang-db committed
[SPARK-51575][PYTHON] Combine Python Data Source pushdown & plan read workers
[SPARK-51575][PYTHON] Combine Python Data Source pushdown & plan read workers

Follow-up of #49961.

### What changes were proposed in this pull request?

As pointed out by #49961 (comment), at the time of filter pushdown we already have enough information to also plan the read partitions. So this PR changes the filter pushdown worker to also get partitions, reducing the number of exchanges between Python and Scala.

Changes:
- Extract the part of `plan_data_source_read.py` that is responsible for sending the partitions and the read function to the JVM.
- Use the extracted logic to also send the partitions and the read function when doing filter pushdown in `data_source_pushdown_filters.py`.
- Update the Scala code accordingly.

### Why are the changes needed?

To improve Python Data Source performance when the filter pushdown configuration is enabled.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing tests in `test_python_datasource.py`.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #50340 from wengh/pyds-combine-pushdown-plan.

Authored-by: Haoyu Weng <[email protected]>
Signed-off-by: Allison Wang <[email protected]>
1 parent b829aea commit 46bd9cc
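For context, here is a minimal sketch (not part of this commit) of a user-defined Python data source that exercises the combined path: with filter pushdown enabled, `pushFilters`, `partitions`, and the pickling of `read` now all happen in the same Python worker call. The class names follow the public `pyspark.sql.datasource` API; treat the exact filter field layout (e.g. `EqualTo.attribute` being a tuple of name parts) as an assumption.

```python
from typing import Iterator, List

from pyspark.sql.datasource import (
    DataSource,
    DataSourceReader,
    EqualTo,
    Filter,
    InputPartition,
)


class RangeReader(DataSourceReader):
    def __init__(self, options: dict):
        self.start = 0
        self.end = int(options.get("end", 100))

    def pushFilters(self, filters: List[Filter]) -> Iterator[Filter]:
        # Accept `id = <n>` filters; yield everything else back as unsupported.
        for f in filters:
            if isinstance(f, EqualTo) and f.attribute == ("id",):
                self.start, self.end = f.value, f.value + 1
            else:
                yield f

    def partitions(self) -> List[InputPartition]:
        # With this change, partition planning runs right after pushFilters in
        # the same worker, so start/end already reflect the pushed-down filters.
        mid = (self.start + self.end) // 2
        return [InputPartition((self.start, mid)), InputPartition((mid, self.end))]

    def read(self, partition: InputPartition) -> Iterator[tuple]:
        lo, hi = partition.value
        for i in range(lo, hi):
            yield (i,)


class RangeDataSource(DataSource):
    @classmethod
    def name(cls) -> str:
        return "my_range"

    def schema(self) -> str:
        return "id int"

    def reader(self, schema) -> DataSourceReader:
        return RangeReader(self.options)
```

Registering this source with `spark.dataSource.register(RangeDataSource)` and reading `spark.read.format("my_range").load().filter("id = 7")` would, with pushdown enabled, trigger a single pushdown-plus-planning worker exchange instead of two.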

File tree: 7 files changed (+204 / -156 lines)


python/pyspark/sql/worker/data_source_pushdown_filters.py

Lines changed: 20 additions & 6 deletions
@@ -47,6 +47,7 @@
     StringStartsWith,
 )
 from pyspark.sql.types import StructType, VariantVal, _parse_datatype_json_string
+from pyspark.sql.worker.plan_data_source_read import write_read_func_and_partitions
 from pyspark.util import handle_worker_exception, local_connect_and_auth
 from pyspark.worker_util import (
     check_python_version,
@@ -131,11 +132,12 @@ def main(infile: IO, outfile: IO) -> None:
     - a `DataSource` instance representing the data source
     - a `StructType` instance representing the output schema of the data source
     - a list of filters to be pushed down
+    - configuration values

     This process then creates a `DataSourceReader` instance by calling the `reader` method
     on the `DataSource` instance. It applies the filters by calling the `pushFilters` method
-    on the reader and determines which filters are supported. The data source with updated reader
-    is then sent back to the JVM along with the indices of the supported filters.
+    on the reader and determines which filters are supported. The indices of the supported
+    filters are sent back to the JVM, along with the list of partitions and the read function.
     """
     faulthandler_log_path = os.environ.get("PYTHON_FAULTHANDLER_DIR", None)
     try:
@@ -220,10 +222,22 @@ def main(infile: IO, outfile: IO) -> None:
                 },
             )

-        # Monkey patch the data source instance
-        # to return the existing reader with the pushed down filters.
-        data_source.reader = lambda schema: reader  # type: ignore[method-assign]
-        pickleSer._write_with_length(data_source, outfile)
+        # Receive the max arrow batch size.
+        max_arrow_batch_size = read_int(infile)
+        assert max_arrow_batch_size > 0, (
+            "The maximum arrow batch size should be greater than 0, but got "
+            f"'{max_arrow_batch_size}'"
+        )
+
+        # Return the read function and partitions. Doing this in the same worker as filter pushdown
+        # helps reduce the number of Python worker calls.
+        write_read_func_and_partitions(
+            outfile,
+            reader=reader,
+            data_source=data_source,
+            schema=schema,
+            max_arrow_batch_size=max_arrow_batch_size,
+        )

         # Return the supported filter indices.
         write_int(len(supported_filter_indices), outfile)
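To make the combined exchange concrete, the following toy re-enactment writes into a buffer the same sequence of values that the pushdown worker now sends on its output stream: the pickled `(read_func, return_type)` command, the partition count followed by each pickled partition, and finally the supported filter indices. It uses plain `pickle`/`struct` stand-ins for PySpark's `pickleSer` and `write_int`, so it is illustrative rather than the real protocol implementation.

```python
import io
import pickle
import struct


def write_int(out: io.BytesIO, value: int) -> None:
    # Stand-in for pyspark.serializers.write_int (big-endian 4-byte int).
    out.write(struct.pack("!i", value))


def write_with_length(out: io.BytesIO, obj) -> None:
    # Stand-in for pickleSer._write_with_length: length prefix, then pickled bytes.
    payload = pickle.dumps(obj)
    write_int(out, len(payload))
    out.write(payload)


out = io.BytesIO()

# 1. The read function and its return type (written by write_read_func_and_partitions).
write_with_length(out, ("read_func_placeholder", "return_type_placeholder"))

# 2. The planned partitions (a batch source with no explicit partitions sends [None]).
partitions = [None]
write_int(out, len(partitions))
for partition in partitions:
    write_with_length(out, partition)

# 3. The supported filter indices, written by the pushdown worker itself.
supported_filter_indices = [0]
write_int(out, len(supported_filter_indices))
for index in supported_filter_indices:
    write_int(out, index)

print(len(out.getvalue()), "bytes queued for the JVM in a single exchange")
```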

python/pyspark/sql/worker/plan_data_source_read.py

Lines changed: 103 additions & 85 deletions
@@ -168,6 +168,101 @@ def batched(iterator: Iterator, n: int) -> Iterator:
         yield batch


+def write_read_func_and_partitions(
+    outfile: IO,
+    *,
+    reader: Union[DataSourceReader, DataSourceStreamReader],
+    data_source: DataSource,
+    schema: StructType,
+    max_arrow_batch_size: int,
+) -> None:
+    is_streaming = isinstance(reader, DataSourceStreamReader)
+
+    # Create input converter.
+    converter = ArrowTableToRowsConversion._create_converter(BinaryType())
+
+    # Create output converter.
+    return_type = schema
+
+    def data_source_read_func(iterator: Iterable[pa.RecordBatch]) -> Iterable[pa.RecordBatch]:
+        partition_bytes = None
+
+        # Get the partition value from the input iterator.
+        for batch in iterator:
+            # There should be only one row/column in the batch.
+            assert batch.num_columns == 1 and batch.num_rows == 1, (
+                "Expected each batch to have exactly 1 column and 1 row, "
+                f"but found {batch.num_columns} columns and {batch.num_rows} rows."
+            )
+            columns = [column.to_pylist() for column in batch.columns]
+            partition_bytes = converter(columns[0][0])
+
+        assert (
+            partition_bytes is not None
+        ), "The input iterator for Python data source read function is empty."
+
+        # Deserialize the partition value.
+        partition = pickleSer.loads(partition_bytes)
+
+        assert partition is None or isinstance(partition, InputPartition), (
+            "Expected the partition value to be of type 'InputPartition', "
+            f"but found '{type(partition).__name__}'."
+        )
+
+        output_iter = reader.read(partition)  # type: ignore[arg-type]
+
+        # Validate the output iterator.
+        if not isinstance(output_iter, Iterator):
+            raise PySparkRuntimeError(
+                errorClass="DATA_SOURCE_INVALID_RETURN_TYPE",
+                messageParameters={
+                    "type": type(output_iter).__name__,
+                    "name": data_source.name(),
+                    "supported_types": "iterator",
+                },
+            )
+
+        return records_to_arrow_batches(output_iter, max_arrow_batch_size, return_type, data_source)
+
+    command = (data_source_read_func, return_type)
+    pickleSer._write_with_length(command, outfile)
+
+    if not is_streaming:
+        # The partitioning of python batch source read is determined before query execution.
+        try:
+            partitions = reader.partitions()  # type: ignore[call-arg]
+            if not isinstance(partitions, list):
+                raise PySparkRuntimeError(
+                    errorClass="DATA_SOURCE_TYPE_MISMATCH",
+                    messageParameters={
+                        "expected": "'partitions' to return a list",
+                        "actual": f"'{type(partitions).__name__}'",
+                    },
+                )
+            if not all(isinstance(p, InputPartition) for p in partitions):
+                partition_types = ", ".join([f"'{type(p).__name__}'" for p in partitions])
+                raise PySparkRuntimeError(
+                    errorClass="DATA_SOURCE_TYPE_MISMATCH",
+                    messageParameters={
+                        "expected": "elements in 'partitions' to be of type 'InputPartition'",
+                        "actual": partition_types,
+                    },
+                )
+            if len(partitions) == 0:
+                partitions = [None]  # type: ignore[list-item]
+        except NotImplementedError:
+            partitions = [None]  # type: ignore[list-item]
+
+        # Return the serialized partition values.
+        write_int(len(partitions), outfile)
+        for partition in partitions:
+            pickleSer._write_with_length(partition, outfile)
+    else:
+        # Send an empty list of partition for stream reader because partitions are planned
+        # in each microbatch during query execution.
+        write_int(0, outfile)
+
+
 def main(infile: IO, outfile: IO) -> None:
     """
     Main method for planning a data source read.
@@ -284,91 +379,14 @@ def main(infile: IO, outfile: IO) -> None:
                 },
             )

-        # Create input converter.
-        converter = ArrowTableToRowsConversion._create_converter(BinaryType())
-
-        # Create output converter.
-        return_type = schema
-
-        def data_source_read_func(iterator: Iterable[pa.RecordBatch]) -> Iterable[pa.RecordBatch]:
-            partition_bytes = None
-
-            # Get the partition value from the input iterator.
-            for batch in iterator:
-                # There should be only one row/column in the batch.
-                assert batch.num_columns == 1 and batch.num_rows == 1, (
-                    "Expected each batch to have exactly 1 column and 1 row, "
-                    f"but found {batch.num_columns} columns and {batch.num_rows} rows."
-                )
-                columns = [column.to_pylist() for column in batch.columns]
-                partition_bytes = converter(columns[0][0])
-
-            assert (
-                partition_bytes is not None
-            ), "The input iterator for Python data source read function is empty."
-
-            # Deserialize the partition value.
-            partition = pickleSer.loads(partition_bytes)
-
-            assert partition is None or isinstance(partition, InputPartition), (
-                "Expected the partition value to be of type 'InputPartition', "
-                f"but found '{type(partition).__name__}'."
-            )
-
-            output_iter = reader.read(partition)  # type: ignore[arg-type]
-
-            # Validate the output iterator.
-            if not isinstance(output_iter, Iterator):
-                raise PySparkRuntimeError(
-                    errorClass="DATA_SOURCE_INVALID_RETURN_TYPE",
-                    messageParameters={
-                        "type": type(output_iter).__name__,
-                        "name": data_source.name(),
-                        "supported_types": "iterator",
-                    },
-                )
-
-            return records_to_arrow_batches(
-                output_iter, max_arrow_batch_size, return_type, data_source
-            )
-
-        command = (data_source_read_func, return_type)
-        pickleSer._write_with_length(command, outfile)
-
-        if not is_streaming:
-            # The partitioning of python batch source read is determined before query execution.
-            try:
-                partitions = reader.partitions()  # type: ignore[call-arg]
-                if not isinstance(partitions, list):
-                    raise PySparkRuntimeError(
-                        errorClass="DATA_SOURCE_TYPE_MISMATCH",
-                        messageParameters={
-                            "expected": "'partitions' to return a list",
-                            "actual": f"'{type(partitions).__name__}'",
-                        },
-                    )
-                if not all(isinstance(p, InputPartition) for p in partitions):
-                    partition_types = ", ".join([f"'{type(p).__name__}'" for p in partitions])
-                    raise PySparkRuntimeError(
-                        errorClass="DATA_SOURCE_TYPE_MISMATCH",
-                        messageParameters={
-                            "expected": "elements in 'partitions' to be of type 'InputPartition'",
-                            "actual": partition_types,
-                        },
-                    )
-                if len(partitions) == 0:
-                    partitions = [None]  # type: ignore[list-item]
-            except NotImplementedError:
-                partitions = [None]  # type: ignore[list-item]
-
-            # Return the serialized partition values.
-            write_int(len(partitions), outfile)
-            for partition in partitions:
-                pickleSer._write_with_length(partition, outfile)
-        else:
-            # Send an empty list of partition for stream reader because partitions are planned
-            # in each microbatch during query execution.
-            write_int(0, outfile)
+        # Send the read function and partitions to the JVM.
+        write_read_func_and_partitions(
+            outfile,
+            reader=reader,
+            data_source=data_source,
+            schema=schema,
+            max_arrow_batch_size=max_arrow_batch_size,
+        )
     except BaseException as e:
         handle_worker_exception(e, outfile)
         sys.exit(-1)
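As a rough sanity check, the extracted helper can be exercised on its own against an in-memory buffer. This sketch assumes a PySpark build that includes this change and that the helper keeps the keyword-only signature shown above; `DummySource` and `TwoPartitionReader` are made-up stand-ins, not classes from the codebase.

```python
import io

from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
from pyspark.sql.types import IntegerType, StructField, StructType
from pyspark.sql.worker.plan_data_source_read import write_read_func_and_partitions


class DummySource(DataSource):
    @classmethod
    def name(cls) -> str:
        return "dummy"


class TwoPartitionReader(DataSourceReader):
    def partitions(self):
        return [InputPartition(0), InputPartition(1)]

    def read(self, partition):
        yield (partition.value,)


buf = io.BytesIO()
write_read_func_and_partitions(
    buf,
    reader=TwoPartitionReader(),
    data_source=DummySource(options={}),
    schema=StructType([StructField("id", IntegerType())]),
    max_arrow_batch_size=1000,
)
# The buffer now holds the pickled read function followed by the two pickled partitions,
# i.e. what both the pushdown worker and the plan-read worker stream to the JVM.
print(len(buf.getvalue()) > 0)
```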

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonDataSourceV2.scala

Lines changed: 17 additions & 2 deletions
@@ -52,8 +52,23 @@ class PythonDataSourceV2 extends TableProvider {
     dataSourceInPython
   }

-  def setDataSourceInPython(dataSourceInPython: PythonDataSourceCreationResult): Unit = {
-    this.dataSourceInPython = dataSourceInPython
+  private var readInfo: PythonDataSourceReadInfo = _
+
+  def getOrCreateReadInfo(
+      shortName: String,
+      options: CaseInsensitiveStringMap,
+      outputSchema: StructType,
+      isStreaming: Boolean
+  ): PythonDataSourceReadInfo = {
+    if (readInfo == null) {
+      val creationResult = getOrCreateDataSourceInPython(shortName, options, Some(outputSchema))
+      readInfo = source.createReadInfoInPython(creationResult, outputSchema, isStreaming)
+    }
+    readInfo
+  }
+
+  def setReadInfo(readInfo: PythonDataSourceReadInfo): Unit = {
+    this.readInfo = readInfo
   }

   override def inferSchema(options: CaseInsensitiveStringMap): StructType = {

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonMicroBatchStream.scala

Lines changed: 1 addition & 4 deletions
@@ -90,10 +90,7 @@ class PythonMicroBatchStream(
   }

   private lazy val readInfo: PythonDataSourceReadInfo = {
-    ds.source.createReadInfoInPython(
-      ds.getOrCreateDataSourceInPython(shortName, options, Some(outputSchema)),
-      outputSchema,
-      isStreaming = true)
+    ds.getOrCreateReadInfo(shortName, options, outputSchema, isStreaming = true)
   }

   override def createReaderFactory(): PartitionReaderFactory = {

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonScan.scala

Lines changed: 1 addition & 4 deletions
@@ -63,10 +63,7 @@ class PythonBatch(
   private val jobArtifactUUID = JobArtifactSet.getCurrentJobArtifactState.map(_.uuid)

   private lazy val infoInPython: PythonDataSourceReadInfo = {
-    ds.source.createReadInfoInPython(
-      ds.getOrCreateDataSourceInPython(shortName, options, Some(outputSchema)),
-      outputSchema,
-      isStreaming = false)
+    ds.getOrCreateReadInfo(shortName, options, outputSchema, isStreaming = false)
   }

   override def planInputPartitions(): Array[InputPartition] =

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonScanBuilder.scala

Lines changed: 13 additions & 12 deletions
@@ -42,18 +42,19 @@ class PythonScanBuilder(
     }

     val dataSource = ds.getOrCreateDataSourceInPython(shortName, options, Some(outputSchema))
-    val result = ds.source.pushdownFiltersInPython(dataSource, outputSchema, filters)
-
-    // The Data Source instance state changes after pushdown to remember the reader instance
-    // created and the filters pushed down. So pushdownFiltersInPython returns a new pickled
-    // Data Source instance. We need to use that new instance for further operations.
-    ds.setDataSourceInPython(dataSource.copy(dataSource = result.dataSource))
-
-    // Partition the filters into supported and unsupported ones.
-    val isPushed = result.isFilterPushed.zip(filters)
-    supportedFilters = isPushed.collect { case (true, filter) => filter }.toArray
-    val unsupported = isPushed.collect { case (false, filter) => filter }.toArray
-    unsupported
+    ds.source.pushdownFiltersInPython(dataSource, outputSchema, filters) match {
+      case None => filters // No filters are supported.
+      case Some(result) =>
+        // Filter pushdown also returns partitions and the read function.
+        // This helps reduce the number of Python worker calls.
+        ds.setReadInfo(result.readInfo)
+
+        // Partition the filters into supported and unsupported ones.
+        val isPushed = result.isFilterPushed.zip(filters)
+        supportedFilters = isPushed.collect { case (true, filter) => filter }.toArray
+        val unsupported = isPushed.collect { case (false, filter) => filter }.toArray
+        unsupported
+    }
   }

   override def pushedFilters(): Array[Filter] = supportedFilters
