@@ -22,11 +22,13 @@ import scala.collection.mutable.ArrayBuffer
22
22
23
23
import org .apache .spark .SparkException
24
24
import org .apache .spark .api .python .PythonEvalType
25
+ import org .apache .spark .internal .Logging
25
26
import org .apache .spark .sql .catalyst .expressions ._
26
27
import org .apache .spark .sql .catalyst .expressions .aggregate .AggregateExpression
27
28
import org .apache .spark .sql .catalyst .plans .logical ._
28
29
import org .apache .spark .sql .catalyst .rules .Rule
29
30
import org .apache .spark .sql .catalyst .trees .TreePattern ._
31
+ import org .apache .spark .sql .types .UserDefinedType
30
32
31
33
32
34
/**
@@ -157,7 +159,7 @@ object ExtractGroupingPythonUDFFromAggregate extends Rule[LogicalPlan] {
157
159
* This has the limitation that the input to the Python UDF is not allowed to include attributes from
158
160
* multiple child operators.
159
161
*/
160
- object ExtractPythonUDFs extends Rule [LogicalPlan ] {
162
+ object ExtractPythonUDFs extends Rule [LogicalPlan ] with Logging {
161
163
162
164
private type EvalType = Int
163
165
private type EvalTypeChecker = EvalType => Boolean
@@ -271,9 +273,21 @@ object ExtractPythonUDFs extends Rule[LogicalPlan] {
271
273
val evaluation = evalType match {
272
274
case PythonEvalType .SQL_BATCHED_UDF =>
273
275
BatchEvalPython (validUdfs, resultAttrs, child)
274
- case PythonEvalType .SQL_SCALAR_PANDAS_UDF | PythonEvalType .SQL_SCALAR_PANDAS_ITER_UDF
275
- | PythonEvalType .SQL_ARROW_BATCHED_UDF =>
276
+ case PythonEvalType .SQL_SCALAR_PANDAS_UDF | PythonEvalType .SQL_SCALAR_PANDAS_ITER_UDF =>
276
277
ArrowEvalPython (validUdfs, resultAttrs, child, evalType)
278
+ case PythonEvalType .SQL_ARROW_BATCHED_UDF =>
279
+ // Check if any input columns are UDTs for SQL_ARROW_BATCHED_UDF
280
+ val hasUDTInput = child.output.exists(
281
+ attr => attr.dataType.isInstanceOf [UserDefinedType [_]])
282
+
283
+ if (hasUDTInput) {
284
+ // Use BatchEvalPython if UDT is detected
285
+ logWarning(" Arrow optimization disabled due to UDT input. " +
286
+ " Falling back to non-Arrow-optimized UDF execution." )
287
+ BatchEvalPython (validUdfs, resultAttrs, child)
288
+ } else {
289
+ ArrowEvalPython (validUdfs, resultAttrs, child, evalType)
290
+ }
277
291
case _ =>
278
292
throw SparkException .internalError(" Unexpected UDF evalType" )
279
293
}
0 commit comments