Commit e25b0eb

conf true
1 parent ac2f3a4 commit e25b0eb

7 files changed, +35 -7 lines changed

python/docs/source/user_guide/sql/arrow_pandas.rst

Lines changed: 2 additions & 2 deletions
@@ -356,8 +356,8 @@ Arrow Python UDFs are user defined functions that are executed row-by-row, utili
 transfer and serialization. To define an Arrow Python UDF, you can use the :meth:`udf` decorator or wrap the function
 with the :meth:`udf` method, ensuring the ``useArrow`` parameter is set to True. Additionally, you can enable Arrow
 optimization for Python UDFs throughout the entire SparkSession by setting the Spark configuration
-``spark.sql.execution.pythonUDF.arrow.enabled`` to true. It's important to note that the Spark configuration takes
-effect only when ``useArrow`` is either not set or set to None.
+``spark.sql.execution.pythonUDF.arrow.enabled`` to true, which is the default. It's important to note that the Spark
+configuration takes effect only when ``useArrow`` is either not set or set to None.
 
 The type hints for Arrow Python UDFs should be specified in the same way as for default, pickled Python UDFs.
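
For context, the documented workflow looks roughly like this; a minimal sketch assuming a running SparkSession (session and column names are illustrative):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

spark = SparkSession.builder.getOrCreate()

# Per-UDF opt-in: request Arrow explicitly, regardless of the session config.
@udf(returnType="int", useArrow=True)
def add_one(x: int) -> int:
    return x + 1

# Session-wide: the config now defaults to true, so setting it explicitly
# is only needed to opt out (or to opt back in afterwards).
spark.conf.set("spark.sql.execution.pythonUDF.arrow.enabled", "true")

spark.range(3).select(add_one("id")).show()
```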

python/docs/source/user_guide/sql/type_conversions.rst

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ are listed below:
     - Default
   * - spark.sql.execution.pythonUDF.arrow.enabled
     - Enable PyArrow in PySpark. See more `here <arrow_pandas.rst>`_.
-    - False
+    - True
   * - spark.sql.pyspark.inferNestedDictAsStruct.enabled
     - When enabled, nested dictionaries are inferred as StructType. Otherwise, they are inferred as MapType.
     - False
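
A quick runtime check of the flipped default (a sketch; assumes an active session named `spark`):

```python
# The default now reads back as 'true'; override per session if needed.
print(spark.conf.get("spark.sql.execution.pythonUDF.arrow.enabled"))
spark.conf.set("spark.sql.execution.pythonUDF.arrow.enabled", "false")
```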

python/pyspark/sql/connect/udf.py

Lines changed: 10 additions & 0 deletions
@@ -41,6 +41,7 @@
     UDFRegistration as PySparkUDFRegistration,
     UserDefinedFunction as PySparkUserDefinedFunction,
 )
+from pyspark.sql.utils import has_arrow
 from pyspark.errors import PySparkTypeError, PySparkRuntimeError
 
 if TYPE_CHECKING:
@@ -58,6 +59,7 @@ def _create_py_udf(
     returnType: "DataTypeOrString",
     useArrow: Optional[bool] = None,
 ) -> "UserDefinedFunctionLike":
+    is_arrow_enabled = False
     if useArrow is None:
         is_arrow_enabled = False
         try:
@@ -78,6 +80,14 @@ def _create_py_udf(
 
     eval_type: int = PythonEvalType.SQL_BATCHED_UDF
 
+    if is_arrow_enabled and not has_arrow:
+        is_arrow_enabled = False
+        warnings.warn(
+            "Arrow optimization failed to enable because PyArrow is not installed. "
+            "Falling back to a non-Arrow-optimized UDF.",
+            RuntimeWarning,
+        )
+
     if is_arrow_enabled:
         try:
             is_func_with_args = len(getfullargspec(f).args) > 0
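
The guard above (duplicated for the classic path in `python/pyspark/sql/udf.py` below) turns a missing PyArrow into a warning plus fallback rather than a hard failure. A sketch of observing that behavior on an environment without PyArrow installed:

```python
import warnings
from pyspark.sql.functions import udf

# With PyArrow absent, useArrow=True no longer raises: the UDF silently
# degrades to the pickled path and emits a RuntimeWarning instead.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    to_upper = udf(lambda s: s.upper(), "string", useArrow=True)

for w in caught:
    if issubclass(w.category, RuntimeWarning):
        print(w.message)
```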

python/pyspark/sql/functions/builtin.py

Lines changed: 2 additions & 1 deletion
@@ -26291,7 +26291,8 @@ def udf(
         Defaults to :class:`StringType`.
     useArrow : bool, optional
         whether to use Arrow to optimize the (de)serialization. When it is None, the
-        Spark config "spark.sql.execution.pythonUDF.arrow.enabled" takes effect.
+        Spark config "spark.sql.execution.pythonUDF.arrow.enabled" takes effect,
+        which is "true" by default.
 
     Examples
     --------
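
Since `useArrow=None` now defers to a config that defaults to true, it can be useful to confirm which path a UDF actually took. A sketch using the UDF's `evalType` (the attribute is copied onto the returned wrapper in current PySpark, and `PythonEvalType` has lived in `pyspark.rdd` in older releases, so treat both as assumptions about your version):

```python
from pyspark.sql.functions import udf
from pyspark.util import PythonEvalType  # pyspark.rdd in older releases

plus_one = udf(lambda x: x + 1, "int")  # useArrow=None: the config decides

# SQL_ARROW_BATCHED_UDF when Arrow optimization applied, SQL_BATCHED_UDF otherwise.
print(plus_one.evalType == PythonEvalType.SQL_ARROW_BATCHED_UDF)
```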

python/pyspark/sql/udf.py

Lines changed: 10 additions & 2 deletions
@@ -34,7 +34,7 @@
     StructType,
     _parse_datatype_string,
 )
-from pyspark.sql.utils import get_active_spark_context
+from pyspark.sql.utils import get_active_spark_context, has_arrow
 from pyspark.sql.pandas.types import to_arrow_type
 from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version
 from pyspark.errors import PySparkTypeError, PySparkNotImplementedError, PySparkRuntimeError
@@ -118,7 +118,7 @@ def _create_py_udf(
     # Note: The values of 'SQL Type' are DDL formatted strings, which can be used as `returnType`s.
     # Note: The values inside the table are generated by `repr`. X' means it throws an exception
     # during the conversion.
-
+    is_arrow_enabled = False
     if useArrow is None:
         from pyspark.sql import SparkSession
 
@@ -131,6 +131,14 @@ def _create_py_udf(
     else:
         is_arrow_enabled = useArrow
 
+    if is_arrow_enabled and not has_arrow:
+        is_arrow_enabled = False
+        warnings.warn(
+            "Arrow optimization failed to enable because PyArrow is not installed. "
+            "Falling back to a non-Arrow-optimized UDF.",
+            RuntimeWarning,
+        )
+
     eval_type: int = PythonEvalType.SQL_BATCHED_UDF
 
     if is_arrow_enabled:

python/pyspark/sql/utils.py

Lines changed: 9 additions & 0 deletions
@@ -63,6 +63,15 @@
     from pyspark.pandas._typing import IndexOpsLike, SeriesOrIndex
 
 
+has_arrow: bool = False
+try:
+    import pyarrow  # noqa: F401
+
+    has_arrow = True
+except ImportError:
+    pass
+
+
 FuncT = TypeVar("FuncT", bound=Callable[..., Any])
 
 
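
`has_arrow` follows the usual probe-once pattern for optional dependencies: import at module load, record a boolean, branch cheaply at call sites. A generalized sketch of the same idea (helper name hypothetical):

```python
def _probe(module_name: str) -> bool:
    # Returns True if the module imports cleanly; never raises.
    try:
        __import__(module_name)
        return True
    except ImportError:
        return False

has_arrow: bool = _probe("pyarrow")
```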

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 1 addition & 1 deletion
@@ -3497,7 +3497,7 @@ object SQLConf {
         "can only be enabled when the given function takes at least one argument.")
       .version("3.4.0")
       .booleanConf
-      .createWithDefault(false)
+      .createWithDefault(true)
 
   val PYTHON_UDF_ARROW_CONCURRENCY_LEVEL =
     buildConf("spark.sql.execution.pythonUDF.arrow.concurrency.level")
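
With the server-side default flipped to true, restoring the old behavior requires an explicit opt-out. A sketch of doing so at session build time (equivalent to passing `--conf spark.sql.execution.pythonUDF.arrow.enabled=false` to spark-submit):

```python
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config("spark.sql.execution.pythonUDF.arrow.enabled", "false")
    .getOrCreate()
)
```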
