Commit e25b0eb

conf true
1 parent ac2f3a4 commit e25b0eb

7 files changed, +35 -7 lines changed

python/docs/source/user_guide/sql/arrow_pandas.rst

Lines changed: 2 additions & 2 deletions
@@ -356,8 +356,8 @@ Arrow Python UDFs are user defined functions that are executed row-by-row, utili
 transfer and serialization. To define an Arrow Python UDF, you can use the :meth:`udf` decorator or wrap the function
 with the :meth:`udf` method, ensuring the ``useArrow`` parameter is set to True. Additionally, you can enable Arrow
 optimization for Python UDFs throughout the entire SparkSession by setting the Spark configuration
-``spark.sql.execution.pythonUDF.arrow.enabled`` to true. It's important to note that the Spark configuration takes
-effect only when ``useArrow`` is either not set or set to None.
+``spark.sql.execution.pythonUDF.arrow.enabled`` to true, which is the default. It's important to note that the Spark
+configuration takes effect only when ``useArrow`` is either not set or set to None.
 
 The type hints for Arrow Python UDFs should be specified in the same way as for default, pickled Python UDFs.
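
For context, the documented workflow looks roughly like this; a minimal sketch assuming a running SparkSession (session and column names are illustrative):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

spark = SparkSession.builder.getOrCreate()

# Per-UDF opt-in: request Arrow explicitly, regardless of the session config.
@udf(returnType="int", useArrow=True)
def add_one(x: int) -> int:
    return x + 1

# Session-wide: the config now defaults to true, so setting it explicitly
# is only needed to opt out (or to opt back in afterwards).
spark.conf.set("spark.sql.execution.pythonUDF.arrow.enabled", "true")

spark.range(3).select(add_one("id")).show()
```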

python/docs/source/user_guide/sql/type_conversions.rst

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ are listed below:
     - Default
   * - spark.sql.execution.pythonUDF.arrow.enabled
     - Enable PyArrow in PySpark. See more `here <arrow_pandas.rst>`_.
-    - False
+    - True
   * - spark.sql.pyspark.inferNestedDictAsStruct.enabled
     - When enabled, nested dictionaries are inferred as StructType. Otherwise, they are inferred as MapType.
     - False
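
A quick runtime check of the flipped default (a sketch; assumes an active session named `spark`):

```python
# The default now reads back as 'true'; override per session if needed.
print(spark.conf.get("spark.sql.execution.pythonUDF.arrow.enabled"))
spark.conf.set("spark.sql.execution.pythonUDF.arrow.enabled", "false")
```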

python/pyspark/sql/connect/udf.py

Lines changed: 10 additions & 0 deletions
@@ -41,6 +41,7 @@
     UDFRegistration as PySparkUDFRegistration,
     UserDefinedFunction as PySparkUserDefinedFunction,
 )
+from pyspark.sql.utils import has_arrow
 from pyspark.errors import PySparkTypeError, PySparkRuntimeError
 
 if TYPE_CHECKING:
@@ -58,6 +59,7 @@ def _create_py_udf(
     returnType: "DataTypeOrString",
     useArrow: Optional[bool] = None,
 ) -> "UserDefinedFunctionLike":
+    is_arrow_enabled = False
     if useArrow is None:
         is_arrow_enabled = False
         try:
@@ -78,6 +80,14 @@ def _create_py_udf(
 
     eval_type: int = PythonEvalType.SQL_BATCHED_UDF
 
+    if is_arrow_enabled and not has_arrow:
+        is_arrow_enabled = False
+        warnings.warn(
+            "Arrow optimization failed to enable because PyArrow is not installed. "
+            "Falling back to a non-Arrow-optimized UDF.",
+            RuntimeWarning,
+        )
+
     if is_arrow_enabled:
         try:
             is_func_with_args = len(getfullargspec(f).args) > 0
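
The guard above (duplicated for the classic path in `python/pyspark/sql/udf.py` below) turns a missing PyArrow into a warning plus fallback rather than a hard failure. A sketch of observing that behavior on an environment without PyArrow installed:

```python
import warnings
from pyspark.sql.functions import udf

# With PyArrow absent, useArrow=True no longer raises: the UDF silently
# degrades to the pickled path and emits a RuntimeWarning instead.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    to_upper = udf(lambda s: s.upper(), "string", useArrow=True)

for w in caught:
    if issubclass(w.category, RuntimeWarning):
        print(w.message)
```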

python/pyspark/sql/functions/builtin.py

Lines changed: 2 additions & 1 deletion
@@ -26291,7 +26291,8 @@ def udf(
         Defaults to :class:`StringType`.
     useArrow : bool, optional
         whether to use Arrow to optimize the (de)serialization. When it is None, the
-        Spark config "spark.sql.execution.pythonUDF.arrow.enabled" takes effect.
+        Spark config "spark.sql.execution.pythonUDF.arrow.enabled" takes effect,
+        which is "true" by default.
 
     Examples
     --------
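
Since `useArrow=None` now defers to a config that defaults to true, it can be useful to confirm which path a UDF actually took. A sketch using the UDF's `evalType` (the attribute is copied onto the returned wrapper in current PySpark, and `PythonEvalType` has lived in `pyspark.rdd` in older releases, so treat both as assumptions about your version):

```python
from pyspark.sql.functions import udf
from pyspark.util import PythonEvalType  # pyspark.rdd in older releases

plus_one = udf(lambda x: x + 1, "int")  # useArrow=None: the config decides

# SQL_ARROW_BATCHED_UDF when Arrow optimization applied, SQL_BATCHED_UDF otherwise.
print(plus_one.evalType == PythonEvalType.SQL_ARROW_BATCHED_UDF)
```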

python/pyspark/sql/udf.py

Lines changed: 10 additions & 2 deletions
@@ -34,7 +34,7 @@
     StructType,
     _parse_datatype_string,
 )
-from pyspark.sql.utils import get_active_spark_context
+from pyspark.sql.utils import get_active_spark_context, has_arrow
 from pyspark.sql.pandas.types import to_arrow_type
 from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version
 from pyspark.errors import PySparkTypeError, PySparkNotImplementedError, PySparkRuntimeError
@@ -118,7 +118,7 @@ def _create_py_udf(
     # Note: The values of 'SQL Type' are DDL formatted strings, which can be used as `returnType`s.
     # Note: The values inside the table are generated by `repr`. X' means it throws an exception
     # during the conversion.
-
+    is_arrow_enabled = False
     if useArrow is None:
         from pyspark.sql import SparkSession
 
@@ -131,6 +131,14 @@ def _create_py_udf(
     else:
         is_arrow_enabled = useArrow
 
+    if is_arrow_enabled and not has_arrow:
+        is_arrow_enabled = False
+        warnings.warn(
+            "Arrow optimization failed to enable because PyArrow is not installed. "
+            "Falling back to a non-Arrow-optimized UDF.",
+            RuntimeWarning,
+        )
+
     eval_type: int = PythonEvalType.SQL_BATCHED_UDF
 
     if is_arrow_enabled:

python/pyspark/sql/utils.py

Lines changed: 9 additions & 0 deletions
@@ -63,6 +63,15 @@
     from pyspark.pandas._typing import IndexOpsLike, SeriesOrIndex
 
 
+has_arrow: bool = False
+try:
+    import pyarrow  # noqa: F401
+
+    has_arrow = True
+except ImportError:
+    pass
+
+
 FuncT = TypeVar("FuncT", bound=Callable[..., Any])
 
 
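
`has_arrow` follows the usual probe-once pattern for optional dependencies: import at module load, record a boolean, branch cheaply at call sites. A generalized sketch of the same idea (helper name hypothetical):

```python
def _probe(module_name: str) -> bool:
    # Returns True if the module imports cleanly; never raises.
    try:
        __import__(module_name)
        return True
    except ImportError:
        return False

has_arrow: bool = _probe("pyarrow")
```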

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 1 addition & 1 deletion
@@ -3497,7 +3497,7 @@ object SQLConf {
         "can only be enabled when the given function takes at least one argument.")
       .version("3.4.0")
       .booleanConf
-      .createWithDefault(false)
+      .createWithDefault(true)
 
   val PYTHON_UDF_ARROW_CONCURRENCY_LEVEL =
     buildConf("spark.sql.execution.pythonUDF.arrow.concurrency.level")
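
With the server-side default flipped to true, restoring the old behavior requires an explicit opt-out. A sketch of doing so at session build time (equivalent to passing `--conf spark.sql.execution.pythonUDF.arrow.enabled=false` to spark-submit):

```python
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config("spark.sql.execution.pythonUDF.arrow.enabled", "false")
    .getOrCreate()
)
```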
