Commit 51b5f30
[SPARK-53390][PS] Raise error when bools with None astype to ints under ANSI
### What changes were proposed in this pull request?

- Raise an error when bools with None are `astype`d to ints under ANSI, to match native pandas behavior:

```py
>>> pd.Series([True, None]).astype(int)
Traceback (most recent call last):
...
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
```

- Improve tests

### Why are the changes needed?

Part of https://issues.apache.org/jira/browse/SPARK-53389

### Does this PR introduce _any_ user-facing change?

No, the feature isn't released.

```py
>>> psser = ps.Series([True, None])
```

From

```py
>>> psser.astype(int)
0    1.0
1    NaN
dtype: float64
```

To

```py
>>> psser.astype(int)
...
ValueError: Cannot convert bools with missing values to integer
```

### How was this patch tested?

The commands below pass:

```
SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_as_type AsTypeTests.test_astype"
SPARK_ANSI_SQL_MODE=false ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_as_type AsTypeTests.test_astype"
SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.series.test_as_type SeriesAsTypeTests.test_astype"
SPARK_ANSI_SQL_MODE=false ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.series.test_as_type SeriesAsTypeTests.test_astype"
```

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #52133 from xinrong-meng/test_astype.

Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Xinrong Meng <[email protected]>
1 parent 86dad83 commit 51b5f30
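
A note on scope: as the `boolean_ops.py` hunk below shows, the new error is gated on both ANSI mode and the `compute.eager_check` option. A rough sketch of the expected user-facing interaction, assuming a session with ANSI mode enabled and default options; the `option_context` escape hatch is inferred from the `get_option("compute.eager_check")` guard in the patch, not demonstrated in the PR:

```py
>>> import pyspark.pandas as ps
>>> psser = ps.Series([True, None])

>>> # ANSI mode on, compute.eager_check at its default (enabled):
>>> # the cast is rejected eagerly at the API call.
>>> psser.astype(int)
Traceback (most recent call last):
...
ValueError: Cannot convert bools with missing values to integer

>>> # Disabling compute.eager_check skips this client-side validation and
>>> # hands the cast to Spark as before.
>>> with ps.option_context("compute.eager_check", False):
...     _ = psser.astype(int)
```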

File tree: 3 files changed (+26, -16 lines)


python/pyspark/pandas/data_type_ops/boolean_ops.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -19,10 +19,11 @@
 from typing import Any, Union
 
 import pandas as pd
-from pandas.api.types import CategoricalDtype
+from pandas.api.types import CategoricalDtype, is_integer_dtype  # type: ignore[attr-defined]
 from pandas.core.dtypes.common import is_numeric_dtype
 
 from pyspark.pandas.base import column_op, IndexOpsMixin
+from pyspark.pandas.config import get_option
 from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
 from pyspark.pandas.data_type_ops.base import (
     DataTypeOps,
@@ -321,6 +322,13 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
                 ),
             )
         else:
+            is_ansi = is_ansi_mode_enabled(index_ops._internal.spark_frame.sparkSession)
+            if is_ansi and get_option("compute.eager_check"):
+                if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
+                    if index_ops.hasnans:
+                        raise ValueError(
+                            "Cannot convert %s with missing values to integer" % self.pretty_name
+                        )
             return _as_other_type(index_ops, dtype, spark_type)
 
     def neg(self, operand: IndexOpsLike) -> IndexOpsLike:
```
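
For reference, a quick pandas-only illustration (not part of the patch) of what the two dtype checks in the added block distinguish: `is_integer_dtype` covers the plain integer targets the tests loop over, while nullable extension integers also satisfy it and are therefore excluded via the `extension_dtypes` isinstance check, since they can represent missing values:

```py
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype

# Plain integer targets -- the ones the new ValueError applies to.
print([is_integer_dtype(t) for t in (int, np.int32, np.int16, np.int8)])  # [True, True, True, True]

# A nullable extension integer is an integer dtype too, but it can hold
# missing values, so the patch carves it out with the extension_dtypes check.
print(is_integer_dtype(pd.Int64Dtype()))                                   # True
print(pd.Series([True, None], dtype="boolean").astype("Int64").tolist())   # [1, <NA>]
```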

python/pyspark/pandas/tests/data_type_ops/test_as_type.py

Lines changed: 7 additions & 14 deletions
```diff
@@ -32,19 +32,17 @@
 
 
 class AsTypeTestsMixin:
-    """Unit tests for arithmetic operations of numeric data types.
-
-    A few test cases are disabled because pandas-on-Spark returns float64 whereas pandas
-    returns float32.
-    The underlying reason is the respective Spark operations return DoubleType always.
-    """
+    """Unit tests for arithmetic operations of numeric data types."""
 
     def test_astype(self):
         pdf, psdf = self.pdf, self.psdf
+        int_types = [int, np.int32, np.int16, np.int8]
+        cat_type = CategoricalDtype(categories=[2, 1, 3])
+        other_types = [float, np.float32, bool, str, "category", cat_type]
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
 
-            for int_type in [int, np.int32, np.int16, np.int8]:
+            for int_type in int_types:
                 if not pser.hasnans:
                     self.assert_eq(pser.astype(int_type), psser.astype(int_type))
                 else:
@@ -54,14 +52,9 @@ def test_astype(self):
                         "values to integer" % psser._dtype_op.pretty_name,
                         lambda: psser.astype(int_type),
                     )
+            for other_type in other_types:
+                self.assert_eq(pser.astype(other_type), psser.astype(other_type))
 
-            self.assert_eq(pser.astype(bool), psser.astype(bool))
-            self.assert_eq(pser.astype(float), psser.astype(float))
-            self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
-            self.assert_eq(pser.astype(str), psser.astype(str))
-            self.assert_eq(pser.astype("category"), psser.astype("category"))
-            cat_type = CategoricalDtype(categories=[2, 1, 3])
-            self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
         if extension_object_dtypes_available and extension_float_dtypes_available:
             pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
             psser = ps.from_pandas(pser)
```

python/pyspark/pandas/tests/series/test_as_type.py

Lines changed: 10 additions & 1 deletion
```diff
@@ -22,6 +22,7 @@
 from pyspark import pandas as ps
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.testing.utils import is_ansi_mode_test
 from pyspark.pandas.typedef.typehints import (
     extension_dtypes_available,
     extension_float_dtypes_available,
@@ -31,6 +32,7 @@
 
 class SeriesAsTypeMixin:
     def test_astype(self):
+        # numeric
         psers = [pd.Series([10, 20, 15, 30, 45], name="x")]
 
         if extension_dtypes_available:
@@ -41,12 +43,14 @@ def test_astype(self):
         for pser in psers:
             self._test_numeric_astype(pser)
 
+        # numeric with nulls
         pser = pd.Series([10, 20, 15, 30, 45, None, np.nan], name="x")
         psser = ps.Series(pser)
 
         self.assert_eq(psser.astype(bool), pser.astype(bool))
         self.assert_eq(psser.astype(str), pser.astype(str))
 
+        # strings
         pser = pd.Series(["hi", "hi ", " ", " \t", "", None], name="x")
         psser = ps.Series(pser)
 
@@ -60,12 +64,16 @@ def test_astype(self):
             self._check_extension(psser.astype("string"), pser.astype("string"))
             self._check_extension(psser.astype(StringDtype()), pser.astype(StringDtype()))
 
+        # bools
         pser = pd.Series([True, False, None], name="x")
         psser = ps.Series(pser)
-
         self.assert_eq(psser.astype(bool), pser.astype(bool))
         self.assert_eq(psser.astype(str), pser.astype(str))
 
+        if is_ansi_mode_test:
+            with self.assertRaisesRegex(ValueError, "with missing values to integer"):
+                self.assert_eq(psser.astype(int))
+
         if extension_object_dtypes_available:
             from pandas import BooleanDtype, StringDtype
 
@@ -74,6 +82,7 @@ def test_astype(self):
             self._check_extension(psser.astype("string"), pser.astype("string"))
             self._check_extension(psser.astype(StringDtype()), pser.astype(StringDtype()))
 
+        # datetimes
         pser = pd.Series(["2020-10-27 00:00:01", None], name="x")
         psser = ps.Series(pser)
 
```
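
The series-level test gates the new assertion on `is_ansi_mode_test` from `pyspark.testing.utils`. A self-contained sketch of the same gating pattern, under the assumption that the flag tracks the `SPARK_ANSI_SQL_MODE` environment variable toggled by the test commands in the description and that `compute.eager_check` is left at its default; the module constant and class name below are illustrative, not from the patch:

```py
import os
import unittest

import pyspark.pandas as ps

# Assumed stand-in for pyspark.testing.utils.is_ansi_mode_test; presumably it
# reflects the SPARK_ANSI_SQL_MODE environment variable used by ./python/run-tests.
# The default chosen here is only for illustration.
is_ansi_mode_test = os.environ.get("SPARK_ANSI_SQL_MODE", "true").lower() == "true"


class BoolAsTypeSketch(unittest.TestCase):
    def test_bool_with_none_to_int(self):
        psser = ps.Series([True, False, None])
        if is_ansi_mode_test:
            # Under ANSI the cast now fails eagerly with the new message.
            with self.assertRaisesRegex(ValueError, "with missing values to integer"):
                psser.astype(int)
        else:
            # Legacy (non-ANSI) behavior from the PR description: nulls force
            # a float result instead of raising.
            self.assertEqual(str(psser.astype(int).dtype), "float64")


if __name__ == "__main__":
    unittest.main()
```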
