Commit 51b5f30
[SPARK-53390][PS] Raise error when bools with None astype to ints under ANSI
### What changes were proposed in this pull request?

- Raise an error when bools with None are `astype`d to ints under ANSI, to match native pandas behavior:

```py
>>> pd.Series([True, None]).astype(int)
Traceback (most recent call last):
...
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
```

- Improve tests

### Why are the changes needed?

Part of https://issues.apache.org/jira/browse/SPARK-53389

### Does this PR introduce _any_ user-facing change?

No, the feature isn't released.

```py
>>> psser = ps.Series([True, None])
```

From

```py
>>> psser.astype(int)
0    1.0
1    NaN
dtype: float64
```

To

```py
>>> psser.astype(int)
...
ValueError: Cannot convert bools with missing values to integer
```

### How was this patch tested?

The commands below pass:

```
SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_as_type AsTypeTests.test_astype"
SPARK_ANSI_SQL_MODE=false ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_as_type AsTypeTests.test_astype"
SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.series.test_as_type SeriesAsTypeTests.test_astype"
SPARK_ANSI_SQL_MODE=false ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.series.test_as_type SeriesAsTypeTests.test_astype"
```

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #52133 from xinrong-meng/test_astype.

Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Xinrong Meng <[email protected]>
1 parent 86dad83 commit 51b5f30
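
A note on scope: as the `boolean_ops.py` hunk below shows, the new error is gated on both ANSI mode and the `compute.eager_check` option. A rough sketch of the expected user-facing interaction, assuming a session with ANSI mode enabled and default options; the `option_context` escape hatch is inferred from the `get_option("compute.eager_check")` guard in the patch, not demonstrated in the PR:

```py
>>> import pyspark.pandas as ps
>>> psser = ps.Series([True, None])

>>> # ANSI mode on, compute.eager_check at its default (enabled):
>>> # the cast is rejected eagerly at the API call.
>>> psser.astype(int)
Traceback (most recent call last):
...
ValueError: Cannot convert bools with missing values to integer

>>> # Disabling compute.eager_check skips this client-side validation and
>>> # hands the cast to Spark as before.
>>> with ps.option_context("compute.eager_check", False):
...     _ = psser.astype(int)
```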

File tree: 3 files changed (+26, -16 lines)


python/pyspark/pandas/data_type_ops/boolean_ops.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -19,10 +19,11 @@
 from typing import Any, Union
 
 import pandas as pd
-from pandas.api.types import CategoricalDtype
+from pandas.api.types import CategoricalDtype, is_integer_dtype  # type: ignore[attr-defined]
 from pandas.core.dtypes.common import is_numeric_dtype
 
 from pyspark.pandas.base import column_op, IndexOpsMixin
+from pyspark.pandas.config import get_option
 from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
 from pyspark.pandas.data_type_ops.base import (
     DataTypeOps,
@@ -321,6 +322,13 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
                 ),
             )
         else:
+            is_ansi = is_ansi_mode_enabled(index_ops._internal.spark_frame.sparkSession)
+            if is_ansi and get_option("compute.eager_check"):
+                if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
+                    if index_ops.hasnans:
+                        raise ValueError(
+                            "Cannot convert %s with missing values to integer" % self.pretty_name
+                        )
             return _as_other_type(index_ops, dtype, spark_type)
 
     def neg(self, operand: IndexOpsLike) -> IndexOpsLike:
```
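
For reference, a quick pandas-only illustration (not part of the patch) of what the two dtype checks in the added block distinguish: `is_integer_dtype` covers the plain integer targets the tests loop over, while nullable extension integers also satisfy it and are therefore excluded via the `extension_dtypes` isinstance check, since they can represent missing values:

```py
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype

# Plain integer targets -- the ones the new ValueError applies to.
print([is_integer_dtype(t) for t in (int, np.int32, np.int16, np.int8)])  # [True, True, True, True]

# A nullable extension integer is an integer dtype too, but it can hold
# missing values, so the patch carves it out with the extension_dtypes check.
print(is_integer_dtype(pd.Int64Dtype()))                                   # True
print(pd.Series([True, None], dtype="boolean").astype("Int64").tolist())   # [1, <NA>]
```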

python/pyspark/pandas/tests/data_type_ops/test_as_type.py

Lines changed: 7 additions & 14 deletions
```diff
@@ -32,19 +32,17 @@
 
 
 class AsTypeTestsMixin:
-    """Unit tests for arithmetic operations of numeric data types.
-
-    A few test cases are disabled because pandas-on-Spark returns float64 whereas pandas
-    returns float32.
-    The underlying reason is the respective Spark operations return DoubleType always.
-    """
+    """Unit tests for arithmetic operations of numeric data types."""
 
     def test_astype(self):
         pdf, psdf = self.pdf, self.psdf
+        int_types = [int, np.int32, np.int16, np.int8]
+        cat_type = CategoricalDtype(categories=[2, 1, 3])
+        other_types = [float, np.float32, bool, str, "category", cat_type]
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
 
-            for int_type in [int, np.int32, np.int16, np.int8]:
+            for int_type in int_types:
                 if not pser.hasnans:
                     self.assert_eq(pser.astype(int_type), psser.astype(int_type))
                 else:
@@ -54,14 +52,9 @@ def test_astype(self):
                         "values to integer" % psser._dtype_op.pretty_name,
                         lambda: psser.astype(int_type),
                     )
+            for other_type in other_types:
+                self.assert_eq(pser.astype(other_type), psser.astype(other_type))
 
-            self.assert_eq(pser.astype(bool), psser.astype(bool))
-            self.assert_eq(pser.astype(float), psser.astype(float))
-            self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
-            self.assert_eq(pser.astype(str), psser.astype(str))
-            self.assert_eq(pser.astype("category"), psser.astype("category"))
-            cat_type = CategoricalDtype(categories=[2, 1, 3])
-            self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
         if extension_object_dtypes_available and extension_float_dtypes_available:
             pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
             psser = ps.from_pandas(pser)
```

python/pyspark/pandas/tests/series/test_as_type.py

Lines changed: 10 additions & 1 deletion
```diff
@@ -22,6 +22,7 @@
 from pyspark import pandas as ps
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.testing.utils import is_ansi_mode_test
 from pyspark.pandas.typedef.typehints import (
     extension_dtypes_available,
     extension_float_dtypes_available,
@@ -31,6 +32,7 @@
 
 class SeriesAsTypeMixin:
     def test_astype(self):
+        # numeric
         psers = [pd.Series([10, 20, 15, 30, 45], name="x")]
 
         if extension_dtypes_available:
@@ -41,12 +43,14 @@ def test_astype(self):
         for pser in psers:
             self._test_numeric_astype(pser)
 
+        # numeric with nulls
         pser = pd.Series([10, 20, 15, 30, 45, None, np.nan], name="x")
         psser = ps.Series(pser)
 
         self.assert_eq(psser.astype(bool), pser.astype(bool))
         self.assert_eq(psser.astype(str), pser.astype(str))
 
+        # strings
         pser = pd.Series(["hi", "hi ", " ", " \t", "", None], name="x")
         psser = ps.Series(pser)
 
@@ -60,12 +64,16 @@ def test_astype(self):
             self._check_extension(psser.astype("string"), pser.astype("string"))
             self._check_extension(psser.astype(StringDtype()), pser.astype(StringDtype()))
 
+        # bools
         pser = pd.Series([True, False, None], name="x")
         psser = ps.Series(pser)
-
         self.assert_eq(psser.astype(bool), pser.astype(bool))
         self.assert_eq(psser.astype(str), pser.astype(str))
 
+        if is_ansi_mode_test:
+            with self.assertRaisesRegex(ValueError, "with missing values to integer"):
+                self.assert_eq(psser.astype(int))
+
         if extension_object_dtypes_available:
             from pandas import BooleanDtype, StringDtype
 
@@ -74,6 +82,7 @@ def test_astype(self):
             self._check_extension(psser.astype("string"), pser.astype("string"))
             self._check_extension(psser.astype(StringDtype()), pser.astype(StringDtype()))
 
+        # datetimes
         pser = pd.Series(["2020-10-27 00:00:01", None], name="x")
         psser = ps.Series(pser)
 
```
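
The series-level test gates the new assertion on `is_ansi_mode_test` from `pyspark.testing.utils`. A self-contained sketch of the same gating pattern, under the assumption that the flag tracks the `SPARK_ANSI_SQL_MODE` environment variable toggled by the test commands in the description and that `compute.eager_check` is left at its default; the module constant and class name below are illustrative, not from the patch:

```py
import os
import unittest

import pyspark.pandas as ps

# Assumed stand-in for pyspark.testing.utils.is_ansi_mode_test; presumably it
# reflects the SPARK_ANSI_SQL_MODE environment variable used by ./python/run-tests.
# The default chosen here is only for illustration.
is_ansi_mode_test = os.environ.get("SPARK_ANSI_SQL_MODE", "true").lower() == "true"


class BoolAsTypeSketch(unittest.TestCase):
    def test_bool_with_none_to_int(self):
        psser = ps.Series([True, False, None])
        if is_ansi_mode_test:
            # Under ANSI the cast now fails eagerly with the new message.
            with self.assertRaisesRegex(ValueError, "with missing values to integer"):
                psser.astype(int)
        else:
            # Legacy (non-ANSI) behavior from the PR description: nulls force
            # a float result instead of raising.
            self.assertEqual(str(psser.astype(int).dtype), "float64")


if __name__ == "__main__":
    unittest.main()
```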
