diff --git a/pandas/conftest.py b/pandas/conftest.py index f9c10a7758bd2..927056f7de752 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -2116,3 +2116,10 @@ def temp_file(tmp_path): file_path = tmp_path / str(uuid.uuid4()) file_path.touch() return file_path + + +@pytest.fixture(params=[True, False]) +def pdep16_nan_behavior(request): + opt = request.param + with pd.option_context("mode.pdep16_nan_behavior", opt): + yield opt diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e7a6b207363c3..d2de91f31b9ec 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -11,6 +11,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import ( lib, missing as libmissing, @@ -308,7 +310,9 @@ def __setitem__(self, key, value) -> None: def __contains__(self, key) -> bool: if isna(key) and key is not self.dtype.na_value: # GH#52840 - if self._data.dtype.kind == "f" and lib.is_float(key): + if lib.is_float(key) and get_option("mode.PDEP16_nan_behavior"): + key = self.dtype.na_value + elif self._data.dtype.kind == "f" and lib.is_float(key): return bool((np.isnan(self._data) & ~self._mask).any()) return bool(super().__contains__(key)) @@ -655,6 +659,8 @@ def reconstruct(x: np.ndarray): # reached in e.g. np.sqrt on BooleanArray # we don't support float16 x = x.astype(np.float32) + if get_option("mode.PDEP16_nan_behavior"): + m[np.isnan(x)] = True return FloatingArray(x, m) else: x[mask] = np.nan @@ -860,6 +866,9 @@ def _maybe_mask_result( if result.dtype.kind == "f": from pandas.core.arrays import FloatingArray + if get_option("mode.PDEP16_nan_behavior"): + mask[np.isnan(result)] = True + return FloatingArray(result, mask, copy=False) elif result.dtype.kind == "b": diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index f319a3cc05575..1e93b2a83c018 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -8,6 +8,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import ( lib, missing as libmissing, @@ -101,6 +103,8 @@ def __from_arrow__( array = array.combine_chunks() data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype) + if data.dtype.kind == "f" and get_option("mode.PDEP16_nan_behavior"): + mask[np.isnan(data)] = False return array_class(data.copy(), ~mask, copy=False) @classmethod @@ -261,10 +265,19 @@ def __init__( f"values should be {descr} numpy array. Use " "the 'pd.array' function instead" ) + if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + raise TypeError( + "mask should be bool numpy array. Use the 'pd.array' function instead" + ) + if values.dtype == np.float16: # If we don't raise here, then accessing self.dtype would raise raise TypeError("FloatingArray does not support np.float16 dtype.") + # NB: if get_option("mode.PDEP16_nan_behavior") is True + # then caller is responsible for ensuring + # assert mask[np.isnan(values)].all() + super().__init__(values, mask, copy=copy) @cache_readonly diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 20fe8cbab1c9f..b6debbfa54a2c 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -427,6 +427,17 @@ def is_terminal() -> bool: validator=is_one_of_factory([True, False, "warn"]), ) +with cf.config_prefix("mode"): + cf.register_option( + "PDEP16_nan_behavior", + True, + # TODO: set the default to False before merging; + # True is just to find the tests that break with it enabled. + "Whether to enable the PDEP-16 behavior *consistently* treating NaN " + "and NA as interchangeable for the purposes of numpy-nullable dtypes.", + validator=is_one_of_factory([True, False]), + ) + # user warnings chained_assignment = """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4e1ea07907cdb..4308f59290319 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6584,6 +6584,15 @@ def _maybe_cast_indexer(self, key): If we have a float key and are not a floating index, then try to cast to an int if equivalent. """ + if ( + is_float(key) + and np.isnan(key) + and isinstance(self.dtype, ExtensionDtype) + and self.dtype.kind == "f" + and get_option("mode.pdep16_nan_behavior") + ): + # TODO: better place to do this? + key = self.dtype.na_value return key def _maybe_cast_listlike_indexer(self, target) -> Index: diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 009fac4c2f5ed..695b9290bd0ba 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -35,21 +35,24 @@ def test_array_op(dtype, opname, exp): @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) -def test_divide_by_zero(dtype, zero, negative): +def test_divide_by_zero(dtype, zero, negative, pdep16_nan_behavior): # TODO pending NA/NaN discussion # https://github.com/pandas-dev/pandas/issues/32265/ a = pd.array([0, 1, -1, None], dtype=dtype) result = a / zero + exp_mask = np.array([False, False, False, True]) + if pdep16_nan_behavior: + exp_mask[[0, -1]] = True expected = FloatingArray( np.array([np.nan, np.inf, -np.inf, np.nan], dtype=dtype.numpy_dtype), - np.array([False, False, False, True]), + exp_mask, ) if negative: expected *= -1 tm.assert_extension_array_equal(result, expected) -def test_pow_scalar(dtype): +def test_pow_scalar(dtype, pdep16_nan_behavior): a = pd.array([-1, 0, 1, None, 2], dtype=dtype) result = a**0 expected = pd.array([1, 1, 1, 1, 1], dtype=dtype) @@ -64,11 +67,14 @@ def test_pow_scalar(dtype): tm.assert_extension_array_equal(result, expected) result = a**np.nan - # TODO np.nan should be converted to pd.NA / missing before operation? - expected = FloatingArray( - np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), - mask=a._mask, - ) + if pdep16_nan_behavior: + expected = pd.array([None, None, 1, None, None], dtype=dtype) + else: + # TODO np.nan should be converted to pd.NA / missing before operation? + expected = FloatingArray( + np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), + mask=a._mask, + ) tm.assert_extension_array_equal(result, expected) # reversed @@ -87,9 +93,11 @@ def test_pow_scalar(dtype): tm.assert_extension_array_equal(result, expected) result = np.nan**a - expected = FloatingArray( - np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask - ) + if not pdep16_nan_behavior: + # Otherwise the previous `expected` can be reused + expected = FloatingArray( + np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask + ) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_comparison.py b/pandas/tests/arrays/floating/test_comparison.py index a429649f1ce1d..55f94379360c3 100644 --- a/pandas/tests/arrays/floating/test_comparison.py +++ b/pandas/tests/arrays/floating/test_comparison.py @@ -38,11 +38,15 @@ def test_equals(): assert a1.equals(a2) is False -def test_equals_nan_vs_na(): +def test_equals_nan_vs_na(pdep16_nan_behavior): # GH#44382 mask = np.zeros(3, dtype=bool) data = np.array([1.0, np.nan, 3.0], dtype=np.float64) + if pdep16_nan_behavior: + # Under PDEP16, all callers of the FloatingArray constructor should + # ensure that mask[np.isnan(data)] = True + mask[1] = True left = FloatingArray(data, mask) assert left.equals(left) @@ -57,7 +61,11 @@ def test_equals_nan_vs_na(): assert right.equals(right) tm.assert_extension_array_equal(right, right) - assert not left.equals(right) + if not pdep16_nan_behavior: + assert not left.equals(right) + else: + # the constructor will set the NaN locations to NA + assert left.equals(right) # with mask[1] = True, the only difference is data[1], which should # not matter for equals diff --git a/pandas/tests/arrays/floating/test_contains.py b/pandas/tests/arrays/floating/test_contains.py index 956642697bf32..008236d05e809 100644 --- a/pandas/tests/arrays/floating/test_contains.py +++ b/pandas/tests/arrays/floating/test_contains.py @@ -3,10 +3,13 @@ import pandas as pd -def test_contains_nan(): +def test_contains_nan(pdep16_nan_behavior): # GH#52840 arr = pd.array(range(5)) / 0 assert np.isnan(arr._data[0]) - assert not arr.isna()[0] + if pdep16_nan_behavior: + assert arr.isna()[0] + else: + assert not arr.isna()[0] assert np.nan in arr diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py index e954cecba417a..dbf641380797b 100644 --- a/pandas/tests/arrays/floating/test_to_numpy.py +++ b/pandas/tests/arrays/floating/test_to_numpy.py @@ -81,11 +81,18 @@ def test_to_numpy_na_value(box): tm.assert_numpy_array_equal(result, expected) -def test_to_numpy_na_value_with_nan(): +def test_to_numpy_na_value_with_nan(pdep16_nan_behavior): # array with both NaN and NA -> only fill NA with `na_value` - arr = FloatingArray(np.array([0.0, np.nan, 0.0]), np.array([False, False, True])) + mask = np.array([False, False, True]) + if pdep16_nan_behavior: + mask[1] = True + arr = FloatingArray(np.array([0.0, np.nan, 0.0]), mask) result = arr.to_numpy(dtype="float64", na_value=-1) - expected = np.array([0.0, np.nan, -1.0], dtype="float64") + if pdep16_nan_behavior: + # the NaN passed to the constructor is considered as NA + expected = np.array([0.0, -1.0, -1.0], dtype="float64") + else: + expected = np.array([0.0, np.nan, -1.0], dtype="float64") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index 9fbea2022c87b..6bd4b6f424d5e 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -52,13 +52,16 @@ def test_div(dtype): @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) -def test_divide_by_zero(zero, negative): +def test_divide_by_zero(zero, negative, pdep16_nan_behavior): # https://github.com/pandas-dev/pandas/issues/27398, GH#22793 a = pd.array([0, 1, -1, None], dtype="Int64") result = a / zero + exp_mask = np.array([False, False, False, True]) + if pdep16_nan_behavior: + exp_mask[0] = True expected = FloatingArray( np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"), - np.array([False, False, False, True]), + exp_mask, ) if negative: expected *= -1 @@ -99,7 +102,7 @@ def test_mod(dtype): tm.assert_extension_array_equal(result, expected) -def test_pow_scalar(): +def test_pow_scalar(pdep16_nan_behavior): a = pd.array([-1, 0, 1, None, 2], dtype="Int64") result = a**0 expected = pd.array([1, 1, 1, 1, 1], dtype="Int64") @@ -114,10 +117,13 @@ def test_pow_scalar(): tm.assert_extension_array_equal(result, expected) result = a**np.nan - expected = FloatingArray( - np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"), - np.array([False, False, False, True, False]), - ) + if pdep16_nan_behavior: + expected = expected.astype("Float64") + else: + expected = FloatingArray( + np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"), + np.array([False, False, False, True, False]), + ) tm.assert_extension_array_equal(result, expected) # reversed @@ -136,10 +142,13 @@ def test_pow_scalar(): tm.assert_extension_array_equal(result, expected) result = np.nan**a - expected = FloatingArray( - np.array([1, np.nan, np.nan, np.nan], dtype="float64"), - np.array([False, False, True, False]), - ) + if pdep16_nan_behavior: + expected = expected.astype("Float64") + else: + expected = FloatingArray( + np.array([1, np.nan, np.nan, np.nan], dtype="float64"), + np.array([False, False, True, False]), + ) tm.assert_extension_array_equal(result, expected) @@ -212,7 +221,7 @@ def test_error_invalid_values(data, all_arithmetic_operators): # TODO test unsigned overflow -def test_arith_coerce_scalar(data, all_arithmetic_operators): +def test_arith_coerce_scalar(data, all_arithmetic_operators, pdep16_nan_behavior): op = tm.get_op_from_name(all_arithmetic_operators) s = pd.Series(data) other = 0.01 @@ -222,7 +231,7 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators): expected = expected.astype("Float64") # rmod results in NaN that wasn't NA in original nullable Series -> unmask it - if all_arithmetic_operators == "__rmod__": + if all_arithmetic_operators == "__rmod__" and not pdep16_nan_behavior: mask = (s == 0).fillna(False).to_numpy(bool) expected.array._mask[mask] = False diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 33300fff925f6..582ff0ecbd8bf 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -22,11 +22,14 @@ def test_ufuncs_single_int(ufunc): @pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) -def test_ufuncs_single_float(ufunc): +def test_ufuncs_single_float(ufunc, pdep16_nan_behavior): a = pd.array([1, 2, -3, np.nan]) with np.errstate(invalid="ignore"): result = ufunc(a) - expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask) + if pdep16_nan_behavior: + expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + else: + expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask) tm.assert_extension_array_equal(result, expected) s = pd.Series(a) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 79eb64b5a654f..56ddbe697c8d1 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -31,7 +31,7 @@ def test_can_hold_na_valid(self, data): # GH-20761 assert data._can_hold_na is True - def test_contains(self, data, data_missing): + def test_contains(self, data, data_missing, pdep16_nan_behavior): # GH-37867 # Tests for membership checks. Membership checks for nan-likes is tricky and # the settled on rule is: `nan_like in arr` is True if nan_like is @@ -55,7 +55,15 @@ def test_contains(self, data, data_missing): # type check for e.g. two instances of Decimal("NAN") continue assert na_value_obj not in data - assert na_value_obj not in data_missing + if ( + pdep16_nan_behavior + and isinstance(na_value_obj, float) + and isinstance(data, pd.core.arrays.BaseMaskedArray) + ): + # TODO: wrong place for this override + assert na_value_obj in data_missing + else: + assert na_value_obj not in data_missing def test_memory_usage(self, data): s = pd.Series(data) diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 3c1b98d57b2a0..6937242959fa1 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -339,35 +339,49 @@ def test_get_loc_masked_na(self, any_numeric_ea_and_arrow_dtype): with pytest.raises(KeyError, match="NA"): idx.get_loc(NA) - def test_get_loc_masked_na_and_nan(self): + def test_get_loc_masked_na_and_nan(self, pdep16_nan_behavior): # GH#39133 - idx = Index( - FloatingArray( - np.array([1, 2, 1, np.nan]), mask=np.array([False, False, True, False]) - ) - ) - result = idx.get_loc(NA) - assert result == 2 - result = idx.get_loc(np.nan) - assert result == 3 + mask = np.array([False, False, True, False]) + if pdep16_nan_behavior: + mask[-1] = True + idx = Index(FloatingArray(np.array([1, 2, 1, np.nan]), mask=mask)) + if pdep16_nan_behavior: + # NaN and NA are consistently treated as the same + result = idx.get_loc(NA) + expected = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + result = idx.get_loc(np.nan) + tm.assert_numpy_array_equal(result, expected) + else: + result = idx.get_loc(NA) + assert result == 2 + result = idx.get_loc(np.nan) + assert result == 3 idx = Index( FloatingArray(np.array([1, 2, 1.0]), mask=np.array([False, False, True])) ) result = idx.get_loc(NA) assert result == 2 - with pytest.raises(KeyError, match="nan"): - idx.get_loc(np.nan) + if pdep16_nan_behavior: + result = idx.get_loc(np.nan) + assert result == 2 + else: + with pytest.raises(KeyError, match="nan"): + idx.get_loc(np.nan) - idx = Index( - FloatingArray( - np.array([1, 2, np.nan]), mask=np.array([False, False, False]) - ) - ) + mask = np.array([False, False, False]) + if pdep16_nan_behavior: + mask[-1] = True + idx = Index(FloatingArray(np.array([1, 2, np.nan]), mask=mask)) result = idx.get_loc(np.nan) assert result == 2 - with pytest.raises(KeyError, match="NA"): - idx.get_loc(NA) + if pdep16_nan_behavior: + result = idx.get_loc(NA) + assert result == 2 + else: + with pytest.raises(KeyError, match="NA"): + idx.get_loc(NA) @pytest.mark.parametrize("val", [4, 2]) def test_get_indexer_masked_na(self, any_numeric_ea_and_arrow_dtype, val):