Skip to content

Commit 7708be0

Browse files
committed
POC: NA-only behavior for numpy-nullable dtypes
1 parent 0490e1b commit 7708be0

File tree

13 files changed

+164
-55
lines changed

13 files changed

+164
-55
lines changed

pandas/conftest.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2116,3 +2116,10 @@ def temp_file(tmp_path):
21162116
file_path = tmp_path / str(uuid.uuid4())
21172117
file_path.touch()
21182118
return file_path
2119+
2120+
2121+
@pytest.fixture(params=[True, False])
def pdep16_nan_behavior(request):
    """
    Fixture that runs a test with "mode.pdep16_nan_behavior" on and off.

    The option is set via ``pd.option_context`` while the test runs, and the
    active value (True or False) is yielded to the test.
    """
    enabled = request.param
    with pd.option_context("mode.pdep16_nan_behavior", enabled):
        yield enabled

pandas/core/arrays/masked.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111

1212
import numpy as np
1313

14+
from pandas._config import get_option
15+
1416
from pandas._libs import (
1517
lib,
1618
missing as libmissing,
@@ -308,7 +310,9 @@ def __setitem__(self, key, value) -> None:
308310
def __contains__(self, key) -> bool:
    # Special handling only applies to float keys that are NA-like but are
    # not this dtype's own na_value (i.e. a float NaN).
    if isna(key) and key is not self.dtype.na_value and lib.is_float(key):
        # GH#52840
        if get_option("mode.PDEP16_nan_behavior"):
            # With the option enabled, a float NaN key is looked up as the
            # dtype's na_value.
            key = self.dtype.na_value
        elif self._data.dtype.kind == "f":
            # Option disabled and float data: report whether any unmasked
            # entry holds NaN.
            has_unmasked_nan = (np.isnan(self._data) & ~self._mask).any()
            return bool(has_unmasked_nan)

    return bool(super().__contains__(key))
@@ -655,6 +659,8 @@ def reconstruct(x: np.ndarray):
655659
# reached in e.g. np.sqrt on BooleanArray
656660
# we don't support float16
657661
x = x.astype(np.float32)
662+
if get_option("mode.PDEP16_nan_behavior"):
663+
m[np.isnan(x)] = True
658664
return FloatingArray(x, m)
659665
else:
660666
x[mask] = np.nan
@@ -860,6 +866,9 @@ def _maybe_mask_result(
860866
if result.dtype.kind == "f":
861867
from pandas.core.arrays import FloatingArray
862868

869+
if get_option("mode.PDEP16_nan_behavior"):
870+
mask[np.isnan(result)] = True
871+
863872
return FloatingArray(result, mask, copy=False)
864873

865874
elif result.dtype.kind == "b":

pandas/core/arrays/numeric.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
import numpy as np
1010

11+
from pandas._config import get_option
12+
1113
from pandas._libs import (
1214
lib,
1315
missing as libmissing,
@@ -101,6 +103,8 @@ def __from_arrow__(
101103
array = array.combine_chunks()
102104

103105
data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype)
106+
if data.dtype.kind == "f" and get_option("mode.PDEP16_nan_behavior"):
107+
mask[np.isnan(data)] = False
104108
return array_class(data.copy(), ~mask, copy=False)
105109

106110
@classmethod
@@ -261,10 +265,19 @@ def __init__(
261265
f"values should be {descr} numpy array. Use "
262266
"the 'pd.array' function instead"
263267
)
268+
if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool):
269+
raise TypeError(
270+
"mask should be bool numpy array. Use the 'pd.array' function instead"
271+
)
272+
264273
if values.dtype == np.float16:
265274
# If we don't raise here, then accessing self.dtype would raise
266275
raise TypeError("FloatingArray does not support np.float16 dtype.")
267276

277+
# NB: if get_option("mode.PDEP16_nan_behavior") is True
278+
# then caller is responsible for ensuring
279+
# assert mask[np.isnan(values)].all()
280+
268281
super().__init__(values, mask, copy=copy)
269282

270283
@cache_readonly

pandas/core/config_init.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,17 @@ def is_terminal() -> bool:
427427
validator=is_one_of_factory([True, False, "warn"]),
428428
)
429429

430+
# Experimental switch for the PDEP-16 proof of concept, registered under the
# "mode" prefix.
with cf.config_prefix("mode"):
    cf.register_option(
        key="PDEP16_nan_behavior",
        # TODO: set the default to False before merging;
        #  True is just to find the tests that break with it enabled.
        defval=True,
        doc=(
            "Whether to enable the PDEP-16 behavior *consistently* treating NaN "
            "and NA as interchangeable for the purposes of numpy-nullable dtypes."
        ),
        validator=is_one_of_factory([True, False]),
    )
440+
430441

431442
# user warnings
432443
chained_assignment = """

pandas/core/indexes/base.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6584,6 +6584,15 @@ def _maybe_cast_indexer(self, key):
65846584
If we have a float key and are not a floating index, then try to cast
65856585
to an int if equivalent.
65866586
"""
6587+
if (
6588+
is_float(key)
6589+
and np.isnan(key)
6590+
and isinstance(self.dtype, ExtensionDtype)
6591+
and self.dtype.kind == "f"
6592+
and get_option("mode.pdep16_nan_behavior")
6593+
):
6594+
# TODO: better place to do this?
6595+
key = self.dtype.na_value
65876596
return key
65886597

65896598
def _maybe_cast_listlike_indexer(self, target) -> Index:

pandas/tests/arrays/floating/test_arithmetic.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,21 +35,24 @@ def test_array_op(dtype, opname, exp):
3535

3636

3737
@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
def test_divide_by_zero(dtype, zero, negative, pdep16_nan_behavior):
    # TODO pending NA/NaN discussion
    # https://github.com/pandas-dev/pandas/issues/32265/
    a = pd.array([0, 1, -1, None], dtype=dtype)
    result = a / zero

    expected_values = np.array(
        [np.nan, np.inf, -np.inf, np.nan], dtype=dtype.numpy_dtype
    )
    # The last slot is always masked (it comes from None). The first slot
    # holds NaN and is expected to be masked only when the option is enabled.
    expected_mask = np.array([pdep16_nan_behavior, False, False, True])
    expected = FloatingArray(expected_values, expected_mask)
    if negative:
        expected *= -1

    tm.assert_extension_array_equal(result, expected)
5053

5154

52-
def test_pow_scalar(dtype):
55+
def test_pow_scalar(dtype, pdep16_nan_behavior):
5356
a = pd.array([-1, 0, 1, None, 2], dtype=dtype)
5457
result = a**0
5558
expected = pd.array([1, 1, 1, 1, 1], dtype=dtype)
@@ -64,11 +67,14 @@ def test_pow_scalar(dtype):
6467
tm.assert_extension_array_equal(result, expected)
6568

6669
result = a**np.nan
67-
# TODO np.nan should be converted to pd.NA / missing before operation?
68-
expected = FloatingArray(
69-
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype),
70-
mask=a._mask,
71-
)
70+
if pdep16_nan_behavior:
71+
expected = pd.array([None, None, 1, None, None], dtype=dtype)
72+
else:
73+
# TODO np.nan should be converted to pd.NA / missing before operation?
74+
expected = FloatingArray(
75+
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype),
76+
mask=a._mask,
77+
)
7278
tm.assert_extension_array_equal(result, expected)
7379

7480
# reversed
@@ -87,9 +93,11 @@ def test_pow_scalar(dtype):
8793
tm.assert_extension_array_equal(result, expected)
8894

8995
result = np.nan**a
90-
expected = FloatingArray(
91-
np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask
92-
)
96+
if not pdep16_nan_behavior:
97+
# Otherwise the previous `expected` can be reused
98+
expected = FloatingArray(
99+
np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask
100+
)
93101
tm.assert_extension_array_equal(result, expected)
94102

95103

pandas/tests/arrays/floating/test_comparison.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,15 @@ def test_equals():
3838
assert a1.equals(a2) is False
3939

4040

41-
def test_equals_nan_vs_na():
41+
def test_equals_nan_vs_na(pdep16_nan_behavior):
4242
# GH#44382
4343

4444
mask = np.zeros(3, dtype=bool)
4545
data = np.array([1.0, np.nan, 3.0], dtype=np.float64)
46+
if pdep16_nan_behavior:
47+
# Under PDEP16, all callers of the FloatingArray constructor should
48+
# ensure that mask[np.isnan(data)] = True
49+
mask[1] = True
4650

4751
left = FloatingArray(data, mask)
4852
assert left.equals(left)
@@ -57,7 +61,11 @@ def test_equals_nan_vs_na():
5761
assert right.equals(right)
5862
tm.assert_extension_array_equal(right, right)
5963

60-
assert not left.equals(right)
64+
if not pdep16_nan_behavior:
65+
assert not left.equals(right)
66+
else:
67+
# the constructor will set the NaN locations to NA
68+
assert left.equals(right)
6169

6270
# with mask[1] = True, the only difference is data[1], which should
6371
# not matter for equals

pandas/tests/arrays/floating/test_contains.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,13 @@
33
import pandas as pd
44

55

6-
def test_contains_nan(pdep16_nan_behavior):
    # GH#52840
    arr = pd.array(range(5)) / 0

    # The underlying data holds NaN in the first slot ...
    assert np.isnan(arr._data[0])
    # ... and that slot is reported as NA exactly when the option is enabled.
    assert arr.isna()[0] == pdep16_nan_behavior
    # Membership of NaN holds in both modes.
    assert np.nan in arr

pandas/tests/arrays/floating/test_to_numpy.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,11 +81,18 @@ def test_to_numpy_na_value(box):
8181
tm.assert_numpy_array_equal(result, expected)
8282

8383

84-
def test_to_numpy_na_value_with_nan(pdep16_nan_behavior):
    # array with both NaN and NA -> only fill NA with `na_value`
    # With the option enabled, this test marks the NaN slot as masked itself
    # before calling the FloatingArray constructor.
    mask = np.array([False, pdep16_nan_behavior, True])
    arr = FloatingArray(np.array([0.0, np.nan, 0.0]), mask)

    result = arr.to_numpy(dtype="float64", na_value=-1)

    # A masked NaN slot is filled with na_value; an unmasked one stays NaN.
    nan_slot = -1.0 if pdep16_nan_behavior else np.nan
    expected = np.array([0.0, nan_slot, -1.0], dtype="float64")
    tm.assert_numpy_array_equal(result, expected)
9097

9198

pandas/tests/arrays/integer/test_arithmetic.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,16 @@ def test_div(dtype):
5252

5353

5454
@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
55-
def test_divide_by_zero(zero, negative):
55+
def test_divide_by_zero(zero, negative, pdep16_nan_behavior):
5656
# https://github.com/pandas-dev/pandas/issues/27398, GH#22793
5757
a = pd.array([0, 1, -1, None], dtype="Int64")
5858
result = a / zero
59+
exp_mask = np.array([False, False, False, True])
60+
if pdep16_nan_behavior:
61+
exp_mask[0] = True
5962
expected = FloatingArray(
6063
np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"),
61-
np.array([False, False, False, True]),
64+
exp_mask,
6265
)
6366
if negative:
6467
expected *= -1
@@ -99,7 +102,7 @@ def test_mod(dtype):
99102
tm.assert_extension_array_equal(result, expected)
100103

101104

102-
def test_pow_scalar():
105+
def test_pow_scalar(pdep16_nan_behavior):
103106
a = pd.array([-1, 0, 1, None, 2], dtype="Int64")
104107
result = a**0
105108
expected = pd.array([1, 1, 1, 1, 1], dtype="Int64")
@@ -114,10 +117,13 @@ def test_pow_scalar():
114117
tm.assert_extension_array_equal(result, expected)
115118

116119
result = a**np.nan
117-
expected = FloatingArray(
118-
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"),
119-
np.array([False, False, False, True, False]),
120-
)
120+
if pdep16_nan_behavior:
121+
expected = expected.astype("Float64")
122+
else:
123+
expected = FloatingArray(
124+
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"),
125+
np.array([False, False, False, True, False]),
126+
)
121127
tm.assert_extension_array_equal(result, expected)
122128

123129
# reversed
@@ -136,10 +142,13 @@ def test_pow_scalar():
136142
tm.assert_extension_array_equal(result, expected)
137143

138144
result = np.nan**a
139-
expected = FloatingArray(
140-
np.array([1, np.nan, np.nan, np.nan], dtype="float64"),
141-
np.array([False, False, True, False]),
142-
)
145+
if pdep16_nan_behavior:
146+
expected = expected.astype("Float64")
147+
else:
148+
expected = FloatingArray(
149+
np.array([1, np.nan, np.nan, np.nan], dtype="float64"),
150+
np.array([False, False, True, False]),
151+
)
143152
tm.assert_extension_array_equal(result, expected)
144153

145154

@@ -212,7 +221,7 @@ def test_error_invalid_values(data, all_arithmetic_operators):
212221
# TODO test unsigned overflow
213222

214223

215-
def test_arith_coerce_scalar(data, all_arithmetic_operators):
224+
def test_arith_coerce_scalar(data, all_arithmetic_operators, pdep16_nan_behavior):
216225
op = tm.get_op_from_name(all_arithmetic_operators)
217226
s = pd.Series(data)
218227
other = 0.01
@@ -222,7 +231,7 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators):
222231
expected = expected.astype("Float64")
223232

224233
# rmod results in NaN that wasn't NA in original nullable Series -> unmask it
225-
if all_arithmetic_operators == "__rmod__":
234+
if all_arithmetic_operators == "__rmod__" and not pdep16_nan_behavior:
226235
mask = (s == 0).fillna(False).to_numpy(bool)
227236
expected.array._mask[mask] = False
228237

pandas/tests/arrays/integer/test_function.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,14 @@ def test_ufuncs_single_int(ufunc):
2222

2323

2424
@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt])
25-
def test_ufuncs_single_float(ufunc):
25+
def test_ufuncs_single_float(ufunc, pdep16_nan_behavior):
2626
a = pd.array([1, 2, -3, np.nan])
2727
with np.errstate(invalid="ignore"):
2828
result = ufunc(a)
29-
expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask)
29+
if pdep16_nan_behavior:
30+
expected = pd.array(ufunc(a.astype(float)), dtype="Float64")
31+
else:
32+
expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask)
3033
tm.assert_extension_array_equal(result, expected)
3134

3235
s = pd.Series(a)

pandas/tests/extension/base/interface.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def test_can_hold_na_valid(self, data):
3131
# GH-20761
3232
assert data._can_hold_na is True
3333

34-
def test_contains(self, data, data_missing):
34+
def test_contains(self, data, data_missing, pdep16_nan_behavior):
3535
# GH-37867
3636
# Tests for membership checks. Membership checks for nan-likes is tricky and
3737
# the settled on rule is: `nan_like in arr` is True if nan_like is
@@ -55,7 +55,15 @@ def test_contains(self, data, data_missing):
5555
# type check for e.g. two instances of Decimal("NAN")
5656
continue
5757
assert na_value_obj not in data
58-
assert na_value_obj not in data_missing
58+
if (
59+
pdep16_nan_behavior
60+
and isinstance(na_value_obj, float)
61+
and isinstance(data, pd.core.arrays.BaseMaskedArray)
62+
):
63+
# TODO: wrong place for this override
64+
assert na_value_obj in data_missing
65+
else:
66+
assert na_value_obj not in data_missing
5967

6068
def test_memory_usage(self, data):
6169
s = pd.Series(data)

0 commit comments

Comments
 (0)