From 8ce67401309bfff68010b3cf4edc5a97ea1e3c3d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 21 Dec 2023 07:30:29 -0800 Subject: [PATCH 01/31] DEPR: DatetimeArray/TimedeltaArray.__init__ (#56043) * DEPR: DatetimeArray/TimedeltaArray.__init__ * mypy fixup * fix PeriodArray test * update doctest * remove accidental --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/_mixins.py | 11 +- pandas/core/arrays/datetimelike.py | 17 ++- pandas/core/arrays/datetimes.py | 4 +- pandas/core/arrays/period.py | 2 +- pandas/core/arrays/timedeltas.py | 8 +- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/indexes/datetimelike.py | 5 +- pandas/core/internals/managers.py | 2 +- pandas/core/ops/array_ops.py | 4 +- .../arrays/datetimes/test_constructors.py | 103 +++++++++++------- pandas/tests/arrays/test_array.py | 6 +- pandas/tests/arrays/test_datetimelike.py | 77 +++++++------ pandas/tests/arrays/test_datetimes.py | 30 ++--- pandas/tests/arrays/test_timedeltas.py | 20 ++-- .../arrays/timedeltas/test_constructors.py | 65 +++++++---- .../arrays/timedeltas/test_reductions.py | 2 +- pandas/tests/base/test_conversion.py | 24 ++-- pandas/tests/dtypes/test_generic.py | 5 +- pandas/tests/extension/test_datetime.py | 16 ++- .../indexes/datetimes/test_constructors.py | 8 +- .../indexes/timedeltas/test_constructors.py | 11 +- .../tests/reductions/test_stat_reductions.py | 11 +- pandas/tests/test_downstream.py | 4 +- pandas/tests/test_nanops.py | 9 +- pandas/tests/tools/test_to_datetime.py | 4 +- 26 files changed, 258 insertions(+), 193 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index f7017aba1d996..70039cc697b8a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -443,6 +443,7 @@ Other Deprecations - Deprecated :func:`pd.core.internals.api.make_block`, use public APIs instead (:issue:`40226`) - Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. 
Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) - Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) +- Deprecated :meth:`DatetimeArray.__init__` and :meth:`TimedeltaArray.__init__`, use :func:`array` instead (:issue:`55623`) - Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) - Deprecated :meth:`Series.ravel`, the underlying array is already 1D, so ravel is not necessary (:issue:`52511`) - Deprecated :meth:`Series.resample` and :meth:`DataFrame.resample` with a :class:`PeriodIndex` (and the 'convention' keyword), convert to :class:`DatetimeIndex` (with ``.to_timestamp()``) before resampling instead (:issue:`53481`) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index cb8f802239146..9ece12cf51a7b 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -133,21 +133,20 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike: cls = dtype.construct_array_type() return cls(arr.view("i8"), dtype=dtype) elif isinstance(dtype, DatetimeTZDtype): - # error: Incompatible types in assignment (expression has type - # "type[DatetimeArray]", variable has type "type[PeriodArray]") - cls = dtype.construct_array_type() # type: ignore[assignment] + dt_cls = dtype.construct_array_type() dt64_values = arr.view(f"M8[{dtype.unit}]") - return cls(dt64_values, dtype=dtype) + return dt_cls._simple_new(dt64_values, dtype=dtype) elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype): from pandas.core.arrays import DatetimeArray dt64_values = arr.view(dtype) - return DatetimeArray(dt64_values, dtype=dtype) + return DatetimeArray._simple_new(dt64_values, dtype=dtype) + elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype): from pandas.core.arrays import TimedeltaArray td64_values = arr.view(dtype) - return TimedeltaArray(td64_values, dtype=dtype) + return TimedeltaArray._simple_new(td64_values, dtype=dtype) # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None, diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1530fae89aa00..11a0c7bf18fcb 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -269,7 +269,7 @@ def _unbox_scalar( Examples -------- - >>> arr = pd.arrays.DatetimeArray(np.array(['1970-01-01'], 'datetime64[ns]')) + >>> arr = pd.array(np.array(['1970-01-01'], 'datetime64[ns]')) >>> arr._unbox_scalar(arr[0]) numpy.datetime64('1970-01-01T00:00:00.000000000') """ @@ -1409,7 +1409,7 @@ def __add__(self, other): if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray(result) + return TimedeltaArray._from_sequence(result) return result def __radd__(self, other): @@ -1469,7 +1469,7 @@ def __sub__(self, other): if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray(result) + return TimedeltaArray._from_sequence(result) return result def __rsub__(self, other): @@ -1488,7 +1488,7 @@ def __rsub__(self, other): # Avoid 
down-casting DatetimeIndex
                from pandas.core.arrays import DatetimeArray

-                other = DatetimeArray(other)
+                other = DatetimeArray._from_sequence(other)
             return other - self
         elif self.dtype.kind == "M" and hasattr(other, "dtype") and not other_is_dt64:
             # GH#19959 datetime - datetime is well-defined as timedelta,
@@ -1725,7 +1725,7 @@ def _groupby_op(
                 self = cast("DatetimeArray | TimedeltaArray", self)
                 new_dtype = f"m8[{self.unit}]"
                 res_values = res_values.view(new_dtype)
-                return TimedeltaArray(res_values)
+                return TimedeltaArray._simple_new(res_values, dtype=res_values.dtype)

         res_values = res_values.view(self._ndarray.dtype)
         return self._from_backing_data(res_values)
@@ -1944,6 +1944,13 @@ class TimelikeOps(DatetimeLikeArrayMixin):
     def __init__(
         self, values, dtype=None, freq=lib.no_default, copy: bool = False
     ) -> None:
+        warnings.warn(
+            # GH#55623
+            f"{type(self).__name__}.__init__ is deprecated and will be "
+            "removed in a future version. Use pd.array instead.",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
         if dtype is not None:
             dtype = pandas_dtype(dtype)

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 9329f82c1b646..6b7ddc4a72957 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -202,8 +202,8 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):  # type: ignore[misc]
     Examples
     --------
-    >>> pd.arrays.DatetimeArray(pd.DatetimeIndex(['2023-01-01', '2023-01-02']),
-    ...                         freq='D')
+    >>> pd.arrays.DatetimeArray._from_sequence(
+    ...    pd.DatetimeIndex(['2023-01-01', '2023-01-02'], freq='D'))
     <DatetimeArray>
     ['2023-01-01 00:00:00', '2023-01-02 00:00:00']
     Length: 2, dtype: datetime64[ns]
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index 0e2d4409b9f39..2930b979bfe78 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -664,7 +664,7 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray:
         new_parr = self.asfreq(freq, how=how)

         new_data = libperiod.periodarr_to_dt64arr(new_parr.asi8, base)
-        dta = DatetimeArray(new_data)
+        dta = DatetimeArray._from_sequence(new_data)

         if self.freq.name == "B":
             # See if we can retain BDay instead of Day in cases where
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index 95c3c641fd51a..1b885a2bdcd47 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -132,7 +132,7 @@ class TimedeltaArray(dtl.TimelikeOps):
     Examples
     --------
-    >>> pd.arrays.TimedeltaArray(pd.TimedeltaIndex(['1h', '2h']))
+    >>> pd.arrays.TimedeltaArray._from_sequence(pd.TimedeltaIndex(['1h', '2h']))
     <TimedeltaArray>
     ['0 days 01:00:00', '0 days 02:00:00']
     Length: 2, dtype: timedelta64[ns]
@@ -709,11 +709,13 @@ def __neg__(self) -> TimedeltaArray:
         return type(self)._simple_new(-self._ndarray, dtype=self.dtype, freq=freq)

     def __pos__(self) -> TimedeltaArray:
-        return type(self)(self._ndarray.copy(), freq=self.freq)
+        return type(self)._simple_new(
+            self._ndarray.copy(), dtype=self.dtype, freq=self.freq
+        )

     def __abs__(self) -> TimedeltaArray:
         # Note: freq is not preserved
-        return type(self)(np.abs(self._ndarray))
+        return type(self)._simple_new(np.abs(self._ndarray), dtype=self.dtype)

     # ----------------------------------------------------------------
     # Conversion Methods - Vectorized analogues of Timedelta methods
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 4c1654ab0f5e4..ed5256922377a 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -919,7 +919,7 @@ 
def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> DatetimeArray: else: np_arr = array.to_numpy() - return DatetimeArray(np_arr, dtype=self, copy=False) + return DatetimeArray._from_sequence(np_arr, dtype=self, copy=False) def __setstate__(self, state) -> None: # for pickle compat. __get_state__ is defined in the diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 4fc2fdb3202b1..cad8737a987d4 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -697,7 +697,10 @@ def _fast_union(self, other: Self, sort=None) -> Self: dates = concat_compat([left._values, right_chunk]) # The can_fast_union check ensures that the result.freq # should match self.freq - dates = type(self._data)(dates, freq=self.freq) + assert isinstance(dates, type(self._data)) + # error: Item "ExtensionArray" of "ExtensionArray | + # ndarray[Any, Any]" has no attribute "_freq" + assert dates._freq == self.freq # type: ignore[union-attr] result = type(self)._simple_new(dates) return result else: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 781b6df5ebd3d..3719bf1f77f85 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2345,7 +2345,7 @@ def make_na_array(dtype: DtypeObj, shape: Shape, fill_value) -> ArrayLike: ts = Timestamp(fill_value).as_unit(dtype.unit) i8values = np.full(shape, ts._value) dt64values = i8values.view(f"M8[{dtype.unit}]") - return DatetimeArray(dt64values, dtype=dtype) + return DatetimeArray._simple_new(dt64values, dtype=dtype) elif is_1d_only_ea_dtype(dtype): dtype = cast(ExtensionDtype, dtype) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index d8a772aac6082..ee3f8787d78b5 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -545,7 +545,7 @@ def maybe_prepare_scalar_for_op(obj, shape: Shape): new_dtype = get_supported_dtype(obj.dtype) obj = obj.astype(new_dtype) right = np.broadcast_to(obj, shape) - return DatetimeArray(right) + return DatetimeArray._simple_new(right, dtype=right.dtype) return Timestamp(obj) @@ -563,7 +563,7 @@ def maybe_prepare_scalar_for_op(obj, shape: Shape): new_dtype = get_supported_dtype(obj.dtype) obj = obj.astype(new_dtype) right = np.broadcast_to(obj, shape) - return TimedeltaArray(right) + return TimedeltaArray._simple_new(right, dtype=right.dtype) # In particular non-nanosecond timedelta64 needs to be cast to # nanoseconds, or else we get undesired behavior like diff --git a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py index 97fa6e8d529b7..daf4aa3b47f56 100644 --- a/pandas/tests/arrays/datetimes/test_constructors.py +++ b/pandas/tests/arrays/datetimes/test_constructors.py @@ -19,13 +19,16 @@ def test_from_sequence_invalid_type(self): def test_only_1dim_accepted(self): arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]") - with pytest.raises(ValueError, match="Only 1-dimensional"): - # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 - DatetimeArray(arr.reshape(2, 2, 1)) + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 + DatetimeArray(arr.reshape(2, 2, 1)) - with pytest.raises(ValueError, match="Only 1-dimensional"): - # 0-dim - DatetimeArray(arr[[0]].squeeze()) + with 
tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 0-dim + DatetimeArray(arr[[0]].squeeze()) def test_freq_validation(self): # GH#24623 check that invalid instances cannot be created with the @@ -36,8 +39,10 @@ def test_freq_validation(self): "Inferred frequency h from passed values does not " "conform to passed frequency W-SUN" ) - with pytest.raises(ValueError, match=msg): - DatetimeArray(arr, freq="W") + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr, freq="W") @pytest.mark.parametrize( "meth", @@ -72,31 +77,40 @@ def test_from_pandas_array(self): tm.assert_datetime_array_equal(result, expected) def test_mismatched_timezone_raises(self): - arr = DatetimeArray( - np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"), - dtype=DatetimeTZDtype(tz="US/Central"), - ) + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + arr = DatetimeArray( + np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"), + dtype=DatetimeTZDtype(tz="US/Central"), + ) dtype = DatetimeTZDtype(tz="US/Eastern") msg = r"dtype=datetime64\[ns.*\] does not match data dtype datetime64\[ns.*\]" - with pytest.raises(TypeError, match=msg): - DatetimeArray(arr, dtype=dtype) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(TypeError, match=msg): + DatetimeArray(arr, dtype=dtype) # also with mismatched tzawareness - with pytest.raises(TypeError, match=msg): - DatetimeArray(arr, dtype=np.dtype("M8[ns]")) - with pytest.raises(TypeError, match=msg): - DatetimeArray(arr.tz_localize(None), dtype=arr.dtype) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(TypeError, match=msg): + DatetimeArray(arr, dtype=np.dtype("M8[ns]")) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(TypeError, match=msg): + DatetimeArray(arr.tz_localize(None), dtype=arr.dtype) def test_non_array_raises(self): - with pytest.raises(ValueError, match="list"): - DatetimeArray([1, 2, 3]) + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="list"): + DatetimeArray([1, 2, 3]) def test_bool_dtype_raises(self): arr = np.array([1, 2, 3], dtype="bool") + depr_msg = "DatetimeArray.__init__ is deprecated" msg = "Unexpected value for 'dtype': 'bool'. 
Must be" - with pytest.raises(ValueError, match=msg): - DatetimeArray(arr) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr) msg = r"dtype bool cannot be converted to datetime64\[ns\]" with pytest.raises(TypeError, match=msg): @@ -109,43 +123,52 @@ def test_bool_dtype_raises(self): pd.to_datetime(arr) def test_incorrect_dtype_raises(self): - with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): - DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category") + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category") - with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): - DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="m8[s]") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="m8[s]") - with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): - DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="M8[D]") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="M8[D]") def test_mismatched_values_dtype_units(self): arr = np.array([1, 2, 3], dtype="M8[s]") dtype = np.dtype("M8[ns]") msg = "Values resolution does not match dtype." + depr_msg = "DatetimeArray.__init__ is deprecated" - with pytest.raises(ValueError, match=msg): - DatetimeArray(arr, dtype=dtype) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr, dtype=dtype) dtype2 = DatetimeTZDtype(tz="UTC", unit="ns") - with pytest.raises(ValueError, match=msg): - DatetimeArray(arr, dtype=dtype2) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr, dtype=dtype2) def test_freq_infer_raises(self): - with pytest.raises(ValueError, match="Frequency inference"): - DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer") + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Frequency inference"): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer") def test_copy(self): data = np.array([1, 2, 3], dtype="M8[ns]") - arr = DatetimeArray(data, copy=False) + arr = DatetimeArray._from_sequence(data, copy=False) assert arr._ndarray is data - arr = DatetimeArray(data, copy=True) + arr = DatetimeArray._from_sequence(data, copy=True) assert arr._ndarray is not data @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_numpy_datetime_unit(self, unit): data = np.array([1, 2, 3], dtype=f"M8[{unit}]") - arr = DatetimeArray(data) + arr = DatetimeArray._from_sequence(data) assert arr.unit == unit assert arr[0].unit == unit @@ -210,7 +233,7 @@ def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_( dtype = DatetimeTZDtype(unit=pd_unit, tz=pd_tz) result = dtype.__from_arrow__(arr) - expected = DatetimeArray( + expected = DatetimeArray._from_sequence( np.array(data, dtype=f"datetime64[{pa_unit}]").astype(f"datetime64[{pd_unit}]"), dtype=dtype, ) @@ -238,7 
+261,7 @@ def test_from_arrow_from_empty(unit, tz): dtype = DatetimeTZDtype(unit=unit, tz=tz) result = dtype.__from_arrow__(arr) - expected = DatetimeArray(np.array(data, dtype=f"datetime64[{unit}]")) + expected = DatetimeArray._from_sequence(np.array(data, dtype=f"datetime64[{unit}]")) expected = expected.tz_localize(tz=tz) tm.assert_extension_array_equal(result, expected) @@ -254,7 +277,7 @@ def test_from_arrow_from_integers(): dtype = DatetimeTZDtype(unit="ns", tz="UTC") result = dtype.__from_arrow__(arr) - expected = DatetimeArray(np.array(data, dtype="datetime64[ns]")) + expected = DatetimeArray._from_sequence(np.array(data, dtype="datetime64[ns]")) expected = expected.tz_localize("UTC") tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 4381469196e18..96263f498935b 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -296,7 +296,7 @@ def test_array_copy(): ), ( np.array([1, 2], dtype="M8[ns]"), - DatetimeArray(np.array([1, 2], dtype="M8[ns]")), + DatetimeArray._from_sequence(np.array([1, 2], dtype="M8[ns]")), ), ( np.array([1, 2], dtype="M8[us]"), @@ -327,11 +327,11 @@ def test_array_copy(): ), ( np.array([1, 2], dtype="m8[ns]"), - TimedeltaArray(np.array([1, 2], dtype="m8[ns]")), + TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")), ), ( np.array([1, 2], dtype="m8[us]"), - TimedeltaArray(np.array([1, 2], dtype="m8[us]")), + TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")), ), # integer ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 4fba662631b42..82524ea115019 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -89,7 +89,10 @@ class SharedTests: def arr1d(self): """Fixture returning DatetimeArray with daily frequency.""" data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq="D") + if self.array_cls is PeriodArray: + arr = self.array_cls(data, freq="D") + else: + arr = self.index_cls(data, freq="D")._data return arr def test_compare_len1_raises(self, arr1d): @@ -161,7 +164,7 @@ def test_take(self): if self.array_cls is PeriodArray: arr = PeriodArray(data, dtype="period[D]") else: - arr = self.array_cls(data) + arr = self.index_cls(data)._data idx = self.index_cls._simple_new(arr) takers = [1, 4, 94] @@ -211,7 +214,7 @@ def test_concat_same_type(self, arr1d): arr = arr1d idx = self.index_cls(arr) idx = idx.insert(0, NaT) - arr = self.array_cls(idx) + arr = arr1d result = arr._concat_same_type([arr[:-1], arr[1:], arr]) arr2 = arr.astype(object) @@ -251,7 +254,7 @@ def test_fillna_method_doesnt_change_orig(self, method): if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls(data) + arr = self.array_cls._from_sequence(data) arr[4] = NaT fill_value = arr[3] if method == "pad" else arr[5] @@ -267,7 +270,7 @@ def test_searchsorted(self): if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls(data) + arr = self.array_cls._from_sequence(data) # scalar result = arr.searchsorted(arr[1]) @@ -339,7 +342,7 @@ def test_getitem_near_implementation_bounds(self): if self.array_cls is PeriodArray: arr = self.array_cls(i8vals, dtype="period[ns]") else: - arr = self.array_cls(i8vals, freq="ns") + arr = self.index_cls(i8vals, freq="ns")._data arr[0] 
# should not raise OutOfBoundsDatetime index = pd.Index(arr) @@ -350,13 +353,15 @@ def test_getitem_near_implementation_bounds(self): def test_getitem_2d(self, arr1d): # 2d slicing on a 1D array - expected = type(arr1d)(arr1d._ndarray[:, np.newaxis], dtype=arr1d.dtype) + expected = type(arr1d)._simple_new( + arr1d._ndarray[:, np.newaxis], dtype=arr1d.dtype + ) result = arr1d[:, np.newaxis] tm.assert_equal(result, expected) # Lookup on a 2D array arr2d = expected - expected = type(arr2d)(arr2d._ndarray[:3, 0], dtype=arr2d.dtype) + expected = type(arr2d)._simple_new(arr2d._ndarray[:3, 0], dtype=arr2d.dtype) result = arr2d[:3, 0] tm.assert_equal(result, expected) @@ -409,7 +414,7 @@ def test_setitem(self): if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls(data, freq="D") + arr = self.index_cls(data, freq="D")._data arr[0] = arr[1] expected = np.arange(10, dtype="i8") * 24 * 3600 * 10**9 @@ -524,7 +529,7 @@ def test_inplace_arithmetic(self): if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls(data, freq="D") + arr = self.index_cls(data, freq="D")._data expected = arr + pd.Timedelta(days=1) arr += pd.Timedelta(days=1) @@ -589,10 +594,13 @@ def test_median(self, arr1d): def test_from_integer_array(self): arr = np.array([1, 2, 3], dtype=np.int64) - expected = self.array_cls(arr, dtype=self.example_dtype) - data = pd.array(arr, dtype="Int64") - result = self.array_cls(data, dtype=self.example_dtype) + if self.array_cls is PeriodArray: + expected = self.array_cls(arr, dtype=self.example_dtype) + result = self.array_cls(data, dtype=self.example_dtype) + else: + expected = self.array_cls._from_sequence(arr, dtype=self.example_dtype) + result = self.array_cls._from_sequence(data, dtype=self.example_dtype) tm.assert_extension_array_equal(result, expected) @@ -629,7 +637,7 @@ def test_round(self, arr1d): tm.assert_datetime_array_equal(result, expected) def test_array_interface(self, datetime_index): - arr = DatetimeArray(datetime_index) + arr = datetime_index._data # default asarray gives the same underlying data (for tz naive) result = np.asarray(arr) @@ -723,10 +731,10 @@ def test_array_i8_dtype(self, arr1d): def test_from_array_keeps_base(self): # Ensure that DatetimeArray._ndarray.base isn't lost. arr = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") - dta = DatetimeArray(arr) + dta = DatetimeArray._from_sequence(arr) assert dta._ndarray is arr - dta = DatetimeArray(arr[:0]) + dta = DatetimeArray._from_sequence(arr[:0]) assert dta._ndarray.base is arr def test_from_dti(self, arr1d): @@ -751,7 +759,7 @@ def test_astype_object(self, arr1d): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_to_period(self, datetime_index, freqstr): dti = datetime_index - arr = DatetimeArray(dti) + arr = dti._data freqstr = freq_to_period_freqstr(1, freqstr) expected = dti.to_period(freq=freqstr) @@ -853,14 +861,10 @@ def test_concat_same_type_invalid(self, arr1d): def test_concat_same_type_different_freq(self, unit): # we *can* concatenate DTI with different freqs. 
- a = DatetimeArray( - pd.date_range("2000", periods=2, freq="D", tz="US/Central", unit=unit) - ) - b = DatetimeArray( - pd.date_range("2000", periods=2, freq="h", tz="US/Central", unit=unit) - ) + a = pd.date_range("2000", periods=2, freq="D", tz="US/Central", unit=unit)._data + b = pd.date_range("2000", periods=2, freq="h", tz="US/Central", unit=unit)._data result = DatetimeArray._concat_same_type([a, b]) - expected = DatetimeArray( + expected = ( pd.to_datetime( [ "2000-01-01 00:00:00", @@ -871,6 +875,7 @@ def test_concat_same_type_different_freq(self, unit): ) .tz_localize("US/Central") .as_unit(unit) + ._data ) tm.assert_datetime_array_equal(result, expected) @@ -884,7 +889,7 @@ def test_strftime(self, arr1d): def test_strftime_nat(self): # GH 29578 - arr = DatetimeArray(DatetimeIndex(["2019-01-01", NaT])) + arr = DatetimeIndex(["2019-01-01", NaT])._data result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) @@ -899,7 +904,7 @@ class TestTimedeltaArray(SharedTests): def test_from_tdi(self): tdi = TimedeltaIndex(["1 Day", "3 Hours"]) - arr = TimedeltaArray(tdi) + arr = tdi._data assert list(arr) == list(tdi) # Check that Index.__new__ knows what to do with TimedeltaArray @@ -909,7 +914,7 @@ def test_from_tdi(self): def test_astype_object(self): tdi = TimedeltaIndex(["1 Day", "3 Hours"]) - arr = TimedeltaArray(tdi) + arr = tdi._data asobj = arr.astype("O") assert isinstance(asobj, np.ndarray) assert asobj.dtype == "O" @@ -917,7 +922,7 @@ def test_astype_object(self): def test_to_pytimedelta(self, timedelta_index): tdi = timedelta_index - arr = TimedeltaArray(tdi) + arr = tdi._data expected = tdi.to_pytimedelta() result = arr.to_pytimedelta() @@ -926,7 +931,7 @@ def test_to_pytimedelta(self, timedelta_index): def test_total_seconds(self, timedelta_index): tdi = timedelta_index - arr = TimedeltaArray(tdi) + arr = tdi._data expected = tdi.total_seconds() result = arr.total_seconds() @@ -936,7 +941,7 @@ def test_total_seconds(self, timedelta_index): @pytest.mark.parametrize("propname", TimedeltaArray._field_ops) def test_int_properties(self, timedelta_index, propname): tdi = timedelta_index - arr = TimedeltaArray(tdi) + arr = tdi._data result = getattr(arr, propname) expected = np.array(getattr(tdi, propname), dtype=result.dtype) @@ -944,7 +949,7 @@ def test_int_properties(self, timedelta_index, propname): tm.assert_numpy_array_equal(result, expected) def test_array_interface(self, timedelta_index): - arr = TimedeltaArray(timedelta_index) + arr = timedelta_index._data # default asarray gives the same underlying data result = np.asarray(arr) @@ -987,7 +992,7 @@ def test_array_interface(self, timedelta_index): def test_take_fill_valid(self, timedelta_index, fixed_now_ts): tdi = timedelta_index - arr = TimedeltaArray(tdi) + arr = tdi._data td1 = pd.Timedelta(days=1) result = arr.take([-1, 1], allow_fill=True, fill_value=td1) @@ -1062,7 +1067,7 @@ def test_to_timestamp(self, how, arr1d): pi = self.index_cls(arr1d) arr = arr1d - expected = DatetimeArray(pi.to_timestamp(how=how)) + expected = DatetimeIndex(pi.to_timestamp(how=how))._data result = arr.to_timestamp(how=how) assert isinstance(result, DatetimeArray) @@ -1308,8 +1313,10 @@ def test_from_pandas_array(dtype): cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype] - result = cls(arr) - expected = cls(data) + depr_msg = f"{cls.__name__}.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = cls(arr) + expected = cls(data) 
tm.assert_extension_array_equal(result, expected) result = cls._from_sequence(arr, dtype=dtype) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index db86f62a10484..9a576be10d5ca 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -281,7 +281,7 @@ def test_cmp_dt64_arraylike_tznaive(self, comparison_op): op = comparison_op dti = pd.date_range("2016-01-1", freq="MS", periods=9, tz=None) - arr = DatetimeArray(dti) + arr = dti._data assert arr.freq == dti.freq assert arr.tz == dti.tz @@ -426,7 +426,7 @@ def test_setitem_str_impute_tz(self, tz_naive_fixture): data = np.array([1, 2, 3], dtype="M8[ns]") dtype = data.dtype if tz is None else DatetimeTZDtype(tz=tz) - arr = DatetimeArray(data, dtype=dtype) + arr = DatetimeArray._from_sequence(data, dtype=dtype) expected = arr.copy() ts = pd.Timestamp("2020-09-08 16:50").tz_localize(tz) @@ -446,7 +446,9 @@ def test_setitem_different_tz_raises(self): # pre-2.0 we required exact tz match, in 2.0 we require only # tzawareness-match data = np.array([1, 2, 3], dtype="M8[ns]") - arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central")) + arr = DatetimeArray._from_sequence( + data, copy=False, dtype=DatetimeTZDtype(tz="US/Central") + ) with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): arr[0] = pd.Timestamp("2000") @@ -455,7 +457,7 @@ def test_setitem_different_tz_raises(self): assert arr[0] == ts.tz_convert("US/Central") def test_setitem_clears_freq(self): - a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central")) + a = pd.date_range("2000", periods=2, freq="D", tz="US/Central")._data a[0] = pd.Timestamp("2000", tz="US/Central") assert a.freq is None @@ -477,7 +479,7 @@ def test_setitem_objects(self, obj): def test_repeat_preserves_tz(self): dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central") - arr = DatetimeArray(dti) + arr = dti._data repeated = arr.repeat([1, 1]) @@ -487,7 +489,7 @@ def test_repeat_preserves_tz(self): def test_value_counts_preserves_tz(self): dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central") - arr = DatetimeArray(dti).repeat([4, 3]) + arr = dti._data.repeat([4, 3]) result = arr.value_counts() @@ -502,7 +504,7 @@ def test_value_counts_preserves_tz(self): @pytest.mark.parametrize("method", ["pad", "backfill"]) def test_fillna_preserves_tz(self, method): dti = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central") - arr = DatetimeArray(dti, copy=True) + arr = DatetimeArray._from_sequence(dti, copy=True) arr[2] = pd.NaT fill_val = dti[1] if method == "pad" else dti[3] @@ -561,7 +563,7 @@ def test_fillna_2d(self): def test_array_interface_tz(self): tz = "US/Central" - data = DatetimeArray(pd.date_range("2017", periods=2, tz=tz)) + data = pd.date_range("2017", periods=2, tz=tz)._data result = np.asarray(data) expected = np.array( @@ -584,7 +586,7 @@ def test_array_interface_tz(self): tm.assert_numpy_array_equal(result, expected) def test_array_interface(self): - data = DatetimeArray(pd.date_range("2017", periods=2)) + data = pd.date_range("2017", periods=2)._data expected = np.array( ["2017-01-01T00:00:00", "2017-01-02T00:00:00"], dtype="datetime64[ns]" ) @@ -602,7 +604,7 @@ def test_array_interface(self): @pytest.mark.parametrize("index", [True, False]) def test_searchsorted_different_tz(self, index): data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9 - arr = DatetimeArray(data, freq="D").tz_localize("Asia/Tokyo") + arr = pd.DatetimeIndex(data, 
freq="D")._data.tz_localize("Asia/Tokyo") if index: arr = pd.Index(arr) @@ -617,7 +619,7 @@ def test_searchsorted_different_tz(self, index): @pytest.mark.parametrize("index", [True, False]) def test_searchsorted_tzawareness_compat(self, index): data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9 - arr = DatetimeArray(data, freq="D") + arr = pd.DatetimeIndex(data, freq="D")._data if index: arr = pd.Index(arr) @@ -651,7 +653,7 @@ def test_searchsorted_tzawareness_compat(self, index): @pytest.mark.parametrize("index", [True, False]) def test_searchsorted_invalid_types(self, other, index): data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9 - arr = DatetimeArray(data, freq="D") + arr = pd.DatetimeIndex(data, freq="D")._data if index: arr = pd.Index(arr) @@ -668,7 +670,7 @@ def test_shift_fill_value(self): dti = pd.date_range("2016-01-01", periods=3) dta = dti._data - expected = DatetimeArray(np.roll(dta._ndarray, 1)) + expected = DatetimeArray._from_sequence(np.roll(dta._ndarray, 1)) fv = dta[-1] for fill_value in [fv, fv.to_pydatetime(), fv.to_datetime64()]: @@ -741,7 +743,7 @@ def test_iter_zoneinfo_fold(self, tz): ) utc_vals *= 1_000_000_000 - dta = DatetimeArray(utc_vals).tz_localize("UTC").tz_convert(tz) + dta = DatetimeArray._from_sequence(utc_vals).tz_localize("UTC").tz_convert(tz) left = dta[2] right = list(dta)[2] diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 3f5ee328bdfcf..a3f15467feb14 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -210,7 +210,7 @@ def test_astype_int(self, dtype): tm.assert_numpy_array_equal(result, expected) def test_setitem_clears_freq(self): - a = TimedeltaArray(pd.timedelta_range("1h", periods=2, freq="h")) + a = pd.timedelta_range("1h", periods=2, freq="h")._data a[0] = Timedelta("1h") assert a.freq is None @@ -225,7 +225,7 @@ def test_setitem_clears_freq(self): def test_setitem_objects(self, obj): # make sure we accept timedelta64 and timedelta in addition to Timedelta tdi = pd.timedelta_range("2 Days", periods=4, freq="h") - arr = TimedeltaArray(tdi, freq=tdi.freq) + arr = tdi._data arr[0] = obj assert arr[0] == Timedelta(seconds=1) @@ -247,7 +247,7 @@ def test_setitem_objects(self, obj): @pytest.mark.parametrize("index", [True, False]) def test_searchsorted_invalid_types(self, other, index): data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9 - arr = TimedeltaArray(data, freq="D") + arr = pd.TimedeltaIndex(data, freq="D")._data if index: arr = pd.Index(arr) @@ -264,10 +264,10 @@ def test_searchsorted_invalid_types(self, other, index): class TestUnaryOps: def test_abs(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray(vals) + arr = TimedeltaArray._from_sequence(vals) evals = np.array([3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - expected = TimedeltaArray(evals) + expected = TimedeltaArray._from_sequence(evals) result = abs(arr) tm.assert_timedelta_array_equal(result, expected) @@ -277,7 +277,7 @@ def test_abs(self): def test_pos(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray(vals) + arr = TimedeltaArray._from_sequence(vals) result = +arr tm.assert_timedelta_array_equal(result, arr) @@ -289,10 +289,10 @@ def test_pos(self): def test_neg(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray(vals) + arr = TimedeltaArray._from_sequence(vals) evals = np.array([3600 * 10**9, "NaT", -7200 * 10**9], 
dtype="m8[ns]") - expected = TimedeltaArray(evals) + expected = TimedeltaArray._from_sequence(evals) result = -arr tm.assert_timedelta_array_equal(result, expected) @@ -302,9 +302,9 @@ def test_neg(self): def test_neg_freq(self): tdi = pd.timedelta_range("2 Days", periods=4, freq="h") - arr = TimedeltaArray(tdi, freq=tdi.freq) + arr = tdi._data - expected = TimedeltaArray(-tdi._data, freq=-tdi.freq) + expected = -tdi._data result = -arr tm.assert_timedelta_array_equal(result, expected) diff --git a/pandas/tests/arrays/timedeltas/test_constructors.py b/pandas/tests/arrays/timedeltas/test_constructors.py index 30894becc35cf..91b6f7fa222f9 100644 --- a/pandas/tests/arrays/timedeltas/test_constructors.py +++ b/pandas/tests/arrays/timedeltas/test_constructors.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas._testing as tm from pandas.core.arrays import TimedeltaArray @@ -9,13 +10,16 @@ def test_only_1dim_accepted(self): # GH#25282 arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]") - with pytest.raises(ValueError, match="Only 1-dimensional"): - # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 - TimedeltaArray(arr.reshape(2, 2, 1)) + depr_msg = "TimedeltaArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 + TimedeltaArray(arr.reshape(2, 2, 1)) - with pytest.raises(ValueError, match="Only 1-dimensional"): - # 0-dim - TimedeltaArray(arr[[0]].squeeze()) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 0-dim + TimedeltaArray(arr[[0]].squeeze()) def test_freq_validation(self): # ensure that the public constructor cannot create an invalid instance @@ -25,54 +29,71 @@ def test_freq_validation(self): "Inferred frequency None from passed values does not " "conform to passed frequency D" ) - with pytest.raises(ValueError, match=msg): - TimedeltaArray(arr.view("timedelta64[ns]"), freq="D") + depr_msg = "TimedeltaArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + TimedeltaArray(arr.view("timedelta64[ns]"), freq="D") def test_non_array_raises(self): - with pytest.raises(ValueError, match="list"): - TimedeltaArray([1, 2, 3]) + depr_msg = "TimedeltaArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="list"): + TimedeltaArray([1, 2, 3]) def test_other_type_raises(self): - msg = "dtype 'bool' is invalid, should be np.timedelta64 dtype" - with pytest.raises(ValueError, match=msg): - TimedeltaArray(np.array([1, 2, 3], dtype="bool")) + msg = r"dtype bool cannot be converted to timedelta64\[ns\]" + with pytest.raises(TypeError, match=msg): + TimedeltaArray._from_sequence(np.array([1, 2, 3], dtype="bool")) def test_incorrect_dtype_raises(self): msg = "dtype 'category' is invalid, should be np.timedelta64 dtype" with pytest.raises(ValueError, match=msg): - TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="category") + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype="category" + ) msg = "dtype 'int64' is invalid, should be np.timedelta64 dtype" with pytest.raises(ValueError, match=msg): - TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64")) + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), 
dtype=np.dtype("int64") + ) msg = r"dtype 'datetime64\[ns\]' is invalid, should be np.timedelta64 dtype" with pytest.raises(ValueError, match=msg): - TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("M8[ns]")) + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("M8[ns]") + ) msg = ( r"dtype 'datetime64\[us, UTC\]' is invalid, should be np.timedelta64 dtype" ) with pytest.raises(ValueError, match=msg): - TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="M8[us, UTC]") + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype="M8[us, UTC]" + ) msg = "Supported timedelta64 resolutions are 's', 'ms', 'us', 'ns'" with pytest.raises(ValueError, match=msg): - TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("m8[Y]")) + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("m8[Y]") + ) def test_mismatched_values_dtype_units(self): arr = np.array([1, 2, 3], dtype="m8[s]") dtype = np.dtype("m8[ns]") msg = r"Values resolution does not match dtype" - with pytest.raises(ValueError, match=msg): - TimedeltaArray(arr, dtype=dtype) + depr_msg = "TimedeltaArray.__init__ is deprecated" + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + TimedeltaArray(arr, dtype=dtype) def test_copy(self): data = np.array([1, 2, 3], dtype="m8[ns]") - arr = TimedeltaArray(data, copy=False) + arr = TimedeltaArray._from_sequence(data, copy=False) assert arr._ndarray is data - arr = TimedeltaArray(data, copy=True) + arr = TimedeltaArray._from_sequence(data, copy=True) assert arr._ndarray is not data assert arr._ndarray.base is not data diff --git a/pandas/tests/arrays/timedeltas/test_reductions.py b/pandas/tests/arrays/timedeltas/test_reductions.py index f1d2cc6a90519..991dbf41c8087 100644 --- a/pandas/tests/arrays/timedeltas/test_reductions.py +++ b/pandas/tests/arrays/timedeltas/test_reductions.py @@ -105,7 +105,7 @@ def test_sum_2d_skipna_false(self): arr = np.arange(8).astype(np.int64).view("m8[s]").astype("m8[ns]").reshape(4, 2) arr[-1, -1] = "Nat" - tda = TimedeltaArray(arr) + tda = TimedeltaArray._from_sequence(arr) result = tda.sum(skipna=False) assert result is pd.NaT diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 3955e0e88e776..fe0f1f1454a55 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -254,10 +254,13 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): (pd.array([0, np.nan], dtype="Int64"), "_data"), (IntervalArray.from_breaks([0, 1]), "_left"), (SparseArray([0, 1]), "_sparse_values"), - (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_ndarray"), + ( + DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")), + "_ndarray", + ), # tz-aware Datetime ( - DatetimeArray( + DatetimeArray._from_sequence( np.array( ["2000-01-01T12:00:00", "2000-01-02T12:00:00"], dtype="M8[ns]" ), @@ -303,17 +306,16 @@ def test_array_multiindex_raises(): (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), # tz-naive datetime ( - DatetimeArray(np.array(["2000", "2001"], dtype="M8[ns]")), + DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")), np.array(["2000", "2001"], dtype="M8[ns]"), ), # tz-aware stays tz`-aware ( - DatetimeArray( - np.array( - ["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]" - ), - dtype=DatetimeTZDtype(tz="US/Central"), - ), + DatetimeArray._from_sequence( + np.array(["2000-01-01T06:00:00", 
"2000-01-02T06:00:00"], dtype="M8[ns]") + ) + .tz_localize("UTC") + .tz_convert("US/Central"), np.array( [ Timestamp("2000-01-01", tz="US/Central"), @@ -323,8 +325,8 @@ def test_array_multiindex_raises(): ), # Timedelta ( - TimedeltaArray( - np.array([0, 3600000000000], dtype="i8").view("m8[ns]"), freq="h" + TimedeltaArray._from_sequence( + np.array([0, 3600000000000], dtype="i8").view("m8[ns]") ), np.array([0, 3600000000000], dtype="m8[ns]"), ), diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index 3da3237370e60..02c827853b29d 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -19,8 +19,9 @@ class TestABCClasses: categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical) df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index) sparse_array = pd.arrays.SparseArray(np.random.default_rng(2).standard_normal(10)) - datetime_array = pd.core.arrays.DatetimeArray(datetime_index) - timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index) + + datetime_array = pd.core.arrays.DatetimeArray._from_sequence(datetime_index) + timedelta_array = pd.core.arrays.TimedeltaArray._from_sequence(timedelta_index) abc_pairs = [ ("ABCMultiIndex", multi_index), diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index efb88dc7bd4e1..7f70957007dad 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -31,13 +31,15 @@ def dtype(request): @pytest.fixture def data(dtype): - data = DatetimeArray(pd.date_range("2000", periods=100, tz=dtype.tz), dtype=dtype) + data = DatetimeArray._from_sequence( + pd.date_range("2000", periods=100, tz=dtype.tz), dtype=dtype + ) return data @pytest.fixture def data_missing(dtype): - return DatetimeArray( + return DatetimeArray._from_sequence( np.array(["NaT", "2000-01-01"], dtype="datetime64[ns]"), dtype=dtype ) @@ -47,14 +49,18 @@ def data_for_sorting(dtype): a = pd.Timestamp("2000-01-01") b = pd.Timestamp("2000-01-02") c = pd.Timestamp("2000-01-03") - return DatetimeArray(np.array([b, c, a], dtype="datetime64[ns]"), dtype=dtype) + return DatetimeArray._from_sequence( + np.array([b, c, a], dtype="datetime64[ns]"), dtype=dtype + ) @pytest.fixture def data_missing_for_sorting(dtype): a = pd.Timestamp("2000-01-01") b = pd.Timestamp("2000-01-02") - return DatetimeArray(np.array([b, "NaT", a], dtype="datetime64[ns]"), dtype=dtype) + return DatetimeArray._from_sequence( + np.array([b, "NaT", a], dtype="datetime64[ns]"), dtype=dtype + ) @pytest.fixture @@ -68,7 +74,7 @@ def data_for_grouping(dtype): b = pd.Timestamp("2000-01-02") c = pd.Timestamp("2000-01-03") na = "NaT" - return DatetimeArray( + return DatetimeArray._from_sequence( np.array([b, b, na, na, a, a, b, c], dtype="datetime64[ns]"), dtype=dtype ) diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 15597ecad6aea..2abbcf6688833 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -31,10 +31,7 @@ to_datetime, ) import pandas._testing as tm -from pandas.core.arrays import ( - DatetimeArray, - period_array, -) +from pandas.core.arrays import period_array class TestDatetimeIndex: @@ -1112,9 +1109,6 @@ def test_explicit_none_freq(self): result = DatetimeIndex(rng._data, freq=None) assert result.freq is None - dta = DatetimeArray(rng, freq=None) - assert dta.freq is None - def 
test_dti_constructor_small_int(self, any_int_numpy_dtype): # see gh-13721 exp = DatetimeIndex( diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py index abf7e093fd6cd..0510700bb64d7 100644 --- a/pandas/tests/indexes/timedeltas/test_constructors.py +++ b/pandas/tests/indexes/timedeltas/test_constructors.py @@ -70,6 +70,7 @@ def test_infer_from_tdi_mismatch(self): # has one and it does not match the `freq` input tdi = timedelta_range("1 second", periods=100, freq="1s") + depr_msg = "TimedeltaArray.__init__ is deprecated" msg = ( "Inferred frequency .* from passed values does " "not conform to passed frequency" @@ -79,13 +80,15 @@ def test_infer_from_tdi_mismatch(self): with pytest.raises(ValueError, match=msg): # GH#23789 - TimedeltaArray(tdi, freq="D") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + TimedeltaArray(tdi, freq="D") with pytest.raises(ValueError, match=msg): TimedeltaIndex(tdi._data, freq="D") with pytest.raises(ValueError, match=msg): - TimedeltaArray(tdi._data, freq="D") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + TimedeltaArray(tdi._data, freq="D") def test_dt64_data_invalid(self): # GH#23539 @@ -270,7 +273,9 @@ def test_explicit_none_freq(self): result = TimedeltaIndex(tdi._data, freq=None) assert result.freq is None - tda = TimedeltaArray(tdi, freq=None) + msg = "TimedeltaArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + tda = TimedeltaArray(tdi, freq=None) assert tda.freq is None def test_from_categorical(self): diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index a94ed0e044598..8fbb78737474c 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -13,15 +13,10 @@ date_range, ) import pandas._testing as tm -from pandas.core.arrays import ( - DatetimeArray, - PeriodArray, - TimedeltaArray, -) class TestDatetimeLikeStatReductions: - @pytest.mark.parametrize("box", [Series, pd.Index, DatetimeArray]) + @pytest.mark.parametrize("box", [Series, pd.Index, pd.array]) def test_dt64_mean(self, tz_naive_fixture, box): tz = tz_naive_fixture @@ -41,7 +36,7 @@ def test_dt64_mean(self, tz_naive_fixture, box): assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz) assert obj.mean(skipna=False) is pd.NaT - @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray]) + @pytest.mark.parametrize("box", [Series, pd.Index, pd.array]) @pytest.mark.parametrize("freq", ["s", "h", "D", "W", "B"]) def test_period_mean(self, box, freq): # GH#24757 @@ -67,7 +62,7 @@ def test_period_mean(self, box, freq): with pytest.raises(TypeError, match="ambiguous"): obj.mean(skipna=True) - @pytest.mark.parametrize("box", [Series, pd.Index, TimedeltaArray]) + @pytest.mark.parametrize("box", [Series, pd.Index, pd.array]) def test_td64_mean(self, box): m8values = np.array([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], "m8[D]") tdi = pd.TimedeltaIndex(m8values).as_unit("ns") diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 898a027255190..51ce73ef54300 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -304,7 +304,9 @@ def test_from_obscure_array(dtype, array_likes): cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype] - expected = cls(arr) + depr_msg = f"{cls.__name__}.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, 
match=depr_msg):
+        expected = cls(arr)

     result = cls._from_sequence(data, dtype=dtype)
     tm.assert_extension_array_equal(result, expected)

diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index 632d9783c7f81..a50054f33f382 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -14,7 +14,6 @@
 )
 import pandas._testing as tm
 from pandas.core import nanops
-from pandas.core.arrays import DatetimeArray

 use_bn = nanops._USE_BOTTLENECK

@@ -1113,17 +1112,13 @@ def test_nanmean(self, unit):
         dti = pd.date_range("2016-01-01", periods=3).as_unit(unit)
         expected = dti[1]

-        for obj in [dti, DatetimeArray(dti), Series(dti)]:
-            if isinstance(obj, Series):
-                obj = obj._values
+        for obj in [dti, dti._data]:
             result = nanops.nanmean(obj)
             assert result == expected

         dti2 = dti.insert(1, pd.NaT)

-        for obj in [dti2, DatetimeArray(dti2), Series(dti2)]:
-            if isinstance(obj, Series):
-                obj = obj._values
+        for obj in [dti2, dti2._data]:
             result = nanops.nanmean(obj)
             assert result == expected

diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index a23d2d3dc22af..417a56dc074a6 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -991,7 +991,7 @@ def test_error_iso_week_year(self, msg, s, _format, errors):
     def test_to_datetime_dtarr(self, tz):
         # DatetimeArray
         dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz)
-        arr = DatetimeArray(dti)
+        arr = dti._data

         result = to_datetime(arr)
         assert result is arr
@@ -2822,7 +2822,7 @@ def test_dayfirst_warnings_invalid_input(self):
         ):
             to_datetime(arr, dayfirst=True)

-    @pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray])
+    @pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray._from_sequence])
     def test_to_datetime_dta_tz(self, klass):
         # GH#27733
         dti = date_range("2015-04-05", periods=3).rename("foo")

From 8f32ea5a7c4b73ae3a7569f662a6d4804519baa5 Mon Sep 17 00:00:00 2001
From: William Ayd 
Date: Thu, 21 Dec 2023 13:36:12 -0500
Subject: [PATCH 02/31] enable ASAN/UBSAN in pandas CI (#55102)

* enable ASAN/UBSAN in pandas CI
* try input
* try removing sanitize
* try no CFLAGS
* try GH string substitution
* change flags in build script
* quotes
* update script run
* single_cpu updates
* asan checks for datetime funcs
* try smaller config
* checkpoint
* bool fixup
* reverts
* known UB marker
* Finished marking tests with known UB
* dedicated CI job
* identifier fix
* fixes
* more test skip
* try quotes
* simplify ci
* try CFLAGS
* preload args
* skip single_cpu tests
* wording
* removed unneeded marker
* float set implementations
* Revert "float set implementations"

This reverts commit 6266422371022ba227ae01634c40090e26635784.

* change marker name
* dedicated actions file
* consolidated into matrix
* fixup
* typos
* fixups
* add qt?
* intentional UB with verbose
* disable pytest-xdist
* original issue
* remove UB
* Revert "remove UB"

This reverts commit 677da0e4bce131d6992c320920885eded2a5d796.
* merge fixup * remove UB --------- Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/actions/build_pandas/action.yml | 11 +++++-- .github/actions/run-tests/action.yml | 9 +++++- .github/workflows/unit-tests.yml | 19 ++++++++++- ci/deps/actions-311-sanitizers.yaml | 32 +++++++++++++++++++ pandas/tests/frame/test_constructors.py | 2 ++ pandas/tests/groupby/test_cumulative.py | 1 + pandas/tests/io/parser/common/test_float.py | 10 +++++- .../scalar/timedelta/methods/test_round.py | 2 ++ .../tests/scalar/timedelta/test_arithmetic.py | 1 + .../tests/scalar/timedelta/test_timedelta.py | 1 + .../timestamp/methods/test_tz_localize.py | 1 + .../scalar/timestamp/test_constructors.py | 1 + pandas/tests/tools/test_to_datetime.py | 1 + pyproject.toml | 1 + scripts/tests/data/deps_minimum.toml | 1 + 15 files changed, 88 insertions(+), 5 deletions(-) create mode 100644 ci/deps/actions-311-sanitizers.yaml diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index 460ae2f8594c0..63f687324b0ae 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -4,6 +4,12 @@ inputs: editable: description: Whether to build pandas in editable mode (default true) default: true + meson_args: + description: Extra flags to pass to meson + required: false + cflags_adds: + description: Items to append to the CFLAGS variable + required: false runs: using: composite steps: @@ -24,11 +30,12 @@ runs: - name: Build Pandas run: | + export CFLAGS="$CFLAGS ${{ inputs.cflags_adds }}" if [[ ${{ inputs.editable }} == "true" ]]; then - pip install -e . --no-build-isolation -v --no-deps \ + pip install -e . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \ --config-settings=setup-args="--werror" else - pip install . --no-build-isolation -v --no-deps \ + pip install . 
--no-build-isolation -v --no-deps ${{ inputs.meson_args }} \ --config-settings=setup-args="--werror" fi shell: bash -el {0} diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index fd7c3587f2254..b4778b74df335 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -1,9 +1,16 @@ name: Run tests and report results +inputs: + preload: + description: Preload arguments for sanitizer + required: false + asan_options: + description: Arguments for Address Sanitizer (ASAN) + required: false runs: using: composite steps: - name: Test - run: ci/run_tests.sh + run: ${{ inputs.asan_options }} ${{ inputs.preload }} ci/run_tests.sh shell: bash -el {0} - name: Publish test results diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 88d705dbd9251..57f9893d36044 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -96,6 +96,14 @@ jobs: - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" + - name: "ASAN / UBSAN" + env_file: actions-311-sanitizers.yaml + pattern: "not slow and not network and not single_cpu and not skip_ubsan" + asan_options: "ASAN_OPTIONS=detect_leaks=0" + preload: LD_PRELOAD=$(gcc -print-file-name=libasan.so) + meson_args: --config-settings=setup-args="-Db_sanitize=address,undefined" + cflags_adds: -fno-sanitize-recover=all + pytest_workers: -1 # disable pytest-xdist as it swallows stderr from ASAN fail-fast: false name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} env: @@ -105,7 +113,7 @@ jobs: PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} PANDAS_CI: ${{ matrix.pandas_ci || '1' }} TEST_ARGS: ${{ matrix.test_args || '' }} - PYTEST_WORKERS: 'auto' + PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }} PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} # Clipboard tests QT_QPA_PLATFORM: offscreen @@ -174,16 +182,25 @@ jobs: - name: Build Pandas id: build uses: ./.github/actions/build_pandas + with: + meson_args: ${{ matrix.meson_args }} + cflags_adds: ${{ matrix.cflags_adds }} - name: Test (not single_cpu) uses: ./.github/actions/run-tests if: ${{ matrix.name != 'Pypy' }} + with: + preload: ${{ matrix.preload }} + asan_options: ${{ matrix.asan_options }} env: # Set pattern to not single_cpu if not already set PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }} - name: Test (single_cpu) uses: ./.github/actions/run-tests + with: + preload: ${{ matrix.preload }} + asan_options: ${{ matrix.asan_options }} env: PATTERN: 'single_cpu' PYTEST_WORKERS: 0 diff --git a/ci/deps/actions-311-sanitizers.yaml b/ci/deps/actions-311-sanitizers.yaml new file mode 100644 index 0000000000000..dcd381066b0ea --- /dev/null +++ b/ci/deps/actions-311-sanitizers.yaml @@ -0,0 +1,32 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.11 + + # build dependencies + - versioneer[toml] + - cython>=0.29.33 + - meson[ninja]=1.2.1 + - meson-python=0.13.1 + + # test dependencies + - pytest>=7.3.2 + - pytest-cov + - pytest-xdist>=2.2.0 + - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 + - boto3 + - hypothesis>=6.46.1 + - pyqt>=5.15.9 + + # required dependencies + - python-dateutil + - numpy<2 + - pytz + + # pandas dependencies + - pip + + - pip: + - "tzdata>=2022.7" diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e1abd0344e356..f64d6a886fe9f 100644 --- 
a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -3206,6 +3206,7 @@ def test_from_out_of_bounds_ns_datetime( assert item.asm8.dtype == exp_dtype assert dtype == exp_dtype + @pytest.mark.skip_ubsan def test_out_of_s_bounds_datetime64(self, constructor): scalar = np.datetime64(np.iinfo(np.int64).max, "D") result = constructor(scalar) @@ -3241,6 +3242,7 @@ def test_from_out_of_bounds_ns_timedelta( assert item.asm8.dtype == exp_dtype assert dtype == exp_dtype + @pytest.mark.skip_ubsan @pytest.mark.parametrize("cls", [np.datetime64, np.timedelta64]) def test_out_of_s_bounds_timedelta64(self, constructor, cls): scalar = cls(np.iinfo(np.int64).max, "D") diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py index bf572609f3d37..1bdbef6d50c4c 100644 --- a/pandas/tests/groupby/test_cumulative.py +++ b/pandas/tests/groupby/test_cumulative.py @@ -60,6 +60,7 @@ def test_groupby_cumprod(): tm.assert_series_equal(actual, expected) +@pytest.mark.skip_ubsan def test_groupby_cumprod_overflow(): # GH#37493 if we overflow we return garbage consistent with numpy df = DataFrame({"key": ["b"] * 4, "value": 100_000}) diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 4b23774ee2d5b..6069c23936297 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -40,7 +40,14 @@ def test_scientific_no_exponent(all_parsers_all_precisions): tm.assert_frame_equal(df_roundtrip, df) -@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) +@pytest.mark.parametrize( + "neg_exp", + [ + -617, + -100000, + pytest.param(-99999999999999999, marks=pytest.mark.skip_ubsan), + ], +) def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): # GH#38753 parser, precision = all_parsers_all_precisions @@ -51,6 +58,7 @@ def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): tm.assert_frame_equal(result, expected) +@pytest.mark.skip_ubsan @xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): diff --git a/pandas/tests/scalar/timedelta/methods/test_round.py b/pandas/tests/scalar/timedelta/methods/test_round.py index 4beb39510413c..e54adb27d126b 100644 --- a/pandas/tests/scalar/timedelta/methods/test_round.py +++ b/pandas/tests/scalar/timedelta/methods/test_round.py @@ -61,6 +61,7 @@ def test_round_invalid(self): with pytest.raises(ValueError, match=msg): t1.round(freq) + @pytest.mark.skip_ubsan def test_round_implementation_bounds(self): # See also: analogous test for Timestamp # GH#38964 @@ -86,6 +87,7 @@ def test_round_implementation_bounds(self): with pytest.raises(OutOfBoundsTimedelta, match=msg): Timedelta.max.round("s") + @pytest.mark.skip_ubsan @given(val=st.integers(min_value=iNaT + 1, max_value=lib.i8max)) @pytest.mark.parametrize( "method", [Timedelta.round, Timedelta.floor, Timedelta.ceil] diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index cc1e91893e308..d2fa0f722ca6f 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -966,6 +966,7 @@ def test_td_op_timedelta_timedeltalike_array(self, op, arr): class TestTimedeltaComparison: + @pytest.mark.skip_ubsan def test_compare_pytimedelta_bounds(self): 
# GH#49021 don't overflow on comparison with very large pytimedeltas diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index ac605df935226..d4398f66e6f89 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -551,6 +551,7 @@ def test_timedelta_hash_equality(self): ns_td = Timedelta(1, "ns") assert hash(ns_td) != hash(ns_td.to_pytimedelta()) + @pytest.mark.skip_ubsan @pytest.mark.xfail( reason="pd.Timedelta violates the Python hash invariant (GH#44504).", ) diff --git a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py index 9df0a023730de..af3dee1880d2e 100644 --- a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py +++ b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py @@ -25,6 +25,7 @@ class TestTimestampTZLocalize: + @pytest.mark.skip_ubsan def test_tz_localize_pushes_out_of_bounds(self): # GH#12677 # tz_localize that pushes away from the boundary is OK diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 98e4d581dc104..3975f3c46aaa1 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -822,6 +822,7 @@ def test_barely_out_of_bounds(self): with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp("2262-04-11 23:47:16.854775808") + @pytest.mark.skip_ubsan def test_bounds_with_different_units(self): out_of_bounds_dates = ("1677-09-21", "2262-04-12") diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 417a56dc074a6..6791ac0340640 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1140,6 +1140,7 @@ def test_to_datetime_dt64s_out_of_ns_bounds(self, cache, dt, errors): assert ts.unit == "s" assert ts.asm8 == dt + @pytest.mark.skip_ubsan def test_to_datetime_dt64d_out_of_bounds(self, cache): dt64 = np.datetime64(np.iinfo(np.int64).max, "D") diff --git a/pyproject.toml b/pyproject.toml index 6e3424f9a7075..ca19f463edf40 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -523,6 +523,7 @@ markers = [ "db: tests requiring a database (mysql or postgres)", "clipboard: mark a pd.read_clipboard test", "arm_slow: mark a test as slow for arm64 architecture", + "skip_ubsan: Tests known to fail UBSAN check", ] [tool.mypy] diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index f8535635e12ab..3be6be17d1ee2 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -382,6 +382,7 @@ markers = [ "db: tests requiring a database (mysql or postgres)", "clipboard: mark a pd.read_clipboard test", "arm_slow: mark a test as slow for arm64 architecture", + "skip_ubsan: tests known to invoke undefined behavior", ] [tool.mypy] From 62f4098f1e5b2934d17b1df442f193f73989484e Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 21 Dec 2023 14:49:44 -0500 Subject: [PATCH 03/31] DOC: Fix doc redirects (#56577) * DOC: Update redirects * fixups * fixups --- doc/redirects.csv | 142 +++++++++++++++++++++++++++++++++------------- 1 file changed, 103 insertions(+), 39 deletions(-) diff --git a/doc/redirects.csv b/doc/redirects.csv index bd60cc6a732bd..27b41da63c513 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -100,7 +100,6 @@ 
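The ``skip_ubsan`` marker registered in ``pyproject.toml`` above is deselected in the sanitizer job through its ``-m`` pattern (``... and not skip_ubsan``). A minimal sketch of how a test opts out; the test name and body here are illustrative, not part of the patch:

    import numpy as np
    import pytest

    import pandas as pd

    @pytest.mark.skip_ubsan
    def test_days_overflow_raises():
        # Converting this scalar overflows int64 arithmetic in C, which is
        # undefined behavior, so a UBSAN-instrumented build would abort here
        # even though pandas raises cleanly in a normal build.
        scalar = np.datetime64(np.iinfo(np.int64).max, "D")
        with pytest.raises(pd.errors.OutOfBoundsDatetime):
            pd.Timestamp(scalar)
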
generated/pandas.api.extensions.register_series_accessor,../reference/api/pandas generated/pandas.api.types.infer_dtype,../reference/api/pandas.api.types.infer_dtype generated/pandas.api.types.is_bool_dtype,../reference/api/pandas.api.types.is_bool_dtype generated/pandas.api.types.is_bool,../reference/api/pandas.api.types.is_bool -generated/pandas.api.types.is_categorical_dtype,../reference/api/pandas.api.types.is_categorical_dtype generated/pandas.api.types.is_complex_dtype,../reference/api/pandas.api.types.is_complex_dtype generated/pandas.api.types.is_complex,../reference/api/pandas.api.types.is_complex generated/pandas.api.types.is_datetime64_any_dtype,../reference/api/pandas.api.types.is_datetime64_any_dtype @@ -193,40 +192,39 @@ generated/pandas.core.groupby.DataFrameGroupBy.shift,../reference/api/pandas.cor generated/pandas.core.groupby.DataFrameGroupBy.size,../reference/api/pandas.core.groupby.DataFrameGroupBy.size generated/pandas.core.groupby.DataFrameGroupBy.skew,../reference/api/pandas.core.groupby.DataFrameGroupBy.skew generated/pandas.core.groupby.DataFrameGroupBy.take,../reference/api/pandas.core.groupby.DataFrameGroupBy.take -generated/pandas.core.groupby.GroupBy.agg,../reference/api/pandas.core.groupby.GroupBy.agg -generated/pandas.core.groupby.GroupBy.aggregate,../reference/api/pandas.core.groupby.GroupBy.aggregate -generated/pandas.core.groupby.GroupBy.all,../reference/api/pandas.core.groupby.GroupBy.all -generated/pandas.core.groupby.GroupBy.any,../reference/api/pandas.core.groupby.GroupBy.any -generated/pandas.core.groupby.GroupBy.apply,../reference/api/pandas.core.groupby.GroupBy.apply -generated/pandas.core.groupby.GroupBy.bfill,../reference/api/pandas.core.groupby.GroupBy.bfill -generated/pandas.core.groupby.GroupBy.count,../reference/api/pandas.core.groupby.GroupBy.count -generated/pandas.core.groupby.GroupBy.cumcount,../reference/api/pandas.core.groupby.GroupBy.cumcount -generated/pandas.core.groupby.GroupBy.ffill,../reference/api/pandas.core.groupby.GroupBy.ffill -generated/pandas.core.groupby.GroupBy.first,../reference/api/pandas.core.groupby.GroupBy.first -generated/pandas.core.groupby.GroupBy.get_group,../reference/api/pandas.core.groupby.GroupBy.get_group -generated/pandas.core.groupby.GroupBy.groups,../reference/api/pandas.core.groupby.GroupBy.groups -generated/pandas.core.groupby.GroupBy.head,../reference/api/pandas.core.groupby.GroupBy.head -generated/pandas.core.groupby.GroupBy.indices,../reference/api/pandas.core.groupby.GroupBy.indices -generated/pandas.core.groupby.GroupBy.__iter__,../reference/api/pandas.core.groupby.GroupBy.__iter__ -generated/pandas.core.groupby.GroupBy.last,../reference/api/pandas.core.groupby.GroupBy.last -generated/pandas.core.groupby.GroupBy.max,../reference/api/pandas.core.groupby.GroupBy.max -generated/pandas.core.groupby.GroupBy.mean,../reference/api/pandas.core.groupby.GroupBy.mean -generated/pandas.core.groupby.GroupBy.median,../reference/api/pandas.core.groupby.GroupBy.median -generated/pandas.core.groupby.GroupBy.min,../reference/api/pandas.core.groupby.GroupBy.min -generated/pandas.core.groupby.GroupBy.ngroup,../reference/api/pandas.core.groupby.GroupBy.ngroup -generated/pandas.core.groupby.GroupBy.nth,../reference/api/pandas.core.groupby.GroupBy.nth -generated/pandas.core.groupby.GroupBy.ohlc,../reference/api/pandas.core.groupby.GroupBy.ohlc -generated/pandas.core.groupby.GroupBy.pct_change,../reference/api/pandas.core.groupby.GroupBy.pct_change 
-generated/pandas.core.groupby.GroupBy.pipe,../reference/api/pandas.core.groupby.GroupBy.pipe -generated/pandas.core.groupby.GroupBy.prod,../reference/api/pandas.core.groupby.GroupBy.prod -generated/pandas.core.groupby.GroupBy.rank,../reference/api/pandas.core.groupby.GroupBy.rank -generated/pandas.core.groupby.GroupBy.sem,../reference/api/pandas.core.groupby.GroupBy.sem -generated/pandas.core.groupby.GroupBy.size,../reference/api/pandas.core.groupby.GroupBy.size -generated/pandas.core.groupby.GroupBy.std,../reference/api/pandas.core.groupby.GroupBy.std -generated/pandas.core.groupby.GroupBy.sum,../reference/api/pandas.core.groupby.GroupBy.sum -generated/pandas.core.groupby.GroupBy.tail,../reference/api/pandas.core.groupby.GroupBy.tail -generated/pandas.core.groupby.GroupBy.transform,../reference/api/pandas.core.groupby.GroupBy.transform -generated/pandas.core.groupby.GroupBy.var,../reference/api/pandas.core.groupby.GroupBy.var +generated/pandas.core.groupby.GroupBy.agg,../reference/api/pandas.core.groupby.DataFrameGroupBy.agg +generated/pandas.core.groupby.GroupBy.aggregate,../reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate +generated/pandas.core.groupby.GroupBy.all,../reference/api/pandas.core.groupby.DataFrameGroupBy.all +generated/pandas.core.groupby.GroupBy.any,../reference/api/pandas.core.groupby.DataFrameGroupBy.any +generated/pandas.core.groupby.GroupBy.apply,../reference/api/pandas.core.groupby.DataFrameGroupBy.apply +generated/pandas.core.groupby.GroupBy.bfill,../reference/api/pandas.core.groupby.DataFrameGroupBy.bfill +generated/pandas.core.groupby.GroupBy.count,../reference/api/pandas.core.groupby.DataFrameGroupBy.count +generated/pandas.core.groupby.GroupBy.cumcount,../reference/api/pandas.core.groupby.DataFrameGroupBy.cumcount +generated/pandas.core.groupby.GroupBy.ffill,../reference/api/pandas.core.groupby.DataFrameGroupBy.ffill +generated/pandas.core.groupby.GroupBy.first,../reference/api/pandas.core.groupby.DataFrameGroupBy.first +generated/pandas.core.groupby.GroupBy.get_group,../reference/api/pandas.core.groupby.DataFrameGroupBy.get_group +generated/pandas.core.groupby.GroupBy.groups,../reference/api/pandas.core.groupby.DataFrameGroupBy.groups +generated/pandas.core.groupby.GroupBy.head,../reference/api/pandas.core.groupby.DataFrameGroupBy.head +generated/pandas.core.groupby.GroupBy.indices,../reference/api/pandas.core.groupby.DataFrameGroupBy.indices +generated/pandas.core.groupby.GroupBy.__iter__,../reference/api/pandas.core.groupby.DataFrameGroupBy.__iter__ +generated/pandas.core.groupby.GroupBy.last,../reference/api/pandas.core.groupby.DataFrameGroupBy.last +generated/pandas.core.groupby.GroupBy.max,../reference/api/pandas.core.groupby.DataFrameGroupBy.max +generated/pandas.core.groupby.GroupBy.mean,../reference/api/pandas.core.groupby.DataFrameGroupBy.mean +generated/pandas.core.groupby.GroupBy.median,../reference/api/pandas.core.groupby.DataFrameGroupBy.median +generated/pandas.core.groupby.GroupBy.min,../reference/api/pandas.core.groupby.DataFrameGroupBy.min +generated/pandas.core.groupby.GroupBy.ngroup,../reference/api/pandas.core.groupby.DataFrameGroupBy.ngroup +generated/pandas.core.groupby.GroupBy.nth,../reference/api/pandas.core.groupby.DataFrameGroupBy.nth +generated/pandas.core.groupby.GroupBy.ohlc,../reference/api/pandas.core.groupby.DataFrameGroupBy.ohlc +generated/pandas.core.groupby.GroupBy.pct_change,../reference/api/pandas.core.groupby.DataFrameGroupBy.pct_change 
+generated/pandas.core.groupby.GroupBy.pipe,../reference/api/pandas.core.groupby.DataFrameGroupBy.pipe +generated/pandas.core.groupby.GroupBy.prod,../reference/api/pandas.core.groupby.DataFrameGroupBy.prod +generated/pandas.core.groupby.GroupBy.rank,../reference/api/pandas.core.groupby.DataFrameGroupBy.rank +generated/pandas.core.groupby.GroupBy.size,../reference/api/pandas.core.groupby.DataFrameGroupBy.size +generated/pandas.core.groupby.GroupBy.std,../reference/api/pandas.core.groupby.DataFrameGroupBy.std +generated/pandas.core.groupby.GroupBy.sum,../reference/api/pandas.core.groupby.DataFrameGroupBy.sum +generated/pandas.core.groupby.GroupBy.tail,../reference/api/pandas.core.groupby.DataFrameGroupBy.tail +generated/pandas.core.groupby.GroupBy.transform,../reference/api/pandas.core.groupby.DataFrameGroupBy.transform +generated/pandas.core.groupby.GroupBy.var,../reference/api/pandas.core.groupby.DataFrameGroupBy.var generated/pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing,../reference/api/pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing generated/pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing,../reference/api/pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing generated/pandas.core.groupby.SeriesGroupBy.nlargest,../reference/api/pandas.core.groupby.SeriesGroupBy.nlargest @@ -237,7 +235,7 @@ generated/pandas.core.groupby.SeriesGroupBy.value_counts,../reference/api/pandas generated/pandas.core.resample.Resampler.aggregate,../reference/api/pandas.core.resample.Resampler.aggregate generated/pandas.core.resample.Resampler.apply,../reference/api/pandas.core.resample.Resampler.apply generated/pandas.core.resample.Resampler.asfreq,../reference/api/pandas.core.resample.Resampler.asfreq -generated/pandas.core.resample.Resampler.backfill,../reference/api/pandas.core.resample.Resampler.backfill +generated/pandas.core.resample.Resampler.backfill,../reference/api/pandas.core.resample.Resampler.bfill generated/pandas.core.resample.Resampler.bfill,../reference/api/pandas.core.resample.Resampler.bfill generated/pandas.core.resample.Resampler.count,../reference/api/pandas.core.resample.Resampler.count generated/pandas.core.resample.Resampler.ffill,../reference/api/pandas.core.resample.Resampler.ffill @@ -256,7 +254,6 @@ generated/pandas.core.resample.Resampler.min,../reference/api/pandas.core.resamp generated/pandas.core.resample.Resampler.nearest,../reference/api/pandas.core.resample.Resampler.nearest generated/pandas.core.resample.Resampler.nunique,../reference/api/pandas.core.resample.Resampler.nunique generated/pandas.core.resample.Resampler.ohlc,../reference/api/pandas.core.resample.Resampler.ohlc -generated/pandas.core.resample.Resampler.pad,../reference/api/pandas.core.resample.Resampler.pad generated/pandas.core.resample.Resampler.pipe,../reference/api/pandas.core.resample.Resampler.pipe generated/pandas.core.resample.Resampler.prod,../reference/api/pandas.core.resample.Resampler.prod generated/pandas.core.resample.Resampler.quantile,../reference/api/pandas.core.resample.Resampler.quantile @@ -708,7 +705,7 @@ generated/pandas.Index.summary,../reference/api/pandas.Index.summary generated/pandas.Index.symmetric_difference,../reference/api/pandas.Index.symmetric_difference generated/pandas.Index.take,../reference/api/pandas.Index.take generated/pandas.Index.T,../reference/api/pandas.Index.T -generated/pandas.Index.to_flat_index,../reference/api/pandas.Index.to_flat_index +generated/pandas.Index.to_flat_index,../reference/api/pandas.MultiIndex.to_flat_index 
generated/pandas.Index.to_frame,../reference/api/pandas.Index.to_frame generated/pandas.Index.to_list,../reference/api/pandas.Index.to_list generated/pandas.Index.tolist,../reference/api/pandas.Index.tolist @@ -753,7 +750,8 @@ generated/pandas.Interval.overlaps,../reference/api/pandas.Interval.overlaps generated/pandas.interval_range,../reference/api/pandas.interval_range generated/pandas.Interval.right,../reference/api/pandas.Interval.right generated/pandas.io.formats.style.Styler.apply,../reference/api/pandas.io.formats.style.Styler.apply -generated/pandas.io.formats.style.Styler.applymap,../reference/api/pandas.io.formats.style.Styler.applymap +generated/pandas.io.formats.style.Styler.applymap,../reference/api/pandas.io.formats.style.Styler.map +generated/pandas.io.formats.style.Styler.applymap_index,../reference/api/pandas.io.formats.style.Styler.map_index generated/pandas.io.formats.style.Styler.background_gradient,../reference/api/pandas.io.formats.style.Styler.background_gradient generated/pandas.io.formats.style.Styler.bar,../reference/api/pandas.io.formats.style.Styler.bar generated/pandas.io.formats.style.Styler.clear,../reference/api/pandas.io.formats.style.Styler.clear @@ -1384,3 +1382,69 @@ generated/pandas.wide_to_long,../reference/api/pandas.wide_to_long # Cached searches reference/api/pandas.DataFrame.from_csv,pandas.read_csv + +# GroupBy -> DataFrameGroupBy +reference/api/pandas.core.groupby.GroupBy.__iter__,pandas.core.groupby.DataFrameGroupBy.__iter__ +reference/api/pandas.core.groupby.GroupBy.agg,pandas.core.groupby.DataFrameGroupBy.agg +reference/api/pandas.core.groupby.GroupBy.aggregate,pandas.core.groupby.DataFrameGroupBy.aggregate +reference/api/pandas.core.groupby.GroupBy.all,pandas.core.groupby.DataFrameGroupBy.all +reference/api/pandas.core.groupby.GroupBy.any,pandas.core.groupby.DataFrameGroupBy.any +reference/api/pandas.core.groupby.GroupBy.apply,pandas.core.groupby.DataFrameGroupBy.apply +reference/api/pandas.core.groupby.GroupBy.bfill,pandas.core.groupby.DataFrameGroupBy.bfill +reference/api/pandas.core.groupby.GroupBy.count,pandas.core.groupby.DataFrameGroupBy.count +reference/api/pandas.core.groupby.GroupBy.cumcount,pandas.core.groupby.DataFrameGroupBy.cumcount +reference/api/pandas.core.groupby.GroupBy.cummax,pandas.core.groupby.DataFrameGroupBy.cummax +reference/api/pandas.core.groupby.GroupBy.cummin,pandas.core.groupby.DataFrameGroupBy.cummin +reference/api/pandas.core.groupby.GroupBy.cumprod,pandas.core.groupby.DataFrameGroupBy.cumprod +reference/api/pandas.core.groupby.GroupBy.cumsum,pandas.core.groupby.DataFrameGroupBy.cumsum +reference/api/pandas.core.groupby.GroupBy.ffill,pandas.core.groupby.DataFrameGroupBy.ffill +reference/api/pandas.core.groupby.GroupBy.first,pandas.core.groupby.DataFrameGroupBy.first +reference/api/pandas.core.groupby.GroupBy.get_group,pandas.core.groupby.DataFrameGroupBy.get_group +reference/api/pandas.core.groupby.GroupBy.groups,pandas.core.groupby.DataFrameGroupBy.groups +reference/api/pandas.core.groupby.GroupBy.head,pandas.core.groupby.DataFrameGroupBy.head +reference/api/pandas.core.groupby.GroupBy.indices,pandas.core.groupby.DataFrameGroupBy.indices +reference/api/pandas.core.groupby.GroupBy.last,pandas.core.groupby.DataFrameGroupBy.last +reference/api/pandas.core.groupby.GroupBy.max,pandas.core.groupby.DataFrameGroupBy.max +reference/api/pandas.core.groupby.GroupBy.mean,pandas.core.groupby.DataFrameGroupBy.mean +reference/api/pandas.core.groupby.GroupBy.median,pandas.core.groupby.DataFrameGroupBy.median 
+reference/api/pandas.core.groupby.GroupBy.min,pandas.core.groupby.DataFrameGroupBy.min +reference/api/pandas.core.groupby.GroupBy.ngroup,pandas.core.groupby.DataFrameGroupBy.ngroup +reference/api/pandas.core.groupby.GroupBy.nth,pandas.core.groupby.DataFrameGroupBy.nth +reference/api/pandas.core.groupby.GroupBy.ohlc,pandas.core.groupby.DataFrameGroupBy.ohlc +reference/api/pandas.core.groupby.GroupBy.pct_change,pandas.core.groupby.DataFrameGroupBy.pct_change +reference/api/pandas.core.groupby.GroupBy.pipe,pandas.core.groupby.DataFrameGroupBy.pipe +reference/api/pandas.core.groupby.GroupBy.prod,pandas.core.groupby.DataFrameGroupBy.prod +reference/api/pandas.core.groupby.GroupBy.rank,pandas.core.groupby.DataFrameGroupBy.rank +reference/api/pandas.core.groupby.GroupBy.sem,pandas.core.groupby.DataFrameGroupBy.sem +reference/api/pandas.core.groupby.GroupBy.size,pandas.core.groupby.DataFrameGroupBy.size +reference/api/pandas.core.groupby.GroupBy.std,pandas.core.groupby.DataFrameGroupBy.std +reference/api/pandas.core.groupby.GroupBy.sum,pandas.core.groupby.DataFrameGroupBy.sum +reference/api/pandas.core.groupby.GroupBy.tail,pandas.core.groupby.DataFrameGroupBy.tail +reference/api/pandas.core.groupby.GroupBy.transform,pandas.core.groupby.DataFrameGroupBy.transform +reference/api/pandas.core.groupby.GroupBy.var,pandas.core.groupby.DataFrameGroupBy.var + +# Renamed or alias doc page was removed +reference/api/pandas.DataFrame.subtract,pandas.DataFrame.sub +reference/api/pandas.DataFrame.multiply,pandas.DataFrame.mul +reference/api/pandas.DataFrame.divide,pandas.DataFrame.div +reference/api/pandas.Series.subtract,pandas.Series.sub +reference/api/pandas.Series.multiply,pandas.Series.mul +reference/api/pandas.Series.divide,pandas.Series.div +reference/api/pandas.Series.tolist,pandas.Series.to_list +reference/api/pandas.Series.transpose,pandas.Series.T +reference/api/pandas.Index.transpose,pandas.Index.T +reference/api/pandas.Index.notnull,pandas.Index.notna +reference/api/pandas.Index.tolist,pandas.Index.to_list +reference/api/pandas.arrays.PandasArray,pandas.arrays.NumpyExtensionArray +reference/api/pandas.core.groupby.DataFrameGroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill +reference/api/pandas.core.groupby.GroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill +reference/api/pandas.core.resample.Resampler.backfill,pandas.core.resample.Resampler.bfill +reference/api/pandas.io.formats.style.Styler.applymap,pandas.io.formats.style.Styler.map +reference/api/pandas.io.formats.style.Styler.applymap_index,pandas.io.formats.style.Styler.map_index + +# EWM -> ExponentialMovingWindow +reference/api/pandas.core.window.ewm.EWM.corr,pandas.core.window.ewm.ExponentialMovingWindow.corr +reference/api/pandas.core.window.ewm.EWM.cov,pandas.core.window.ewm.ExponentialMovingWindow.cov +reference/api/pandas.core.window.ewm.EWM.mean,pandas.core.window.ewm.ExponentialMovingWindow.mean +reference/api/pandas.core.window.ewm.EWM.std,pandas.core.window.ewm.ExponentialMovingWindow.std +reference/api/pandas.core.window.ewm.EWM.var,pandas.core.window.ewm.ExponentialMovingWindow.var From a3af1523938765527188fa779277d1cccb3e9900 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Dec 2023 21:45:11 +0100 Subject: [PATCH 04/31] CoW: Set a bunch of Series copy statements to False (#56520) --- pandas/core/groupby/groupby.py | 14 +++++++------- pandas/core/groupby/ops.py | 2 +- pandas/core/indexes/base.py | 2 +- pandas/io/sas/sas7bdat.py | 8 ++++---- 
pandas/io/stata.py | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 35d55efed5aa9..089e15afd465b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2200,7 +2200,7 @@ def any(self, skipna: bool = True) -> NDFrameT: """ return self._cython_agg_general( "any", - alt=lambda x: Series(x).any(skipna=skipna), + alt=lambda x: Series(x, copy=False).any(skipna=skipna), skipna=skipna, ) @@ -2257,7 +2257,7 @@ def all(self, skipna: bool = True) -> NDFrameT: """ return self._cython_agg_general( "all", - alt=lambda x: Series(x).all(skipna=skipna), + alt=lambda x: Series(x, copy=False).all(skipna=skipna), skipna=skipna, ) @@ -2451,7 +2451,7 @@ def mean( else: result = self._cython_agg_general( "mean", - alt=lambda x: Series(x).mean(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only), numeric_only=numeric_only, ) return result.__finalize__(self.obj, method="groupby") @@ -2531,7 +2531,7 @@ def median(self, numeric_only: bool = False) -> NDFrameT: """ result = self._cython_agg_general( "median", - alt=lambda x: Series(x).median(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only), numeric_only=numeric_only, ) return result.__finalize__(self.obj, method="groupby") @@ -2640,7 +2640,7 @@ def std( else: return self._cython_agg_general( "std", - alt=lambda x: Series(x).std(ddof=ddof), + alt=lambda x: Series(x, copy=False).std(ddof=ddof), numeric_only=numeric_only, ddof=ddof, ) @@ -2747,7 +2747,7 @@ def var( else: return self._cython_agg_general( "var", - alt=lambda x: Series(x).var(ddof=ddof), + alt=lambda x: Series(x, copy=False).var(ddof=ddof), numeric_only=numeric_only, ddof=ddof, ) @@ -2977,7 +2977,7 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: ) return self._cython_agg_general( "sem", - alt=lambda x: Series(x).sem(ddof=ddof), + alt=lambda x: Series(x, copy=False).sem(ddof=ddof), numeric_only=numeric_only, ddof=ddof, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index eabeba43543ed..5e83eaee02afc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -707,7 +707,7 @@ def size(self) -> Series: out = np.bincount(ids[ids != -1], minlength=ngroups) else: out = [] - return Series(out, index=self.result_index, dtype="int64") + return Series(out, index=self.result_index, dtype="int64", copy=False) @cache_readonly def groups(self) -> dict[Hashable, np.ndarray]: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 17ed50dad29e5..edb851c7d0162 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -723,7 +723,7 @@ def _format_duplicate_message(self) -> DataFrame: assert len(duplicates) out = ( - Series(np.arange(len(self))) + Series(np.arange(len(self)), copy=False) .groupby(self, observed=False) .agg(list)[duplicates] ) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 331dd2d2da7a4..c5bdfb5541788 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -100,10 +100,10 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series: sas_datetimes._values, unit="s", out_unit="ms" ) dt64ms = millis.view("M8[ms]") + td - return pd.Series(dt64ms, index=sas_datetimes.index) + return pd.Series(dt64ms, index=sas_datetimes.index, copy=False) else: vals = np.array(sas_datetimes, dtype="M8[D]") + td - return pd.Series(vals, dtype="M8[s]", 
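The pattern applied throughout this patch is the same: pass ``copy=False`` when the constructor input is an array pandas already owns. A minimal sketch of the effect, with illustrative data:

    import numpy as np
    import pandas as pd

    values = np.arange(3)
    # copy=False wraps `values` directly instead of making a defensive copy;
    # Copy-on-Write still copies lazily if either side is later mutated.
    ser = pd.Series(values, copy=False)
    assert np.shares_memory(values, ser.to_numpy())
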
index=sas_datetimes.index) + return pd.Series(vals, dtype="M8[s]", index=sas_datetimes.index, copy=False) class _Column: @@ -727,7 +727,7 @@ def _chunk_to_dataframe(self) -> DataFrame: if self._column_types[j] == b"d": col_arr = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d") - rslt[name] = pd.Series(col_arr, dtype=np.float64, index=ix) + rslt[name] = pd.Series(col_arr, dtype=np.float64, index=ix, copy=False) if self.convert_dates: if self.column_formats[j] in const.sas_date_formats: rslt[name] = _convert_datetimes(rslt[name], "d") @@ -735,7 +735,7 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt[name] = _convert_datetimes(rslt[name], "s") jb += 1 elif self._column_types[j] == b"s": - rslt[name] = pd.Series(self._string_chunk[js, :], index=ix) + rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False) if self.convert_text and (self.encoding is not None): rslt[name] = self._decode_string(rslt[name].str) js += 1 diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d1484510b654f..0f097c6059c7c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -494,11 +494,11 @@ def g(x: datetime) -> int: else: raise ValueError(f"Format {fmt} is not a known Stata date format") - conv_dates = Series(conv_dates, dtype=np.float64) + conv_dates = Series(conv_dates, dtype=np.float64, copy=False) missing_value = struct.unpack(" Date: Thu, 21 Dec 2023 15:59:19 -0500 Subject: [PATCH 05/31] BUG: stack changes NA values in the index (#56582) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/reshape/reshape.py | 4 +-- pandas/tests/frame/test_stack_unstack.py | 38 ++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 70039cc697b8a..3810171908340 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -689,6 +689,7 @@ Reshaping - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) - Bug in :meth:`DataFrame.pivot_table` where the row margin is incorrect when the columns have numeric names (:issue:`26568`) - Bug in :meth:`DataFrame.pivot` with numeric columns and extension dtype for data (:issue:`56528`) +- Bug in :meth:`DataFrame.stack` and :meth:`Series.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 8c822ec58e011..7a49682d7c57c 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -953,8 +953,8 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: index_levels = frame.index.levels index_codes = list(np.tile(frame.index.codes, (1, ratio))) else: - index_levels = [frame.index.unique()] - codes = factorize(frame.index)[0] + codes, uniques = factorize(frame.index, use_na_sentinel=False) + index_levels = [uniques] index_codes = list(np.tile(codes, (1, ratio))) if isinstance(stack_cols, MultiIndex): column_levels = ordered_stack_cols.levels diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 554a9d4ce2d5d..6e1e743eb60de 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -2638,3 +2638,41 @@ def test_stack_tuple_columns(future_stack): ), ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype, na_value", + [ + ("float64", np.nan), + ("Float64", np.nan), + ("Float64", pd.NA), + ("Int64", pd.NA), + ], 
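The core of the stack fix is ``use_na_sentinel=False`` in the ``factorize`` call above; a small sketch of the difference it makes (illustrative values):

    import numpy as np
    import pandas as pd

    idx = pd.Index([1.0, np.nan])
    # Without the sentinel, NaN would be coded as -1 and dropped from
    # `uniques`; with use_na_sentinel=False it gets a real code and so
    # survives as a level of the rebuilt MultiIndex.
    codes, uniques = pd.factorize(idx, use_na_sentinel=False)
    # codes -> [0, 1], uniques -> Index([1.0, nan])
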
+) +@pytest.mark.parametrize("test_multiindex", [True, False]) +def test_stack_preserves_na(dtype, na_value, test_multiindex): + # GH#56573 + if test_multiindex: + index = MultiIndex.from_arrays(2 * [Index([na_value], dtype=dtype)]) + else: + index = Index([na_value], dtype=dtype) + df = DataFrame({"a": [1]}, index=index) + result = df.stack(future_stack=True) + + if test_multiindex: + expected_index = MultiIndex.from_arrays( + [ + Index([na_value], dtype=dtype), + Index([na_value], dtype=dtype), + Index(["a"]), + ] + ) + else: + expected_index = MultiIndex.from_arrays( + [ + Index([na_value], dtype=dtype), + Index(["a"]), + ] + ) + expected = Series(1, index=expected_index) + tm.assert_series_equal(result, expected) From 937964214e65c9d3154432e88e484003fa92358b Mon Sep 17 00:00:00 2001 From: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> Date: Thu, 21 Dec 2023 16:01:17 -0500 Subject: [PATCH 06/31] Support tuple in startswith/endswith for arrow strings (#56580) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 30 ++++++++++++++++++++++++---- pandas/tests/extension/test_arrow.py | 24 +++++++++++++++++----- 3 files changed, 46 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 3810171908340..14565a7e6a421 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -603,6 +603,7 @@ Strings - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) - Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) +- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) Interval diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 633efe43fce1a..de1fb0ec5d4d5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2174,14 +2174,36 @@ def _str_contains( result = result.fill_null(na) return type(self)(result) - def _str_startswith(self, pat: str, na=None): - result = pc.starts_with(self._pa_array, pattern=pat) + def _str_startswith(self, pat: str | tuple[str, ...], na=None): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) return type(self)(result) - def _str_endswith(self, pat: str, na=None): - result = pc.ends_with(self._pa_array, pattern=pat) + def _str_endswith(self, pat: str | tuple[str, ...], na=None): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. 
+ result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) return type(self)(result) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 674a5da216011..28ee5446d4b5c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1801,19 +1801,33 @@ def test_str_contains_flags_unsupported(): @pytest.mark.parametrize( "side, pat, na, exp", [ - ["startswith", "ab", None, [True, None]], - ["startswith", "b", False, [False, False]], - ["endswith", "b", True, [False, True]], - ["endswith", "bc", None, [True, None]], + ["startswith", "ab", None, [True, None, False]], + ["startswith", "b", False, [False, False, False]], + ["endswith", "b", True, [False, True, False]], + ["endswith", "bc", None, [True, None, False]], + ["startswith", ("a", "e", "g"), None, [True, None, True]], + ["endswith", ("a", "c", "g"), None, [True, None, True]], + ["startswith", (), None, [False, None, False]], + ["endswith", (), None, [False, None, False]], ], ) def test_str_start_ends_with(side, pat, na, exp): - ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) + ser = pd.Series(["abc", None, "efg"], dtype=ArrowDtype(pa.string())) result = getattr(ser.str, side)(pat, na=na) expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("side", ("startswith", "endswith")) +def test_str_starts_ends_with_all_nulls_empty_tuple(side): + ser = pd.Series([None, None], dtype=ArrowDtype(pa.string())) + result = getattr(ser.str, side)(()) + + # bool datatype preserved for all nulls. 
+ expected = pd.Series([None, None], dtype=ArrowDtype(pa.bool_())) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "arg_name, arg", [["pat", re.compile("b")], ["repl", str], ["case", False], ["flags", 1]], From e00a7bcef6af879931417a82f185d6d8e3d81cbb Mon Sep 17 00:00:00 2001 From: aram-cinnamon <97805700+aram-cinnamon@users.noreply.github.com> Date: Thu, 21 Dec 2023 22:02:08 +0100 Subject: [PATCH 07/31] Add test groupby complex numbers (#56570) --- pandas/tests/groupby/test_groupby.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index fce7caa90cce4..4c903e691add1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1196,6 +1196,24 @@ def test_groupby_complex(): tm.assert_series_equal(result, expected) +def test_groupby_complex_mean(): + # GH 26475 + df = DataFrame( + [ + {"a": 2, "b": 1 + 2j}, + {"a": 1, "b": 1 + 1j}, + {"a": 1, "b": 1 + 2j}, + ] + ) + result = df.groupby("b").mean() + expected = DataFrame( + [[1.0], [1.5]], + index=Index([(1 + 1j), (1 + 2j)], name="b"), + columns=Index(["a"]), + ) + tm.assert_frame_equal(result, expected) + + def test_groupby_complex_numbers(using_infer_string): # GH 17927 df = DataFrame( From 5ef4a350d65b359c24858593de16466f70f2752a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 21 Dec 2023 11:04:37 -1000 Subject: [PATCH 08/31] BUG: ArrowExtensionArray.to_numpy from timestamp to int (#56567) --- pandas/core/arrays/arrow/array.py | 16 ++++++---------- pandas/tests/extension/test_arrow.py | 13 ++++++++++++- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index de1fb0ec5d4d5..7183eb502ce5e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1322,16 +1322,12 @@ def to_numpy( copy = False if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): - result = data._maybe_convert_datelike_array() - if (pa.types.is_timestamp(pa_type) and pa_type.tz is not None) or ( - dtype is not None and dtype.kind == "O" - ): - dtype = object - else: - # GH 55997 - dtype = None - na_value = pa_type.to_pandas_dtype().type("nat", pa_type.unit) - result = result.to_numpy(dtype=dtype, na_value=na_value) + # GH 55997 + if dtype != object and na_value is self.dtype.na_value: + na_value = lib.no_default + result = data._maybe_convert_datelike_array().to_numpy( + dtype=dtype, na_value=na_value + ) elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type): # convert to list of python datetime.time objects before # wrapping in ndarray diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 28ee5446d4b5c..7d87c366f0566 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3030,7 +3030,10 @@ def test_to_numpy_temporal(pa_type, dtype): value = pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit) if dtype == object or (pa.types.is_timestamp(pa_type) and pa_type.tz is not None): - na = pd.NA + if dtype == object: + na = pd.NA + else: + na = pd.NaT expected = np.array([value, na], dtype=object) assert result[0].unit == value.unit else: @@ -3142,3 +3145,11 @@ def test_string_to_time_parsing_cast(): ArrowExtensionArray(pa.array([time(11, 41, 43, 76160)], from_pandas=True)) ) tm.assert_series_equal(result, expected) + + +def 
test_to_numpy_timestamp_to_int(): + # GH 55997 + ser = pd.Series(["2020-01-01 04:30:00"], dtype="timestamp[ns][pyarrow]") + result = ser.to_numpy(dtype=np.int64) + expected = np.array([1577853000000000000]) + tm.assert_numpy_array_equal(result, expected) From 2488e5e6d783b2c63a45085ff3066b52f119b49c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Dec 2023 22:05:38 +0100 Subject: [PATCH 09/31] Switch arrow type for string array to large string (#56220) --- doc/source/whatsnew/v2.2.0.rst | 2 + pandas/core/arrays/arrow/array.py | 21 +++++++--- pandas/core/arrays/string_arrow.py | 38 +++++++++++++------ pandas/io/sql.py | 14 +++++-- pandas/tests/arrays/string_/test_string.py | 16 ++++++-- .../tests/arrays/string_/test_string_arrow.py | 9 +++-- pandas/tests/io/json/test_pandas.py | 7 ++++ pandas/tests/io/parser/test_read_fwf.py | 6 +++ pandas/tests/io/test_clipboard.py | 7 ++++ pandas/tests/io/test_feather.py | 6 +++ pandas/tests/io/test_html.py | 5 +++ pandas/tests/io/test_sql.py | 7 ++++ pandas/tests/io/xml/test_xml.py | 7 ++++ 13 files changed, 118 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 14565a7e6a421..84a1c2f0077bd 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -236,6 +236,8 @@ Other enhancements - Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`) - Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`) - Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`) +- The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) + .. --------------------------------------------------------------------------- .. 
_whatsnew_220.notable_bug_fixes: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7183eb502ce5e..859c4b39571f8 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -291,6 +291,7 @@ def _from_sequence_of_strings( pa_type is None or pa.types.is_binary(pa_type) or pa.types.is_string(pa_type) + or pa.types.is_large_string(pa_type) ): # pa_type is None: Let pa.array infer # pa_type is string/binary: scalars already correct type @@ -632,7 +633,9 @@ def __invert__(self) -> Self: # This is a bit wise op for integer types if pa.types.is_integer(self._pa_array.type): return type(self)(pc.bit_wise_not(self._pa_array)) - elif pa.types.is_string(self._pa_array.type): + elif pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( + self._pa_array.type + ): # Raise TypeError instead of pa.ArrowNotImplementedError raise TypeError("__invert__ is not supported for string dtypes") else: @@ -692,7 +695,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs): pa_type = self._pa_array.type other = self._box_pa(other) - if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): + if ( + pa.types.is_string(pa_type) + or pa.types.is_large_string(pa_type) + or pa.types.is_binary(pa_type) + ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) if op is operator.add: @@ -709,7 +716,9 @@ def _evaluate_op_method(self, other, op, arrow_funcs): result = pc.binary_repeat(binary, pa_integral) return type(self)(result) elif ( - pa.types.is_string(other.type) or pa.types.is_binary(other.type) + pa.types.is_string(other.type) + or pa.types.is_binary(other.type) + or pa.types.is_large_string(other.type) ) and op in [operator.mul, roperator.rmul]: binary = other integral = self._pa_array @@ -1467,7 +1476,7 @@ def _concat_same_type(cls, to_concat) -> Self: chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] if to_concat[0].dtype == "string": # StringDtype has no attribute pyarrow_dtype - pa_dtype = pa.string() + pa_dtype = pa.large_string() else: pa_dtype = to_concat[0].dtype.pyarrow_dtype arr = pa.chunked_array(chunks, type=pa_dtype) @@ -2271,7 +2280,9 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return type(self)(result) def _str_join(self, sep: str): - if pa.types.is_string(self._pa_array.type): + if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( + self._pa_array.type + ): result = self._apply_elementwise(list) result = pa.chunked_array(result, type=pa.list_(pa.string())) else: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 32ab3054c0f51..56732619a2d29 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -126,17 +126,40 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr _storage = "pyarrow" def __init__(self, values) -> None: + _chk_pyarrow_available() + if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( + values.type + ): + values = pc.cast(values, pa.large_string()) + super().__init__(values) self._dtype = StringDtype(storage=self._storage) - if not pa.types.is_string(self._pa_array.type) and not ( + if not pa.types.is_large_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_string(self._pa_array.type.value_type) + and pa.types.is_large_string(self._pa_array.type.value_type) ): raise ValueError( - "ArrowStringArray requires a PyArrow (chunked) 
array of string type" + "ArrowStringArray requires a PyArrow (chunked) array of " + "large_string type" ) + @classmethod + def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: + pa_scalar = super()._box_pa_scalar(value, pa_type) + if pa.types.is_string(pa_scalar.type) and pa_type is None: + pa_scalar = pc.cast(pa_scalar, pa.large_string()) + return pa_scalar + + @classmethod + def _box_pa_array( + cls, value, pa_type: pa.DataType | None = None, copy: bool = False + ) -> pa.Array | pa.ChunkedArray: + pa_array = super()._box_pa_array(value, pa_type) + if pa.types.is_string(pa_array.type) and pa_type is None: + pa_array = pc.cast(pa_array, pa.large_string()) + return pa_array + def __len__(self) -> int: """ Length of this array. @@ -574,15 +597,6 @@ def _rank( class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" - def __init__(self, values) -> None: - _chk_pyarrow_available() - - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string( - values.type - ): - values = pc.cast(values, pa.string()) - super().__init__(values) - @classmethod def _result_converter(cls, values, na=None): if not isna(na): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 12118d1488932..b0fa6bc6e90c4 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -172,9 +172,17 @@ def _convert_arrays_to_dataframe( ) if dtype_backend == "pyarrow": pa = import_optional_dependency("pyarrow") - arrays = [ - ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays - ] + + result_arrays = [] + for arr in arrays: + pa_array = pa.array(arr, from_pandas=True) + if arr.dtype == "string": + # TODO: Arrow still infers strings arrays as regular strings instead + # of large_string, which is what we preserver everywhere else for + # dtype_backend="pyarrow". 
We may want to reconsider this + pa_array = pa_array.cast(pa.string()) + result_arrays.append(ArrowExtensionArray(pa_array)) + arrays = result_arrays # type: ignore[assignment] if arrays: df = DataFrame(dict(zip(list(range(len(columns))), arrays))) df.columns = columns diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 41255b2516e7e..320bdca60a932 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -487,13 +487,15 @@ def test_fillna_args(dtype, arrow_string_storage): def test_arrow_array(dtype): # protocol added in 0.15.0 pa = pytest.importorskip("pyarrow") + import pyarrow.compute as pc data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) - expected = pa.array(list(data), type=pa.string(), from_pandas=True) + expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) - + if dtype.storage == "python": + expected = pc.cast(expected, pa.string()) assert arr.equals(expected) @@ -512,7 +514,10 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - assert table.field("a").type == "string" + if dtype.storage == "python": + assert table.field("a").type == "string" + else: + assert table.field("a").type == "large_string" with pd.option_context("string_storage", string_storage2): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) @@ -539,7 +544,10 @@ def test_arrow_load_from_zero_chunks( data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - assert table.field("a").type == "string" + if dtype.storage == "python": + assert table.field("a").type == "string" + else: + assert table.field("a").type == "large_string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) with pd.option_context("string_storage", string_storage2): diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 222b77cb4e94f..d7811b6fed883 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -61,7 +61,7 @@ def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage msg = "Unsupported type '' for ArrowExtensionArray" else: msg = re.escape( - "ArrowStringArray requires a PyArrow (chunked) array of string type" + "ArrowStringArray requires a PyArrow (chunked) array of large_string type" ) with pytest.raises(ValueError, match=msg): ArrowStringArray(arr) @@ -76,17 +76,20 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): arr = pa.chunked_array(arr) msg = re.escape( - "ArrowStringArray requires a PyArrow (chunked) array of string type" + "ArrowStringArray requires a PyArrow (chunked) array of large_string type" ) with pytest.raises(ValueError, match=msg): ArrowStringArray(arr) +@pytest.mark.xfail( + reason="dict conversion does not seem to be implemented for large string in arrow" +) @pytest.mark.parametrize("chunked", [True, False]) def test_constructor_valid_string_type_value_dictionary(chunked): pa = pytest.importorskip("pyarrow") - arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.utf8())) + arr = pa.array(["1", "2", "3"], 
pa.large_string()).dictionary_encode() if chunked: arr = pa.chunked_array(arr) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 58a1e51d31b74..0eefb0b52c483 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2054,6 +2054,13 @@ def test_read_json_dtype_backend( string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) + else: string_array = ArrowStringArray(pa.array(["a", "b", "c"])) string_array_na = ArrowStringArray(pa.array(["a", "b", None])) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 8566b87ef4292..bed2b5e10a6f7 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -971,6 +971,12 @@ def test_dtype_backend(string_storage, dtype_backend): if string_storage == "python": arr = StringArray(np.array(["a", "b"], dtype=np.object_)) arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + arr = ArrowExtensionArray(pa.array(["a", "b"])) + arr_na = ArrowExtensionArray(pa.array([None, "a"])) else: pa = pytest.importorskip("pyarrow") arr = ArrowStringArray(pa.array(["a", "b"])) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 8564f09ef7ae9..3c0208fcc74ec 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -359,6 +359,13 @@ def test_read_clipboard_dtype_backend( string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow" and engine != "c": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["x", "y"])) + string_array_na = ArrowExtensionArray(pa.array(["x", None])) + else: string_array = ArrowStringArray(pa.array(["x", "y"])) string_array_na = ArrowStringArray(pa.array(["x", None])) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 15c5953e79bda..22a7d3b83a459 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -186,6 +186,12 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) + else: string_array = ArrowStringArray(pa.array(["a", "b", "c"])) string_array_na = ArrowStringArray(pa.array(["a", "b", None])) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index f0256316e1689..607357e709b6e 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -183,7 +183,12 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): if string_storage == "python": string_array = 
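A quick check of the new backing type described in this patch; whether PyArrow returns a chunked result depends on the installed version, but the type is ``large_string`` either way:

    import pandas as pd
    import pyarrow as pa

    data = pd.array(["a", "b", "c"], dtype="string[pyarrow]")
    # 64-bit offsets mean a single column is no longer capped at the
    # 2 GiB total-string-data limit of the 32-bit-offset string type.
    assert pa.array(data).type == pa.large_string()
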
StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["a", "b", "c"])) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index e3272e5f5902d..6645aefd4f0a7 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -3647,6 +3647,13 @@ def func(storage, dtype_backend, conn_name) -> DataFrame: string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment] + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment] + else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["a", "b", "c"])) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index e4456b0a78e06..6f429c1ecbf8a 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -2044,6 +2044,13 @@ def test_read_xml_nullable_dtypes( string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["x", "y"])) + string_array_na = ArrowExtensionArray(pa.array(["x", None])) + else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["x", "y"])) From 3096bd66632b4243c675cde715a0a1446870ef3b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Dec 2023 23:13:33 +0100 Subject: [PATCH 10/31] DOC: Add whatsnew illustrating upcoming changes (#56545) --- doc/source/user_guide/copy_on_write.rst | 2 + doc/source/whatsnew/v2.2.0.rst | 83 +++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst index bc233f4323e2a..050c3901c3420 100644 --- a/doc/source/user_guide/copy_on_write.rst +++ b/doc/source/user_guide/copy_on_write.rst @@ -52,6 +52,8 @@ it explicitly disallows this. With CoW enabled, ``df`` is unchanged: The following sections will explain what this means and how it impacts existing applications. +.. _copy_on_write.migration_guide: + Migrating to Copy-on-Write -------------------------- diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 84a1c2f0077bd..03d6025b4ef93 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -9,6 +9,89 @@ including other versions of pandas. {{ header }} .. --------------------------------------------------------------------------- + +.. _whatsnew_220.upcoming_changes: + +Upcoming changes in pandas 3.0 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +pandas 3.0 will bring two bigger changes to the default behavior of pandas. 
+
+Copy-on-Write
+^^^^^^^^^^^^^
+
+The currently optional Copy-on-Write mode will be enabled by default in pandas 3.0. There
+will be no option to retain the current behavior. The new behavioral semantics are
+explained in the :ref:`user guide about Copy-on-Write <copy_on_write>`.
+
+The new behavior has been available since pandas 2.0 and can be enabled with the
+following option:
+
+.. code-block:: ipython
+
+    pd.options.mode.copy_on_write = True
+
+This change brings several behavior changes in how pandas operates with respect to
+copies and views. Some of these changes allow a clear deprecation, like the changes in
+chained assignment. Other changes are more subtle, and thus the warnings are hidden
+behind an option that can be enabled in pandas 2.2:
+
+.. code-block:: ipython
+
+    pd.options.mode.copy_on_write = "warn"
+
+This mode will warn in many different scenarios that aren't actually relevant to
+most use cases. We recommend exploring this mode, but it is not necessary to get rid
+of all of these warnings. The :ref:`migration guide <copy_on_write.migration_guide>`
+explains the upgrade process in more detail.
+
+Dedicated string data type (backed by Arrow) by default
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Historically, pandas represented string columns with the NumPy object data type. This
+representation has numerous problems, including slow performance and a large memory
+footprint. This will change in pandas 3.0: pandas will start inferring string columns
+as a new ``string`` data type, backed by Arrow, which stores strings contiguously in
+memory. This brings a huge performance and memory improvement.
+
+Old behavior:
+
+.. code-block:: ipython
+
+    In [1]: pd.Series(["a", "b"])
+    Out[1]:
+    0    a
+    1    b
+    dtype: object
+
+New behavior:
+
+.. code-block:: ipython
+
+    In [1]: pd.Series(["a", "b"])
+    Out[1]:
+    0    a
+    1    b
+    dtype: string
+
+The string data type that is used in these scenarios will mostly behave as the NumPy
+object dtype would, including missing value semantics and general operations on these
+columns.
+
+This change includes a few additional changes across the API:
+
+- Currently, specifying ``dtype="string"`` creates a dtype that is backed by Python strings
+  which are stored in a NumPy array. This will change in pandas 3.0: this dtype will then
+  create an Arrow-backed string column.
+- The column names and the Index will also be backed by Arrow strings.
+- PyArrow will become a required dependency with pandas 3.0 to accommodate this change.
+
+This future dtype inference logic can be enabled with:
+
+.. code-block:: ipython
+
+    pd.options.future.infer_string = True
+
 .. _whatsnew_220.enhancements:

 Enhancements

From 2a9c3d7d13ee898c9a5b187f5be54fb675c1cef6 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Thu, 21 Dec 2023 23:15:49 +0100
Subject: [PATCH 11/31] Convert ArrowExtensionArray to proper NumPy dtype
 (#56290)

---
 doc/source/whatsnew/v2.2.0.rst       |  2 +-
 pandas/core/arrays/_utils.py         | 54 ++++++++++++++++++++++++++++
 pandas/core/arrays/arrow/array.py    | 16 +++++----
 pandas/core/arrays/masked.py         | 34 ++----------------
 pandas/tests/extension/test_arrow.py | 14 +++++---
 pandas/tests/series/test_npfuncs.py  | 11 ++++++
 6 files changed, 89 insertions(+), 42 deletions(-)
 create mode 100644 pandas/core/arrays/_utils.py

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 03d6025b4ef93..d9856eb695652 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -194,7 +194,7 @@ documentation.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ``to_numpy`` for NumPy nullable and Arrow types will now convert to a -suitable NumPy dtype instead of ``object`` dtype for nullable extension dtypes. +suitable NumPy dtype instead of ``object`` dtype for nullable and PyArrow backed extension dtypes. *Old behavior:* diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py new file mode 100644 index 0000000000000..c75ec7f843ed2 --- /dev/null +++ b/pandas/core/arrays/_utils.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, +) + +import numpy as np + +from pandas._libs import lib +from pandas.errors import LossySetitemError + +from pandas.core.dtypes.cast import np_can_hold_element +from pandas.core.dtypes.common import is_numeric_dtype + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + npt, + ) + + +def to_numpy_dtype_inference( + arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool +) -> tuple[npt.DTypeLike, Any]: + if dtype is None and is_numeric_dtype(arr.dtype): + dtype_given = False + if hasna: + if arr.dtype.kind == "b": + dtype = np.dtype(np.object_) + else: + if arr.dtype.kind in "iu": + dtype = np.dtype(np.float64) + else: + dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] + if na_value is lib.no_default: + na_value = np.nan + else: + dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] + elif dtype is not None: + dtype = np.dtype(dtype) + dtype_given = True + else: + dtype_given = True + + if na_value is lib.no_default: + na_value = arr.dtype.na_value + + if not dtype_given and hasna: + try: + np_can_hold_element(dtype, na_value) # type: ignore[arg-type] + except LossySetitemError: + dtype = np.dtype(np.object_) + return dtype, na_value diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 859c4b39571f8..23b5448029dd9 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -39,6 +39,7 @@ is_bool_dtype, is_integer, is_list_like, + is_numeric_dtype, is_scalar, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -50,8 +51,10 @@ ops, roperator, ) +from pandas.core.algorithms import map_array from pandas.core.arraylike import OpsMixin from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin +from pandas.core.arrays._utils import to_numpy_dtype_inference from pandas.core.arrays.base import ( ExtensionArray, ExtensionArraySupportsAnyAll, @@ -1317,12 +1320,7 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: - if dtype is not None: - dtype = np.dtype(dtype) - - if na_value is lib.no_default: - na_value = self.dtype.na_value - + dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna) pa_type = self._pa_array.type if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): data = self @@ -1366,6 +1364,12 @@ def to_numpy( result[~mask] = data[~mask]._pa_array.to_numpy() return result + def map(self, mapper, na_action=None): + if is_numeric_dtype(self.dtype): + return map_array(self.to_numpy(), mapper, na_action=None) + else: + return super().map(mapper, na_action) + @doc(ExtensionArray.duplicated) def duplicated( self, keep: Literal["first", "last", False] = "first" diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 00c7276a2216e..03c09c5b2fd18 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -35,15 +35,11 @@ IS64, is_platform_windows, ) 
-from pandas.errors import ( - AbstractMethodError, - LossySetitemError, -) +from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.cast import np_can_hold_element from pandas.core.dtypes.common import ( is_bool, is_integer_dtype, @@ -80,6 +76,7 @@ ) from pandas.core.array_algos.quantile import quantile_with_mask from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._utils import to_numpy_dtype_inference from pandas.core.arrays.base import ExtensionArray from pandas.core.construction import ( array as pd_array, @@ -477,32 +474,7 @@ def to_numpy( array([ True, False, False]) """ hasna = self._hasna - - if dtype is None: - dtype_given = False - if hasna: - if self.dtype.kind == "b": - dtype = object - else: - if self.dtype.kind in "iu": - dtype = np.dtype(np.float64) - else: - dtype = self.dtype.numpy_dtype - if na_value is lib.no_default: - na_value = np.nan - else: - dtype = self.dtype.numpy_dtype - else: - dtype = np.dtype(dtype) - dtype_given = True - if na_value is lib.no_default: - na_value = libmissing.NA - - if not dtype_given and hasna: - try: - np_can_hold_element(dtype, na_value) # type: ignore[arg-type] - except LossySetitemError: - dtype = object + dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna) if hasna: if ( diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7d87c366f0566..3b03272f18203 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -278,7 +278,13 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy(dtype=object) tm.assert_numpy_array_equal(result, expected) else: - super().test_map(data_missing, na_action) + result = data_missing.map(lambda x: x, na_action=na_action) + if data_missing.dtype == "float32[pyarrow]": + # map roundtrips through objects, which converts to float64 + expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) + else: + expected = data_missing.to_numpy() + tm.assert_numpy_array_equal(result, expected) def test_astype_str(self, data, request): pa_dtype = data.dtype.pyarrow_dtype @@ -1585,7 +1591,7 @@ def test_to_numpy_with_defaults(data): else: expected = np.array(data._pa_array) - if data._hasna: + if data._hasna and not is_numeric_dtype(data.dtype): expected = expected.astype(object) expected[pd.isna(data)] = pd.NA @@ -1597,8 +1603,8 @@ def test_to_numpy_int_with_na(): data = [1, None] arr = pd.array(data, dtype="int64[pyarrow]") result = arr.to_numpy() - expected = np.array([1, pd.NA], dtype=object) - assert isinstance(result[0], int) + expected = np.array([1, np.nan]) + assert isinstance(result[0], float) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_npfuncs.py b/pandas/tests/series/test_npfuncs.py index 08950db25b282..11a51c4700d5c 100644 --- a/pandas/tests/series/test_npfuncs.py +++ b/pandas/tests/series/test_npfuncs.py @@ -5,6 +5,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import Series import pandas._testing as tm @@ -33,3 +35,12 @@ def test_numpy_argwhere(index): expected = np.array([[3], [4]], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_log_arrow_backed_missing_value(): + # GH#56285 + ser = Series([1, 2, None], dtype="float64[pyarrow]") + result = np.log(ser) + expected 
= np.log(Series([1, 2, None], dtype="float64")) + tm.assert_series_equal(result, expected) From 886431948d014b8b2dec77399085db3d74f27c75 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Dec 2023 23:45:21 +0100 Subject: [PATCH 12/31] DEPR: Deprecate dtype inference on pandas objects (#56244) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_testing/__init__.py | 13 +++++++++--- pandas/core/frame.py | 16 ++++++++++++++ pandas/core/indexes/base.py | 16 +++++++++++++- pandas/core/series.py | 15 +++++++++++++ pandas/core/strings/accessor.py | 13 ++++++------ pandas/tests/copy_view/test_constructors.py | 3 ++- pandas/tests/frame/test_constructors.py | 17 +++++++++++++++ .../indexes/base_class/test_constructors.py | 14 +++++++++++++ pandas/tests/indexes/test_base.py | 3 ++- .../series/accessors/test_dt_accessor.py | 6 +++--- pandas/tests/series/methods/test_between.py | 2 +- pandas/tests/series/methods/test_equals.py | 6 ++++-- pandas/tests/series/test_constructors.py | 17 ++++++++++++++- pandas/tests/strings/test_cat.py | 21 ++++++++++++++----- .../tseries/frequencies/test_inference.py | 4 +++- 16 files changed, 141 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d9856eb695652..c8b9b3c7ede1b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -555,6 +555,7 @@ Other Deprecations - Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) - Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. 
To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) +- Deprecated dtype inference in :class:`Index`, :class:`Series` and :class:`DataFrame` constructors when giving a pandas input, call ``.infer_objects`` on the input to keep the current behavior (:issue:`56012`) - Deprecated dtype inference when setting a :class:`Index` into a :class:`DataFrame`, cast explicitly instead (:issue:`56102`) - Deprecated including the groups in computations when using :meth:`.DataFrameGroupBy.apply` and :meth:`.DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 69b2b0876fc80..672c16a85086c 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -10,6 +10,7 @@ ContextManager, cast, ) +import warnings import numpy as np @@ -285,11 +286,17 @@ def box_expected(expected, box_cls, transpose: bool = True): else: expected = pd.array(expected, copy=False) elif box_cls is Index: - expected = Index(expected) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) + expected = Index(expected) elif box_cls is Series: - expected = Series(expected) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) + expected = Series(expected) elif box_cls is DataFrame: - expected = Series(expected).to_frame() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) + expected = Series(expected).to_frame() if transpose: # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0d0dc476ee76a..3e2e589440bd9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -722,6 +722,10 @@ def __init__( manager = _get_option("mode.data_manager", silent=True) + is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) + data_dtype = getattr(data, "dtype", None) + original_dtype = dtype + # GH47215 if isinstance(index, set): raise ValueError("index cannot be a set") @@ -908,6 +912,18 @@ def __init__( NDFrame.__init__(self, mgr) + if original_dtype is None and is_pandas_object and data_dtype == np.object_: + if self.dtypes.iloc[0] != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The DataFrame " + "constructor will keep the original dtype in the future. 
" + "Call `infer_objects` on the result to get the old " + "behavior.", + FutureWarning, + stacklevel=2, + ) + # ---------------------------------------------------------------------- def __dataframe__( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index edb851c7d0162..88a08dd55f739 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -493,6 +493,8 @@ def __new__( if not copy and isinstance(data, (ABCSeries, Index)): refs = data._references + is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray)) + # range if isinstance(data, (range, RangeIndex)): result = RangeIndex(start=data, copy=copy, name=name) @@ -572,7 +574,19 @@ def __new__( klass = cls._dtype_to_subclass(arr.dtype) arr = klass._ensure_array(arr, arr.dtype, copy=False) - return klass._simple_new(arr, name, refs=refs) + result = klass._simple_new(arr, name, refs=refs) + if dtype is None and is_pandas_object and data_dtype == np.object_: + if result.dtype != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The Index " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old " + "behavior.", + FutureWarning, + stacklevel=2, + ) + return result # type: ignore[return-value] @classmethod def _ensure_array(cls, data, dtype, copy: bool): diff --git a/pandas/core/series.py b/pandas/core/series.py index a9d4438c5c56b..e3b401cd3c88b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -424,6 +424,10 @@ def __init__( self.name = name return + is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) + data_dtype = getattr(data, "dtype", None) + original_dtype = dtype + if isinstance(data, (ExtensionArray, np.ndarray)): if copy is not False and using_copy_on_write(): if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): @@ -581,6 +585,17 @@ def __init__( self.name = name self._set_axis(0, index) + if original_dtype is None and is_pandas_object and data_dtype == np.object_: + if self.dtype != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The Series " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old behavior.", + FutureWarning, + stacklevel=find_stack_level(), + ) + def _init_dict( self, data, index: Index | None = None, dtype: DtypeObj | None = None ): diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 75866c6f6013a..1b7d632c0fa80 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -689,19 +689,18 @@ def cat( result = cat_safe(all_cols, sep) out: Index | Series + if isinstance(self._orig.dtype, CategoricalDtype): + # We need to infer the new categories. + dtype = self._orig.dtype.categories.dtype + else: + dtype = self._orig.dtype if isinstance(self._orig, ABCIndex): # add dtype for case that result is all-NA - dtype = None if isna(result).all(): - dtype = object + dtype = object # type: ignore[assignment] out = Index(result, dtype=dtype, name=self._orig.name) else: # Series - if isinstance(self._orig.dtype, CategoricalDtype): - # We need to infer the new categories. 
- dtype = self._orig.dtype.categories.dtype # type: ignore[assignment] - else: - dtype = self._orig.dtype res_ser = Series( result, dtype=dtype, index=data.index, name=self._orig.name, copy=False ) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 7d5c485958039..1aa458a625028 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -314,7 +314,8 @@ def test_dataframe_from_series_or_index_different_dtype(using_copy_on_write, con def test_dataframe_from_series_infer_datetime(using_copy_on_write): ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object) - df = DataFrame(ser) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + df = DataFrame(ser) assert not np.shares_memory(get_array(ser), get_array(df, 0)) if using_copy_on_write: assert df._mgr._has_no_reference(0) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f64d6a886fe9f..6e818d79d5ba8 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2768,6 +2768,23 @@ def test_frame_string_inference_block_dim(self): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 + def test_inference_on_pandas_objects(self): + # GH#56012 + idx = Index([Timestamp("2019-12-31")], dtype=object) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = DataFrame(idx, columns=["a"]) + assert result.dtypes.iloc[0] != np.object_ + result = DataFrame({"a": idx}) + assert result.dtypes.iloc[0] == np.object_ + + ser = Series([Timestamp("2019-12-31")], dtype=object) + + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = DataFrame(ser, columns=["a"]) + assert result.dtypes.iloc[0] != np.object_ + result = DataFrame({"a": ser}) + assert result.dtypes.iloc[0] == np.object_ + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 60abbfc441e8e..fd5176a28565e 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -5,6 +5,7 @@ from pandas import ( Index, MultiIndex, + Series, ) import pandas._testing as tm @@ -57,3 +58,16 @@ def test_index_string_inference(self): with pd.option_context("future.infer_string", True): ser = Index(["a", 1]) tm.assert_index_equal(ser, expected) + + def test_inference_on_pandas_objects(self): + # GH#56012 + idx = Index([pd.Timestamp("2019-12-31")], dtype=object) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = Index(idx) + assert result.dtype != np.object_ + + ser = Series([pd.Timestamp("2019-12-31")], dtype=object) + + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = Index(ser) + assert result.dtype != np.object_ diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 185e34efdc177..666d92064c86c 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -104,7 +104,8 @@ def test_constructor_copy(self, index, using_infer_string): ) def test_constructor_from_index_dtlike(self, cast_as_obj, index): if cast_as_obj: - result = Index(index.astype(object)) + with 
tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = Index(index.astype(object)) else: result = Index(index) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 083a4c4b24adb..34465a7c12c18 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -259,9 +259,9 @@ def test_dt_accessor_limited_display_api(self): tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) # Period - ser = Series( - period_range("20130101", periods=5, freq="D", name="xxx").astype(object) - ) + idx = period_range("20130101", periods=5, freq="D", name="xxx").astype(object) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + ser = Series(idx) results = get_dir(ser) tm.assert_almost_equal( results, sorted(set(ok_for_period + ok_for_period_methods)) diff --git a/pandas/tests/series/methods/test_between.py b/pandas/tests/series/methods/test_between.py index 8f4931beae589..3913419038876 100644 --- a/pandas/tests/series/methods/test_between.py +++ b/pandas/tests/series/methods/test_between.py @@ -20,7 +20,7 @@ def test_between(self): tm.assert_series_equal(result, expected) def test_between_datetime_object_dtype(self): - ser = Series(bdate_range("1/1/2000", periods=20).astype(object)) + ser = Series(bdate_range("1/1/2000", periods=20), dtype=object) ser[::2] = np.nan result = ser[ser.between(ser[3], ser[17])] diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index b94723b7cbddf..875ffdd3fe851 100644 --- a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -82,13 +82,15 @@ def test_equals_matching_nas(): left = Series([np.datetime64("NaT")], dtype=object) right = Series([np.datetime64("NaT")], dtype=object) assert left.equals(right) - assert Index(left).equals(Index(right)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.timedelta64("NaT")], dtype=object) right = Series([np.timedelta64("NaT")], dtype=object) assert left.equals(right) - assert Index(left).equals(Index(right)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.float64("NaN")], dtype=object) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 0e6f1c284a988..5f591b4b22f1c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1316,7 +1316,8 @@ def test_constructor_periodindex(self): pi = period_range("20130101", periods=5, freq="D") s = Series(pi) assert s.dtype == "Period[D]" - expected = Series(pi.astype(object)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + expected = Series(pi.astype(object)) tm.assert_series_equal(s, expected) def test_constructor_dict(self): @@ -2137,6 +2138,20 @@ def test_series_string_inference_na_first(self): result = Series([pd.NA, "b"]) tm.assert_series_equal(result, expected) + def test_inference_on_pandas_objects(self): + # GH#56012 + ser = Series([Timestamp("2019-12-31")], dtype=object) + with tm.assert_produces_warning(None): + # This doesn't do inference + result = Series(ser) + assert result.dtype == np.object_ + + idx = Index([Timestamp("2019-12-31")], 
dtype=object) + + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = Series(idx) + assert result.dtype != np.object_ + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index 284932491a65e..c1e7ad6e02779 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -98,14 +98,18 @@ def test_str_cat_categorical( with option_context("future.infer_string", infer_string): s = Index(["a", "a", "b", "a"], dtype=dtype_caller) - s = s if box == Index else Series(s, index=s) + s = s if box == Index else Series(s, index=s, dtype=s.dtype) t = Index(["b", "a", "b", "c"], dtype=dtype_target) - expected = Index(["ab", "aa", "bb", "ac"]) + expected = Index( + ["ab", "aa", "bb", "ac"], dtype=object if dtype_caller == "object" else None + ) expected = ( expected if box == Index - else Series(expected, index=Index(s, dtype=dtype_caller)) + else Series( + expected, index=Index(s, dtype=dtype_caller), dtype=expected.dtype + ) ) # Series/Index with unaligned Index -> t.values @@ -123,12 +127,19 @@ def test_str_cat_categorical( # Series/Index with Series having different Index t = Series(t.values, index=t.values) - expected = Index(["aa", "aa", "bb", "bb", "aa"]) + expected = Index( + ["aa", "aa", "bb", "bb", "aa"], + dtype=object if dtype_caller == "object" else None, + ) dtype = object if dtype_caller == "object" else s.dtype.categories.dtype expected = ( expected if box == Index - else Series(expected, index=Index(expected.str[:1], dtype=dtype)) + else Series( + expected, + index=Index(expected.str[:1], dtype=dtype), + dtype=expected.dtype, + ) ) result = s.str.cat(t, sep=sep) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 45741e852fef7..99a504f4188c1 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -23,6 +23,7 @@ date_range, period_range, ) +import pandas._testing as tm from pandas.core.arrays import ( DatetimeArray, TimedeltaArray, @@ -206,7 +207,8 @@ def test_infer_freq_custom(base_delta_code_pair, constructor): ) def test_infer_freq_index(freq, expected): rng = period_range("1959Q2", "2009Q3", freq=freq) - rng = Index(rng.to_timestamp("D", how="e").astype(object)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + rng = Index(rng.to_timestamp("D", how="e").astype(object)) assert rng.inferred_freq == expected From 64c20dcaf5f6d1599d5a4c9aa9123d3bdc56e6d1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 21 Dec 2023 23:45:36 +0100 Subject: [PATCH 13/31] BUG: __eq__ raising for new arrow string dtype for incompatible objects (#56245) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/string_arrow.py | 6 +++++- pandas/tests/series/test_logical_ops.py | 16 ++++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index c8b9b3c7ede1b..c8595dd6d299a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -691,6 +691,7 @@ Strings - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` 
with ``pyarrow.string`` dtype (:issue:`56579`)
 - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
+- Bug in comparison operations for ``dtype="string[pyarrow_numpy]"`` raising if dtypes can't be compared (:issue:`56008`)

 Interval
 ^^^^^^^^
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 56732619a2d29..d5a76811a12e6 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -41,6 +41,7 @@
     BaseStringArray,
     StringDtype,
 )
+from pandas.core.ops import invalid_comparison
 from pandas.core.strings.object_array import ObjectStringArrayMixin

 if not pa_version_under10p1:
@@ -676,7 +677,10 @@ def _convert_int_dtype(self, result):
         return result

     def _cmp_method(self, other, op):
-        result = super()._cmp_method(other, op)
+        try:
+            result = super()._cmp_method(other, op)
+        except pa.ArrowNotImplementedError:
+            return invalid_comparison(self, other, op)
         if op == operator.ne:
             return result.to_numpy(np.bool_, na_value=True)
         else:
diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py
index 153b4bfaaf444..d9c94e871bd4b 100644
--- a/pandas/tests/series/test_logical_ops.py
+++ b/pandas/tests/series/test_logical_ops.py
@@ -530,3 +530,19 @@ def test_int_dtype_different_index_not_bool(self):

         result = ser1 ^ ser2
         tm.assert_series_equal(result, expected)
+
+    def test_pyarrow_numpy_string_invalid(self):
+        # GH#56008
+        pytest.importorskip("pyarrow")
+        ser = Series([False, True])
+        ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]")
+        result = ser == ser2
+        expected = Series(False, index=ser.index)
+        tm.assert_series_equal(result, expected)
+
+        result = ser != ser2
+        expected = Series(True, index=ser.index)
+        tm.assert_series_equal(result, expected)
+
+        with pytest.raises(TypeError, match="Invalid comparison"):
+            ser > ser2

From 69ee05fd3d07eea2894583fe48c1f4bf230a5059 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 21 Dec 2023 23:46:07 +0100
Subject: [PATCH 14/31] DOC: add deprecation of chained assignment to 2.2
 whatsnew (#56403)

---
 doc/source/whatsnew/v2.2.0.rst | 64 ++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index c8595dd6d299a..535704a27169c 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -451,6 +451,70 @@ Other API changes

 Deprecations
 ~~~~~~~~~~~~

+Chained assignment
+^^^^^^^^^^^^^^^^^^
+
+In preparation for larger upcoming changes to the copy / view behaviour in pandas 3.0
+(:ref:`copy_on_write`, PDEP-7), we have started deprecating *chained assignment*.
+
+Chained assignment occurs when you try to update a pandas DataFrame or Series through
+two subsequent indexing operations. Depending on the type and order of those operations,
+this currently may or may not work.
+
+A typical example is as follows:
+
+.. code-block:: python
+
+    df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+
+    # first selecting rows with a mask, then assigning values to a column
+    # -> this has never worked and raises a SettingWithCopyWarning
+    df[df["bar"] > 5]["foo"] = 100
+
+    # first selecting the column, and then assigning to a subset of that column
+    # -> this currently works
+    df["foo"][df["bar"] > 5] = 100
+
+This second example of chained assignment currently works to update the original ``df``.
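+
+To illustrate why this second pattern is being deprecated, here is a minimal
+sketch (assuming pandas 2.2 with the Copy-on-Write option from the section
+above enabled) of how the same statement behaves once Copy-on-Write is active:
+
+.. code-block:: python
+
+    pd.options.mode.copy_on_write = True
+    df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+    # under Copy-on-Write, df["foo"] behaves as a copy of the column,
+    # so the chained write is lost and df stays unchanged
+    df["foo"][df["bar"] > 5] = 100
+    df["foo"]  # still [1, 2, 3]
+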
+This will no longer work in pandas 3.0, and therefore we have started deprecating it:
+
+.. code-block:: python
+
+    >>> df["foo"][df["bar"] > 5] = 100
+    FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0!
+    You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
+    A typical example is when you are setting values in a column of a DataFrame, like:
+
+    df["col"][row_indexer] = value
+
+    Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.
+
+    See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
+
+You can fix this warning and ensure your code is ready for pandas 3.0 by removing
+the usage of chained assignment. Typically, this can be done by performing the
+assignment in a single step, for example with ``.loc``. For the example above, we can do:
+
+.. code-block:: python
+
+    df.loc[df["bar"] > 5, "foo"] = 100
+
+The same deprecation applies to inplace methods that are called in a chained manner, such as:
+
+.. code-block:: python
+
+    >>> df["foo"].fillna(0, inplace=True)
+    FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
+    The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
+
+    For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
+
+When the goal is to update the column in the DataFrame ``df``, the alternative here is
+to call the method on ``df`` itself, such as ``df.fillna({"foo": 0}, inplace=True)``.
+
+See more details in the :ref:`migration guide <copy_on_write.migration_guide>`.
+
+
 Deprecate aliases ``M``, ``Q``, ``Y``, etc. in favour of ``ME``, ``QE``, ``YE``, etc. for offsets
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

From 0d8a0f3ac7b5b841fb9b38dad17e6a91f0c48afc Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Fri, 22 Dec 2023 01:07:36 +0100
Subject: [PATCH 15/31] DOC: Move deprecation note (#56593)

---
 doc/source/whatsnew/v2.2.0.rst | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 535704a27169c..d1481639ca5a0 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -571,6 +571,9 @@ These methods are:
 - :meth:`DataFrame.fillna`, :meth:`Series.fillna`
 - :meth:`DataFrame.ffill`, :meth:`Series.ffill`
 - :meth:`DataFrame.bfill`, :meth:`Series.bfill`
+- :meth:`DataFrame.mask`, :meth:`Series.mask`
+- :meth:`DataFrame.where`, :meth:`Series.where`
+- :meth:`DataFrame.clip`, :meth:`Series.clip`

 Explicitly call :meth:`DataFrame.infer_objects` to replicate the current behavior in the future.
@@ -578,6 +581,8 @@ Explicitly call :meth:`DataFrame.infer_objects` to replicate the current behavio

     result = result.infer_objects(copy=False)

+Or explicitly cast all-round floats to ints using ``astype``.
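+
+For example (a minimal sketch; ``"int64"`` is an assumed target dtype for
+illustration):
+
+.. code-block:: ipython
+
+    result = result.astype("int64")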
+ Set the following option to opt into the future behavior: .. code-block:: ipython @@ -618,7 +623,6 @@ Other Deprecations - Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`) - Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) - Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) -- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) - Deprecated dtype inference in :class:`Index`, :class:`Series` and :class:`DataFrame` constructors when giving a pandas input, call ``.infer_objects`` on the input to keep the current behavior (:issue:`56012`) - Deprecated dtype inference when setting a :class:`Index` into a :class:`DataFrame`, cast explicitly instead (:issue:`56102`) - Deprecated including the groups in computations when using :meth:`.DataFrameGroupBy.apply` and :meth:`.DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) From e0e47e8d37d4fc7c2e7634c5f9ca0ab7e44c1291 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 21 Dec 2023 14:35:01 -1000 Subject: [PATCH 16/31] DEPS: Test NEP 50 (#55739) * DEPS: Test NEP 50 * Use Python floats in test_maybe_promote_float_with_float * Refactor test_to_html_multiindex to allow tests to collect * Supress deprecationwarning for now * Use old invocation * Use Python ints in _range.py functions * Address test_constructor * Fix test_constructor_coercion_signed_to_unsigned * Fix test_constructor_coercion_signed_to_unsigned * Cast numpy scalars as python scalars before arith ops * add xfail reason to TestCoercionFloat32 * only set promotion state for numpy > 2.0 * order was backwards * Version promotion state call * fix timedelta tests * go for green * fix non npdev too? 
* fixes * adjust xfail condition * go for green * add tests * add negative numbers test * updates * fix accidental changes * more * simplify * linter --------- Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 3 +- ci/run_tests.sh | 3 +- pandas/_libs/tslibs/timedeltas.pyx | 12 +++++ pandas/core/arrays/_ranges.py | 14 +++--- pandas/core/dtypes/cast.py | 39 +++++++++++++-- pandas/core/ops/array_ops.py | 8 ++++ pandas/tests/dtypes/cast/test_promote.py | 16 +++---- pandas/tests/dtypes/test_inference.py | 50 ++++++++++++++++++++ pandas/tests/indexes/numeric/test_numeric.py | 16 +++++-- pandas/tests/indexing/test_coercion.py | 7 +++ pandas/tests/indexing/test_loc.py | 11 ++++- pandas/tests/io/formats/test_to_html.py | 6 +-- pandas/tests/series/indexing/test_setitem.py | 5 ++ pandas/tests/series/test_constructors.py | 20 ++++++-- pyproject.toml | 6 +-- 15 files changed, 177 insertions(+), 39 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 57f9893d36044..6ca4d19196874 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -92,7 +92,7 @@ jobs: - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" - test_args: "-W error::DeprecationWarning -W error::FutureWarning" + test_args: "-W error::FutureWarning" - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" @@ -115,6 +115,7 @@ jobs: TEST_ARGS: ${{ matrix.test_args || '' }} PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }} PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} + NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }} # Clipboard tests QT_QPA_PLATFORM: offscreen concurrency: diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 6a70ea1df3e71..48ef21686a26f 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -10,8 +10,7 @@ echo PYTHONHASHSEED=$PYTHONHASHSEED COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -# TODO: Support NEP 50 and remove NPY_PROMOTION_STATE -PYTEST_CMD="NPY_PROMOTION_STATE=legacy MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 475b85fa64800..f6c69cf6d3875 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -2060,6 +2060,12 @@ class Timedelta(_Timedelta): # integers or floats if util.is_nan(other): return NaT + # We want NumPy numeric scalars to behave like Python scalars + # post NEP 50 + if isinstance(other, cnp.integer): + other = int(other) + if isinstance(other, cnp.floating): + other = float(other) return Timedelta._from_value_and_reso( (self._value/ other), self._creso ) @@ -2114,6 +2120,12 @@ class Timedelta(_Timedelta): elif is_integer_object(other) or is_float_object(other): if util.is_nan(other): return NaT + # We want NumPy numeric scalars to behave like Python scalars + # post NEP 50 + if isinstance(other, cnp.integer): + other = int(other) + if isinstance(other, cnp.floating): + other = 
float(other) return type(self)._from_value_and_reso(self._value// other, self._creso) elif is_array(other): diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 6a1ef0800385d..3e89391324ad4 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -54,8 +54,8 @@ def generate_regular_range( iend = end._value if end is not None else None freq.nanos # raises if non-fixed frequency td = Timedelta(freq) - b: int | np.int64 | np.uint64 - e: int | np.int64 | np.uint64 + b: int + e: int try: td = td.as_unit(unit, round_ok=False) except ValueError as err: @@ -96,7 +96,7 @@ def generate_regular_range( def _generate_range_overflow_safe( endpoint: int, periods: int, stride: int, side: str = "start" -) -> np.int64 | np.uint64: +) -> int: """ Calculate the second endpoint for passing to np.arange, checking to avoid an integer overflow. Catch OverflowError and re-raise @@ -115,7 +115,7 @@ def _generate_range_overflow_safe( Returns ------- - other_end : np.int64 | np.uint64 + other_end : int Raises ------ @@ -163,7 +163,7 @@ def _generate_range_overflow_safe( def _generate_range_overflow_safe_signed( endpoint: int, periods: int, stride: int, side: str -) -> np.int64 | np.uint64: +) -> int: """ A special case for _generate_range_overflow_safe where `periods * stride` can be calculated without overflowing int64 bounds. @@ -181,7 +181,7 @@ def _generate_range_overflow_safe_signed( # Putting this into a DatetimeArray/TimedeltaArray # would incorrectly be interpreted as NaT raise OverflowError - return result + return int(result) except (FloatingPointError, OverflowError): # with endpoint negative and addend positive we risk # FloatingPointError; with reversed signed we risk OverflowError @@ -200,7 +200,7 @@ def _generate_range_overflow_safe_signed( i64max = np.uint64(i8max) assert uresult > i64max if uresult <= i64max + np.uint64(stride): - return uresult + return int(uresult) raise OutOfBoundsDatetime( f"Cannot generate range with {side}={endpoint} and periods={periods}" diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d5144174d3c71..7a088bf84c48e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -39,6 +39,7 @@ is_supported_dtype, ) from pandas._libs.tslibs.timedeltas import array_to_timedelta64 +from pandas.compat.numpy import np_version_gt2 from pandas.errors import ( IntCastingNaNError, LossySetitemError, @@ -1314,6 +1315,30 @@ def find_result_type(left_dtype: DtypeObj, right: Any) -> DtypeObj: # which will make us upcast too far. if lib.is_float(right) and right.is_integer() and left_dtype.kind != "f": right = int(right) + # After NEP 50, numpy won't inspect Python scalars + # TODO: do we need to recreate numpy's inspection logic for floats too + # (this breaks some tests) + if isinstance(right, int) and not isinstance(right, np.integer): + # This gives an unsigned type by default + # (if our number is positive) + + # If our left dtype is signed, we might not want this since + # this might give us 1 dtype too big + # We should check if the corresponding int dtype (e.g. 
int64 for uint64) + # can hold the number + right_dtype = np.min_scalar_type(right) + if right == 0: + # Special case 0 + right = left_dtype + elif ( + not np.issubdtype(left_dtype, np.unsignedinteger) + and 0 < right <= 2 ** (8 * right_dtype.itemsize - 1) - 1 + ): + # If left dtype isn't unsigned, check if it fits in the signed dtype + right = np.dtype(f"i{right_dtype.itemsize}") + else: + right = right_dtype + new_dtype = np.result_type(left_dtype, right) elif is_valid_na_for_dtype(right, left_dtype): @@ -1619,11 +1644,13 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n with warnings.catch_warnings(): # We already disallow dtype=uint w/ negative numbers # (test_constructor_coercion_signed_to_unsigned) so safe to ignore. - warnings.filterwarnings( - "ignore", - "NumPy will stop allowing conversion of out-of-bound Python int", - DeprecationWarning, - ) + if not np_version_gt2: + warnings.filterwarnings( + "ignore", + "NumPy will stop allowing conversion of " + "out-of-bound Python int", + DeprecationWarning, + ) casted = np.array(arr, dtype=dtype, copy=False) else: with warnings.catch_warnings(): @@ -1660,6 +1687,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n raise ValueError(f"string values cannot be losslessly cast to {dtype}") if dtype.kind == "u" and (arr < 0).any(): + # TODO: can this be hit anymore after numpy 2.0? raise OverflowError("Trying to coerce negative values to unsigned integers") if arr.dtype.kind == "f": @@ -1672,6 +1700,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n raise ValueError("Trying to coerce float values to integers") if casted.dtype < arr.dtype: + # TODO: Can this path be hit anymore with numpy > 2 # GH#41734 e.g. [1, 200, 923442] and dtype="int8" -> overflows raise ValueError( f"Values are too large to be losslessly converted to {dtype}. 
" diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index ee3f8787d78b5..4b762a359d321 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -570,6 +570,14 @@ def maybe_prepare_scalar_for_op(obj, shape: Shape): # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') return Timedelta(obj) + # We want NumPy numeric scalars to behave like Python scalars + # post NEP 50 + elif isinstance(obj, np.integer): + return int(obj) + + elif isinstance(obj, np.floating): + return float(obj) + return obj diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 1becf3b9843b7..021107724bef7 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -229,24 +229,24 @@ def test_maybe_promote_float_with_int(float_numpy_dtype, any_int_numpy_dtype): [ # float filled with float ("float32", 1, "float32"), - ("float32", np.finfo("float32").max * 1.1, "float64"), + ("float32", float(np.finfo("float32").max) * 1.1, "float64"), ("float64", 1, "float64"), - ("float64", np.finfo("float32").max * 1.1, "float64"), + ("float64", float(np.finfo("float32").max) * 1.1, "float64"), # complex filled with float ("complex64", 1, "complex64"), - ("complex64", np.finfo("float32").max * 1.1, "complex128"), + ("complex64", float(np.finfo("float32").max) * 1.1, "complex128"), ("complex128", 1, "complex128"), - ("complex128", np.finfo("float32").max * 1.1, "complex128"), + ("complex128", float(np.finfo("float32").max) * 1.1, "complex128"), # float filled with complex ("float32", 1 + 1j, "complex64"), - ("float32", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ("float32", float(np.finfo("float32").max) * (1.1 + 1j), "complex128"), ("float64", 1 + 1j, "complex128"), - ("float64", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ("float64", float(np.finfo("float32").max) * (1.1 + 1j), "complex128"), # complex filled with complex ("complex64", 1 + 1j, "complex64"), - ("complex64", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ("complex64", float(np.finfo("float32").max) * (1.1 + 1j), "complex128"), ("complex128", 1 + 1j, "complex128"), - ("complex128", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ("complex128", float(np.finfo("float32").max) * (1.1 + 1j), "complex128"), ], ) def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index ff2cfc1278331..49eb06c299886 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -33,8 +33,10 @@ missing as libmissing, ops as libops, ) +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes import inference +from pandas.core.dtypes.cast import find_result_type from pandas.core.dtypes.common import ( ensure_int32, is_bool, @@ -1995,3 +1997,51 @@ def test_ensure_int32(): values = np.arange(10, dtype=np.int64) result = ensure_int32(values) assert result.dtype == np.int32 + + +@pytest.mark.parametrize( + "right,result", + [ + (0, np.uint8), + (-1, np.int16), + (300, np.uint16), + # For floats, we just upcast directly to float64 instead of trying to + # find a smaller floating dtype + (300.0, np.uint16), # for integer floats, we convert them to ints + (300.1, np.float64), + (np.int16(300), np.int16 if np_version_gt2 else np.uint16), + ], +) +def test_find_result_type_uint_int(right, result): + left_dtype = np.dtype("uint8") + assert find_result_type(left_dtype, right) 
== result + + +@pytest.mark.parametrize( + "right,result", + [ + (0, np.int8), + (-1, np.int8), + (300, np.int16), + # For floats, we just upcast directly to float64 instead of trying to + # find a smaller floating dtype + (300.0, np.int16), # for integer floats, we convert them to ints + (300.1, np.float64), + (np.int16(300), np.int16), + ], +) +def test_find_result_type_int_int(right, result): + left_dtype = np.dtype("int8") + assert find_result_type(left_dtype, right) == result + + +@pytest.mark.parametrize( + "right,result", + [ + (300.0, np.float64), + (np.float32(300), np.float32), + ], +) +def test_find_result_type_floats(right, result): + left_dtype = np.dtype("float16") + assert find_result_type(left_dtype, right) == result diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 7ce55db6c0bbc..4fd807e1827dd 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -354,11 +354,13 @@ def test_constructor(self, dtype): arr = index.values.copy() new_index = index_cls(arr, copy=True) tm.assert_index_equal(new_index, index, exact=True) - val = arr[0] + 3000 + val = int(arr[0]) + 3000 # this should not change index - arr[0] = val - assert new_index[0] != val + if dtype != np.int8: + # NEP 50 won't allow assignment that would overflow + arr[0] = val + assert new_index[0] != val if dtype == np.int64: # pass list, coerce fine @@ -407,8 +409,12 @@ def test_constructor_coercion_signed_to_unsigned( any_unsigned_int_numpy_dtype, ): # see gh-15832 - msg = "Trying to coerce negative values to unsigned integers" - + msg = "|".join( + [ + "Trying to coerce negative values to unsigned integers", + "The elements provided in the data cannot all be casted", + ] + ) with pytest.raises(OverflowError, match=msg): Index([-1], dtype=any_unsigned_int_numpy_dtype) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 45a9c207f0acc..0e32399b131c3 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -15,6 +15,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas as pd import pandas._testing as tm @@ -226,6 +227,8 @@ def test_insert_int_index( "insert, coerced_val, coerced_dtype", [ (1, 1.0, None), + # When float_numpy_dtype=float32, this is not the case + # see the correction below (1.1, 1.1, np.float64), (False, False, object), # GH#36319 ("x", "x", object), @@ -238,6 +241,10 @@ def test_insert_float_index( obj = pd.Index([1.0, 2.0, 3.0, 4.0], dtype=dtype) coerced_dtype = coerced_dtype if coerced_dtype is not None else dtype + if np_version_gt2 and dtype == "float32" and coerced_val == 1.1: + # Hack, in the 2nd test case, since 1.1 can be losslessly cast to float32 + # the expected dtype will be float32 if the original dtype was float32 + coerced_dtype = np.float32 exp = pd.Index([1.0, coerced_val, 2.0, 3.0, 4.0], dtype=coerced_dtype) self._assert_insert_conversion(obj, insert, exp, coerced_dtype) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index ce7dde3c4cb42..fb0adc56c401b 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -15,6 +15,7 @@ from pandas._config import using_pyarrow_string_dtype from pandas._libs import index as libindex +from pandas.compat.numpy import np_version_gt2 from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -3020,7 +3021,15 @@ def 
test_loc_setitem_uint8_upcast(value): with tm.assert_produces_warning(FutureWarning, match="item of incompatible dtype"): df.loc[2, "col1"] = value # value that can't be held in uint8 - expected = DataFrame([1, 2, 300, 4], columns=["col1"], dtype="uint16") + if np_version_gt2 and isinstance(value, np.int16): + # Note, result type of uint8 + int16 is int16 + # in numpy < 2, though, numpy would inspect the + # value and see that it could fit in an uint16, resulting in a uint16 + dtype = "int16" + else: + dtype = "uint16" + + expected = DataFrame([1, 2, 300, 4], columns=["col1"], dtype=dtype) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 0a30039815485..790ba92f70c40 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -419,15 +419,15 @@ def test_to_html_columns_arg(float_frame): "columns,justify,expected", [ ( - MultiIndex.from_tuples( - list(zip(np.arange(2).repeat(2), np.mod(range(4), 2))), + MultiIndex.from_arrays( + [np.arange(2).repeat(2), np.mod(range(4), 2)], names=["CL0", "CL1"], ), "left", "multiindex_1", ), ( - MultiIndex.from_tuples(list(zip(range(4), np.mod(range(4), 2)))), + MultiIndex.from_arrays([np.arange(4), np.mod(range(4), 2)]), "right", "multiindex_2", ), diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index e583d55101a8b..23137f0975fb1 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -7,6 +7,7 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import IndexingError from pandas.core.dtypes.common import is_list_like @@ -1440,6 +1441,10 @@ def obj(self): np.float32, None, marks=pytest.mark.xfail( + ( + not np_version_gte1p24 + or (np_version_gte1p24 and np._get_promotion_state() != "weak") + ), reason="np.float32(1.1) ends up as 1.100000023841858, so " "np_can_hold_element raises and we cast to float64", ), diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 5f591b4b22f1c..da069afe5e709 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -14,6 +14,7 @@ iNaT, lib, ) +from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -773,11 +774,16 @@ def test_constructor_cast(self): def test_constructor_signed_int_overflow_raises(self): # GH#41734 disallow silent overflow, enforced in 2.0 - msg = "Values are too large to be losslessly converted" - with pytest.raises(ValueError, match=msg): + if np_version_gt2: + msg = "The elements provided in the data cannot all be casted to the dtype" + err = OverflowError + else: + msg = "Values are too large to be losslessly converted" + err = ValueError + with pytest.raises(err, match=msg): Series([1, 200, 923442], dtype="int8") - with pytest.raises(ValueError, match=msg): + with pytest.raises(err, match=msg): Series([1, 200, 923442], dtype="uint8") @pytest.mark.parametrize( @@ -801,7 +807,13 @@ def test_constructor_numpy_uints(self, values): def test_constructor_unsigned_dtype_overflow(self, any_unsigned_int_numpy_dtype): # see gh-15832 - msg = "Trying to coerce negative values to unsigned integers" + if np_version_gt2: + msg = ( + f"The elements provided in the data cannot " + f"all be casted to the dtype {any_unsigned_int_numpy_dtype}" + ) + else: + msg 
= "Trying to coerce negative values to unsigned integers" with pytest.raises(OverflowError, match=msg): Series([-1], dtype=any_unsigned_int_numpy_dtype) diff --git a/pyproject.toml b/pyproject.toml index ca19f463edf40..d52c2f393f909 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,9 +30,9 @@ authors = [ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ - "numpy>=1.22.4,<2; python_version<'3.11'", - "numpy>=1.23.2,<2; python_version=='3.11'", - "numpy>=1.26.0,<2; python_version>='3.12'", + "numpy>=1.22.4; python_version<'3.11'", + "numpy>=1.23.2; python_version=='3.11'", + "numpy>=1.26.0; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.7" From 7b0b7555d2869753c70f17444feca917c4e9d707 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 22 Dec 2023 07:24:08 -0800 Subject: [PATCH 17/31] BLD: Add wheel builds for musllinux on aarch64 (#56590) * BLD: Add wheel builds for musllinux on aarch64 * remove skip * Update config.yml * Update config.yml * Update config.yml * syntax * change img * another typo * Update config.yml * reorder * correct line separator --- .circleci/config.yml | 39 ++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 +- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1c70debca0caf..90afb1ce29684 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -18,6 +18,29 @@ jobs: PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD ci/run_tests.sh + linux-musl: + docker: + - image: quay.io/pypa/musllinux_1_1_aarch64 + resource_class: arm.large + steps: + # Install pkgs first to have git in the image + # (needed for checkout) + - run: | + apk update + apk add git + apk add musl-locales + - checkout + - run: | + /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + . ~/virtualenvs/pandas-dev/bin/activate + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" + python -m pip list --no-cache-dir + - run: | + . 
~/virtualenvs/pandas-dev/bin/activate
+          export PANDAS_CI=1
+          python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
   build-aarch64:
     parameters:
       cibw-build:
@@ -89,6 +112,13 @@ workflows:
             equal: [ scheduled_pipeline, << pipeline.trigger_source >> ]
     jobs:
       - test-arm
+  test-musl:
+    # Don't trigger this one when scheduled pipeline runs
+    when:
+      not:
+        equal: [ scheduled_pipeline, << pipeline.trigger_source >> ]
+    jobs:
+      - linux-musl
   build-wheels:
     jobs:
       - build-aarch64:
@@ -97,4 +127,11 @@ workflows:
               only: /^v.*/
           matrix:
             parameters:
-              cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64"]
+              cibw-build: ["cp39-manylinux_aarch64",
+                           "cp310-manylinux_aarch64",
+                           "cp311-manylinux_aarch64",
+                           "cp312-manylinux_aarch64",
+                           "cp39-musllinux_aarch64",
+                           "cp310-musllinux_aarch64",
+                           "cp311-musllinux_aarch64",
+                           "cp312-musllinux_aarch64",]
diff --git a/pyproject.toml b/pyproject.toml
index d52c2f393f909..5e65edf81f9c7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -152,7 +152,7 @@ parentdir_prefix = "pandas-"
 setup = ['--vsenv'] # For Windows

 [tool.cibuildwheel]
-skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x *-musllinux_aarch64"
+skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x"
 build-verbosity = "3"
 environment = {LDFLAGS="-Wl,--strip-all"}
 test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0"

From 9b49a0a3c871695f7a8e7b9ed6947bc917157f84 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 22 Dec 2023 05:34:40 -1000
Subject: [PATCH 18/31] DOC: Update pydata-sphinx-theme to 0.14 (#55885)

* DEPS: Unpin pydata-sphinx-theme

* Pin to 0.14

* update switcher_version

* Add preferred version

---------

Co-authored-by: Joris Van den Bossche
Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com>
---
 doc/source/conf.py       | 6 +++++-
 environment.yml          | 2 +-
 requirements-dev.txt     | 2 +-
 web/pandas/versions.json | 3 ++-
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index e7ce8511b76a1..be6150d4e54ba 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -230,11 +230,13 @@
 # further. For a list of options available for each theme, see the
 # documentation.
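# A minimal sketch of the version-switcher mapping that the hunk below installs
# in conf.py. `_switcher_entry` is a hypothetical helper name, and the sample
# version strings are assumptions, not taken from the patch:
def _switcher_entry(version: str) -> str:
    if ".dev" in version:
        # every development build shares the single "dev" switcher entry
        return "dev"
    elif "rc" in version:
        # release candidates keep their base version, flagged as "(rc)"
        return version.split("rc", maxsplit=1)[0] + " (rc)"
    else:
        # releases keep only major.minor so they match the entries in versions.json
        return ".".join(version.split(".")[:2])

assert _switcher_entry("2.2.0") == "2.2"
assert _switcher_entry("3.0.0.dev0+123") == "dev"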
-switcher_version = version if ".dev" in version: switcher_version = "dev" elif "rc" in version: switcher_version = version.split("rc", maxsplit=1)[0] + " (rc)" +else: + # only keep major.minor version number to match versions.json + switcher_version = ".".join(version.split(".")[:2]) html_theme_options = { "external_links": [], @@ -246,11 +248,13 @@ "plausible_analytics_url": "https://views.scientific-python.org/js/script.js", }, "logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"}, + "navbar_align": "left", "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"], "switcher": { "json_url": "https://pandas.pydata.org/versions.json", "version_match": switcher_version, }, + "show_version_warning_banner": True, "icon_links": [ { "name": "Mastodon", diff --git a/environment.yml b/environment.yml index 5fad8b5031b0a..74317d47e2e53 100644 --- a/environment.yml +++ b/environment.yml @@ -86,7 +86,7 @@ dependencies: - google-auth - natsort # DataFrame.sort_values doctest - numpydoc - - pydata-sphinx-theme=0.13 + - pydata-sphinx-theme=0.14 - pytest-cython # doctest - sphinx - sphinx-design diff --git a/requirements-dev.txt b/requirements-dev.txt index 76f4de2d8f0c4..cbfb6336b2e16 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -61,7 +61,7 @@ gitdb google-auth natsort numpydoc -pydata-sphinx-theme==0.13 +pydata-sphinx-theme==0.14 pytest-cython sphinx sphinx-design diff --git a/web/pandas/versions.json b/web/pandas/versions.json index 43efaf8ebe259..e355005c7c937 100644 --- a/web/pandas/versions.json +++ b/web/pandas/versions.json @@ -7,7 +7,8 @@ { "name": "2.1 (stable)", "version": "2.1", - "url": "https://pandas.pydata.org/docs/" + "url": "https://pandas.pydata.org/docs/", + "preferred": true }, { "name": "2.0", From dc37a6d035474c25932d805cfe50a47b04916a83 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 22 Dec 2023 11:14:56 -0800 Subject: [PATCH 19/31] Start 2.3.0 From 157631d97840b7918eec4c8b40bd9c24b25771a7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 23 Dec 2023 19:56:41 +0100 Subject: [PATCH 20/31] CI: Move target for the deprecations bot (#56597) --- .github/workflows/deprecation-tracking-bot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deprecation-tracking-bot.yml b/.github/workflows/deprecation-tracking-bot.yml index ec71daf6f84ab..3d4cab7be09c5 100644 --- a/.github/workflows/deprecation-tracking-bot.yml +++ b/.github/workflows/deprecation-tracking-bot.yml @@ -19,7 +19,7 @@ jobs: issues: write runs-on: ubuntu-22.04 env: - DEPRECATION_TRACKER_ISSUE: 50578 + DEPRECATION_TRACKER_ISSUE: 56596 steps: - uses: actions/github-script@v7 id: update-deprecation-issue From b0c2e45997e8f164a181ce8e896dfb414e3eb60c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 24 Dec 2023 04:19:14 -1000 Subject: [PATCH 21/31] TST/CLN: Inline seldom used fixture (#56595) --- pandas/tests/arrays/categorical/conftest.py | 9 --------- pandas/tests/arrays/categorical/test_api.py | 3 ++- pandas/tests/arrays/categorical/test_indexing.py | 6 ++++-- pandas/tests/arrays/categorical/test_operators.py | 3 ++- pandas/tests/arrays/categorical/test_repr.py | 3 ++- pandas/tests/indexes/datetimes/test_ops.py | 3 +-- pandas/tests/tseries/offsets/conftest.py | 13 ------------- pandas/tests/tseries/offsets/test_common.py | 3 ++- 8 files changed, 13 insertions(+), 30 deletions(-) 
delete mode 100644 pandas/tests/arrays/categorical/conftest.py delete mode 100644 pandas/tests/tseries/offsets/conftest.py diff --git a/pandas/tests/arrays/categorical/conftest.py b/pandas/tests/arrays/categorical/conftest.py deleted file mode 100644 index 37249210f28f4..0000000000000 --- a/pandas/tests/arrays/categorical/conftest.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest - -from pandas import Categorical - - -@pytest.fixture -def factor(): - """Fixture returning a Categorical object""" - return Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index b4215b4a6fe21..a939ee5f6f53f 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -385,7 +385,8 @@ def test_remove_unused_categories(self): class TestCategoricalAPIWithFactor: - def test_describe(self, factor): + def test_describe(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) # string type desc = factor.describe() assert factor.ordered diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 3377c411a7084..5e1c5c64fa660 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -21,7 +21,8 @@ class TestCategoricalIndexingWithFactor: - def test_getitem(self, factor): + def test_getitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) assert factor[0] == "a" assert factor[-1] == "c" @@ -31,7 +32,8 @@ def test_getitem(self, factor): subf = factor[np.asarray(factor) == "c"] tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8)) - def test_setitem(self, factor): + def test_setitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) # int/positional c = factor.copy() c[0] = "b" diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 16b941eab4830..4174d2adc810b 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -17,7 +17,8 @@ def test_categories_none_comparisons(self): factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) tm.assert_categorical_equal(factor, factor) - def test_comparisons(self, factor): + def test_comparisons(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) result = factor[factor == "a"] expected = factor[np.asarray(factor) == "a"] tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index d6f93fbbd912f..ef0315130215c 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -17,7 +17,8 @@ class TestCategoricalReprWithFactor: - def test_print(self, factor, using_infer_string): + def test_print(self, using_infer_string): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) if using_infer_string: expected = [ "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 5db0aa5cf510f..bac9548b932c1 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -10,8 +10,6 @@ ) import pandas._testing as tm 
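# A minimal sketch of the inlining pattern this commit applies, using a
# hypothetical test module; the Categorical literal mirrors the deleted
# conftest.py fixture above, and the same pattern moves the START/END
# constants below into the one fixture that needs them:
import pytest

from pandas import Categorical


@pytest.fixture
def factor():
    # before: one shared fixture in conftest.py, looked up across files
    return Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)


def test_getitem_inlined():
    # after: the object is built where it is used, so the test is self-contained
    factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
    assert factor[0] == "a"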
-START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) - class TestDatetimeIndexOps: def test_infer_freq(self, freq_sample): @@ -26,6 +24,7 @@ def test_infer_freq(self, freq_sample): class TestBusinessDatetimeIndex: @pytest.fixture def rng(self, freq): + START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) return bdate_range(START, END, freq=freq) def test_comparison(self, rng): diff --git a/pandas/tests/tseries/offsets/conftest.py b/pandas/tests/tseries/offsets/conftest.py deleted file mode 100644 index 2fc846353dcb5..0000000000000 --- a/pandas/tests/tseries/offsets/conftest.py +++ /dev/null @@ -1,13 +0,0 @@ -import datetime - -import pytest - -from pandas._libs.tslibs import Timestamp - - -@pytest.fixture -def dt(): - """ - Fixture for common Timestamp. - """ - return Timestamp(datetime.datetime(2008, 1, 2)) diff --git a/pandas/tests/tseries/offsets/test_common.py b/pandas/tests/tseries/offsets/test_common.py index 5b80b8b1c4ab4..aa4e22f71ad66 100644 --- a/pandas/tests/tseries/offsets/test_common.py +++ b/pandas/tests/tseries/offsets/test_common.py @@ -250,7 +250,8 @@ def test_sub(date, offset_box, offset2): [BusinessHour, BusinessHour()], ], ) -def test_Mult1(offset_box, offset1, dt): +def test_Mult1(offset_box, offset1): + dt = Timestamp(2008, 1, 2) assert dt + 10 * offset1 == dt + offset_box(10) assert dt + 5 * offset1 == dt + offset_box(5) From 58b1d12ee975ec6ad63ad8ec5ce3434d6ea7163e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 25 Dec 2023 21:35:20 +0100 Subject: [PATCH 22/31] CI: Fix deprecation warnings (#56615) --- pandas/tests/io/parser/common/test_chunksize.py | 5 +++-- pandas/tests/io/parser/common/test_read_errors.py | 2 +- pandas/tests/io/test_parquet.py | 4 +--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 5e47bcc1c5b0e..9660b283a491b 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -223,7 +223,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): warn = None if parser.engine == "pyarrow": warn = DeprecationWarning - depr_msg = "Passing a BlockManager to DataFrame" + depr_msg = "Passing a BlockManager to DataFrame|make_block is deprecated" with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): with monkeypatch.context() as m: m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) @@ -254,7 +254,8 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): if parser.engine == "pyarrow": df = parser.read_csv_check_warnings( DeprecationWarning, - "Passing a BlockManager to DataFrame is deprecated", + "Passing a BlockManager to DataFrame is deprecated|" + "make_block is deprecated", buf, check_stacklevel=False, ) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 4a4ae2b259289..db8b586d22fc0 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -171,7 +171,7 @@ def test_suppress_error_output(all_parsers): warn = None if parser.engine == "pyarrow": warn = DeprecationWarning - msg = "Passing a BlockManager to DataFrame" + msg = "Passing a BlockManager to DataFrame|make_block is deprecated" with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): result = parser.read_csv(StringIO(data), on_bad_lines="skip") diff --git 
a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ad7cdad363e78..e4b94177eedb2 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1000,9 +1000,7 @@ def test_filter_row_groups(self, pa): df = pd.DataFrame({"a": list(range(3))}) with tm.ensure_clean() as path: df.to_parquet(path, engine=pa) - result = read_parquet( - path, pa, filters=[("a", "==", 0)], use_legacy_dataset=False - ) + result = read_parquet(path, pa, filters=[("a", "==", 0)]) assert len(result) == 1 def test_read_parquet_manager(self, pa, using_array_manager): From f1de9c74bf49249e4890d768569729d157f5ae11 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 26 Dec 2023 20:35:01 +0100 Subject: [PATCH 23/31] DEPR: Remove array manager branches from tests (#56621) --- pandas/conftest.py | 8 -- pandas/tests/apply/test_frame_apply.py | 4 +- pandas/tests/arithmetic/test_numeric.py | 8 +- pandas/tests/arithmetic/test_timedelta64.py | 9 +- pandas/tests/copy_view/test_array.py | 12 +-- pandas/tests/copy_view/test_constructors.py | 9 +- pandas/tests/copy_view/test_indexing.py | 90 +++++-------------- pandas/tests/copy_view/test_methods.py | 28 +++--- pandas/tests/copy_view/test_replace.py | 4 +- pandas/tests/extension/base/setitem.py | 6 +- pandas/tests/frame/indexing/test_indexing.py | 14 +-- pandas/tests/frame/indexing/test_insert.py | 9 +- pandas/tests/frame/indexing/test_setitem.py | 17 ++-- pandas/tests/frame/indexing/test_xs.py | 18 +--- pandas/tests/frame/methods/test_equals.py | 4 +- .../tests/frame/methods/test_interpolate.py | 14 +-- pandas/tests/frame/methods/test_quantile.py | 65 +++----------- pandas/tests/frame/methods/test_shift.py | 8 +- .../tests/frame/methods/test_sort_values.py | 5 +- pandas/tests/frame/test_arithmetic.py | 14 +-- pandas/tests/frame/test_constructors.py | 65 ++++---------- pandas/tests/frame/test_nonunique_indexes.py | 7 +- pandas/tests/frame/test_reductions.py | 19 +--- pandas/tests/frame/test_stack_unstack.py | 25 ++---- pandas/tests/groupby/test_groupby.py | 4 +- pandas/tests/groupby/test_reductions.py | 5 +- .../indexing/test_chaining_and_caching.py | 35 ++------ pandas/tests/indexing/test_iloc.py | 23 ++--- pandas/tests/indexing/test_indexing.py | 8 +- pandas/tests/indexing/test_loc.py | 8 +- pandas/tests/indexing/test_partial.py | 4 +- pandas/tests/io/test_parquet.py | 7 +- pandas/tests/reshape/concat/test_append.py | 13 +-- pandas/tests/reshape/concat/test_concat.py | 10 +-- pandas/tests/reshape/concat/test_datetimes.py | 23 ++--- pandas/tests/reshape/merge/test_merge.py | 9 +- pandas/tests/reshape/test_crosstab.py | 5 +- pandas/tests/reshape/test_pivot.py | 10 +-- pandas/tests/reshape/test_pivot_multilevel.py | 6 +- pandas/tests/series/methods/test_reindex.py | 2 +- pandas/tests/series/test_constructors.py | 7 +- pandas/tests/series/test_reductions.py | 17 ++-- 42 files changed, 171 insertions(+), 487 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 983272d79081e..046cda259eefd 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1877,14 +1877,6 @@ def indexer_ial(request): return request.param -@pytest.fixture -def using_array_manager() -> bool: - """ - Fixture to check if the array manager is being used. 
-    """
-    return _get_option("mode.data_manager", silent=True) == "array"
-
-
 @pytest.fixture
 def using_copy_on_write() -> bool:
     """
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index b7eac6b8f0ea1..0839f005305a5 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -1487,7 +1487,7 @@ def test_apply_dtype(col):
     tm.assert_series_equal(result, expected)


-def test_apply_mutating(using_array_manager, using_copy_on_write, warn_copy_on_write):
+def test_apply_mutating(using_copy_on_write, warn_copy_on_write):
     # GH#35462 case where applied func pins a new BlockManager to a row
     df = DataFrame({"a": range(100), "b": range(100, 200)})
     df_orig = df.copy()
@@ -1505,7 +1505,7 @@ def func(row):
     result = df.apply(func, axis=1)
     tm.assert_frame_equal(result, expected)

-    if using_copy_on_write or using_array_manager:
+    if using_copy_on_write:
         # INFO(CoW) With copy on write, mutating a viewing row doesn't mutate the parent
         # INFO(ArrayManager) With BlockManager, the row is a view and mutated in place,
         # with ArrayManager the row is not a view, and thus not mutated in place
diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py
index d8c1786b6b422..ebcd7cbd963d7 100644
--- a/pandas/tests/arithmetic/test_numeric.py
+++ b/pandas/tests/arithmetic/test_numeric.py
@@ -586,16 +586,12 @@ def test_df_div_zero_series_does_not_commute(self):
     # ------------------------------------------------------------------
     # Mod By Zero

-    def test_df_mod_zero_df(self, using_array_manager):
+    def test_df_mod_zero_df(self):
         # GH#3590, modulo as ints
         df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]})

         # this is technically wrong, as the integer portion is coerced to float
         first = Series([0, 0, 0, 0])
-        if not using_array_manager:
-            # INFO(ArrayManager) BlockManager doesn't preserve dtype per column
-            # while ArrayManager performs op column-wise and thus preserves
-            # dtype if possible
-            first = first.astype("float64")
+        first = first.astype("float64")
         second = Series([np.nan, np.nan, np.nan, 0])
         expected = pd.DataFrame({"first": first, "second": second})
         result = df % df
diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py
index 007d1e670e1e0..b2007209dd5b9 100644
--- a/pandas/tests/arithmetic/test_timedelta64.py
+++ b/pandas/tests/arithmetic/test_timedelta64.py
@@ -1736,9 +1736,7 @@ def test_td64_div_object_mixed_result(self, box_with_array):
     # ------------------------------------------------------------------
     # __floordiv__, __rfloordiv__

-    def test_td64arr_floordiv_td64arr_with_nat(
-        self, box_with_array, using_array_manager
-    ):
+    def test_td64arr_floordiv_td64arr_with_nat(self, box_with_array):
         # GH#35529
         box = box_with_array
         xbox = np.ndarray if box is pd.array else box
@@ -1751,11 +1749,6 @@ def test_td64arr_floordiv_td64arr_with_nat(
         expected = np.array([1.0, 1.0, np.nan], dtype=np.float64)
         expected = tm.box_expected(expected, xbox)
-        if box is DataFrame and using_array_manager:
-            # INFO(ArrayManager) floordiv returns integer, and ArrayManager
-            # performs ops column-wise and thus preserves int64 dtype for
-            # columns without missing values
-            expected[[0, 1]] = expected[[0, 1]].astype("int64")

         with tm.maybe_produces_warning(
             RuntimeWarning, box is pd.array, check_stacklevel=False
diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py
index 9a3f83e0293f5..13f42cce4fe69 100644
--- 
a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -48,7 +48,7 @@ def test_series_values(using_copy_on_write, method): [lambda df: df.values, lambda df: np.asarray(df)], ids=["values", "asarray"], ) -def test_dataframe_values(using_copy_on_write, using_array_manager, method): +def test_dataframe_values(using_copy_on_write, method): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() @@ -70,10 +70,7 @@ def test_dataframe_values(using_copy_on_write, using_array_manager, method): else: assert arr.flags.writeable is True arr[0, 0] = 0 - if not using_array_manager: - assert df.iloc[0, 0] == 0 - else: - tm.assert_frame_equal(df, df_orig) + assert df.iloc[0, 0] == 0 def test_series_to_numpy(using_copy_on_write): @@ -157,11 +154,10 @@ def test_dataframe_array_ea_dtypes(using_copy_on_write): assert arr.flags.writeable is True -def test_dataframe_array_string_dtype(using_copy_on_write, using_array_manager): +def test_dataframe_array_string_dtype(using_copy_on_write): df = DataFrame({"a": ["a", "b"]}, dtype="string") arr = np.asarray(df) - if not using_array_manager: - assert np.shares_memory(arr, get_array(df, "a")) + assert np.shares_memory(arr, get_array(df, "a")) if using_copy_on_write: assert arr.flags.writeable is False else: diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 1aa458a625028..c325e49e8156e 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -339,16 +339,11 @@ def test_dataframe_from_dict_of_series_with_dtype(index): @pytest.mark.parametrize("copy", [False, None, True]) -def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager): +def test_frame_from_numpy_array(using_copy_on_write, copy): arr = np.array([[1, 2], [3, 4]]) df = DataFrame(arr, copy=copy) - if ( - using_copy_on_write - and copy is not False - or copy is True - or (using_array_manager and copy is None) - ): + if using_copy_on_write and copy is not False or copy is True: assert not np.shares_memory(get_array(df, 0), arr) else: assert np.shares_memory(get_array(df, 0), arr) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 6f3850ab64daa..9afc98e558c11 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -140,15 +140,11 @@ def test_subset_row_slice(backend, using_copy_on_write, warn_copy_on_write): @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_subset_column_slice( - backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype -): +def test_subset_column_slice(backend, using_copy_on_write, warn_copy_on_write, dtype): # Case: taking a subset of the columns of a DataFrame using a slice # + afterwards modifying the subset dtype_backend, DataFrame, _ = backend - single_block = ( - dtype == "int64" and dtype_backend == "numpy" - ) and not using_array_manager + single_block = dtype == "int64" and dtype_backend == "numpy" df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} ) @@ -176,7 +172,7 @@ def test_subset_column_slice( tm.assert_frame_equal(subset, expected) # original parent dataframe is not modified (also not for BlockManager case, # except for single block) - if not using_copy_on_write and (using_array_manager or single_block): + if not using_copy_on_write and single_block: df_orig.iloc[0, 1] = 0 tm.assert_frame_equal(df, 
df_orig) else: @@ -201,7 +197,6 @@ def test_subset_loc_rows_columns( dtype, row_indexer, column_indexer, - using_array_manager, using_copy_on_write, warn_copy_on_write, ): @@ -224,14 +219,7 @@ def test_subset_loc_rows_columns( mutate_parent = ( isinstance(row_indexer, slice) and isinstance(column_indexer, slice) - and ( - using_array_manager - or ( - dtype == "int64" - and dtype_backend == "numpy" - and not using_copy_on_write - ) - ) + and (dtype == "int64" and dtype_backend == "numpy" and not using_copy_on_write) ) # modifying the subset never modifies the parent @@ -265,7 +253,6 @@ def test_subset_iloc_rows_columns( dtype, row_indexer, column_indexer, - using_array_manager, using_copy_on_write, warn_copy_on_write, ): @@ -288,14 +275,7 @@ def test_subset_iloc_rows_columns( mutate_parent = ( isinstance(row_indexer, slice) and isinstance(column_indexer, slice) - and ( - using_array_manager - or ( - dtype == "int64" - and dtype_backend == "numpy" - and not using_copy_on_write - ) - ) + and (dtype == "int64" and dtype_backend == "numpy" and not using_copy_on_write) ) # modifying the subset never modifies the parent @@ -422,7 +402,7 @@ def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write): "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) def test_subset_set_column_with_loc( - backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype + backend, using_copy_on_write, warn_copy_on_write, dtype ): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value @@ -440,10 +420,7 @@ def test_subset_set_column_with_loc( subset.loc[:, "a"] = np.array([10, 11], dtype="int64") else: with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning( - None, - raise_on_extra_warnings=not using_array_manager, - ): + with tm.assert_produces_warning(None): subset.loc[:, "a"] = np.array([10, 11], dtype="int64") subset._mgr._verify_integrity() @@ -461,9 +438,7 @@ def test_subset_set_column_with_loc( tm.assert_frame_equal(df, df_orig) -def test_subset_set_column_with_loc2( - backend, using_copy_on_write, warn_copy_on_write, using_array_manager -): +def test_subset_set_column_with_loc2(backend, using_copy_on_write, warn_copy_on_write): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value # separate test for case of DataFrame of a single column -> takes a separate @@ -480,10 +455,7 @@ def test_subset_set_column_with_loc2( subset.loc[:, "a"] = 0 else: with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning( - None, - raise_on_extra_warnings=not using_array_manager, - ): + with tm.assert_produces_warning(None): subset.loc[:, "a"] = 0 subset._mgr._verify_integrity() @@ -600,7 +572,6 @@ def test_subset_chained_getitem( method, dtype, using_copy_on_write, - using_array_manager, warn_copy_on_write, ): # Case: creating a subset using multiple, chained getitem calls using views @@ -614,17 +585,10 @@ def test_subset_chained_getitem( # when not using CoW, it depends on whether we have a single block or not # and whether we are slicing the columns -> in that case we have a view test_callspec = request.node.callspec.id - if not using_array_manager: - subset_is_view = test_callspec in ( - "numpy-single-block-column-iloc-slice", - "numpy-single-block-column-loc-slice", - ) - else: - # with ArrayManager, it doesn't matter whether we have - # single vs mixed block or numpy vs nullable dtypes - subset_is_view = test_callspec.endswith( - 
("column-iloc-slice", "column-loc-slice") - ) + subset_is_view = test_callspec in ( + "numpy-single-block-column-iloc-slice", + "numpy-single-block-column-loc-slice", + ) # modify subset -> don't modify parent subset = method(df) @@ -726,9 +690,7 @@ def test_subset_chained_getitem_series( assert subset.iloc[0] == 0 -def test_subset_chained_single_block_row( - using_copy_on_write, using_array_manager, warn_copy_on_write -): +def test_subset_chained_single_block_row(using_copy_on_write, warn_copy_on_write): # not parametrizing this for dtype backend, since this explicitly tests single block df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) df_orig = df.copy() @@ -737,7 +699,7 @@ def test_subset_chained_single_block_row( subset = df[:].iloc[0].iloc[0:2] with tm.assert_cow_warning(warn_copy_on_write): subset.iloc[0] = 0 - if using_copy_on_write or using_array_manager: + if using_copy_on_write: tm.assert_frame_equal(df, df_orig) else: assert df.iloc[0, 0] == 0 @@ -747,7 +709,7 @@ def test_subset_chained_single_block_row( with tm.assert_cow_warning(warn_copy_on_write): df.iloc[0, 0] = 0 expected = Series([1, 4], index=["a", "b"], name=0) - if using_copy_on_write or using_array_manager: + if using_copy_on_write: tm.assert_series_equal(subset, expected) else: assert subset.iloc[0] == 0 @@ -967,9 +929,7 @@ def test_del_series(backend): # Accessing column as Series -def test_column_as_series( - backend, using_copy_on_write, warn_copy_on_write, using_array_manager -): +def test_column_as_series(backend, using_copy_on_write, warn_copy_on_write): # Case: selecting a single column now also uses Copy-on-Write dtype_backend, DataFrame, Series = backend df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) @@ -979,7 +939,7 @@ def test_column_as_series( assert np.shares_memory(get_array(s, "a"), get_array(df, "a")) - if using_copy_on_write or using_array_manager: + if using_copy_on_write: s[0] = 0 else: if warn_copy_on_write: @@ -1004,7 +964,7 @@ def test_column_as_series( def test_column_as_series_set_with_upcast( - backend, using_copy_on_write, using_array_manager, warn_copy_on_write + backend, using_copy_on_write, warn_copy_on_write ): # Case: selecting a single column now also uses Copy-on-Write -> when # setting a value causes an upcast, we don't need to update the parent @@ -1019,7 +979,7 @@ def test_column_as_series_set_with_upcast( with pytest.raises(TypeError, match="Invalid value"): s[0] = "foo" expected = Series([1, 2, 3], name="a") - elif using_copy_on_write or warn_copy_on_write or using_array_manager: + elif using_copy_on_write or warn_copy_on_write: # TODO(CoW-warn) assert the FutureWarning for CoW is also raised with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): s[0] = "foo" @@ -1063,7 +1023,6 @@ def test_column_as_series_no_item_cache( method, using_copy_on_write, warn_copy_on_write, - using_array_manager, ): # Case: selecting a single column (which now also uses Copy-on-Write to protect # the view) should always give a new object (i.e. 
not make use of a cache) @@ -1080,7 +1039,7 @@ def test_column_as_series_no_item_cache( else: assert s1 is s2 - if using_copy_on_write or using_array_manager: + if using_copy_on_write: s1.iloc[0] = 0 elif warn_copy_on_write: with tm.assert_cow_warning(): @@ -1181,9 +1140,7 @@ def test_series_midx_slice(using_copy_on_write, warn_copy_on_write): tm.assert_series_equal(ser, expected) -def test_getitem_midx_slice( - using_copy_on_write, warn_copy_on_write, using_array_manager -): +def test_getitem_midx_slice(using_copy_on_write, warn_copy_on_write): df = DataFrame({("a", "x"): [1, 2], ("a", "y"): 1, ("b", "x"): 2}) df_orig = df.copy() new_df = df[("a",)] @@ -1191,8 +1148,7 @@ def test_getitem_midx_slice( if using_copy_on_write: assert not new_df._mgr._has_no_reference(0) - if not using_array_manager: - assert np.shares_memory(get_array(df, ("a", "x")), get_array(new_df, "x")) + assert np.shares_memory(get_array(df, ("a", "x")), get_array(new_df, "x")) if using_copy_on_write: new_df.iloc[0, 0] = 100 tm.assert_frame_equal(df_orig, df) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 862aebdc70a9d..590829b6dc759 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -119,9 +119,7 @@ def test_copy_shallow(using_copy_on_write, warn_copy_on_write): "set_flags", ], ) -def test_methods_copy_keyword( - request, method, copy, using_copy_on_write, using_array_manager -): +def test_methods_copy_keyword(request, method, copy, using_copy_on_write): index = None if "to_timestamp" in request.node.callspec.id: index = period_range("2012-01-01", freq="D", periods=3) @@ -145,7 +143,7 @@ def test_methods_copy_keyword( if request.node.callspec.id.startswith("reindex-"): # TODO copy=False without CoW still returns a copy in this case - if not using_copy_on_write and not using_array_manager and copy is False: + if not using_copy_on_write and copy is False: share_memory = False if share_memory: @@ -227,11 +225,10 @@ def test_methods_series_copy_keyword(request, method, copy, using_copy_on_write) @pytest.mark.parametrize("copy", [True, None, False]) -def test_transpose_copy_keyword(using_copy_on_write, copy, using_array_manager): +def test_transpose_copy_keyword(using_copy_on_write, copy): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) result = df.transpose(copy=copy) share_memory = using_copy_on_write or copy is False or copy is None - share_memory = share_memory and not using_array_manager if share_memory: assert np.shares_memory(get_array(df, "a"), get_array(result, 0)) @@ -1718,11 +1715,8 @@ def test_get(using_copy_on_write, warn_copy_on_write, key): @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_xs( - using_copy_on_write, warn_copy_on_write, using_array_manager, axis, key, dtype -): - single_block = (dtype == "int64") and not using_array_manager - is_view = single_block or (using_array_manager and axis == 1) +def test_xs(using_copy_on_write, warn_copy_on_write, axis, key, dtype): + single_block = dtype == "int64" df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} ) @@ -1735,7 +1729,7 @@ def test_xs( elif using_copy_on_write: assert result._mgr._has_no_reference(0) - if using_copy_on_write or (is_view and not warn_copy_on_write): + if using_copy_on_write or (single_block and not warn_copy_on_write): result.iloc[0] = 0 elif warn_copy_on_write: with tm.assert_cow_warning(single_block or axis == 1): @@ -1753,9 +1747,7 @@ def 
test_xs( @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("key, level", [("l1", 0), (2, 1)]) -def test_xs_multiindex( - using_copy_on_write, warn_copy_on_write, using_array_manager, key, level, axis -): +def test_xs_multiindex(using_copy_on_write, warn_copy_on_write, key, level, axis): arr = np.arange(18).reshape(6, 3) index = MultiIndex.from_product([["l1", "l2"], [1, 2, 3]], names=["lev1", "lev2"]) df = DataFrame(arr, index=index, columns=list("abc")) @@ -1772,7 +1764,7 @@ def test_xs_multiindex( if warn_copy_on_write: warn = FutureWarning if level == 0 else None - elif not using_copy_on_write and not using_array_manager: + elif not using_copy_on_write: warn = SettingWithCopyWarning else: warn = None @@ -1884,12 +1876,12 @@ def test_inplace_arithmetic_series_with_reference( @pytest.mark.parametrize("copy", [True, False]) -def test_transpose(using_copy_on_write, copy, using_array_manager): +def test_transpose(using_copy_on_write, copy): df = DataFrame({"a": [1, 2, 3], "b": 1}) df_orig = df.copy() result = df.transpose(copy=copy) - if not copy and not using_array_manager or using_copy_on_write: + if not copy or using_copy_on_write: assert np.shares_memory(get_array(df, "a"), get_array(result, 0)) else: assert not np.shares_memory(get_array(df, "a"), get_array(result, 0)) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 6d16bc3083883..1a0a77b332743 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -118,7 +118,7 @@ def test_replace_mask_all_false_second_block(using_copy_on_write): # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d")) -def test_replace_coerce_single_column(using_copy_on_write, using_array_manager): +def test_replace_coerce_single_column(using_copy_on_write): df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) df_orig = df.copy() @@ -128,7 +128,7 @@ def test_replace_coerce_single_column(using_copy_on_write, using_array_manager): assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - elif not using_array_manager: + else: assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index ca19845041e23..9dd0a2eba6c0d 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -397,10 +397,6 @@ def test_setitem_series(self, data, full_indexer): def test_setitem_frame_2d_values(self, data): # GH#44514 df = pd.DataFrame({"A": data}) - - # Avoiding using_array_manager fixture - # https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410 - using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager) using_copy_on_write = pd.options.mode.copy_on_write blk_data = df._mgr.arrays[0] @@ -415,7 +411,7 @@ def test_setitem_frame_2d_values(self, data): df.iloc[:] = df.values tm.assert_frame_equal(df, orig) - if not using_array_manager and not using_copy_on_write: + if not using_copy_on_write: # GH#33457 Check that this setting occurred in-place # FIXME(ArrayManager): this should work there too assert df._mgr.arrays[0] is blk_data diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 97e7ae15c6c63..7837adec0c9e0 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ 
b/pandas/tests/frame/indexing/test_indexing.py @@ -739,7 +739,7 @@ def test_getitem_setitem_boolean_multi(self): expected.loc[[0, 2], [1]] = 5 tm.assert_frame_equal(df, expected) - def test_getitem_setitem_float_labels(self, using_array_manager): + def test_getitem_setitem_float_labels(self): index = Index([1.5, 2, 3, 4, 5]) df = DataFrame(np.random.default_rng(2).standard_normal((5, 5)), index=index) @@ -1110,16 +1110,14 @@ def test_iloc_col(self): expected = df.reindex(columns=df.columns[[1, 2, 4, 6]]) tm.assert_frame_equal(result, expected) - def test_iloc_col_slice_view( - self, using_array_manager, using_copy_on_write, warn_copy_on_write - ): + def test_iloc_col_slice_view(self, using_copy_on_write, warn_copy_on_write): df = DataFrame( np.random.default_rng(2).standard_normal((4, 10)), columns=range(0, 20, 2) ) original = df.copy() subset = df.iloc[:, slice(4, 8)] - if not using_array_manager and not using_copy_on_write: + if not using_copy_on_write: # verify slice is view assert np.shares_memory(df[8]._values, subset[8]._values) @@ -1617,7 +1615,7 @@ def test_setitem(self): ) -def test_object_casting_indexing_wraps_datetimelike(using_array_manager): +def test_object_casting_indexing_wraps_datetimelike(): # GH#31649, check the indexing methods all the way down the stack df = DataFrame( { @@ -1639,10 +1637,6 @@ def test_object_casting_indexing_wraps_datetimelike(using_array_manager): assert isinstance(ser.values[1], Timestamp) assert isinstance(ser.values[2], pd.Timedelta) - if using_array_manager: - # remainder of the test checking BlockManager internals - return - mgr = df._mgr mgr._rebuild_blknos_and_blklocs() arr = mgr.fast_xs(0).array diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 7e702bdc993bd..b9fc5dc195026 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -71,15 +71,10 @@ def test_insert_with_columns_dups(self): ) tm.assert_frame_equal(df, exp) - def test_insert_item_cache(self, using_array_manager, using_copy_on_write): + def test_insert_item_cache(self, using_copy_on_write): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) ser = df[0] - - if using_array_manager: - expected_warning = None - else: - # with BlockManager warn about high fragmentation of single dtype - expected_warning = PerformanceWarning + expected_warning = PerformanceWarning with tm.assert_produces_warning(expected_warning): for n in range(100): diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index e802a56ecbc81..f031cb2218e31 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1208,9 +1208,7 @@ def test_setitem_always_copy(self, float_frame): assert notna(s[5:10]).all() @pytest.mark.parametrize("consolidate", [True, False]) - def test_setitem_partial_column_inplace( - self, consolidate, using_array_manager, using_copy_on_write - ): + def test_setitem_partial_column_inplace(self, consolidate, using_copy_on_write): # This setting should be in-place, regardless of whether frame is # single-block or multi-block # GH#304 this used to be incorrectly not-inplace, in which case @@ -1220,12 +1218,11 @@ def test_setitem_partial_column_inplace( {"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3] ) df.insert(2, "z", np.nan) - if not using_array_manager: - if consolidate: - df._consolidate_inplace() - assert len(df._mgr.blocks) == 1 - else: - assert 
len(df._mgr.blocks) == 2 + if consolidate: + df._consolidate_inplace() + assert len(df._mgr.blocks) == 1 + else: + assert len(df._mgr.blocks) == 2 zvals = df["z"]._values @@ -1254,7 +1251,7 @@ def test_setitem_duplicate_columns_not_inplace(self): @pytest.mark.parametrize( "value", [1, np.array([[1], [1]], dtype="int64"), [[1], [1]]] ) - def test_setitem_same_dtype_not_inplace(self, value, using_array_manager): + def test_setitem_same_dtype_not_inplace(self, value): # GH#39510 cols = ["A", "B"] df = DataFrame(0, index=[0, 1], columns=cols) diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index be809e3a17c8e..535137edd16cf 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -122,9 +122,7 @@ def test_xs_keep_level(self): result = df.xs((2008, "sat"), level=["year", "day"], drop_level=False) tm.assert_frame_equal(result, expected) - def test_xs_view( - self, using_array_manager, using_copy_on_write, warn_copy_on_write - ): + def test_xs_view(self, using_copy_on_write, warn_copy_on_write): # in 0.14 this will return a view if possible a copy otherwise, but # this is numpy dependent @@ -135,13 +133,6 @@ def test_xs_view( with tm.raises_chained_assignment_error(): dm.xs(2)[:] = 20 tm.assert_frame_equal(dm, df_orig) - elif using_array_manager: - # INFO(ArrayManager) with ArrayManager getting a row as a view is - # not possible - msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): - dm.xs(2)[:] = 20 - assert not (dm.xs(2) == 20).any() else: with tm.raises_chained_assignment_error(): dm.xs(2)[:] = 20 @@ -400,9 +391,7 @@ def test_xs_droplevel_false(self): expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) - def test_xs_droplevel_false_view( - self, using_array_manager, using_copy_on_write, warn_copy_on_write - ): + def test_xs_droplevel_false_view(self, using_copy_on_write, warn_copy_on_write): # GH#37832 df = DataFrame([[1, 2, 3]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) @@ -427,9 +416,6 @@ def test_xs_droplevel_false_view( if using_copy_on_write: # with copy on write the subset is never modified expected = DataFrame({"a": [1]}) - elif using_array_manager: - # Here the behavior is consistent - expected = DataFrame({"a": [2]}) else: # FIXME: iloc does not update the array inplace using # "split" path diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py index d0b9d96cafa0d..88b3fec02182b 100644 --- a/pandas/tests/frame/methods/test_equals.py +++ b/pandas/tests/frame/methods/test_equals.py @@ -14,11 +14,11 @@ def test_dataframe_not_equal(self): df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]}) assert df1.equals(df2) is False - def test_equals_different_blocks(self, using_array_manager, using_infer_string): + def test_equals_different_blocks(self, using_infer_string): # GH#9330 df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) df1 = df0.reset_index()[["A", "B", "C"]] - if not using_array_manager and not using_infer_string: + if not using_infer_string: # this assert verifies that the above operations have # induced a block rearrangement assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index e0641fcb65bd3..a93931a970687 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ 
b/pandas/tests/frame/methods/test_interpolate.py @@ -52,12 +52,8 @@ def test_interpolate_datetimelike_values(self, frame_or_series): expected_td = frame_or_series(orig - orig[0]) tm.assert_equal(res_td, expected_td) - def test_interpolate_inplace(self, frame_or_series, using_array_manager, request): + def test_interpolate_inplace(self, frame_or_series, request): # GH#44749 - if using_array_manager and frame_or_series is DataFrame: - mark = pytest.mark.xfail(reason=".values-based in-place check is invalid") - request.applymarker(mark) - obj = frame_or_series([1, np.nan, 2]) orig = obj.values @@ -474,14 +470,8 @@ def test_interp_string_axis(self, axis_name, axis_number): @pytest.mark.parametrize("multiblock", [True, False]) @pytest.mark.parametrize("method", ["ffill", "bfill", "pad"]) - def test_interp_fillna_methods( - self, request, axis, multiblock, method, using_array_manager - ): + def test_interp_fillna_methods(self, request, axis, multiblock, method): # GH 12918 - if using_array_manager and axis in (1, "columns"): - # TODO(ArrayManager) support axis=1 - td.mark_array_manager_not_yet_implemented(request) - df = DataFrame( { "A": [1.0, 2.0, 3.0, 4.0, np.nan, 5.0], diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 0f27eae1a3bfc..e31e29b1b0cb2 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -45,9 +45,7 @@ def test_quantile_sparse(self, df, expected): expected = expected.astype("Sparse[float]") tm.assert_series_equal(result, expected) - def test_quantile( - self, datetime_frame, interp_method, using_array_manager, request - ): + def test_quantile(self, datetime_frame, interp_method, request): interpolation, method = interp_method df = datetime_frame result = df.quantile( @@ -63,11 +61,6 @@ def test_quantile( tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result.index, expected.index) - request.applymarker( - pytest.mark.xfail( - using_array_manager, reason="Name set incorrectly for arraymanager" - ) - ) assert result.name == expected.name result = df.quantile( @@ -83,11 +76,6 @@ def test_quantile( tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result.index, expected.index) - request.applymarker( - pytest.mark.xfail( - using_array_manager, reason="Name set incorrectly for arraymanager" - ) - ) assert result.name == expected.name def test_empty(self, interp_method): @@ -97,7 +85,7 @@ def test_empty(self, interp_method): ) assert np.isnan(q["x"]) and np.isnan(q["y"]) - def test_non_numeric_exclusion(self, interp_method, request, using_array_manager): + def test_non_numeric_exclusion(self, interp_method, request): interpolation, method = interp_method df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]}) rs = df.quantile( @@ -106,11 +94,9 @@ def test_non_numeric_exclusion(self, interp_method, request, using_array_manager xp = df.median(numeric_only=True).rename(0.5) if interpolation == "nearest": xp = (xp + 0.5).astype(np.int64) - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(rs, xp) - def test_axis(self, interp_method, request, using_array_manager): + def test_axis(self, interp_method): # axis interpolation, method = interp_method df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) @@ -118,8 +104,6 @@ def test_axis(self, interp_method, request, using_array_manager): expected = Series([1.5, 2.5, 3.5], 
index=[1, 2, 3], name=0.5) if interpolation == "nearest": expected = expected.astype(np.int64) - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(result, expected) result = df.quantile( @@ -134,7 +118,7 @@ def test_axis(self, interp_method, request, using_array_manager): expected = expected.astype(np.int64) tm.assert_frame_equal(result, expected, check_index_type=True) - def test_axis_numeric_only_true(self, interp_method, request, using_array_manager): + def test_axis_numeric_only_true(self, interp_method): # We may want to break API in the future to change this # so that we exclude non-numeric along the same axis # See GH #7312 @@ -146,11 +130,9 @@ def test_axis_numeric_only_true(self, interp_method, request, using_array_manage expected = Series([3.0, 4.0], index=[0, 1], name=0.5) if interpolation == "nearest": expected = expected.astype(np.int64) - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(result, expected) - def test_quantile_date_range(self, interp_method, request, using_array_manager): + def test_quantile_date_range(self, interp_method): # GH 2460 interpolation, method = interp_method dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") @@ -163,12 +145,10 @@ def test_quantile_date_range(self, interp_method, request, using_array_manager): expected = Series( ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]" ) - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(result, expected) - def test_quantile_axis_mixed(self, interp_method, request, using_array_manager): + def test_quantile_axis_mixed(self, interp_method): # mixed on axis=1 interpolation, method = interp_method df = DataFrame( @@ -185,8 +165,6 @@ def test_quantile_axis_mixed(self, interp_method, request, using_array_manager): expected = Series([1.5, 2.5, 3.5], name=0.5) if interpolation == "nearest": expected -= 0.5 - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(result, expected) # must raise @@ -194,11 +172,9 @@ def test_quantile_axis_mixed(self, interp_method, request, using_array_manager): with pytest.raises(TypeError, match=msg): df.quantile(0.5, axis=1, numeric_only=False) - def test_quantile_axis_parameter(self, interp_method, request, using_array_manager): + def test_quantile_axis_parameter(self, interp_method): # GH 9543/9544 interpolation, method = interp_method - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) result = df.quantile(0.5, axis=0, interpolation=interpolation, method=method) @@ -312,7 +288,7 @@ def test_quantile_interpolation_int(self, int_frame): assert q1["A"] == np.percentile(df["A"], 10) tm.assert_series_equal(q, q1) - def test_quantile_multi(self, interp_method, request, using_array_manager): + def test_quantile_multi(self, interp_method): interpolation, method = interp_method df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) result = df.quantile([0.25, 0.5], interpolation=interpolation, method=method) @@ -323,11 +299,9 @@ def test_quantile_multi(self, interp_method, request, using_array_manager): ) if interpolation 
== "nearest": expected = expected.astype(np.int64) - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_frame_equal(result, expected) - def test_quantile_multi_axis_1(self, interp_method, request, using_array_manager): + def test_quantile_multi_axis_1(self, interp_method): interpolation, method = interp_method df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) result = df.quantile( @@ -338,8 +312,6 @@ def test_quantile_multi_axis_1(self, interp_method, request, using_array_manager ) if interpolation == "nearest": expected = expected.astype(np.int64) - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_frame_equal(result, expected) def test_quantile_multi_empty(self, interp_method): @@ -443,10 +415,8 @@ def test_quantile_invalid(self, invalid, datetime_frame, interp_method): with pytest.raises(ValueError, match=msg): datetime_frame.quantile(invalid, interpolation=interpolation, method=method) - def test_quantile_box(self, interp_method, request, using_array_manager): + def test_quantile_box(self, interp_method): interpolation, method = interp_method - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) df = DataFrame( { "A": [ @@ -574,10 +544,8 @@ def test_quantile_box_nat(self): ) tm.assert_frame_equal(res, exp) - def test_quantile_nan(self, interp_method, request, using_array_manager): + def test_quantile_nan(self, interp_method): interpolation, method = interp_method - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) # GH 14357 - float block where some cols have missing values df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)}) df.iloc[-1, 1] = np.nan @@ -621,10 +589,8 @@ def test_quantile_nan(self, interp_method, request, using_array_manager): exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75]) tm.assert_frame_equal(res, exp) - def test_quantile_nat(self, interp_method, request, using_array_manager, unit): + def test_quantile_nat(self, interp_method, unit): interpolation, method = interp_method - if method == "table" and using_array_manager: - request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) # full NaT column df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}, dtype=f"M8[{unit}]") @@ -757,9 +723,7 @@ def test_quantile_empty_no_columns(self, interp_method): expected.columns.name = "captain tightpants" tm.assert_frame_equal(result, expected) - def test_quantile_item_cache( - self, using_array_manager, interp_method, using_copy_on_write - ): + def test_quantile_item_cache(self, interp_method, using_copy_on_write): # previous behavior incorrect retained an invalid _item_cache entry interpolation, method = interp_method df = DataFrame( @@ -767,8 +731,7 @@ def test_quantile_item_cache( ) df["D"] = df["A"] * 2 ser = df["A"] - if not using_array_manager: - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 2 df.quantile(numeric_only=False, interpolation=interpolation, method=method) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index b21aa2d687682..907ff67eac7a1 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -423,13 +423,12 @@ def test_shift_duplicate_columns(self): 
tm.assert_frame_equal(shifted[0], shifted[1])
         tm.assert_frame_equal(shifted[0], shifted[2])

-    def test_shift_axis1_multiple_blocks(self, using_array_manager):
+    def test_shift_axis1_multiple_blocks(self):
         # GH#35488
         df1 = DataFrame(np.random.default_rng(2).integers(1000, size=(5, 3)))
         df2 = DataFrame(np.random.default_rng(2).integers(1000, size=(5, 2)))
         df3 = pd.concat([df1, df2], axis=1)
-        if not using_array_manager:
-            assert len(df3._mgr.blocks) == 2
+        assert len(df3._mgr.blocks) == 2

         result = df3.shift(2, axis=1)

@@ -449,8 +448,7 @@ def test_shift_axis1_multiple_blocks(self, using_array_manager):
         # Case with periods < 0
         # rebuild df3 because `take` call above consolidated
         df3 = pd.concat([df1, df2], axis=1)
-        if not using_array_manager:
-            assert len(df3._mgr.blocks) == 2
+        assert len(df3._mgr.blocks) == 2
         result = df3.shift(-2, axis=1)

         expected = df3.take([2, 3, 4, -1, -1], axis=1)
diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py
index f2f02058a534e..be75efcdfe9d3 100644
--- a/pandas/tests/frame/methods/test_sort_values.py
+++ b/pandas/tests/frame/methods/test_sort_values.py
@@ -598,15 +598,14 @@ def test_sort_values_nat_na_position_default(self):
         result = expected.sort_values(["A", "date"])
         tm.assert_frame_equal(result, expected)

-    def test_sort_values_item_cache(self, using_array_manager, using_copy_on_write):
+    def test_sort_values_item_cache(self, using_copy_on_write):
         # previous behavior incorrectly retained an invalid _item_cache entry
         df = DataFrame(
             np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"]
         )
         df["D"] = df["A"] * 2
         ser = df["A"]
-        if not using_array_manager:
-            assert len(df._mgr.blocks) == 2
+        assert len(df._mgr.blocks) == 2

         df.sort_values(by="A")
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
index 42ce658701355..ecaf826c46d9b 100644
--- a/pandas/tests/frame/test_arithmetic.py
+++ b/pandas/tests/frame/test_arithmetic.py
@@ -13,8 +13,6 @@
 from pandas._config import using_pyarrow_string_dtype

-import pandas.util._test_decorators as td
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -894,15 +892,11 @@ def test_df_add_2d_array_collike_broadcasts(self):
         tm.assert_frame_equal(result, expected)

     def test_df_arith_2d_array_rowlike_broadcasts(
-        self, request, all_arithmetic_operators, using_array_manager
+        self, request, all_arithmetic_operators
     ):
         # GH#23000
         opname = all_arithmetic_operators

-        if using_array_manager and opname in ("__rmod__", "__rfloordiv__"):
-            # TODO(ArrayManager) decide on dtypes
-            td.mark_array_manager_not_yet_implemented(request)
-
         arr = np.arange(6).reshape(3, 2)
         df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"])

@@ -921,15 +915,11 @@ def test_df_arith_2d_array_collike_broadcasts(
-        self, request, all_arithmetic_operators, using_array_manager
+        self, request, all_arithmetic_operators
     ):
         # GH#23000
         opname = all_arithmetic_operators

-        if using_array_manager and opname in ("__rmod__", "__rfloordiv__"):
-            # TODO(ArrayManager) decide on dtypes
-            td.mark_array_manager_not_yet_implemented(request)
-
         arr = np.arange(6).reshape(3, 2)
         df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"])
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 6e818d79d5ba8..8ff69472ea113 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -84,16
+84,15 @@ def test_constructor_from_ndarray_with_str_dtype(self): expected = DataFrame(arr.astype(str), dtype=object) tm.assert_frame_equal(df, expected) - def test_constructor_from_2d_datetimearray(self, using_array_manager): + def test_constructor_from_2d_datetimearray(self): dti = date_range("2016-01-01", periods=6, tz="US/Pacific") dta = dti._data.reshape(3, 2) df = DataFrame(dta) expected = DataFrame({0: dta[:, 0], 1: dta[:, 1]}) tm.assert_frame_equal(df, expected) - if not using_array_manager: - # GH#44724 big performance hit if we de-consolidate - assert len(df._mgr.blocks) == 1 + # GH#44724 big performance hit if we de-consolidate + assert len(df._mgr.blocks) == 1 def test_constructor_dict_with_tzaware_scalar(self): # GH#42505 @@ -310,10 +309,10 @@ def test_constructor_dtype_nocast_view_dataframe( assert df.values[0, 0] == 99 def test_constructor_dtype_nocast_view_2d_array( - self, using_array_manager, using_copy_on_write, warn_copy_on_write + self, using_copy_on_write, warn_copy_on_write ): df = DataFrame([[1, 2], [3, 4]], dtype="int64") - if not using_array_manager and not using_copy_on_write: + if not using_copy_on_write: should_be_view = DataFrame(df.values, dtype=df[0].dtype) # TODO(CoW-warn) this should warn # with tm.assert_cow_warning(warn_copy_on_write): @@ -2147,35 +2146,19 @@ def test_constructor_frame_shallow_copy(self, float_frame): cop.index = np.arange(len(cop)) tm.assert_frame_equal(float_frame, orig) - def test_constructor_ndarray_copy( - self, float_frame, using_array_manager, using_copy_on_write - ): - if not using_array_manager: - arr = float_frame.values.copy() - df = DataFrame(arr) - - arr[5] = 5 - if using_copy_on_write: - assert not (df.values[5] == 5).all() - else: - assert (df.values[5] == 5).all() + def test_constructor_ndarray_copy(self, float_frame, using_copy_on_write): + arr = float_frame.values.copy() + df = DataFrame(arr) - df = DataFrame(arr, copy=True) - arr[6] = 6 - assert not (df.values[6] == 6).all() + arr[5] = 5 + if using_copy_on_write: + assert not (df.values[5] == 5).all() else: - arr = float_frame.values.copy() - # default: copy to ensure contiguous arrays - df = DataFrame(arr) - assert df._mgr.arrays[0].flags.c_contiguous - arr[0, 0] = 100 - assert df.iloc[0, 0] != 100 - - # manually specify copy=False - df = DataFrame(arr, copy=False) - assert not df._mgr.arrays[0].flags.c_contiguous - arr[0, 0] = 1000 - assert df.iloc[0, 0] == 1000 + assert (df.values[5] == 5).all() + + df = DataFrame(arr, copy=True) + arr[6] = 6 + assert not (df.values[6] == 6).all() def test_constructor_series_copy(self, float_frame): series = float_frame._series @@ -2328,15 +2311,10 @@ def test_check_dtype_empty_numeric_column(self, dtype): @pytest.mark.parametrize( "dtype", tm.STRING_DTYPES + tm.BYTES_DTYPES + tm.OBJECT_DTYPES ) - def test_check_dtype_empty_string_column(self, request, dtype, using_array_manager): + def test_check_dtype_empty_string_column(self, request, dtype): # GH24386: Ensure dtypes are set correctly for an empty DataFrame. # Empty DataFrame is generated via dictionary data with non-overlapping columns. 
data = DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype) - - if using_array_manager and dtype in tm.BYTES_DTYPES: - # TODO(ArrayManager) astype to bytes dtypes does not yet give object dtype - td.mark_array_manager_not_yet_implemented(request) - assert data.b.dtype.name == "object" def test_to_frame_with_falsey_names(self): @@ -2515,17 +2493,8 @@ def test_dict_nocopy( copy, any_numeric_ea_dtype, any_numpy_dtype, - using_array_manager, using_copy_on_write, ): - if ( - using_array_manager - and not copy - and any_numpy_dtype not in tm.STRING_DTYPES + tm.BYTES_DTYPES - ): - # TODO(ArrayManager) properly honor copy keyword for dict input - td.mark_array_manager_not_yet_implemented(request) - a = np.array([1, 2], dtype=any_numpy_dtype) b = np.array([3, 4], dtype=any_numpy_dtype) if b.dtype.kind in ["S", "U"]: diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 34f172e900ab7..1e9aa2325e880 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -284,7 +284,7 @@ def test_multi_dtype2(self): expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"]) tm.assert_frame_equal(df, expected) - def test_dups_across_blocks(self, using_array_manager): + def test_dups_across_blocks(self): # dups across blocks df_float = DataFrame( np.random.default_rng(2).standard_normal((10, 3)), dtype="float64" @@ -299,9 +299,8 @@ def test_dups_across_blocks(self, using_array_manager): ) df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) - if not using_array_manager: - assert len(df._mgr.blknos) == len(df.columns) - assert len(df._mgr.blklocs) == len(df.columns) + assert len(df._mgr.blknos) == len(df.columns) + assert len(df._mgr.blklocs) == len(df.columns) # testing iloc for i in range(len(df.columns)): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 66145c32c18d7..512b5d6ace469 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -817,17 +817,8 @@ def test_std_timedelta64_skipna_false(self): @pytest.mark.parametrize( "values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]] ) - def test_std_datetime64_with_nat( - self, values, skipna, using_array_manager, request, unit - ): + def test_std_datetime64_with_nat(self, values, skipna, request, unit): # GH#51335 - if using_array_manager and ( - not skipna or all(value is pd.NaT for value in values) - ): - mark = pytest.mark.xfail( - reason="GH#51446: Incorrect type inference on NaT in reduction result" - ) - request.applymarker(mark) dti = to_datetime(values).as_unit(unit) df = DataFrame({"a": dti}) result = df.std(skipna=skipna) @@ -1926,14 +1917,8 @@ def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype): tm.assert_series_equal(result, expected) -def test_sum_timedelta64_skipna_false(using_array_manager, request): +def test_sum_timedelta64_skipna_false(): # GH#17235 - if using_array_manager: - mark = pytest.mark.xfail( - reason="Incorrect type inference on NaT in reduction result" - ) - request.applymarker(mark) - arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2) arr[-1, -1] = "Nat" diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 6e1e743eb60de..ea66290ab0417 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -72,13 +72,12 @@ def test_stack_mixed_level(self, future_stack): 
expected = expected[["a", "b"]] tm.assert_frame_equal(result, expected) - def test_unstack_not_consolidated(self, using_array_manager): + def test_unstack_not_consolidated(self): # Gh#34708 df = DataFrame({"x": [1, 2, np.nan], "y": [3.0, 4, np.nan]}) df2 = df[["x"]] df2["y"] = df["y"] - if not using_array_manager: - assert len(df2._mgr.blocks) == 2 + assert len(df2._mgr.blocks) == 2 res = df2.unstack() expected = df.unstack() @@ -969,7 +968,7 @@ def test_unstack_nan_index2(self): right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) - def test_unstack_nan_index3(self, using_array_manager): + def test_unstack_nan_index3(self): # GH7401 df = DataFrame( { @@ -991,10 +990,6 @@ def test_unstack_nan_index3(self, using_array_manager): ) right = DataFrame(vals, columns=cols, index=idx) - if using_array_manager: - # INFO(ArrayManager) with ArrayManager preserve dtype where possible - cols = right.columns[[1, 2, 3, 5]] - right[cols] = right[cols].astype(df["C"].dtype) tm.assert_frame_equal(left, right) def test_unstack_nan_index4(self): @@ -1498,7 +1493,7 @@ def test_stack_positional_level_duplicate_column_names(future_stack): tm.assert_frame_equal(result, expected) -def test_unstack_non_slice_like_blocks(using_array_manager): +def test_unstack_non_slice_like_blocks(): # Case where the mgr_locs of a DataFrame's underlying blocks are not slice-like mi = MultiIndex.from_product([range(5), ["A", "B", "C"]]) @@ -1511,8 +1506,7 @@ def test_unstack_non_slice_like_blocks(using_array_manager): }, index=mi, ) - if not using_array_manager: - assert any(not x.mgr_locs.is_slice_like for x in df._mgr.blocks) + assert any(not x.mgr_locs.is_slice_like for x in df._mgr.blocks) res = df.unstack() @@ -2354,7 +2348,7 @@ def test_unstack_group_index_overflow(self, future_stack): result = s.unstack(4) assert result.shape == (500, 2) - def test_unstack_with_missing_int_cast_to_float(self, using_array_manager): + def test_unstack_with_missing_int_cast_to_float(self): # https://github.com/pandas-dev/pandas/issues/37115 df = DataFrame( { @@ -2366,8 +2360,7 @@ def test_unstack_with_missing_int_cast_to_float(self, using_array_manager): # add another int column to get 2 blocks df["is_"] = 1 - if not using_array_manager: - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 2 result = df.unstack("b") result[("is_", "ca")] = result[("is_", "ca")].fillna(0) @@ -2380,10 +2373,6 @@ def test_unstack_with_missing_int_cast_to_float(self, using_array_manager): names=[None, "b"], ), ) - if using_array_manager: - # INFO(ArrayManager) with ArrayManager preserve dtype where possible - expected[("v", "cb")] = expected[("v", "cb")].astype("int64") - expected[("is_", "cb")] = expected[("is_", "cb")].astype("int64") tm.assert_frame_equal(result, expected) def test_unstack_with_level_has_nan(self): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4c903e691add1..3cc06ae4d2387 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2050,9 +2050,7 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] ) -def test_empty_groupby( - columns, keys, values, method, op, using_array_manager, dropna, using_infer_string -): +def test_empty_groupby(columns, keys, values, method, op, dropna, using_infer_string): # GH8093 & GH26411 override_dtype = None diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 
425079f943aba..8333dba439be9 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -362,7 +362,7 @@ def test_max_min_non_numeric(): assert "ss" in result -def test_max_min_object_multiple_columns(using_array_manager): +def test_max_min_object_multiple_columns(): # GH#41111 case where the aggregation is valid for some columns but not # others; we split object blocks column-wise, consistent with # DataFrame._reduce @@ -375,8 +375,7 @@ def test_max_min_object_multiple_columns(using_array_manager): } ) df._consolidate_inplace() # should already be consolidate, but double-check - if not using_array_manager: - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 2 gb = df.groupby("A") diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index b97df376ac47f..ca796463f4a1e 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -213,7 +213,7 @@ def test_detect_chained_assignment(self, using_copy_on_write): @pytest.mark.arm_slow def test_detect_chained_assignment_raises( - self, using_array_manager, using_copy_on_write, warn_copy_on_write + self, using_copy_on_write, warn_copy_on_write ): # test with the chaining df = DataFrame( @@ -236,7 +236,7 @@ def test_detect_chained_assignment_raises( df["A"][0] = -5 with tm.raises_chained_assignment_error(): df["A"][1] = np.nan - elif not using_array_manager: + else: with pytest.raises(SettingWithCopyError, match=msg): with tm.raises_chained_assignment_error(): df["A"][0] = -5 @@ -246,14 +246,6 @@ def test_detect_chained_assignment_raises( df["A"][1] = np.nan assert df["A"]._is_copy is None - else: - # INFO(ArrayManager) for ArrayManager it doesn't matter that it's - # a mixed dataframe - df["A"][0] = -5 - df["A"][1] = -6 - expected = DataFrame([[-5, 2], [-6, 3]], columns=list("AB")) - expected["B"] = expected["B"].astype("float64") - tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow def test_detect_chained_assignment_fails( @@ -297,7 +289,7 @@ def test_detect_chained_assignment_doc_example( @pytest.mark.arm_slow def test_detect_chained_assignment_object_dtype( - self, using_array_manager, using_copy_on_write, warn_copy_on_write + self, using_copy_on_write, warn_copy_on_write ): expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) df = DataFrame( @@ -317,18 +309,13 @@ def test_detect_chained_assignment_object_dtype( with tm.raises_chained_assignment_error(): df["A"][0] = 111 tm.assert_frame_equal(df, expected) - elif not using_array_manager: + else: with pytest.raises(SettingWithCopyError, match=msg): with tm.raises_chained_assignment_error(): df["A"][0] = 111 df.loc[0, "A"] = 111 tm.assert_frame_equal(df, expected) - else: - # INFO(ArrayManager) for ArrayManager it doesn't matter that it's - # a mixed dataframe - df["A"][0] = 111 - tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow def test_detect_chained_assignment_is_copy_pickle(self): @@ -453,7 +440,7 @@ def test_detect_chained_assignment_undefined_column( @pytest.mark.arm_slow def test_detect_chained_assignment_changing_dtype( - self, using_array_manager, using_copy_on_write, warn_copy_on_write + self, using_copy_on_write, warn_copy_on_write ): # Mixed type setting but same dtype & changing dtype df = DataFrame( @@ -485,15 +472,9 @@ def test_detect_chained_assignment_changing_dtype( with pytest.raises(SettingWithCopyError, match=msg): df.loc[2]["C"] = "foo" - if not using_array_manager: - 
with pytest.raises(SettingWithCopyError, match=msg): - with tm.raises_chained_assignment_error(): - df["C"][2] = "foo" - else: - # INFO(ArrayManager) for ArrayManager it doesn't matter if it's - # changing the dtype or not - df["C"][2] = "foo" - assert df.loc[2, "C"] == "foo" + with pytest.raises(SettingWithCopyError, match=msg): + with tm.raises_chained_assignment_error(): + df["C"][2] = "foo" def test_setting_with_copy_bug(self, using_copy_on_write, warn_copy_on_write): # operating on a copy diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 409eca42f404b..a1d8577d534f5 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -72,13 +72,12 @@ class TestiLocBaseIndependent: ], ) @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manager): + def test_iloc_setitem_fullcol_categorical(self, indexer, key): frame = DataFrame({0: range(3)}, dtype=object) cat = Categorical(["alpha", "beta", "gamma"]) - if not using_array_manager: - assert frame._mgr.blocks[0]._can_hold_element(cat) + assert frame._mgr.blocks[0]._can_hold_element(cat) df = frame.copy() orig_vals = df.values @@ -86,8 +85,7 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage indexer(df)[key, 0] = cat expected = DataFrame({0: cat}).astype(object) - if not using_array_manager: - assert np.shares_memory(df[0].values, orig_vals) + assert np.shares_memory(df[0].values, orig_vals) tm.assert_frame_equal(df, expected) @@ -520,9 +518,7 @@ def test_iloc_setitem_dups(self): df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True) tm.assert_frame_equal(df, expected) - def test_iloc_setitem_frame_duplicate_columns_multiple_blocks( - self, using_array_manager - ): + def test_iloc_setitem_frame_duplicate_columns_multiple_blocks(self): # Same as the "assign back to self" check in test_iloc_setitem_dups # but on a DataFrame with multiple blocks df = DataFrame([[0, 1], [2, 3]], columns=["B", "B"]) @@ -530,14 +526,12 @@ def test_iloc_setitem_frame_duplicate_columns_multiple_blocks( # setting float values that can be held by existing integer arrays # is inplace df.iloc[:, 0] = df.iloc[:, 0].astype("f8") - if not using_array_manager: - assert len(df._mgr.blocks) == 1 + assert len(df._mgr.blocks) == 1 # if the assigned values cannot be held by existing integer arrays, # we cast df.iloc[:, 0] = df.iloc[:, 0] + 0.5 - if not using_array_manager: - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 2 expected = df.copy() @@ -632,7 +626,7 @@ def test_iloc_getitem_labelled_frame(self): with pytest.raises(ValueError, match=msg): df.iloc["j", "D"] - def test_iloc_getitem_doc_issue(self, using_array_manager): + def test_iloc_getitem_doc_issue(self): # multi axis slicing issue with single block # surfaced in GH 6059 @@ -662,8 +656,7 @@ def test_iloc_getitem_doc_issue(self, using_array_manager): columns = list(range(0, 8, 2)) df = DataFrame(arr, index=index, columns=columns) - if not using_array_manager: - df._mgr.blocks[0].mgr_locs + df._mgr.blocks[0].mgr_locs result = df.iloc[1:5, 2:4] expected = DataFrame(arr[1:5, 2:4], index=index[1:5], columns=columns[2:4]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 57f45f867254d..45ec968714aff 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -77,9 +77,7 @@ def 
test_setitem_ndarray_1d_2(self): "ignore:Series.__getitem__ treating keys as positions is deprecated:" "FutureWarning" ) - def test_getitem_ndarray_3d( - self, index, frame_or_series, indexer_sli, using_array_manager - ): + def test_getitem_ndarray_3d(self, index, frame_or_series, indexer_sli): # GH 25567 obj = gen_obj(frame_or_series, index) idxr = indexer_sli(obj) @@ -88,12 +86,8 @@ def test_getitem_ndarray_3d( msgs = [] if frame_or_series is Series and indexer_sli in [tm.setitem, tm.iloc]: msgs.append(r"Wrong number of dimensions. values.ndim > ndim \[3 > 1\]") - if using_array_manager: - msgs.append("Passed array should be 1-dimensional") if frame_or_series is Series or indexer_sli is tm.iloc: msgs.append(r"Buffer has wrong number of dimensions \(expected 1, got 3\)") - if using_array_manager: - msgs.append("indexer should be 1-dimensional") if indexer_sli is tm.loc or ( frame_or_series is Series and indexer_sli is tm.setitem ): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index fb0adc56c401b..da10555e60301 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1490,7 +1490,7 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): result.loc[:, idxer] = expected tm.assert_frame_equal(result, expected) - def test_loc_setitem_time_key(self, using_array_manager): + def test_loc_setitem_time_key(self): index = date_range("2012-01-01", "2012-01-05", freq="30min") df = DataFrame( np.random.default_rng(2).standard_normal((len(index), 5)), index=index @@ -1505,9 +1505,6 @@ def test_loc_setitem_time_key(self, using_array_manager): result = result.loc[akey] expected = df.loc[akey].copy() expected.loc[:] = 0 - if using_array_manager: - # TODO(ArrayManager) we are still overwriting columns - expected = expected.astype(float) tm.assert_frame_equal(result, expected) result = df.copy() @@ -1520,9 +1517,6 @@ def test_loc_setitem_time_key(self, using_array_manager): result = result.loc[bkey] expected = df.loc[bkey].copy() expected.loc[:] = 0 - if using_array_manager: - # TODO(ArrayManager) we are still overwriting columns - expected = expected.astype(float) tm.assert_frame_equal(result, expected) result = df.copy() diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index ca551024b4c1f..b0a041ed5b69c 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -279,7 +279,7 @@ def test_partial_setting(self): s.iat[3] = 5.0 @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") - def test_partial_setting_frame(self, using_array_manager): + def test_partial_setting_frame(self): df_orig = DataFrame( np.arange(6).reshape(3, 2), columns=["A", "B"], dtype="int64" ) @@ -292,8 +292,6 @@ def test_partial_setting_frame(self, using_array_manager): df.iloc[4, 2] = 5.0 msg = "index 2 is out of bounds for axis 0 with size 2" - if using_array_manager: - msg = "list index out of range" with pytest.raises(IndexError, match=msg): df.iat[4, 2] = 5.0 diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index e4b94177eedb2..8fc02cc7799ed 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1003,7 +1003,7 @@ def test_filter_row_groups(self, pa): result = read_parquet(path, pa, filters=[("a", "==", 0)]) assert len(result) == 1 - def test_read_parquet_manager(self, pa, using_array_manager): + def test_read_parquet_manager(self, pa): # ensure that read_parquet honors the 
pandas.options.mode.data_manager option df = pd.DataFrame( np.random.default_rng(2).standard_normal((10, 3)), columns=["A", "B", "C"] @@ -1012,10 +1012,7 @@ def test_read_parquet_manager(self, pa, using_array_manager): with tm.ensure_clean() as path: df.to_parquet(path, engine=pa) result = read_parquet(path, pa) - if using_array_manager: - assert isinstance(result._mgr, pd.core.internals.ArrayManager) - else: - assert isinstance(result._mgr, pd.core.internals.BlockManager) + assert isinstance(result._mgr, pd.core.internals.BlockManager) def test_read_dtype_backend_pyarrow_config(self, pa, df_full): import pyarrow diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index 81ca227fb7afb..3fb6a3fb61396 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -328,16 +328,13 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): result = df._append([ser, ser], ignore_index=True) tm.assert_frame_equal(result, expected) - def test_append_empty_tz_frame_with_datetime64ns(self, using_array_manager): + def test_append_empty_tz_frame_with_datetime64ns(self): # https://github.com/pandas-dev/pandas/issues/35460 df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") # pd.NaT gets inferred as tz-naive, so append result is tz-naive result = df._append({"a": pd.NaT}, ignore_index=True) - if using_array_manager: - expected = DataFrame({"a": [pd.NaT]}, dtype=object) - else: - expected = DataFrame({"a": [np.nan]}, dtype=object) + expected = DataFrame({"a": [np.nan]}, dtype=object) tm.assert_frame_equal(result, expected) # also test with typed value to append @@ -356,9 +353,7 @@ def test_append_empty_tz_frame_with_datetime64ns(self, using_array_manager): "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"] ) @pytest.mark.parametrize("val", [1, "NaT"]) - def test_append_empty_frame_with_timedelta64ns_nat( - self, dtype_str, val, using_array_manager - ): + def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val): # https://github.com/pandas-dev/pandas/issues/35460 df = DataFrame(columns=["a"]).astype(dtype_str) @@ -366,7 +361,7 @@ def test_append_empty_frame_with_timedelta64ns_nat( result = df._append(other, ignore_index=True) expected = other.astype(object) - if isinstance(val, str) and dtype_str != "int64" and not using_array_manager: + if isinstance(val, str) and dtype_str != "int64": # TODO: expected used to be `other.astype(object)` which is a more # reasonable result. 
This was changed when tightening # assert_frame_equal's treatment of mismatched NAs to match the diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 9e34d02091e69..2cc91992f1fd7 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -44,7 +44,7 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] - def test_concat_copy(self, using_array_manager, using_copy_on_write): + def test_concat_copy(self, using_copy_on_write): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1)) df3 = DataFrame({5: "foo"}, index=range(4)) @@ -72,18 +72,14 @@ def test_concat_copy(self, using_array_manager, using_copy_on_write): elif arr.dtype.kind in ["i", "u"]: assert arr.base is df2._mgr.arrays[0].base elif arr.dtype == object: - if using_array_manager: - # we get the same array object, which has no base - assert arr is df3._mgr.arrays[0] - else: - assert arr.base is not None + assert arr.base is not None # Float block was consolidated. df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1))) result = concat([df, df2, df3, df4], axis=1, copy=False) for arr in result._mgr.arrays: if arr.dtype.kind == "f": - if using_array_manager or using_copy_on_write: + if using_copy_on_write: # this is a view on some array in either df or df4 assert any( np.shares_memory(arr, other) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 71ddff7438254..77485788faa02 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -214,9 +214,7 @@ def test_concat_NaT_dataframes(self, tz): @pytest.mark.parametrize("tz1", [None, "UTC"]) @pytest.mark.parametrize("tz2", [None, "UTC"]) @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101")]) - def test_concat_NaT_dataframes_all_NaT_axis_0( - self, tz1, tz2, item, using_array_manager - ): + def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, item): # GH 12396 # tz-naive @@ -228,7 +226,7 @@ def test_concat_NaT_dataframes_all_NaT_axis_0( expected = expected.apply(lambda x: x.dt.tz_localize(tz2)) if tz1 != tz2: expected = expected.astype(object) - if item is pd.NaT and not using_array_manager: + if item is pd.NaT: # GH#18463 # TODO: setting nan here is to keep the test passing as we # make assert_frame_equal stricter, but is nan really the @@ -567,7 +565,7 @@ def test_concat_multiindex_datetime_nat(): tm.assert_frame_equal(result, expected) -def test_concat_float_datetime64(using_array_manager): +def test_concat_float_datetime64(): # GH#32934 df_time = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}) df_float = DataFrame({"A": pd.array([1.0], dtype="float64")}) @@ -592,15 +590,8 @@ def test_concat_float_datetime64(using_array_manager): result = concat([df_time.iloc[:0], df_float]) tm.assert_frame_equal(result, expected) - if not using_array_manager: - expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}) - msg = "The behavior of DataFrame concatenation with empty or all-NA entries" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = concat([df_time, df_float.iloc[:0]]) - tm.assert_frame_equal(result, expected) - else: - expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype( - {"A": "object"} - ) + expected = DataFrame({"A": 
pd.array(["2000"], dtype="datetime64[ns]")}) + msg = "The behavior of DataFrame concatenation with empty or all-NA entries" + with tm.assert_produces_warning(FutureWarning, match=msg): result = concat([df_time, df_float.iloc[:0]]) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d7a343ae9f152..9f832c7b1d1ca 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -316,7 +316,7 @@ def test_merge_copy(self): merged["d"] = "peekaboo" assert (right["d"] == "bar").all() - def test_merge_nocopy(self, using_array_manager): + def test_merge_nocopy(self): left = DataFrame({"a": 0, "b": 1}, index=range(10)) right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) @@ -702,7 +702,7 @@ def _constructor(self): assert isinstance(result, NotADataFrame) - def test_join_append_timedeltas(self, using_array_manager): + def test_join_append_timedeltas(self): # timedelta64 issues with join/merge # GH 5695 @@ -712,8 +712,6 @@ def test_join_append_timedeltas(self, using_array_manager): df = DataFrame(columns=list("dt")) msg = "The behavior of DataFrame concatenation with empty or all-NA entries" warn = FutureWarning - if using_array_manager: - warn = None with tm.assert_produces_warning(warn, match=msg): df = concat([df, d], ignore_index=True) result = concat([df, d], ignore_index=True) @@ -723,9 +721,6 @@ def test_join_append_timedeltas(self, using_array_manager): "t": [timedelta(0, 22500), timedelta(0, 22500)], } ) - if using_array_manager: - # TODO(ArrayManager) decide on exact casting rules in concat - expected = expected.astype(object) tm.assert_frame_equal(result, expected) def test_join_append_timedeltas2(self): diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 136e76986df9d..8a30b63cf0e17 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -459,7 +459,7 @@ def test_crosstab_normalize_arrays(self): ) tm.assert_frame_equal(test_case, norm_sum) - def test_crosstab_with_empties(self, using_array_manager): + def test_crosstab_with_empties(self): # Check handling of empties df = DataFrame( { @@ -484,9 +484,6 @@ def test_crosstab_with_empties(self, using_array_manager): index=Index([1, 2], name="a", dtype="int64"), columns=Index([3, 4], name="b"), ) - if using_array_manager: - # INFO(ArrayManager) column without NaNs can preserve int dtype - nans[3] = nans[3].astype("int64") calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False) tm.assert_frame_equal(nans, calculated) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 18a449b4d0c67..bf2717be4d7ae 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1277,7 +1277,7 @@ def test_pivot_table_with_margins_set_margin_name(self, margin_name, data): margins_name=margin_name, ) - def test_pivot_timegrouper(self, using_array_manager): + def test_pivot_timegrouper(self): df = DataFrame( { "Branch": "A A A A A A A B".split(), @@ -1331,9 +1331,6 @@ def test_pivot_timegrouper(self, using_array_manager): ) expected.index.name = "Date" expected.columns.name = "Buyer" - if using_array_manager: - # INFO(ArrayManager) column without NaNs can preserve int dtype - expected["Carl"] = expected["Carl"].astype("int64") result = pivot_table( df, @@ -2370,7 +2367,7 @@ def 
test_pivot_table_datetime_warning(self): ) tm.assert_frame_equal(result, expected) - def test_pivot_table_with_mixed_nested_tuples(self, using_array_manager): + def test_pivot_table_with_mixed_nested_tuples(self): # GH 50342 df = DataFrame( { @@ -2434,9 +2431,6 @@ def test_pivot_table_with_mixed_nested_tuples(self, using_array_manager): [["bar", "bar", "foo", "foo"], ["one", "two"] * 2], names=["A", "B"] ), ) - if using_array_manager: - # INFO(ArrayManager) column without NaNs can preserve int dtype - expected["small"] = expected["small"].astype("int64") tm.assert_frame_equal(result, expected) def test_pivot_table_aggfunc_nunique_with_different_values(self): diff --git a/pandas/tests/reshape/test_pivot_multilevel.py b/pandas/tests/reshape/test_pivot_multilevel.py index 08ef29440825f..2c9d54c3db72c 100644 --- a/pandas/tests/reshape/test_pivot_multilevel.py +++ b/pandas/tests/reshape/test_pivot_multilevel.py @@ -197,7 +197,7 @@ def test_pivot_list_like_columns( tm.assert_frame_equal(result, expected) -def test_pivot_multiindexed_rows_and_cols(using_array_manager): +def test_pivot_multiindexed_rows_and_cols(): # GH 36360 df = pd.DataFrame( @@ -225,9 +225,7 @@ def test_pivot_multiindexed_rows_and_cols(using_array_manager): ), index=Index([0, 1], dtype="int64", name="idx_L0"), ) - if not using_array_manager: - # BlockManager does not preserve the dtypes - expected = expected.astype("float64") + expected = expected.astype("float64") tm.assert_frame_equal(res, expected) diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 6f0c8d751a92a..1a4a390da1323 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -318,7 +318,7 @@ def test_reindex_fill_value(): @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) @pytest.mark.parametrize("fill_value", ["string", 0, Timedelta(0)]) -def test_reindex_fill_value_datetimelike_upcast(dtype, fill_value, using_array_manager): +def test_reindex_fill_value_datetimelike_upcast(dtype, fill_value): # https://github.com/pandas-dev/pandas/issues/42921 if dtype == "timedelta64[ns]" and fill_value == Timedelta(0): # use the scalar that is not compatible with the dtype for this test diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index da069afe5e709..866bfb995a6d5 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2187,13 +2187,12 @@ def test_series_constructor_infer_multiindex(self, container, data): class TestSeriesConstructorInternals: - def test_constructor_no_pandas_array(self, using_array_manager): + def test_constructor_no_pandas_array(self): ser = Series([1, 2, 3]) result = Series(ser.array) tm.assert_series_equal(ser, result) - if not using_array_manager: - assert isinstance(result._mgr.blocks[0], NumpyBlock) - assert result._mgr.blocks[0].is_numeric + assert isinstance(result._mgr.blocks[0], NumpyBlock) + assert result._mgr.blocks[0].is_numeric @td.skip_array_manager_invalid_test def test_from_array(self): diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 76353ab25fca6..e200f7d9933aa 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -163,7 +163,7 @@ def test_validate_stat_keepdims(): np.sum(ser, keepdims=True) -def test_mean_with_convertible_string_raises(using_array_manager, 
using_infer_string): +def test_mean_with_convertible_string_raises(using_infer_string): # GH#44008 ser = Series(["1", "2"]) if using_infer_string: @@ -177,19 +177,15 @@ def test_mean_with_convertible_string_raises(using_array_manager, using_infer_st ser.mean() df = ser.to_frame() - if not using_array_manager: - msg = r"Could not convert \['12'\] to numeric|does not support" + msg = r"Could not convert \['12'\] to numeric|does not support" with pytest.raises(TypeError, match=msg): df.mean() -def test_mean_dont_convert_j_to_complex(using_array_manager): +def test_mean_dont_convert_j_to_complex(): # GH#36703 df = pd.DataFrame([{"db": "J", "numeric": 123}]) - if using_array_manager: - msg = "Could not convert string 'J' to numeric" - else: - msg = r"Could not convert \['J'\] to numeric|does not support" + msg = r"Could not convert \['J'\] to numeric|does not support" with pytest.raises(TypeError, match=msg): df.mean() @@ -204,15 +200,14 @@ def test_mean_dont_convert_j_to_complex(using_array_manager): np.mean(df["db"].astype("string").array) -def test_median_with_convertible_string_raises(using_array_manager): +def test_median_with_convertible_string_raises(): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support" ser = Series(["1", "2", "3"]) with pytest.raises(TypeError, match=msg): ser.median() - if not using_array_manager: - msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" + msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" df = ser.to_frame() with pytest.raises(TypeError, match=msg): df.median() From 12d69c8b5739e7070b8ca2ff86587f828c9a96c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 26 Dec 2023 14:37:35 -0500 Subject: [PATCH 24/31] TYP: some return types from ruff (#56617) * TYP: some return types from ruff * fix CI * and another one * adjust pre-commit --- .pre-commit-config.yaml | 2 ++ doc/source/whatsnew/v2.2.0.rst | 2 +- environment.yml | 2 +- pandas/_testing/asserters.py | 7 ++++--- pandas/_version.py | 2 +- pandas/core/computation/expr.py | 4 ++-- pandas/io/html.py | 8 ++++---- pandas/io/json/_json.py | 10 +++++----- pandas/io/parsers/arrow_parser_wrapper.py | 6 +++--- pandas/io/pytables.py | 2 +- pandas/io/sas/sas_xport.py | 2 +- pandas/io/sql.py | 8 ++++---- pandas/io/stata.py | 2 +- pandas/plotting/_matplotlib/core.py | 8 ++++---- pandas/util/_validators.py | 6 +++--- requirements-dev.txt | 2 +- 16 files changed, 38 insertions(+), 35 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a070e9a49b97..7f3fc95ce00cc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,8 @@ repos: # TODO: remove autofixe-only rules when they are checked by ruff name: ruff-selected-autofixes alias: ruff-selected-autofixes + files: ^pandas + exclude: ^pandas/tests args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix] - repo: https://github.com/jendrikseipp/vulture rev: 'v2.10' diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d1481639ca5a0..5ee94b74c527e 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -431,7 +431,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | Package | Minimum Version | Changed | +=================+=================+=========+ -| mypy (dev) | 1.7.1 | X | +| mypy (dev) | 1.8.0 | X | 
+-----------------+-----------------+---------+ | | | X | +-----------------+-----------------+---------+ diff --git a/environment.yml b/environment.yml index 74317d47e2e53..58eb69ad1f070 100644 --- a/environment.yml +++ b/environment.yml @@ -76,7 +76,7 @@ dependencies: # code checks - flake8=6.1.0 # run in subprocess over docstring examples - - mypy=1.7.1 # pre-commit uses locally installed mypy + - mypy=1.8.0 # pre-commit uses locally installed mypy - tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py - pre-commit>=3.6.0 diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index e342f76dc724b..800b03707540f 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -4,6 +4,7 @@ from typing import ( TYPE_CHECKING, Literal, + NoReturn, cast, ) @@ -143,7 +144,7 @@ def assert_almost_equal( ) -def _check_isinstance(left, right, cls): +def _check_isinstance(left, right, cls) -> None: """ Helper method for our assert_* methods that ensures that the two objects being compared have the right type before @@ -576,7 +577,7 @@ def assert_timedelta_array_equal( def raise_assert_detail( obj, message, left, right, diff=None, first_diff=None, index_values=None -): +) -> NoReturn: __tracebackhide__ = True msg = f"""{obj} are different @@ -664,7 +665,7 @@ def _get_base(obj): if left_base is right_base: raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") - def _raise(left, right, err_msg): + def _raise(left, right, err_msg) -> NoReturn: if err_msg is None: if left.shape != right.shape: raise_assert_detail( diff --git a/pandas/_version.py b/pandas/_version.py index 5d610b5e1ea7e..f8a960630126d 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -386,7 +386,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): return pieces -def plus_or_dot(pieces): +def plus_or_dot(pieces) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 4770f403b1bdb..b5861fbaebe9c 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -695,8 +695,8 @@ def visit_Call(self, node, side=None, **kwargs): if not isinstance(key, ast.keyword): # error: "expr" has no attribute "id" raise ValueError( - "keyword error in function call " # type: ignore[attr-defined] - f"'{node.func.id}'" + "keyword error in function call " + f"'{node.func.id}'" # type: ignore[attr-defined] ) if key.arg: diff --git a/pandas/io/html.py b/pandas/io/html.py index 5d5bf079784be..26e71c9546ffd 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -269,7 +269,7 @@ def _attr_getter(self, obj, attr): # Both lxml and BeautifulSoup have the same implementation: return obj.get(attr) - def _href_getter(self, obj): + def _href_getter(self, obj) -> str | None: """ Return a href if the DOM node contains a child or None. 
@@ -392,7 +392,7 @@ def _parse_tables(self, document, match, attrs): """ raise AbstractMethodError(self) - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: """ Return whether an individual DOM node matches a tag @@ -629,7 +629,7 @@ def _href_getter(self, obj) -> str | None: def _text_getter(self, obj): return obj.text - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: return obj.name == tag def _parse_td(self, row): @@ -758,7 +758,7 @@ def _parse_tables(self, document, match, kwargs): raise ValueError(f"No tables found matching regex {repr(pattern)}") return tables - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: return obj.tag == tag def _build_doc(self): diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ed66e46b300f7..4c490c6b2cda2 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -255,7 +255,7 @@ def __init__( self.is_copy = None self._format_axes() - def _format_axes(self): + def _format_axes(self) -> None: raise AbstractMethodError(self) def write(self) -> str: @@ -287,7 +287,7 @@ def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: else: return self.obj - def _format_axes(self): + def _format_axes(self) -> None: if not self.obj.index.is_unique and self.orient == "index": raise ValueError(f"Series index must be unique for orient='{self.orient}'") @@ -304,7 +304,7 @@ def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: obj_to_write = self.obj return obj_to_write - def _format_axes(self): + def _format_axes(self) -> None: """ Try to format axes if they are datelike. """ @@ -1193,7 +1193,7 @@ def parse(self): self._try_convert_types() return self.obj - def _parse(self): + def _parse(self) -> None: raise AbstractMethodError(self) @final @@ -1217,7 +1217,7 @@ def _convert_axes(self) -> None: new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False) setattr(self.obj, axis_name, new_axis) - def _try_convert_types(self): + def _try_convert_types(self) -> None: raise AbstractMethodError(self) @final diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 66a7ccacf675b..890b22154648e 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -41,7 +41,7 @@ def __init__(self, src: ReadBuffer[bytes], **kwds) -> None: self._parse_kwds() - def _parse_kwds(self): + def _parse_kwds(self) -> None: """ Validates keywords before passing to pyarrow. 
""" @@ -104,7 +104,7 @@ def _get_pyarrow_options(self) -> None: ] = None # PyArrow raises an exception by default elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN: - def handle_warning(invalid_row): + def handle_warning(invalid_row) -> str: warnings.warn( f"Expected {invalid_row.expected_columns} columns, but found " f"{invalid_row.actual_columns}: {invalid_row.text}", @@ -219,7 +219,7 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: raise ValueError(e) return frame - def _validate_usecols(self, usecols): + def _validate_usecols(self, usecols) -> None: if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols): raise ValueError( "The pyarrow engine does not allow 'usecols' to be integer " diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 50611197ad7dd..1139519d2bcd3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1707,7 +1707,7 @@ def info(self) -> str: # ------------------------------------------------------------------------ # private methods - def _check_if_open(self): + def _check_if_open(self) -> None: if not self.is_open: raise ClosedFileError(f"{self._path} file is not open!") diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index e68f4789f0a06..11b2ed0ee7316 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -288,7 +288,7 @@ def close(self) -> None: def _get_row(self): return self.filepath_or_buffer.read(80).decode() - def _read_header(self): + def _read_header(self) -> None: self.filepath_or_buffer.seek(0) # read file header diff --git a/pandas/io/sql.py b/pandas/io/sql.py index b0fa6bc6e90c4..3a58daf681cfb 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1514,7 +1514,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: pass @@ -2073,7 +2073,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: table = SQLTable( table_name, self, @@ -2433,7 +2433,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: raise NotImplementedError("not implemented for adbc") @@ -2879,7 +2879,7 @@ def _create_sql_schema( keys=None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: table = SQLiteTable( table_name, self, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 0f097c6059c7c..a4d8054ea4f8c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -687,7 +687,7 @@ def __init__( self._prepare_value_labels() - def _prepare_value_labels(self): + def _prepare_value_labels(self) -> None: """Encode value labels.""" self.text_len = 0 diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 479a5e19dc1c5..2979903edf360 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -662,7 +662,7 @@ def _ensure_frame(self, data) -> DataFrame: return data @final - def _compute_plot_data(self): + def _compute_plot_data(self) -> None: data = self.data # GH15079 reconstruct data if by is defined @@ -699,7 +699,7 @@ def _compute_plot_data(self): self.data = numeric_data.apply(type(self)._convert_to_ndarray) - def _make_plot(self, fig: Figure): + def _make_plot(self, fig: Figure) -> None: raise AbstractMethodError(self) @final @@ -745,7 +745,7 @@ def _post_plot_logic(self, ax: Axes, data) -> None: """Post process for each axes. 
Overridden in child classes""" @final - def _adorn_subplots(self, fig: Figure): + def _adorn_subplots(self, fig: Figure) -> None: """Common post process unrelated to data""" if len(self.axes) > 0: all_axes = self._get_subplots(fig) @@ -1323,7 +1323,7 @@ def __init__( c = self.data.columns[c] self.c = c - def _make_plot(self, fig: Figure): + def _make_plot(self, fig: Figure) -> None: x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index a47f622216ef7..cb0b4d549f49e 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -26,7 +26,7 @@ BoolishNoneT = TypeVar("BoolishNoneT", bool, int, None) -def _check_arg_length(fname, args, max_fname_arg_count, compat_args): +def _check_arg_length(fname, args, max_fname_arg_count, compat_args) -> None: """ Checks whether 'args' has length of at most 'compat_args'. Raises a TypeError if that is not the case, similar to in Python when a @@ -46,7 +46,7 @@ def _check_arg_length(fname, args, max_fname_arg_count, compat_args): ) -def _check_for_default_values(fname, arg_val_dict, compat_args): +def _check_for_default_values(fname, arg_val_dict, compat_args) -> None: """ Check that the keys in `arg_val_dict` are mapped to their default values as specified in `compat_args`. @@ -125,7 +125,7 @@ def validate_args(fname, args, max_fname_arg_count, compat_args) -> None: _check_for_default_values(fname, kwargs, compat_args) -def _check_for_invalid_keys(fname, kwargs, compat_args): +def _check_for_invalid_keys(fname, kwargs, compat_args) -> None: """ Checks whether 'kwargs' contains any keys that are not in 'compat_args' and raises a TypeError if there is one. diff --git a/requirements-dev.txt b/requirements-dev.txt index cbfb6336b2e16..5a63e59e1db88 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -53,7 +53,7 @@ moto flask asv>=0.6.1 flake8==6.1.0 -mypy==1.7.1 +mypy==1.8.0 tokenize-rt pre-commit>=3.6.0 gitpython From 3c15cfdf0e961c4e8f74a205bac6d34e0930f988 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 27 Dec 2023 05:30:31 -1000 Subject: [PATCH 25/31] TST: Remove arraymanager markers (#56626) --- pandas/tests/copy_view/test_astype.py | 2 -- pandas/tests/copy_view/test_internals.py | 5 ----- pandas/tests/frame/indexing/test_indexing.py | 5 ----- pandas/tests/frame/indexing/test_setitem.py | 6 ------ pandas/tests/frame/methods/test_copy.py | 1 - pandas/tests/frame/methods/test_fillna.py | 4 ---- pandas/tests/frame/methods/test_interpolate.py | 2 +- .../frame/methods/test_is_homogeneous_dtype.py | 5 ----- pandas/tests/frame/methods/test_reindex.py | 3 --- pandas/tests/frame/methods/test_shift.py | 3 --- .../frame/methods/test_to_dict_of_blocks.py | 4 ---- pandas/tests/frame/methods/test_to_numpy.py | 3 --- pandas/tests/frame/methods/test_transpose.py | 4 ---- pandas/tests/frame/methods/test_update.py | 3 --- pandas/tests/frame/methods/test_values.py | 5 ----- pandas/tests/frame/test_arithmetic.py | 2 -- pandas/tests/frame/test_block_internals.py | 6 ------ pandas/tests/frame/test_constructors.py | 4 ---- pandas/tests/frame/test_reductions.py | 2 -- pandas/tests/groupby/test_bin_groupby.py | 3 +-- .../multiindex/test_chaining_and_caching.py | 2 -- pandas/tests/indexing/multiindex/test_partial.py | 5 ----- pandas/tests/indexing/multiindex/test_setitem.py | 6 ------ .../tests/indexing/test_chaining_and_caching.py | 3 --- pandas/tests/indexing/test_iloc.py | 2 -- pandas/tests/indexing/test_loc.py 
| 3 --- pandas/tests/internals/test_internals.py | 5 ----- pandas/tests/io/pytables/test_append.py | 5 ----- pandas/tests/io/test_fsspec.py | 2 -- pandas/tests/io/test_http_headers.py | 2 -- pandas/tests/io/test_pickle.py | 2 -- pandas/tests/reshape/concat/test_concat.py | 4 ---- pandas/tests/series/methods/test_reindex.py | 3 --- pandas/tests/series/test_constructors.py | 4 ---- pandas/util/_test_decorators.py | 16 ---------------- 35 files changed, 2 insertions(+), 134 deletions(-) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index d462ce3d3187d..3c1a157dd2c6a 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -4,7 +4,6 @@ import pytest from pandas.compat.pyarrow import pa_version_under12p0 -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -88,7 +87,6 @@ def test_astype_different_target_dtype(using_copy_on_write, dtype): tm.assert_frame_equal(df2, df_orig.astype(dtype)) -@td.skip_array_manager_invalid_test def test_astype_numpy_to_ea(): ser = Series([1, 2, 3]) with pd.option_context("mode.copy_on_write", True): diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index a727331307d7e..615b024bd06bf 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -1,15 +1,12 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import DataFrame import pandas._testing as tm from pandas.tests.copy_view.util import get_array -@td.skip_array_manager_invalid_test def test_consolidate(using_copy_on_write): # create unconsolidated DataFrame df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) @@ -46,7 +43,6 @@ def test_consolidate(using_copy_on_write): @pytest.mark.single_cpu -@td.skip_array_manager_invalid_test def test_switch_options(): # ensure we can switch the value of the option within one session # (assuming data is constructed after switching) @@ -75,7 +71,6 @@ def test_switch_options(): assert df.iloc[0, 0] == 0 -@td.skip_array_manager_invalid_test @pytest.mark.parametrize("dtype", [np.intp, np.int8]) @pytest.mark.parametrize( "locs, arr", diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 7837adec0c9e0..e3a467e8bf65b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -15,7 +15,6 @@ PerformanceWarning, SettingWithCopyError, ) -import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer @@ -574,7 +573,6 @@ def test_getitem_setitem_integer_slice_keyerrors(self): with pytest.raises(KeyError, match=r"^3$"): df2.loc[3:11] = 0 - @td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view def test_fancy_getitem_slice_mixed( self, float_frame, float_string_frame, using_copy_on_write, warn_copy_on_write ): @@ -640,7 +638,6 @@ def test_getitem_fancy_scalar(self, float_frame): for idx in f.index[::5]: assert ix[idx, col] == ts[idx] - @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values def test_setitem_fancy_scalar(self, float_frame): f = float_frame expected = float_frame.copy() @@ -680,7 +677,6 @@ def test_getitem_fancy_boolean(self, float_frame): expected = f.reindex(index=f.index[boolvec], columns=["C", "D"]) tm.assert_frame_equal(result, expected) - @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using 
.values def test_setitem_fancy_boolean(self, float_frame): # from 2d, set with booleans frame = float_frame.copy() @@ -1404,7 +1400,6 @@ def test_loc_setitem_rhs_frame(self, idxr, val, warn): expected = DataFrame({"a": [np.nan, val]}) tm.assert_frame_equal(df, expected) - @td.skip_array_manager_invalid_test def test_iloc_setitem_enlarge_no_warning(self, warn_copy_on_write): # GH#47381 df = DataFrame(columns=["a", "b"]) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index f031cb2218e31..0e0f8cf61d3d7 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.base import _registry as ea_registry from pandas.core.dtypes.common import is_object_dtype from pandas.core.dtypes.dtypes import ( @@ -704,8 +702,6 @@ def test_setitem_ea_dtype_rhs_series(self): expected = DataFrame({"a": [1, 2]}, dtype="Int64") tm.assert_frame_equal(df, expected) - # TODO(ArrayManager) set column with 2d column array, see #44788 - @td.skip_array_manager_not_yet_implemented def test_setitem_npmatrix_2d(self): # GH#42376 # for use-case df["x"] = sparse.random((10, 10)).mean(axis=1) @@ -1063,7 +1059,6 @@ def inc(x): class TestDataFrameSetItemBooleanMask: - @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values @pytest.mark.parametrize( "mask_type", [lambda df: df > np.abs(df) / 2, lambda df: (df > np.abs(df) / 2).values], @@ -1307,7 +1302,6 @@ def test_setitem_not_operating_inplace(self, value, set_value, indexer): df[indexer] = set_value tm.assert_frame_equal(view, expected) - @td.skip_array_manager_invalid_test def test_setitem_column_update_inplace( self, using_copy_on_write, warn_copy_on_write ): diff --git a/pandas/tests/frame/methods/test_copy.py b/pandas/tests/frame/methods/test_copy.py index e7901ed363106..6208d0256a655 100644 --- a/pandas/tests/frame/methods/test_copy.py +++ b/pandas/tests/frame/methods/test_copy.py @@ -46,7 +46,6 @@ def test_copy(self, float_frame, float_string_frame): copy = float_string_frame.copy() assert copy._mgr is not float_string_frame._mgr - @td.skip_array_manager_invalid_test def test_copy_consolidates(self): # GH#42477 df = DataFrame( diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 6757669351c5c..4131138a7c588 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -3,8 +3,6 @@ from pandas._config import using_pyarrow_string_dtype -import pandas.util._test_decorators as td - from pandas import ( Categorical, DataFrame, @@ -49,7 +47,6 @@ def test_fillna_dict_inplace_nonunique_columns( if not using_copy_on_write: assert tm.shares_memory(df.iloc[:, 2], orig.iloc[:, 2]) - @td.skip_array_manager_not_yet_implemented def test_fillna_on_column_view(self, using_copy_on_write): # GH#46149 avoid unnecessary copies arr = np.full((40, 50), np.nan) @@ -752,7 +749,6 @@ def test_fillna_inplace_with_columns_limit_and_value(self): df.fillna(axis=1, value=100, limit=1, inplace=True) tm.assert_frame_equal(df, expected) - @td.skip_array_manager_invalid_test @pytest.mark.parametrize("val", [-1, {"x": -1, "y": -1}]) def test_inplace_dict_update_view( self, val, using_copy_on_write, warn_copy_on_write diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index a93931a970687..e377fdd635bfe 100644 --- 
a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -470,7 +470,7 @@ def test_interp_string_axis(self, axis_name, axis_number): @pytest.mark.parametrize("multiblock", [True, False]) @pytest.mark.parametrize("method", ["ffill", "bfill", "pad"]) - def test_interp_fillna_methods(self, request, axis, multiblock, method): + def test_interp_fillna_methods(self, axis, multiblock, method): # GH 12918 df = DataFrame( { diff --git a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py index 1fe28cb8eb856..086986702d24f 100644 --- a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py +++ b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py @@ -1,16 +1,11 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( Categorical, DataFrame, ) -# _is_homogeneous_type always returns True for ArrayManager -pytestmark = td.skip_array_manager_invalid_test - @pytest.mark.parametrize( "data, expected", diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index d862e14ce86cb..d2ec84bc9371f 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -13,7 +13,6 @@ is_platform_windows, ) from pandas.compat.numpy import np_version_gt2 -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -136,7 +135,6 @@ class TestDataFrameSelectReindex: reason="Passes int32 values to DatetimeArray in make_na_array on " "windows, 32bit linux builds", ) - @td.skip_array_manager_not_yet_implemented def test_reindex_tzaware_fill_value(self): # GH#52586 df = DataFrame([[1]]) @@ -198,7 +196,6 @@ def test_reindex_copies_ea(self, using_copy_on_write): else: assert not np.shares_memory(result2[0].array._data, df[0].array._data) - @td.skip_array_manager_not_yet_implemented def test_reindex_date_fill_value(self): # passing date to dt64 is deprecated; enforced in 2.0 to cast to object arr = date_range("2016-01-01", periods=6).values.reshape(3, 2) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 907ff67eac7a1..c477c9c1852b7 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( CategoricalIndex, @@ -464,7 +462,6 @@ def test_shift_axis1_multiple_blocks(self): tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) axis=1 support def test_shift_axis1_multiple_blocks_with_int_fill(self): # GH#42719 rng = np.random.default_rng(2) diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index f64cfd5fe6a2d..217010ab2e7ee 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( DataFrame, MultiIndex, @@ -10,8 +8,6 @@ import pandas._testing as tm from pandas.core.arrays import NumpyExtensionArray -pytestmark = td.skip_array_manager_invalid_test - class TestToDictOfBlocks: @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") diff --git a/pandas/tests/frame/methods/test_to_numpy.py 
b/pandas/tests/frame/methods/test_to_numpy.py index bdb9b2c055061..d92af2775922b 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -1,7 +1,5 @@ import numpy as np -import pandas.util._test_decorators as td - from pandas import ( DataFrame, Timestamp, @@ -22,7 +20,6 @@ def test_to_numpy_dtype(self): result = df.to_numpy(dtype="int64") tm.assert_numpy_array_equal(result, expected) - @td.skip_array_manager_invalid_test def test_to_numpy_copy(self, using_copy_on_write): arr = np.random.default_rng(2).standard_normal((4, 3)) df = DataFrame(arr) diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index d0caa071fae1c..45bd8ff0268a8 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( DataFrame, DatetimeIndex, @@ -126,7 +124,6 @@ def test_transpose_mixed(self): for col, s in mixed_T.items(): assert s.dtype == np.object_ - @td.skip_array_manager_invalid_test def test_transpose_get_view(self, float_frame, using_copy_on_write): dft = float_frame.T dft.iloc[:, 5:10] = 5 @@ -136,7 +133,6 @@ def test_transpose_get_view(self, float_frame, using_copy_on_write): else: assert (float_frame.values[5:10] == 5).all() - @td.skip_array_manager_invalid_test def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write): dti = date_range("2016-01-01", periods=6, tz="US/Pacific") arr = dti._data.reshape(3, 2) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 7c7a0d23ff75f..fd4c9d64d656e 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -175,7 +173,6 @@ def test_update_with_different_dtype(self, using_copy_on_write): ) tm.assert_frame_equal(df, expected) - @td.skip_array_manager_invalid_test def test_update_modify_view( self, using_copy_on_write, warn_copy_on_write, using_infer_string ): diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index bbca4ee1b88b1..f1230e55f9054 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( DataFrame, NaT, @@ -15,7 +13,6 @@ class TestDataFrameValues: - @td.skip_array_manager_invalid_test def test_values(self, float_frame, using_copy_on_write): if using_copy_on_write: with pytest.raises(ValueError, match="read-only"): @@ -231,7 +228,6 @@ def test_values_lcd(self, mixed_float_frame, mixed_int_frame): class TestPrivateValues: - @td.skip_array_manager_invalid_test def test_private_values_dt64tz(self, using_copy_on_write): dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1) @@ -249,7 +245,6 @@ def test_private_values_dt64tz(self, using_copy_on_write): df2 = df - df tm.assert_equal(df2._values, tda) - @td.skip_array_manager_invalid_test def test_private_values_dt64tz_multicol(self, using_copy_on_write): dta = date_range("2000", periods=8, tz="US/Central")._data.reshape(-1, 2) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index ecaf826c46d9b..be6ed91973e80 100644 --- 
a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -896,7 +896,6 @@ def test_df_arith_2d_array_rowlike_broadcasts( ): # GH#23000 opname = all_arithmetic_operators - arr = np.arange(6).reshape(3, 2) df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) @@ -919,7 +918,6 @@ def test_df_arith_2d_array_collike_broadcasts( ): # GH#23000 opname = all_arithmetic_operators - arr = np.arange(6).reshape(3, 2) df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 712494ef15f97..22fff2116510a 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -8,7 +8,6 @@ import pytest from pandas.errors import PerformanceWarning -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -26,11 +25,6 @@ # structure -# TODO(ArrayManager) check which of those tests need to be rewritten to test the -# equivalent for ArrayManager -pytestmark = td.skip_array_manager_invalid_test - - class TestDataFrameBlockInternals: def test_setitem_invalidates_datetime_index_freq(self): # GH#24096 altering a datetime64tz column inplace invalidates the diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 8ff69472ea113..aefb0377d1bf4 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -25,7 +25,6 @@ from pandas._libs import lib from pandas.errors import IntCastingNaNError -import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype from pandas.core.dtypes.dtypes import ( @@ -324,7 +323,6 @@ def test_constructor_dtype_nocast_view_2d_array( df2 = DataFrame(df.values, dtype=df[0].dtype) assert df2._mgr.arrays[0].flags.c_contiguous - @td.skip_array_manager_invalid_test @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 @@ -332,7 +330,6 @@ def test_1d_object_array_does_not_copy(self): df = DataFrame(arr, copy=False) assert np.shares_memory(df.values, arr) - @td.skip_array_manager_invalid_test @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 @@ -2489,7 +2486,6 @@ def test_constructor_list_str_na(self, string_dtype): @pytest.mark.parametrize("copy", [False, True]) def test_dict_nocopy( self, - request, copy, any_numeric_ea_dtype, any_numpy_dtype, diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 512b5d6ace469..dd88e7401a03f 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1701,7 +1701,6 @@ def test_reductions_skipna_none_raises( with pytest.raises(ValueError, match=msg): getattr(obj, all_reductions)(skipna=None) - @td.skip_array_manager_invalid_test def test_reduction_timestamp_smallest_unit(self): # GH#52524 df = DataFrame( @@ -1720,7 +1719,6 @@ def test_reduction_timestamp_smallest_unit(self): ) tm.assert_series_equal(result, expected) - @td.skip_array_manager_not_yet_implemented def test_reduction_timedelta_smallest_unit(self): # GH#52524 df = DataFrame( diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 49b2e621b7adc..ac5374597585a 100644 --- 
a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -2,7 +2,6 @@ import pytest from pandas._libs import lib -import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -22,7 +21,7 @@ def cumsum_max(x): "func", [ cumsum_max, - pytest.param(assert_block_lengths, marks=td.skip_array_manager_invalid_test), + assert_block_lengths, ], ) def test_mgr_locs_updated(func): diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 0dd1a56890fee..014ba6fc12b72 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -3,7 +3,6 @@ from pandas._libs import index as libindex from pandas.errors import SettingWithCopyError -import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -43,7 +42,6 @@ def test_detect_chained_assignment(using_copy_on_write, warn_copy_on_write): zed["eyes"]["right"].fillna(value=555, inplace=True) -@td.skip_array_manager_invalid_test # with ArrayManager df.loc[0] is not a view def test_cache_updating(using_copy_on_write, warn_copy_on_write): # 5216 # make sure that we don't try to set a dead cache diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index fdf88b2a97e46..5aff1f1309004 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( DataFrame, DatetimeIndex, @@ -118,9 +116,6 @@ def test_getitem_partial_column_select(self): with pytest.raises(KeyError, match=r"\('a', 'foo'\)"): df.loc[("a", "foo"), :] - # TODO(ArrayManager) rewrite test to not use .values - # exp.loc[2000, 4].values[:] select multiple columns -> .values is not a view - @td.skip_array_manager_invalid_test def test_partial_set( self, multiindex_year_month_day_dataframe_random_data, diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 53ad4d6b41687..22a0a49762097 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -2,7 +2,6 @@ import pytest from pandas.errors import SettingWithCopyError -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -126,9 +125,6 @@ def test_setitem_multiindex3(self): expected=copy, ) - # TODO(ArrayManager) df.loc["bar"] *= 2 doesn't raise an error but results in - # all NaNs -> doesn't work in the "split" path (also for BlockManager actually) - @td.skip_array_manager_not_yet_implemented def test_multiindex_setitem(self): # GH 3738 # setting with a multi-index right hand side @@ -520,8 +516,6 @@ def test_setitem_enlargement_keep_index_names(self): tm.assert_frame_equal(df, expected) -@td.skip_array_manager_invalid_test # df["foo"] select multiple columns -> .values -# is not a view def test_frame_setitem_view_direct( multiindex_dataframe_random_data, using_copy_on_write ): diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index ca796463f4a1e..5eeaa50e2c3b6 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -7,7 +7,6 @@ SettingWithCopyError, SettingWithCopyWarning, ) -import 
pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -539,8 +538,6 @@ def test_detect_chained_assignment_warning_stacklevel( chained[2] = rhs tm.assert_frame_equal(df, df_original) - # TODO(ArrayManager) fast_xs with array-like scalars is not yet working - @td.skip_array_manager_not_yet_implemented def test_chained_getitem_with_lists(self): # GH6394 # Regression in chained getitem indexing with embedded list-like from diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index a1d8577d534f5..13d786f98c42b 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -7,7 +7,6 @@ import pytest from pandas.errors import IndexingError -import pandas.util._test_decorators as td from pandas import ( NA, @@ -1193,7 +1192,6 @@ def test_iloc_setitem_2d_ndarray_into_ea_block(self): expected = DataFrame({"status": ["a", "a", "c"]}, dtype=df["status"].dtype) tm.assert_frame_equal(df, expected) - @td.skip_array_manager_not_yet_implemented def test_iloc_getitem_int_single_ea_block_view(self): # GH#45241 # TODO: make an extension interface test for this? diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index da10555e60301..1aa988cca0400 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -17,7 +17,6 @@ from pandas._libs import index as libindex from pandas.compat.numpy import np_version_gt2 from pandas.errors import IndexingError -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -2640,7 +2639,6 @@ def test_loc_setitem_mask_td64_series_value(self): assert expected == result tm.assert_frame_equal(df, df_copy) - @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values def test_loc_setitem_boolean_and_column(self, float_frame): expected = float_frame.copy() mask = float_frame["A"] > 0 @@ -3315,7 +3313,6 @@ def test_loc_assign_dict_to_row(self, dtype): tm.assert_frame_equal(df, expected) - @td.skip_array_manager_invalid_test def test_loc_setitem_dict_timedelta_multiple_set(self): # GH 16309 result = DataFrame(columns=["time", "value"]) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 2265522bc7ecb..66dd893df51de 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -10,7 +10,6 @@ from pandas._libs.internals import BlockPlacement from pandas.compat import IS64 -import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_scalar @@ -44,10 +43,6 @@ new_block, ) -# this file contains BlockManager specific tests -# TODO(ArrayManager) factor out interleave_dtype tests -pytestmark = td.skip_array_manager_invalid_test - @pytest.fixture(params=[new_block, make_block]) def block_maker(request): diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 00a81a4f1f385..706610316ef43 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -6,7 +6,6 @@ import pytest from pandas._libs.tslibs import Timestamp -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -733,10 +732,6 @@ def test_append_misc_empty_frame(setup_path): tm.assert_frame_equal(store.select("df2"), df) -# TODO(ArrayManager) currently we rely on falling back to BlockManager, but -# the conversion from AM->BM converts the invalid object dtype column into -# a datetime64 column no longer raising an 
error -@td.skip_array_manager_not_yet_implemented def test_append_raise(setup_path): with ensure_clean_store(setup_path) as store: # test append with invalid input to get good error messages diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index a1dec8a2d05b4..f6fb032b9d51a 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -194,7 +194,6 @@ def test_arrowparquet_options(fsspectest): assert fsspectest.test[0] == "parquet_read" -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet def test_fastparquet_options(fsspectest): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -253,7 +252,6 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so): @pytest.mark.single_cpu -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet def test_s3_parquet(s3_public_bucket, s3so, df1): pytest.importorskip("fastparquet") pytest.importorskip("s3fs") diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py index 2ca11ad1f74e6..550637a50c1c4 100644 --- a/pandas/tests/io/test_http_headers.py +++ b/pandas/tests/io/test_http_headers.py @@ -100,11 +100,9 @@ def stata_responder(df): pytest.param( parquetfastparquet_responder, partial(pd.read_parquet, engine="fastparquet"), - # TODO(ArrayManager) fastparquet marks=[ td.skip_if_no("fastparquet"), td.skip_if_no("fsspec"), - td.skip_array_manager_not_yet_implemented, ], ), (pickle_respnder, pd.read_pickle), diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 4f3993a038197..4e1f09b929224 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -37,7 +37,6 @@ ) from pandas.compat._optional import import_optional_dependency from pandas.compat.compressors import flatten_buffer -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -600,7 +599,6 @@ def test_pickle_strings(string_series): tm.assert_series_equal(unp_series, string_series) -@td.skip_array_manager_invalid_test def test_pickle_preserves_block_ndim(): # GH#37631 ser = Series(list("abc")).astype("category").iloc[[0]] diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 2cc91992f1fd7..5ec95cbf24b39 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -10,7 +10,6 @@ import pytest from pandas.errors import InvalidIndexError -import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -773,7 +772,6 @@ def test_concat_retain_attrs(data): assert df.attrs[1] == 1 -@td.skip_array_manager_invalid_test @pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"]) @pytest.mark.parametrize("empty_dtype", [None, "float64", "object"]) def test_concat_ignore_empty_object_float(empty_dtype, df_dtype): @@ -799,7 +797,6 @@ def test_concat_ignore_empty_object_float(empty_dtype, df_dtype): tm.assert_frame_equal(result, expected) -@td.skip_array_manager_invalid_test @pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"]) @pytest.mark.parametrize("empty_dtype", [None, "float64", "object"]) def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype): @@ -827,7 +824,6 @@ def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype): tm.assert_frame_equal(result, expected) -@td.skip_array_manager_invalid_test def test_concat_ignore_empty_from_reindex(): # 
https://github.com/pandas-dev/pandas/pull/43507#issuecomment-920375856 df1 = DataFrame({"a": [1], "b": [pd.Timestamp("2012-01-01")]}) diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 1a4a390da1323..1959e71f60775 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -3,8 +3,6 @@ from pandas._config import using_pyarrow_string_dtype -import pandas.util._test_decorators as td - from pandas import ( NA, Categorical, @@ -315,7 +313,6 @@ def test_reindex_fill_value(): tm.assert_series_equal(result, expected) -@td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) @pytest.mark.parametrize("fill_value", ["string", 0, Timedelta(0)]) def test_reindex_fill_value_datetimelike_upcast(dtype, fill_value): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 866bfb995a6d5..b802e92e4fcca 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -16,7 +16,6 @@ ) from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError -import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import CategoricalDtype @@ -702,7 +701,6 @@ def test_constructor_copy(self): assert x[0] == 2.0 assert y[0] == 1.0 - @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite test @pytest.mark.parametrize( "index", [ @@ -2194,7 +2192,6 @@ def test_constructor_no_pandas_array(self): assert isinstance(result._mgr.blocks[0], NumpyBlock) assert result._mgr.blocks[0].is_numeric - @td.skip_array_manager_invalid_test def test_from_array(self): result = Series(pd.array(["1h", "2h"], dtype="timedelta64[ns]")) assert result._mgr.blocks[0].is_extension is False @@ -2202,7 +2199,6 @@ def test_from_array(self): result = Series(pd.array(["2015"], dtype="datetime64[ns]")) assert result._mgr.blocks[0].is_extension is False - @td.skip_array_manager_invalid_test def test_from_list_dtype(self): result = Series(["1h", "2h"], dtype="timedelta64[ns]") assert result._mgr.blocks[0].is_extension is False diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 2c1912bce856d..37908c9ac255b 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -38,7 +38,6 @@ def test_foo(): if TYPE_CHECKING: from pandas._typing import F -from pandas._config.config import _get_option from pandas.compat import ( IS64, @@ -147,21 +146,6 @@ def documented_fixture(fixture): return documented_fixture -def mark_array_manager_not_yet_implemented(request) -> None: - mark = pytest.mark.xfail(reason="Not yet implemented for ArrayManager") - request.applymarker(mark) - - -skip_array_manager_not_yet_implemented = pytest.mark.xfail( - _get_option("mode.data_manager", silent=True) == "array", - reason="Not yet implemented for ArrayManager", -) - -skip_array_manager_invalid_test = pytest.mark.skipif( - _get_option("mode.data_manager", silent=True) == "array", - reason="Test that relies on BlockManager internals or specific behaviour", -) - skip_copy_on_write_not_yet_implemented = pytest.mark.xfail( get_option("mode.copy_on_write") is True, reason="Not yet implemented/adapted for Copy-on-Write mode", From 0643a1816aa86811885270397454f49995a07d64 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 27 Dec 2023 10:52:01 -0800 Subject: [PATCH 26/31] PERF: resolution, is_normalized (#56637) PERF: resolution --- 
pandas/_libs/tslibs/vectorized.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 0a19092f57706..1e09874639d4f 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -234,7 +234,7 @@ def get_resolution( for i in range(n): # Analogous to: utc_val = stamps[i] - utc_val = cnp.PyArray_GETITEM(stamps, cnp.PyArray_ITER_DATA(it)) + utc_val = (<int64_t*>cnp.PyArray_ITER_DATA(it))[0] if utc_val == NPY_NAT: pass @@ -331,7 +331,7 @@ def is_date_array_normalized(ndarray stamps, tzinfo tz, NPY_DATETIMEUNIT reso) - for i in range(n): # Analogous to: utc_val = stamps[i] - utc_val = cnp.PyArray_GETITEM(stamps, cnp.PyArray_ITER_DATA(it)) + utc_val = (<int64_t*>cnp.PyArray_ITER_DATA(it))[0] local_val = info.utc_val_to_local_val(utc_val, &pos) From 7cba64ee3c10eabc2fc88ce4b52877d6e0e17f3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Wed, 27 Dec 2023 13:54:54 -0500 Subject: [PATCH 27/31] TYP: more simple return types from ruff (#56628) TYP: more return types from ruff --- pandas/core/apply.py | 2 +- pandas/core/array_algos/replace.py | 2 +- pandas/core/arrays/arrow/accessors.py | 2 +- pandas/core/arrays/arrow/extension_types.py | 2 +- pandas/core/arrays/categorical.py | 6 ++++-- pandas/core/arrays/datetimelike.py | 4 ++-- pandas/core/arrays/sparse/accessor.py | 6 +++--- pandas/core/arrays/sparse/array.py | 4 +++- pandas/core/arrays/sparse/scipy_sparse.py | 2 +- pandas/core/arrays/string_.py | 2 +- pandas/core/computation/eval.py | 8 ++++---- pandas/core/computation/ops.py | 2 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/frame.py | 4 ++-- pandas/core/generic.py | 4 ++-- pandas/core/indexes/base.py | 4 ++-- pandas/core/indexes/multi.py | 2 +- pandas/core/indexing.py | 14 ++++++++------ pandas/core/internals/base.py | 2 +- pandas/core/internals/managers.py | 6 ++++-- pandas/core/ops/array_ops.py | 2 +- pandas/core/reshape/concat.py | 2 +- pandas/core/reshape/encoding.py | 2 +- pandas/core/reshape/merge.py | 2 +- pandas/core/reshape/reshape.py | 2 +- pandas/core/window/rolling.py | 6 +++--- pandas/io/formats/style_render.py | 4 ++-- pandas/io/pytables.py | 2 +- 28 files changed, 55 insertions(+), 47 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 25a71ce5b5f4f..784e11415ade6 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -827,7 +827,7 @@ def generate_numba_apply_func( def apply_with_numba(self): pass - def validate_values_for_numba(self): + def validate_values_for_numba(self) -> None: # Validate column dtyps all OK for colname, dtype in self.obj.dtypes.items(): if not is_numeric_dtype(dtype): diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 5f377276be480..60fc172139f13 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -67,7 +67,7 @@ def compare_or_regex_search( def _check_comparison_types( result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern - ): + ) -> None: """ Raises an error if the two arrays (a,b) cannot be compared. Otherwise, returns the comparison result as expected.
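The vectorized.pyx change in patch 26 above is a speedup because cnp.PyArray_GETITEM boxes every element into a Python-level scalar object before the loop can inspect it, whereas casting the iterator's data pointer to int64_t* reads the raw epoch value with no object allocation. A rough pure-Python analogue of that distinction, sketched with plain NumPy rather than the actual Cython internals:

    import numpy as np

    stamps = np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]")

    # Boxed access, akin to PyArray_GETITEM: each element is wrapped in a
    # np.datetime64 scalar object before it can be compared to anything.
    boxed = [stamps[i] for i in range(len(stamps))]

    # Unboxed access, akin to the new pointer cast: reinterpret the same
    # buffer as raw int64 nanosecond epochs and read them directly.
    epochs = stamps.view("i8")

    print(boxed[0], epochs[0])  # 2024-01-01T00:00:00.000000000 1704067200000000000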
diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 7f88267943526..23825faa70095 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -39,7 +39,7 @@ def __init__(self, data, validation_msg: str) -> None: def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: pass - def _validate(self, data): + def _validate(self, data) -> None: dtype = data.dtype if not isinstance(dtype, ArrowDtype): # Raise AttributeError so that inspect can handle non-struct Series. diff --git a/pandas/core/arrays/arrow/extension_types.py b/pandas/core/arrays/arrow/extension_types.py index 72bfd6f2212f8..d52b60df47adc 100644 --- a/pandas/core/arrays/arrow/extension_types.py +++ b/pandas/core/arrays/arrow/extension_types.py @@ -135,7 +135,7 @@ def to_pandas_dtype(self) -> IntervalDtype: """ -def patch_pyarrow(): +def patch_pyarrow() -> None: # starting from pyarrow 14.0.1, it has its own mechanism if not pa_version_under14p1: return diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 065a942cae768..8a88227ad54a3 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2164,7 +2164,9 @@ def __contains__(self, key) -> bool: # ------------------------------------------------------------------ # Rendering Methods - def _formatter(self, boxed: bool = False): + # error: Return type "None" of "_formatter" incompatible with return + # type "Callable[[Any], str | None]" in supertype "ExtensionArray" + def _formatter(self, boxed: bool = False) -> None: # type: ignore[override] # Returning None here will cause format_array to do inference. return None @@ -2890,7 +2892,7 @@ def __init__(self, data) -> None: self._freeze() @staticmethod - def _validate(data): + def _validate(data) -> None: if not isinstance(data.dtype, CategoricalDtype): raise AttributeError("Can only use .cat accessor with a 'category' dtype") diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 11a0c7bf18fcb..e04fcb84d51a0 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2058,7 +2058,7 @@ def freq(self, value) -> None: self._freq = value @final - def _maybe_pin_freq(self, freq, validate_kwds: dict): + def _maybe_pin_freq(self, freq, validate_kwds: dict) -> None: """ Constructor helper to pin the appropriate `freq` attribute. 
Assumes that self._freq is currently set to any freq inferred in @@ -2092,7 +2092,7 @@ def _maybe_pin_freq(self, freq, validate_kwds: dict): @final @classmethod - def _validate_frequency(cls, index, freq: BaseOffset, **kwargs): + def _validate_frequency(cls, index, freq: BaseOffset, **kwargs) -> None: """ Validate that a frequency is compatible with the values of a given Datetime Array/Index or Timedelta Array/Index diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index fc7debb1f31e4..3dd7ebf564ca1 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -30,7 +30,7 @@ def __init__(self, data=None) -> None: self._parent = data self._validate(data) - def _validate(self, data): + def _validate(self, data) -> None: raise NotImplementedError @@ -50,7 +50,7 @@ class SparseAccessor(BaseAccessor, PandasDelegate): array([2, 2, 2]) """ - def _validate(self, data): + def _validate(self, data) -> None: if not isinstance(data.dtype, SparseDtype): raise AttributeError(self._validation_msg) @@ -243,7 +243,7 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate): 0.5 """ - def _validate(self, data): + def _validate(self, data) -> None: dtypes = data.dtypes if not all(isinstance(t, SparseDtype) for t in dtypes): raise AttributeError(self._validation_msg) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 5db77db2a9c66..7a3ea85dde2b4 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1830,7 +1830,9 @@ def __repr__(self) -> str: pp_index = printing.pprint_thing(self.sp_index) return f"{pp_str}\nFill: {pp_fill}\n{pp_index}" - def _formatter(self, boxed: bool = False): + # error: Return type "None" of "_formatter" incompatible with return + # type "Callable[[Any], str | None]" in supertype "ExtensionArray" + def _formatter(self, boxed: bool = False) -> None: # type: ignore[override] # Defer to the formatter from the GenericArrayFormatter calling us. # This will infer the correct formatter from the dtype of the values. 
return None diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 71b71a9779da5..31e09c923d933 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -27,7 +27,7 @@ ) -def _check_is_partition(parts: Iterable, whole: Iterable): +def _check_is_partition(parts: Iterable, whole: Iterable) -> None: whole = set(whole) parts = [set(x) for x in parts] if set.intersection(*parts) != set(): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 00197a150fb97..f451ebc352733 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -364,7 +364,7 @@ def __init__(self, values, copy: bool = False) -> None: self._validate() NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) - def _validate(self): + def _validate(self) -> None: """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index f1fe528de06f8..6313c2e2c98de 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -72,7 +72,7 @@ def _check_engine(engine: str | None) -> str: return engine -def _check_parser(parser: str): +def _check_parser(parser: str) -> None: """ Make sure a valid parser is passed. @@ -91,7 +91,7 @@ def _check_parser(parser: str): ) -def _check_resolvers(resolvers): +def _check_resolvers(resolvers) -> None: if resolvers is not None: for resolver in resolvers: if not hasattr(resolver, "__getitem__"): @@ -102,7 +102,7 @@ def _check_resolvers(resolvers): ) -def _check_expression(expr): +def _check_expression(expr) -> None: """ Make sure an expression is not an empty string @@ -149,7 +149,7 @@ def _convert_expression(expr) -> str: return s -def _check_for_locals(expr: str, stack_level: int, parser: str): +def _check_for_locals(expr: str, stack_level: int, parser: str) -> None: at_top_of_stack = stack_level == 0 not_pandas_parser = parser != "pandas" diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 95ac20ba39edc..9422434b5cde3 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -491,7 +491,7 @@ def stringify(value): v = v.tz_convert("UTC") self.lhs.update(v) - def _disallow_scalar_only_bool_ops(self): + def _disallow_scalar_only_bool_ops(self) -> None: rhs = self.rhs lhs = self.lhs diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7a088bf84c48e..72c33e95f68a0 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -231,7 +231,7 @@ def _maybe_unbox_datetimelike(value: Scalar, dtype: DtypeObj) -> Scalar: return value -def _disallow_mismatched_datetimelike(value, dtype: DtypeObj): +def _disallow_mismatched_datetimelike(value, dtype: DtypeObj) -> None: """ numpy allows np.array(dt64values, dtype="timedelta64[ns]") and vice-versa, but we do not want to allow this, so we need to diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e2e589440bd9..a46e42b9241ff 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4316,7 +4316,7 @@ def _setitem_array(self, key, value): else: self._iset_not_inplace(key, value) - def _iset_not_inplace(self, key, value): + def _iset_not_inplace(self, key, value) -> None: # GH#39510 when setting with df[key] = obj with a list-like key and # list-like value, we 
iterate over those listlikes and set columns # one at a time. This is different from dispatching to @@ -4360,7 +4360,7 @@ def igetitem(obj, i: int): finally: self.columns = orig_columns - def _setitem_frame(self, key, value): + def _setitem_frame(self, key, value) -> None: # support boolean setting with DataFrame input, e.g. # df[df > df2] = 0 if isinstance(key, np.ndarray): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index de25a02c6b37c..91a150c63c5b6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4394,7 +4394,7 @@ def _check_is_chained_assignment_possible(self) -> bool_t: return False @final - def _check_setitem_copy(self, t: str = "setting", force: bool_t = False): + def _check_setitem_copy(self, t: str = "setting", force: bool_t = False) -> None: """ Parameters @@ -4510,7 +4510,7 @@ def __delitem__(self, key) -> None: # Unsorted @final - def _check_inplace_and_allows_duplicate_labels(self, inplace: bool_t): + def _check_inplace_and_allows_duplicate_labels(self, inplace: bool_t) -> None: if inplace and not self.flags.allows_duplicate_labels: raise ValueError( "Cannot specify 'inplace=True' when " diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 88a08dd55f739..d262dcd144d79 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3209,7 +3209,7 @@ def _get_reconciled_name_object(self, other): return self @final - def _validate_sort_keyword(self, sort): + def _validate_sort_keyword(self, sort) -> None: if sort not in [None, False, True]: raise ValueError( "The 'sort' keyword only takes the values of " @@ -6051,7 +6051,7 @@ def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: # by RangeIndex, MultIIndex return self._data.argsort(*args, **kwargs) - def _check_indexing_error(self, key): + def _check_indexing_error(self, key) -> None: if not is_scalar(key): # if key is not a scalar, directly raise an error (the code below # would convert to numpy arrays and raise later any way) - GH29926 diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2a4e027e2b806..56e3899eae6f6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1571,7 +1571,7 @@ def _format_multi( def _get_names(self) -> FrozenList: return FrozenList(self._names) - def _set_names(self, names, *, level=None, validate: bool = True): + def _set_names(self, names, *, level=None, validate: bool = True) -> None: """ Set new names on index. Each name has to be a hashable type. diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 4be7e17035128..a7dd3b486ab11 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -911,7 +911,7 @@ def __setitem__(self, key, value) -> None: iloc = self if self.name == "iloc" else self.obj.iloc iloc._setitem_with_indexer(indexer, value, self.name) - def _validate_key(self, key, axis: AxisInt): + def _validate_key(self, key, axis: AxisInt) -> None: """ Ensure that key is valid for current indexer. 
@@ -1225,7 +1225,7 @@ class _LocIndexer(_LocationIndexer): # Key Checks @doc(_LocationIndexer._validate_key) - def _validate_key(self, key, axis: Axis): + def _validate_key(self, key, axis: Axis) -> None: # valid for a collection of labels (we check their presence later) # slice of labels (where start-end in labels) # slice of integers (only if in the labels) @@ -1572,7 +1572,7 @@ class _iLocIndexer(_LocationIndexer): # ------------------------------------------------------------------- # Key Checks - def _validate_key(self, key, axis: AxisInt): + def _validate_key(self, key, axis: AxisInt) -> None: if com.is_bool_indexer(key): if hasattr(key, "index") and isinstance(key.index, Index): if key.index.inferred_type == "integer": @@ -1783,7 +1783,7 @@ def _get_setitem_indexer(self, key): # ------------------------------------------------------------------- - def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): + def _setitem_with_indexer(self, indexer, value, name: str = "iloc") -> None: """ _setitem_with_indexer is for setting values on a Series/DataFrame using positional indexers. @@ -2038,7 +2038,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): for loc in ilocs: self._setitem_single_column(loc, value, pi) - def _setitem_with_indexer_2d_value(self, indexer, value): + def _setitem_with_indexer_2d_value(self, indexer, value) -> None: # We get here with np.ndim(value) == 2, excluding DataFrame, # which goes through _setitem_with_indexer_frame_value pi = indexer[0] @@ -2060,7 +2060,9 @@ def _setitem_with_indexer_2d_value(self, indexer, value): value_col = value_col.tolist() self._setitem_single_column(loc, value_col, pi) - def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str): + def _setitem_with_indexer_frame_value( + self, indexer, value: DataFrame, name: str + ) -> None: ilocs = self._ensure_iterable_column_indexer(indexer[1]) sub_indexer = list(indexer) diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index ae91f167205a0..8f16a6623c8cb 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -53,7 +53,7 @@ class _AlreadyWarned: - def __init__(self): + def __init__(self) -> None: # This class is used on the manager level to the block level to # ensure that we warn only once. 
The block method can update the # warned_already option without returning a value to keep the diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3719bf1f77f85..5f38720135efa 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1940,13 +1940,15 @@ def _post_setstate(self) -> None: def _block(self) -> Block: return self.blocks[0] + # error: Cannot override writeable attribute with read-only property @property - def _blknos(self): + def _blknos(self) -> None: # type: ignore[override] """compat with BlockManager""" return None + # error: Cannot override writeable attribute with read-only property @property - def _blklocs(self): + def _blklocs(self) -> None: # type: ignore[override] """compat with BlockManager""" return None diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 4b762a359d321..8ccd7c84cb05c 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -591,7 +591,7 @@ def maybe_prepare_scalar_for_op(obj, shape: Shape): } -def _bool_arith_check(op, a: np.ndarray, b): +def _bool_arith_check(op, a: np.ndarray, b) -> None: """ In contrast to numpy, pandas raises an error for certain operations with booleans. diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index aacea92611697..31859c7d04e04 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -765,7 +765,7 @@ def _get_concat_axis(self) -> Index: return concat_axis - def _maybe_check_integrity(self, concat_index: Index): + def _maybe_check_integrity(self, concat_index: Index) -> None: if self.verify_integrity: if not concat_index.is_unique: overlap = concat_index[concat_index.duplicated()].unique() diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 3ed67bb7b7c02..44158227d903b 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -169,7 +169,7 @@ def get_dummies( data_to_encode = data[columns] # validate prefixes and separator to avoid silently dropping cols - def check_len(item, name: str): + def check_len(item, name: str) -> None: if is_list_like(item): if not len(item) == data_to_encode.shape[1]: len_msg = ( diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 690e3c2700c6c..f4903023e8059 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2091,7 +2091,7 @@ def _maybe_require_matching_dtypes( ) -> None: # TODO: why do we do this for AsOfMerge but not the others? 
- def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int): + def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int) -> None: if left.dtype != right.dtype: if isinstance(left.dtype, CategoricalDtype) and isinstance( right.dtype, CategoricalDtype diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 7a49682d7c57c..3493f1c78da91 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -188,7 +188,7 @@ def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: return sorted_values return values - def _make_selectors(self): + def _make_selectors(self) -> None: new_levels = self.new_index_levels # make the mask diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e78bd258c11ff..fa5b84fefb883 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1143,7 +1143,7 @@ class Window(BaseWindow): "method", ] - def _validate(self): + def _validate(self) -> None: super()._validate() if not isinstance(self.win_type, str): @@ -1861,7 +1861,7 @@ class Rolling(RollingAndExpandingMixin): "method", ] - def _validate(self): + def _validate(self) -> None: super()._validate() # we allow rolling on a datetimelike index @@ -2906,7 +2906,7 @@ def _get_window_indexer(self) -> GroupbyIndexer: ) return window_indexer - def _validate_datetimelike_monotonic(self): + def _validate_datetimelike_monotonic(self) -> None: """ Validate that each group in self._on is monotonic """ diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 416b263ba8497..55541e5262719 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -2288,12 +2288,12 @@ def _parse_latex_css_conversion(styles: CSSList) -> CSSList: Ignore conversion if tagged with `--latex` option, skipped if no conversion found. """ - def font_weight(value, arg): + def font_weight(value, arg) -> tuple[str, str] | None: if value in ("bold", "bolder"): return "bfseries", f"{arg}" return None - def font_style(value, arg): + def font_style(value, arg) -> tuple[str, str] | None: if value == "italic": return "itshape", f"{arg}" if value == "oblique": diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1139519d2bcd3..c30238e412450 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3207,7 +3207,7 @@ class SeriesFixed(GenericFixed): name: Hashable @property - def shape(self): + def shape(self) -> tuple[int] | None: try: return (len(self.group.values),) except (TypeError, AttributeError): From bad7db4ae807ef9c03677f5923344d80d24db570 Mon Sep 17 00:00:00 2001 From: Caden Gobat <36030084+cgobat@users.noreply.github.com> Date: Wed, 27 Dec 2023 10:59:52 -0800 Subject: [PATCH 28/31] ENH: Update CFF with publication reference, Zenodo DOI, and other details (#56589) Add McKinney (2010) reference, DOI, team website, & more keywords --- CITATION.cff | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index 741e7e7ac8c85..11f45b0d87ec7 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -3,12 +3,50 @@ title: 'pandas-dev/pandas: Pandas' message: 'If you use this software, please cite it as below.' authors: - name: "The pandas development team" + website: "https://pandas.pydata.org/about/team.html" abstract: "Pandas is a powerful data structures for data analysis, time series, and statistics." 
+doi: 10.5281/zenodo.3509134 license: BSD-3-Clause license-url: "https://github.com/pandas-dev/pandas/blob/main/LICENSE" repository-code: "https://github.com/pandas-dev/pandas" keywords: - python - data science + - flexible + - pandas + - alignment + - data analysis type: software -url: "https://github.com/pandas-dev/pandas" +url: "https://pandas.pydata.org/" +references: + - type: article + authors: + - given-names: Wes + family-names: McKinney + affiliation: AQR Capital Management, LLC + email: wesmckinn@gmail.com + title: Data Structures for Statistical Computing in Python + doi: 10.25080/Majora-92bf1922-00a + license: CC-BY-3.0 + start: 56 + end: 61 + year: 2010 + collection-title: Proceedings of the 9th Python in Science Conference + collection-doi: 10.25080/Majora-92bf1922-012 + collection-type: proceedings + editors: + - given-names: Stéfan + name-particle: van der + family-names: Walt + - given-names: Jarrod + family-names: Millman + conference: + name: 9th Python in Science Conference (SciPy 2010) + city: Austin, TX + country: US + date-start: "2010-06-28" + date-end: "2010-07-03" + keywords: + - data structure + - statistics + - R From 44330e85d0698e787d8a8196e5f8e74d69cbf4e0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 27 Dec 2023 20:02:54 +0100 Subject: [PATCH 29/31] DOC: Fixup CoW userguide (#56636) --- doc/source/user_guide/copy_on_write.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst index 050c3901c3420..a083297925007 100644 --- a/doc/source/user_guide/copy_on_write.rst +++ b/doc/source/user_guide/copy_on_write.rst @@ -317,7 +317,7 @@ you are modifying one object inplace. .. ipython:: python df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df2 = df.reset_index() + df2 = df.reset_index(drop=True) df2.iloc[0, 0] = 100 This creates two objects that share data and thus the setitem operation will trigger a @@ -328,7 +328,7 @@ held by the object. .. ipython:: python df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df = df.reset_index() + df = df.reset_index(drop=True) df.iloc[0, 0] = 100 No copy is necessary in this example. 
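The corrected user-guide examples above condense into a standalone sketch; this assumes pandas 2.x with the Copy-on-Write option available and is an illustration, not part of the patch:

    import pandas as pd

    pd.set_option("mode.copy_on_write", True)

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    df2 = df.reset_index(drop=True)  # df2 shares data with df for now
    df2.iloc[0, 0] = 100             # the write triggers the deferred copy
    print(df.iloc[0, 0], df2.iloc[0, 0])  # 1 100

In the guide's second snippet df itself is rebound to the reset_index result, so no other object references the data and the setitem can proceed without any copy, which is the point the fixed-up text makes.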
From ee8f3358007bb515cefa78726d012a52ab54b634 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 27 Dec 2023 11:09:17 -0800 Subject: [PATCH 30/31] REF: check monotonicity inside _can_use_libjoin (#55342) * REF: fix can_use_libjoin check * DOC: docstring for can_use_libjoin * Make can_use_libjoin checks more-correct * avoid allocating mapping in monotonic cases * fix categorical memory usage tests * catch decimal.InvalidOperation --------- Co-authored-by: Luke Manley --- pandas/_libs/index.pyx | 12 +++++++++++- pandas/core/frame.py | 2 +- pandas/core/indexes/base.py | 20 ++++++++++---------- pandas/tests/extension/test_categorical.py | 5 ----- 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0dc139781f58d..675288e20d1f8 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -43,6 +43,8 @@ from pandas._libs.missing cimport ( is_matching_na, ) +from decimal import InvalidOperation + # Defines shift of MultiIndex codes to avoid negative codes (missing values) multiindex_nulls_shift = 2 @@ -248,6 +250,10 @@ cdef class IndexEngine: @property def is_unique(self) -> bool: + # for why we check is_monotonic_increasing here, see + # https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781 + if self.need_monotonic_check: + self.is_monotonic_increasing if self.need_unique_check: self._do_unique_check() @@ -281,7 +287,7 @@ cdef class IndexEngine: values = self.values self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \ self._call_monotonic(values) - except TypeError: + except (TypeError, InvalidOperation): self.monotonic_inc = 0 self.monotonic_dec = 0 is_strict_monotonic = 0 @@ -843,6 +849,10 @@ cdef class SharedEngine: @property def is_unique(self) -> bool: + # for why we check is_monotonic_increasing here, see + # https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781 + if self.need_monotonic_check: + self.is_monotonic_increasing if self.need_unique_check: arr = self.values.unique() self.unique = len(arr) == len(self.values) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a46e42b9241ff..c24ef4d6d6d42 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3711,7 +3711,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: many repeated values. >>> df['object'].astype('category').memory_usage(deep=True) - 5244 + 5136 """ result = self._constructor_sliced( [c.memory_usage(index=False, deep=deep) for col, c in self.items()], diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d262dcd144d79..166d6946beacf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3382,9 +3382,7 @@ def _union(self, other: Index, sort: bool | None): if ( sort in (None, True) - and self.is_monotonic_increasing - and other.is_monotonic_increasing - and not (self.has_duplicates and other.has_duplicates) + and (self.is_unique or other.is_unique) and self._can_use_libjoin and other._can_use_libjoin ): @@ -3536,12 +3534,7 @@ def _intersection(self, other: Index, sort: bool = False): """ intersection specialized to the case with matching dtypes. 
""" - if ( - self.is_monotonic_increasing - and other.is_monotonic_increasing - and self._can_use_libjoin - and other._can_use_libjoin - ): + if self._can_use_libjoin and other._can_use_libjoin: try: res_indexer, indexer, _ = self._inner_indexer(other) except TypeError: @@ -4980,7 +4973,10 @@ def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]: def _join_monotonic( self, other: Index, how: JoinHow = "left" ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: - # We only get here with matching dtypes and both monotonic increasing + # We only get here with (caller is responsible for ensuring): + # 1) matching dtypes + # 2) both monotonic increasing + # 3) other.is_unique or self.is_unique assert other.dtype == self.dtype assert self._can_use_libjoin and other._can_use_libjoin @@ -5062,6 +5058,10 @@ def _can_use_libjoin(self) -> bool: making a copy. If we cannot, this negates the performance benefit of using libjoin. """ + if not self.is_monotonic_increasing: + # The libjoin functions all assume monotonicity. + return False + if type(self) is Index: # excludes EAs, but include masks, we get here with monotonic # values only, meaning no NA diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 6f33b18b19c51..1b322b1797144 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -75,11 +75,6 @@ def data_for_grouping(): class TestCategorical(base.ExtensionTests): - @pytest.mark.xfail(reason="Memory usage doesn't match") - def test_memory_usage(self, data): - # TODO: Is this deliberate? - super().test_memory_usage(data) - def test_contains(self, data, data_missing): # GH-37867 # na value handling in Categorical.__contains__ is deprecated. From 8b77271393efc678b9897e009c3201e6eeef57fc Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Wed, 27 Dec 2023 14:19:25 -0500 Subject: [PATCH 31/31] DOC: Minor fixups for 2.2.0 whatsnew (#56632) --- doc/source/whatsnew/v2.2.0.rst | 118 ++++++++++++--------------------- 1 file changed, 43 insertions(+), 75 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 5ee94b74c527e..5b955aa45219a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -123,7 +123,7 @@ nullability handling. with pg_dbapi.connect(uri) as conn: df.to_sql("pandas_table", conn, index=False) - # for roundtripping + # for round-tripping with pg_dbapi.connect(uri) as conn: df2 = pd.read_sql("pandas_table", conn) @@ -176,7 +176,7 @@ leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql` .. code-block:: ipython - # for roundtripping + # for round-tripping with pg_dbapi.connect(uri) as conn: df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow") @@ -306,22 +306,21 @@ Other enhancements - :meth:`~DataFrame.to_sql` with method parameter set to ``multi`` works with Oracle on the backend - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`). - :func:`get_dummies` now returning extension dtypes ``boolean`` or ``bool[pyarrow]`` that are compatible with the input dtype (:issue:`56273`) -- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. 
(:issue:`54480`) +- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"`` (:issue:`54480`) - :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`) -- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`) +- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs` (:issue:`54264`) - :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`) +- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) -- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`) -- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`) -- Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`) +- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`) +- Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`) - The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) - .. --------------------------------------------------------------------------- .. _whatsnew_220.notable_bug_fixes: @@ -386,6 +385,8 @@ index levels when joining on two indexes with different levels (:issue:`34133`). left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"])) right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"])) + left + right result = left.join(right) *Old Behavior* @@ -415,15 +416,6 @@ Backwards incompatible API changes Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Some minimum supported versions of dependencies were updated. -If installed, we now require: - -+-----------------+-----------------+----------+---------+ -| Package | Minimum Version | Required | Changed | -+=================+=================+==========+=========+ -| | | X | X | -+-----------------+-----------------+----------+---------+ - For `optional libraries `_ the general recommendation is to use the latest version. 
The following table lists the lowest version per library that is currently being tested throughout the development of pandas. Optional libraries below the lowest tested version may still work, but are not considered supported. @@ -433,8 +425,6 @@ Optional libraries below the lowest tested version may still work, but are not c +=================+=================+=========+ | mypy (dev) | 1.8.0 | X | +-----------------+-----------------+---------+ -| | | X | -+-----------------+-----------------+---------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. @@ -606,20 +596,20 @@ Other Deprecations - Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) - Deprecated accepting a type as an argument in :meth:`Index.view`, call without any arguments instead (:issue:`55709`) - Deprecated allowing non-integer ``periods`` argument in :func:`date_range`, :func:`timedelta_range`, :func:`period_range`, and :func:`interval_range` (:issue:`56036`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict`. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer``. 
(:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer`` (:issue:`54229`) - Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`) - Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) - Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) @@ -692,31 +682,30 @@ Bug fixes Categorical ^^^^^^^^^^^ - :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) -- Bug in :meth:`CategoricalDtype.__eq__` returning false for unordered categorical data with mixed types (:issue:`55468`) -- +- Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`) Datetimelike ^^^^^^^^^^^^ - Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`) - Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`) - Bug in :func:`Series.isin` with :class:`DatetimeTZDtype` dtype and comparison values that are all ``NaT`` incorrectly returning all-``False`` even if the series contains ``NaT`` entries (:issue:`56427`) -- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. 
(:issue:`52093`) +- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame (:issue:`52093`) - Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`) +- Bug in :meth:`.Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`DatetimeIndex.shift` with non-nanosecond resolution incorrectly returning with nanosecond resolution (:issue:`56117`) - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) - Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) - Bug in :meth:`Series.dt.round` with non-nanosecond resolution and ``NaT`` entries incorrectly raising ``OverflowError`` (:issue:`56158`) - Bug in :meth:`Series.fillna` with non-nanosecond resolution dtypes and higher-resolution vector values returning incorrect (internally-corrupted) results (:issue:`56410`) -- Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`Timestamp.unit` being inferred incorrectly from an ISO8601 format string with minute or hour resolution and a timezone offset (:issue:`56208`) -- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. ``datetime64[us]->datetim64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) +- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. 
``datetime64[us]->datetime64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) - Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`) - Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`) - Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`) -- Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in addition or subtraction of very large :class:`.Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) @@ -739,14 +728,12 @@ Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) -- Conversion ^^^^^^^^^^ - Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`) - Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`) - Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`) -- Strings ^^^^^^^ @@ -763,13 +750,12 @@ Strings Interval ^^^^^^^^ -- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`) +- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. 
Additionally the hour, minute and second components will now be shown (:issue:`55015`) - Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`) - Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`) - Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) - Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) - Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`) -- Indexing ^^^^^^^^ @@ -781,25 +767,23 @@ Indexing Missing ^^^^^^^ - Bug in :meth:`DataFrame.update` wasn't updating in-place for tz-aware datetime64 dtypes (:issue:`56227`) -- MultiIndex ^^^^^^^^^^ - Bug in :meth:`MultiIndex.get_indexer` not raising ``ValueError`` when ``method`` provided and index is non-monotonic (:issue:`53452`) -- I/O ^^^ -- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified. (:issue:`56323`) -- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified. (:issue:`55677`) -- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) +- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified (:issue:`56323`) +- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified (:issue:`55677`) +- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning; this now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``quotechar`` was ignored (:issue:`52266`) -- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) -- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a CSV with no headers (:issue:`54459`) +- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when the file contains ``NaN`` or ``Inf`` (:issue:`54564`) - Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`) -- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) +- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string value (:issue:`54994`) - Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`) -- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when string contains annotation (:issue:`55200`) +- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell 
contains an annotation (:issue:`55200`) - Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`) - Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`) @@ -808,12 +792,11 @@ Period - Bug in :class:`PeriodIndex` construction when more than one of ``data``, ``ordinal`` and ``**fields`` are passed failing to raise ``ValueError`` (:issue:`55961`) - Bug in :class:`Period` addition silently wrapping around instead of raising ``OverflowError`` (:issue:`55503`) - Bug in casting from :class:`PeriodDtype` with ``astype`` to ``datetime64`` or :class:`DatetimeTZDtype` with non-nanosecond unit incorrectly returning with nanosecond unit (:issue:`55958`) -- Plotting ^^^^^^^^ -- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) -- Bug in :meth:`DataFrame.plot.scatter` discaring string columns (:issue:`56142`) +- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a Matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) +- Bug in :meth:`DataFrame.plot.scatter` discarding string columns (:issue:`56142`) - Bug in :meth:`Series.plot` when reusing an ``ax`` object failing to raise when a ``how`` keyword is passed (:issue:`55953`) Groupby/resample/rolling @@ -821,9 +804,9 @@ Groupby/resample/rolling - Bug in :class:`.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) - Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`) - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`) - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`) - Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond 
``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`) @@ -845,22 +828,11 @@ Reshaping - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) - Bug in :meth:`DataFrame.pivot_table` where the row margin is incorrect when the columns have numeric names (:issue:`26568`) - Bug in :meth:`DataFrame.pivot` with numeric columns and extension dtype for data (:issue:`56528`) -- Bug in :meth:`DataFrame.stack` and :meth:`Series.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) +- Bug in :meth:`DataFrame.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) Sparse ^^^^^^ - Bug in :meth:`SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) -- - -ExtensionArray -^^^^^^^^^^^^^^ -- -- - -Styler -^^^^^^ -- -- Other ^^^^^ @@ -871,15 +843,11 @@ Other - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`) -- Bug in rendering ``inf`` values inside a a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) +- Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) - Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) - Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`) -.. ***DO NOT USE THIS SECTION*** - -- -- .. --------------------------------------------------------------------------- .. _whatsnew_220.contributors:
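
For illustration of the ``decimal.InvalidOperation`` handling in the monotonicity
check from PATCH 30 above, a minimal sketch (assuming pandas 2.2; the ``Index``
contents are illustrative and not taken from the patch):

.. code-block:: python

    import decimal

    import pandas as pd

    # Object-dtype Index holding Decimal values. Comparisons involving
    # Decimal("NaN") raise decimal.InvalidOperation rather than TypeError.
    idx = pd.Index([decimal.Decimal(2), decimal.Decimal("NaN"), decimal.Decimal(1)])

    # The engine's monotonicity check now catches InvalidOperation alongside
    # TypeError, so this returns False instead of propagating the exception.
    print(idx.is_monotonic_increasing)

    # The same handling is what lets grouping by a mix of Decimal and NA
    # values succeed with sort=True (GH 54847 in the whatsnew above).
    df = pd.DataFrame(
        {
            "key": [decimal.Decimal(1), decimal.Decimal("NaN"), decimal.Decimal(1)],
            "val": [1, 2, 3],
        }
    )
    print(df.groupby("key", sort=True)["val"].sum())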
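
The ``engine="numba"`` addition to :meth:`DataFrame.apply` noted in the whatsnew
can be exercised as follows (a sketch, assuming ``numba`` is installed; here
``raw=True`` hands the function NumPy arrays, which is what the numba engine
compiles against, and the frame shape is arbitrary):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.default_rng(0).standard_normal((100_000, 4)))

    # JIT-compile the row reduction with numba; raw=True passes each row
    # as an ndarray instead of a Series.
    result = df.apply(lambda row: row.sum(), axis=1, engine="numba", raw=True)

    # Should match the plain-Python engine (up to floating-point round-off).
    expected = df.apply(lambda row: row.sum(), axis=1, raw=True)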
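
Similarly, a small sketch of the new ``limit_area`` argument on
:meth:`Series.ffill` mentioned above (values chosen purely for illustration):

.. code-block:: python

    import numpy as np
    import pandas as pd

    ser = pd.Series([np.nan, 1.0, np.nan, np.nan, 3.0, np.nan])

    # Only fill NaNs surrounded by valid values; the leading and trailing
    # NaNs are left alone because they fall "outside" the valid values.
    print(ser.ffill(limit_area="inside"))
    # 0    NaN
    # 1    1.0
    # 2    1.0
    # 3    1.0
    # 4    3.0
    # 5    NaN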
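
Finally, the block of ``to_*`` deprecations above all follow one pattern: every
argument after the first becomes keyword-only. A sketch of what now warns
(the separator value is illustrative):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})

    # Deprecated in 2.2: positional arguments beyond path_or_buf
    # emit a FutureWarning.
    df.to_csv("out.csv", ";")

    # Instead, pass everything after path_or_buf by keyword.
    df.to_csv("out.csv", sep=";")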