From 9e6faeeaa18d54a09cf115bcaab002f2e5267a4f Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 5 Mar 2025 19:50:18 -0500 Subject: [PATCH 1/6] Allow NestedDtype initialization from pd.ArrowDtype --- docs/tutorials/low_level.ipynb | 11 +++----- src/nested_pandas/series/dtype.py | 34 ++++++++---------------- tests/nested_pandas/series/test_dtype.py | 7 +++-- 3 files changed, 19 insertions(+), 33 deletions(-) diff --git a/docs/tutorials/low_level.ipynb b/docs/tutorials/low_level.ipynb index e0004a56..3ba68762 100644 --- a/docs/tutorials/low_level.ipynb +++ b/docs/tutorials/low_level.ipynb @@ -340,16 +340,11 @@ { "cell_type": "code", "execution_count": null, - "id": "422e719861ae40f6", - "metadata": { - "ExecuteTime": { - "end_time": "2025-03-05T20:34:52.352751Z", - "start_time": "2025-03-05T20:34:52.350143Z" - } - }, + "id": "da7788cc04b78a2a", + "metadata": {}, "outputs": [], "source": [ - "nested_series.equals(pd.Series(struct_series, dtype=NestedDtype.from_pandas_arrow_dtype(struct_series.dtype)))" + "nested_series.equals(pd.Series(struct_series, dtype=NestedDtype(struct_series.dtype)))" ] }, { diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py index a798235e..2d1fea4e 100644 --- a/src/nested_pandas/series/dtype.py +++ b/src/nested_pandas/series/dtype.py @@ -20,7 +20,14 @@ @register_extension_dtype class NestedDtype(ExtensionDtype): - """Data type to handle packed time series data""" + """Data type to handle packed time series data + + Parameters + ---------- + pyarrow_dtype : pyarrow.StructType or pd.ArrowDtype + The pyarrow data type to use for the nested type. It must be a struct + type where all fields are list types. + """ # ExtensionDtype overrides # @@ -135,7 +142,9 @@ def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ExtensionArray: pyarrow_dtype: pa.StructType - def __init__(self, pyarrow_dtype: pa.DataType) -> None: + def __init__(self, pyarrow_dtype: pa.DataType | pd.ArrowDtype) -> None: + if isinstance(pyarrow_dtype, pd.ArrowDtype): + pyarrow_dtype = pyarrow_dtype.pyarrow_dtype self.pyarrow_dtype = self._validate_dtype(pyarrow_dtype) @classmethod @@ -193,27 +202,6 @@ def field_names(self) -> list[str]: """The list of field names of the nested type""" return [field.name for field in self.pyarrow_dtype] - @classmethod - def from_pandas_arrow_dtype(cls, pandas_arrow_dtype: ArrowDtype): - """Construct NestedDtype from a pandas.ArrowDtype. - - Parameters - ---------- - pandas_arrow_dtype : ArrowDtype - The pandas.ArrowDtype to construct NestedDtype from. - - Returns - ------- - NestedDtype - The constructed NestedDtype. - - Raises - ------ - ValueError - If the given dtype is not a valid nested type. - """ - return cls(pyarrow_dtype=pandas_arrow_dtype.pyarrow_dtype) - def to_pandas_arrow_dtype(self) -> ArrowDtype: """Convert NestedDtype to a pandas.ArrowDtype. 
diff --git a/tests/nested_pandas/series/test_dtype.py b/tests/nested_pandas/series/test_dtype.py index 2b6b3b2b..07369534 100644 --- a/tests/nested_pandas/series/test_dtype.py +++ b/tests/nested_pandas/series/test_dtype.py @@ -20,8 +20,9 @@ ) def test_from_pyarrow_dtype(pyarrow_dtype): """Test that we can construct NestedDtype from pyarrow struct type.""" - dtype = NestedDtype(pyarrow_dtype) - assert dtype.pyarrow_dtype == pyarrow_dtype + dtype1 = NestedDtype(pyarrow_dtype) + dtype2 = NestedDtype(pd.ArrowDtype(pyarrow_dtype)) + assert dtype1.pyarrow_dtype == dtype2.pyarrow_dtype == pyarrow_dtype @pytest.mark.parametrize( @@ -39,6 +40,8 @@ def test_from_pyarrow_dtype_raises(pyarrow_dtype): """Test that we raise an error when constructing NestedDtype from invalid pyarrow type.""" with pytest.raises(ValueError): NestedDtype(pyarrow_dtype) + with pytest.raises(ValueError): + NestedDtype(pd.ArrowDtype(pyarrow_dtype)) def test_to_pandas_arrow_dtype(): From 105af32aa9cc4ffd13b00479796a7188111db503 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 5 Mar 2025 20:55:16 -0500 Subject: [PATCH 2/6] NestedDtype.inner_dtypes --- src/nested_pandas/series/dtype.py | 97 ++++++++++++++++++++---- tests/nested_pandas/series/test_dtype.py | 43 +++++++++-- 2 files changed, 122 insertions(+), 18 deletions(-) diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py index 2d1fea4e..f8590085 100644 --- a/src/nested_pandas/series/dtype.py +++ b/src/nested_pandas/series/dtype.py @@ -27,11 +27,27 @@ class NestedDtype(ExtensionDtype): pyarrow_dtype : pyarrow.StructType or pd.ArrowDtype The pyarrow data type to use for the nested type. It must be a struct type where all fields are list types. + inner_dtypes : Mapping[str, object] or None, default None + A mapping of field names and their inner types. This will be used to: + 1. Cast to the correct types when getting flat representations + of the nested fields. + 2. To handle information of the double-nested fields, you should use + this NestedDtype for the inner types in this case. + Dtypes must be pandas-recognisable types, such as Python native types, + numpy dtypes or extension array dtypes. Please wrap pyarrow types with + pd.ArrowDtype. + We trust these dtypes and make no attempt to validate them when + casting. + If None, all inner types are assumed to be the same as the + corresponding list element types. 
""" # ExtensionDtype overrides # - _metadata = ("pyarrow_dtype",) + _metadata = ( + "pyarrow_dtype", + "inner_dtypes", + ) """Attributes to use as metadata for __eq__ and __hash__""" @property @@ -45,7 +61,12 @@ def na_value(self) -> Type[pd.NA]: @property def name(self) -> str: """The string representation of the nested type""" - fields = ", ".join([f"{field.name}: [{field.type.value_type!s}]" for field in self.pyarrow_dtype]) + # Replace pd.ArrowDtype with pa.DataType, because it has nicer __str__ + nice_dtypes = { + field: dtype.pyarrow_dtype if isinstance(dtype, pd.ArrowDtype) else dtype + for field, dtype in self.fields.items() + } + fields = ", ".join([f"{field}: [{dtype!s}]" for field, dtype in nice_dtypes.items()]) return f"nested<{fields}>" @classmethod @@ -141,21 +162,26 @@ def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ExtensionArray: # Additional methods and attributes # pyarrow_dtype: pa.StructType + inner_dtypes: dict[str, object] - def __init__(self, pyarrow_dtype: pa.DataType | pd.ArrowDtype) -> None: + def __init__( + self, pyarrow_dtype: pa.DataType | pd.ArrowDtype, inner_dtypes: Mapping[str, object] | None = None + ) -> None: if isinstance(pyarrow_dtype, pd.ArrowDtype): pyarrow_dtype = pyarrow_dtype.pyarrow_dtype self.pyarrow_dtype = self._validate_dtype(pyarrow_dtype) + self.inner_dtypes = self._validate_inner_dtypes(self.pyarrow_dtype, inner_dtypes) @classmethod - def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore[name-defined] # noqa: F821 + def from_fields(cls, fields: Mapping[str, pa.DataType | Self]) -> Self: # type: ignore[name-defined] # noqa: F821 """Make NestedDtype from a mapping of field names and list item types. Parameters ---------- - fields : Mapping[str, pa.DataType] - A mapping of field names and their item types. Since all fields are lists, the item types are - inner types of the lists, not the list types themselves. + fields : Mapping[str, pa.DataType | NestedDtype] + A mapping of field names and their item types. Since all fields are + lists, the item types are inner types of the lists, not the list + types themselves. Returns ------- @@ -172,9 +198,15 @@ def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore ... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())}) ... 
) """ - pyarrow_dtype = pa.struct({field: pa.list_(pa_type) for field, pa_type in fields.items()}) - pyarrow_dtype = cast(pa.StructType, pyarrow_dtype) - return cls(pyarrow_dtype=pyarrow_dtype) + pa_fields = {} + inner_dtypes = {} + for field, dtype in fields.items(): + if isinstance(dtype, NestedDtype): + inner_dtypes[field] = dtype + dtype = dtype.pyarrow_dtype + pa_fields[field] = dtype + pyarrow_dtype = pa.struct({field: pa.list_(pa_type) for field, pa_type in pa_fields.items()}) + return cls(pyarrow_dtype=pyarrow_dtype, inner_dtypes=inner_dtypes or None) @staticmethod def _validate_dtype(pyarrow_dtype: pa.DataType) -> pa.StructType: @@ -192,10 +224,49 @@ def _validate_dtype(pyarrow_dtype: pa.DataType) -> pa.StructType: ) return pyarrow_dtype + @staticmethod + def _validate_inner_dtypes( + pyarrow_dtype: pa.StructType, inner_dtypes: Mapping[str, object] | None + ) -> dict[str, object]: + # Short circuit if there are no inner dtypes + if inner_dtypes is None: + return {} + + inner_dtypes = dict(inner_dtypes) + + for field_name, inner_dtype in inner_dtypes.items(): + if field_name not in pyarrow_dtype.names: + raise ValueError(f"Field '{field_name}' not found in the pyarrow struct type.") + element_type = pyarrow_dtype[field_name].type.value_type + test_series = pd.Series([], dtype=pd.ArrowDtype(element_type)) + try: + _ = test_series.astype(inner_dtype) + except TypeError as e: + raise TypeError( + f"Could not cast the inner dtype '{inner_dtype}' for field '{field_name}' to the" + f" corresponding element type '{element_type}'. {e}" + ) from e + return inner_dtypes + + def inner_dtype(self, field: str) -> object: + """Get the inner dtype for a field. + + Parameters + ---------- + field : str + The field name. + + Returns + ------- + object + The inner dtype for the field. 
+ """ + return self.inner_dtypes.get(field, pd.ArrowDtype(self.pyarrow_dtype[field].type.value_type)) + @property - def fields(self) -> dict[str, pa.DataType]: - """The mapping of field names and their item types.""" - return {field.name: field.type.value_type for field in self.pyarrow_dtype} + def fields(self) -> dict[str, object]: + """The mapping of field names and pandas dtypes of their items""" + return {field.name: self.inner_dtype(field.name) for field in self.pyarrow_dtype} @property def field_names(self) -> list[str]: diff --git a/tests/nested_pandas/series/test_dtype.py b/tests/nested_pandas/series/test_dtype.py index 07369534..a463d0e2 100644 --- a/tests/nested_pandas/series/test_dtype.py +++ b/tests/nested_pandas/series/test_dtype.py @@ -54,12 +54,25 @@ def test_to_pandas_arrow_dtype(): def test_from_fields(): """Test NestedDtype.from_fields().""" - fields = {"a": pa.int64(), "b": pa.float64()} - dtype = NestedDtype.from_fields(fields) - assert dtype.pyarrow_dtype == pa.struct( + fields1 = {"a": pa.int64(), "b": pa.float64()} + dtype1 = NestedDtype.from_fields(fields1) + assert dtype1.pyarrow_dtype == pa.struct( [pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))] ) + fields2 = {"x": pa.string(), "y": pa.bool_(), "nested": dtype1} + dtype2 = NestedDtype.from_fields(fields2) + assert dtype2 == NestedDtype( + pa.struct( + [ + pa.field("x", pa.list_(pa.string())), + pa.field("y", pa.list_(pa.bool_())), + pa.field("nested", pa.list_(dtype1.pyarrow_dtype)), + ] + ), + inner_dtypes={"nested": dtype1}, + ) + def test_na_value(): """Test that NestedDtype.na_value is a singleton instance of NAType.""" @@ -69,10 +82,30 @@ def test_na_value(): def test_fields(): """Test NestedDtype.fields property""" - dtype = NestedDtype( + dtype1 = NestedDtype( pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) ) - assert dtype.fields == {"a": pa.int64(), "b": pa.float64()} + assert dtype1.fields == {"a": pd.ArrowDtype(pa.int64()), "b": pd.ArrowDtype(pa.float64())} + + dtype2 = NestedDtype( + pa.struct( + [ + pa.field("x", pa.list_(pa.float64())), + pa.field("y", pa.list_(pa.string())), + pa.field("nested", pa.list_(dtype1.pyarrow_dtype)), + ] + ), + inner_dtypes={"x": pd.Float64Dtype(), "nested": dtype1}, + ) + assert dtype2.fields == {"x": pd.Float64Dtype(), "y": pd.ArrowDtype(pa.string()), "nested": dtype1} + + # field name missmatch + with pytest.raises(ValueError): + NestedDtype(pa.struct([pa.field("a", pa.list_(pa.int64()))]), inner_dtypes={"xyz": pa.int64()}) + + # element type is not compatible with inner dtype + with pytest.raises(TypeError): + NestedDtype(pa.struct([pa.field("a", pa.list_(pa.int64()))]), inner_dtypes={"a": pa.bool_()}) def test_field_names(): From c7cd6d9a40ad22b73d1d7b640c44bf1255fe2082 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 5 Mar 2025 21:41:59 -0500 Subject: [PATCH 3/6] .nest.to_flat to respect inner_dtypes --- src/nested_pandas/series/accessor.py | 4 ++-- src/nested_pandas/series/ext_array.py | 17 +++++++++----- tests/nested_pandas/series/test_accessor.py | 25 +++++++++------------ 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py index 2ae9afc8..75c464c1 100644 --- a/src/nested_pandas/series/accessor.py +++ b/src/nested_pandas/series/accessor.py @@ -109,7 +109,7 @@ def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame: index=pd.Series(index, name=self._series.index.name), name=field, 
copy=False, - dtype=pd.ArrowDtype(chunked_array.type), + dtype=self._series.dtype.inner_dtype(field), ) return pd.DataFrame(flat_series) @@ -292,7 +292,7 @@ def get_flat_series(self, field: str) -> pd.Series: return pd.Series( flat_chunked_array, - dtype=pd.ArrowDtype(flat_chunked_array.type), + dtype=self._series.dtype.inner_dtype(field), index=self.get_flat_index(), name=field, copy=False, diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py index 43735dbb..ba9c109f 100644 --- a/src/nested_pandas/series/ext_array.py +++ b/src/nested_pandas/series/ext_array.py @@ -35,7 +35,7 @@ # typing.Self and "|" union syntax don't exist in Python 3.9 from __future__ import annotations -from collections.abc import Generator, Iterable, Iterator, Sequence +from collections.abc import Generator, Iterable, Iterator, Mapping, Sequence from typing import Any, Callable, cast import numpy as np @@ -212,7 +212,7 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: # Parameters ---------- scalars : Sequence - The sequence of scalars: disctionaries, DataFrames, None, pd.NA, pa.Array or anything convertible + The sequence of scalars: dictionaries, DataFrames, None, pd.NA, pa.Array or anything convertible to PyArrow scalars. dtype : dtype or None dtype of the resulting array @@ -223,7 +223,8 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: # pa_type = to_pyarrow_dtype(dtype) pa_array = cls._box_pa_array(scalars, pa_type=pa_type) - return cls(pa_array) + inner_dtypes = dtype.inner_dtypes if isinstance(dtype, NestedDtype) else None + return cls(pa_array, inner_dtypes=inner_dtypes) # Tricky to implement, but required by things like pd.read_csv @classmethod @@ -655,7 +656,13 @@ def _convert_struct_scalar_to_df(cls, value: pa.StructScalar, *, copy: bool, na_ _chunked_array: pa.ChunkedArray _dtype: NestedDtype - def __init__(self, values: pa.Array | pa.ChunkedArray, *, validate: bool = True) -> None: + def __init__( + self, + values: pa.Array | pa.ChunkedArray, + *, + inner_dtypes: Mapping[str, object] | None = None, + validate: bool = True, + ) -> None: if isinstance(values, pa.Array): values = pa.chunked_array([values]) @@ -670,7 +677,7 @@ def __init__(self, values: pa.Array | pa.ChunkedArray, *, validate: bool = True) self._validate(values) self._chunked_array = values - self._dtype = NestedDtype(values.type) + self._dtype = NestedDtype(values.type, inner_dtypes=inner_dtypes) @property def _list_array(self) -> pa.ChunkedArray: diff --git a/tests/nested_pandas/series/test_accessor.py b/tests/nested_pandas/series/test_accessor.py index 37b385a8..17cdcb10 100644 --- a/tests/nested_pandas/series/test_accessor.py +++ b/tests/nested_pandas/series/test_accessor.py @@ -148,14 +148,16 @@ def test_to_flat(): """Test that the .nest.to_flat() method works.""" struct_array = pa.StructArray.from_arrays( arrays=[ - pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]), - pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]), + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0]), np.array([1.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0]), [None]]), ], names=["a", "b"], ) series = pd.Series( - struct_array, dtype=NestedDtype(struct_array.type), index=pd.Series([0, 1], name="idx") + struct_array, + dtype=NestedDtype(struct_array.type, inner_dtypes={"b": pd.Float64Dtype()}), + index=pd.Series([0, 1, 2], name="idx"), ) flat = series.nest.to_flat() @@ -164,28 +166,23 @@ def test_to_flat(): 
data={ "a": pd.Series( data=[1.0, 2.0, 3.0, 1.0, 2.0, 1.0], - index=[0, 0, 0, 1, 1, 1], + index=[0, 0, 0, 1, 1, 2], name="a", copy=False, dtype=pd.ArrowDtype(pa.float64()), ), "b": pd.Series( - data=[-4.0, -5.0, -6.0, -3.0, -4.0, -5.0], - index=[0, 0, 0, 1, 1, 1], + data=[-4.0, -5.0, -6.0, -3.0, -4.0, None], + index=[0, 0, 0, 1, 1, 2], name="b", copy=False, - dtype=pd.ArrowDtype(pa.float64()), + dtype=pd.Float64Dtype(), ), }, - index=pd.Index([0, 0, 0, 1, 1, 1], name="idx"), + index=pd.Index([0, 0, 0, 1, 1, 2], name="idx"), ) - assert_array_equal(flat.dtypes, desired.dtypes) - assert_array_equal(flat.index, desired.index) - assert flat.index.name == desired.index.name - - for column in flat.columns: - assert_array_equal(flat[column], desired[column]) + assert_frame_equal(flat, desired) def test_to_flat_for_chunked_array(): From 7b3d22ae3d6355c97799cb617ce8db6a6b9cad0b Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 12 Mar 2025 16:49:30 -0400 Subject: [PATCH 4/6] Allow ArrowDtype in NestedDtype.from_fields --- src/nested_pandas/series/dtype.py | 6 ++++-- tests/nested_pandas/series/test_dtype.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py index f8590085..5e95f118 100644 --- a/src/nested_pandas/series/dtype.py +++ b/src/nested_pandas/series/dtype.py @@ -173,7 +173,7 @@ def __init__( self.inner_dtypes = self._validate_inner_dtypes(self.pyarrow_dtype, inner_dtypes) @classmethod - def from_fields(cls, fields: Mapping[str, pa.DataType | Self]) -> Self: # type: ignore[name-defined] # noqa: F821 + def from_fields(cls, fields: Mapping[str, pa.DataType | pa.ArrowDtype | Self]) -> Self: # type: ignore[name-defined] # noqa: F821 """Make NestedDtype from a mapping of field names and list item types. 
Parameters @@ -204,6 +204,8 @@ def from_fields(cls, fields: Mapping[str, pa.DataType | Self]) -> Self: # type: if isinstance(dtype, NestedDtype): inner_dtypes[field] = dtype dtype = dtype.pyarrow_dtype + elif isinstance(dtype, pd.ArrowDtype): + dtype = dtype.pyarrow_dtype pa_fields[field] = dtype pyarrow_dtype = pa.struct({field: pa.list_(pa_type) for field, pa_type in pa_fields.items()}) return cls(pyarrow_dtype=pyarrow_dtype, inner_dtypes=inner_dtypes or None) @@ -229,7 +231,7 @@ def _validate_inner_dtypes( pyarrow_dtype: pa.StructType, inner_dtypes: Mapping[str, object] | None ) -> dict[str, object]: # Short circuit if there are no inner dtypes - if inner_dtypes is None: + if inner_dtypes is None or len(inner_dtypes) == 0: return {} inner_dtypes = dict(inner_dtypes) diff --git a/tests/nested_pandas/series/test_dtype.py b/tests/nested_pandas/series/test_dtype.py index a463d0e2..1fde0bc2 100644 --- a/tests/nested_pandas/series/test_dtype.py +++ b/tests/nested_pandas/series/test_dtype.py @@ -54,7 +54,7 @@ def test_to_pandas_arrow_dtype(): def test_from_fields(): """Test NestedDtype.from_fields().""" - fields1 = {"a": pa.int64(), "b": pa.float64()} + fields1 = {"a": pa.int64(), "b": pd.ArrowDtype(pa.float64())} dtype1 = NestedDtype.from_fields(fields1) assert dtype1.pyarrow_dtype == pa.struct( [pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))] From c9f0c29bc75b839dc57d5ead022aa4a4d1c995de Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Sat, 5 Apr 2025 16:15:47 -0400 Subject: [PATCH 5/6] Handle and derive inner_dtypes --- src/nested_pandas/series/ext_array.py | 131 ++++++++++++++++--- src/nested_pandas/series/packer.py | 27 +++- tests/nested_pandas/series/test_ext_array.py | 58 ++++++-- tests/nested_pandas/series/test_packer.py | 14 ++ 4 files changed, 195 insertions(+), 35 deletions(-) diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py index ba9c109f..e5318abb 100644 --- a/src/nested_pandas/series/ext_array.py +++ b/src/nested_pandas/series/ext_array.py @@ -179,9 +179,32 @@ def replace_with_mask(array: pa.ChunkedArray, mask: pa.BooleanArray, value: pa.A return pa.compute.if_else(mask, broadcast_value, array) -def convert_df_to_pa_scalar(df: pd.DataFrame, *, pa_type: pa.DataType | None) -> pa.Scalar: - d = {column: series.values for column, series in df.to_dict("series").items()} - return pa.scalar(d, type=pa_type, from_pandas=True) +def convert_df_to_pa_scalar( + df: pd.DataFrame, *, pa_type: pa.DataType | None +) -> tuple[pa.StructScalar, dict[str, NestedDtype]]: + """Convert a pandas DataFrame to a PyArrow StructScalar + + Parameters + ---------- + df : pd.DataFrame + The DataFrame to be converted + pa_type : pa.DataType | None + The PyArrow data type to be used for the scalar. + If None, the data type will be inferred from the DataFrame. + + Returns + ------- + pa.StructScalar + The PyArrow StructScalar representing the DataFrame + dict[str, object] + Pandas dtypes of the DataFrame columns which we'd like to cast the result to. 
+ """ + d = { + column: series.values.to_pyarrow_scalar() if isinstance(series.dtype, NestedDtype) else series.values + for column, series in df.to_dict("series").items() + } + inner_dtypes = {column: dtype for column, dtype in df.dtypes.items() if isinstance(dtype, NestedDtype)} + return pa.scalar(d, type=pa_type, from_pandas=True), inner_dtypes class NestedExtensionArray(ExtensionArray): @@ -222,8 +245,13 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: # del copy pa_type = to_pyarrow_dtype(dtype) - pa_array = cls._box_pa_array(scalars, pa_type=pa_type) - inner_dtypes = dtype.inner_dtypes if isinstance(dtype, NestedDtype) else None + pa_array, infered_inner_dtypes = cls._box_pa_array(scalars, pa_type=pa_type) + if isinstance(dtype, NestedDtype): + inner_dtypes = dtype.inner_dtypes + elif len(infered_inner_dtypes) > 1: + inner_dtypes = infered_inner_dtypes + else: + inner_dtypes = None return cls(pa_array, inner_dtypes=inner_dtypes) # Tricky to implement, but required by things like pd.read_csv @@ -298,10 +326,10 @@ def __setitem__(self, key, value) -> None: # Try to convert to struct_scalar first, if it fails, convert to array try: - scalar = self._box_pa_scalar(value, pa_type=self._pyarrow_dtype) + scalar, _ = self._box_pa_scalar(value, pa_type=self._pyarrow_dtype) except (ValueError, TypeError): # Copy will happen later in replace_with_mask() anyway - value = self._box_pa_array(value, pa_type=self._pyarrow_dtype) + value, _ = self._box_pa_array(value, pa_type=self._pyarrow_dtype) else: # Our replace_with_mask implementation doesn't work with scalars value = pa.array([scalar] * pa.compute.sum(pa_mask).as_py()) @@ -460,7 +488,7 @@ def take( raise IndexError("out of bounds value in 'indices'.") if allow_fill: - fill_value = self._box_pa_scalar(fill_value, pa_type=self._pyarrow_dtype) + fill_value, _inner_dtypes = self._box_pa_scalar(fill_value, pa_type=self._pyarrow_dtype) fill_mask = indices_array < 0 if not fill_mask.any(): @@ -595,23 +623,60 @@ def __setstate__(self, state): # End of Additional magic methods # @classmethod - def _box_pa_scalar(cls, value, *, pa_type: pa.DataType | None) -> pa.Scalar: - """Convert a value to a PyArrow scalar with the specified type.""" - if isinstance(value, pa.Scalar): + def _box_pa_scalar( + cls, value, *, pa_type: pa.DataType | None + ) -> tuple[pa.StructScalar, dict[str, NestedDtype]]: + """Convert a value to a PyArrow scalar with the specified type. + + Parameters + ---------- + value: convertible to a PyArrow scalar + The value to be converted. + pa_type: PyArrow data type or None (default: None) + The type to which the value should be converted. If None, + the type is inferred from the value. + + Returns + ------- + pa.StructScalar + The converted PyArrow scalar. + dict[str, object] + Pandas datatypes of the scalar struct-fields. 
+ """ + empty_inner_dtypes = cast(dict[str, NestedDtype], {}) + if isinstance(value, (pa.StructScalar, pa.NullScalar)): if pa_type is None: - return value - return value.cast(pa_type) + return value, empty_inner_dtypes + return value.cast(pa_type), empty_inner_dtypes if value is pd.NA or value is None: - return pa.scalar(None, type=pa_type, from_pandas=True) + return pa.scalar(None, type=pa_type, from_pandas=True), empty_inner_dtypes if isinstance(value, pd.DataFrame): return convert_df_to_pa_scalar(value, pa_type=pa_type) - return pa.scalar(value, type=pa_type, from_pandas=True) + return pa.scalar(value, type=pa_type, from_pandas=True), empty_inner_dtypes @classmethod - def _box_pa_array(cls, value, *, pa_type: pa.DataType | None) -> pa.Array | pa.ChunkedArray: - """Convert a value to a PyArrow array with the specified type.""" + def _box_pa_array( + cls, value, *, pa_type: pa.DataType | None + ) -> tuple[pa.Array | pa.ChunkedArray, dict[str, object]]: + """Convert a value to a PyArrow array with the specified type. + + Parameters + ---------- + value + Value to convert + pa_type : pyarrow.DataType or None + Pyarrow type to cast to. If None it will be derived + + Returns + ------- + pyarrow.Array or pyarrow.ChunkedArray + The result array + dict of inferred inner dtypes + """ + inner_dtypes: dict[str, object] = {} if isinstance(value, cls): pa_array = value._chunked_array + inner_dtypes = value.dtype.inner_dtypes.copy() elif isinstance(value, (pa.Array, pa.ChunkedArray)): pa_array = value else: @@ -619,11 +684,14 @@ def _box_pa_array(cls, value, *, pa_type: pa.DataType | None) -> pa.Array | pa.C pa_array = pa.array(value, type=pa_type) except (ValueError, TypeError, KeyError): scalars: list[pa.Scalar] = [] + # Pandas dtypes to cast the result Series to. Currently NestedDtype only. for v in value: # If pa_type is not specified, then cast to the first non-null type if pa_type is None and len(scalars) > 0 and not isinstance(scalars[-1], pa.NullScalar): pa_type = scalars[-1].type - scalars.append(cls._box_pa_scalar(v, pa_type=pa_type)) + scalar, dtypes = cls._box_pa_scalar(v, pa_type=pa_type) + scalars.append(scalar) + inner_dtypes.update(dtypes) # We recast the scalars to the specified type. # Logically, we should 1) have `pa_type is not None` here, # 2) only "head" null-scalars to be not cast to the specified type. @@ -633,13 +701,15 @@ def _box_pa_array(cls, value, *, pa_type: pa.DataType | None) -> pa.Array | pa.C scalars = [s.cast(pa_type) for s in scalars] pa_array = pa.array(scalars) # We already copied the data into scalars + else: + inner_dtypes = {} # We always cast - even if the type is the same, it does not hurt - # If the type is different the result may still be a view, so we do not set copy=False + # If the type is different, the result array may still be a view, so we do not set copy=False if pa_type is not None: pa_array = pa_array.cast(pa_type) - return pa_array + return pa_array, inner_dtypes @classmethod def _convert_struct_scalar_to_df(cls, value: pa.StructScalar, *, copy: bool, na_value: Any = None) -> Any: @@ -757,11 +827,32 @@ def to_arrow_ext_array(self, list_struct: bool = False) -> ArrowExtensionArray: list_struct : bool, optional If False (default), return struct-list array, otherwise return list-struct array. 
+ + Returns + ------- + pandas.ArrowExtensionArray """ if list_struct: return ArrowExtensionArray(self._list_array) return ArrowExtensionArray(self._chunked_array) + def to_pyarrow_scalar(self, list_struct: bool = False) -> pa.ListScalar: + """Convert to a pyarrow scalar of a list type + + Parameters + ---------- + list_struct : bool, optional + If False (default), return list-struct-list scalar, + otherwise list-list-struct scalar. + + Returns + ------- + pyarrow.ListScalar + """ + pa_array = self._list_array if list_struct else self._chunked_array + pa_type = pa.list_(pa_array.type) + return cast(pa.ListScalar, pa.scalar(pa_array, type=pa_type)) + def _replace_chunked_array(self, pa_array: pa.ChunkedArray, *, validate: bool) -> None: if validate: self._validate(pa_array) diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py index 73dc58ce..c9a092ae 100644 --- a/src/nested_pandas/series/packer.py +++ b/src/nested_pandas/series/packer.py @@ -7,7 +7,7 @@ # "|" for python 3.9 from __future__ import annotations -from collections.abc import Sequence +from collections.abc import Mapping, Sequence import numpy as np import pandas as pd @@ -158,12 +158,22 @@ def pack_sorted_df_into_struct(df: pd.DataFrame, name: str | None = None) -> pd. raise ValueError("The index of the input dataframe must be sorted") packed_df = view_sorted_df_as_list_arrays(df) + + # Handle columns which are already nested in the input dataframe + inner_dtypes = {str(col): dtype for col, dtype in df.dtypes.items() if isinstance(dtype, NestedDtype)} + # No need to validate the dataframe, the length of the nested arrays is forced to be the same by # the view_sorted_df_as_list_arrays function. - return pack_lists(packed_df, name=name, validate=False) + return pack_lists(packed_df, name=name, validate=False, inner_dtypes=inner_dtypes) -def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = True) -> pd.Series: +def pack_lists( + df: pd.DataFrame, + name: str | None = None, + *, + validate: bool = True, + inner_dtypes: Mapping[str, object] | None = None, +) -> pd.Series: """Make a series of arrow structures from a dataframe with nested arrays. For the input dataframe with repeated indexes, make a pandas.Series, @@ -184,6 +194,9 @@ def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = Tr Name of the pd.Series. validate : bool, default True Whether to validate the input dataframe. + inner_dtypes : mapping of field names to pandas dtypes to cast to, optional + The dtypes to cast the inner arrays to. If not provided, the dtypes + may be inferred from the input arrays. 
Returns ------- @@ -225,11 +238,19 @@ def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = Tr ) ext_array = NestedExtensionArray(struct_array, validate=validate) + + # Put nested dtypes of the input dataframe into the output series dtype + # Prefer inferred dtypes over what we previously detected in pack_sorted_df_into_struct + inferred_dtype = ext_array.dtype + inner_dtypes = dict(inner_dtypes or {}) + dtype = NestedDtype(inferred_dtype.pyarrow_dtype, inner_dtypes=inner_dtypes | inferred_dtype.inner_dtypes) + return pd.Series( ext_array, index=df.index, copy=False, name=name, + dtype=dtype, ) diff --git a/tests/nested_pandas/series/test_ext_array.py b/tests/nested_pandas/series/test_ext_array.py index 391e06cc..c0dc7aef 100644 --- a/tests/nested_pandas/series/test_ext_array.py +++ b/tests/nested_pandas/series/test_ext_array.py @@ -5,8 +5,9 @@ import pyarrow as pa import pyarrow.compute as pc import pytest -from nested_pandas import NestedDtype +from nested_pandas import NestedDtype, NestedFrame from nested_pandas.series.ext_array import NestedExtensionArray, convert_df_to_pa_scalar, replace_with_mask +from nested_pandas.series.packer import pack_flat from numpy.testing import assert_array_equal from pandas.core.arrays import ArrowExtensionArray from pandas.testing import assert_frame_equal, assert_series_equal @@ -278,24 +279,28 @@ def test_series_built_from_dict(): def test_convert_df_to_pa_scalar(): """Test that we can convert a DataFrame to a pyarrow struct_scalar.""" df = pd.DataFrame({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}) - pa_scalar = convert_df_to_pa_scalar(df, pa_type=None) - - assert pa_scalar == pa.scalar( + actual, inner_dtypes = convert_df_to_pa_scalar(df, pa_type=None) + expected = pa.scalar( {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}, type=pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]), ) + assert actual == expected + assert inner_dtypes == {} + def test_convert_df_to_pa_from_scalar(): """Test that we can convert a DataFrame to a pyarrow struct_scalar.""" df = pd.DataFrame({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}) - pa_scalar = convert_df_to_pa_scalar(df, pa_type=None) - - assert pa_scalar == pa.scalar( + actual, inner_dtypes = convert_df_to_pa_scalar(df, pa_type=None) + expected = pa.scalar( {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}, type=pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]), ) + assert actual == expected + assert inner_dtypes == {} + def test__box_pa_array_from_series_of_df(): """Test that we can convert a DataFrame to a pyarrow scalar.""" @@ -305,7 +310,9 @@ def test__box_pa_array_from_series_of_df(): pd.DataFrame({"a": [1, 2, 1], "b": [-3.0, -4.0, -5.0]}), ] ) - list_of_dicts = list(NestedExtensionArray._box_pa_array(series, pa_type=None)) + pa_array, inner_dtypes = NestedExtensionArray._box_pa_array(series, pa_type=None) + assert inner_dtypes == {} + list_of_dicts = list(pa_array) desired_type = pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]) @@ -321,7 +328,9 @@ def test__box_pa_array_from_list_of_df(): pd.DataFrame({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}), pd.DataFrame({"a": [1, 2, 1], "b": [-3.0, -4.0, -5.0]}), ] - list_of_dicts = list(NestedExtensionArray._box_pa_array(list_of_dfs, pa_type=None)) + pa_array, inner_dtypes = NestedExtensionArray._box_pa_array(list_of_dfs, pa_type=None) + assert inner_dtypes == {} + list_of_dicts = list(pa_array) desired_type = pa.struct([pa.field("a", pa.list_(pa.int64())), 
pa.field("b", pa.list_(pa.float64()))]) @@ -1155,8 +1164,9 @@ def test___array__(): ) def test__box_pa_scalar(value, pa_type, desired): """Tests _box_pa_scalar()""" - actual = NestedExtensionArray._box_pa_scalar(value, pa_type=pa_type) + actual, inner_dtypes = NestedExtensionArray._box_pa_scalar(value, pa_type=pa_type) assert actual == desired + assert inner_dtypes == {} @pytest.mark.parametrize( @@ -1203,8 +1213,32 @@ def test__box_pa_scalar(value, pa_type, desired): ) def test__box_pa_array(value, pa_type, desired): """Tests _box_pa_array""" - actual = NestedExtensionArray._box_pa_array(value, pa_type=pa_type) - assert actual == desired + pa_array, inner_dtypes = NestedExtensionArray._box_pa_array(value, pa_type=pa_type) + assert pa_array == desired + assert inner_dtypes == {} + + +def test__box_pa_array_from_nested_frames(): + """Tests _box_pa_array for a collection of nested-frames""" + + nf1 = NestedFrame( + { + "base": pd.Series([1, 2, 3]), + "nested": pack_flat( + pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [6, 7, 8, 9, 10]}, index=[0, 0, 1, 1, 2]) + ), + } + ) + nf2 = NestedFrame( + { + "base": pd.Series([-1, -2, -3, -4]), + "nested": pack_flat( + pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [6, 7, 8, 9, 10]}, index=[0, 0, 1, 2, 3]) + ), + } + ) + + pa_array, dtypes = NestedExtensionArray._box_pa_array([nf1, nf2], pa_type=None) def test_series_apply_udf_argument(): diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py index df59c0f8..8e27db91 100644 --- a/tests/nested_pandas/series/test_packer.py +++ b/tests/nested_pandas/series/test_packer.py @@ -3,6 +3,7 @@ import pyarrow as pa import pytest from nested_pandas import NestedDtype +from nested_pandas.datasets import generate_data from nested_pandas.series import packer from numpy.testing import assert_array_equal from pandas.testing import assert_frame_equal, assert_series_equal @@ -221,6 +222,19 @@ def test_pack_flat_with_on(): assert_series_equal(actual, desired) +def test_pack_flat_with_nested(): + """Test pack_flat when input already has nested columns.""" + df = generate_data(10, 3) + index = [0, 0, 1, 1, 2, 2, 2, 2, 0, 0] + df.index = index + actual = packer.pack_flat(df) + + desired_dtype = NestedDtype.from_fields( + {col: t if isinstance(t, NestedDtype) else pa.from_numpy_dtype(t) for col, t in df.dtypes.items()} + ) + assert actual.dtype == desired_dtype, f"{actual.dtype.name} != {desired_dtype.name}" + + def test_pack_sorted_df_into_struct(): """Test pack_sorted_df_into_struct().""" df = pd.DataFrame( From 85cdd7e12c64068159a4ce8ad7bdb688439432e0 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Mon, 7 Apr 2025 10:51:47 -0400 Subject: [PATCH 6/6] Fix a typo in variable name --- src/nested_pandas/series/ext_array.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py index e5318abb..e4ab6616 100644 --- a/src/nested_pandas/series/ext_array.py +++ b/src/nested_pandas/series/ext_array.py @@ -245,11 +245,11 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: # del copy pa_type = to_pyarrow_dtype(dtype) - pa_array, infered_inner_dtypes = cls._box_pa_array(scalars, pa_type=pa_type) + pa_array, inferred_inner_dtypes = cls._box_pa_array(scalars, pa_type=pa_type) if isinstance(dtype, NestedDtype): inner_dtypes = dtype.inner_dtypes - elif len(infered_inner_dtypes) > 1: - inner_dtypes = infered_inner_dtypes + elif len(inferred_inner_dtypes) > 1: + inner_dtypes = 
inferred_inner_dtypes else: inner_dtypes = None return cls(pa_array, inner_dtypes=inner_dtypes)
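
Below is a rough usage sketch of the API these patches introduce, assembled from the tests in the series above. It is illustrative only and not part of the patches; the variable names (`inner`, `outer`) are placeholders.

    import pandas as pd
    import pyarrow as pa
    from nested_pandas import NestedDtype

    # Item types may be given as pyarrow types or pd.ArrowDtype (patch 4/6).
    inner = NestedDtype.from_fields({"a": pa.int64(), "b": pd.ArrowDtype(pa.float64())})

    # inner_dtypes overrides the pandas dtype used for a field's flat
    # representation and carries double-nested fields (patch 2/6).
    outer = NestedDtype(
        pa.struct(
            [
                pa.field("x", pa.list_(pa.float64())),
                pa.field("nested", pa.list_(inner.pyarrow_dtype)),
            ]
        ),
        inner_dtypes={"x": pd.Float64Dtype(), "nested": inner},
    )

    # .fields now maps field names to pandas dtypes; fields without an
    # explicit inner dtype fall back to pd.ArrowDtype of the list element type.
    assert outer.fields == {"x": pd.Float64Dtype(), "nested": inner}
    assert outer.inner_dtype("x") == pd.Float64Dtype()
    # .nest.to_flat() and .nest.get_flat_series() cast to these dtypes (patch 3/6).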