Skip to content

Commit c771a8b

Browse files
authored
Merge pull request #246 from lincc-frameworks/sean/read_pandas_kwargs
Support kwargs and loading from directories in `read_parquet`
2 parents 71828ad + c15ace3 commit c771a8b

File tree

3 files changed

+43
-6
lines changed

3 files changed

+43
-6
lines changed

src/nested_pandas/nestedframe/io.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def read_parquet(
1414
data: str | UPath | bytes,
1515
columns: list[str] | None = None,
1616
reject_nesting: list[str] | str | None = None,
17+
**kwargs,
1718
) -> NestedFrame:
1819
"""
1920
Load a parquet object from a file path into a NestedFrame.
@@ -35,6 +36,8 @@ def read_parquet(
3536
is castable to a nested column. However, this assumption is invalid if
3637
the lists within the struct have mismatched lengths for any given item.
3738
Columns specified here will be read using the corresponding pandas.ArrowDtype.
39+
kwargs: dict
40+
Keyword arguments passed to `pyarrow.parquet.read_table`
3841
3942
Returns
4043
-------
@@ -82,12 +85,12 @@ def read_parquet(
8285
# Check if `data` is a file-like object
8386
if hasattr(data, "read"):
8487
# If `data` is a file-like object, pass it directly to pyarrow
85-
table = pq.read_table(data, columns=columns)
88+
table = pq.read_table(data, columns=columns, **kwargs)
8689
else:
8790
# Otherwise, treat `data` as a file path and use UPath
8891
path = UPath(data)
89-
with path.open("rb") as f:
90-
table = pq.read_table(f, columns=columns)
92+
filesystem = kwargs.pop("filesystem", path.fs)
93+
table = pq.read_table(path.path, columns=columns, filesystem=filesystem, **kwargs)
9194

9295
# Resolve partial loading of nested structures
9396
# Using pyarrow to avoid naming conflicts from partial loading ("flux" vs "lc.flux")
@@ -147,9 +150,7 @@ def read_parquet(
147150
# Convert to NestedFrame
148151
# not zero-copy, but reduce memory pressure via the self_destruct kwarg
149152
# https://arrow.apache.org/docs/python/pandas.html#reducing-memory-use-in-table-to-pandas
150-
df = NestedFrame(
151-
table.to_pandas(types_mapper=lambda ty: pd.ArrowDtype(ty), split_blocks=True, self_destruct=True)
152-
)
153+
df = NestedFrame(table.to_pandas(types_mapper=pd.ArrowDtype, split_blocks=True, self_destruct=True))
153154
del table
154155
# Attempt to cast struct columns to NestedDTypes
155156
df = _cast_struct_cols_to_nested(df, reject_nesting)

tests/nested_pandas/e2e_tests/test_issue89.py

+2
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,13 @@ def test_issue89():
1616
object_ndf = npd.read_parquet(
1717
f"{catalogs_dir}/ztf_object/Norder=3/Dir=0/Npix=432.parquet",
1818
columns=["ra", "dec", "ps1_objid"],
19+
partitioning=None,
1920
).set_index("ps1_objid")
2021

2122
source_ndf = npd.read_parquet(
2223
f"{catalogs_dir}/ztf_source/Norder=6/Dir=20000/Npix=27711.parquet",
2324
columns=["mjd", "mag", "magerr", "band", "ps1_objid", "catflags"],
25+
partitioning=None,
2426
).set_index("ps1_objid")
2527

2628
object_ndf = object_ndf.add_nested(source_ndf, "ztf_source")

tests/nested_pandas/nestedframe/test_io.py

+34
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from nested_pandas import read_parquet
99
from nested_pandas.datasets import generate_data
1010
from pandas.testing import assert_frame_equal
11+
from upath import UPath
1112

1213

1314
def test_read_parquet():
@@ -26,6 +27,39 @@ def test_read_parquet():
2627
assert nf.lincc.nest.fields == ["band", "frameworks"]
2728

2829

30+
def test_read_parquet_directory():
31+
"""Test reading a parquet file with no columns specified"""
32+
# Load in the example file
33+
nf = read_parquet("tests/test_data")
34+
35+
# Check the columns
36+
assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
37+
38+
# Make sure nested columns were recognized
39+
assert nf.nested_columns == ["nested", "lincc"]
40+
41+
# Check the nested columns
42+
assert nf.nested.nest.fields == ["t", "flux", "band"]
43+
assert nf.lincc.nest.fields == ["band", "frameworks"]
44+
45+
46+
def test_read_parquet_directory_with_filesystem():
47+
"""Test reading a parquet file with no columns specified"""
48+
# Load in the example file
49+
path = UPath("tests/test_data")
50+
nf = read_parquet(path.path, filesystem=path.fs)
51+
52+
# Check the columns
53+
assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
54+
55+
# Make sure nested columns were recognized
56+
assert nf.nested_columns == ["nested", "lincc"]
57+
58+
# Check the nested columns
59+
assert nf.nested.nest.fields == ["t", "flux", "band"]
60+
assert nf.lincc.nest.fields == ["band", "frameworks"]
61+
62+
2963
def test_file_object_read_parquet():
3064
"""Test reading parquet from a file-object"""
3165
with open("tests/test_data/nested.parquet", "rb") as f:

0 commit comments

Comments
 (0)