Skip to content

Commit f75d700

Browse files
authored
Support loading sequences in read_parquet (#250)
* support sequences in read parquet
* unit tests
* comments
* lint
1 parent b6412b5 commit f75d700

File tree

2 files changed

+27
-3
lines changed
  • src/nested_pandas/nestedframe
  • tests/nested_pandas/nestedframe

2 files changed

+27
-3
lines changed

src/nested_pandas/nestedframe/io.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# typing.Self and "|" union syntax don't exist in Python 3.9
22
from __future__ import annotations
33

4+
from collections.abc import Sequence
5+
46
import pandas as pd
57
import pyarrow as pa
68
import pyarrow.parquet as pq
@@ -82,9 +84,11 @@ def read_parquet(
8284
reject_nesting = [reject_nesting]
8385

8486
# First load through pyarrow
85-
# Check if `data` is a file-like object
86-
if hasattr(data, "read"):
87-
# If `data` is a file-like object, pass it directly to pyarrow
87+
# Check if `data` is a file-like object or a sequence
88+
if hasattr(data, "read") or (
89+
isinstance(data, Sequence) and not isinstance(data, (str, bytes, bytearray))
90+
):
91+
# If `data` is a file-like object or a sequence, pass it directly to pyarrow
8892
table = pq.read_table(data, columns=columns, **kwargs)
8993
else:
9094
# Otherwise, treat `data` as a file path and use UPath

tests/nested_pandas/nestedframe/test_io.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,26 @@ def test_read_parquet():
2727
assert nf.lincc.nest.fields == ["band", "frameworks"]
2828

2929

30+
def test_read_parquet_list():
31+
"""Test reading a list of parquet files with no columns specified"""
32+
# Load in the example files
33+
single_file_nf = read_parquet("tests/test_data/nested.parquet")
34+
nf = read_parquet(["tests/test_data/nested.parquet", "tests/test_data/nested.parquet"])
35+
36+
# Check the columns
37+
assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
38+
39+
# Make sure nested columns were recognized
40+
assert nf.nested_columns == ["nested", "lincc"]
41+
42+
# Check the nested columns
43+
assert nf.nested.nest.fields == ["t", "flux", "band"]
44+
assert nf.lincc.nest.fields == ["band", "frameworks"]
45+
46+
# Check loading list works correctly
47+
assert len(nf) == 2 * len(single_file_nf)
48+
49+
3050
def test_read_parquet_directory():
3151
"""Test reading a parquet file with no columns specified"""
3252
# Load in the example file

0 commit comments

Comments
 (0)