@@ -14,6 +14,7 @@ def read_parquet(
14
14
data : str | UPath | bytes ,
15
15
columns : list [str ] | None = None ,
16
16
reject_nesting : list [str ] | str | None = None ,
17
+ ** kwargs ,
17
18
) -> NestedFrame :
18
19
"""
19
20
Load a parquet object from a file path into a NestedFrame.
@@ -35,6 +36,8 @@ def read_parquet(
35
36
is castable to a nested column. However, this assumption is invalid if
36
37
the lists within the struct have mismatched lengths for any given item.
37
38
Columns specified here will be read using the corresponding pandas.ArrowDtype.
39
+ kwargs: dict
40
+ Keyword arguments passed to `pyarrow.parquet.read_table`. When `data` is a path, a `filesystem` entry overrides the filesystem inferred from that path.
38
41
39
42
Returns
40
43
-------
@@ -82,12 +85,12 @@ def read_parquet(
82
85
# Check if `data` is a file-like object
83
86
if hasattr (data , "read" ):
84
87
# If `data` is a file-like object, pass it directly to pyarrow
85
- table = pq .read_table (data , columns = columns )
88
+ table = pq .read_table (data , columns = columns , ** kwargs )
86
89
else :
87
90
# Otherwise, treat `data` as a file path and use UPath
88
91
path = UPath (data )
89
- with path . open ( "rb" ) as f :
90
- table = pq .read_table (f , columns = columns )
92
+ filesystem = kwargs . pop ( "filesystem" , path . fs )
93
+ table = pq .read_table (path . path , columns = columns , filesystem = filesystem , ** kwargs )
91
94
92
95
# Resolve partial loading of nested structures
93
96
# Using pyarrow to avoid naming conflicts from partial loading ("flux" vs "lc.flux")
@@ -147,9 +150,7 @@ def read_parquet(
147
150
# Convert to NestedFrame
148
151
# not zero-copy, but reduce memory pressure via the self_destruct kwarg
149
152
# https://arrow.apache.org/docs/python/pandas.html#reducing-memory-use-in-table-to-pandas
150
- df = NestedFrame (
151
- table .to_pandas (types_mapper = lambda ty : pd .ArrowDtype (ty ), split_blocks = True , self_destruct = True )
152
- )
153
+ df = NestedFrame (table .to_pandas (types_mapper = pd .ArrowDtype , split_blocks = True , self_destruct = True ))
153
154
del table
154
155
# Attempt to cast struct columns to NestedDTypes
155
156
df = _cast_struct_cols_to_nested (df , reject_nesting )
0 commit comments