Merge pull request #246 from lincc-frameworks/sean/read_pandas_kwargs

smcguire-cmu · web-flow · commit c771a8b712f1 · 2025-04-28T13:02:36.000-04:00
Support kwargs and loading from directories in `read_parquet`
diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py
@@ -14,6 +14,7 @@ def read_parquet(
     data: str | UPath | bytes,
     columns: list[str] | None = None,
     reject_nesting: list[str] | str | None = None,
+    **kwargs,
 ) -> NestedFrame:
     """
     Load a parquet object from a file path into a NestedFrame.
@@ -35,6 +36,8 @@ def read_parquet(
         is castable to a nested column. However, this assumption is invalid if
         the lists within the struct have mismatched lengths for any given item.
         Columns specified here will be read using the corresponding pandas.ArrowDtype.
+    kwargs: dict
+        Keyword arguments passed to `pyarrow.parquet.read_table`
 
     Returns
     -------
@@ -82,12 +85,12 @@ def read_parquet(
     # Check if `data` is a file-like object
     if hasattr(data, "read"):
         # If `data` is a file-like object, pass it directly to pyarrow
-        table = pq.read_table(data, columns=columns)
+        table = pq.read_table(data, columns=columns, **kwargs)
     else:
         # Otherwise, treat `data` as a file path and use UPath
         path = UPath(data)
-        with path.open("rb") as f:
-            table = pq.read_table(f, columns=columns)
+        filesystem = kwargs.pop("filesystem", path.fs)
+        table = pq.read_table(path.path, columns=columns, filesystem=filesystem, **kwargs)
 
     # Resolve partial loading of nested structures
     # Using pyarrow to avoid naming conflicts from partial loading ("flux" vs "lc.flux")
@@ -147,9 +150,7 @@ def read_parquet(
     # Convert to NestedFrame
     # not zero-copy, but reduce memory pressure via the self_destruct kwarg
     # https://arrow.apache.org/docs/python/pandas.html#reducing-memory-use-in-table-to-pandas
-    df = NestedFrame(
-        table.to_pandas(types_mapper=lambda ty: pd.ArrowDtype(ty), split_blocks=True, self_destruct=True)
-    )
+    df = NestedFrame(table.to_pandas(types_mapper=pd.ArrowDtype, split_blocks=True, self_destruct=True))
     del table
     # Attempt to cast struct columns to NestedDTypes
     df = _cast_struct_cols_to_nested(df, reject_nesting)
diff --git a/tests/nested_pandas/e2e_tests/test_issue89.py b/tests/nested_pandas/e2e_tests/test_issue89.py
@@ -16,11 +16,13 @@ def test_issue89():
     object_ndf = npd.read_parquet(
         f"{catalogs_dir}/ztf_object/Norder=3/Dir=0/Npix=432.parquet",
         columns=["ra", "dec", "ps1_objid"],
+        partitioning=None,
     ).set_index("ps1_objid")
 
     source_ndf = npd.read_parquet(
         f"{catalogs_dir}/ztf_source/Norder=6/Dir=20000/Npix=27711.parquet",
         columns=["mjd", "mag", "magerr", "band", "ps1_objid", "catflags"],
+        partitioning=None,
     ).set_index("ps1_objid")
 
     object_ndf = object_ndf.add_nested(source_ndf, "ztf_source")
diff --git a/tests/nested_pandas/nestedframe/test_io.py b/tests/nested_pandas/nestedframe/test_io.py
@@ -8,6 +8,7 @@
 from nested_pandas import read_parquet
 from nested_pandas.datasets import generate_data
 from pandas.testing import assert_frame_equal
+from upath import UPath
 
 
 def test_read_parquet():
@@ -26,6 +27,39 @@ def test_read_parquet():
     assert nf.lincc.nest.fields == ["band", "frameworks"]
 
 
+def test_read_parquet_directory():
+    """Test reading a directory of parquet files with no columns specified"""
+    # Load in the example directory
+    nf = read_parquet("tests/test_data")
+
+    # Check the columns
+    assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
+
+    # Make sure nested columns were recognized
+    assert nf.nested_columns == ["nested", "lincc"]
+
+    # Check the nested columns
+    assert nf.nested.nest.fields == ["t", "flux", "band"]
+    assert nf.lincc.nest.fields == ["band", "frameworks"]
+
+
+def test_read_parquet_directory_with_filesystem():
+    """Test reading a parquet directory with an explicitly provided filesystem"""
+    # Load in the example directory, passing the filesystem explicitly
+    path = UPath("tests/test_data")
+    nf = read_parquet(path.path, filesystem=path.fs)
+
+    # Check the columns
+    assert nf.columns.tolist() == ["a", "flux", "nested", "lincc"]
+
+    # Make sure nested columns were recognized
+    assert nf.nested_columns == ["nested", "lincc"]
+
+    # Check the nested columns
+    assert nf.nested.nest.fields == ["t", "flux", "band"]
+    assert nf.lincc.nest.fields == ["band", "frameworks"]
+
+
 def test_file_object_read_parquet():
     """Test reading parquet from a file-object"""
     with open("tests/test_data/nested.parquet", "rb") as f: