astronomy-commons · smcguire-cmu · May 5, 2025 · Apr 29, 2025 · Apr 29, 2025 · Apr 29, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     "numba>=0.58",
     "numpy<3", 
     "pandas",
+    "nested-pandas>=0.3.8,<0.5.0",
     "pyarrow>=14.0.1",
     "pydantic",
     "scipy",

diff --git a/src/hats/io/file_io/file_io.py b/src/hats/io/file_io/file_io.py
@@ -4,6 +4,7 @@
 from collections.abc import Generator
 from pathlib import Path
 
+import nested_pandas as npd
 import numpy as np
 import pandas as pd
 import pyarrow.dataset as pds
@@ -283,7 +284,7 @@
     return storage_options
 
 
-def read_parquet_file_to_pandas(file_pointer: str | Path | UPath, **kwargs) -> pd.DataFrame:
-    """Reads parquet file(s) to a pandas DataFrame
+def read_parquet_file_to_pandas(file_pointer: str | Path | UPath, **kwargs) -> npd.NestedFrame:
+    """Reads parquet file(s) to a nested-pandas NestedFrame
 
     Args:
@@ -296,17 +297,15 @@
     file_pointer = get_upath(file_pointer)
     # If we are trying to read a directory over http, we need to send the explicit list of files instead.
     # We don't want to get the list unnecessarily because it can be expensive.
-    if isinstance(file_pointer, upath.implementations.http.HTTPPath) and len(file_pointer.suffixes) == 0:
+    if isinstance(file_pointer, upath.implementations.http.HTTPPath) and file_pointer.is_dir():
         file_pointers = [f for f in file_pointer.iterdir() if f.is_file()]
-        storage_options = unnest_headers_for_pandas(file_pointer.storage_options)
-        return pd.read_parquet(
+        return npd.read_parquet(
             file_pointers,
-            storage_options=storage_options,
             filesystem=file_pointer.fs,
             partitioning=None,  # Avoid the ArrowTypeError described in #367
             **kwargs,
         )
-    return pd.read_parquet(
+    return npd.read_parquet(
         file_pointer.path,
         filesystem=file_pointer.fs,
         partitioning=None,  # Avoid the ArrowTypeError described in #367

diff --git a/tests/hats/io/file_io/test_file_io.py b/tests/hats/io/file_io/test_file_io.py
@@ -113,6 +113,7 @@ def test_write_df_to_csv(tmp_path):
 
 def test_read_parquet_data(tmp_path):
     random_df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list("ABCD"))
+    random_df = random_df.convert_dtypes(dtype_backend="pyarrow")
     test_file_path = tmp_path / "test.parquet"
     random_df.to_parquet(test_file_path)
     dataframe = read_parquet_file_to_pandas(test_file_path)