6 changes: 6 additions & 0 deletions docs/source/package_reference/loading_methods.mdx
@@ -103,6 +103,12 @@ load_dataset("csv", data_dir="path/to/data/dir", sep="\t")

[[autodoc]] datasets.packaged_modules.pdffolder.PdfFolder

### Nifti

[[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolderConfig

[[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolder

### WebDataset

[[autodoc]] datasets.packaged_modules.webdataset.WebDataset
4 changes: 4 additions & 0 deletions docs/source/package_reference/main_classes.mdx
@@ -271,6 +271,10 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable

[[autodoc]] datasets.Pdf

### Nifti

[[autodoc]] datasets.Nifti

## Filesystems

[[autodoc]] datasets.filesystems.is_remote_filesystem
1 change: 1 addition & 0 deletions src/datasets/config.py
@@ -139,6 +139,7 @@
TORCHCODEC_AVAILABLE = importlib.util.find_spec("torchcodec") is not None
TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None
NIBABEL_AVAILABLE = importlib.util.find_spec("nibabel") is not None

# Optional compression tools
RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
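A minimal sketch of the optional-dependency pattern behind this flag, as consumed by the guarded import in `Nifti.encode_example` (assumes nothing beyond the standard library):

```py
import importlib.util

# Probe for the package without importing it; this is what config.NIBABEL_AVAILABLE records.
NIBABEL_AVAILABLE = importlib.util.find_spec("nibabel") is not None

# The actual import is deferred to call time, so datasets still works without nibabel installed.
if NIBABEL_AVAILABLE:
    import nibabel as nib
else:
    nib = None
```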
2 changes: 2 additions & 0 deletions src/datasets/features/__init__.py
@@ -15,10 +15,12 @@
"TranslationVariableLanguages",
"Video",
"Pdf",
"Nifti",
]
from .audio import Audio
from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, List, Sequence, Value
from .image import Image
from .nifti import Nifti
from .pdf import Pdf
from .translation import Translation, TranslationVariableLanguages
from .video import Video
6 changes: 6 additions & 0 deletions src/datasets/features/features.py
@@ -42,6 +42,7 @@
from ..utils.py_utils import asdict, first_non_null_value, zip_dict
from .audio import Audio
from .image import Image, encode_pil_image
from .nifti import Nifti
from .pdf import Pdf, encode_pdfplumber_pdf
from .translation import Translation, TranslationVariableLanguages
from .video import Video
@@ -1270,6 +1271,7 @@ def __repr__(self):
Image,
Video,
Pdf,
Nifti,
]


@@ -1428,6 +1430,7 @@ def decode_nested_example(schema, obj, token_per_repo_id: Optional[dict[str, Uni
Image.__name__: Image,
Video.__name__: Video,
Pdf.__name__: Pdf,
Nifti.__name__: Nifti,
}


@@ -1761,6 +1764,9 @@ class Features(dict):
- [`Pdf`] feature to store the absolute path to a PDF file, a `pdfplumber.pdf.PDF` object
or a dictionary with the relative path to a PDF file ("path" key) and its bytes content ("bytes" key).
This feature loads the PDF lazily with a PDF reader.
- [`Nifti`] feature to store the absolute path to a NIfTI neuroimaging file, a `nibabel.Nifti1Image` object
or a dictionary with the relative path to a NIfTI file ("path" key) and its bytes content ("bytes" key).
This feature loads the NIfTI file lazily with nibabel.
- [`Translation`] or [`TranslationVariableLanguages`] feature specific to Machine Translation.
"""

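To illustrate the `Features` docstring entry above, a minimal sketch of declaring a column with the new feature (the file path is hypothetical; it assumes `nibabel` is installed):

```py
from datasets import Dataset, Features, Nifti, Value

# Hypothetical local paths; any .nii or .nii.gz files would do.
features = Features({"nifti": Nifti(), "subject_id": Value("string")})
ds = Dataset.from_dict(
    {"nifti": ["path/to/scan.nii.gz"], "subject_id": ["sub-01"]},
    features=features,
)
ds[0]["nifti"]  # lazily decoded into a nibabel.Nifti1Image when accessed
```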
244 changes: 244 additions & 0 deletions src/datasets/features/nifti.py
@@ -0,0 +1,244 @@
import os
from dataclasses import dataclass, field
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union

import pyarrow as pa

from .. import config
from ..download.download_config import DownloadConfig
from ..table import array_cast
from ..utils.file_utils import is_local_path, xopen
from ..utils.py_utils import string_to_dict


if TYPE_CHECKING:
import nibabel as nib

from .features import FeatureType


@dataclass
class Nifti:
"""
**Experimental.**
Nifti [`Feature`] to read NIfTI neuroimaging files.

Input: The Nifti feature accepts as input:
- A `str`: Absolute path to the NIfTI file (i.e. random access is allowed).
- A `pathlib.Path`: Path to the NIfTI file (i.e. random access is allowed).
- A `dict` with the keys:
- `path`: String with relative path of the NIfTI file in a dataset repository.
- `bytes`: Bytes of the NIfTI file.
This is useful for archived files with sequential access.

- A `nibabel` image object (e.g., `nibabel.nifti1.Nifti1Image`).

Args:
decode (`bool`, defaults to `True`):
Whether to decode the NIfTI data. If `False`,
returns the underlying dictionary in the format `{"path": nifti_path, "bytes": nifti_bytes}`.

Examples:

```py
>>> from datasets import Dataset, Nifti
>>> ds = Dataset.from_dict({"nifti": ["path/to/file.nii.gz"]}).cast_column("nifti", Nifti())
>>> ds.features["nifti"]
Nifti(decode=True, id=None)
>>> ds[0]["nifti"]
<nibabel.nifti1.Nifti1Image object at 0x7f8a1c2d8f40>
>>> ds = ds.cast_column("nifti", Nifti(decode=False))
>>> ds[0]["nifti"]
{'bytes': None,
'path': 'path/to/file.nii.gz'}
```
"""

decode: bool = True
id: Optional[str] = field(default=None, repr=False)

# Automatically constructed
dtype: ClassVar[str] = "nibabel.nifti1.Nifti1Image"
pa_type: ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()})
_type: str = field(default="Nifti", init=False, repr=False)

def __call__(self):
return self.pa_type

def encode_example(self, value: Union[str, bytes, bytearray, dict, "nib.Nifti1Image"]) -> dict:
"""Encode example into a format for Arrow.

Args:
value (`str`, `pathlib.Path`, `bytes`, `bytearray`, `nibabel.Nifti1Image` or `dict`):
Data passed as input to Nifti feature.

Returns:
`dict` with "path" and "bytes" fields
"""
if config.NIBABEL_AVAILABLE:
import nibabel as nib
else:
nib = None

if isinstance(value, str):
return {"path": value, "bytes": None}
elif isinstance(value, Path):
return {"path": str(value.absolute()), "bytes": None}
elif isinstance(value, (bytes, bytearray)):
return {"path": None, "bytes": value}
elif nib is not None and isinstance(value, nib.spatialimages.SpatialImage):
# nibabel image object - try to get path or convert to bytes
return encode_nibabel_image(value)
elif isinstance(value, dict):
if value.get("path") is not None and os.path.isfile(value["path"]):
# we set "bytes": None to not duplicate the data if they're already available locally
return {"bytes": None, "path": value.get("path")}
elif value.get("bytes") is not None or value.get("path") is not None:
# store the nifti bytes, and path is used to infer the format using the file extension
return {"bytes": value.get("bytes"), "path": value.get("path")}
else:
raise ValueError(
f"A nifti sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
)
else:
raise ValueError(
f"A nifti sample should be a string, bytes, Path, nibabel image, or dict, but got {type(value)}."
)

def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.Nifti1Image":
"""Decode example NIfTI file into nibabel image object.

Args:
value (`str` or `dict`):
A string with the absolute NIfTI file path, or a dictionary with
the keys:

- `path`: String with absolute or relative NIfTI file path.
- `bytes`: The bytes of the NIfTI file.

token_per_repo_id (`dict`, *optional*):
To access and decode NIfTI files from private repositories on
the Hub, you can pass a dictionary
repo_id (`str`) -> token (`bool` or `str`).

Returns:
`nibabel.Nifti1Image` or similar nibabel image object
"""
if not self.decode:
raise RuntimeError("Decoding is disabled for this feature. Please use Nifti(decode=True) instead.")

if config.NIBABEL_AVAILABLE:
import nibabel as nib
else:
raise ImportError("To support decoding NIfTI files, please install 'nibabel'.")

if token_per_repo_id is None:
token_per_repo_id = {}

path, bytes_ = value["path"], value["bytes"]
if bytes_ is None:
if path is None:
raise ValueError(f"A nifti should have one of 'path' or 'bytes' but both are None in {value}.")
else:
if is_local_path(path):
nifti = nib.load(path)
else:
source_url = path.split("::")[-1]
pattern = (
config.HUB_DATASETS_URL
if source_url.startswith(config.HF_ENDPOINT)
else config.HUB_DATASETS_HFFS_URL
)
try:
repo_id = string_to_dict(source_url, pattern)["repo_id"]
token = token_per_repo_id.get(repo_id)
except ValueError:
token = None
download_config = DownloadConfig(token=token)
with xopen(path, "rb", download_config=download_config) as f:
nifti = nib.load(f)
else:
import gzip

# gzip magic number, see https://stackoverflow.com/a/76055284/9534390 or "Magic number" on https://en.wikipedia.org/wiki/Gzip
if bytes_[:2] == b"\x1f\x8b":
bytes_ = gzip.decompress(bytes_)

bio = BytesIO(bytes_)
fh = nib.FileHolder(fileobj=bio)
nifti = nib.Nifti1Image.from_file_map({"header": fh, "image": fh})

return nifti

def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]:
"""If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary."""
from .features import Value

return (
self
if self.decode
else {
"bytes": Value("binary"),
"path": Value("string"),
}
)

def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.BinaryArray]) -> pa.StructArray:
"""Cast an Arrow array to the Nifti arrow storage type.
The Arrow types that can be converted to the Nifti pyarrow storage type are:

- `pa.string()` - it must contain the "path" data
- `pa.binary()` - it must contain the NIfTI bytes
- `pa.struct({"bytes": pa.binary()})`
- `pa.struct({"path": pa.string()})`
- `pa.struct({"bytes": pa.binary(), "path": pa.string()})` - order doesn't matter

Args:
storage (`Union[pa.StringArray, pa.StructArray, pa.BinaryArray]`):
PyArrow array to cast.

Returns:
`pa.StructArray`: Array in the Nifti arrow storage type, that is
`pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
"""
if pa.types.is_string(storage.type):
bytes_array = pa.array([None] * len(storage), type=pa.binary())
storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null())
elif pa.types.is_binary(storage.type):
path_array = pa.array([None] * len(storage), type=pa.string())
storage = pa.StructArray.from_arrays([storage, path_array], ["bytes", "path"], mask=storage.is_null())
elif pa.types.is_struct(storage.type):
if storage.type.get_field_index("bytes") >= 0:
bytes_array = storage.field("bytes")
else:
bytes_array = pa.array([None] * len(storage), type=pa.binary())
if storage.type.get_field_index("path") >= 0:
path_array = storage.field("path")
else:
path_array = pa.array([None] * len(storage), type=pa.string())
storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null())
return array_cast(storage, self.pa_type)


def encode_nibabel_image(img: "nib.Nifti1Image") -> dict[str, Optional[Union[str, bytes]]]:
"""
Encode a nibabel image object into a dictionary.

If the image has an associated file path, returns the path. Otherwise, serializes
the image content into bytes.

Args:
img: A nibabel image object (e.g., Nifti1Image).

Returns:
dict: A dictionary with "path" or "bytes" field.
"""
# in-memory images have a file_map whose filename is None; fall back to serializing bytes below
if hasattr(img, "file_map") and img.file_map is not None and img.file_map["image"].filename is not None:
filename = img.file_map["image"].filename
return {"path": filename, "bytes": None}

bytes_data = img.to_bytes()
return {"path": None, "bytes": bytes_data}
5 changes: 5 additions & 0 deletions src/datasets/packaged_modules/__init__.py
@@ -11,6 +11,7 @@
from .hdf5 import hdf5
from .imagefolder import imagefolder
from .json import json
from .niftifolder import niftifolder
from .pandas import pandas
from .parquet import parquet
from .pdffolder import pdffolder
@@ -46,6 +47,7 @@ def _hash_python_lines(lines: list[str]) -> str:
"audiofolder": (audiofolder.__name__, _hash_python_lines(inspect.getsource(audiofolder).splitlines())),
"videofolder": (videofolder.__name__, _hash_python_lines(inspect.getsource(videofolder).splitlines())),
"pdffolder": (pdffolder.__name__, _hash_python_lines(inspect.getsource(pdffolder).splitlines())),
"niftifolder": (niftifolder.__name__, _hash_python_lines(inspect.getsource(niftifolder).splitlines())),
"webdataset": (webdataset.__name__, _hash_python_lines(inspect.getsource(webdataset).splitlines())),
"xml": (xml.__name__, _hash_python_lines(inspect.getsource(xml).splitlines())),
"hdf5": (hdf5.__name__, _hash_python_lines(inspect.getsource(hdf5).splitlines())),
@@ -89,6 +91,8 @@ def _hash_python_lines(lines: list[str]) -> str:
_EXTENSION_TO_MODULE.update({ext.upper(): ("videofolder", {}) for ext in videofolder.VideoFolder.EXTENSIONS})
_EXTENSION_TO_MODULE.update({ext: ("pdffolder", {}) for ext in pdffolder.PdfFolder.EXTENSIONS})
_EXTENSION_TO_MODULE.update({ext.upper(): ("pdffolder", {}) for ext in pdffolder.PdfFolder.EXTENSIONS})
_EXTENSION_TO_MODULE.update({ext: ("niftifolder", {}) for ext in niftifolder.NiftiFolder.EXTENSIONS})
_EXTENSION_TO_MODULE.update({ext.upper(): ("niftifolder", {}) for ext in niftifolder.NiftiFolder.EXTENSIONS})

# Used to filter data files based on extensions given a module name
_MODULE_TO_EXTENSIONS: dict[str, list[str]] = {}
@@ -106,3 +110,4 @@ def _hash_python_lines(lines: list[str]) -> str:
_MODULE_TO_METADATA_FILE_NAMES["audiofolder"] = imagefolder.ImageFolder.METADATA_FILENAMES
_MODULE_TO_METADATA_FILE_NAMES["videofolder"] = imagefolder.ImageFolder.METADATA_FILENAMES
_MODULE_TO_METADATA_FILE_NAMES["pdffolder"] = imagefolder.ImageFolder.METADATA_FILENAMES
_MODULE_TO_METADATA_FILE_NAMES["niftifolder"] = imagefolder.ImageFolder.METADATA_FILENAMES
Empty file.
23 changes: 23 additions & 0 deletions src/datasets/packaged_modules/niftifolder/niftifolder.py
@@ -0,0 +1,23 @@
import datasets

from ..folder_based_builder import folder_based_builder


logger = datasets.utils.logging.get_logger(__name__)


class NiftiFolderConfig(folder_based_builder.FolderBasedBuilderConfig):
"""BuilderConfig for NiftiFolder."""

drop_labels: bool = None
drop_metadata: bool = None

def __post_init__(self):
super().__post_init__()


class NiftiFolder(folder_based_builder.FolderBasedBuilder):
BASE_FEATURE = datasets.Nifti
BASE_COLUMN_NAME = "nifti"
BUILDER_CONFIG_CLASS = NiftiFolderConfig
EXTENSIONS: list[str] = [".nii", ".nii.gz"]
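A usage sketch for the builder (the directory layout and filenames are hypothetical; it assumes `nibabel` is installed):

```py
from datasets import load_dataset

# Hypothetical layout:
#   scans/
#     train/
#       sub-01_T1w.nii.gz
#       sub-02_T1w.nii
ds = load_dataset("niftifolder", data_dir="scans", split="train")
ds.features["nifti"]   # Nifti(decode=True, id=None)
ds[0]["nifti"].shape   # spatial shape of the first decoded volume
```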
Binary file added tests/features/data/test_nifti.nii
Binary file added tests/features/data/test_nifti.nii.gz