
Commit 5bf7913

Implement vpos-based readers for bam, vcf, bcf
1 parent: 4e74d75

5 files changed: +165 −64 lines changed

pyproject.toml (+6 −13)

@@ -13,7 +13,7 @@ license = { text = "MIT" }
 readme = "README.md"
 requires-python = ">=3.7"
 classifiers = [
-  "Development Status :: 1 - Planning",
+  "Development Status :: 4 - Beta",
   "Intended Audience :: Science/Research",
   "Intended Audience :: Developers",
   "License :: OSI Approved :: MIT License",
@@ -28,14 +28,13 @@ classifiers = [
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.12",
   "Topic :: Scientific/Engineering",
+  "Topic :: Scientific/Engineering :: Bio-Informatics",
   "Typing :: Typed",
 ]
 dynamic = ["version"]
 dependencies = [
-  "bioframe",
   "dask",
-  "oxbow",
-  "pandas",
+  "oxbow>=0.2.0",
   "pyarrow",
   "typing_extensions >=3.7; python_version<'3.8'",
 ]
@@ -46,8 +45,10 @@ test = [
   "pytest-cov >=3",
 ]
 dev = [
+  "black",
   "pytest >=6",
   "pytest-cov >=3",
+  "ruff"
 ]
 docs = [
   "furo",
@@ -72,7 +73,7 @@ envs.default.dependencies = [
 ]
 
 [tool.hatch.envs.default.scripts]
-fix = "ruff --fix ."
+lint = "ruff --fix ."
 test = "pytest ."
 docs = "sphinx-autobuild docs docs/_build/html"
 
@@ -139,12 +140,7 @@ select = [
   "NPY",  # NumPy specific rules
   "PD",   # pandas-vet
 ]
-extend-ignore = [
-  "PLR",  # Design related pylint codes
-  "E501", # Line too long
-]
 target-version = "py37"
-typing-modules = ["dask_ngs._compat.typing"]
 src = ["src"]
 unfixable = [
   "T20",  # Removes print statements
@@ -153,6 +149,3 @@ unfixable = [
 exclude = []
 flake8-unused-arguments.ignore-variadic-names = true
 isort.required-imports = ["from __future__ import annotations"]
-
-[tool.ruff.per-file-ignores]
-"tests/**" = ["T20"]

src/dask_ngs/__init__.py (+127 −13)

@@ -1,8 +1,8 @@
 from __future__ import annotations
 
 from io import BytesIO
+from pathlib import Path
 
-import bioframe
 import dask
 import dask.dataframe as dd
 import oxbow as ox
@@ -11,37 +11,151 @@
 
 __version__ = "0.1.0"
 
-__all__ = ("__version__", "read_bam")
+__all__ = ("__version__", "read_bam", "read_vcf", "read_bcf")
 
 
-def _read_bam_query_from_path(
-    path: str, chrom: str, start: int, end: int
+def _read_bam_vpos_from_path(
+    path: str, vpos_lo: tuple[int, int], vpos_hi: tuple[int, int]
 ) -> pd.DataFrame:
-    stream = BytesIO(ox.read_bam(path, f"{chrom}:{start}-{end}"))
+    stream = BytesIO(ox.read_bam_vpos(path, vpos_lo, vpos_hi))
     ipc = pyarrow.ipc.open_file(stream)
     return ipc.read_pandas()
 
 
-def read_bam(path: str, chunksize: int = 10_000_000) -> dd.DataFrame:
+def _read_vcf_vpos_from_path(
+    path: str, vpos_lo: tuple[int, int], vpos_hi: tuple[int, int]
+) -> pd.DataFrame:
+    stream = BytesIO(ox.read_vcf_vpos(path, vpos_lo, vpos_hi))
+    ipc = pyarrow.ipc.open_file(stream)
+    return ipc.read_pandas()
+
+
+def _read_bcf_vpos_from_path(
+    path: str, vpos_lo: tuple[int, int], vpos_hi: tuple[int, int]
+) -> pd.DataFrame:
+    stream = BytesIO(ox.read_bcf_vpos(path, vpos_lo, vpos_hi))
+    ipc = pyarrow.ipc.open_file(stream)
+    return ipc.read_pandas()
+
+
+def read_bam(
+    path: str | Path, chunksize: int = 10_000_000, index: str | Path | None = None
+) -> dd.DataFrame:
     """
     Map an indexed BAM file to a Dask DataFrame.
 
     Parameters
     ----------
-    path : str
+    path : str or Path
         Path to the BAM file.
     chunksize : int, optional [default=10_000_000]
-        Chunk size, currently in base pair coordinates.
+        Approximate partition size, in compressed bytes.
+    index : str or Path, optional
+        Path to the index file. If not provided, the index file is assumed to
+        be at the same location as the BAM file, with the same name but with
+        the additional .bai or .csi extension.
+
+    Returns
+    -------
+    dask.dataframe.DataFrame
+    """
+    path = Path(path)
+    if index is None:
+        bai_index = path.with_suffix(".bai")
+        csi_index = path.with_suffix(".csi")
+        if bai_index.exists():
+            index = bai_index
+        elif csi_index.exists():
+            index = csi_index
+        else:
+            msg = "Index .bai or .csi file not found."
+            raise FileNotFoundError(msg)
+
+    vpos = ox.partition_from_index_file(index, chunksize)
+    chunks = [
+        dask.delayed(_read_bam_vpos_from_path)(path, tuple(vpos[i]), tuple(vpos[i + 1]))
+        for i in range(len(vpos) - 1)
+    ]
+
+    return dd.from_delayed(chunks)
+
+
+def read_vcf(
+    path: str | Path, chunksize: int = 10_000_000, index: str | Path | None = None
+) -> dd.DataFrame:
+    """
+    Map an indexed, bgzf-compressed VCF.gz file to a Dask DataFrame.
+
+    Parameters
+    ----------
+    path : str or Path
+        Path to the VCF.gz file.
+    chunksize : int, optional [default=10_000_000]
+        Approximate partition size, in compressed bytes.
+    index : str or Path, optional
+        Path to the index file. If not provided, the index file is assumed to
+        be at the same location as the VCF.gz file, with the same name but with
+        the additional .tbi or .csi extension.
 
     Returns
     -------
     dask.dataframe.DataFrame
-        A Dask DataFrame with the BAM file contents.
     """
-    chromsizes = bioframe.fetch_chromsizes("hg38")
-    chunk_spans = bioframe.binnify(chromsizes, chunksize)
+    path = Path(path)
+    if index is None:
+        tbi_index = path.with_suffix(".tbi")
+        csi_index = path.with_suffix(".csi")
+        if tbi_index.exists():
+            index = tbi_index
+        elif csi_index.exists():
+            index = csi_index
+        else:
+            msg = "Index .tbi or .csi file not found."
+            raise FileNotFoundError(msg)
+
+    vpos = ox.partition_from_index_file(index, chunksize)
     chunks = [
-        dask.delayed(_read_bam_query_from_path)(path, chrom, start + 1, end)
-        for chrom, start, end in chunk_spans.to_numpy()
+        dask.delayed(_read_vcf_vpos_from_path)(path, tuple(vpos[i]), tuple(vpos[i + 1]))
+        for i in range(len(vpos) - 1)
     ]
+
+    return dd.from_delayed(chunks)
+
+
+def read_bcf(
+    path: str | Path, chunksize: int = 10_000_000, index: str | Path | None = None
+) -> dd.DataFrame:
+    """
+    Map an indexed BCF file to a Dask DataFrame.
+
+    Parameters
+    ----------
+    path : str or Path
+        Path to the BCF file.
+    chunksize : int, optional [default=10_000_000]
+        Approximate partition size, in compressed bytes.
+    index : str or Path, optional
+        Path to the index file. If not provided, the index file is assumed to
+        be at the same location as the BCF file, with the same name but with
+        the additional .csi extension.
+
+    Returns
+    -------
+    dask.dataframe.DataFrame
+    """
+    path = Path(path)
+    if index is None:
+        csi_index = path.with_suffix(".csi")
+        if csi_index.exists():
+            index = csi_index
+        else:
+            msg = "Index .csi file not found."
+            raise FileNotFoundError(msg)
+
+    vpos = ox.partition_from_index_file(index, chunksize)
+    chunks = [
+        dask.delayed(_read_bcf_vpos_from_path)(path, tuple(vpos[i]), tuple(vpos[i + 1]))
+        for i in range(len(vpos) - 1)
+    ]
+
     return dd.from_delayed(chunks)
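
For orientation, here is a hypothetical usage sketch of the readers added above. The file names (`sample.bam`, `sample.bam.bai`, `calls.vcf.gz`, `calls.vcf.gz.tbi`) are placeholders, not files shipped with the repository, and the partition count depends on your index and `chunksize`; this is a sketch against the API shown in the diff, not code from the commit.

```python
# Hypothetical usage of the vpos-based readers; file names are placeholders.
import dask_ngs

# Partition a coordinate-sorted, indexed BAM into ~10 MB (compressed) chunks.
# Passing `index` explicitly avoids relying on index-file discovery.
bam_ddf = dask_ngs.read_bam("sample.bam", chunksize=10_000_000, index="sample.bam.bai")
print(bam_ddf.npartitions)

# Same pattern for a bgzf-compressed VCF with a .tbi (or .csi) index.
vcf_ddf = dask_ngs.read_vcf("calls.vcf.gz", index="calls.vcf.gz.tbi")

# Everything stays lazy; materialize a single partition to inspect it.
first_records = bam_ddf.partitions[0].compute()
print(first_records.head())
```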

src/dask_ngs/_compat/__init__.py (−1)

This file was deleted.

src/dask_ngs/_compat/typing.py (−14)

This file was deleted.

src/dask_ngs/_index.py (+32 −23)

@@ -120,21 +120,23 @@ def _cumsum_assign_chunks(arr: np.array, thresh: int) -> tuple[np.array, np.arra
     """
     Loops through a given array of integers, cumulatively summing the values.
     The rows are labeled with a `chunk_id`, starting at 0.
+
     When the cumulative sum exceeds the threshold, the chunk_id is incremented,
     and the next rows are binned into the next chunk until again the threshold
     is reached. The cumulative sum of that chunk is also recorded as `size`.
     Returns a tuple of the cumulative sum array and the chunk_id array.
 
-    Args:
-        arr : numpy array
-            The array of byte offsets to chunk
-        thresh : int
-            The size of chunks in bytes
-
-    Returns:
-        Tuple of numpy arrays
-        0 : array of cumulative byte sums
-        1 : array of chunk_ids assigned to each row
+    Parameters
+    ----------
+    arr : numpy array
+        The array of byte offsets to chunk
+    thresh : int
+        The size of chunks in bytes
+
+    Returns
+    -------
+    array of cumulative byte sums
+    array of chunk_ids assigned to each row
     """
     sum = 0
     chunkid = 0
@@ -153,14 +155,16 @@ def _cumsum_assign_chunks(arr: np.array, thresh: int) -> tuple[np.array, np.arra
 def map_offsets_to_chunks(offsets: pd.DataFrame, chunksize_bytes: int) -> pd.DataFrame:
     """Given a dataframe of offset positions, calculate the difference
     between each byte offset.
+
     Group those differences into chunks of size `chunksize_bytes`.
 
-    Returns:
-        A Pandas dataframe with additional columns:
-        chunk_id : int
-            The chunk index that row was assigned
-        size : int
-            The cumulative size of that chunk
+    Returns
+    -------
+    A Pandas dataframe with additional columns:
+        chunk_id : int
+            The chunk index that row was assigned
+        size : int
+            The cumulative size of that chunk
     """
 
     # calculate the difference in byte positions from the prior row
@@ -191,15 +195,20 @@ def map_offsets_to_chunks(offsets: pd.DataFrame, chunksize_bytes: int) -> pd.Dat
 
 
 def consolidate_chunks(offsets_uniq: pd.DataFrame) -> pd.DataFrame:
-    """Group the data by `chunk_id`,
-    keeping the first compressed byte value (`ioffset.cpos`)
-    and the first uncompressed byte value of that stream (`ioffset.upos`).
+    """Group the data by `chunk_id`, keeping the first compressed byte value
+    (`ioffset.cpos`) and the first uncompressed byte value of that stream
+    (`ioffset.upos`).
+
     Take the last `size` value which tells you how many compressed bytes to read.
 
-    Returns:
-        A Pandas dataframe grouped by `chunk_id`
-        Now you can decompress the data starting from `ioffset.cpos` and read `size` bytes.
-        `ioffsets.upos` tells you which byte to read first from the uncompressed data.
+    Returns
+    -------
+    A Pandas dataframe grouped by `chunk_id`
+
+    Notes
+    -----
+    Now you can decompress the data starting from `ioffset.cpos` and read `size` bytes.
+    `ioffsets.upos` tells you which byte to read first from the uncompressed data.
     """
     return offsets_uniq.groupby("chunk_id").agg(
         {"ioffset.cpos": "first", "ioffset.upos": "first", "size": "last"}
