
Commit ec56277

Merge pull request #141 from rhpvorderman/chunk_ubam
Allow chunking uBAM
2 parents 059a66e + 497310f commit ec56277

10 files changed, +159 −51 lines changed

src/dnaio/_bam.py

+43

@@ -0,0 +1,43 @@
+import struct
+from io import BufferedIOBase
+
+
+def read_bam_header(fileobj: BufferedIOBase) -> bytes:
+    magic = fileobj.read(4)
+    if not isinstance(magic, bytes):
+        raise TypeError(
+            f"fileobj {fileobj} (type: {type(fileobj)}), was not opened in binary mode."
+        )
+    if len(magic) < 4:
+        raise EOFError("Truncated BAM file")
+    if magic[:4] != b"BAM\1":
+        raise ValueError(
+            f"fileobj: {fileobj}, is not a BAM file. No BAM file signature, instead "
+            f"found {magic!r}"
+        )
+    return read_bam_header_after_magic(fileobj)
+
+
+def read_bam_header_after_magic(fileobj: BufferedIOBase) -> bytes:
+    header_size = fileobj.read(4)
+    if len(header_size) < 4:
+        raise EOFError("Truncated BAM file")
+    (l_text,) = struct.unpack("<I", header_size)
+    header = fileobj.read(l_text)
+    if len(header) < l_text:
+        raise EOFError("Truncated BAM file")
+    n_ref_obj = fileobj.read(4)
+    if len(n_ref_obj) < 4:
+        raise EOFError("Truncated BAM file")
+    (n_ref,) = struct.unpack("<I", n_ref_obj)
+    for i in range(n_ref):
+        l_name_obj = fileobj.read(4)
+        if len(l_name_obj) < 4:
+            raise EOFError("Truncated BAM file")
+        (l_name,) = struct.unpack("<I", l_name_obj)
+        reference_chunk_size = l_name + 4  # Include name and uint32_t of size
+        reference_chunk = fileobj.read(reference_chunk_size)
+        if len(reference_chunk) < reference_chunk_size:
+            raise EOFError("Truncated BAM file")
+    # Fileobj is now skipped ahead and at the start of the BAM records
+    return header
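The new helper walks the BAM header in the order the format prescribes: the four-byte magic BAM\1, a uint32 l_text followed by the plain-text SAM header, a uint32 n_ref, and then one length-prefixed name plus a uint32 l_ref per reference sequence. Afterwards the file object is positioned at the first alignment record. A minimal sketch of how it can be exercised (illustration only, not part of this commit; it assumes the module is importable as dnaio._bam):

import io
import struct

from dnaio._bam import read_bam_header

# Build a tiny unaligned-BAM header in memory: magic, l_text, SAM header text,
# and n_ref == 0 because a uBAM has no reference sequences.
header_text = b"@HD\tVN:1.6\tSO:unknown\n"
raw = (
    b"BAM\1"
    + struct.pack("<I", len(header_text))
    + header_text
    + struct.pack("<I", 0)
)
fileobj = io.BytesIO(raw)
assert read_bam_header(fileobj) == header_text
assert fileobj.tell() == len(raw)  # stream now sits where the records begin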

src/dnaio/_core.pyi

+3 −1

@@ -1,3 +1,4 @@
+import sys
 from typing import (
     Generic,
     Optional,
@@ -36,6 +37,7 @@ class SequenceRecord:
 def paired_fastq_heads(
     buf1: ByteString, buf2: ByteString, end1: int, end2: int
 ) -> Tuple[int, int]: ...
+def bam_head(buf: bytes, end: Optional[int] = None) -> int: ...
 def records_are_mates(
     __first_record: SequenceRecord,
     __second_record: SequenceRecord,
@@ -54,7 +56,7 @@ class FastqIter(Generic[T]):
     def number_of_records(self) -> int: ...

 class BamIter:
-    def __init__(self, file: BinaryIO, buffer_size: int): ...
+    def __init__(self, file: BinaryIO, read_in_size: int, with_header: bool = True): ...
     def __iter__(self) -> Iterator[SequenceRecord]: ...
     def __next__(self) -> SequenceRecord: ...
     @property

src/dnaio/_core.pyx

+51 −35

@@ -5,13 +5,16 @@ from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING, PyBytes
 from cpython.mem cimport PyMem_Free, PyMem_Malloc, PyMem_Realloc
 from cpython.unicode cimport PyUnicode_CheckExact, PyUnicode_GET_LENGTH, PyUnicode_DecodeASCII
 from cpython.object cimport Py_TYPE, PyTypeObject
+from cpython.pyport cimport PY_SSIZE_T_MAX
 from cpython.ref cimport PyObject
 from cpython.tuple cimport PyTuple_GET_ITEM
 from libc.string cimport memcmp, memcpy, memchr, strcspn, strspn, memmove
 from libc.stdint cimport uint8_t, uint16_t, uint32_t, int32_t

 cimport cython

+from ._bam import read_bam_header
+
 cdef extern from "Python.h":
     void *PyUnicode_DATA(object o)
     bint PyUnicode_IS_COMPACT_ASCII(object o)
@@ -431,6 +434,29 @@ def paired_fastq_heads(buf1, buf2, Py_ssize_t end1, Py_ssize_t end2):
     return record_start1 - data1, record_start2 - data2


+def bam_head(buf, end = None):
+    """Return the end of the last complete BAM record in the buf."""
+    cdef Py_ssize_t c_end = PY_SSIZE_T_MAX
+    if end is not None:
+        c_end = end
+    cdef Py_buffer buffer
+    PyObject_GetBuffer(buf, &buffer, PyBUF_SIMPLE)
+    cdef:
+        uint8_t *buffer_start = <uint8_t *>buffer.buf
+        uint8_t *record_start = buffer_start
+        uint8_t *buffer_end = buffer_start + min(c_end, buffer.len)
+        uint32_t block_size
+        size_t record_size
+
+    while (record_start + 4) < buffer_end:
+        record_size = (<uint32_t *>record_start)[0] + 4
+        if (record_start + record_size) > buffer_end:
+            break
+        record_start += record_size
+    cdef Py_ssize_t head = <Py_ssize_t>(record_start - buffer_start)
+    PyBuffer_Release(&buffer)
+    return head
+
 cdef class FastqIter:
     """
     Parse a FASTQ file and yield SequenceRecord objects
@@ -688,6 +714,22 @@ cdef struct BamRecordHeader:
     int32_t tlen

 cdef class BamIter:
+    """
+    Parse a uBAM file and yield SequenceRecord objects
+
+    Arguments:
+        file: a file-like object, opened in binary mode (it must have a readinto
+        method)
+
+        buffer_size: size of the initial buffer. This is automatically grown
+            if a BAM record is encountered that does not fit.
+
+        with_header: The BAM file has a header that needs parsing. Default is True.
+            False can be used in circumstances where chunks of BAM records are read.
+
+    Yields:
+        SequenceRecord Objects
+    """
     cdef:
         uint8_t *record_start
         uint8_t *buffer_end
@@ -701,42 +743,16 @@ cdef class BamIter:
     def __dealloc__(self):
         PyMem_Free(self.read_in_buffer)

-    def __cinit__(self, fileobj, read_in_size = 48 * 1024):
+    def __cinit__(self, fileobj, read_in_size = 48 * 1024, with_header = True):
         if read_in_size < 4:
             raise ValueError(f"read_in_size must be at least 4 got "
                              f"{read_in_size}")

-        # Skip ahead and save the BAM header for later inspection
-        magic_and_header_size = fileobj.read(8)
-        if not isinstance(magic_and_header_size, bytes):
-            raise TypeError(f"fileobj {fileobj} is not a binary IO type, "
-                            f"got {type(fileobj)}")
-        if len(magic_and_header_size) < 8:
-            raise EOFError("Truncated BAM file")
-        if magic_and_header_size[:4] != b"BAM\1":
-            raise ValueError(
-                f"fileobj: {fileobj}, is not a BAM file. No BAM magic, instead "
-                f"found {magic_and_header_size[:4]}")
-        l_text = int.from_bytes(magic_and_header_size[4:], "little", signed=False)
-        header = fileobj.read(l_text)
-        if len(header) < l_text:
-            raise EOFError("Truncated BAM file")
-        n_ref_obj = fileobj.read(4)
-        if len(n_ref_obj) < 4:
-            raise EOFError("Truncated BAM file")
-        n_ref = int.from_bytes(n_ref_obj, "little", signed=False)
-        for i in range(n_ref):
-            l_name_obj = fileobj.read(4)
-            if len(l_name_obj) < 4:
-                raise EOFError("Truncated BAM file")
-            l_name = int.from_bytes(l_name_obj, "little", signed=False)
-            reference_chunk_size = l_name + 4  # Include name and uint32_t of size
-            reference_chunk = fileobj.read(reference_chunk_size)
-            if len(reference_chunk) < reference_chunk_size:
-                raise EOFError("Truncated BAM file")
-        # Fileobj is now skipped ahead and at the start of the BAM records
-
-        self.header = header
+        if with_header:
+            # Skip ahead and save the BAM header for later inspection
+            self.header = read_bam_header(fileobj)
+        else:
+            self.header = b""
         self.read_in_size = read_in_size
         self.file = fileobj
         self.read_in_buffer = NULL
@@ -746,9 +762,9 @@ cdef class BamIter:

     def __iter__(self):
         return self
-    
+
     cdef _read_into_buffer(self):
-        cdef size_t read_in_size 
+        cdef size_t read_in_size
         cdef size_t leftover_size = self.buffer_end - self.record_start
         cdef uint32_t block_size
         memmove(self.read_in_buffer, self.record_start, leftover_size)
@@ -769,7 +785,7 @@ cdef class BamIter:
             raise StopIteration()
         elif new_bytes_size == 0:
             raise EOFError("Incomplete record at the end of file")
-        cdef uint8_t *tmp 
+        cdef uint8_t *tmp
         if new_buffer_size > self.read_in_buffer_size:
             tmp = <uint8_t *>PyMem_Realloc(self.read_in_buffer, new_buffer_size)
             if tmp == NULL:
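bam_head exploits the fact that every BAM alignment record begins with a little-endian uint32 block_size counting the bytes that follow it, so complete records can be delimited without decoding them. A pure-Python equivalent of the scan, for illustration only (the Cython version above is what the commit actually adds):

import struct
from typing import Optional


def bam_head_py(buf: bytes, end: Optional[int] = None) -> int:
    """Return the offset just past the last complete BAM record in buf."""
    buffer_end = len(buf) if end is None else min(end, len(buf))
    record_start = 0
    while record_start + 4 < buffer_end:
        # Each record starts with uint32 block_size = number of bytes after it.
        (block_size,) = struct.unpack_from("<I", buf, record_start)
        record_size = block_size + 4
        if record_start + record_size > buffer_end:
            break  # The next record is truncated in this buffer.
        record_start += record_size
    return record_start


# Mirrors the cases exercised in tests/test_chunks.py:
assert bam_head_py(struct.pack("<Ix", 32)) == 0
assert bam_head_py(struct.pack("<I32x", 32)) == 36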

src/dnaio/chunks.py

+19 −11

@@ -6,10 +6,12 @@
 or subprocess and be parsed and processed there.
 """

-from io import RawIOBase
+from io import BufferedIOBase
 from typing import Optional, Iterator, Tuple

+from ._bam import read_bam_header_after_magic
 from ._core import paired_fastq_heads as _paired_fastq_heads
+from ._core import bam_head as _bam_head
 from .exceptions import FileFormatError, FastaFormatError, UnknownFileFormat


@@ -79,7 +81,9 @@ def _fastq_head(buf: bytes, end: Optional[int] = None) -> int:
     return right + 1  # type: ignore


-def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memoryview]:
+def read_chunks(
+    f: BufferedIOBase, buffer_size: int = 4 * 1024**2
+) -> Iterator[memoryview]:
     """
     Read chunks of complete FASTA or FASTQ records from a file.
     If the format is detected to be FASTQ, all chunks except possibly the last contain
@@ -104,19 +108,23 @@ def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memory

     # Read one byte to determine file format.
     # If there is a comment char, we assume FASTA!
-    start = f.readinto(memoryview(buf)[0:1])
+    start = f.readinto(memoryview(buf)[0:4])
     if start == 0:
         # Empty file
         return
-    assert start == 1
+    assert start == 4
     if buf[0:1] == b"@":
         head = _fastq_head
     elif buf[0:1] == b"#" or buf[0:1] == b">":
         head = _fasta_head
+    elif buf[0:4] == b"BAM\x01":
+        head = _bam_head
+        _ = read_bam_header_after_magic(f)
+        start = 0  # Skip header and start at the records.
     else:
         raise UnknownFileFormat(
-            f"Cannnot determine input file format: First character expected to be '>' or '@', "
-            f"but found {repr(chr(buf[0]))}"
+            f"Cannnot determine input file format: First characters expected "
+            f"to be '>'. '@', or 'BAM\1', but found {repr(buf[0:4])}"
         )

     # Layout of buf
@@ -135,7 +143,7 @@ def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memory
     while True:
         if start == len(buf):
             raise OverflowError("FASTA/FASTQ record does not fit into buffer")
-        bufend = f.readinto(memoryview(buf)[start:]) + start  # type: ignore
+        bufend = f.readinto(memoryview(buf)[start:]) + start
         if start == bufend:
             # End of file
             break
@@ -152,8 +160,8 @@ def read_chunks(f: RawIOBase, buffer_size: int = 4 * 1024**2) -> Iterator[memory


 def read_paired_chunks(
-    f: RawIOBase,
-    f2: RawIOBase,
+    f: BufferedIOBase,
+    f2: BufferedIOBase,
     buffer_size: int = 4 * 1024**2,
 ) -> Iterator[Tuple[memoryview, memoryview]]:
     """
@@ -222,8 +230,8 @@ def read_paired_chunks(
             raise ValueError(
                 f"FASTA/FASTQ records do not fit into buffer of size {buffer_size}"
             )
-        bufend1 = f.readinto(memoryview(buf1)[start1:]) + start1  # type: ignore
-        bufend2 = f2.readinto(memoryview(buf2)[start2:]) + start2  # type: ignore
+        bufend1 = f.readinto(memoryview(buf1)[start1:]) + start1
+        bufend2 = f2.readinto(memoryview(buf2)[start2:]) + start2
         if start1 == bufend1 and start2 == bufend2:
             break
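With the BAM branch in place, read_chunks can split an unaligned BAM file into chunks that each contain only whole records, ready to be handed to worker processes. A usage sketch along the lines of the new test (BAM data is BGZF-compressed, which the standard gzip module can decompress for reading; the path is the test file added by this commit):

import gzip

from dnaio.chunks import read_chunks

with gzip.open("tests/data/simple.unaligned.bam", "rb") as f:
    for chunk in read_chunks(f, buffer_size=4 * 1024**2):
        # Each chunk is a memoryview over complete BAM records; the header
        # has already been skipped by read_chunks.
        print(len(chunk), "bytes of whole BAM records")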

src/dnaio/readers.py

+4 −1

@@ -212,13 +212,16 @@ def __init__(
         buffer_size: int = 128 * 1024,  # Buffer size used by cat, pigz etc.
         opener=xopen,
         _close_file: Optional[bool] = None,
+        with_header: bool = True,
     ):
         super().__init__(file, opener=opener, _close_file=_close_file)
         self.sequence_class = sequence_class
         self.delivers_qualities = True
         self.buffer_size = buffer_size
         try:
-            self._iter: Iterator[SequenceRecord] = BamIter(self._file, self.buffer_size)
+            self._iter: Iterator[SequenceRecord] = BamIter(
+                self._file, read_in_size=self.buffer_size, with_header=with_header
+            )
         except Exception:
             self.close()
             raise
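The new with_header flag is forwarded to BamIter, so a BamReader can consume such a header-less chunk directly. A sketch (illustrative only; the helper name is made up, and it assumes BamReader accepts an in-memory binary file object as it does other binary streams):

import io

from dnaio.readers import BamReader


def records_in_chunk(chunk: memoryview):
    # chunk: complete BAM records without a header, as yielded by read_chunks().
    reader = BamReader(io.BytesIO(chunk.tobytes()), with_header=False)
    return [(record.name, record.sequence, record.qualities) for record in reader]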

src/dnaio/singleend.py

+6 −1

@@ -6,7 +6,7 @@
 from .writers import FastaWriter, FastqWriter


-def _open_single(
+def _open_single(  # noqa: C901
     file_or_path: Union[str, os.PathLike, BinaryIO],
     opener,
     *,
@@ -64,6 +64,11 @@ def _open_single(
             return BamReader(file, _close_file=close_file)
         # This should not be reached
         raise NotImplementedError("Only reading is supported for BAM files")
+    elif fileformat == "bam_no_header":
+        if "r" in mode:
+            return BamReader(file, _close_file=close_file, with_header=False)
+        # This should not be reached
+        raise NotImplementedError("Only reading is supported for headerless BAM files")
     if close_file:
         file.close()
     raise UnknownFileFormat(
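The same header-less chunks are also reachable through the high-level API via the new "bam_no_header" format string. A sketch, assuming dnaio.open forwards fileformat to _open_single as it does for the other format strings (the helper name is made up):

import io

import dnaio


def parse_chunk(chunk: memoryview):
    # chunk: complete BAM records without a header, as yielded by read_chunks().
    with dnaio.open(io.BytesIO(chunk.tobytes()), fileformat="bam_no_header") as reader:
        for record in reader:
            yield record.name, record.sequence, record.qualities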
Binary file changed (197 Bytes, not shown).

tests/test_chunks.py

+22

@@ -1,3 +1,5 @@
+import gzip
+import struct
 import textwrap

 from pytest import raises
@@ -7,6 +9,7 @@
 from dnaio import UnknownFileFormat, FileFormatError
 from dnaio._core import paired_fastq_heads
 from dnaio.chunks import (
+    _bam_head,
     _fastq_head,
     _fasta_head,
     read_chunks,
@@ -41,6 +44,13 @@ def test_fasta_head_with_comment():
     assert _fasta_head(b"#\n>3\n5\n>") == 7


+def test_bam_head():
+    assert _bam_head(struct.pack("<Ix", 32)) == 0
+    assert _bam_head(struct.pack("<I32x", 32)) == 36
+    assert _bam_head(struct.pack("<I32xI32x", 32, 33)) == 36
+    assert _bam_head(struct.pack("<I32xI33x", 32, 33)) == 73
+
+
 def test_paired_fasta_heads():
     def pheads(buf1, buf2):
         return _paired_fasta_heads(buf1, buf2, len(buf1), len(buf2))
@@ -179,3 +189,15 @@ def test_read_chunks_empty():
 def test_invalid_file_format():
     with raises(UnknownFileFormat):
         list(read_chunks(BytesIO(b"invalid format")))
+
+
+def test_read_chunks_bam():
+    with gzip.open("tests/data/simple.unaligned.bam", "rb") as f:
+        for i, chunk in enumerate(read_chunks(f, 70)):
+            if i == 0:
+                assert b"Myheader" in chunk.tobytes()
+            if i == 1:
+                assert b"AnotherHeader" in chunk.tobytes()
+            if i == 2:
+                assert b"YetAnotherHeader" in chunk.tobytes()
+            assert b"RGZA\x00" in chunk.tobytes()

tests/test_internal.py

+1 −2

@@ -33,7 +33,6 @@
 from dnaio.readers import BinaryFileReader
 from dnaio._core import bytes_ascii_check

-
 TEST_DATA = Path(__file__).parent / "data"
 SIMPLE_FASTQ = str(TEST_DATA / "simple.fastq")
 # files tests/data/simple.fast{q,a}
@@ -786,7 +785,7 @@ def test_bam_parser_not_binary_error(self):
         )
         with pytest.raises(TypeError) as error:
             BamReader(file)
-        error.match("binary IO")
+        error.match("binary mode")

     @pytest.mark.parametrize("buffersize", [4, 8, 10, 20, 40])
     def test_small_buffersize(self, buffersize):
