@@ -5,13 +5,16 @@ from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING, PyBytes
 from cpython.mem cimport PyMem_Free, PyMem_Malloc, PyMem_Realloc
 from cpython.unicode cimport PyUnicode_CheckExact, PyUnicode_GET_LENGTH, PyUnicode_DecodeASCII
 from cpython.object cimport Py_TYPE, PyTypeObject
+from cpython.pyport cimport PY_SSIZE_T_MAX
 from cpython.ref cimport PyObject
 from cpython.tuple cimport PyTuple_GET_ITEM
 from libc.string cimport memcmp, memcpy, memchr, strcspn, strspn, memmove
 from libc.stdint cimport uint8_t, uint16_t, uint32_t, int32_t

 cimport cython

+from ._bam import read_bam_header
+
 cdef extern from "Python.h":
     void *PyUnicode_DATA(object o)
     bint PyUnicode_IS_COMPACT_ASCII(object o)
@@ -431,6 +434,29 @@ def paired_fastq_heads(buf1, buf2, Py_ssize_t end1, Py_ssize_t end2):
     return record_start1 - data1, record_start2 - data2


+def bam_head(buf, end=None):
+    """Return the end of the last complete BAM record in the buf."""
+    cdef Py_ssize_t c_end = PY_SSIZE_T_MAX
+    if end is not None:
+        c_end = end
+    cdef Py_buffer buffer
+    PyObject_GetBuffer(buf, &buffer, PyBUF_SIMPLE)
+    cdef:
+        uint8_t *buffer_start = <uint8_t *>buffer.buf
+        uint8_t *record_start = buffer_start
+        uint8_t *buffer_end = buffer_start + min(c_end, buffer.len)
+        uint32_t block_size
+        size_t record_size
+
+    while (record_start + 4) < buffer_end:
+        record_size = (<uint32_t *>record_start)[0] + 4
+        if (record_start + record_size) > buffer_end:
+            break
+        record_start += record_size
+    cdef Py_ssize_t head = <Py_ssize_t>(record_start - buffer_start)
+    PyBuffer_Release(&buffer)
+    return head
+
 cdef class FastqIter:
     """
     Parse a FASTQ file and yield SequenceRecord objects
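
A quick sketch of how the new bam_head() helper could be exercised, using synthetic framing rather than a real BAM file: each "record" below is just a little-endian uint32 block_size followed by block_size payload bytes, which is all the function inspects. The dnaio._core import path is an assumption; the diff does not name the module.

import struct

from dnaio._core import bam_head  # assumed import location

def framed(payload: bytes) -> bytes:
    # block_size prefix + payload: the framing bam_head() walks over
    return struct.pack("<I", len(payload)) + payload

buf = framed(b"A" * 8) + framed(b"B" * 5) + b"\x00\x07"  # last two bytes form a truncated record
head = bam_head(buf)
assert head == (4 + 8) + (4 + 5)       # offset just past the last complete record
complete, leftover = buf[:head], buf[head:]
assert leftover == b"\x00\x07"         # would be carried over to the next read
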
@@ -688,6 +714,22 @@ cdef struct BamRecordHeader:
     int32_t tlen

 cdef class BamIter:
+    """
+    Parse a uBAM file and yield SequenceRecord objects
+
+    Arguments:
+        file: a file-like object, opened in binary mode (it must have a
+            readinto method)
+
+        buffer_size: size of the initial buffer. This is automatically grown
+            if a BAM record is encountered that does not fit.
+
+        with_header: whether the BAM file has a header that needs parsing.
+            Default is True; use False when reading raw chunks of BAM records.
+
+    Yields:
+        SequenceRecord objects
+    """
     cdef:
         uint8_t *record_start
         uint8_t *buffer_end
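
A minimal usage sketch for BamIter as documented above, assuming the class is importable from dnaio's compiled _core module and that reads.bam is an unaligned BAM file; the gzip wrapper relies on BAM's BGZF container being gzip-compatible and satisfies the readinto requirement.

import gzip

from dnaio._core import BamIter  # assumed import location

with gzip.open("reads.bam", "rb") as decompressed:
    for record in BamIter(decompressed, read_in_size=48 * 1024):
        # each item is a SequenceRecord with name, sequence and qualities
        print(record.name, len(record.sequence))
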
@@ -701,42 +743,16 @@ cdef class BamIter:
     def __dealloc__(self):
         PyMem_Free(self.read_in_buffer)

-    def __cinit__(self, fileobj, read_in_size=48 * 1024):
+    def __cinit__(self, fileobj, read_in_size=48 * 1024, with_header=True):
         if read_in_size < 4:
             raise ValueError(f"read_in_size must be at least 4 got "
                              f"{read_in_size}")

-        # Skip ahead and save the BAM header for later inspection
-        magic_and_header_size = fileobj.read(8)
-        if not isinstance(magic_and_header_size, bytes):
-            raise TypeError(f"fileobj {fileobj} is not a binary IO type, "
-                            f"got {type(fileobj)}")
-        if len(magic_and_header_size) < 8:
-            raise EOFError("Truncated BAM file")
-        if magic_and_header_size[:4] != b"BAM\1":
-            raise ValueError(
-                f"fileobj: {fileobj}, is not a BAM file. No BAM magic, instead "
-                f"found {magic_and_header_size[:4]}")
-        l_text = int.from_bytes(magic_and_header_size[4:], "little", signed=False)
-        header = fileobj.read(l_text)
-        if len(header) < l_text:
-            raise EOFError("Truncated BAM file")
-        n_ref_obj = fileobj.read(4)
-        if len(n_ref_obj) < 4:
-            raise EOFError("Truncated BAM file")
-        n_ref = int.from_bytes(n_ref_obj, "little", signed=False)
-        for i in range(n_ref):
-            l_name_obj = fileobj.read(4)
-            if len(l_name_obj) < 4:
-                raise EOFError("Truncated BAM file")
-            l_name = int.from_bytes(l_name_obj, "little", signed=False)
-            reference_chunk_size = l_name + 4  # Include name and uint32_t of size
-            reference_chunk = fileobj.read(reference_chunk_size)
-            if len(reference_chunk) < reference_chunk_size:
-                raise EOFError("Truncated BAM file")
-        # Fileobj is now skipped ahead and at the start of the BAM records
-
-        self.header = header
+        if with_header:
+            # Skip ahead and save the BAM header for later inspection
+            self.header = read_bam_header(fileobj)
+        else:
+            self.header = b""
         self.read_in_size = read_in_size
         self.file = fileobj
         self.read_in_buffer = NULL
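
One way the pieces in this commit could fit together for chunked reading, sketched under a few assumptions (import paths, and that a chunk trimmed by bam_head() is acceptable input for BamIter with with_header=False): read_bam_header() consumes the header once, bam_head() cuts the buffer at the last complete record, and the headerless chunk is parsed on its own.

import gzip
import io

from dnaio._bam import read_bam_header    # location taken from the import added in this diff
from dnaio._core import BamIter, bam_head  # assumed import location

with gzip.open("reads.bam", "rb") as raw:
    read_bam_header(raw)                   # leaves the stream positioned at the first record
    data = raw.read(256 * 1024)
    head = bam_head(data)                  # end of the last complete record in this chunk
    chunk, leftover = data[:head], data[head:]
    for record in BamIter(io.BytesIO(chunk), with_header=False):
        print(record.name)
    # leftover holds a partial record and would be prepended to the next read
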
@@ -746,9 +762,9 @@ cdef class BamIter:

     def __iter__(self):
         return self
-
+
     cdef _read_into_buffer(self):
-        cdef size_t read_in_size
+        cdef size_t read_in_size
         cdef size_t leftover_size = self.buffer_end - self.record_start
         cdef uint32_t block_size
         memmove(self.read_in_buffer, self.record_start, leftover_size)
@@ -769,7 +785,7 @@ cdef class BamIter:
             raise StopIteration()
         elif new_bytes_size == 0:
             raise EOFError("Incomplete record at the end of file")
-        cdef uint8_t *tmp
+        cdef uint8_t *tmp
         if new_buffer_size > self.read_in_buffer_size:
             tmp = <uint8_t *>PyMem_Realloc(self.read_in_buffer, new_buffer_size)
             if tmp == NULL: