Skip to content

Commit 0cbf5cb

Browse files
committed
Add plink contig info
1 parent d61af16 commit 0cbf5cb

File tree

2 files changed

+47
-4
lines changed

2 files changed

+47
-4
lines changed

bio2zarr/plink.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import numpy as np
66
import zarr
77

8-
from bio2zarr import constants, vcz
8+
from bio2zarr import constants, core, vcz
99

1010
logger = logging.getLogger(__name__)
1111

@@ -18,6 +18,9 @@ def __init__(self, path):
1818
self.samples = [vcz.Sample(id=sample) for sample in self.bed.iid]
1919
self.num_samples = len(self.samples)
2020
self.root_attrs = {}
21+
self.contigs = [
22+
vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bed.chromosome)
23+
]
2124

2225
def iter_alleles(self, start, stop, num_alleles):
2326
ref_field = self.bed.allele_1
@@ -32,6 +35,11 @@ def iter_alleles(self, start, stop, num_alleles):
3235
alleles[1 : 1 + len(alt)] = alt
3336
yield alleles
3437

38+
def iter_contig(self, start, stop):
39+
chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
40+
for chrom in self.bed.chromosome[start:stop]:
41+
yield chrom_to_contig_index[str(chrom)]
42+
3543
def iter_field(self, field_name, shape, start, stop):
3644
data = {
3745
"position": self.bed.bp_position,
@@ -89,6 +97,15 @@ def generate_schema(
8997
chunks=[schema_instance.variants_chunk_size, 2],
9098
description=None,
9199
),
100+
vcz.ZarrArraySpec.new(
101+
vcf_field=None,
102+
name="variant_contig",
103+
dtype=core.min_int_dtype(0, len(np.unique(self.bed.chromosome))),
104+
shape=[m],
105+
dimensions=["variants"],
106+
chunks=[schema_instance.variants_chunk_size],
107+
description="Contig/chromosome index for each variant",
108+
),
92109
vcz.ZarrArraySpec.new(
93110
vcf_field=None,
94111
name="call_genotype_phased",
@@ -160,9 +177,7 @@ def convert(
160177
show_progress=show_progress,
161178
)
162179
vzw.finalise(show_progress)
163-
164-
# TODO - index code needs variant_contig
165-
# vzw.create_index()
180+
vzw.create_index()
166181

167182

168183
# FIXME do this more efficiently - currently reading the whole thing

tests/test_plink.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,16 @@ def test_variant_position(self, ds):
106106
def test_variant_allele(self, ds):
107107
nt.assert_array_equal(ds.variant_allele, [["A", "G"], ["T", "C"]])
108108

109+
def test_contig_id(self, ds):
110+
"""Test that contig identifiers are correctly extracted and stored."""
111+
nt.assert_array_equal(ds.contig_id, ["1"])
112+
113+
def test_variant_contig(self, ds):
114+
"""Test that variant to contig mapping is correctly stored."""
115+
nt.assert_array_equal(
116+
ds.variant_contig, [0, 0]
117+
) # Both variants on chromosome 1
118+
109119
def test_genotypes(self, ds):
110120
call_genotype = ds.call_genotype.values
111121
assert call_genotype.shape == (2, 10, 2)
@@ -196,6 +206,24 @@ def test_genotypes(self, ds):
196206
# FIXME not working
197207
nt.assert_array_equal(actual, expected)
198208

209+
def test_contig_arrays(self, ds):
210+
"""Test that contig arrays are correctly created and filled."""
211+
# Verify contig_id exists and is a string array
212+
assert hasattr(ds, "contig_id")
213+
assert ds.contig_id.dtype == np.dtype("O")
214+
215+
# Verify variant_contig exists and contains integer indices
216+
assert hasattr(ds, "variant_contig")
217+
assert ds.variant_contig.dtype == np.dtype("int8")
218+
assert ds.variant_contig.shape[0] == 100 # 100 variants
219+
220+
# Verify mapping between variant_contig and contig_id
221+
# For each unique contig index, check at least one corresponding variant exists
222+
for i, contig in enumerate(ds.contig_id.values):
223+
assert np.any(
224+
ds.variant_contig.values == i
225+
), f"No variants found for contig {contig}"
226+
199227
# @pytest.mark.xfail
200228
@pytest.mark.parametrize(
201229
("variants_chunk_size", "samples_chunk_size"),

0 commit comments

Comments
 (0)