55import numpy as np
66import zarr
77
8- from bio2zarr import constants , vcz
8+ from bio2zarr import constants , core , vcz
99
1010logger = logging .getLogger (__name__ )
1111
@@ -18,6 +18,9 @@ def __init__(self, path):
1818 self .samples = [vcz .Sample (id = sample ) for sample in self .bed .iid ]
1919 self .num_samples = len (self .samples )
2020 self .root_attrs = {}
21+ self .contigs = [
22+ vcz .Contig (id = str (chrom )) for chrom in np .unique (self .bed .chromosome )
23+ ]
2124
2225 def iter_alleles (self , start , stop , num_alleles ):
2326 ref_field = self .bed .allele_1
@@ -32,6 +35,11 @@ def iter_alleles(self, start, stop, num_alleles):
3235 alleles [1 : 1 + len (alt )] = alt
3336 yield alleles
3437
def iter_contig(self, start, stop):
    """Yield the contig index of each variant in ``[start, stop)``.

    Each chromosome label in ``self.bed.chromosome[start:stop]`` is
    converted with ``str`` and looked up by ``id`` among ``self.contigs``;
    the position of the matching contig in that list is yielded.

    Raises:
        KeyError: if a chromosome label has no contig with a matching id.
    """
    # Build the id -> position lookup once per call so that each variant
    # costs a single dict access.
    index_by_contig_id = {
        contig.id: index for index, contig in enumerate(self.contigs)
    }
    for chrom in self.bed.chromosome[start:stop]:
        # Labels are stringified before lookup, so non-str labels (e.g.
        # NumPy string scalars) match as long as contig ids are plain str.
        yield index_by_contig_id[str(chrom)]
42+
3543 def iter_field (self , field_name , shape , start , stop ):
3644 data = {
3745 "position" : self .bed .bp_position ,
@@ -89,6 +97,15 @@ def generate_schema(
8997 chunks = [schema_instance .variants_chunk_size , 2 ],
9098 description = None ,
9199 ),
100+ vcz .ZarrArraySpec .new (
101+ vcf_field = None ,
102+ name = "variant_contig" ,
103+ dtype = core .min_int_dtype (0 , len (np .unique (self .bed .chromosome ))),
104+ shape = [m ],
105+ dimensions = ["variants" ],
106+ chunks = [schema_instance .variants_chunk_size ],
107+ description = "Contig/chromosome index for each variant" ,
108+ ),
92109 vcz .ZarrArraySpec .new (
93110 vcf_field = None ,
94111 name = "call_genotype_phased" ,
@@ -160,9 +177,7 @@ def convert(
160177 show_progress = show_progress ,
161178 )
162179 vzw .finalise (show_progress )
163-
164- # TODO - index code needs variant_contig
165- # vzw.create_index()
180+ vzw .create_index ()
166181
167182
168183# FIXME do this more efficiently - currently reading the whole thing
0 commit comments