1+ import abc
12import dataclasses
23import json
34import logging
1314
1415logger = logging .getLogger (__name__ )
1516
# Version tag embedded in the schema JSON; bumped here (0.4 -> 0.5) because
# sample/contig/filter metadata moved out of the schema (see VcfZarrSchema).
ZARR_SCHEMA_FORMAT_VERSION = "0.5"

# Compressor used for arrays that do not declare their own codec settings.
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
1819
1920_fixed_field_descriptions = {
2829}
2930
3031
class Source(abc.ABC):
    """Abstract interface for variant-data inputs convertible to VCF Zarr.

    Concrete sources expose basic metadata as read-only properties and
    provide iterators over per-record data for a half-open record range
    ``[start, stop)``.
    """

    @property
    @abc.abstractmethod
    def path(self):
        """Path of the underlying input."""

    @property
    @abc.abstractmethod
    def num_records(self):
        """Total number of variant records in the source."""

    @property
    @abc.abstractmethod
    def num_samples(self):
        """Total number of samples in the source."""

    @property
    @abc.abstractmethod
    def samples(self):
        """Sequence of sample objects (consumers read an ``id`` attribute)."""

    @property
    def contigs(self):
        """Sequence of contig objects, or None when the source has none."""
        return None

    @property
    def filters(self):
        """Sequence of filter objects, or None when the source has none."""
        return None

    @property
    def root_attrs(self):
        """Extra attributes to attach to the output root group."""
        return {}

    @abc.abstractmethod
    def iter_alleles(self, start, stop, num_alleles):
        """Iterate over allele data for records in ``[start, stop)``."""

    @abc.abstractmethod
    def iter_genotypes(self, start, stop, num_alleles):
        """Iterate over genotype data for records in ``[start, stop)``."""

    def iter_id(self, start, stop):
        """Iterate over variant IDs for ``[start, stop)``.

        NOTE(review): the default implementation returns None rather than
        an empty iterator — callers presumably treat None as "no ID data";
        confirm before iterating the result unconditionally.
        """
        return

    def iter_contig(self, start, stop):
        """Iterate over contig values for ``[start, stop)``.

        NOTE(review): default returns None, mirroring ``iter_id``.
        """
        return

    @abc.abstractmethod
    def iter_field(self, field_name, shape, start, stop):
        """Iterate over values for the specified field from start to stop positions."""

    @abc.abstractmethod
    def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
        """Build and return the output schema for this source."""
88+
3189@dataclasses .dataclass
3290class ZarrArraySpec :
3391 name : str
@@ -182,25 +240,16 @@ class VcfZarrSchema(core.JsonDataclass):
182240 format_version : str
183241 samples_chunk_size : int
184242 variants_chunk_size : int
185- samples : list
186- contigs : list
187- filters : list
188243 fields : list
189244
190245 def __init__ (
191246 self ,
192247 format_version : str ,
193- samples : list ,
194- contigs : list ,
195- filters : list ,
196248 fields : list ,
197249 variants_chunk_size : int = None ,
198250 samples_chunk_size : int = None ,
199251 ):
200252 self .format_version = format_version
201- self .samples = samples
202- self .contigs = contigs
203- self .filters = filters
204253 self .fields = fields
205254 if variants_chunk_size is None :
206255 variants_chunk_size = 1000
@@ -238,9 +287,6 @@ def fromdict(d):
238287 f"{ d ['format_version' ]} != { ZARR_SCHEMA_FORMAT_VERSION } "
239288 )
240289 ret = VcfZarrSchema (** d )
241- ret .samples = [Sample (** sd ) for sd in d ["samples" ]]
242- ret .contigs = [Contig (** sd ) for sd in d ["contigs" ]]
243- ret .filters = [Filter (** sd ) for sd in d ["filters" ]]
244290 ret .fields = [ZarrArraySpec (** sd ) for sd in d ["fields" ]]
245291 return ret
246292
@@ -474,8 +520,10 @@ def init(
474520
475521 # Doing this synchronously - this is fine surely
476522 self .encode_samples (root )
477- self .encode_filter_id (root )
478- self .encode_contig_id (root )
523+ if self .source .filters is not None :
524+ self .encode_filter_id (root )
525+ if self .source .contigs is not None :
526+ self .encode_contigs (root )
479527
480528 self .wip_path .mkdir ()
481529 self .arrays_path .mkdir ()
@@ -502,33 +550,33 @@ def init(
502550 )
503551
def encode_samples(self, root):
    """Write the ``sample_id`` array into the root group.

    Sample identifiers now come from the source (the schema no longer
    carries sample metadata); chunking follows the schema's
    ``samples_chunk_size``.
    """
    sample_ids = [s.id for s in self.source.samples]
    array = root.array(
        "sample_id",
        data=sample_ids,
        shape=len(sample_ids),
        dtype="str",
        compressor=DEFAULT_ZARR_COMPRESSOR,
        chunks=(self.schema.samples_chunk_size,),
    )
    array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
    logger.debug("Samples done")
517564
518- def encode_contig_id (self , root ):
565+ def encode_contigs (self , root ):
566+ contigs = self .source .contigs
519567 array = root .array (
520568 "contig_id" ,
521- data = [contig .id for contig in self . schema . contigs ],
522- shape = len (self . schema . contigs ),
569+ data = [contig .id for contig in contigs ],
570+ shape = len (contigs ),
523571 dtype = "str" ,
524572 compressor = DEFAULT_ZARR_COMPRESSOR ,
525573 )
526574 array .attrs ["_ARRAY_DIMENSIONS" ] = ["contigs" ]
527- if all (contig .length is not None for contig in self . schema . contigs ):
575+ if all (contig .length is not None for contig in contigs ):
528576 array = root .array (
529577 "contig_length" ,
530- data = [contig .length for contig in self . schema . contigs ],
531- shape = len (self . schema . contigs ),
578+ data = [contig .length for contig in contigs ],
579+ shape = len (contigs ),
532580 dtype = np .int64 ,
533581 compressor = DEFAULT_ZARR_COMPRESSOR ,
534582 )
@@ -537,10 +585,11 @@ def encode_contig_id(self, root):
537585 def encode_filter_id (self , root ):
538586 # TODO need a way to store description also
539587 # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
588+ filters = self .source .filters
540589 array = root .array (
541590 "filter_id" ,
542- data = [filt .id for filt in self . schema . filters ],
543- shape = len (self . schema . filters ),
591+ data = [filt .id for filt in filters ],
592+ shape = len (filters ),
544593 dtype = "str" ,
545594 compressor = DEFAULT_ZARR_COMPRESSOR ,
546595 )
0 commit comments