@@ -106,6 +106,16 @@ def test_variant_position(self, ds):
106106 def test_variant_allele (self , ds ):
107107 nt .assert_array_equal (ds .variant_allele , [["A" , "G" ], ["T" , "C" ]])
108108
109+ def test_contig_id (self , ds ):
110+ """Test that contig identifiers are correctly extracted and stored."""
111+ nt .assert_array_equal (ds .contig_id , ["1" ])
112+
113+ def test_variant_contig (self , ds ):
114+ """Test that variant to contig mapping is correctly stored."""
115+ nt .assert_array_equal (
116+ ds .variant_contig , [0 , 0 ]
117+ ) # Both variants on chromosome 1
118+
109119 def test_genotypes (self , ds ):
110120 call_genotype = ds .call_genotype .values
111121 assert call_genotype .shape == (2 , 10 , 2 )
@@ -196,6 +206,24 @@ def test_genotypes(self, ds):
196206 # FIXME not working
197207 nt .assert_array_equal (actual , expected )
198208
209+ def test_contig_arrays (self , ds ):
210+ """Test that contig arrays are correctly created and filled."""
211+ # Verify contig_id exists and is a string array
212+ assert hasattr (ds , "contig_id" )
213+ assert ds .contig_id .dtype == np .dtype ("O" )
214+
215+ # Verify variant_contig exists and contains integer indices
216+ assert hasattr (ds , "variant_contig" )
217+ assert ds .variant_contig .dtype == np .dtype ("int8" )
218+ assert ds .variant_contig .shape [0 ] == 100 # 100 variants
219+
220+ # Verify mapping between variant_contig and contig_id
221+ # For each unique contig index, check at least one corresponding variant exists
222+ for i , contig in enumerate (ds .contig_id .values ):
223+ assert np .any (
224+ ds .variant_contig .values == i
225+ ), f"No variants found for contig { contig } "
226+
199227 # @pytest.mark.xfail
200228 @pytest .mark .parametrize (
201229 ("variants_chunk_size" , "samples_chunk_size" ),
@@ -249,3 +277,81 @@ def test_by_validating(
249277 worker_processes = worker_processes ,
250278 )
251279 plink .validate (path , out )
280+
281+
282+ class TestMultipleContigs :
283+ """Test handling of multiple contigs in PLINK files."""
284+
285+ @pytest .fixture (scope = "class" )
286+ def multi_contig_bed_path (self , tmp_path_factory ):
287+ tmp_path = tmp_path_factory .mktemp ("data" )
288+ path = tmp_path / "multi_contig.bed"
289+ # 6 sites x 4 samples
290+ # - 2 variants on chromosome 1
291+ # - 1 variant on chromosome 2
292+ # - 2 variants on chromosome X
293+ # - 1 variant on chromosome Y
294+ # values are counts of allele 1, will be flipped by the converter
295+ dosages = np .array (
296+ [
297+ [0 , 1 , 2 , 0 ], # chr1
298+ [1 , 0 , 2 , 1 ], # chr1
299+ [0 , 0 , 0 , 2 ], # chr2
300+ [2 , 0 , 1 , 0 ], # chrX
301+ [1 , 1 , 1 , 1 ], # chrX
302+ [0 , 2 , 0 , 2 ], # chrY
303+ ],
304+ dtype = np .int8 ,
305+ )
306+ bed_reader .to_bed (path , dosages .T )
307+
308+ bim_path = path .with_suffix (".bim" )
309+ with open (bim_path , "w" ) as f :
310+ # Format: chr, variant_id, genetic_dist, pos, allele1, allele2
311+ f .write ("1\t 1_10\t 0\t 10\t A\t G\n " )
312+ f .write ("1\t 1_20\t 0\t 20\t T\t C\n " )
313+ f .write ("2\t 2_10\t 0\t 10\t A\t G\n " )
314+ f .write ("X\t X_10\t 0\t 10\t C\t T\n " )
315+ f .write ("X\t X_20\t 0\t 20\t G\t A\n " )
316+ f .write ("Y\t Y_10\t 0\t 10\t T\t C\n " )
317+
318+ fam_path = path .with_suffix (".fam" )
319+ with open (fam_path , "w" ) as f :
320+ for i in range (4 ):
321+ # Format: fam_id, ind_id, pat_id, mat_id, sex, phenotype
322+ f .write (f"fam{ i } ind{ i } 0 0 0 -9\n " )
323+
324+ return path
325+
326+ @pytest .fixture (scope = "class" )
327+ def ds (self , tmp_path_factory , multi_contig_bed_path ):
328+ tmp_path = tmp_path_factory .mktemp ("data" )
329+ zarr_path = tmp_path / "multi_contig.plink.zarr"
330+ plink .convert (multi_contig_bed_path , zarr_path )
331+ return sg .load_dataset (zarr_path )
332+
333+ def test_contig_ids (self , ds ):
334+ nt .assert_array_equal (ds .contig_id , ["1" , "2" , "X" , "Y" ])
335+
336+ def test_variant_contig (self , ds ):
337+ nt .assert_array_equal (
338+ ds .variant_contig , [0 , 0 , 1 , 2 , 2 , 3 ]
339+ ) # Each variant mapped to its correct contig index
340+
341+ def test_genotypes (self , ds ):
342+ call_genotype = ds .call_genotype .values
343+ assert call_genotype .shape == (6 , 4 , 2 )
344+ nt .assert_array_equal (
345+ call_genotype ,
346+ [
347+ [[1 , 1 ], [1 , 0 ], [0 , 0 ], [1 , 1 ]], # chr1
348+ [[1 , 0 ], [1 , 1 ], [0 , 0 ], [1 , 0 ]], # chr1
349+ [[1 , 1 ], [1 , 1 ], [1 , 1 ], [0 , 0 ]], # chr2
350+ [[0 , 0 ], [1 , 1 ], [1 , 0 ], [1 , 1 ]], # chrX
351+ [[1 , 0 ], [1 , 0 ], [1 , 0 ], [1 , 0 ]], # chrX
352+ [[1 , 1 ], [0 , 0 ], [1 , 1 ], [0 , 0 ]], # chrY
353+ ],
354+ )
355+
356+ def test_variant_position (self , ds ):
357+ nt .assert_array_equal (ds .variant_position , [10 , 20 , 10 , 10 , 20 , 10 ])
0 commit comments