@@ -355,6 +355,104 @@ def test_wrong_format(self):
355355 with self .assertRaises (exceptions .FileFormatError ):
356356 tszip .decompress (self .path )
357357
def test_struct_metadata_roundtrip(self):
    """Top-level metadata under a ``struct`` codec schema survives a round trip.

    A simulated tree sequence is given a struct-codec metadata schema and a
    matching metadata value, compressed to ``self.path``, decompressed, and
    the decoded metadata is compared against the pre-compression value.
    """
    # Each array item is packed with struct format "I" (unsigned 32-bit int).
    item_spec = {
        "type": "integer",
        "binaryFormat": "I",
    }
    struct_schema = {
        "codec": "struct",
        "type": "object",
        "properties": {
            "reverse_node_map": {
                "type": "array",
                "items": item_spec,
            }
        },
    }
    payload = {
        "reverse_node_map": [847973, 1442881, 356055, 2542708, 285222, 175110]
    }

    tables = msprime.simulate(10, random_seed=1).dump_tables()
    # The schema must be in place before the metadata is assigned.
    tables.metadata_schema = tskit.MetadataSchema(struct_schema)
    tables.metadata = payload
    expected_ts = tables.tree_sequence()

    tszip.compress(expected_ts, self.path)
    actual_ts = tszip.decompress(self.path)

    self.assertEqual(actual_ts.metadata, expected_ts.metadata)
385+
def test_utf8_time_units_roundtrip(self):
    """A ``time_units`` string with non-ASCII characters survives a round trip."""
    # Greek, Nordic and Chinese characters: every one of them has a code
    # point above 127 and therefore needs a multi-byte UTF-8 encoding.
    unicode_units = "μβrånches per γενεᾱ 世代"

    tables = msprime.simulate(10, random_seed=1).dump_tables()
    tables.time_units = unicode_units
    expected_ts = tables.tree_sequence()

    tszip.compress(expected_ts, self.path)
    actual_ts = tszip.decompress(self.path)

    self.assertEqual(actual_ts.time_units, expected_ts.time_units)
397+
def test_json_metadata_roundtrip(self):
    """Top-level metadata under a ``json`` codec schema survives a round trip.

    Checks both the decoded metadata value and the metadata schema after
    compressing to ``self.path`` and decompressing again.
    """
    number = {"type": "number"}
    text = {"type": "string"}

    parameters_schema = {
        "type": "object",
        "properties": {
            "Ne": dict(number),
            "mutation_rate": dict(number),
            "recombination_rate": dict(number),
        },
    }
    json_schema = {
        "codec": "json",
        "type": "object",
        "properties": {
            "description": dict(text),
            "sample_count": {"type": "integer"},
            "parameters": parameters_schema,
            "tags": {"type": "array", "items": dict(text)},
            "version": dict(number),
            "unicode_text": dict(text),
            "author": dict(text),
        },
    }

    expected_metadata = {
        "description": "Test tree sequence with JSON metadata",
        "sample_count": 10,
        "parameters": {
            "Ne": 1000,
            "mutation_rate": 1e-8,
            "recombination_rate": 1e-8,
        },
        "tags": ["test", "simulation", "msprime"],
        "version": 1.0,
        # Text containing code points above 127 (accents, CJK, emoji).
        "unicode_text": "Héllo Wørld! 你好世界 🧬🌳",
        # Accented Latin characters.
        "author": "José María González-Pérez",
    }

    tables = msprime.simulate(10, random_seed=1).dump_tables()
    # The schema must be in place before the metadata is assigned.
    tables.metadata_schema = tskit.MetadataSchema(json_schema)
    tables.metadata = expected_metadata
    expected_ts = tables.tree_sequence()

    tszip.compress(expected_ts, self.path)
    actual_ts = tszip.decompress(self.path)

    self.assertEqual(actual_ts.metadata, expected_metadata)
    self.assertEqual(actual_ts.metadata_schema, expected_ts.metadata_schema)
445+
def test_raw_metadata_with_high_bytes(self):
    """Schema-less raw metadata containing bytes above 127 survives a round trip."""
    # Same six bytes as [65, 66, 200, 150, 255, 128]; the last four are > 127.
    high_byte_payload = b"AB\xc8\x96\xff\x80"

    tables = msprime.simulate(10, random_seed=1).dump_tables()
    tables.metadata = high_byte_payload
    source_ts = tables.tree_sequence()

    tszip.compress(source_ts, self.path)
    restored_ts = tszip.decompress(self.path)

    self.assertEqual(restored_ts.metadata, high_byte_payload)
455+
358456
359457class TestFileErrors (unittest .TestCase ):
360458 """
@@ -411,3 +509,20 @@ def test_open_both(self):
411509 ts = tszip .load (files / "1.0.0.trees.tsz" )
412510 ts2 = tszip .load (files / "1.0.0.trees" )
413511 assert ts == ts2
512+
def test_issue95_metadata_dtype_regression(self):
    """Regression test for issue 95: legacy struct-metadata files still load.

    Files compressed by version <=0.2.5 stored metadata as the wrong dtype.
    The ``.tsz`` fixture was compressed with 0.2.5 and should now decompress
    successfully, yielding the same metadata as the reference ``.trees``
    fixture.
    """
    fixture_dir = pathlib.Path(__file__).parent / "files"
    map_key = "reverse_node_map"

    reference_ts = tszip.load(fixture_dir / "issue95_metadata_dtype.trees")
    legacy_ts = tszip.load(fixture_dir / "issue95_metadata_bug.tsz")

    assert legacy_ts.metadata == reference_ts.metadata
    assert isinstance(legacy_ts.metadata, dict)
    assert map_key in legacy_ts.metadata
    decoded_map = legacy_ts.metadata[map_key]
    assert len(decoded_map) == len(reference_ts.metadata[map_key])
0 commit comments