Skip to content

Commit c4acfa2

Browse files
benjeffery authored and mergify[bot] committed
Add support for reference sequences
1 parent 73ef7e7 commit c4acfa2

File tree

3 files changed

+27
-4
lines changed

3 files changed

+27
-4
lines changed

CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
- Fix for `time_units` in tskit 0.4.0 (benjeffery, #54, #55)
66

7+
- Add support for reference sequence (benjeffery, #59)
8+
79
--------------------
810
[0.2.0] - 2021-11-08
911
--------------------

tests/test_compression.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,19 @@ def test_small_msprime_complex_mutations(self):
188188
)
189189
self.verify(tables.tree_sequence())
190190

191+
def test_ref_seq(self):
192+
ts = msprime.simulate(10, recombination_rate=1, mutation_rate=2, random_seed=2)
193+
tables = ts.tables
194+
tables.reference_sequence.metadata_schema = (
195+
tskit.MetadataSchema.permissive_json()
196+
)
197+
tables.reference_sequence.metadata = {"some": "data"}
198+
tables.reference_sequence.data = "ACTG"
199+
# NOTE: it's unclear whether we'll want to have this set at the same time as
200+
# 'data', but it's useful to have something in all columns for now.
201+
tables.reference_sequence.url = "http://example.com/a_reference"
202+
self.verify(tables.tree_sequence())
203+
191204
def test_mutation_parent_example(self):
192205
tables = tskit.TableCollection(1)
193206
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
@@ -263,7 +276,7 @@ def verify(self, ts):
263276
path = pathlib.Path(tmpdir) / "treeseq.tsz"
264277
tszip.compress(ts, path)
265278
other_ts = tszip.decompress(path)
266-
self.assertEqual(ts.tables, other_ts.tables)
279+
ts.tables.assert_equals(other_ts.tables)
267280

268281

269282
class TestMetadata(unittest.TestCase):

tszip/compression.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,11 @@ def compress_zarr(ts, root, variants_only=False):
194194

195195
# Schemas, metadata and units need to be converted to arrays
196196
for name in columns:
197-
if name.endswith("metadata_schema") or name == "time_units":
197+
if name.endswith("metadata_schema") or name in [
198+
"time_units",
199+
"reference_sequence/data",
200+
"reference_sequence/url",
201+
]:
198202
columns[name] = np.frombuffer(columns[name].encode("utf-8"), np.int8)
199203
if name.endswith("metadata"):
200204
columns[name] = np.frombuffer(columns[name], np.int8)
@@ -296,10 +300,15 @@ def decompress_zarr(root):
296300
for sub_key, sub_value in value.items():
297301
if f"{key}/{sub_key}" in quantised_arrays:
298302
dict_repr.setdefault(key, {})[sub_key] = coordinates[sub_value]
299-
elif sub_key.endswith("metadata_schema"):
303+
elif sub_key.endswith("metadata_schema") or (key, sub_key) in [
304+
("reference_sequence", "data"),
305+
("reference_sequence", "url"),
306+
]:
300307
dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value).decode(
301308
"utf-8"
302309
)
310+
elif (key, sub_key) == ("reference_sequence", "metadata"):
311+
dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value)
303312
else:
304313
dict_repr.setdefault(key, {})[sub_key] = sub_value
305314
elif key.endswith("metadata_schema") or key == "time_units":
@@ -308,7 +317,6 @@ def decompress_zarr(root):
308317
dict_repr[key] = bytes(value)
309318
else:
310319
dict_repr[key] = value
311-
312320
return tskit.TableCollection.fromdict(dict_repr).tree_sequence()
313321

314322

0 commit comments

Comments
 (0)