Skip to content

Commit

Permalink
Update python package and builder to tiledbsoma 1.0.0rc2 (#227)
Browse files Browse the repository at this point in the history
* checkpoint on changes for rc1 port

* update python package dep to RC2

* fix bug in fragment counts to allow for zero length arrays

* update builder to RC2

* improve validation fix for fragment check on empty arrays

* comments

* PR review feedback

* use functools cache to simplify singleton code
  • Loading branch information
Bruce Martin authored Feb 28, 2023
1 parent c154d4b commit c67b936
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 58 deletions.
2 changes: 1 addition & 1 deletion api/python/cell_census/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ dependencies= [
# of TileDB on-disk storage format. Make sure this doesn't fall behind the builder's tiledbsoma version.
# NOTE: tiledb is also a requirement of the API, but tiledbsoma also has a tiledb dependency, so just use
# the same version here
"tiledbsoma==1.0.0rc1",
"tiledbsoma==1.0.0rc2",
"typing_extensions",
"s3fs",
"scikit-misc",
Expand Down
4 changes: 2 additions & 2 deletions tools/cell_census_builder/consolidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import tiledbsoma as soma

from .globals import SOMA_TileDB_Context
from .globals import DEFAULT_TILEDB_CONFIG, SOMA_TileDB_Context
from .mp import create_process_pool_executor, log_on_broken_process_pool


Expand Down Expand Up @@ -65,7 +65,7 @@ def consolidate_tiledb_object(uri: str) -> str:
import tiledb

logging.info(f"Consolidate: start uri {uri}")
tiledb.consolidate(uri, config=tiledb.Config({"sm.consolidation.buffer_size": 1 * 1024**3}))
tiledb.consolidate(uri, config=tiledb.Config(DEFAULT_TILEDB_CONFIG))
tiledb.vacuum(uri)
logging.info(f"Consolidate: end uri {uri}")
return uri
49 changes: 14 additions & 35 deletions tools/cell_census_builder/globals.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import time
import functools
from typing import Set

import pyarrow as pa
Expand Down Expand Up @@ -210,46 +210,25 @@
FEATURE_REFERENCE_IGNORE: Set[str] = set()


# The default configuration for TileDB contexts used in the builder.
DEFAULT_TILEDB_CONFIG = {
"py.init_buffer_bytes": 512 * 1024**2,
"py.deduplicate": "true",
"soma.init_buffer_bytes": 512 * 1024**2,
"sm.consolidation.buffer_size": 1 * 1024**3,
}


"""
Singletons used throughout the package
"""

# Global SOMATileDBContext
_SOMA_TileDB_Context: soma.options.SOMATileDBContext = None

# Global TileDB context
_TileDB_Ctx: tiledb.Ctx = None

# The logical timestamp at which all builder data should be recorded
WRITE_TIMESTAMP = int(time.time() * 1000)

# Using "end of time" for read_timestamp means that all writes are visible, no matter what write timestamp was used
END_OF_TIME = 0xFFFFFFFFFFFFFFFF


@functools.cache
def SOMA_TileDB_Context() -> soma.options.SOMATileDBContext:
global _SOMA_TileDB_Context
if _SOMA_TileDB_Context is None or _SOMA_TileDB_Context != TileDB_Ctx():
# Set write timestamp to "now", so that we use consistent timestamps across all writes (mostly for aesthetic
# reasons). Set read timestamps to be same as write timestamp so that post-build validation reads can "see"
# the writes. Without setting read timestamp explicitly, the read timestamp would default to a time that
# prevents seeing the builder's writes.
_SOMA_TileDB_Context = soma.options.SOMATileDBContext(
tiledb_ctx=TileDB_Ctx(),
# TODO: Setting an explicit write timestamp causes later reads to fail!
# write_timestamp=write_timestamp,
# TODO: We *should* be able to set this equal to WRITE_TIMESTAMP, but as specifying a write_timestamp is
# problematic, we must use "end of time" for now
read_timestamp=END_OF_TIME,
)
return _SOMA_TileDB_Context
return soma.options.SOMATileDBContext(tiledb_ctx=TileDB_Ctx(), timestamp=None)


@functools.cache
def TileDB_Ctx() -> tiledb.Ctx:
return _TileDB_Ctx


def set_tiledb_ctx(ctx: tiledb.Ctx) -> None:
global _TileDB_Ctx, _SOMA_TileDB_Context
_TileDB_Ctx = ctx
_SOMA_TileDB_Context = None
return tiledb.Ctx(DEFAULT_TILEDB_CONFIG)
18 changes: 0 additions & 18 deletions tools/cell_census_builder/mp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,6 @@
import os
from typing import Optional, cast

import tiledbsoma as soma

from .globals import set_tiledb_ctx

if soma.get_storage_engine() == "tiledb":
import tiledb


def cpu_count() -> int:
"""Sigh, os.cpu_count() returns None if "undetermined" number of CPUs"""
Expand All @@ -29,17 +22,6 @@ def process_initializer(verbose: int = 0) -> None:
)
logging.captureWarnings(True)

if soma.get_storage_engine() == "tiledb":
set_tiledb_ctx(
tiledb.Ctx(
{
"py.init_buffer_bytes": 512 * 1024**2,
"py.deduplicate": "true",
"soma.init_buffer_bytes": 512 * 1024**2,
}
)
)


def create_process_pool_executor(
args: argparse.Namespace, max_workers: Optional[int] = None
Expand Down
12 changes: 11 additions & 1 deletion tools/cell_census_builder/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,10 +447,20 @@ def validate_manifest_contents(assets_path: str, datasets: List[Dataset]) -> boo

def validate_consolidation(soma_path: str, experiment_builders: List[ExperimentBuilder]) -> bool:
"""Verify that obs, var and X layers are all fully consolidated & vacuumed"""

def is_empty_tiledb_array(uri: str) -> bool:
with tiledb.open(uri) as A:
return A.nonempty_domain() is None

with soma.Collection.open(soma_path, context=SOMA_TileDB_Context()) as census:
consolidated_uris = list_uris_to_consolidate(census)
for uri in consolidated_uris:
assert len(tiledb.array_fragments(uri)) == 1, f"{uri} has not been fully consolidated & vacuumed"
# A fully consolidated & vacuumed array has exactly one fragment. The only exception
# is an empty array, which is also allowed to have a fragment count of zero.
assert (len(tiledb.array_fragments(uri)) == 1) or (
len(tiledb.array_fragments(uri)) == 0 and is_empty_tiledb_array(uri)
), f"{uri} has not been fully consolidated & vacuumed"

return True


Expand Down
2 changes: 1 addition & 1 deletion tools/scripts/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ numpy
# NOTE: You can also build this dependency from source, per ./notebooks/README.md.
# NOTE: The builder's version of tiledbsoma MUST be <= the API's tiledbsoma version, to ensure reader compatibility
# of TileDB on-disk storage format
tiledbsoma==1.0rc0
tiledbsoma==1.0rc2
# NOTE: tiledb is also a requirement of the builder, but builder must not use a tiledb version that is ahead of
# tiledbsoma's tiledb version (so just use the same version)
# tiledb
Expand Down

0 comments on commit c67b936

Please sign in to comment.