diff --git a/velociraptor/catalogue/catalogue.py b/velociraptor/catalogue/catalogue.py index efe7221..e84eed7 100644 --- a/velociraptor/catalogue/catalogue.py +++ b/velociraptor/catalogue/catalogue.py @@ -15,6 +15,7 @@ from velociraptor.catalogue.derived import DerivedQuantities from velociraptor.catalogue.registration import global_registration_functions from velociraptor.exceptions import RegistrationDoesNotMatchError +from velociraptor.catalogue.reader import VelociraptorCatalogueReader class VelociraptorFieldMetadata(object): @@ -88,7 +89,7 @@ def register_field_properties(self): return -def generate_getter(filename, name: str, field: str, full_name: str, unit): +def generate_getter(reader, name: str, field: str, full_name: str, unit): """ Generates a function that: @@ -113,14 +114,9 @@ def getter(self): if current_value is not None: return current_value else: - with h5py.File(filename, "r") as handle: - try: - setattr(self, f"_{name}", unyt.unyt_array(handle[field][...], unit)) - getattr(self, f"_{name}").name = full_name - getattr(self, f"_{name}").file = filename - except KeyError: - print(f"Could not read {field}") - return None + setattr(self, f"_{name}", unyt.unyt_array(reader.read_field(field), unit)) + getattr(self, f"_{name}").name = full_name + getattr(self, f"_{name}").file = reader.filename return getattr(self, f"_{name}") @@ -156,7 +152,7 @@ def deleter(self): def generate_sub_catalogue( - filename, + reader, registration_name: str, registration_function: Callable, units: VelociraptorUnits, @@ -173,10 +169,7 @@ def generate_sub_catalogue( """ # This creates a _copy_ of the _class_, not object. - this_sub_catalogue_bases = ( - __VelociraptorSubCatalogue, - object, - ) + this_sub_catalogue_bases = (__VelociraptorSubCatalogue, object) this_sub_catalogue_dict = {} valid_sub_paths = [] @@ -186,11 +179,7 @@ def generate_sub_catalogue( this_sub_catalogue_dict[metadata.snake_case] = property( generate_getter( - filename, - metadata.snake_case, - metadata.path, - metadata.name, - metadata.unit, + reader, metadata.snake_case, metadata.path, metadata.name, metadata.unit ), generate_setter(metadata.snake_case), generate_deleter(metadata.snake_case), @@ -205,7 +194,7 @@ def generate_sub_catalogue( ) # Finally, we can actually create an instance of our new class. - catalogue = ThisSubCatalogue(filename=filename) + catalogue = ThisSubCatalogue(filename=reader.filename) catalogue.valid_sub_paths = valid_sub_paths return catalogue @@ -375,6 +364,8 @@ def __create_sub_catalogues(self): else: self.invalid_field_paths.append(path) + reader = VelociraptorCatalogueReader(self.filename) + # For each registration function, we create a dynamic sub-class that # contains only that information - otherwise the namespace of the # VelociraptorCatalogue is way too crowded. @@ -383,7 +374,7 @@ def __create_sub_catalogues(self): self, attribute_name, generate_sub_catalogue( - filename=self.filename, + reader=reader, registration_name=attribute_name, # This ensures each class has a unique name registration_function=self.registration_functions[attribute_name], units=self.units, diff --git a/velociraptor/catalogue/reader.py b/velociraptor/catalogue/reader.py new file mode 100644 index 0000000..79894fe --- /dev/null +++ b/velociraptor/catalogue/reader.py @@ -0,0 +1,103 @@ +""" +Main objects for the velociraptor reading library. + +This is based upon the reading routines in the SWIFTsimIO library. +""" + +import h5py +import re +import numpy as np + +from typing import List + + +class VelociraptorCatalogueReader(object): + """ + VELOCIraptor catalogue reader. Pass it the name of a catalogue file and it + will detect whether this catalogue is self-contained or part of a larger + split catalogue consisting of multiple files. + + When a split catalogue is used, any of the catalogue.properties.X files can + be passed on to the constructor, where X is a counter ranging from 0 to + properties_file["Num_of_files"]-1. When a dataset is extracted from such a + catalogue, the elements in the resulting dataset will be ordered in blocks + of increasing X. + + For split catalogues, this class's read_field() method handles reading the + distributed datasets. For unsplit catalogues, it behaves exactly the same + as a direct read from the HDF5 file. + """ + + # List of files that make up the catalogue + filenames: List[str] + + def __init__(self, filename: str): + """ + I take in: + + + filename of (one of) the velociraptor properties file(s) + """ + with h5py.File(filename, "r") as handle: + num_files = handle["Num_of_files"][0] + if num_files == 1: + self.filenames = [filename] + else: + # compose the other file names + # we cast to str() because filename could be a pathlib.Path + basename = re.match("(\S+properties)\.\d+\Z", str(filename)).groups()[0] + self.filenames = [f"{basename}.{idx}" for idx in range(num_files)] + + @property + def filename(self): + """ + Returns the velociraptor properties file name or the first file name + if the catalogue is split + """ + return self.filenames[0] + + def read_field(self, field: str): + """ + Read the given field from the catalogue file(s) + """ + if len(self.filenames) == 1: + with h5py.File(self.filenames[0], "r") as handle: + try: + value = handle[field][...] + except KeyError: + print(f"Could not read {field}") + return None + return value + else: + # figure out the shape and dtype of the return value, so that we can + # create the appropriate array + dtype = None + shape = None + for filename in self.filenames: + with h5py.File(filename, "r") as handle: + try: + ds = handle[field] + except KeyError: + print(f"Could not read {field}") + return None + if dtype is None: + dtype = ds.dtype + shape = ds.shape + else: + # tuples are immutable, so instead of + # shape[0]+= ds.shape[0], we have to unpack, sum and + # then pack again + shape0, *shaperest = shape + shape0 += ds.shape[0] + shape = (shape0, *shaperest) + + # create an empty array to store the return value + value = np.zeros(shape, dtype=dtype) + # now read the data (no need to check for existence again, this was + # done when getting the shape and type) + offset = 0 + for filename in self.filenames: + with h5py.File(filename, "r") as handle: + size = handle[field].shape[0] + value[offset : offset + size] = handle[field][...] + offset += size + return value